-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ add support for invoice splitter auto extraction (#107)
- Loading branch information
1 parent
4920ed1
commit 366ed5d
Showing
25 changed files
with
745 additions
and
339 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# | ||
# Run Integration tests. | ||
# | ||
name: Integration Tests

# Triggered on every push, on any branch.
on:
  - push

jobs:
  tests:
    name: Test ${{ matrix.os }}, Ruby ${{ matrix.ruby }}
    runs-on: ${{ matrix.os }}
    strategy:
      # One job per (os, ruby) combination listed below.
      matrix:
        os:
          - "ubuntu-22.04"
        ruby:
          - "2.6"
          - "2.7"
          - "3.0"
          - "3.1"
          - "3.2"
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: set up Ruby ${{ matrix.ruby }}
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: ${{ matrix.ruby }}
          bundler-cache: true

      - name: Install Ghostscript on Ubuntu
        if: runner.os == 'Linux'
        # apt-get is used for both commands: the `apt` front-end is meant for
        # interactive use and does not guarantee a stable CLI for scripts.
        run: |
          sudo apt-get update
          sudo apt-get install -y ghostscript
      # Only runs if a macOS runner is added to the matrix above.
      - name: Install Ghostscript and ImageMagick on macOS
        if: runner.os == 'macOS'
        run: brew install ghostscript imagemagick
      - name: Change ImageMagick security policy on Ubuntu
        if: runner.os == 'Linux'
        # Rewrites the PDF rule in the ImageMagick policy file from
        # rights="none" to rights="read|write".
        run: |
          DQT='"'
          SRC="rights=${DQT}none${DQT} pattern=${DQT}PDF${DQT}"
          RPL="rights=${DQT}read|write${DQT} pattern=${DQT}PDF${DQT}"
          sudo sed -i "s/$SRC/$RPL/" /etc/ImageMagick-6/policy.xml
      - name: Run Rspec for integration tests
        env:
          MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }}
        run: |
          bundle exec rake integration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'mindee' | ||
|
||
# Init a new client
mindee_client = Mindee::Client.new(api_key: 'my-api-key')

# Load a file from disk
input_source = mindee_client.source_from_path('/path/to/the/file.ext')

# Only PDFs are split. The extractor class is nested inside the module of the
# same name, hence the doubled `PdfExtractor::PdfExtractor`.
pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(input_source) if input_source.pdf?

if pdf_extractor && pdf_extractor.page_count > 1
  # Multi-page PDF: ask the Invoice Splitter which pages belong together...
  invoice_splitter_response = mindee_client.enqueue_and_parse(
    input_source,
    Mindee::Product::InvoiceSplitter::InvoiceSplitterV1
  )
  page_groups = invoice_splitter_response.document.inference.prediction.invoice_page_groups

  # ...then cut the source into one PDF per detected invoice and parse each one.
  extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict: false)
  extracted_pdfs.each do |extracted_pdf|
    # Optional: Save the files locally
    # extracted_pdf.write_to_file("output/path")

    # Same argument order as the other `parse` call below: source first, product class second.
    invoice_result = mindee_client.parse(
      extracted_pdf.as_input_source,
      Mindee::Product::Invoice::InvoiceV4
    )
    puts invoice_result.document
  end
else
  # Single-page PDF or non-PDF input: parse it directly as one invoice.
  invoice_result = mindee_client.parse(
    input_source,
    Mindee::Product::Invoice::InvoiceV4
  )
  puts invoice_result.document
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
# frozen_string_literal: true | ||
|
||
require_relative 'extraction/tax_extractor' | ||
require_relative 'extraction/multi_receipts_extractor' | ||
require_relative 'extraction/common' | ||
require_relative 'extraction/pdf_extractor' |
1 change: 1 addition & 0 deletions
1
lib/mindee/image_extraction/common.rb → lib/mindee/extraction/common.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
# frozen_string_literal: true | ||
|
||
require_relative 'common/extracted_image' | ||
require_relative 'common/image_extractor' |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# frozen_string_literal: true | ||
|
||
require_relative 'pdf_extractor/pdf_extractor' | ||
require_relative 'pdf_extractor/extracted_pdf' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# frozen_string_literal: true | ||
|
||
module Mindee
  # Pdf Extraction Module.
  module Extraction
    # Namespace for the PDF splitting helpers.
    module PdfExtractor
      # An extracted sub-Pdf.
      class ExtractedPdf
        # Byte contents of the pdf
        # @return [StringIO] IO-like object holding the PDF bytes.
        attr_reader :pdf_bytes

        # Name of the file.
        # @return [String]
        attr_reader :filename

        # @param pdf_bytes [StringIO] IO-like object (a plain String is also accepted when writing/converting).
        # @param filename [String]
        def initialize(pdf_bytes, filename)
          @pdf_bytes = pdf_bytes
          @filename = filename
        end

        # Retrieves the page count for a given pdf.
        # @return [Integer]
        # @raise [RuntimeError] if the PDF processor raises a TypeError while opening the bytes.
        def page_count
          current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
          current_pdf.pages.size
        rescue TypeError
          raise 'Could not retrieve page count from Extracted PDF object.'
        end

        # Writes the contents of the current PDF object to a file.
        # A '.pdf' extension is appended when the given path does not already end with one.
        # @param output_path [String] Path to write to.
        # @return [Integer] Number of bytes written.
        # @raise [RuntimeError] if the path is a directory or its parent directory does not exist.
        def write_to_file(output_path)
          raise 'Provided path is not a file' if File.directory?(output_path)
          raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))

          output_path = "#{output_path}.pdf" unless File.extname(output_path).downcase == '.pdf'

          # Binary write of the actual bytes: passing the IO object itself would
          # only write its `to_s` representation.
          File.binwrite(output_path, raw_bytes)
        end

        # Returns the current PDF object as a usable BytesInputSource.
        # @return [Mindee::Input::Source::BytesInputSource]
        def as_input_source
          Mindee::Input::Source::BytesInputSource.new(raw_bytes, @filename)
        end

        private

        # Full PDF contents as a String, independent of the stream's current position.
        # @return [String]
        def raw_bytes
          return @pdf_bytes unless @pdf_bytes.respond_to?(:read)

          # A previous read (page count, earlier write...) may have left the cursor at EOF.
          @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
          @pdf_bytes.read
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# frozen_string_literal: true | ||
|
||
module Mindee
  # Pdf Extraction Module.
  module Extraction
    # Namespace for the PDF splitting helpers.
    module PdfExtractor
      # Splits a source document into sub-PDFs.
      class PdfExtractor
        # @param local_input [Mindee::Input::Source::LocalInputSource]
        def initialize(local_input)
          @filename = local_input.filename
          if local_input.pdf?
            @source_pdf = local_input.io_stream
          else
            # Non-PDF input: wrap the image into a new PDF and keep that buffer as the source.
            pdf_image = ImageExtraction.attach_image_as_new_file(local_input.io_stream)
            io_buffer = StringIO.new
            pdf_image.save(io_buffer)

            @source_pdf = io_buffer
          end
        end

        # Retrieves the page count for the Pdf object.
        # @return [Integer]
        def page_count
          Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
        end

        # Creates a new Pdf from pages and save it into a buffer.
        # @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
        # @return [StringIO] The buffer containing the new Pdf, as returned by the PDF processor.
        def cut_pages(page_indexes)
          options = {
            page_indexes: page_indexes,
          }

          Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
        end

        # Extract the sub-documents from the main pdf, based on the given list of page indexes.
        # Each sub-document is named "<basename>_<first page>-<last page><extension>", with
        # 1-based, zero-padded page numbers.
        # @param page_indexes [Array<Array<Integer>>] Zero-based page indexes, one list per sub-document.
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] The extracted sub-documents.
        # @raise [RuntimeError] if a list is nil/empty or contains an index outside the document.
        def extract_sub_documents(page_indexes)
          extension = File.extname(@filename)
          basename = File.basename(@filename, extension)
          # Opening the PDF is not free: count the pages once instead of once per index.
          total_pages = page_count

          page_indexes.map do |page_index_list|
            # nil must be tested first, otherwise `.empty?` raises NoMethodError on nil.
            if page_index_list.nil? || page_index_list.empty?
              raise "Empty indexes aren't allowed for extraction #{page_index_list}"
            end

            page_index_list.each do |page_index|
              # Indexes are zero-based, so the last valid one is total_pages - 1.
              raise "Index #{page_index} is out of range." if page_index.negative? || page_index >= total_pages
            end

            first_page = format('%03d', page_index_list.first + 1)
            last_page = format('%03d', page_index_list.last + 1)
            Mindee::Extraction::PdfExtractor::ExtractedPdf.new(
              cut_pages(page_index_list),
              "#{basename}_#{first_page}-#{last_page}#{extension}"
            )
          end
        end

        # rubocop:disable Metrics/CyclomaticComplexity
        # rubocop:disable Metrics/PerceivedComplexity
        # Extracts invoices as complete PDFs from the document.
        # Plain index lists are extracted as-is. Page groups are extracted as-is unless
        # +strict+ is set, in which case groups are regrouped using a 0.5 confidence threshold.
        # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
        # @param strict [Boolean]
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
        # @raise [RuntimeError] if no indexes are provided.
        def extract_invoices(page_indexes, strict: false)
          raise 'No indexes provided.' if page_indexes.empty?
          unless page_indexes.first.is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
            return extract_sub_documents(page_indexes)
          end
          return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

          # NOTE(review): the regrouping below is kept exactly as written. `current_list` aliases
          # the groups' own arrays (so `concat` mutates them) and is never reset after being
          # pushed, which looks like it can emit the same list twice - confirm intended behavior.
          correct_page_indexes = []
          current_list = []
          previous_confidence = nil
          page_indexes.each_with_index do |page_index, i|
            confidence = page_index.confidence
            page_list = page_index.page_indexes

            if confidence >= 0.5 && previous_confidence.nil?
              current_list = page_list
            elsif confidence >= 0.5 && i < page_indexes.length - 1
              correct_page_indexes << current_list
              current_list = page_list
            elsif confidence < 0.5 && i == page_indexes.length - 1
              current_list.concat page_list
              correct_page_indexes << current_list
            else
              correct_page_indexes << current_list
              correct_page_indexes << page_list
            end
            previous_confidence = confidence
          end
          extract_sub_documents(correct_page_indexes)
        end
        # rubocop:enable Metrics/CyclomaticComplexity
        # rubocop:enable Metrics/PerceivedComplexity

        private

        attr_reader :source_pdf, :filename
      end
    end
  end
end
Oops, something went wrong.