Skip to content

Commit

Permalink
✨ add support for invoice splitter auto extraction (#107)
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianMindee committed Sep 2, 2024
1 parent 4920ed1 commit 366ed5d
Show file tree
Hide file tree
Showing 25 changed files with 745 additions and 339 deletions.
55 changes: 55 additions & 0 deletions .github/workflows/integration-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Run Integration tests.
#
# Triggered on every push; one job runs per OS / Ruby combination listed in the matrix.
#
name: Integration Tests

on:
- push

jobs:
tests:
name: Test ${{ matrix.os }}, Ruby ${{ matrix.ruby }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os:
- "ubuntu-22.04"
ruby:
- "2.6"
- "2.7"
- "3.0"
- "3.1"
- "3.2"
steps:
# Check out the repository together with its git submodules.
- uses: actions/checkout@v4
with:
submodules: recursive

# Install the Ruby version selected by the matrix, with bundler-cache enabled.
- name: set up Ruby ${{ matrix.ruby }}
uses: ruby/setup-ruby@v1
with:
ruby-version: ${{ matrix.ruby }}
bundler-cache: true

- name: Install Ghostscript on Ubuntu
if: runner.os == 'Linux'
run: |
sudo apt update
sudo apt-get install -y ghostscript
# Only runs on macOS runners; the OS matrix above currently lists Ubuntu only.
- name: Install Ghostscript and ImageMagick on macOS
if: runner.os == 'macOS'
run: brew install ghostscript imagemagick
# Rewrites the PDF rule in ImageMagick's policy.xml from rights="none" to rights="read|write".
# DQT holds a literal double quote so the sed expression can be assembled without nested escaping.
- name: Change ImageMagick security policy on Ubuntu
if: runner.os == 'Linux'
run: |
DQT='"'
SRC="rights=${DQT}none${DQT} pattern=${DQT}PDF${DQT}"
RPL="rights=${DQT}read|write${DQT} pattern=${DQT}PDF${DQT}"
sudo sed -i "s/$SRC/$RPL/" /etc/ImageMagick-6/policy.xml
# Runs the `integration` rake task; the API key comes from the MINDEE_API_KEY_SE_TESTS secret.
- name: Run Rspec for integration tests
env:
MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }}
run: |
bundle exec rake integration
5 changes: 5 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ YARD::Rake::YardocTask.new(:doc) do |task|
task.files = ['lib/**/*.rb']
end

# `rake integration` runs every spec file whose name ends in `_integration.rb` under spec/.
desc 'Run integration tests'
RSpec::Core::RakeTask.new(:integration) { |task| task.pattern = 'spec/**/*_integration.rb' }

Rake::Task[:doc].enhance do
FileUtils.cp_r(
File.join('docs', 'code_samples'),
Expand Down
43 changes: 43 additions & 0 deletions examples/auto_invoice_splitter_extraction.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# frozen_string_literal: true

require 'mindee'

# Example: split a multi-page PDF with Invoice Splitter V1, then send each extracted
# sub-PDF to Invoice V4. Any other input is sent to Invoice V4 as a single document.

# Init a new client
mindee_client = Mindee::Client.new(api_key: 'my-api-key')

# Load a file from disk
input_source = mindee_client.source_from_path('/path/to/the/file.ext')

# Splitting is only attempted for PDF inputs, so the extractor is only built for those.
pdf_extractor = nil
pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(input_source) if input_source.pdf?

if pdf_extractor && pdf_extractor.page_count > 1
  # NOTE(review): the extractor keeps a reference to the input source's stream; confirm that
  # enqueue_and_parse leaves that stream open, since extract_invoices reads it afterwards.
  invoice_splitter_response = mindee_client.enqueue_and_parse(
    input_source,
    Mindee::Product::InvoiceSplitter::InvoiceSplitterV1
  )
  page_groups = invoice_splitter_response.document.inference.prediction.invoice_page_groups
  extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict: false)
  extracted_pdfs.each do |extracted_pdf|
    # Optional: Save the files locally
    # extracted_pdf.write_to_file("output/path")

    # The input source comes first, then the product class, as in the other parse calls.
    invoice_result = mindee_client.parse(
      extracted_pdf.as_input_source,
      Mindee::Product::Invoice::InvoiceV4
    )
    puts invoice_result.document
  end
else
  # Single-page PDFs and non-PDF inputs are parsed directly as one invoice.
  invoice_result = mindee_client.parse(
    input_source,
    Mindee::Product::Invoice::InvoiceV4
  )
  puts invoice_result.document
end
12 changes: 6 additions & 6 deletions lib/mindee/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def initialize(api_key: '')
# Call prediction API on a document and parse the results.
#
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param endpoint [HTTP::Endpoint] Endpoint of the API
# Doesn't need to be set in the case of OTS APIs.
#
Expand Down Expand Up @@ -59,8 +59,8 @@ def parse(

# Enqueue a document for async parsing
#
# @param product_class [Mindee::Inference] class of the product
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
# @param product_class [Mindee::Product] class of the product
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
# Doesn't need to be set in the case of OTS APIs.
#
Expand Down Expand Up @@ -104,7 +104,7 @@ def enqueue(
# Parses a queued document
#
# @param job_id [String] Id of the job (queue) to poll from
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
# Doesn't need to be set in the case of OTS APIs.
#
Expand All @@ -123,7 +123,7 @@ def parse_queued(
# Enqueue a document for async parsing and automatically try to retrieve it
#
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
# Doesn't need to be set in the case of OTS APIs.
# @param all_words [Boolean] Whether to extract all the words on each page.
Expand Down Expand Up @@ -184,7 +184,7 @@ def enqueue_and_parse(

# Load a prediction.
#
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param local_response [Mindee::Input::LocalResponse]
# @return [Mindee::Parsing::Common::ApiResponse]
def load_prediction(product_class, local_response)
Expand Down Expand Up @@ -269,7 +269,7 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries)
end

# Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
#
# @param endpoint_name [String] For custom endpoints, the "API name" field in the "Settings" page of the
# API Builder. Do not set for standard (off the shelf) endpoints.
Expand Down
3 changes: 3 additions & 0 deletions lib/mindee/extraction.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# frozen_string_literal: true

require_relative 'extraction/tax_extractor'
require_relative 'extraction/multi_receipts_extractor'
require_relative 'extraction/common'
require_relative 'extraction/pdf_extractor'
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# frozen_string_literal: true

require_relative 'common/extracted_image'
require_relative 'common/image_extractor'
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
module Mindee
# Image Extraction Module.
module ImageExtraction
def attach_image_as_new_file(input_buffer)
def self.attach_image_as_new_file(input_buffer)
# Attaches an image as a new page in a PdfDocument object.
#
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
Expand All @@ -24,9 +24,7 @@ def attach_image_as_new_file(input_buffer)
scale_factor = original_density[0].to_f / 4.166666 # No clue why bit the resolution needs to be reduced for
# the pdf otherwise the resulting image shrinks.
magick_image.format('pdf', 0, { density: scale_factor.to_s })
io_buffer = StringIO.new
magick_image.write(io_buffer)
Origami::PDF.read(io_buffer)
Origami::PDF.read(StringIO.new(magick_image.to_blob))
end

# Extracts multiple images from a given local input source.
Expand Down
4 changes: 4 additions & 0 deletions lib/mindee/extraction/pdf_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# frozen_string_literal: true

require_relative 'pdf_extractor/pdf_extractor'
require_relative 'pdf_extractor/extracted_pdf'
55 changes: 55 additions & 0 deletions lib/mindee/extraction/pdf_extractor/extracted_pdf.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# frozen_string_literal: true

module Mindee
  # Pdf Extraction Module.
  module Extraction
    # Namespace for the PDF extraction classes.
    module PdfExtractor
      # An extracted sub-Pdf.
      class ExtractedPdf
        # Byte contents of the pdf
        # @return [StringIO]
        attr_reader :pdf_bytes

        # Name of the file.
        # @return [String]
        attr_reader :filename

        # @param pdf_bytes [StringIO] Buffer holding the PDF contents.
        # @param filename [String] Name associated with the extracted PDF.
        def initialize(pdf_bytes, filename)
          @pdf_bytes = pdf_bytes
          @filename = filename
        end

        # Retrieves the page count for a given pdf.
        # @return [Integer]
        # @raise [RuntimeError] when opening the buffer or reading its pages raises a TypeError.
        def page_count
          current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
          current_pdf.pages.size
        rescue TypeError
          raise 'Could not retrieve page count from Extracted PDF object.'
        end

        # Writes the contents of the current PDF object to a file.
        # A '.pdf' extension is appended when the given path does not already end with one.
        # @param output_path [String] Path to write to.
        # @return [Integer] Number of bytes written.
        # @raise [RuntimeError] if the path is a directory or its parent directory does not exist.
        def write_to_file(output_path)
          raise 'Provided path is not a file' if File.directory?(output_path)
          raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))

          output_path = "#{output_path}.pdf" unless File.extname(output_path).downcase == '.pdf'

          # Binary mode: PDF data must not go through newline or encoding conversion.
          File.binwrite(output_path, raw_bytes)
        end

        # Returns the current PDF object as a usable BytesInputSource.
        # @return [Mindee::Input::Source::BytesInputSource]
        def as_input_source
          Mindee::Input::Source::BytesInputSource.new(raw_bytes, @filename)
        end

        private

        # Reads the whole buffer from its start, regardless of where the cursor currently is,
        # and leaves the cursor back at the start afterwards.
        # @return [String]
        def raw_bytes
          return @pdf_bytes.to_s unless @pdf_bytes.respond_to?(:read)

          can_rewind = @pdf_bytes.respond_to?(:rewind)
          @pdf_bytes.rewind if can_rewind
          contents = @pdf_bytes.read
          @pdf_bytes.rewind if can_rewind
          contents
        end
      end
    end
  end
end
111 changes: 111 additions & 0 deletions lib/mindee/extraction/pdf_extractor/pdf_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# frozen_string_literal: true

module Mindee
  # Pdf Extraction Module.
  module Extraction
    # Namespace for the PDF extraction classes.
    module PdfExtractor
      # Cuts a source PDF into sub-documents made of selected pages.
      class PdfExtractor
        # In strict mode, page groups scoring below this confidence are merged into the previous group.
        MIN_GROUP_CONFIDENCE = 0.5
        private_constant :MIN_GROUP_CONFIDENCE

        # Non-PDF inputs are first converted into a PDF held in an in-memory buffer.
        # @param local_input [Mindee::Input::Source::LocalInputSource]
        def initialize(local_input)
          @filename = local_input.filename
          if local_input.pdf?
            @source_pdf = local_input.io_stream
          else
            pdf_image = ImageExtraction.attach_image_as_new_file(local_input.io_stream)
            io_buffer = StringIO.new
            pdf_image.save(io_buffer)

            @source_pdf = io_buffer
          end
        end

        # Retrieves the page count for the Pdf object.
        # @return [Integer]
        def page_count
          Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
        end

        # Creates a new Pdf from pages and saves it into a buffer.
        # @param page_indexes [Array<Integer>] Indexes of the pages of the original Pdf to keep.
        # @return [StringIO] The buffer containing the new Pdf.
        def cut_pages(page_indexes)
          options = {
            page_indexes: page_indexes,
          }

          Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
        end

        # Extracts the sub-documents from the main pdf, based on the given lists of page indexes.
        # Page indexes are zero-based; generated file names use one-based page numbers.
        # @param page_indexes [Array<Array<Integer>>] One list of page indexes per sub-document.
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] One extracted Pdf per list.
        # @raise [RuntimeError] if a list is nil or empty, or if an index is outside the document.
        def extract_sub_documents(page_indexes)
          extension = File.extname(@filename)
          basename = File.basename(@filename, extension)
          # Counted once up front: every call to page_count opens the source PDF again.
          total_pages = page_count

          page_indexes.map do |page_index_list|
            # nil must be tested first, otherwise empty? raises NoMethodError on nil.
            if page_index_list.nil? || page_index_list.empty?
              raise "Empty indexes aren't allowed for extraction #{page_index_list}"
            end

            page_index_list.each do |page_index|
              # Zero-based indexes: the last valid one is total_pages - 1.
              raise "Index #{page_index} is out of range." if page_index.negative? || page_index >= total_pages
            end

            first_page = format('%03d', page_index_list.first + 1)
            last_page = format('%03d', page_index_list.last + 1)
            Mindee::Extraction::PdfExtractor::ExtractedPdf.new(
              cut_pages(page_index_list),
              "#{basename}_#{first_page}-#{last_page}#{extension}"
            )
          end
        end

        # Extracts invoices as complete PDFs from the document.
        # Plain arrays of indexes are extracted as given. Page groups are extracted one PDF per
        # group, unless strict is set, in which case low-confidence groups are merged first.
        # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
        # @param strict [Boolean] Whether to merge groups whose confidence is below 0.5 into the previous group.
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
        # @raise [RuntimeError] if no indexes are provided.
        def extract_invoices(page_indexes, strict: false)
          raise 'No indexes provided.' if page_indexes.empty?
          unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
            return extract_sub_documents(page_indexes)
          end
          return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

          extract_sub_documents(merge_low_confidence_groups(page_indexes))
        end

        private

        attr_reader :source_pdf, :filename

        # Builds the page lists for strict extraction: a group below the confidence threshold is
        # appended to the list before it, any other group starts a new list.
        # Lists are copied so the page groups passed in are never modified.
        # NOTE(review): merging mid-sequence low-confidence groups is inferred from how the previous
        # implementation treated a trailing low-confidence group; confirm this is the intended rule.
        # @param page_groups [Array<InvoiceSplitterV1PageGroup>]
        # @return [Array<Array<Integer>>]
        def merge_low_confidence_groups(page_groups)
          page_groups.each_with_object([]) do |group, merged|
            if merged.empty? || group.confidence >= MIN_GROUP_CONFIDENCE
              merged << group.page_indexes.dup
            else
              merged.last.concat(group.page_indexes)
            end
          end
        end
      end
    end
  end
end
Loading

0 comments on commit 366ed5d

Please sign in to comment.