Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ add support for invoice splitter auto extraction #107

Merged
merged 8 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions .github/workflows/integration-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Run Integration tests.
#
# Workflow that runs the integration test suite on every push.
name: Integration Tests

# Trigger: any push event.
on:
- push

jobs:
tests:
name: Test ${{ matrix.os }}, Ruby ${{ matrix.ruby }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
# Only an Ubuntu runner is listed here, so the step guarded by
# runner.os == 'macOS' further down never runs with this matrix.
os:
- "ubuntu-22.04"
# One job is started per Ruby version listed below.
ruby:
- "2.6"
- "2.7"
- "3.0"
- "3.1"
- "3.2"
steps:
# Check out the repository together with its git submodules.
- uses: actions/checkout@v4
with:
submodules: recursive

# Install the Ruby version selected by the matrix; bundler-cache is enabled.
- name: set up Ruby ${{ matrix.ruby }}
uses: ruby/setup-ruby@v1
with:
ruby-version: ${{ matrix.ruby }}
bundler-cache: true

# Linux only: installs Ghostscript through apt.
# NOTE(review): ImageMagick is not installed by this step; presumably it is
# already present on the runner image - confirm.
- name: Install Ghostscript on Ubuntu
if: runner.os == 'Linux'
run: |
sudo apt update
sudo apt-get install -y ghostscript

# macOS only: installs Ghostscript and ImageMagick through Homebrew.
- name: Install Ghostscript and ImageMagick on macOS
if: runner.os == 'macOS'
run: brew install ghostscript imagemagick
# Linux only: rewrites the PDF entry of /etc/ImageMagick-6/policy.xml from
# rights="none" to rights="read|write". DQT holds a double quote so the
# sed expression can be built without nested quoting.
- name: Change ImageMagick security policy on Ubuntu
if: runner.os == 'Linux'
run: |
DQT='"'
SRC="rights=${DQT}none${DQT} pattern=${DQT}PDF${DQT}"
RPL="rights=${DQT}read|write${DQT} pattern=${DQT}PDF${DQT}"
sudo sed -i "s/$SRC/$RPL/" /etc/ImageMagick-6/policy.xml

# Runs the `integration` rake task; MINDEE_API_KEY is populated from the
# MINDEE_API_KEY_SE_TESTS repository secret.
- name: Run Rspec for integration tests
env:
MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }}
run: |
bundle exec rake integration
5 changes: 5 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ YARD::Rake::YardocTask.new(:doc) do |task|
task.files = ['lib/**/*.rb']
end

# Registers the `integration` Rake task, which runs every file under `spec/`
# whose name ends in `_integration.rb`.
desc 'Run integration tests'
RSpec::Core::RakeTask.new(:integration) { |task| task.pattern = 'spec/**/*_integration.rb' }

Rake::Task[:doc].enhance do
FileUtils.cp_r(
File.join('docs', 'code_samples'),
Expand Down
43 changes: 43 additions & 0 deletions examples/auto_invoice_splitter_extraction.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# frozen_string_literal: true

require 'mindee'

# Example: split a multi-invoice PDF with the Invoice Splitter API, then run
# the Invoice API on each extracted sub-document. Single-page PDFs and non-PDF
# files are sent to the Invoice API directly.

# Init a new client
mindee_client = Mindee::Client.new(api_key: 'my-api-key')

# Load a file from disk
input_source = mindee_client.source_from_path('/path/to/the/file.ext')

# Splitting only makes sense for PDFs, so the extractor is only built for them.
# Note: `PdfExtractor` is a module; the class lives at `PdfExtractor::PdfExtractor`.
pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(input_source) if input_source.pdf?

if pdf_extractor && pdf_extractor.page_count > 1
  # Ask the Invoice Splitter which pages belong together.
  invoice_splitter_response = mindee_client.enqueue_and_parse(
    input_source,
    Mindee::Product::InvoiceSplitter::InvoiceSplitterV1
  )
  page_groups = invoice_splitter_response.document.inference.prediction.invoice_page_groups

  # Cut the original PDF into one sub-PDF per detected invoice.
  extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict: false)
  extracted_pdfs.each do |extracted_pdf|
    # Optional: Save the files locally
    # extracted_pdf.write_to_file("output/path")

    # `parse` takes the input source first, then the product class.
    invoice_result = mindee_client.parse(
      extracted_pdf.as_input_source,
      Mindee::Product::Invoice::InvoiceV4
    )
    puts invoice_result.document
  end
else
  # Single-page PDF or non-PDF input: nothing to split.
  invoice_result = mindee_client.parse(
    input_source,
    Mindee::Product::Invoice::InvoiceV4
  )
  puts invoice_result.document
end
12 changes: 6 additions & 6 deletions lib/mindee/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def initialize(api_key: '')
# Call prediction API on a document and parse the results.
#
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param endpoint [HTTP::Endpoint] Endpoint of the API
# Doesn't need to be set in the case of OTS APIs.
#
Expand Down Expand Up @@ -59,8 +59,8 @@ def parse(

# Enqueue a document for async parsing
#
# @param product_class [Mindee::Inference] class of the product
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
# @param product_class [Mindee::Product] class of the product
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
# Doesn't need to be set in the case of OTS APIs.
#
Expand Down Expand Up @@ -104,7 +104,7 @@ def enqueue(
# Parses a queued document
#
# @param job_id [String] Id of the job (queue) to poll from
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
# Doesn't need to be set in the case of OTS APIs.
#
Expand All @@ -123,7 +123,7 @@ def parse_queued(
# Enqueue a document for async parsing and automatically try to retrieve it
#
# @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
# Doesn't need to be set in the case of OTS APIs.
# @param all_words [Boolean] Whether to extract all the words on each page.
Expand Down Expand Up @@ -184,7 +184,7 @@ def enqueue_and_parse(

# Load a prediction.
#
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
# @param local_response [Mindee::Input::LocalResponse]
# @return [Mindee::Parsing::Common::ApiResponse]
def load_prediction(product_class, local_response)
Expand Down Expand Up @@ -269,7 +269,7 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries)
end

# Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
# @param product_class [Mindee::Product] class of the product
# @param product_class [Mindee::Inference] class of the product
#
# @param endpoint_name [String] For custom endpoints, the "API name" field in the "Settings" page of the
# API Builder. Do not set for standard (off the shelf) endpoints.
Expand Down
3 changes: 3 additions & 0 deletions lib/mindee/extraction.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# frozen_string_literal: true

require_relative 'extraction/tax_extractor'
require_relative 'extraction/multi_receipts_extractor'
require_relative 'extraction/common'
require_relative 'extraction/pdf_extractor'
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# frozen_string_literal: true

require_relative 'common/extracted_image'
require_relative 'common/image_extractor'
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
module Mindee
# Image Extraction Module.
module ImageExtraction
def attach_image_as_new_file(input_buffer)
def self.attach_image_as_new_file(input_buffer)
# Attaches an image as a new page in a PdfDocument object.
#
# @param [StringIO] input_buffer Input buffer. Only supports JPEG.
Expand All @@ -24,9 +24,7 @@ def attach_image_as_new_file(input_buffer)
scale_factor = original_density[0].to_f / 4.166666 # No clue why bit the resolution needs to be reduced for
# the pdf otherwise the resulting image shrinks.
magick_image.format('pdf', 0, { density: scale_factor.to_s })
io_buffer = StringIO.new
magick_image.write(io_buffer)
Origami::PDF.read(io_buffer)
Origami::PDF.read(StringIO.new(magick_image.to_blob))
end

# Extracts multiple images from a given local input source.
Expand Down
4 changes: 4 additions & 0 deletions lib/mindee/extraction/pdf_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# frozen_string_literal: true

require_relative 'pdf_extractor/pdf_extractor'
require_relative 'pdf_extractor/extracted_pdf'
55 changes: 55 additions & 0 deletions lib/mindee/extraction/pdf_extractor/extracted_pdf.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# frozen_string_literal: true

module Mindee
  # Pdf Extraction Module.
  module Extraction
    module PdfExtractor
      # An extracted sub-Pdf: an in-memory buffer of PDF bytes plus the name it should be saved/sent under.
      class ExtractedPdf
        # Byte contents of the pdf
        # @return [StringIO]
        attr_reader :pdf_bytes

        # Name of the file.
        # @return [String]
        attr_reader :filename

        # @param pdf_bytes [StringIO] Buffer holding the bytes of the extracted pdf.
        # @param filename [String] Name associated with the extracted pdf.
        def initialize(pdf_bytes, filename)
          @pdf_bytes = pdf_bytes
          @filename = filename
        end

        # Retrieves the page count for a given pdf.
        # @return [Integer]
        # @raise [RuntimeError] if the underlying call fails with a TypeError.
        def page_count
          current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
          current_pdf.pages.size
        rescue TypeError
          raise 'Could not retrieve page count from Extracted PDF object.'
        end

        # Writes the contents of the current PDF object to a file.
        # A `.pdf` extension is appended when the given path does not already end with one.
        # @param output_path [String] Path to write to.
        # @return [Integer] Number of bytes written.
        # @raise [RuntimeError] if the path is a directory or its parent directory does not exist.
        def write_to_file(output_path)
          raise 'Provided path is not a file' if File.directory?(output_path)
          raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))

          unless File.extname(output_path).downcase == '.pdf'
            base_path = File.expand_path('..', output_path)
            output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path)
          end

          # Binary mode: a PDF must be written byte-for-byte, without newline translation.
          File.binwrite(output_path, raw_bytes)
        end

        # Returns the current PDF object as a usable BytesInputSource.
        # @return [Mindee::Input::Source::BytesInputSource]
        def as_input_source
          Mindee::Input::Source::BytesInputSource.new(raw_bytes, @filename)
        end

        private

        # Full content of the buffer, independent of the current read position.
        # The buffer is rewound before and after reading so earlier reads do not truncate the result
        # and later readers start from the beginning.
        # @return [String]
        def raw_bytes
          return @pdf_bytes.to_s unless @pdf_bytes.respond_to?(:read)

          @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
          content = @pdf_bytes.read
          @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
          content
        end
      end
    end
  end
end
111 changes: 111 additions & 0 deletions lib/mindee/extraction/pdf_extractor/pdf_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# frozen_string_literal: true

module Mindee
  # Pdf Extraction Module.
  module Extraction
    # Pdf Extraction namespace.
    module PdfExtractor
      # Cuts a source document into sub-PDFs made of selected pages.
      class PdfExtractor
        # @param local_input [Mindee::Input::Source::LocalInputSource]
        def initialize(local_input)
          @filename = local_input.filename
          if local_input.pdf?
            @source_pdf = local_input.io_stream
          else
            # Non-PDF inputs are first attached to a new PDF so that the page-based methods below apply to them too.
            pdf_image = ImageExtraction.attach_image_as_new_file(local_input.io_stream)
            io_buffer = StringIO.new
            pdf_image.save(io_buffer)

            @source_pdf = io_buffer
          end
        end

        # Retrieves the page count for the Pdf object.
        # @return [Integer]
        def page_count
          Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
        end

        # Creates a new Pdf from pages and save it into a buffer.
        # @param page_indexes [Array<Integer>] List of page numbers to use for merging in the original Pdf.
        # @return [StringIO] The buffer containing the new Pdf.
        def cut_pages(page_indexes)
          options = {
            page_indexes: page_indexes,
          }

          Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
        end

        # Extract the sub-documents from the main pdf, based on the given list of page indexes.
        # Each sub-document is named `<basename>_<first page>-<last page><extension>`, with 1-based,
        # zero-padded page numbers.
        # @param page_indexes [Array<Array<Integer>>] Lists of 0-based page indexes, one list per sub-document.
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] The extracted sub-documents.
        # @raise [RuntimeError] if a list is nil or empty, or if an index is negative or past the last page.
        def extract_sub_documents(page_indexes)
          extension = File.extname(@filename)
          basename = File.basename(@filename, extension)
          total_pages = nil
          page_indexes.map do |page_index_list|
            # nil must be tested first: calling #empty? on nil would raise NoMethodError.
            if page_index_list.nil? || page_index_list.empty?
              raise "Empty indexes aren't allowed for extraction #{page_index_list}"
            end

            # Counting pages opens the PDF, so do it once (and only when there is something to check).
            total_pages ||= page_count
            page_index_list.each do |page_index|
              # Indexes are 0-based, so the highest valid index is total_pages - 1.
              raise "Index #{page_index} is out of range." if page_index.negative? || page_index >= total_pages
            end

            first_page = format('%03d', page_index_list.first + 1)
            last_page = format('%03d', page_index_list.last + 1)
            field_filename = "#{basename}_#{first_page}-#{last_page}#{extension}"
            Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), field_filename)
          end
        end

        # rubocop:disable Metrics/CyclomaticComplexity
        # rubocop:disable Metrics/PerceivedComplexity
        # Extracts invoices as complete PDFs from the document.
        #
        # Plain arrays of indexes are forwarded as-is. Page groups are forwarded one-to-one unless `strict` is
        # set, in which case groups are regrouped using a 0.5 confidence threshold before extraction.
        # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
        # @param strict [Boolean] Whether to regroup page groups based on their confidence.
        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
        # @raise [RuntimeError] if no indexes are provided.
        def extract_invoices(page_indexes, strict: false)
          raise 'No indexes provided.' if page_indexes.empty?
          unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
            return extract_sub_documents(page_indexes)
          end
          return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict

          # NOTE(review): with a single group of confidence >= 0.5 the loop below never appends anything, and a
          # low-confidence group in a non-final position re-appends the current list. The regrouping rules are
          # kept as they were; confirm the intended behaviour before relying on strict mode.
          correct_page_indexes = []
          current_list = []
          previous_confidence = nil
          page_indexes.each_with_index do |page_index, i|
            confidence = page_index.confidence
            page_list = page_index.page_indexes

            if confidence >= 0.5 && previous_confidence.nil?
              # Work on a copy so the later `concat` never alters the caller's page groups.
              current_list = page_list.dup
            elsif confidence >= 0.5 && i < page_indexes.length - 1
              correct_page_indexes << current_list
              current_list = page_list.dup
            elsif confidence < 0.5 && i == page_indexes.length - 1
              current_list.concat page_list
              correct_page_indexes << current_list
            else
              correct_page_indexes << current_list
              correct_page_indexes << page_list
            end
            previous_confidence = confidence
          end
          extract_sub_documents(correct_page_indexes)
        end
        # rubocop:enable Metrics/CyclomaticComplexity
        # rubocop:enable Metrics/PerceivedComplexity

        private

        attr_reader :source_pdf, :filename
      end
    end
  end
end
Loading
Loading