✨ add support for invoice splitter auto extraction (#107)

mindee · Sep 2, 2024 · 366ed5d · 366ed5d
1 parent 4920ed1
commit 366ed5d
Show file tree

Hide file tree

Showing 25 changed files with 745 additions and 339 deletions.
diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml
@@ -0,0 +1,55 @@
+#
+# Run Integration tests.
+#
+# Triggered on every push; runs `bundle exec rake integration` once per
+# OS / Ruby combination listed in the matrix below.
+#
+name: Integration Tests
+
+on:
+  - push
+
+jobs:
+  tests:
+    name: Test ${{ matrix.os }}, Ruby ${{ matrix.ruby }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os:
+          - "ubuntu-22.04"
+        ruby:
+          - "2.6"
+          - "2.7"
+          - "3.0"
+          - "3.1"
+          - "3.2"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: set up Ruby ${{ matrix.ruby }}
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+          bundler-cache: true
+
+      # apt-get is used for both commands: `apt` has no stable CLI for scripts.
+      - name: Install Ghostscript on Ubuntu
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ghostscript
+
+      # Only runs if a macOS runner is added to the matrix above.
+      - name: Install Ghostscript and ImageMagick on macOS
+        if: runner.os == 'macOS'
+        run: brew install ghostscript imagemagick
+
+      # Rewrites the PDF entry of the ImageMagick policy from rights="none"
+      # to rights="read|write".
+      - name: Change ImageMagick security policy on Ubuntu
+        if: runner.os == 'Linux'
+        run: |
+          DQT='"'
+          SRC="rights=${DQT}none${DQT} pattern=${DQT}PDF${DQT}"
+          RPL="rights=${DQT}read|write${DQT} pattern=${DQT}PDF${DQT}"
+          sudo sed -i "s/$SRC/$RPL/" /etc/ImageMagick-6/policy.xml
+
+      # The API key is injected from the repository secrets.
+      - name: Run Rspec for integration tests
+        env:
+          MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }}
+        run: |
+          bundle exec rake integration
diff --git a/Rakefile b/Rakefile
@@ -20,6 +20,11 @@ YARD::Rake::YardocTask.new(:doc) do |task|
   task.files = ['lib/**/*.rb']
 end
 
+# Runs only the spec files whose name ends in `_integration.rb`.
+desc 'Run integration tests'
+RSpec::Core::RakeTask.new(:integration) do |task|
+  task.pattern = 'spec/**/*_integration.rb'
+end
+
 Rake::Task[:doc].enhance do
   FileUtils.cp_r(
     File.join('docs', 'code_samples'),

diff --git a/examples/auto_invoice_splitter_extraction.rb b/examples/auto_invoice_splitter_extraction.rb
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+require 'mindee'
+
+# Init a new client
+mindee_client = Mindee::Client.new(api_key: 'my-api-key')
+
+# Load a file from disk
+input_source = mindee_client.source_from_path('/path/to/the/file.ext')
+
+# The extractor is only needed for PDFs; other inputs are sent to the Invoice API as-is.
+pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(input_source) if input_source.pdf?
+
+if pdf_extractor && pdf_extractor.page_count > 1
+  # Ask the Invoice Splitter API how the pages are grouped into invoices.
+  invoice_splitter_response = mindee_client.enqueue_and_parse(
+    input_source,
+    Mindee::Product::InvoiceSplitter::InvoiceSplitterV1
+  )
+  page_groups = invoice_splitter_response.document.inference.prediction.invoice_page_groups
+
+  # Cut the source PDF into one sub-PDF per page group, then parse each of them.
+  extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict: false)
+  extracted_pdfs.each do |extracted_pdf|
+    # Optional: Save the files locally
+    # extracted_pdf.write_to_file("output/path")
+
+    invoice_result = mindee_client.parse(
+      extracted_pdf.as_input_source,
+      Mindee::Product::Invoice::InvoiceV4
+    )
+    puts invoice_result.document
+  end
+else
+  # Single-page PDF or non-PDF input: parse it directly.
+  invoice_result = mindee_client.parse(
+    input_source,
+    Mindee::Product::Invoice::InvoiceV4
+  )
+  puts invoice_result.document
+end
diff --git a/lib/mindee/client.rb b/lib/mindee/client.rb
@@ -17,7 +17,7 @@ def initialize(api_key: '')
     # Call prediction API on a document and parse the results.
     #
     # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param endpoint [HTTP::Endpoint] Endpoint of the API
     # Doesn't need to be set in the case of OTS APIs.
     #
@@ -59,8 +59,8 @@ def parse(
 
     # Enqueue a document for async parsing
     #
+    # @param product_class [Mindee::Inference] class of the product
     # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
-    # @param product_class [Mindee::Product] class of the product
     # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
     # Doesn't need to be set in the case of OTS APIs.
     #
@@ -104,7 +104,7 @@ def enqueue(
     # Parses a queued document
     #
     # @param job_id [String] Id of the job (queue) to poll from
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API
     # Doesn't need to be set in the case of OTS APIs.
     #
@@ -123,7 +123,7 @@ def parse_queued(
     # Enqueue a document for async parsing and automatically try to retrieve it
     #
     # @param input_source [Mindee::Input::Source::LocalInputSource, Mindee::Input::Source::UrlInputSource]
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param endpoint [HTTP::Endpoint, nil] Endpoint of the API.
     #   Doesn't need to be set in the case of OTS APIs.
     # @param all_words [Boolean] Whether to extract all the words on each page.
@@ -184,7 +184,7 @@ def enqueue_and_parse(
 
     # Load a prediction.
     #
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     # @param local_response [Mindee::Input::LocalResponse]
     # @return [Mindee::Parsing::Common::ApiResponse]
     def load_prediction(product_class, local_response)
@@ -269,7 +269,7 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries)
     end
 
     # Creates an endpoint with the given values. Raises an error if the endpoint is invalid.
-    # @param product_class [Mindee::Product] class of the product
+    # @param product_class [Mindee::Inference] class of the product
     #
     # @param endpoint_name [String] For custom endpoints, the "API name" field in the "Settings" page of the
     #  API Builder. Do not set for standard (off the shelf) endpoints.

diff --git a/lib/mindee/extraction.rb b/lib/mindee/extraction.rb
@@ -1,3 +1,6 @@
 # frozen_string_literal: true
 
 require_relative 'extraction/tax_extractor'
+require_relative 'extraction/multi_receipts_extractor'
+require_relative 'extraction/common'
+require_relative 'extraction/pdf_extractor'
diff --git a/lib/mindee/image_extraction/common.rb → lib/mindee/extraction/common.rb b/lib/mindee/image_extraction/common.rb → lib/mindee/extraction/common.rb
@@ -1,3 +1,4 @@
 # frozen_string_literal: true
 
+require_relative 'common/extracted_image'
 require_relative 'common/image_extractor'
diff --git a/...mage_extraction/common/extracted_image.rb → ...ndee/extraction/common/extracted_image.rb b/...mage_extraction/common/extracted_image.rb → ...ndee/extraction/common/extracted_image.rb
diff --git a/...mage_extraction/common/image_extractor.rb → ...ndee/extraction/common/image_extractor.rb b/...mage_extraction/common/image_extractor.rb → ...ndee/extraction/common/image_extractor.rb
@@ -10,7 +10,7 @@
 module Mindee
   # Image Extraction Module.
   module ImageExtraction
-    def attach_image_as_new_file(input_buffer)
+    def self.attach_image_as_new_file(input_buffer)
       # Attaches an image as a new page in a PdfDocument object.
       #
       # @param [StringIO] input_buffer Input buffer. Only supports JPEG.
@@ -24,9 +24,7 @@ def attach_image_as_new_file(input_buffer)
       scale_factor = original_density[0].to_f / 4.166666 # No clue why but the resolution needs to be reduced for
       # the pdf otherwise the resulting image shrinks.
       magick_image.format('pdf', 0, { density: scale_factor.to_s })
-      io_buffer = StringIO.new
-      magick_image.write(io_buffer)
-      Origami::PDF.read(io_buffer)
+      Origami::PDF.read(StringIO.new(magick_image.to_blob))
     end
 
     # Extracts multiple images from a given local input source.

diff --git a/...ge_extraction/multi_receipts_extractor.rb → ...ee/extraction/multi_receipts_extractor.rb b/...ge_extraction/multi_receipts_extractor.rb → ...ee/extraction/multi_receipts_extractor.rb
diff --git a/...pts_extractor/multi_receipts_extractor.rb → ...pts_extractor/multi_receipts_extractor.rb b/...pts_extractor/multi_receipts_extractor.rb → ...pts_extractor/multi_receipts_extractor.rb
diff --git a/lib/mindee/extraction/pdf_extractor.rb b/lib/mindee/extraction/pdf_extractor.rb
@@ -0,0 +1,4 @@
+# frozen_string_literal: true
+
+require_relative 'pdf_extractor/pdf_extractor'
+require_relative 'pdf_extractor/extracted_pdf'
diff --git a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb b/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb
@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+
+module Mindee
+  # Pdf Extraction Module.
+  module Extraction
+    # Namespace for the Pdf extractor and the sub-Pdfs it produces.
+    module PdfExtractor
+      # An extracted sub-Pdf.
+      class ExtractedPdf
+        # Byte contents of the pdf
+        # @return [StringIO]
+        attr_reader :pdf_bytes
+
+        # Name of the file.
+        # @return [String]
+        attr_reader :filename
+
+        # @param pdf_bytes [StringIO] Stream holding the bytes of the sub-Pdf.
+        # @param filename [String] Name given to the sub-Pdf.
+        def initialize(pdf_bytes, filename)
+          @pdf_bytes = pdf_bytes
+          @filename = filename
+        end
+
+        # Retrieves the page count for a given pdf.
+        # @return [Integer]
+        # @raise [RuntimeError] when a TypeError occurs while opening the pdf.
+        def page_count
+          current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes)
+          current_pdf.pages.size
+        rescue TypeError
+          raise 'Could not retrieve page count from Extracted PDF object.'
+        end
+
+        # Writes the contents of the current PDF object to a file.
+        # A '.pdf' extension is appended when the given path does not already end with one.
+        # @param output_path [String] Path to write to.
+        # @return [Integer] Number of bytes written.
+        # @raise [RuntimeError] when the path is a directory, or when its parent directory does not exist.
+        def write_to_file(output_path)
+          raise 'Provided path is not a file' if File.directory?(output_path)
+          raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path))
+
+          output_path = "#{output_path}.pdf" unless File.extname(output_path).downcase == '.pdf'
+
+          # Write the stream's contents (not the stream object itself), in binary mode.
+          File.binwrite(output_path, raw_bytes)
+        end
+
+        # Returns the current PDF object as a usable BytesInputSource.
+        # @return [Mindee::Input::Source::BytesInputSource]
+        def as_input_source
+          Mindee::Input::Source::BytesInputSource.new(raw_bytes, @filename)
+        end
+
+        private
+
+        # Full contents of the pdf, read from the start of the stream whatever its current position.
+        # Objects that are not streams are returned as-is.
+        # @return [String]
+        def raw_bytes
+          return @pdf_bytes unless @pdf_bytes.respond_to?(:read)
+
+          @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
+          @pdf_bytes.read
+        end
+      end
+    end
+  end
+end
diff --git a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb b/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb
@@ -0,0 +1,111 @@
+# frozen_string_literal: true
+
+module Mindee
+  # Pdf Extraction Module.
+  module Extraction
+    # Namespace for the Pdf extractor and the sub-Pdfs it produces.
+    module PdfExtractor
+      # Pdf extraction class.
+      class PdfExtractor
+        # Keeps the stream of a local input so that its pages can be counted and cut into sub-Pdfs.
+        # Inputs that are not pdfs are first converted through ImageExtraction.attach_image_as_new_file.
+        # @param local_input [Mindee::Input::Source::LocalInputSource]
+        def initialize(local_input)
+          @filename = local_input.filename
+          if local_input.pdf?
+            @source_pdf = local_input.io_stream
+          else
+            pdf_image = ImageExtraction.attach_image_as_new_file(local_input.io_stream)
+            io_buffer = StringIO.new
+            pdf_image.save(io_buffer)
+
+            @source_pdf = io_buffer
+          end
+        end
+
+        # Retrieves the page count for the Pdf object.
+        # @return [Integer]
+        def page_count
+          Mindee::PDF::PdfProcessor.open_pdf(@source_pdf).pages.size
+        end
+
+        # Creates a new Pdf from pages and save it into a buffer.
+        # @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
+        # @return [StringIO] The buffer containing the new Pdf.
+        def cut_pages(page_indexes)
+          options = {
+            page_indexes: page_indexes,
+          }
+
+          Mindee::PDF::PdfProcessor.parse(@source_pdf, options)
+        end
+
+        # Extract the sub-documents from the main pdf, based on the given list of page indexes.
+        # Each sub-document is named "<basename>_<first page>-<last page><extension>", with 1-based,
+        # zero-padded page numbers.
+        # @param page_indexes [Array<Array<Integer>>] List of page number to use for merging in the original Pdf.
+        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>] The extracted sub-Pdfs.
+        # @raise [RuntimeError] when a list of indexes is nil or empty, or when an index is out of range.
+        def extract_sub_documents(page_indexes)
+          extension = File.extname(@filename)
+          basename = File.basename(@filename, extension)
+          # Counted once: every call to page_count opens the source Pdf again.
+          total_pages = page_count
+
+          page_indexes.map do |page_index_list|
+            # nil must be tested first, it does not respond to empty?
+            if page_index_list.nil? || page_index_list.empty?
+              raise "Empty indexes aren't allowed for extraction #{page_index_list}"
+            end
+
+            page_index_list.each do |page_index|
+              # Indexes are zero-based, so the highest valid index is total_pages - 1.
+              raise "Index #{page_index} is out of range." if page_index.negative? || page_index >= total_pages
+            end
+
+            first_page = format('%03d', page_index_list.first + 1)
+            last_page = format('%03d', page_index_list.last + 1)
+            Mindee::Extraction::PdfExtractor::ExtractedPdf.new(
+              cut_pages(page_index_list),
+              "#{basename}_#{first_page}-#{last_page}#{extension}"
+            )
+          end
+        end
+
+        # rubocop:disable Metrics/CyclomaticComplexity
+        # rubocop:disable Metrics/PerceivedComplexity
+        # Extracts invoices as complete PDFs from the document.
+        # Plain lists of indexes are extracted as given. Invoice Splitter page groups are extracted as given
+        # unless strict is set, in which case they are regrouped using a confidence threshold of 0.5.
+        # NOTE(review): in strict mode, a first group below the threshold which is not also the last one adds
+        # an empty list, which extract_sub_documents rejects; `concat` also modifies the page_indexes array
+        # of a group in place. Confirm whether both are intended.
+        # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1PageGroup>]
+        # @param strict [Boolean]
+        # @return [Array<Mindee::Extraction::PdfExtractor::ExtractedPdf>]
+        # @raise [RuntimeError] when no indexes are provided.
+        def extract_invoices(page_indexes, strict: false)
+          raise 'No indexes provided.' if page_indexes.empty?
+          unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup)
+            return extract_sub_documents(page_indexes)
+          end
+          return extract_sub_documents(page_indexes.map(&:page_indexes)) unless strict
+
+          correct_page_indexes = []
+          current_list = []
+          previous_confidence = nil
+          page_indexes.each_with_index do |page_index, i|
+            confidence = page_index.confidence
+            page_list = page_index.page_indexes
+
+            if confidence >= 0.5 && previous_confidence.nil?
+              current_list = page_list
+            elsif confidence >= 0.5 && i < page_indexes.length - 1
+              correct_page_indexes << current_list
+              current_list = page_list
+            elsif confidence < 0.5 && i == page_indexes.length - 1
+              current_list.concat page_list
+              correct_page_indexes << current_list
+            else
+              correct_page_indexes << current_list
+              correct_page_indexes << page_list
+            end
+            previous_confidence = confidence
+          end
+          extract_sub_documents(correct_page_indexes)
+        end
+        # rubocop:enable Metrics/CyclomaticComplexity
+        # rubocop:enable Metrics/PerceivedComplexity
+
+        private
+
+        attr_reader :source_pdf, :filename
+      end
+    end
+  end
+end