Add text extraction and page objects support models (#110)

pypdfium2-team · May 20, 2022 · bbc2438 · bbc2438
1 parent 9d5f996
commit bbc2438
Show file tree

Hide file tree

Showing 24 changed files with 484 additions and 62 deletions.
diff --git a/.reuse/dep5 b/.reuse/dep5
@@ -39,9 +39,18 @@ Copyright: 2022 PDFium Developers
            2022 geisserml <geisserml@gmail.com>
 License: BSD-3-Clause OR Apache-2.0
 
+Files: tests/resources/images.pdf
+Copyright: 2022 geisserml <geisserml@gmail.com>
+           2022 Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
+License: LicenseRef-FairUse
+Comment:
+    Contains `mono.png` from the img2pdf test suite.
+
 Files: tests/resources/render.pdf
        tests/resources/multipage.pdf
        tests/resources/encrypted.pdf
+       tests/resources/text.pdf
+       tests/resources/empty.pdf
 Copyright: 2022 geisserml <geisserml@gmail.com>
 License: CC-BY-4.0
 

diff --git a/docs/source/changelog.md b/docs/source/changelog.md
@@ -4,6 +4,14 @@
 # Changelog
 
 
+## 1.10.0 (sched 2022-??-??)
+
+- Updated PDFium from `5065` to `????`.
+- Added cropping capabilities to the rendering engine.
+- Added a support model for text extraction. Thanks to Mike Kroutikov for the `pdf_text_page` example in `pdfbrain`.
+- Added a support model for locating page objects. Thanks to the `doctr` project for the idea.
+
+
 ## 1.9.1 (2022-05-16)
 
 - Bugfix release to address incompatibility of the CLI with Python 3.6, caused by recent changes.

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -29,6 +29,7 @@
     'members': True,
     'undoc-members': True,
     'member-order': 'bysource',
+    'special-members': "__len__",
 }
 intersphinx_mapping = {
     'python': ('https://docs.python.org/3', None),

diff --git a/docs/source/python_api.rst b/docs/source/python_api.rst
@@ -1,26 +1,18 @@
 .. SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
 .. SPDX-License-Identifier: CC-BY-4.0
 
-==========
 Python API
 ==========
 
 
 Version
-=======
+*******
 .. automodule:: pypdfium2._version
 
 
-Object oriented API
-===================
-.. automodule:: pypdfium2._helpers.classes
-
-
-Functional API
-==============
-
 Opener
 ******
+.. automodule:: pypdfium2._helpers.classes
 .. automodule:: pypdfium2._helpers.opener
     :exclude-members: open_pdf_auto, close_pdf
 
@@ -40,6 +32,16 @@ Text Inserter
 *************
 .. automodule:: pypdfium2._helpers.text_inserter
 
+Text Extractor
+**************
+.. versionadded:: 1.10.0
+.. automodule:: pypdfium2._helpers.text_extractor
+
+Page Objects
+************
+.. versionadded:: 1.10.0
+.. automodule:: pypdfium2._helpers.pageobjects
+
 TOC Parser
 **********
 .. automodule:: pypdfium2._helpers.toc
@@ -58,6 +60,6 @@ Error Handler
 
 
 Constants
-=========
+*********
 
 .. automodule:: pypdfium2._helpers.constants
diff --git a/src/pypdfium2/_cli/_parsers.py b/src/pypdfium2/_cli/_parsers.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
+# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+
+def pagetext_type(value):
+    """
+    Convert a page selection string into a list of zero-based page indices.
+
+    ``value`` is split on commas. Each item is either a single one-based page
+    number or a ``start-end`` range that is inclusive on both ends. A range whose
+    start is not smaller than its end is expanded in descending order.
+
+    Examples: ``"1,3-5"`` gives ``[0, 2, 3, 4]``; ``"5-3"`` gives ``[4, 3, 2]``.
+
+    Returns ``None`` if ``value`` is falsy, otherwise the indices in the order given.
+    Raises ``ValueError`` if an item is not an integer or contains more than one ``-``.
+    """
+
+    if not value:
+        return
+
+    page_indices = []
+    splitted = value.split(',')
+
+    for page_or_range in splitted:
+
+        if '-' in page_or_range:
+
+            # Unpacking into exactly two names rejects items such as "1-2-3".
+            start, end = page_or_range.split('-')
+            # Input is one-based, the returned indices are zero-based.
+            start = int(start) - 1
+            end = int(end) - 1
+
+            if start < end:
+                # Ascending range, `end` included.
+                pages = [i for i in range(start, end+1)]
+            else:
+                # Descending range, `end` included; start == end yields a single index.
+                pages = [i for i in range(start, end-1, -1)]
+
+            page_indices.extend(pages)
+
+        else:
+
+            page_indices.append(int(page_or_range) - 1)
+
+    return page_indices
diff --git a/src/pypdfium2/_cli/extract_text.py b/src/pypdfium2/_cli/extract_text.py
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
+# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+import os.path
+from pypdfium2 import _namespace as pdfium
+from pypdfium2._cli._parsers import pagetext_type
+
+
+def attach_parser(subparsers):
+    """
+    Register the ``extract-text`` subcommand and its arguments on ``subparsers``.
+
+    The created parser is not returned; the function only has the side effect of
+    adding it to ``subparsers``.
+    """
+    parser = subparsers.add_parser(
+        "extract-text",
+        help = "Extract text from a PDF page in given boundaries",
+    )
+    parser.add_argument(
+        "input",
+        type = os.path.abspath,
+        help = "Path to the PDF document to work with",
+    )
+    parser.add_argument(
+        "--password",
+        help = "Password to unlock the PDF, if encrypted",
+    )
+    # Parsed by pagetext_type into zero-based indices; stays None if the option is omitted.
+    parser.add_argument(
+        "--pages",
+        help = "Page numbers to include (defaults to all)",
+        type = pagetext_type,
+    )
+    # The four boundary options below all default to 0.
+    # NOTE(review): presumably all-zero boundaries mean "whole page" in get_text() - confirm there.
+    parser.add_argument(
+        "--left",
+        type = int,
+        default = 0,
+        help = "Left coordinate of the area to search for text.",
+    )
+    parser.add_argument(
+        "--bottom",
+        type = int,
+        default = 0,
+        help = "Bottom coordinate of the area to search for text.",
+    )
+    parser.add_argument(
+        "--right",
+        type = int,
+        default = 0,
+        help = "Right coordinate of the area to search for text.",
+    )
+    parser.add_argument(
+        "--top",
+        type = int,
+        default = 0,
+        help = "Top coordinate of the area to search for text.",
+    )
+
+
+def main(args):
+    """
+    Print the text of each selected page, preceded by a ``# Page N`` heading.
+
+    ``args`` is the namespace produced by the ``extract-text`` parser. If no pages
+    were selected, every page of the document is processed.
+    """
+
+    # NOTE(review): doc and textpage are closed explicitly below, but not if an
+    # exception is raised in between.
+    doc = pdfium.PdfDocument(args.input, args.password)
+    if args.pages is None:
+        args.pages = [i for i in range(len(doc))]
+
+    # Empty for the first page, a newline afterwards, so an extra line break is
+    # written before every page section except the first.
+    sep = ''
+    for index in args.pages:
+        textpage = doc.get_textpage(index)
+        text = textpage.get_text(
+            left = args.left,
+            bottom = args.bottom,
+            right = args.right,
+            top = args.top,
+        )
+        textpage.close()
+        # Indices are zero-based, the heading shows the one-based page number.
+        print(sep + "# Page %s\n" % (index+1) + text)
+        sep = '\n'
+
+    doc.close()
diff --git a/src/pypdfium2/_cli/find_pageobjects.py b/src/pypdfium2/_cli/find_pageobjects.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
+# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
+
+import os.path
+from pypdfium2 import _namespace as pdfium
+from pypdfium2._cli._parsers import pagetext_type
+
+
+# Maps the names accepted by the --type option to the page object type constants
+# exposed through the pdfium namespace.
+NameToObjtype = dict(
+    unknown = pdfium.FPDF_PAGEOBJ_UNKNOWN,
+    text    = pdfium.FPDF_PAGEOBJ_TEXT,
+    path    = pdfium.FPDF_PAGEOBJ_PATH,
+    image   = pdfium.FPDF_PAGEOBJ_IMAGE,
+    shading = pdfium.FPDF_PAGEOBJ_SHADING,
+    form    = pdfium.FPDF_PAGEOBJ_FORM,
+)
+
+
+def attach_parser(subparsers):
+    """
+    Register the ``find-pageobjects`` subcommand and its arguments on ``subparsers``.
+
+    The created parser is not returned; the function only has the side effect of
+    adding it to ``subparsers``.
+    """
+    parser = subparsers.add_parser(
+        "find-pageobjects",
+        help = "Locate page objects of a certain type",
+    )
+    parser.add_argument(
+        "input",
+        type = os.path.abspath,
+        help = "Path to the PDF document to work with",
+    )
+    parser.add_argument(
+        "--password",
+        help = "Password to unlock the PDF, if encrypted"
+    )
+    # Parsed by pagetext_type into zero-based indices; stays None if the option is omitted.
+    parser.add_argument(
+        "--pages",
+        type = pagetext_type,
+        help = "The pages to search (defaults to all)",
+    )
+    # Exactly one of the NameToObjtype keys must be given.
+    parser.add_argument(
+        "--type",
+        required = True,
+        choices = [k for k in NameToObjtype.keys()],
+        help = "Object types to consider",
+    )
+
+
+def main(args):
+    """
+    Print one line per page object of the requested type on the selected pages.
+
+    ``args`` is the namespace produced by the ``find-pageobjects`` parser. If no
+    pages were selected, every page of the document is searched.
+    """
+
+    # NOTE(review): doc and page are closed explicitly below, but not if an
+    # exception is raised in between.
+    doc = pdfium.PdfDocument(args.input, args.password)
+    # Replace the type name given on the command line with its constant.
+    args.type = NameToObjtype[args.type]
+    if args.pages is None:
+        args.pages = [i for i in range(len(doc))]
+
+    for index in args.pages:
+        page = doc.get_page(index)
+        pageobjs = pdfium.get_pageobjs(page)
+        for obj in pdfium.filter_pageobjs(pageobjs, args.type):
+            # Prints whatever locate_pageobj() returns for the object.
+            # NOTE(review): presumably its position on the page - confirm in _helpers.pageobjects.
+            print( pdfium.locate_pageobj(obj) )
+        pdfium.close_page(page)
+
+    doc.close()
diff --git a/src/pypdfium2/_cli/main.py b/src/pypdfium2/_cli/main.py
@@ -13,6 +13,8 @@
     toc,
     merge,
     tile,
+    extract_text,
+    find_pageobjects,
 )
 
 try:
@@ -23,12 +25,14 @@
     have_argcomplete = True
 
 
-Subcommands = dict(
-    render = render,
-    toc = toc,
-    merge = merge,
-    tile = tile,
-)
+Subcommands = {
+    "render": render,
+    "toc": toc,
+    "merge": merge,
+    "tile": tile,
+    "extract-text": extract_text,
+    "find-pageobjects": find_pageobjects,
+}
 
 
 def parse_args(argv=sys.argv[1:]):

diff --git a/src/pypdfium2/_cli/render.py b/src/pypdfium2/_cli/render.py
@@ -3,13 +3,14 @@
 
 import os
 import ast
-from pypdfium2 import _namespace as pdfium
 from os.path import (
     join,
     abspath,
     basename,
     splitext,
 )
+from pypdfium2 import _namespace as pdfium
+from pypdfium2._cli._parsers import pagetext_type
 
 
 def rotation_type(string):
@@ -20,21 +21,16 @@ def rotation_type(string):
 
 
 def colour_type(string):
-
     if string.lower() == 'none':
         return
-
     else:
-
         colour = ast.literal_eval(string)
-
         if not isinstance(colour, (tuple, list)):
             raise ValueError("Invalid colour type %s. Must be list or tuple." % type(colour))
         if not len(colour) in (3, 4):
             raise ValueError("Invalid number of colour values. Must be 3 or 4.")
         if not all(isinstance(val, int) and 0 <= val <= 255 for val in colour):
             raise ValueError("Colour values must be integers ranging from 0 to 255.")
-
         return colour
 
 
@@ -45,36 +41,6 @@ def crop_type(string):
     return crop
 
 
-def pagetext_type(value):
-
-    if not value:
-        return
-
-    page_indices = []
-    splitted = value.split(',')
-
-    for page_or_range in splitted:
-
-        if '-' in page_or_range:
-
-            start, end = page_or_range.split('-')
-            start = int(start) - 1
-            end = int(end) - 1
-
-            if start < end:
-                pages = [i for i in range(start, end+1)]
-            else:
-                pages = [i for i in range(start, end-1, -1)]
-
-            page_indices.extend(pages)
-
-        else:
-
-            page_indices.append(int(page_or_range) - 1)
-
-    return page_indices
-
-
 def attach_parser(subparsers):
     parser = subparsers.add_parser(
         "render",

diff --git a/src/pypdfium2/_helpers/__init__.py b/src/pypdfium2/_helpers/__init__.py
@@ -15,3 +15,5 @@
 from pypdfium2._helpers.boxes import *
 from pypdfium2._helpers.classes import *
 from pypdfium2._helpers.text_inserter import *
+from pypdfium2._helpers.text_extractor import *
+from pypdfium2._helpers.pageobjects import *
diff --git a/src/pypdfium2/_helpers/boxes.py b/src/pypdfium2/_helpers/boxes.py
@@ -7,7 +7,7 @@
 
 def _get_box(page, box_function, fallback_function):
 
-    left, bottom, right, top = c_float(), c_float(), c_float(), c_float()
+    left, bottom, right, top = [c_float() for _i in range(4)]
 
     ret_code = box_function(page, byref(left), byref(bottom), byref(right), byref(top))
     if not ret_code: