Skip to content

Commit

Permalink
Add text extraction and page objects support models (#110)
Browse files Browse the repository at this point in the history
  • Loading branch information
mara004 committed May 20, 2022
1 parent 9d5f996 commit bbc2438
Show file tree
Hide file tree
Showing 24 changed files with 484 additions and 62 deletions.
9 changes: 9 additions & 0 deletions .reuse/dep5
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,18 @@ Copyright: 2022 PDFium Developers
2022 geisserml <geisserml@gmail.com>
License: BSD-3-Clause OR Apache-2.0

Files: tests/resources/images.pdf
Copyright: 2022 geisserml <geisserml@gmail.com>
2022 Johannes Schauer Marin Rodrigues <josch@mister-muffin.de>
License: LicenseRef-FairUse
Comments:
Contains `mono.png` from the img2pdf test suite.

Files: tests/resources/render.pdf
tests/resources/multipage.pdf
tests/resources/encrypted.pdf
tests/resources/text.pdf
tests/resources/empty.pdf
Copyright: 2022 geisserml <geisserml@gmail.com>
License: CC-BY-4.0

Expand Down
8 changes: 8 additions & 0 deletions docs/source/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
# Changelog


## 1.10.0 (sched 2022-??-??)

- Updated PDFium from `5065` to `????`.
- Added cropping capabilities to the rendering engine.
- Added a support model for text extraction. Thanks to Mike Kroutikov for the `pdf_text_page` example in `pdfbrain`.
- Added a support model for locating page objects. Thanks to the `doctr` project for the idea.


## 1.9.1 (2022-05-16)

- Bugfix release to address incompatibility of the CLI with Python 3.6, caused by recent changes.
Expand Down
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
'members': True,
'undoc-members': True,
'member-order': 'bysource',
'special-members': "__len__",
}
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None),
Expand Down
24 changes: 13 additions & 11 deletions docs/source/python_api.rst
Original file line number Diff line number Diff line change
@@ -1,26 +1,18 @@
.. SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
.. SPDX-License-Identifier: CC-BY-4.0
==========
Python API
==========


Version
=======
*******
.. automodule:: pypdfium2._version


Object oriented API
===================
.. automodule:: pypdfium2._helpers.classes


Functional API
==============

Opener
******
.. automodule:: pypdfium2._helpers.classes
.. automodule:: pypdfium2._helpers.opener
:exclude-members: open_pdf_auto, close_pdf

Expand All @@ -40,6 +32,16 @@ Text Inserter
*************
.. automodule:: pypdfium2._helpers.text_inserter

Text Extractor
**************
.. versionadded:: 1.10.0
.. automodule:: pypdfium2._helpers.text_extractor

Page Objects
************
.. versionadded:: 1.10.0
.. automodule:: pypdfium2._helpers.pageobjects

TOC Parser
**********
.. automodule:: pypdfium2._helpers.toc
Expand All @@ -58,6 +60,6 @@ Error Handler


Constants
=========
*********

.. automodule:: pypdfium2._helpers.constants
32 changes: 32 additions & 0 deletions src/pypdfium2/_cli/_parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause


def _page_index(number):
    """Convert a one-based page number string to a zero-based page index."""
    page = int(number)
    if page < 1:
        # Without this check, page 0 would silently become index -1.
        raise ValueError("Invalid page number %s. Page numbers start at 1." % page)
    return page - 1


def pagetext_type(value):
    """
    Parse a page selection string into a list of zero-based page indices.

    Parameters:
        value (str):
            Comma-separated, one-based page numbers and/or inclusive ranges
            written as ``start-end``, e.g. ``"1,3-5,9-7"``. If the start of a
            range is not smaller than its end, the range is expanded in
            descending order.

    Returns:
        list or None: The zero-based page indices in the order they were
        given, or ``None`` if *value* is empty.

    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair, or
            if a page number is smaller than 1.
    """

    if not value:
        return None

    page_indices = []

    for page_or_range in value.split(','):

        if '-' in page_or_range:
            # Unpacking fails with ValueError unless there are exactly two parts.
            start, end = [_page_index(num) for num in page_or_range.split('-')]
            step = 1 if start < end else -1
            page_indices.extend(range(start, end + step, step))
        else:
            page_indices.append(_page_index(page_or_range))

    return page_indices
73 changes: 73 additions & 0 deletions src/pypdfium2/_cli/extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause

import os.path
from pypdfium2 import _namespace as pdfium
from pypdfium2._cli._parsers import pagetext_type


def attach_parser(subparsers):
    """
    Register the ``extract-text`` subcommand and its command-line arguments.

    Parameters:
        subparsers:
            The sub-parser collection to add the command to. It only needs to
            provide ``add_parser()``.
    """
    extract_parser = subparsers.add_parser(
        "extract-text",
        help = "Extract text from a PDF page in given boundaries",
    )
    extract_parser.add_argument(
        "input",
        type = os.path.abspath,
        help = "Path to the PDF document to work with",
    )
    extract_parser.add_argument(
        "--password",
        help = "Password to unlock the PDF, if encrypted",
    )
    extract_parser.add_argument(
        "--pages",
        type = pagetext_type,
        help = "Page numbers to include (defaults to all)",
    )
    # The four boundary options differ only in the name of the side.
    for side in ("left", "bottom", "right", "top"):
        extract_parser.add_argument(
            "--" + side,
            type = int,
            default = 0,
            help = "%s coordinate of the area to search for text." % side.capitalize(),
        )


def main(args):
    """
    Run the ``extract-text`` subcommand.

    Opens the document, extracts the text of every selected page within the
    requested boundaries and prints it below a ``# Page N`` heading.

    Parameters:
        args:
            Parsed command-line arguments providing ``input``, ``password``,
            ``pages``, ``left``, ``bottom``, ``right`` and ``top``. If
            ``pages`` is None, it is set to the indices of all pages.
    """

    doc = pdfium.PdfDocument(args.input, args.password)

    # try/finally ensures the text page and the document are released even if
    # extraction or printing fails part-way through.
    try:

        if args.pages is None:
            args.pages = list(range(len(doc)))

        sep = ''
        for index in args.pages:
            textpage = doc.get_textpage(index)
            try:
                text = textpage.get_text(
                    left = args.left,
                    bottom = args.bottom,
                    right = args.right,
                    top = args.top,
                )
            finally:
                textpage.close()
            print(sep + "# Page %s\n" % (index+1) + text)
            # Separate all but the first page with a blank line.
            sep = '\n'

    finally:
        doc.close()
60 changes: 60 additions & 0 deletions src/pypdfium2/_cli/find_pageobjects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause

import os.path
from pypdfium2 import _namespace as pdfium
from pypdfium2._cli._parsers import pagetext_type


# Maps the names accepted by the ``--type`` option to the corresponding
# ``FPDF_PAGEOBJ_*`` constants.
NameToObjtype = {
    "unknown": pdfium.FPDF_PAGEOBJ_UNKNOWN,
    "text": pdfium.FPDF_PAGEOBJ_TEXT,
    "path": pdfium.FPDF_PAGEOBJ_PATH,
    "image": pdfium.FPDF_PAGEOBJ_IMAGE,
    "shading": pdfium.FPDF_PAGEOBJ_SHADING,
    "form": pdfium.FPDF_PAGEOBJ_FORM,
}


def attach_parser(subparsers):
    """
    Register the ``find-pageobjects`` subcommand and its command-line arguments.

    Parameters:
        subparsers:
            The sub-parser collection to add the command to. It only needs to
            provide ``add_parser()``.
    """
    finder = subparsers.add_parser(
        "find-pageobjects",
        help = "Locate page objects of a certain type",
    )
    finder.add_argument(
        "input",
        help = "Path to the PDF document to work with",
        type = os.path.abspath,
    )
    finder.add_argument(
        "--password",
        help = "Password to unlock the PDF, if encrypted",
    )
    finder.add_argument(
        "--pages",
        help = "The pages to search (defaults to all)",
        type = pagetext_type,
    )
    # Iterating a dict yields its keys, i.e. the supported type names.
    type_names = list(NameToObjtype)
    finder.add_argument(
        "--type",
        help = "Object types to consider",
        choices = type_names,
        required = True,
    )


def main(args):
    """
    Run the ``find-pageobjects`` subcommand.

    Opens the document and, for every selected page, prints the location of
    each page object of the requested type.

    Parameters:
        args:
            Parsed command-line arguments providing ``input``, ``password``,
            ``pages`` and ``type``. ``type`` is replaced with the matching
            constant from ``NameToObjtype``. If ``pages`` is None, it is set
            to the indices of all pages.
    """

    doc = pdfium.PdfDocument(args.input, args.password)

    # try/finally ensures the page and the document are released even if
    # searching or printing fails part-way through.
    try:

        args.type = NameToObjtype[args.type]
        if args.pages is None:
            args.pages = list(range(len(doc)))

        for index in args.pages:
            page = doc.get_page(index)
            try:
                pageobjs = pdfium.get_pageobjs(page)
                for obj in pdfium.filter_pageobjs(pageobjs, args.type):
                    print( pdfium.locate_pageobj(obj) )
            finally:
                pdfium.close_page(page)

    finally:
        doc.close()
16 changes: 10 additions & 6 deletions src/pypdfium2/_cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
toc,
merge,
tile,
extract_text,
find_pageobjects,
)

try:
Expand All @@ -23,12 +25,14 @@
have_argcomplete = True


Subcommands = dict(
render = render,
toc = toc,
merge = merge,
tile = tile,
)
# Maps each CLI subcommand name to the module that implements it.
# A dict literal is needed here because hyphenated names such as
# "extract-text" cannot be written as keyword arguments to dict().
Subcommands = {
"render": render,
"toc": toc,
"merge": merge,
"tile": tile,
"extract-text": extract_text,
"find-pageobjects": find_pageobjects,
}


def parse_args(argv=sys.argv[1:]):
Expand Down
38 changes: 2 additions & 36 deletions src/pypdfium2/_cli/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@

import os
import ast
from pypdfium2 import _namespace as pdfium
from os.path import (
join,
abspath,
basename,
splitext,
)
from pypdfium2 import _namespace as pdfium
from pypdfium2._cli._parsers import pagetext_type


def rotation_type(string):
Expand All @@ -20,21 +21,16 @@ def rotation_type(string):


def colour_type(string):
    """
    Parse a colour given as a string.

    Parameters:
        string (str):
            Either ``none`` (case-insensitive), or a Python literal list or
            tuple of 3 or 4 integers from 0 to 255, e.g. ``(255, 255, 255, 255)``.

    Returns:
        tuple or list or None: The parsed colour values, or ``None`` if
        *string* is ``none``.

    Raises:
        ValueError: If *string* is not a valid Python literal, is not a list
            or tuple, has the wrong number of values, or contains a value
            that is not an integer in the range 0 to 255.
    """

    if string.lower() == 'none':
        return None

    try:
        colour = ast.literal_eval(string)
    except SyntaxError as err:
        # argparse only turns ValueError/TypeError from a type callback into a
        # usage error; a SyntaxError would surface as a raw traceback.
        raise ValueError("Invalid colour literal %r: %s" % (string, err))

    if not isinstance(colour, (tuple, list)):
        raise ValueError("Invalid colour type %s. Must be list or tuple." % type(colour))
    if len(colour) not in (3, 4):
        raise ValueError("Invalid number of colour values. Must be 3 or 4.")
    # bool is a subclass of int, so it has to be excluded explicitly.
    if not all(isinstance(val, int) and not isinstance(val, bool) and 0 <= val <= 255 for val in colour):
        raise ValueError("Colour values must be integers ranging from 0 to 255.")

    return colour


Expand All @@ -45,36 +41,6 @@ def crop_type(string):
return crop


def _page_index(number):
    """Convert a one-based page number string to a zero-based page index."""
    page = int(number)
    if page < 1:
        # Without this check, page 0 would silently become index -1.
        raise ValueError("Invalid page number %s. Page numbers start at 1." % page)
    return page - 1


def pagetext_type(value):
    """
    Parse a page selection string into a list of zero-based page indices.

    Parameters:
        value (str):
            Comma-separated, one-based page numbers and/or inclusive ranges
            written as ``start-end``, e.g. ``"1,3-5,9-7"``. If the start of a
            range is not smaller than its end, the range is expanded in
            descending order.

    Returns:
        list or None: The zero-based page indices in the order they were
        given, or ``None`` if *value* is empty.

    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair, or
            if a page number is smaller than 1.
    """

    if not value:
        return None

    page_indices = []

    for page_or_range in value.split(','):

        if '-' in page_or_range:
            # Unpacking fails with ValueError unless there are exactly two parts.
            start, end = [_page_index(num) for num in page_or_range.split('-')]
            step = 1 if start < end else -1
            page_indices.extend(range(start, end + step, step))
        else:
            page_indices.append(_page_index(page_or_range))

    return page_indices


def attach_parser(subparsers):
parser = subparsers.add_parser(
"render",
Expand Down
2 changes: 2 additions & 0 deletions src/pypdfium2/_helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@
from pypdfium2._helpers.boxes import *
from pypdfium2._helpers.classes import *
from pypdfium2._helpers.text_inserter import *
from pypdfium2._helpers.text_extractor import *
from pypdfium2._helpers.pageobjects import *
2 changes: 1 addition & 1 deletion src/pypdfium2/_helpers/boxes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def _get_box(page, box_function, fallback_function):

left, bottom, right, top = c_float(), c_float(), c_float(), c_float()
left, bottom, right, top = [c_float() for _i in range(4)]

ret_code = box_function(page, byref(left), byref(bottom), byref(right), byref(top))
if not ret_code:
Expand Down
Loading

0 comments on commit bbc2438

Please sign in to comment.