-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add text extraction and page objects support models (#110)
- Loading branch information
Showing
24 changed files
with
484 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com> | ||
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause | ||
|
||
|
||
def pagetext_type(value):
    """
    Convert a page specification string into a list of zero-based page indices.

    The specification is a comma-separated sequence of one-based page numbers
    and inclusive ``start-end`` ranges, for instance ``"1,3-5,9-7"``. A range
    whose start is greater than its end is expanded in descending order.

    Parameters:
        value (str | None): The page specification to parse.
    Returns:
        list[int] | None: Zero-based page indices in the order given, or
        :data:`None` if *value* is empty.
    Raises:
        ValueError: If an item is not an integer or a ``start-end`` pair, or if
            a page number is smaller than 1.
    """

    if not value:
        return None

    page_indices = []

    for page_or_range in value.split(','):

        if '-' in page_or_range:

            first, last = page_or_range.split('-')
            start = int(first) - 1
            end = int(last) - 1

            # Page numbers are one-based; anything below 1 would turn into a
            # negative index and silently address pages from the end.
            if start < 0 or end < 0:
                raise ValueError("Page numbers must be >= 1, got %r" % page_or_range)

            # Walk upwards or downwards so that both bounds are included.
            step = 1 if start <= end else -1
            page_indices.extend(range(start, end + step, step))

        else:

            index = int(page_or_range) - 1
            if index < 0:
                raise ValueError("Page numbers must be >= 1, got %r" % page_or_range)
            page_indices.append(index)

    return page_indices
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com> | ||
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause | ||
|
||
import os.path | ||
from pypdfium2 import _namespace as pdfium | ||
from pypdfium2._cli._parsers import pagetext_type | ||
|
||
|
||
def attach_parser(subparsers):
    """
    Register the ``extract-text`` subcommand on *subparsers*.

    The subcommand takes the input path (made absolute), an optional password,
    an optional page selection parsed by ``pagetext_type``, and four integer
    options ``--left``, ``--bottom``, ``--right`` and ``--top`` that each
    default to 0.
    """
    parser = subparsers.add_parser(
        "extract-text",
        help = "Extract text from a PDF page in given boundaries",
    )
    parser.add_argument(
        "input",
        help = "Path to the PDF document to work with",
        type = os.path.abspath,
    )
    parser.add_argument(
        "--password",
        help = "Password to unlock the PDF, if encrypted",
    )
    parser.add_argument(
        "--pages",
        type = pagetext_type,
        help = "Page numbers to include (defaults to all)",
    )

    # The four boundary options differ only in their name and help text.
    boundary_options = (
        ("--left", "Left coordinate of the area to search for text."),
        ("--bottom", "Bottom coordinate of the area to search for text."),
        ("--right", "Right coordinate of the area to search for text."),
        ("--top", "Top coordinate of the area to search for text."),
    )
    for flag, description in boundary_options:
        parser.add_argument(flag, type=int, default=0, help=description)
|
||
|
||
def main(args):
    """
    Run the ``extract-text`` subcommand: print the text of the requested pages.

    For each index in ``args.pages`` (every page of the document if it is
    :data:`None`), a text page is loaded, ``get_text()`` is called with the
    boundaries ``args.left``, ``args.bottom``, ``args.right`` and ``args.top``,
    and the result is printed below a one-based ``# Page N`` heading. Output for
    every page after the first is prefixed with a newline.

    Parameters:
        args: Parsed command-line arguments, as set up by ``attach_parser()``.
    """

    doc = pdfium.PdfDocument(args.input, args.password)
    try:
        if args.pages is None:
            args.pages = list(range(len(doc)))

        sep = ''
        for index in args.pages:
            textpage = doc.get_textpage(index)
            # Close the text page even if extraction raises.
            try:
                text = textpage.get_text(
                    left = args.left,
                    bottom = args.bottom,
                    right = args.right,
                    top = args.top,
                )
            finally:
                textpage.close()
            print(sep + "# Page %s\n" % (index+1) + text)
            sep = '\n'
    finally:
        # Always release the document, also on an invalid page index or a
        # failure while printing.
        doc.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# SPDX-FileCopyrightText: 2022 geisserml <geisserml@gmail.com> | ||
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause | ||
|
||
import os.path | ||
from pypdfium2 import _namespace as pdfium | ||
from pypdfium2._cli._parsers import pagetext_type | ||
|
||
|
||
# Maps the names accepted by the ``--type`` option to the corresponding
# ``FPDF_PAGEOBJ_*`` constants.
NameToObjtype = {
    "unknown": pdfium.FPDF_PAGEOBJ_UNKNOWN,
    "text": pdfium.FPDF_PAGEOBJ_TEXT,
    "path": pdfium.FPDF_PAGEOBJ_PATH,
    "image": pdfium.FPDF_PAGEOBJ_IMAGE,
    "shading": pdfium.FPDF_PAGEOBJ_SHADING,
    "form": pdfium.FPDF_PAGEOBJ_FORM,
}
|
||
|
||
def attach_parser(subparsers):
    """
    Register the ``find-pageobjects`` subcommand on *subparsers*.

    The subcommand takes the input path (made absolute), an optional password,
    an optional page selection parsed by ``pagetext_type``, and a required
    ``--type`` option restricted to the keys of ``NameToObjtype``.
    """
    parser = subparsers.add_parser(
        "find-pageobjects",
        help = "Locate page objects of a certain type",
    )
    parser.add_argument(
        "input",
        help = "Path to the PDF document to work with",
        type = os.path.abspath,
    )
    parser.add_argument(
        "--password",
        help = "Password to unlock the PDF, if encrypted",
    )
    parser.add_argument(
        "--pages",
        help = "The pages to search (defaults to all)",
        type = pagetext_type,
    )
    parser.add_argument(
        "--type",
        help = "Object types to consider",
        choices = list(NameToObjtype),
        required = True,
    )
|
||
|
||
def main(args):
    """
    Run the ``find-pageobjects`` subcommand: print the location of page objects.

    ``args.type`` is replaced by the matching ``NameToObjtype`` constant. For
    each index in ``args.pages`` (every page of the document if it is
    :data:`None`), the page objects are retrieved, filtered by that type, and
    the result of ``locate_pageobj()`` is printed for each remaining object.

    Parameters:
        args: Parsed command-line arguments, as set up by ``attach_parser()``.
    """

    doc = pdfium.PdfDocument(args.input, args.password)
    try:
        args.type = NameToObjtype[args.type]
        if args.pages is None:
            args.pages = list(range(len(doc)))

        for index in args.pages:
            page = doc.get_page(index)
            # Close the page even if listing or locating its objects raises.
            try:
                pageobjs = pdfium.get_pageobjs(page)
                for obj in pdfium.filter_pageobjs(pageobjs, args.type):
                    print( pdfium.locate_pageobj(obj) )
            finally:
                pdfium.close_page(page)
    finally:
        # Always release the document, also on an invalid page index.
        doc.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.