Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Switched from PyMuPDF to pypdfium2 #829

Merged
merged 18 commits into from
Feb 24, 2022
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ Documents can be interpreted from PDF or images:
```python
from doctr.io import DocumentFile
# PDF
pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
# Image
single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
# Webpage
webpage_doc = DocumentFile.from_url("https://www.yoursite.com").as_images()
webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
# Multiple page images
multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
```
Expand All @@ -51,7 +51,7 @@ from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)
# PDF
doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
# Analyze
result = model(doc)
```
Expand Down
2 changes: 1 addition & 1 deletion demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def main():
uploaded_file = st.sidebar.file_uploader("Upload files", type=['pdf', 'png', 'jpeg', 'jpg'])
if uploaded_file is not None:
if uploaded_file.name.endswith('.pdf'):
doc = DocumentFile.from_pdf(uploaded_file.read()).as_images()
doc = DocumentFile.from_pdf(uploaded_file.read())
else:
doc = DocumentFile.from_images(uploaded_file.read())
page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1
Expand Down
10 changes: 0 additions & 10 deletions docs/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,3 @@ High-performance file reading and conversion to processable structured data.
.. automethod:: from_url

.. automethod:: from_images

.. autoclass:: PDF

.. automethod:: as_images

.. automethod:: get_words

.. automethod:: get_lines

.. automethod:: get_artefacts
166 changes: 10 additions & 156 deletions doctr/io/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, List

import cv2
import fitz
import numpy as np
import pypdfium2 as pdfium

from doctr.utils.common_types import AbstractFile, Bbox
from doctr.utils.common_types import AbstractFile

__all__ = ['read_pdf', 'PDF']
__all__ = ['read_pdf']


def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
def read_pdf(file: AbstractFile, scale: float = 2, **kwargs: Any) -> List[np.ndarray]:
"""Read a PDF file and convert its pages into images in numpy format

Example::
Expand All @@ -24,161 +23,16 @@ def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:

Args:
file: the path to the PDF file, or the content of the PDF file as bytes
scale: rendering scale (1 corresponds to 72dpi)
Returns:
the list of pages decoded as numpy ndarray of shape H x W x 3
"""

if not isinstance(file, (str, Path, bytes)):
raise TypeError("unsupported object type for argument 'file'")

if isinstance(file, (str, Path)) and not Path(file).is_file():
raise FileNotFoundError(f"unable to access {file}")

fitz_args: Dict[str, AbstractFile] = {}

if isinstance(file, (str, Path)):
fitz_args['filename'] = file
elif isinstance(file, bytes):
fitz_args['stream'] = file
else:
raise TypeError("unsupported object type for argument 'file'")

# Read pages with fitz and convert them to numpy ndarrays
return fitz.open(**fitz_args, filetype="pdf", **kwargs)


def convert_page_to_numpy(
page: fitz.fitz.Page,
output_size: Optional[Tuple[int, int]] = None,
bgr_output: bool = False,
default_scales: Tuple[float, float] = (2, 2),
) -> np.ndarray:
"""Convert a fitz page to a numpy-formatted image

Args:
page: the page of a file read with PyMuPDF
output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf;
if you want to increase the resolution while preserving the original A4 aspect ratio, you can pass (1024, 726)
bgr_output: whether the output ndarray channel order should be BGR instead of RGB.
default_scales: spatial scaling to be applied when output_size is not specified where (1, 1)
corresponds to 72 dpi rendering.

Returns:
the rendered image in numpy format
"""

# If no output size is specified, keep the origin one
if output_size is not None:
scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3])
else:
# Default 72 DPI (scales of (1, 1)) is unnecessarily low
scales = default_scales

transform_matrix = fitz.Matrix(*scales)

# Generate the pixel map using the transformation matrix
pixmap = page.get_pixmap(matrix=transform_matrix)
# Decode it into a numpy
img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3)

# Switch the channel order
if bgr_output:
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

return img


class PDF:
"""PDF document template

Args:
doc: input PDF document
"""
def __init__(self, doc: fitz.Document) -> None:
self.doc = doc

def as_images(self, **kwargs) -> List[np.ndarray]:
"""Convert all document pages to images

Example::
>>> from doctr.documents import DocumentFile
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()

Args:
kwargs: keyword arguments of `convert_page_to_numpy`
Returns:
the list of pages decoded as numpy ndarray of shape H x W x 3
"""
return [convert_page_to_numpy(page, **kwargs) for page in self.doc]

def get_page_lines(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
"""Get the annotations for all lines of a given page"""
lines: List[Tuple[Bbox, str]] = []
prev_block, prev_line = -1, -1
current_line = []
xmin, ymin, xmax, ymax = 0, 0, 0, 0
# xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
for info in self.doc[idx].get_text_words(**kwargs):
if prev_block == info[-3] and prev_line == info[-2]:
current_line.append(info[4])
xmin, ymin = min(xmin, info[0]), min(ymin, info[1])
xmax, ymax = max(xmax, info[2]), max(ymax, info[3])
else:
if len(current_line) > 0:
lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))
current_line = [info[4]]
prev_block, prev_line = info[-3], info[-2]
xmin, ymin, xmax, ymax = info[:4]

if len(current_line) > 0:
lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))

return lines

def get_lines(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
"""Get the annotations for all lines in the document

Example::
>>> from doctr.documents import DocumentFile
>>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()

Args:
kwargs: keyword arguments of `fitz.Page.get_text_words`
Returns:
the list of pages annotations, represented as a list of tuple (bounding box, value)
"""
return [self.get_page_lines(idx, **kwargs) for idx in range(len(self.doc))]

def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
"""Get the annotations for all words of a given page"""

# xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
return [(info[:4], info[4]) for info in self.doc[idx].get_text_words(**kwargs)]

def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
"""Get the annotations for all words in the document

Example::
>>> from doctr.documents import DocumentFile
>>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()

Args:
kwargs: keyword arguments of `fitz.Page.get_text_words`
Returns:
the list of pages annotations, represented as a list of tuple (bounding box, value)
"""
return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]

def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]:
return [tuple(self.doc[idx].get_image_bbox(artefact)) # type: ignore[misc]
for artefact in self.doc[idx].get_images(full=True)]

def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]:
"""Get the artefacts for the entire document

Example::
>>> from doctr.documents import DocumentFile
>>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()

Returns:
the list of pages artefacts, represented as a list of bounding boxes
"""

return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
return [np.asarray(img) for img, _ in pdfium.render_pdf(file, scale=scale)]
14 changes: 6 additions & 8 deletions doctr/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .html import read_html
from .image import read_img_as_numpy
from .pdf import PDF, read_pdf
from .pdf import read_pdf

__all__ = ['DocumentFile']

Expand All @@ -21,7 +21,7 @@ class DocumentFile:
"""Read a document from multiple extensions"""

@classmethod
def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
"""Read a PDF file

Example::
Expand All @@ -31,15 +31,13 @@ def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
Args:
file: the path to the PDF file or a binary stream
Returns:
a PDF document
the list of pages decoded as numpy ndarray of shape H x W x 3
"""

doc = read_pdf(file, **kwargs)

return PDF(doc)
return read_pdf(file, **kwargs)

@classmethod
def from_url(cls, url: str, **kwargs) -> PDF:
def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
"""Interpret a web page as a PDF document

Example::
Expand All @@ -49,7 +47,7 @@ def from_url(cls, url: str, **kwargs) -> PDF:
Args:
url: the URL of the target web page
Returns:
a PDF document
the list of pages decoded as numpy ndarray of shape H x W x 3
"""
pdf_stream = read_html(url)
return cls.from_pdf(pdf_stream, **kwargs)
Expand Down
2 changes: 1 addition & 1 deletion mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ ignore_missing_imports = True

ignore_missing_imports = True

[mypy-fitz.*]
[mypy-pypdfium2.*]

ignore_missing_imports = True

Expand Down
2 changes: 1 addition & 1 deletion requirements-pt.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ numpy>=1.16.0
scipy>=1.4.0
h5py>=3.1.0
opencv-python>=3.4.5.20
PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
pypdfium2>=0.14.0
pyclipper>=1.2.0
shapely>=1.6.0
matplotlib>=3.1.0,<3.4.3
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ numpy>=1.16.0
scipy>=1.4.0
h5py>=3.1.0
opencv-python>=3.4.5.20
PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
pypdfium2>=0.14.0
pyclipper>=1.2.0
shapely>=1.6.0
matplotlib>=3.1.0,<3.4.3
Expand Down
2 changes: 1 addition & 1 deletion scripts/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def main(args):
model = ocr_predictor(args.detection, args.recognition, pretrained=True)

if args.path.endswith(".pdf"):
doc = DocumentFile.from_pdf(args.path).as_images()
doc = DocumentFile.from_pdf(args.path)
else:
doc = DocumentFile.from_images(args.path)

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"h5py>=3.1.0",
"opencv-python>=3.4.5.20",
"tensorflow>=2.4.0",
"PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12", # 18.11 and 18.12 fail (issue #222)
"pypdfium2>=0.14.0",
"pyclipper>=1.2.0",
"shapely>=1.6.0",
"matplotlib>=3.1.0,<3.4.3",
Expand Down Expand Up @@ -94,7 +94,7 @@ def deps_list(*pkgs):
deps["scipy"],
deps["h5py"],
deps["opencv-python"],
deps["PyMuPDF"],
deps["pypdfium2"],
deps["pyclipper"],
deps["shapely"],
deps["matplotlib"],
Expand Down
Loading