Merge pull request #145 from pycompression/bgzip
Increase BGZip streaming decompression performance
rhpvorderman authored Jul 21, 2023
2 parents fd66618 + a76c459 commit 4ad8e12
Showing 4 changed files with 40 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
@@ -9,6 +9,10 @@ Changelog
version 1.2.0-dev
-----------------
+ Bgzip files are now detected and a smaller reading buffer is used to
  accommodate the fact that bgzip blocks are typically less than 64K (unlike
  normal gzip files, which consist of one block that spans the entire file).
  This has reduced decompression time for bgzip files by roughly 12%.
+ Speed up the source build by using ISA-L's Unix-specific makefile rather
  than the autotools build.
+ Simplify build setup. ISA-L release flags are now used and not
27 changes: 26 additions & 1 deletion src/isal/igzip.py
@@ -220,6 +220,22 @@ def write(self, data):
return length


def detect_bgzip(header: bytes) -> bool:
    if len(header) < 18:
        return False
    magic, method, flags, mtime, xfl, os, xlen, si1, si2, slen, bsize = \
        struct.unpack("<HBBIBBHBBHH", header[:18])
    return (
        method == 8 and  # Deflate method used
        flags & 4 and  # There are extra fields
        xlen == 6 and  # The extra field should be of length 6
        si1 == 66 and  # BGZIP magic number one
        si2 == 67 and  # BGZIP magic number two
        slen == 2  # The length of the 16-bit integer that stores
                   # the size of the block
    )


class _PaddedFile(gzip._PaddedFile):
    # Overwrite _PaddedFile from gzip as its prepend method assumes that
    # the prepended data is always read from its _buffer. Unfortunately in
@@ -249,6 +265,15 @@ def __init__(self, fp):
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None
        self._read_buffer_size = READ_BUFFER_SIZE
        if hasattr(fp, "peek") and detect_bgzip(fp.peek(18)):
            # bgzip consists of puny little blocks of at most 64K
            # uncompressed data, so in practice roughly 16K in compressed
            # size. A 128K buffer is a massive overshoot and slows down
            # decompression.
            # bgzip stores the block size, so blocks could be unpacked more
            # efficiently, but that is outside the scope of python-isal.
            self._read_buffer_size = 16 * 1024

    def read(self, size=-1):
        if size < 0:
@@ -282,7 +307,7 @@ def read(self, size=-1):

            # Read a chunk of data from the file
            if self._decompressor.needs_input:
-               buf = self._fp.read(READ_BUFFER_SIZE)
+               buf = self._fp.read(self._read_buffer_size)
                uncompress = self._decompressor.decompress(buf, size)
            else:
                uncompress = self._decompressor.decompress(b"", size)
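For illustration, here is a minimal sketch (not part of the diff) that builds a synthetic 18-byte header matching the BGZF layout that detect_bgzip checks, then reads the BSIZE field that the comment in __init__ refers to. The BSIZE value 0x1234 is an arbitrary placeholder.

import struct

# Synthetic BGZF header: gzip magic, deflate, FEXTRA flag, and a 6-byte
# "BC" extra subfield whose 16-bit payload stores the block size.
header = struct.pack(
    "<HBBIBBHBBHH",
    0x8B1F,  # gzip magic bytes 1f 8b, little-endian
    8,       # CM: deflate
    4,       # FLG: FEXTRA set
    0,       # MTIME
    0,       # XFL
    255,     # OS: unknown
    6,       # XLEN: one 6-byte extra subfield follows
    66, 67,  # SI1, SI2: the BGZF 'B', 'C' subfield identifier
    2,       # SLEN: the payload is a single 16-bit integer
    0x1234,  # BSIZE: total block size minus one (placeholder)
)
assert detect_bgzip(header)

# BSIZE sits at offset 16 and is what would allow reading each block
# exactly, which the diff deliberately leaves out of scope:
bsize = struct.unpack_from("<H", header, 16)[0]
block_length = bsize + 1  # BGZF stores the total block size minus one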
Binary file added tests/data/test.fastq.bgzip.gz
10 changes: 10 additions & 0 deletions tests/test_igzip.py
@@ -443,3 +443,13 @@ def test_concatenated_gzip():
    with igzip.open(concat, "rb") as igzip_h:
        result = igzip_h.read()
    assert data == result


def test_bgzip():
    bgzip_file = Path(__file__).parent / "data" / "test.fastq.bgzip.gz"
    gzip_file = Path(__file__).parent / "data" / "test.fastq.gz"
    with igzip.open(bgzip_file, "rb") as bgz:
        bgz_data = bgz.read()
    with igzip.open(gzip_file, "rb") as gz:
        gz_data = gz.read()
    assert bgz_data == gz_data
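The bgzip test file is checked in as a binary. For reference, a hedged sketch of how a single BGZF block could be written in pure Python; the checked-in test.fastq.bgzip.gz was presumably produced with htslib's bgzip tool, and a complete BGZF file additionally limits each block to 64K of input and ends with a fixed 28-byte EOF block.

import struct
import zlib

def write_bgzf_block(data: bytes) -> bytes:
    # Raw deflate, as inside any gzip member.
    compressor = zlib.compressobj(6, zlib.DEFLATED, -zlib.MAX_WBITS)
    deflated = compressor.compress(data) + compressor.flush()
    # Total block size: 18-byte header + deflate payload + 8-byte trailer.
    bsize = 18 + len(deflated) + 8 - 1  # BGZF stores the size minus one
    header = struct.pack(
        "<HBBIBBHBBHH",
        0x8B1F, 8, 4, 0, 0, 255,  # magic, deflate, FEXTRA, mtime, xfl, os
        6, 66, 67, 2, bsize,      # XLEN and the 'BC' subfield with BSIZE
    )
    trailer = struct.pack("<II", zlib.crc32(data), len(data) & 0xFFFFFFFF)
    return header + deflated + trailer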
