Skip to content

Commit

Permalink
Use pigz subprocess when pigz bin is available
Browse files Browse the repository at this point in the history
  • Loading branch information
Thamme Gowda committed Apr 26, 2024
1 parent 8c07eff commit ea0b86f
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 20 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* Added WMT general test 2022 and 2023
* mtdata-bcp47 : -p/--pipe to map codes from stdin -> stdout
* mtdata-bcp47 : --script {suppress-default,suppress-all,express}
* Uses`pigz` to read and write gzip files by default when pigz is in PATH. export `USE_PIGZ=0` to disable

## v0.4.0 - 20230326

Expand Down
75 changes: 75 additions & 0 deletions mtdata/pigz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import subprocess
import shutil


class pigz:
"""pigz based file opener and writer. This should be a drop in replacement for gzip.open"""
DEF_LEVEL = 6
PGZ_BIN = shutil.which('pigz')

@classmethod
def is_available(cls):
return cls.PGZ_BIN is not None

def __init__(self, path, mode='r', level=DEF_LEVEL):
self.path = path
self.mode = mode
self.level = level
self._file = None
assert not 'b' in mode, "Binary mode not supported. Use this for reading and writing lines (text mode)"
assert self.is_available(), "pigz not found in PATH"
self._open()

def _open(self):
if 'r' in self.mode: # Open the file using pigz subprocess for reading
self._file = subprocess.Popen(
[self.PGZ_BIN, '-dc', self.path],
stdin=None, stdout=subprocess.PIPE, bufsize=-1, text=True
)
elif 'w' in self.mode:
# Open the file using pigz subprocess for writing
self._file = subprocess.Popen(
[self.PGZ_BIN, '-c', f'-{self.level}', '-'],
stdin=subprocess.PIPE, stdout=open(self.path, 'wb'),
bufsize=-1,
text=True)
else:
raise ValueError(f"Unsupported mode: {self.mode}")

def close(self):
if not self._file:
return # already closed
# Close the subprocess
for stream in [self._file.stdin, self._file.stdout, self._file.stderr]:
if stream:
stream.close()
self._file.wait()
self._file = None

def __enter__(self):
return self

def __exit__(self, exc_type, exc_value, traceback):
return self.close()

def __iter__(self):
return self

def __next__(self):
line = self._file.stdout.readline()
if not line:
raise StopIteration
return line



def write(self, data):
if 'w' not in self.mode:
raise ValueError("File not open for writing")
self._file.stdin.write(data)

@classmethod
def open(cls, path, mode='rb', level=DEF_LEVEL, **kwargs):
"""Open a file using pigz subprocess. This should be a drop in replacement for gzip.open"""
return cls(path, mode, level)

42 changes: 22 additions & 20 deletions mtdata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,39 @@
#
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Created: 5/13/20
import io
import bz2
import gzip
import io
import lzma
import os
import shutil
import subprocess
import tarfile
import zipfile
from dataclasses import dataclass
import portalocker
import gzip
import bz2
import lzma

from mtdata import log, Defaults
import shutil
from datetime import datetime
from pathlib import Path

import portalocker

from mtdata import Defaults, log
from mtdata.pigz import pigz

COMPRESSORS = {
'.gz': gzip.open,
'.bz2': bz2.open,
'.xz': lzma.open
}

USE_PIGZ = os.environ.get('USE_PIGZ', 'YES').lower() in ('yes', 'true', '1')
if USE_PIGZ and pigz.is_available():
COMPRESSORS['.gz'] = pigz.open

class IO:
"""File opener and automatic closer
Copied from my other project https://github.com/isi-nlp/rtg/blob/master/rtg/utils.py
"""

def __init__(self, path, mode='r', encoding=None, errors=None, smart_ext=True):
"""
Expand Down Expand Up @@ -127,7 +133,7 @@ class ArchivedPath:
root: Path
name: str
fd = None

@property
def suffix(self):
return self.root.suffix
Expand Down Expand Up @@ -208,28 +214,23 @@ def extract(self):
return # extracted by parallel process
log.info(f"extracting {self.root}")
with tarfile.open(self.root) as tar:

import os

def is_within_directory(directory, target):

abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)

prefix = os.path.commonprefix([abs_directory, abs_target])

return prefix == abs_directory

def safe_extract(tar, path=".", members=None, *, numeric_owner=False):

for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise Exception("Attempted Path Traversal in Tar File")

tar.extractall(path, members, numeric_owner=numeric_owner)


safe_extract(tar, out_path)
valid_path.touch()
return out_path
Expand All @@ -252,3 +253,4 @@ def format_byte_size(n: int) -> str:
m = n / 10 ** power
return f'{m:.2f}'.rstrip('0') + f' {unit}'
return f'{n}B'

38 changes: 38 additions & 0 deletions tests/test_pigz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from tempfile import TemporaryDirectory
from pathlib import Path

from mtdata import log
from mtdata.utils import pigz, IO


def test_pigz():
if not pigz.is_available():
log.warning(f'pigz is unavailable')
return

lines = ['Hello', 'World', 'This is a test', 'of pigz']

with TemporaryDirectory() as tmp_dir:
tmp_file = Path(tmp_dir) / 'tmp.gz'
with pigz.open(tmp_file, 'w') as f:
for line in lines:
f.write(line + '\n')
with pigz.open(tmp_file, 'r') as f:
for got, expected in zip(f, lines):
assert got.strip() == expected

def test_IO():
if not pigz.is_available():
log.warning(f'pigz is unavailable')
return

lines = ['Hello', 'World', 'This is a test', 'of pigz']

with TemporaryDirectory() as tmp_dir:
tmp_file = Path(tmp_dir) / 'tmp.gz'
with IO.writer(tmp_file) as out:
for line in lines:
out.write(line + '\n')
with IO.reader(tmp_file) as inp:
for got, expected in zip(inp, lines):
assert got.strip() == expected

0 comments on commit ea0b86f

Please sign in to comment.