-
Hi guys,
I wrote a loading script for JSONL data, but it is not very elegant or general:
import glob
import orjson
import os
import datasets
from itertools import islice
_HOMEPAGE = "https://huggingface.co/datasets/m-a-p/Matrix"
class MatrixDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads ``id``/``text`` records from local JSONL files.

    Every file matching ``*/*.jsonl`` (relative to the current working
    directory) is divided into virtual shards of at most ~1 GiB each. A shard
    is a ``(filepath, shard_index, num_shards)`` tuple; shard ``k`` of ``n``
    covers lines ``k, k + n, k + 2n, ...`` of its file.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: two string features, ``id`` and ``text``."""
        return datasets.DatasetInfo(
            features=datasets.Features({
                "id": datasets.Value("string"),
                "text": datasets.Value("string"),
            }),
            homepage=_HOMEPAGE,
        )

    def _split_generators(self, dl_manager):
        """Return a single TRAIN split made of shuffled virtual file shards.

        Args:
            dl_manager: Download manager supplied by ``datasets``; unused
                because the files are read from the local working directory.

        Returns:
            A one-element list holding the TRAIN ``SplitGenerator``.
        """
        import random

        max_shard_bytes = 1024**3  # 1 GiB upper bound per virtual shard

        # glob returns paths in arbitrary order; sort them so that the seeded
        # shuffle below yields the same shard order on every machine.
        data_files = sorted(glob.glob("*/*.jsonl"))

        data_shards = []
        for filepath in data_files:
            # Ceiling division. The previous form `-size // -1024**3` parsed
            # as `(-size) // (-(1024**3))`, i.e. floor division, which gave
            # 0 shards to (and therefore silently dropped) every file
            # smaller than 1 GiB.
            num_shards = -(-os.path.getsize(filepath) // max_shard_bytes)
            for i in range(num_shards):
                data_shards.append((filepath, i, num_shards))

        random.Random(42).shuffle(data_shards)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_shards": data_shards,
                },
            ),
        ]

    def _generate_examples(self, data_shards):
        """Yield ``(key, record)`` pairs for every usable line of every shard.

        Args:
            data_shards: Iterable of ``(filepath, shard_index, num_shards)``
                tuples as built by ``_split_generators``.

        Yields:
            ``(record["id"], record)`` for each parsed JSON object whose text
            is present and not ``None``. Records without an ``id`` get
            ``"<filepath>_<line index>"``; a ``content`` field is renamed to
            ``text`` when ``text`` is absent.
        """
        for file, split, num_shards in data_shards:
            # Binary mode: orjson parses UTF-8 bytes directly, so decoding no
            # longer depends on the platform's default text encoding.
            with open(file, "rb") as f:
                # Strided read: this shard takes every num_shards-th line,
                # starting at line index `split`.
                for i, line in islice(enumerate(f), split, None, num_shards):
                    if not line.strip():
                        # Blank lines (e.g. a trailing newline) are not JSON.
                        continue
                    data = orjson.loads(line)
                    if 'id' not in data:
                        data['id'] = f"{file}_{i}"
                    if 'content' in data and 'text' not in data:
                        data['text'] = data.pop('content')
                    # .get(): a record with neither 'text' nor 'content' is
                    # skipped like a null text instead of raising KeyError.
                    if data.get('text') is not None:
                        yield data["id"], data


# I'm wondering if you could suggest any better ways @lhoestq |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments
-
Maybe you can create multiple iterators per file, e.g. if you want 4 iterators per file you can have the first iterator that starts at the beginning of the file and stops at the first EOL at 1/4 of the file, then the second iterator starts after the first EOL at 1/4 of the file until the first EOL at 1/2 of the file, etc. Btw, instead of using a GeneratorBasedBuilder, it may be easier to use `Dataset.from_generator` or `IterableDataset.from_generator`. |
Beta Was this translation helpful? Give feedback.
-
@lhoestq Thank you for this nice response! and Happy New Year~ |
Beta Was this translation helpful? Give feedback.
Maybe you can create multiple iterators per file, e.g. if you want 4 iterators per file you can have the first iterator that starts at the beginning of the file and stops at the first EOL at 1/4 of the file, then the second iterator starts after the first EOL at 1/4 of the file until the first EOL at 1/2 of the file, etc.
Btw, instead of using a GeneratorBasedBuilder, it may be easier to use `Dataset.from_generator` or `IterableDataset.from_generator`.