From f365470031b615097e9c8f9ccb167dc35714f1fa Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sat, 17 Aug 2024 14:24:29 -0700 Subject: [PATCH] Count Bytes and Docs (#186) * added option to count bytes * version * added document counter --- pyproject.toml | 2 +- python/dolma/taggers/length.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c9ae9453..9f7c870f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dolma" -version = "1.0.9" +version = "1.0.10" description = "Data filters" license = { text = "Apache-2.0" } readme = "README.md" diff --git a/python/dolma/taggers/length.py b/python/dolma/taggers/length.py index 4dbc649b..2247c9b8 100644 --- a/python/dolma/taggers/length.py +++ b/python/dolma/taggers/length.py @@ -18,6 +18,19 @@ from ..core.utils import split_paragraphs +@TaggerRegistry.add("bytes_length_v1") +class BytesLengthV1(BaseTagger): + def predict(self, doc: Document) -> DocResult: + score = len(doc.text.encode("utf-8")) + return DocResult(doc=doc, spans=[Span(start=0, end=len(doc.text), type="bytes", score=score)]) + + +@TaggerRegistry.add("doc_count_v1") +class DocCountLengthV1(BaseTagger): + def predict(self, doc: Document) -> DocResult: + return DocResult(doc=doc, spans=[Span(start=0, end=len(doc.text), type="docs", score=1)]) + + @TaggerRegistry.add("char_length_v1") class CharLengthV1(BaseTagger): def predict(self, doc: Document) -> DocResult: