From f365470031b615097e9c8f9ccb167dc35714f1fa Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Sat, 17 Aug 2024 14:24:29 -0700
Subject: [PATCH] Count Bytes and Docs (#186)

* added option to count bytes

* bumped version to 1.0.10

* added document counter
---
 pyproject.toml                 |  2 +-
 python/dolma/taggers/length.py | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index c9ae9453..9f7c870f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dolma"
-version = "1.0.9"
+version = "1.0.10"
 description = "Data filters"
 license = { text = "Apache-2.0" }
 readme = "README.md"
diff --git a/python/dolma/taggers/length.py b/python/dolma/taggers/length.py
index 4dbc649b..2247c9b8 100644
--- a/python/dolma/taggers/length.py
+++ b/python/dolma/taggers/length.py
@@ -18,6 +18,19 @@
 from ..core.utils import split_paragraphs
 
 
+@TaggerRegistry.add("bytes_length_v1")
+class BytesLengthV1(BaseTagger):  # tags a document with the byte length of its text
+    def predict(self, doc: Document) -> DocResult:  # returns one span covering the whole text
+        score = len(doc.text.encode("utf-8"))  # UTF-8 byte count; span bounds stay in characters
+        return DocResult(doc=doc, spans=[Span(start=0, end=len(doc.text), type="bytes", score=score)])
+
+
+@TaggerRegistry.add("doc_count_v1")
+class DocCountLengthV1(BaseTagger):  # document counter: every document gets a constant score of 1
+    def predict(self, doc: Document) -> DocResult:  # returns one span covering the whole text
+        return DocResult(doc=doc, spans=[Span(start=0, end=len(doc.text), type="docs", score=1)])
+
+
 @TaggerRegistry.add("char_length_v1")
 class CharLengthV1(BaseTagger):
     def predict(self, doc: Document) -> DocResult: