Skip to content

Commit

Permalink
Merge pull request #19 from yobix-ai/18-ocr-examples-and-docs
Browse files Browse the repository at this point in the history
18 ocr examples and docs
  • Loading branch information
nmammeri authored Nov 4, 2024
2 parents ab3fb0e + 205864e commit 2db7f6e
Show file tree
Hide file tree
Showing 24 changed files with 464 additions and 55 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/release_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ jobs:
shell: bash
run: |
set -e
sudo apt install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
Expand All @@ -83,6 +84,7 @@ jobs:
githubToken: ${{ github.token }}
install: |
apt-get update
apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
apt-get install -y --no-install-recommends python3 python3-pip
pip3 install -U pip pytest scikit-learn
run: |
Expand Down Expand Up @@ -138,7 +140,7 @@ jobs:
strategy:
matrix:
platform:
- runner: macos-12
- runner: macos-13
target: x86_64
- runner: macos-14
target: aarch64
Expand Down Expand Up @@ -176,6 +178,7 @@ jobs:
- name: pytest
run: |
set -e
brew install tesseract tesseract-lang
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
Expand Down
85 changes: 79 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,23 +75,96 @@ extractor.set_extract_string_max_length(1000)

# Extract text from a file
result = extractor.extract_file_to_string("README.md")
print(result)
```
* Extracting a file to a buffered stream:

```python
from extractous import Extractor

extractor = Extractor()
reader = extractor.extract_file("tests/quarkus.pdf")

# Collect the raw chunks and decode once at the end: a 4096-byte chunk can end
# in the middle of a multi-byte UTF-8 character, so decoding chunk by chunk
# may raise UnicodeDecodeError.
chunks = []
buffer = reader.read(4096)
while len(buffer) > 0:
    chunks.append(buffer)
    buffer = reader.read(4096)

result = b"".join(chunks).decode("utf-8")
print(result)
```

* Extracting a file with OCR:

You need to have Tesseract installed together with the language pack you want to use. For example, on Debian: `sudo apt install tesseract-ocr tesseract-ocr-deu`

```python
from extractous import Extractor, TesseractOcrConfig

# The OCR language ("deu") must match the language of the document being read
extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
result = extractor.extract_file_to_string("../../test_files/documents/deu-ocr.pdf")

print(result)
```

#### Rust
* Extract a file content to a string:
```rust
use extractous::Extractor;
use extractous::PdfParserConfig;

// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);
fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);

// Extract text from a file
let text = extractor.extract_file_to_string("README.md").unwrap();
println!("{}", text);
// Extract text from a file
let text = extractor.extract_file_to_string("README.md").unwrap();
println!("{}", text);
}
```

* Extract the content of a file to a `StreamReader` and perform buffered reading
```rust
use std::io::Read;
use extractous::Extractor;

fn main() {
    // Get the file path from the first command-line argument
    let file_path = std::env::args()
        .nth(1)
        .expect("usage: <program> <file_path>");

    // Extract the provided file content to a stream
    let extractor = Extractor::new();
    // `read_to_end` takes `&mut self`, so the stream binding has to be mutable
    let mut stream = extractor.extract_file(&file_path).unwrap();

    // Because stream implements the std::io::Read trait we can perform buffered reading.
    // Here we simply read the whole stream into a byte buffer.
    let mut buffer = Vec::new();
    stream.read_to_end(&mut buffer).unwrap();

    println!("{}", String::from_utf8(buffer).unwrap())
}
```

* Extract content of PDF with OCR.

You need to have Tesseract installed together with the language pack you want to use. For example, on Debian: `sudo apt install tesseract-ocr tesseract-ocr-deu`

```rust
use extractous::Extractor;

fn main() {
let file_path = "../test_files/documents/deu-ocr.pdf";

let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
// extract file with extractor
let content = extractor.extract_file_to_string(file_path).unwrap();
println!("{}", content);
}
```


## 🔥 Performance
* **Extractous** is fast. Please don't take our word for it; you can run the [benchmarks](https://github.com/yobix-ai/extractous-benchmarks) yourself. For example, when extracting content out of [sec10 filings pdf forms](https://github.com/yobix-ai/extractous-benchmarks/raw/main/dataset/sec10-filings), Extractous is on average **~18x faster** than unstructured-io:

Expand Down
11 changes: 11 additions & 0 deletions bindings/extractous-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,16 @@ while len(buffer) > 0:
result += buffer.decode("utf-8")
buffer = reader.read(4096)

print(result)
```

Extracting a file with OCR:

```python
from extractous import Extractor, TesseractOcrConfig

# The OCR language ("deu") must match the language of the document being read
extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
result = extractor.extract_file_to_string("../../test_files/documents/deu-ocr.pdf")

print(result)
```
2 changes: 1 addition & 1 deletion bindings/extractous-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "extractous"
version = '0.1.6'
version = '0.1.7'
classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
Expand Down
14 changes: 2 additions & 12 deletions bindings/extractous-python/tests/test_extract_file_to_string.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import pytest

from extractous import Extractor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine_sim
from utils import cosine_similarity

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
Expand All @@ -16,6 +15,7 @@
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
("deu-ocr.pdf", 0.9),
]

@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
Expand All @@ -31,13 +31,3 @@ def test_extract_file_to_string(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"

def cosine_similarity(text1, text2):
    """Return the cosine similarity of the token-count vectors of two texts."""
    # Fit one shared vocabulary over both texts and get a dense count matrix
    # with one row per text.
    count_matrix = CountVectorizer().fit_transform([text1, text2]).toarray()

    # Pairwise similarity matrix; entry [0][1] compares text1 with text2.
    pairwise = cosine_sim(count_matrix)
    return pairwise[0][1]
46 changes: 46 additions & 0 deletions bindings/extractous-python/tests/test_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
from utils import cosine_similarity

def test_ara_ocr_png():
    """OCR an Arabic PNG and compare the text with the stored expected result."""
    ocr_config = TesseractOcrConfig().set_language("ara")
    extractor = Extractor().set_ocr_config(ocr_config)
    result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png")

    with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
        expected = file.read()

    # A bare `assert cosine_similarity(...)` passes for any non-zero score,
    # so require an explicit similarity threshold.
    similarity = cosine_similarity(result, expected)
    assert similarity > 0.9, f"Cosine similarity {similarity} is not above 0.9 for ara-ocr.png"


def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
    """Extract the German OCR PDF with the OCR_ONLY strategy and compare with the expected text."""
    # The input document must be the one the expected result was produced from
    test_file = "../../test_files/documents/deu-ocr.pdf"
    expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"

    pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY)
    ocr_config = TesseractOcrConfig().set_language("deu")

    # Note: the builder pattern is used, each setter returns the configured extractor
    extractor = Extractor()
    extractor = extractor.set_ocr_config(ocr_config)
    extractor = extractor.set_pdf_config(pdf_config)

    result = extractor.extract_file_to_string(test_file)

    with open(expected_result_file, "r", encoding="utf8") as file:
        expected = file.read()

    # A bare `assert cosine_similarity(...)` passes for any non-zero score,
    # so require an explicit similarity threshold.
    similarity = cosine_similarity(result, expected)
    assert similarity > 0.9, f"Cosine similarity {similarity} is not above 0.9 for {test_file}"

def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
    """Extract the German OCR PDF with OCR disabled and expect no text at all."""
    test_file = "../../test_files/documents/deu-ocr.pdf"

    pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.NO_OCR)
    ocr_config = TesseractOcrConfig().set_language("deu")

    # Reuse the configs built above instead of constructing a second PdfParserConfig inline
    extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config)

    result = extractor.extract_file_to_string(test_file)

    # With the NO_OCR strategy the extraction is expected to yield only whitespace
    assert result.strip() == ""
4 changes: 2 additions & 2 deletions bindings/extractous-python/tests/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def test_extract_file_to_string():
extractor = Extractor()
result = extractor.extract_file_to_string("tests/quarkus.pdf")

print(result)
#print(result)
assert result == expected_result()


Expand All @@ -23,5 +23,5 @@ def test_extract_file():
result += b.decode("utf-8")
b = reader.read(4096)

print(result)
#print(result)
assert result == expected_result()
13 changes: 13 additions & 0 deletions bindings/extractous-python/tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine_sim

def cosine_similarity(text1, text2):
    """Return the cosine similarity of the token-count vectors of two texts."""
    # Fit one shared vocabulary over both texts and get a dense count matrix
    # with one row per text.
    count_matrix = CountVectorizer().fit_transform([text1, text2]).toarray()

    # Pairwise similarity matrix; entry [0][1] compares text1 with text2.
    pairwise = cosine_sim(count_matrix)
    return pairwise[0][1]
2 changes: 1 addition & 1 deletion extractous-core/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion extractous-core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "extractous"
version = "0.1.6"
version = "0.1.7"
edition = "2021"

description = """
Expand Down
19 changes: 19 additions & 0 deletions extractous-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,24 @@ fn main() {
}
```

* Extract content of PDF with OCR. You need to have Tesseract installed with the language pack. For example on debian `sudo apt install tesseract-ocr tesseract-ocr-deu`
* If you get `Parse error occurred : Unable to extract PDF content`, the most likely cause is that the OCR language pack is not installed
```rust
use extractous::Extractor;

fn main() {
let file_path = "../test_files/documents/deu-ocr.pdf";

let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
// extract file with extractor
let content = extractor.extract_file_to_string(file_path).unwrap();
println!("{}", content);
}
```


## Building

### Requirements
Expand All @@ -84,6 +102,7 @@ fn main() {
specific local version, you can do so by setting the GRAALVM_HOME environment variable
* We recommend using [sdkman](https://sdkman.io/install) to install GraalVM JDKs
* `sdk install java 22.0.1-graalce`
* To be able to use it from IDEA, on Ubuntu for example, add `GRAALVM_HOME=$HOME/.sdkman/candidates/java/22.0.1-graalce` to `/etc/environment`
* Confirm that GraalVM is installed correctly by running `java -version`. You should see something like:
```text
openjdk 22.0.1 2024-04-16
Expand Down
4 changes: 2 additions & 2 deletions extractous-core/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ impl Default for PdfParserConfig {
Self {
ocr_strategy: PdfOcrStrategy::AUTO,
extract_inline_images: false,
extract_unique_inline_images_only: true,
extract_unique_inline_images_only: false,
extract_marked_content: false,
extract_annotation_text: true,
}
Expand Down Expand Up @@ -71,7 +71,7 @@ impl PdfParserConfig {
/// or similar equality metric. If the PDF actually contains multiple copies of the same
/// image -- all with different object ids -- then all images will be extracted.
/// For this parameter to have any effect, extractInlineImages must be set to true.
/// Default: true.
/// Default: false.
pub fn set_extract_unique_inline_images_only(mut self, val: bool) -> Self {
self.extract_unique_inline_images_only = val;
self
Expand Down
8 changes: 7 additions & 1 deletion extractous-core/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,13 @@ impl Extractor {
/// Extracts text from a file path. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult<String> {
tika::parse_file_to_string(file_path, self.extract_string_max_length)
tika::parse_file_to_string(
file_path,
self.extract_string_max_length,
&self.pdf_config,
&self.office_config,
&self.ocr_config,
)
}
}

Expand Down
21 changes: 21 additions & 0 deletions extractous-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//! To use an extractor, you need to:
//! - [create and configure a new extractor](#create-and-config-an-extractor)
//! - [use the extractor to extract text](#extract-text)
//! - [enable OCR for the extractor](#extract-text-with-ocr)
//!
//! ## Create and config an extractor
//!
Expand Down Expand Up @@ -44,6 +45,26 @@
//! println!("{}", text);
//!
//! ```
//!
//! ## Extract text with OCR
//! * Make sure Tesseract is installed with the corresponding language packs. For example on debian `sudo apt install tesseract-ocr tesseract-ocr-deu` to install tesseract with German language pack.
//! * If you get `Parse error occurred : Unable to extract PDF content`, it is most likely that the OCR language pack is not installed
//!
//! ```no_run
//! use extractous::{Extractor, TesseractOcrConfig, PdfParserConfig, PdfOcrStrategy};
//!
//! let file_path = "../test_files/documents/deu-ocr.pdf";
//!
//! // Create a new extractor. Note it uses a consuming builder pattern
//! let extractor = Extractor::new()
//! .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
//! .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
//!
//! // extract file with extractor
//! let content = extractor.extract_file_to_string(file_path).unwrap();
//! println!("{}", content);
//!
//! ```
/// Default buffer size
pub const DEFAULT_BUF_SIZE: usize = 32768;
Expand Down
Loading

0 comments on commit 2db7f6e

Please sign in to comment.