Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
akshayballal95 committed Feb 6, 2025
2 parents 8bf89b7 + 868c93b commit 31801cf
Show file tree
Hide file tree
Showing 6 changed files with 413 additions and 962 deletions.
16 changes: 6 additions & 10 deletions examples/adapters/elastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Dict, List
from embed_anything import EmbedData
from embed_anything.vectordb import Adapter
from embed_anything import BertConfig, EmbedConfig
from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
Expand Down Expand Up @@ -67,19 +67,15 @@ def upsert(self, data: List[Dict]):
)

# Parse PDF and insert documents into Elasticsearch.
bert_config = BertConfig(
model_id="sentence-transformers/all-MiniLM-L6-v2",
chunk_size=100,
buffer_size=200,
model = EmbeddingModel.from_pretrained_hf(
WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
)

embed_config = EmbedConfig(bert=bert_config)

data = embed_anything.embed_file(
"/path/to/my-file.pdf",
embedder="Bert",
adapter=elasticsearch_adapter,
config=embed_config,
"/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf",
embedder=model,
adapter=elasticsearch_adapter
)

# Create an Index with explicit mappings.
Expand Down
267 changes: 267 additions & 0 deletions examples/adapters/lanceDB.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: embed_anything-gpu in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (0.5.1)\n",
"Requirement already satisfied: onnxruntime-gpu==1.20.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from embed_anything-gpu) (1.20.1)\n",
"Requirement already satisfied: coloredlogs in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (15.0.1)\n",
"Requirement already satisfied: flatbuffers in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (24.3.25)\n",
"Requirement already satisfied: numpy>=1.21.6 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (2.1.3)\n",
"Requirement already satisfied: packaging in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (24.2)\n",
"Requirement already satisfied: protobuf in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (5.28.3)\n",
"Requirement already satisfied: sympy in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (1.13.3)\n",
"Requirement already satisfied: humanfriendly>=9.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from coloredlogs->onnxruntime-gpu==1.20.1->embed_anything-gpu) (10.0)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from sympy->onnxruntime-gpu==1.20.1->embed_anything-gpu) (1.3.0)\n"
]
}
],
"source": [
"!pip install embed_anything-gpu\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import embed_anything\n",
"import os\n",
"\n",
"from typing import Dict, List\n",
"from embed_anything import EmbedData\n",
"from embed_anything.vectordb import Adapter\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: lancedb in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (0.18.0)\n",
"Requirement already satisfied: deprecation in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (2.1.0)\n",
"Requirement already satisfied: pylance==0.22.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (0.22.0)\n",
"Requirement already satisfied: tqdm>=4.27.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (4.66.5)\n",
"Requirement already satisfied: pydantic>=1.10 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (2.9.2)\n",
"Requirement already satisfied: packaging in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (24.2)\n",
"Requirement already satisfied: overrides>=0.7 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (7.7.0)\n",
"Requirement already satisfied: pyarrow>=14 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pylance==0.22.0->lancedb) (17.0.0)\n",
"Requirement already satisfied: numpy>=1.22 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pylance==0.22.0->lancedb) (2.1.3)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.23.4 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (2.23.4)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (4.12.2)\n"
]
}
],
"source": [
"!pip install lancedb"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from embed_anything.vectordb import Adapter\n",
"from uuid import uuid4\n",
"import lancedb\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# table = self.connection.create_table(\"docs\", docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from lancedb.pydantic import Vector, LanceModel\n",
"\n",
"class Item(LanceModel):\n",
" vector: Vector(2)\n",
" item: str\n",
" price: float\n"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"class LanceAdapter(Adapter):\n",
"    \"\"\"Adapter that writes EmbedAnything embeddings into a LanceDB table.\n",
"\n",
"    Args:\n",
"        db_path: Location handed to ``lancedb.connect``.\n",
"        embedding_dimension: Length of every embedding vector; it fixes the\n",
"            width of the ``embeddings`` column in the table schema.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, db_path: str, embedding_dimension: int):\n",
"        from lancedb.pydantic import Vector, LanceModel\n",
"\n",
"        self.db_path = db_path\n",
"        self.connection = lancedb.connect(self.db_path)\n",
"        self.dimension = embedding_dimension\n",
"\n",
"        # The schema classes are built per instance because the vector width\n",
"        # depends on ``embedding_dimension``.\n",
"        class Metadata(LanceModel):\n",
"            file_name: str\n",
"            modified: str\n",
"            created: str\n",
"\n",
"        class Item(LanceModel):\n",
"            embeddings: Vector(self.dimension)\n",
"            text: str\n",
"            # metadata: Metadata  (not stored yet; see ``convert``)\n",
"\n",
"        self.Metadata = Metadata\n",
"        self.Item = Item\n",
"\n",
"    def create_index(self, table_name: str, exist_ok: bool = False):\n",
"        \"\"\"Create the table ``table_name`` with the ``Item`` schema.\n",
"\n",
"        Args:\n",
"            table_name: Name of the LanceDB table to create.\n",
"            exist_ok: Forwarded to ``create_table``. With the default\n",
"                ``False`` an already existing table raises, as before;\n",
"                pass ``True`` to reuse the existing table instead.\n",
"        \"\"\"\n",
"        self.table_name = table_name\n",
"        self.connection = lancedb.connect(self.db_path)\n",
"        self.table = self.connection.create_table(\n",
"            table_name,\n",
"            schema=self.Item.to_arrow_schema(),\n",
"            exist_ok=exist_ok,\n",
"        )\n",
"\n",
"    def convert(self, embeddings: List[EmbedData]) -> List[Dict]:\n",
"        \"\"\"Turn ``EmbedData`` objects into row dicts matching the ``Item`` schema.\n",
"\n",
"        Only ``text`` and ``embedding`` are copied; file metadata is not\n",
"        written because the ``metadata`` column is disabled in ``Item``.\n",
"        \"\"\"\n",
"        return [\n",
"            {\"text\": item.text, \"embeddings\": item.embedding}\n",
"            for item in embeddings\n",
"        ]\n",
"\n",
"    def delete_index(self, table_name: str):\n",
"        \"\"\"Drop the table ``table_name`` from the connected database.\"\"\"\n",
"        self.connection.drop_table(table_name)\n",
"\n",
"    def upsert(self, data: List[EmbedData]):\n",
"        \"\"\"Convert ``data`` to rows and append them to the current table.\n",
"\n",
"        ``create_index`` must have been called first so ``self.table`` exists.\n",
"        \"\"\"\n",
"        self.table.add(self.convert(data))\n"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"lance_adapter.delete_index(\"docs\")"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"# elasticsearch_adapter = ElasticsearchAdapter(\n",
"# api_key=elastic_api_key,\n",
"# cloud_id=elastic_cloud_id,\n",
"# index_name=index_name,\n",
"# )\n",
"lance_adapter = LanceAdapter(db_path=\"tmp/lancedb\", embedding_dimension=384)\n",
"lance_adapter.create_index(\"docs\")"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading weights from \"/home/sonamAI/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L12-v2/snapshots/a05860a77cef7b37e0048a7864658139bc18a854/model.safetensors\"\n"
]
}
],
"source": [
"from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel\n",
"\n",
"model = EmbeddingModel.from_pretrained_hf(\n",
" WhichModel.Bert, model_id=\"sentence-transformers/all-MiniLM-L12-v2\"\n",
")\n",
"\n",
"\n",
"data = embed_anything.embed_file(\n",
" \"/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf\",\n",
" embedder=model,\n",
" adapter=lance_adapter\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"query_vec = embed_anything.embed_query(['attention'], embedder = model)[0].embedding\n",
"docs = lance_adapter.table.search(query_vec).limit(5).to_pandas()[\"text\"]"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2 Background The goal of reducing sequential c...\n",
"1 <EOS><pad> Input-Input Layer5TheLawwillneverbe...\n",
"2 MultiHead(Q, K, V ) = Concat(head1, ..., headh...\n",
"3 In contrast to RNN sequence-to-sequence models...\n",
"4 Convolutional layers are generally more expens...\n",
"Name: text, dtype: object"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "unsloth",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
17 changes: 11 additions & 6 deletions examples/adapters/pinecone_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,22 @@ def upsert(self, data: List[Dict]):
pinecone_adapter.create_index(dimension=512, metric="cosine")


clip_model = EmbeddingModel.from_pretrained_hf(
WhichModel.Clip, "openai/clip-vit-base-patch16", revision="main"
model = EmbeddingModel.from_pretrained_hf(
WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
)


data = embed_anything.embed_file(
"/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf",
embedder=model,
adapter=pinecone_adapter,
)

embed_config = TextEmbedConfig(chunk_size=512, batch_size=32, buffer_size=200)


data = embed_anything.embed_image_directory(
"test_files",
embedder=clip_model,
adapter=pinecone_adapter,
config=embed_config,
embedder=model,
adapter=pinecone_adapter
)
print(data)
14 changes: 7 additions & 7 deletions examples/adapters/weaviate_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import weaviate.classes as wvc
from tqdm.auto import tqdm
import embed_anything
from embed_anything import EmbedData
from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel
from embed_anything.vectordb import Adapter
import textwrap

Expand Down Expand Up @@ -57,15 +57,15 @@ def delete_index(self, index_name: str):
weaviate_adapter.create_index("Test_index")


# model id and embed image directory
model = embed_anything.EmbeddingModel.from_pretrained_hf(
embed_anything.WhichModel.Bert,
model_id="sentence-transformers/all-MiniLM-L12-v2",
model = EmbeddingModel.from_pretrained_hf(
WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
)


data = embed_anything.embed_directory(
"test_files", embedder=model, adapter=weaviate_adapter
data = embed_anything.embed_file(
"/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf",
embedder=model,
adapter=weaviate_adapter,
)

query_vector = embed_anything.embed_query(["What is attention"], embedder=model)[
Expand Down
Loading

0 comments on commit 31801cf

Please sign in to comment.