Merge branch 'main' of https://github.com/StarlightSearch/EmbedAnything

StarlightSearch · Feb 6, 2025 · 31801cf · 31801cf
2 parents 8bf89b7 + 868c93b
commit 31801cf
Show file tree

Hide file tree

Showing 6 changed files with 413 additions and 962 deletions.
diff --git a/examples/adapters/elastic.py b/examples/adapters/elastic.py
@@ -4,7 +4,7 @@
 from typing import Dict, List
 from embed_anything import EmbedData
 from embed_anything.vectordb import Adapter
-from embed_anything import BertConfig, EmbedConfig
+from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel
 
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
@@ -67,19 +67,15 @@ def upsert(self, data: List[Dict]):
 )
 
 # Prase PDF and insert documents into Elasticsearch.
-bert_config = BertConfig(
-    model_id="sentence-transformers/all-MiniLM-L6-v2",
-    chunk_size=100,
-    buffer_size=200,
+model = EmbeddingModel.from_pretrained_hf(
+    WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
 )
 
-embed_config = EmbedConfig(bert=bert_config)
 
 data = embed_anything.embed_file(
-    "/path/to/my-file.pdf",
-    embedder="Bert",
-    adapter=elasticsearch_adapter,
-    config=embed_config,
+    "/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf",
+    embedder=model,
+    adapter=elasticsearch_adapter
 )
 
 # Create an Index with explicit mappings.

diff --git a/examples/adapters/lanceDB.ipynb b/examples/adapters/lanceDB.ipynb
@@ -0,0 +1,267 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: embed_anything-gpu in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (0.5.1)\n",
+      "Requirement already satisfied: onnxruntime-gpu==1.20.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from embed_anything-gpu) (1.20.1)\n",
+      "Requirement already satisfied: coloredlogs in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (15.0.1)\n",
+      "Requirement already satisfied: flatbuffers in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (24.3.25)\n",
+      "Requirement already satisfied: numpy>=1.21.6 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (2.1.3)\n",
+      "Requirement already satisfied: packaging in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (24.2)\n",
+      "Requirement already satisfied: protobuf in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (5.28.3)\n",
+      "Requirement already satisfied: sympy in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (1.13.3)\n",
+      "Requirement already satisfied: humanfriendly>=9.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from coloredlogs->onnxruntime-gpu==1.20.1->embed_anything-gpu) (10.0)\n",
+      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from sympy->onnxruntime-gpu==1.20.1->embed_anything-gpu) (1.3.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install embed_anything-gpu\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import embed_anything\n",
+    "import os\n",
+    "\n",
+    "from typing import Dict, List\n",
+    "from embed_anything import EmbedData\n",
+    "from embed_anything.vectordb import Adapter\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: lancedb in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (0.18.0)\n",
+      "Requirement already satisfied: deprecation in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (2.1.0)\n",
+      "Requirement already satisfied: pylance==0.22.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (0.22.0)\n",
+      "Requirement already satisfied: tqdm>=4.27.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (4.66.5)\n",
+      "Requirement already satisfied: pydantic>=1.10 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (2.9.2)\n",
+      "Requirement already satisfied: packaging in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (24.2)\n",
+      "Requirement already satisfied: overrides>=0.7 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (7.7.0)\n",
+      "Requirement already satisfied: pyarrow>=14 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pylance==0.22.0->lancedb) (17.0.0)\n",
+      "Requirement already satisfied: numpy>=1.22 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pylance==0.22.0->lancedb) (2.1.3)\n",
+      "Requirement already satisfied: annotated-types>=0.6.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (0.7.0)\n",
+      "Requirement already satisfied: pydantic-core==2.23.4 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (2.23.4)\n",
+      "Requirement already satisfied: typing-extensions>=4.6.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (4.12.2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install lancedb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from embed_anything.vectordb import Adapter\n",
+    "from uuid import uuid4\n",
+    "import lancedb\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# table = self.connection.create_table(\"docs\", docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lancedb.pydantic import Vector, LanceModel\n",
+    "\n",
+    "class Item(LanceModel):\n",
+    "    vector: Vector(2)\n",
+    "    item: str\n",
+    "    price: float\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class LanceAdapter(Adapter):\n",
+    "    def __init__(self, db_path: str, embedding_dimension: int):\n",
+    "        from lancedb.pydantic import Vector, LanceModel\n",
+    "\n",
+    "        self.db_path = db_path\n",
+    "        self.connection = lancedb.connect(self.db_path)\n",
+    "        self.dimension = embedding_dimension\n",
+    "        \n",
+    "        class Metadata(LanceModel):\n",
+    "            file_name: str\n",
+    "            modified: str\n",
+    "            created: str\n",
+    "        class Item(LanceModel):\n",
+    "            embeddings: Vector(self.dimension)\n",
+    "            text: str\n",
+    "            # metadata: Metadata\n",
+    "        self.Metadata = Metadata\n",
+    "        self.Item = Item\n",
+    "\n",
+    "    def create_index(self, table_name: str):\n",
+    "        self.table_name = table_name\n",
+    "        self.connection = lancedb.connect(self.db_path)\n",
+    "        self.table = self.connection.create_table(table_name, schema=self.Item.to_arrow_schema())\n",
+    "\n",
+    "    def convert(self, embeddings: List[List[EmbedData]]) -> List[Dict]:\n",
+    "        data = []\n",
+    "        for embedding in embeddings:\n",
+    "            data.append(\n",
+    "                {\n",
+    "                    \"text\": embedding.text,\n",
+    "                    \"embeddings\": embedding.embedding,\n",
+    "                    # \"metadata\": self.Metadata(\n",
+    "                    #     file_name=embedding.metadata[\"file_name\"],\n",
+    "                    #     modified=embedding.metadata[\"modified\"],\n",
+    "                    #     created=embedding.metadata[\"created\"],\n",
+    "                    # ),\n",
+    "                }\n",
+    "            )\n",
+    "        return data\n",
+    "    \n",
+    "    def delete_index(self, table_name: str):\n",
+    "        self.connection.drop_table(table_name)\n",
+    "\n",
+    "    def upsert(self, data: EmbedData):\n",
+    "        self.table.add(self.convert(data))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lance_adapter.delete_index(\"docs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# elasticsearch_adapter = ElasticsearchAdapter(\n",
+    "#     api_key=elastic_api_key,\n",
+    "#     cloud_id=elastic_cloud_id,\n",
+    "#     index_name=index_name,\n",
+    "# )\n",
+    "lance_adapter = LanceAdapter(db_path=\"tmp/lancedb\",  embedding_dimension=384)\n",
+    "lance_adapter.create_index(\"docs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading weights from \"/home/sonamAI/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L12-v2/snapshots/a05860a77cef7b37e0048a7864658139bc18a854/model.safetensors\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel\n",
+    "\n",
+    "model = EmbeddingModel.from_pretrained_hf(\n",
+    "    WhichModel.Bert, model_id=\"sentence-transformers/all-MiniLM-L12-v2\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "data = embed_anything.embed_file(\n",
+    "    \"/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf\",\n",
+    "    embedder=model,\n",
+    "    adapter=lance_adapter\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_vec = embed_anything.embed_query(['attention'], embedder = model)[0].embedding\n",
+    "docs = lance_adapter.table.search(query_vec).limit(5).to_pandas()[\"text\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    2 Background The goal of reducing sequential c...\n",
+       "1    <EOS><pad> Input-Input Layer5TheLawwillneverbe...\n",
+       "2    MultiHead(Q, K, V ) = Concat(head1, ..., headh...\n",
+       "3    In contrast to RNN sequence-to-sequence models...\n",
+       "4    Convolutional layers are generally more expens...\n",
+       "Name: text, dtype: object"
+      ]
+     },
+     "execution_count": 123,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "unsloth",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/adapters/pinecone_db.py b/examples/adapters/pinecone_db.py
@@ -114,17 +114,22 @@ def upsert(self, data: List[Dict]):
 pinecone_adapter.create_index(dimension=512, metric="cosine")
 
 
-clip_model = EmbeddingModel.from_pretrained_hf(
-    WhichModel.Clip, "openai/clip-vit-base-patch16", revision="main"
+model = EmbeddingModel.from_pretrained_hf(
+    WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
+)
+
+
+data = embed_anything.embed_file(
+    "/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf",
+    embedder=model,
+    adapter=pinecone_adapter,
 )
 
-embed_config = TextEmbedConfig(chunk_size=512, batch_size=32, buffer_size=200)
 
 
 data = embed_anything.embed_image_directory(
     "test_files",
-    embedder=clip_model,
-    adapter=pinecone_adapter,
-    config=embed_config,
+    embedder=model,
+    adapter=pinecone_adapter
 )
 print(data)
diff --git a/examples/adapters/weaviate_db.py b/examples/adapters/weaviate_db.py
@@ -2,7 +2,7 @@
 import weaviate.classes as wvc
 from tqdm.auto import tqdm
 import embed_anything
-from embed_anything import EmbedData
+from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel
 from embed_anything.vectordb import Adapter
 import textwrap
 
@@ -57,15 +57,15 @@ def delete_index(self, index_name: str):
 weaviate_adapter.create_index("Test_index")
 
 
-# model id and embed image directory
-model = embed_anything.EmbeddingModel.from_pretrained_hf(
-    embed_anything.WhichModel.Bert,
-    model_id="sentence-transformers/all-MiniLM-L12-v2",
+model = EmbeddingModel.from_pretrained_hf(
+    WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L12-v2"
 )
 
 
-data = embed_anything.embed_directory(
-    "test_files", embedder=model, adapter=weaviate_adapter
+data = embed_anything.embed_file(
+    "/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf",
+    embedder=model,
+    adapter=weaviate_adapter,
 )
 
 query_vector = embed_anything.embed_query(["What is attention"], embedder=model)[