-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' of https://github.com/StarlightSearch/EmbedAnything
- Loading branch information
Showing
6 changed files
with
413 additions
and
962 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,267 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: embed_anything-gpu in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (0.5.1)\n", | ||
"Requirement already satisfied: onnxruntime-gpu==1.20.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from embed_anything-gpu) (1.20.1)\n", | ||
"Requirement already satisfied: coloredlogs in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (15.0.1)\n", | ||
"Requirement already satisfied: flatbuffers in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (24.3.25)\n", | ||
"Requirement already satisfied: numpy>=1.21.6 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (2.1.3)\n", | ||
"Requirement already satisfied: packaging in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (24.2)\n", | ||
"Requirement already satisfied: protobuf in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (5.28.3)\n", | ||
"Requirement already satisfied: sympy in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from onnxruntime-gpu==1.20.1->embed_anything-gpu) (1.13.3)\n", | ||
"Requirement already satisfied: humanfriendly>=9.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from coloredlogs->onnxruntime-gpu==1.20.1->embed_anything-gpu) (10.0)\n", | ||
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from sympy->onnxruntime-gpu==1.20.1->embed_anything-gpu) (1.3.0)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!pip install embed_anything-gpu\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import embed_anything\n", | ||
"import os\n", | ||
"\n", | ||
"from typing import Dict, List\n", | ||
"from embed_anything import EmbedData\n", | ||
"from embed_anything.vectordb import Adapter\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Requirement already satisfied: lancedb in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (0.18.0)\n", | ||
"Requirement already satisfied: deprecation in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (2.1.0)\n", | ||
"Requirement already satisfied: pylance==0.22.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (0.22.0)\n", | ||
"Requirement already satisfied: tqdm>=4.27.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (4.66.5)\n", | ||
"Requirement already satisfied: pydantic>=1.10 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (2.9.2)\n", | ||
"Requirement already satisfied: packaging in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (24.2)\n", | ||
"Requirement already satisfied: overrides>=0.7 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from lancedb) (7.7.0)\n", | ||
"Requirement already satisfied: pyarrow>=14 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pylance==0.22.0->lancedb) (17.0.0)\n", | ||
"Requirement already satisfied: numpy>=1.22 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pylance==0.22.0->lancedb) (2.1.3)\n", | ||
"Requirement already satisfied: annotated-types>=0.6.0 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (0.7.0)\n", | ||
"Requirement already satisfied: pydantic-core==2.23.4 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (2.23.4)\n", | ||
"Requirement already satisfied: typing-extensions>=4.6.1 in /home/sonamAI/miniconda3/envs/unsloth/lib/python3.11/site-packages (from pydantic>=1.10->lancedb) (4.12.2)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!pip install lancedb" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from embed_anything.vectordb import Adapter\n", | ||
"from uuid import uuid4\n", | ||
"import lancedb\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# table = self.connection.create_table(\"docs\", docs)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from lancedb.pydantic import Vector, LanceModel\n", | ||
"\n", | ||
"class Item(LanceModel):\n", | ||
" vector: Vector(2)\n", | ||
" item: str\n", | ||
" price: float\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 118, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"class LanceAdapter(Adapter):\n", | ||
" def __init__(self, db_path: str, embedding_dimension: int):\n", | ||
" from lancedb.pydantic import Vector, LanceModel\n", | ||
"\n", | ||
" self.db_path = db_path\n", | ||
" self.connection = lancedb.connect(self.db_path)\n", | ||
" self.dimension = embedding_dimension\n", | ||
" \n", | ||
" class Metadata(LanceModel):\n", | ||
" file_name: str\n", | ||
" modified: str\n", | ||
" created: str\n", | ||
" class Item(LanceModel):\n", | ||
" embeddings: Vector(self.dimension)\n", | ||
" text: str\n", | ||
" # metadata: Metadata\n", | ||
" self.Metadata = Metadata\n", | ||
" self.Item = Item\n", | ||
"\n", | ||
" def create_index(self, table_name: str):\n", | ||
" self.table_name = table_name\n", | ||
" self.connection = lancedb.connect(self.db_path)\n", | ||
" self.table = self.connection.create_table(table_name, schema=self.Item.to_arrow_schema())\n", | ||
"\n", | ||
" def convert(self, embeddings: List[List[EmbedData]]) -> List[Dict]:\n", | ||
" data = []\n", | ||
" for embedding in embeddings:\n", | ||
" data.append(\n", | ||
" {\n", | ||
" \"text\": embedding.text,\n", | ||
" \"embeddings\": embedding.embedding,\n", | ||
" # \"metadata\": self.Metadata(\n", | ||
" # file_name=embedding.metadata[\"file_name\"],\n", | ||
" # modified=embedding.metadata[\"modified\"],\n", | ||
" # created=embedding.metadata[\"created\"],\n", | ||
" # ),\n", | ||
" }\n", | ||
" )\n", | ||
" return data\n", | ||
" \n", | ||
" def delete_index(self, table_name: str):\n", | ||
" self.connection.drop_table(table_name)\n", | ||
"\n", | ||
" def upsert(self, data: EmbedData):\n", | ||
" self.table.add(self.convert(data))\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 119, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"lance_adapter.delete_index(\"docs\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 120, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# elasticsearch_adapter = ElasticsearchAdapter(\n", | ||
"# api_key=elastic_api_key,\n", | ||
"# cloud_id=elastic_cloud_id,\n", | ||
"# index_name=index_name,\n", | ||
"# )\n", | ||
"lance_adapter = LanceAdapter(db_path=\"tmp/lancedb\", embedding_dimension=384)\n", | ||
"lance_adapter.create_index(\"docs\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 121, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Loading weights from \"/home/sonamAI/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L12-v2/snapshots/a05860a77cef7b37e0048a7864658139bc18a854/model.safetensors\"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel\n", | ||
"\n", | ||
"model = EmbeddingModel.from_pretrained_hf(\n", | ||
" WhichModel.Bert, model_id=\"sentence-transformers/all-MiniLM-L12-v2\"\n", | ||
")\n", | ||
"\n", | ||
"\n", | ||
"data = embed_anything.embed_file(\n", | ||
" \"/home/sonamAI/projects/EmbedAnything/test_files/attention.pdf\",\n", | ||
" embedder=model,\n", | ||
" adapter=lance_adapter\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 122, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"query_vec = embed_anything.embed_query(['attention'], embedder = model)[0].embedding\n", | ||
"docs = lance_adapter.table.search(query_vec).limit(5).to_pandas()[\"text\"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 123, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"0 2 Background The goal of reducing sequential c...\n", | ||
"1 <EOS><pad> Input-Input Layer5TheLawwillneverbe...\n", | ||
"2 MultiHead(Q, K, V ) = Concat(head1, ..., headh...\n", | ||
"3 In contrast to RNN sequence-to-sequence models...\n", | ||
"4 Convolutional layers are generally more expens...\n", | ||
"Name: text, dtype: object" | ||
] | ||
}, | ||
"execution_count": 123, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"docs" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "unsloth", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.