Skip to content

Commit

Permalink
Merge pull request #69 from akshayballal95/refactor
Browse files Browse the repository at this point in the history
Add audio examples
  • Loading branch information
akshayballal95 authored Sep 1, 2024
2 parents 905f829 + d317a8e commit e460154
Show file tree
Hide file tree
Showing 21 changed files with 369 additions and 562 deletions.
24 changes: 14 additions & 10 deletions examples/adapters/pinecone.py → examples/adapters/pinecone_db.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import re
from typing import Dict, List
import uuid
import embed_anything
import os

from embed_anything.vectordb import Adapter
from pinecone import Pinecone, ServerlessSpec

from embed_anything import BertConfig, EmbedConfig
from embed_anything import EmbedData, EmbeddingModel, WhichModel, TextEmbedConfig


class PineconeAdapter(Adapter):
Expand Down Expand Up @@ -90,6 +93,7 @@ def upsert(self, data: List[Dict]):
Raises:
ValueError: If the index has not been created before upserting data.
"""
data = self.convert(data)
if not self.index_name:
raise ValueError("Index must be created before upserting data")
self.pc.Index(name=self.index_name).upsert(data)
Expand All @@ -107,18 +111,18 @@ def upsert(self, data: List[Dict]):

# Initialize the PineconeAdapter class

pinecone_adapter.create_index(dimension=1536, metric="cosine")
pinecone_adapter.create_index(dimension=384, metric="cosine")

bert_config = BertConfig(
model_id="sentence-transformers/all-MiniLM-L12-v2", chunk_size=100
bert_model = EmbeddingModel.from_pretrained_hf(
WhichModel.Bert, "sentence-transformers/all-MiniLM-L12-v2", revision="main"
)
embed_config = EmbedConfig(bert=bert_config)

# Embed the audio files
# Replace the line with a valid code snippet or remove it if not needed
data = embed_anything.embed_file(
"/content/EmbedAnything/test_files/test.pdf",
embeder="Bert",
embed_config = TextEmbedConfig(chunk_size=256, batch_size=32)


data = embed_anything.embed_directory(
"test_files",
embeder=bert_model,
adapter=pinecone_adapter,
config=embed_config,
)
Expand Down
24 changes: 12 additions & 12 deletions examples/adapters/weaviate.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"from tqdm.auto import tqdm\n",
"import embed_anything\n",
"from embed_anything import EmbedData\n",
"from embed_anything.vectordb import Adapter\n"
"from embed_anything.vectordb import Adapter"
]
},
{
Expand Down Expand Up @@ -49,13 +49,12 @@
" print(\"Weaviate is ready\")\n",
"\n",
" def create_index(self, index_name: str):\n",
" self.index_name= index_name\n",
" self.collection= self.client.collections.create(\n",
" self.index_name = index_name\n",
" self.collection = self.client.collections.create(\n",
" index_name, vectorizer_config=wvc.config.Configure.Vectorizer.none()\n",
" )\n",
" return self.collection\n",
"\n",
"\n",
" def convert(self, embeddings: List[EmbedData]):\n",
" data = []\n",
" for embedding in embeddings:\n",
Expand All @@ -65,13 +64,12 @@
" wvc.data.DataObject(properties=property, vector=embedding.embedding)\n",
" )\n",
" return data\n",
" \n",
"\n",
" def upsert(self, embeddings):\n",
" self.client.collections.get(self.index_name).data.insert_many(embeddings)\n",
"\n",
" def delete_index(self, index_name: str):\n",
" self.client.collections.delete(index_name)\n",
" "
" self.client.collections.delete(index_name)"
]
},
{
Expand Down Expand Up @@ -212,7 +210,9 @@
}
],
"source": [
"query_vector = embed_anything.embed_query([\"What is self attention\"], embeder = \"OpenAI\")[0].embedding"
"query_vector = embed_anything.embed_query([\"What is self attention\"], embeder=\"OpenAI\")[\n",
" 0\n",
"].embedding"
]
},
{
Expand All @@ -222,10 +222,10 @@
"outputs": [],
"source": [
"response = weaviate_adapter.collection.query.near_vector(\n",
" near_vector=query_vector,\n",
" limit=2,\n",
" return_metadata=wvc.query.MetadataQuery(certainty=True)\n",
" )"
" near_vector=query_vector,\n",
" limit=2,\n",
" return_metadata=wvc.query.MetadataQuery(certainty=True),\n",
")"
]
},
{
Expand Down
30 changes: 19 additions & 11 deletions examples/audio.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
import embed_anything
from embed_anything import JinaConfig, EmbedConfig, AudioDecoderConfig
from embed_anything import (
AudioDecoderModel,
EmbeddingModel,
embed_audio_file,
TextEmbedConfig,
)
import time

start_time = time.time()

# Choose any Whisper or Distil-Whisper model from https://huggingface.co/distil-whisper or https://huggingface.co/collections/openai/whisper-release-6501bba2cf999715fd953013
audio_decoder_config = AudioDecoderConfig(
decoder_model_id="openai/whisper-tiny.en",
decoder_revision="main",
model_type="tiny-en",
quantized=False,
audio_decoder = AudioDecoderModel.from_pretrained_hf(
"openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False
)
jina_config = JinaConfig(
model_id="jinaai/jina-embeddings-v2-small-en", revision="main", chunk_size=100

embeder = EmbeddingModel.from_pretrained_hf(
embed_anything.WhichModel.Bert,
model_id="sentence-transformers/all-MiniLM-L6-v2",
revision="main",
)

config = EmbedConfig(jina=jina_config, audio_decoder=audio_decoder_config)
data = embed_anything.embed_file(
"test_files/audio/samples_hp0.wav", embeder="Audio", config=config
config = TextEmbedConfig(chunk_size=200, batch_size=32)
data = embed_anything.embed_audio_file(
"test_files/audio/samples_hp0.wav",
audio_decoder=audio_decoder,
embeder=embeder,
text_embed_config=config,
)
print(data[0].metadata)
end_time = time.time()
Expand Down
8 changes: 4 additions & 4 deletions examples/bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
import uuid
import time
import embed_anything
from embed_anything import EmbeddingModel, WhichModel
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel
from embed_anything.vectordb import Adapter
from pinecone import Pinecone, ServerlessSpec
import os


model = EmbeddingModel.from_pretrained_local(
model = EmbeddingModel.from_pretrained_hf(
WhichModel.Bert, model_id="prithivida/miniMiracle_te_v1"
)

data = embed_anything.embed_file("test_files/test.pdf", embeder=model)
config = TextEmbedConfig(chunk_size=200, batch_size=32)
data = embed_anything.embed_file("test_files/test.pdf", embeder=model, config=config)
print(data[0].embedding)
2 changes: 1 addition & 1 deletion examples/clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

start = time.time()

model = embed_anything.EmbeddingModel.from_pretrained_local(
model = embed_anything.EmbeddingModel.from_pretrained_hf(
embed_anything.WhichModel.Clip,
model_id="openai/clip-vit-base-patch16",
# revision="refs/pr/15",
Expand Down
25 changes: 0 additions & 25 deletions examples/pinecone.py

This file was deleted.

Loading

0 comments on commit e460154

Please sign in to comment.