Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add audio examples #69

Merged
merged 6 commits into from
Sep 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions examples/adapters/pinecone.py → examples/adapters/pinecone_db.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import re
from typing import Dict, List
import uuid
import embed_anything
import os

from embed_anything.vectordb import Adapter
from pinecone import Pinecone, ServerlessSpec

from embed_anything import BertConfig, EmbedConfig
from embed_anything import EmbedData, EmbeddingModel, WhichModel, TextEmbedConfig


class PineconeAdapter(Adapter):
Expand Down Expand Up @@ -90,6 +93,7 @@ def upsert(self, data: List[Dict]):
Raises:
ValueError: If the index has not been created before upserting data.
"""
data = self.convert(data)
if not self.index_name:
raise ValueError("Index must be created before upserting data")
self.pc.Index(name=self.index_name).upsert(data)
Expand All @@ -107,18 +111,18 @@ def upsert(self, data: List[Dict]):

# Initialize the PineconeEmbedder class

pinecone_adapter.create_index(dimension=1536, metric="cosine")
pinecone_adapter.create_index(dimension=384, metric="cosine")

bert_config = BertConfig(
model_id="sentence-transformers/all-MiniLM-L12-v2", chunk_size=100
bert_model = EmbeddingModel.from_pretrained_hf(
WhichModel.Bert, "sentence-transformers/all-MiniLM-L12-v2", revision="main"
)
embed_config = EmbedConfig(bert=bert_config)

# Embed the audio files
# Replace the line with a valid code snippet or remove it if not needed
data = embed_anything.embed_file(
"/content/EmbedAnything/test_files/test.pdf",
embeder="Bert",
embed_config = TextEmbedConfig(chunk_size=256, batch_size=32)


data = embed_anything.embed_directory(
"test_files",
embeder=bert_model,
adapter=pinecone_adapter,
config=embed_config,
)
Expand Down
24 changes: 12 additions & 12 deletions examples/adapters/weaviate.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"from tqdm.auto import tqdm\n",
"import embed_anything\n",
"from embed_anything import EmbedData\n",
"from embed_anything.vectordb import Adapter\n"
"from embed_anything.vectordb import Adapter"
]
},
{
Expand Down Expand Up @@ -49,13 +49,12 @@
" print(\"Weaviate is ready\")\n",
"\n",
" def create_index(self, index_name: str):\n",
" self.index_name= index_name\n",
" self.collection= self.client.collections.create(\n",
" self.index_name = index_name\n",
" self.collection = self.client.collections.create(\n",
" index_name, vectorizer_config=wvc.config.Configure.Vectorizer.none()\n",
" )\n",
" return self.collection\n",
"\n",
"\n",
" def convert(self, embeddings: List[EmbedData]):\n",
" data = []\n",
" for embedding in embeddings:\n",
Expand All @@ -65,13 +64,12 @@
" wvc.data.DataObject(properties=property, vector=embedding.embedding)\n",
" )\n",
" return data\n",
" \n",
"\n",
" def upsert(self, embeddings):\n",
" self.client.collections.get(self.index_name).data.insert_many(embeddings)\n",
"\n",
" def delete_index(self, index_name: str):\n",
" self.client.collections.delete(index_name)\n",
" "
" self.client.collections.delete(index_name)"
]
},
{
Expand Down Expand Up @@ -212,7 +210,9 @@
}
],
"source": [
"query_vector = embed_anything.embed_query([\"What is self attention\"], embeder = \"OpenAI\")[0].embedding"
"query_vector = embed_anything.embed_query([\"What is self attention\"], embeder=\"OpenAI\")[\n",
" 0\n",
"].embedding"
]
},
{
Expand All @@ -222,10 +222,10 @@
"outputs": [],
"source": [
"response = weaviate_adapter.collection.query.near_vector(\n",
" near_vector=query_vector,\n",
" limit=2,\n",
" return_metadata=wvc.query.MetadataQuery(certainty=True)\n",
" )"
" near_vector=query_vector,\n",
" limit=2,\n",
" return_metadata=wvc.query.MetadataQuery(certainty=True),\n",
")"
]
},
{
Expand Down
30 changes: 19 additions & 11 deletions examples/audio.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
import embed_anything
from embed_anything import JinaConfig, EmbedConfig, AudioDecoderConfig
from embed_anything import (
AudioDecoderModel,
EmbeddingModel,
embed_audio_file,
TextEmbedConfig,
)
import time

start_time = time.time()

# choose any whisper or distilwhisper model from https://huggingface.co/distil-whisper or https://huggingface.co/collections/openai/whisper-release-6501bba2cf999715fd953013
audio_decoder_config = AudioDecoderConfig(
decoder_model_id="openai/whisper-tiny.en",
decoder_revision="main",
model_type="tiny-en",
quantized=False,
audio_decoder = AudioDecoderModel.from_pretrained_hf(
"openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False
)
jina_config = JinaConfig(
model_id="jinaai/jina-embeddings-v2-small-en", revision="main", chunk_size=100

embeder = EmbeddingModel.from_pretrained_hf(
embed_anything.WhichModel.Bert,
model_id="sentence-transformers/all-MiniLM-L6-v2",
revision="main",
)

config = EmbedConfig(jina=jina_config, audio_decoder=audio_decoder_config)
data = embed_anything.embed_file(
"test_files/audio/samples_hp0.wav", embeder="Audio", config=config
config = TextEmbedConfig(chunk_size=200, batch_size=32)
data = embed_anything.embed_audio_file(
"test_files/audio/samples_hp0.wav",
audio_decoder=audio_decoder,
embeder=embeder,
text_embed_config=config,
)
print(data[0].metadata)
end_time = time.time()
Expand Down
8 changes: 4 additions & 4 deletions examples/bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
import uuid
import time
import embed_anything
from embed_anything import EmbeddingModel, WhichModel
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel
from embed_anything.vectordb import Adapter
from pinecone import Pinecone, ServerlessSpec
import os


model = EmbeddingModel.from_pretrained_local(
model = EmbeddingModel.from_pretrained_hf(
WhichModel.Bert, model_id="prithivida/miniMiracle_te_v1"
)

data = embed_anything.embed_file("test_files/test.pdf", embeder=model)
config = TextEmbedConfig(chunk_size=200, batch_size=32)
data = embed_anything.embed_file("test_files/test.pdf", embeder=model, config=config)
print(data[0].embedding)
2 changes: 1 addition & 1 deletion examples/clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

start = time.time()

model = embed_anything.EmbeddingModel.from_pretrained_local(
model = embed_anything.EmbeddingModel.from_pretrained_hf(
embed_anything.WhichModel.Clip,
model_id="openai/clip-vit-base-patch16",
# revision="refs/pr/15",
Expand Down
25 changes: 0 additions & 25 deletions examples/pinecone.py

This file was deleted.

Loading
Loading