diff --git a/aligned/compiler/feature_factory.py b/aligned/compiler/feature_factory.py
index 9eeba11f..e0db18a3 100644
--- a/aligned/compiler/feature_factory.py
+++ b/aligned/compiler/feature_factory.py
@@ -1403,9 +1403,15 @@ def copy_type(self) -> Embedding:
     def dtype(self) -> FeatureType:
         return FeatureType.embedding(self.embedding_size or 0)
 
-    def dot_product(self, embedding: Embedding) -> Float:
+    def dot_product(self, embedding: Embedding, check_embedding_size: bool = True) -> Float:
         from aligned.compiler.transformation_factory import ListDotProduct
 
+        if check_embedding_size:
+            assert self.embedding_size == embedding.embedding_size, (
+                'Expected the same embedding size, but got two different ones. '
+                f"Left: {self.embedding_size}, right: {embedding.embedding_size}"
+            )
+
         feat = Float()
         feat.transformation = ListDotProduct(self, embedding)
         return feat
diff --git a/aligned/compiler/model.py b/aligned/compiler/model.py
index ca780c39..eb1fceca 100644
--- a/aligned/compiler/model.py
+++ b/aligned/compiler/model.py
@@ -27,7 +27,6 @@
     FeatureViewWrapper,
 )
 from aligned.exposed_model.interface import ExposedModel
-from aligned.request.retrival_request import RetrivalRequest
 from aligned.retrival_job import ConvertableToRetrivalJob, PredictionJob, RetrivalJob
 from aligned.schemas.derivied_feature import DerivedFeature
 from aligned.schemas.feature import Feature, FeatureLocation, FeatureReference, FeatureType, StaticFeatureTags
@@ -180,24 +179,11 @@ def predict_over(
         values: ConvertableToRetrivalJob | RetrivalJob,
         needed_views: list[FeatureViewWrapper | ModelContractWrapper] | None = None,
     ) -> PredictionJob:
-        from aligned.retrival_job import RetrivalJob
-
         model = self.compile()
 
         if not model.exposed_model:
             raise ValueError(f"Model {model.name} does not have an `exposed_model` to use for predictions.")
 
-        if not isinstance(values, RetrivalJob):
-            features = {feat.as_feature() for feat in model.features.default_features}
-            request = RetrivalRequest(
-                name='default',
-                location=FeatureLocation.model(model.name),
-                entities=set(),
-                features=features,
-                derived_features=set(),
-            )
-            values = RetrivalJob.from_convertable(values, request)
-
         return self.query(needed_views).predict_over(values)
 
     def as_view(self) -> CompiledFeatureView | None:
diff --git a/aligned/exposed_model/interface.py b/aligned/exposed_model/interface.py
index 4d828785..b91189cb 100644
--- a/aligned/exposed_model/interface.py
+++ b/aligned/exposed_model/interface.py
@@ -434,7 +434,39 @@ async def function_wrapper(values: RetrivalJob, store: ModelFeatureStore) -> pl.DataFrame:
     return DillFunction(function=dill.dumps(function_wrapper))
 
 
-def openai_embedding(model: str, prompt_template: str | None = None) -> ExposedModel:
+def openai_embedding(
+    model: str, batch_on_n_chunks: int | None = 100, prompt_template: str | None = None
+) -> ExposedModel:
+    """
+    Returns an OpenAI embedding model.
+
+    ```python
+    @model_contract(
+        input_features=[MyFeature().name],
+        exposed_model=openai_embedding("text-embedding-3-small"),
+    )
+    class MyEmbedding:
+        my_entity = Int32().as_entity()
+        name = String()
+        embedding = Embedding(1536)
+        predicted_at = EventTimestamp()
+
+    embeddings = await store.model(MyEmbedding).predict_over({
+        "my_entity": [1, 2, 3],
+        "name": ["Hello", "World", "foo"]
+    }).to_polars()
+    ```
+
+
+    Args:
+        model (str): The embedding model to use. See the OpenAI docs for the available models.
+        batch_on_n_chunks (int): The number of chunks at which to switch to OpenAI's batch API, for requests too large to embed in realtime.
+        prompt_template (str): An optional custom prompt template. Defaults to a template built from the input features.
+
+    Returns:
+        ExposedModel: A model that sends embedding requests to OpenAI.
+
+    """
     from aligned.exposed_model.openai import OpenAiEmbeddingPredictor
 
     return OpenAiEmbeddingPredictor(model=model, prompt_template=prompt_template or '')
diff --git a/aligned/exposed_model/openai.py b/aligned/exposed_model/openai.py
index 94b3f130..aeaefdd9 100644
--- a/aligned/exposed_model/openai.py
+++ b/aligned/exposed_model/openai.py
@@ -1,18 +1,187 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
 from dataclasses import dataclass
-import polars as pl
+from datetime import datetime, timezone
+from math import ceil
+from pathlib import Path
+from typing import TYPE_CHECKING
+from uuid import uuid4
 
-from aligned.exposed_model.interface import ExposedModel
+import polars as pl
+from aligned.exposed_model.interface import (
+    ExposedModel,
+    Feature,
+    FeatureReference,
+    RetrivalJob,
+    VersionedModel,
+)
 from aligned.feature_store import ModelFeatureStore
-from aligned.retrival_job import RetrivalJob
-from aligned.schemas.feature import Feature, FeatureReference
 from aligned.schemas.model import Model
 
+if TYPE_CHECKING:
+    from openai import AsyncClient
+
+logger = logging.getLogger(__name__)
+
+
+def write_batch_request(texts: list[str], path: Path, model: str, url: str) -> None:
+    """
+    Creates a .jsonl file for batch processing, with each line being a request to the embeddings API.
+    """
+    with path.open('w') as f:
+        for i, text in enumerate(texts):
+            request = {
+                'custom_id': f"request-{i+1}",
+                'method': 'POST',
+                'url': url,
+                'body': {'model': model, 'input': text},
+            }
+            f.write(json.dumps(request) + '\n')
+
+
+async def chunk_batch_embedding_request(texts: list[str], model: str, client: AsyncClient) -> pl.DataFrame:
+
+    max_batch = 50_000
+    number_of_batches = ceil(len(texts) / max_batch)
+
+    batch_result: pl.DataFrame | None = None
+
+    for i in range(number_of_batches):
+        start = i * max_batch
+        end_batch = min((i + 1) * max_batch, len(texts))
+
+        if start == end_batch:
+            batch_prompts = [texts[start]]
+        else:
+            batch_prompts = texts[start:end_batch]
+
+        result = await make_batch_embedding_request(batch_prompts, model, client)
+
+        if batch_result is None:
+            batch_result = result
+        else:
+            batch_result = batch_result.vstack(result)
+
+    assert batch_result is not None
+    return batch_result
+
+
+async def make_batch_embedding_request(texts: list[str], model: str, client: AsyncClient) -> pl.DataFrame:
+
+    id_path = str(uuid4())
+    batch_file = Path(id_path)
+    output_file = Path(id_path + '-output.jsonl')
+
+    write_batch_request(texts, batch_file, model, '/v1/embeddings')
+    request_file = await client.files.create(file=batch_file, purpose='batch')
+    response = await client.batches.create(
+        input_file_id=request_file.id,
+        endpoint='/v1/embeddings',
+        completion_window='24h',
+        metadata={'description': 'Embedding batch job'},
+    )
+    status_response = await client.batches.retrieve(response.id)
+
+    last_process = None
+    expected_duration_left = 60
+
+    while status_response.status not in ['completed', 'failed']:
+        await asyncio.sleep(expected_duration_left * 0.8)  # Poll at roughly 80% of the expected remaining duration
+        status_response = await client.batches.retrieve(response.id)
+        logger.info(f"Status of batch request {status_response.status}")
+
+        processed_records = 0
+        leftover_records = 0
+
+        if status_response.request_counts:
+            processed_records = status_response.request_counts.completed
+            leftover_records = status_response.request_counts.total - processed_records
+
+        if status_response.in_progress_at:
+            last_process = datetime.fromtimestamp(status_response.in_progress_at, tz=timezone.utc)
+            now = datetime.now(tz=timezone.utc)
+
+            items_per_process = (now - last_process).total_seconds() / max(processed_records, 1)
+            expected_duration_left = max(items_per_process * leftover_records, 60)
+
+    batch_info = await client.batches.retrieve(response.id)
+    output_file_id = batch_info.output_file_id
+
+    if not output_file_id:
+        raise ValueError(f"No output file for request: {response.id}")
+
+    output_content = await client.files.retrieve_content(output_file_id)
+    output_file.write_text(output_content)
+    embeddings = pl.read_ndjson(output_file.as_posix())
+    expanded_emb = (
+        embeddings.unnest('response')
+        .unnest('body')
+        .explode('data')
+        .select(['custom_id', 'data'])
+        .unnest('data')
+        .select(['custom_id', 'embedding'])
+        .with_columns(pl.col('custom_id').str.split('-').list.get(1).alias('index'))
+    )
+    return expanded_emb
+
+
+async def embed_texts(
+    texts: list[str], model: str, skip_if_n_chunks: int | None, client: AsyncClient
+) -> list[list[float]] | str:
+    import tiktoken
+
+    max_token_size = 8192
+    number_of_texts = len(texts)
+
+    chunks: list[int] = []
+    chunk_size = 0
+    encoder = tiktoken.encoding_for_model(model)
+
+    for index, text in enumerate(texts):
+        token_size = len(encoder.encode(text))
+
+        if chunk_size + token_size > max_token_size:
+            chunks.append(index)
+            chunk_size = 0
+
+        if skip_if_n_chunks and len(chunks) + 1 >= skip_if_n_chunks:
+            return f"At text nr {index} the number of chunks went above {skip_if_n_chunks} (had {len(chunks)})"
+
+        chunk_size += token_size
+
+    if not chunks or number_of_texts - 1 > chunks[-1]:
+        chunks.append(number_of_texts - 1)
+
+    embeddings: list[list[float]] = []
+
+    last_chunk_index = 0
+
+    for chunk_index in chunks:
+        if last_chunk_index == 0 and chunk_index >= number_of_texts - 1:
+            chunk_texts = texts
+        elif last_chunk_index == 0:
+            chunk_texts = texts[:chunk_index]
+        elif chunk_index >= number_of_texts - 1:
+            chunk_texts = texts[last_chunk_index:]
+        else:
+            chunk_texts = texts[last_chunk_index:chunk_index]
+
+        res = await client.embeddings.create(input=chunk_texts, model=model)
+        embeddings.extend([emb.embedding for emb in res.data])
+        last_chunk_index = chunk_index
+
+    return embeddings
+
 
 @dataclass
-class OpenAiEmbeddingPredictor(ExposedModel):
+class OpenAiEmbeddingPredictor(ExposedModel, VersionedModel):
     model: str
+    batch_on_n_chunks: int | None = 100
     feature_refs: list[FeatureReference] = None  # type: ignore
     output_name: str = ''
     prompt_template: str = ''
@@ -27,6 +196,24 @@ def prompt_template_hash(self) -> str:
         return sha256(self.prompt_template.encode(), usedforsecurity=False).hexdigest()
 
+    @property
+    def as_markdown(self) -> str:
+        return f"""Sending an `embedding` request to OpenAI's API.
+
+This will use the model: `{self.model}` to generate the embeddings.
+Will switch to the batch API if more than {self.batch_on_n_chunks} chunks are needed to fulfill the request.
+
+And use the prompt template:
+```
+{self.prompt_template}
+```"""
+
+    async def model_version(self) -> str:
+        if len(self.feature_refs) == 1:
+            return self.model
+        else:
+            return f"{self.model}-{self.prompt_template_hash()}"
+
     async def needed_features(self, store: ModelFeatureStore) -> list[FeatureReference]:
         return self.feature_refs
 
@@ -58,24 +245,33 @@ async def run_polars(self, values: RetrivalJob, store: ModelFeatureStore) -> pl.DataFrame:
         missing_cols = expected_cols - set(values.loaded_columns)
 
         if missing_cols:
+            logger.info(f"Missing cols: {missing_cols}")
             df = await store.store.features_for(values, features=self.feature_refs).to_polars()
         else:
             df = await values.to_polars()
 
         if len(expected_cols) == 1:
-            prompts = df[self.feature_refs[0].name].to_list()
+            texts = df[self.feature_refs[0].name].to_list()
         else:
-            prompts: list[str] = []
+            texts: list[str] = []
             for row in df.to_dicts():
-                prompts.append(self.prompt_template.format(**row))
-
-        embeddings = await client.embeddings.create(input=prompts, model=self.model)
-        return df.hstack(
-            [
-                pl.Series(
-                    name=self.output_name,
-                    values=[emb.embedding for emb in embeddings.data],
-                    dtype=pl.List(pl.Float32),
-                )
-            ]
+                texts.append(self.prompt_template.format(**row))
+
+        realtime_emb = await embed_texts(
+            texts, model=self.model, skip_if_n_chunks=self.batch_on_n_chunks, client=client
         )
+
+        if isinstance(realtime_emb, list):
+            return df.hstack(
+                [
+                    pl.Series(
+                        name=self.output_name,
+                        values=realtime_emb,
+                        dtype=pl.List(pl.Float32),
+                    )
+                ]
+            )
+
+        batch_result = await chunk_batch_embedding_request(texts, self.model, client)
+
+        return df.hstack([batch_result['embedding'].alias(self.output_name)])
diff --git a/aligned/exposed_model/tests/test_model.py b/aligned/exposed_model/tests/test_model.py
index 03c1a8cb..094773df 100644
--- a/aligned/exposed_model/tests/test_model.py
+++ b/aligned/exposed_model/tests/test_model.py
@@ -94,7 +94,7 @@ class MyModelContract2:
     entities = {'entity_id': ['a', 'b'], 'x': [1, 2]}
     pred_job = MyModelContract2.predict_over(entities, needed_views=[InputFeatureView, MyModelContract])
 
-    assert set(pred_job.request_result.feature_columns) == {'x', 'prediction', 'other_pred'}
+    assert set(pred_job.request_result.all_returned_columns) == {'x', 'entity_id', 'prediction', 'other_pred'}
 
     preds = await pred_job.to_polars()
     assert preds['other_pred'].to_list() == [6, 12]
@@ -226,3 +226,13 @@ class MyModelContract2:
     )
     assert preds['other_pred'].null_count() == 0
     assert not first_preds['model_version'].series_equal(preds['model_version'])
+
+    preds = (
+        await without_cache.model(MyModelContract2)
+        .predict_over(without_cache.feature_view(InputFeatureView).all())
+        .to_polars()
+    )
+    input_features = InputFeatureView.query().request.all_returned_columns
+    assert set(input_features) - set(preds.columns) == set(), 'Missing some columns'
+    assert preds['other_pred'].null_count() == 0
+    assert not first_preds['model_version'].series_equal(preds['model_version'])
diff --git a/aligned/feature_store.py b/aligned/feature_store.py
index ca49eea2..134738f9 100644
--- a/aligned/feature_store.py
+++ b/aligned/feature_store.py
@@ -446,6 +446,9 @@ def features_for(
 
         if not isinstance(entities, RetrivalJob):
             entities = RetrivalJob.from_convertable(entities, requests)
+            feature_names.update(entities.loaded_columns)
+        else:
+            feature_names.update(entities.request_result.all_returned_columns)
 
         existing_features = set(entities.loaded_columns)
 
@@ -1056,10 +1059,12 @@ def predict_over(
             'This can be done by setting the `exposed_at` value'
         )
 
+        returned_request = self.request().needed_requests
+
         if not isinstance(entities, RetrivalJob):
-            entities = RetrivalJob.from_convertable(entities, self.request().needed_requests)
+            entities = RetrivalJob.from_convertable(entities, returned_request)
 
-        return PredictionJob(entities, self.model, self.store)
+        return PredictionJob(entities, self.model, self.store, returned_request)
 
     def features_for(
         self, entities: ConvertableToRetrivalJob | RetrivalJob, event_timestamp_column: str | None = None
diff --git a/aligned/feature_view/feature_view.py b/aligned/feature_view/feature_view.py
index a1b8cb63..6d3370ff 100644
--- a/aligned/feature_view/feature_view.py
+++ b/aligned/feature_view/feature_view.py
@@ -311,6 +311,8 @@ def with_schema(
         compiled = self.compile()
 
         for agg_feature in compiled.aggregated_features:
+            if agg_feature.name.isdigit():
+                continue
             org_feature: FeatureFactory = getattr(view, agg_feature.derived_feature.name)
             feature = org_feature.copy_type()
             feature.transformation = None
@@ -322,6 +324,8 @@ def with_schema(
             setattr(view, agg_feature.derived_feature.name, feature)
 
         for derived_feature in compiled.derived_features:
+            if derived_feature.name.isdigit():
+                continue
             org_feature: FeatureFactory = getattr(view, derived_feature.name)
             feature = org_feature.copy_type()
             feature.transformation = None
diff --git a/aligned/request/retrival_request.py b/aligned/request/retrival_request.py
index 2c2b9c42..95ad9edb 100644
--- a/aligned/request/retrival_request.py
+++ b/aligned/request/retrival_request.py
@@ -494,6 +494,12 @@ def needs_event_timestamp(self) -> bool:
     def request_result(self) -> RequestResult:
         return RequestResult.from_request_list(self.needed_requests).filter_features(self.features_to_include)
 
+    def entities(self) -> set[Feature]:
+        features = set()
+        for req in self.needed_requests:
+            features.update(req.entities)
+        return features
+
     def without_event_timestamp(self, name_sufix: str | None = None) -> 'FeatureRequest':
         return FeatureRequest(
             location=self.location,
diff --git a/aligned/retrival_job.py b/aligned/retrival_job.py
index 064d7308..b0937a6b 100644
--- a/aligned/retrival_job.py
+++ b/aligned/retrival_job.py
@@ -21,7 +21,7 @@
 from aligned.exceptions import UnableToFindFileException
 from aligned.request.retrival_request import FeatureRequest, RequestResult, RetrivalRequest
-from aligned.schemas.feature import Feature, FeatureType
+from aligned.schemas.feature import Feature, FeatureLocation, FeatureType
 from aligned.schemas.derivied_feature import DerivedFeature
 from aligned.schemas.vector_storage import VectorIndex
 from aligned.split_strategy import SupervisedDataSet
@@ -875,19 +875,56 @@ def from_convertable(
         elif isinstance(request, FeatureRequest):
             request = request.needed_requests
 
+        def remove_features(loaded_features: dict[str, pl.DataType]) -> list[RetrivalRequest]:
+            revised_requests: list[RetrivalRequest] = []
+            req_feature_names: list[str] = []
+            for req in request:
+                req_feature_names.extend(req.all_returned_columns)
+
+                revised_requests.append(
+                    RetrivalRequest(
+                        name=req.name,
+                        location=req.location,
+                        entities=req.entities,
+                        features={feat for feat in req.features if feat.name in loaded_features},
+                        derived_features=req.derived_features,
+                        event_timestamp_request=req.event_timestamp_request,
+                    )
+                )
+
+            additional_features = {
+                Feature(feat, FeatureType.from_polars(dtype))
+                for feat, dtype in loaded_features.items()
+                if feat not in req_feature_names
+            }
+            if additional_features:
+                revised_requests.append(
+                    RetrivalRequest(
+                        name='additional',
+                        location=FeatureLocation.feature_view('additional'),
+                        entities=set(),
+                        features=additional_features,
+                        derived_features=set(),
+                    )
+                )
+
+            return revised_requests
+
         if isinstance(data, dict):
-            return LiteralRetrivalJob(pl.DataFrame(data).lazy(), request)
+            df = pl.DataFrame(data).lazy()
         elif isinstance(data, list):
-            return LiteralRetrivalJob(pl.DataFrame(data).lazy(), request)
+            df = pl.DataFrame(data).lazy()
         elif isinstance(data, pl.DataFrame):
-            return LiteralRetrivalJob(data.lazy(), request)
+            df = data.lazy()
         elif isinstance(data, pl.LazyFrame):
-            return LiteralRetrivalJob(data, request)
+            df = data
         elif isinstance(data, pd.DataFrame):
-            return LiteralRetrivalJob(pl.from_pandas(data).lazy(), request)
+            df = pl.from_pandas(data).lazy()
         else:
             raise ValueError(f'Unable to convert {type(data)} to RetrivalJob')
 
+        return LiteralRetrivalJob(df, remove_features(df.schema))
+
     async def write_to_source(self, source: WritableFeatureSource | DataFileReference) -> None:
         """
         Writes the output of the retrival job to the passed source.
@@ -2550,6 +2587,7 @@ class PredictionJob(RetrivalJob):
     job: RetrivalJob
     model: Model
     store: ContractStore
+    output_requests: list[RetrivalRequest]
 
     def added_features(self) -> set[Feature]:
         pred_view = self.model.predictions_view
@@ -2558,17 +2596,12 @@ def added_features(self) -> set[Feature]:
 
     @property
     def request_result(self) -> RequestResult:
-        result = self.job.request_result
-
-        return RequestResult(
-            entities=result.entities,
-            features=result.features.union(self.added_features()),
-            event_timestamp=result.event_timestamp,
-        )
+        reqs = self.retrival_requests
+        return RequestResult.from_request_list(reqs)
 
     @property
     def retrival_requests(self) -> list[RetrivalRequest]:
-        return self.job.retrival_requests
+        return self.output_requests + [self.model.predictions_view.request(self.model.name)]
 
     def describe(self) -> str:
         added = self.added_features()
@@ -2612,10 +2645,10 @@ async def to_lazy_polars(self) -> pl.LazyFrame:
         return df.lazy()
 
     def log_each_job(self, logger_func: Callable[[object], None] | None = None) -> RetrivalJob:
-        return PredictionJob(self.job.log_each_job(logger_func), self.model, self.store)
+        return PredictionJob(self.job.log_each_job(logger_func), self.model, self.store, self.output_requests)
 
     def filter(self, condition: str | Feature | DerivedFeature | pl.Expr) -> RetrivalJob:
-        return PredictionJob(self.job.filter(condition), self.model, self.store)
+        return PredictionJob(self.job.filter(condition), self.model, self.store, self.output_requests)
 
     def remove_derived_features(self) -> RetrivalJob:
         return self.job.remove_derived_features()
diff --git a/aligned/sources/in_mem_source.py b/aligned/sources/in_mem_source.py
index 72c7ec79..61eb3556 100644
--- a/aligned/sources/in_mem_source.py
+++ b/aligned/sources/in_mem_source.py
@@ -57,8 +57,6 @@ def nearest_n_to(
     ) -> RetrivalJob:
         from aligned.retrival_job import RetrivalJob
 
-        print(request.features_to_include)
-
         async def load() -> pl.LazyFrame:
             def first_embedding(features: set[Feature]) -> Feature | None:
                 for feature in features:
diff --git a/poetry.lock b/poetry.lock
index 86c89115..f98aaaee 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -596,7 +596,7 @@ pycparser = "*"
 name = "charset-normalizer"
 version = "3.4.0"
 description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
-optional = true
+optional = false
 python-versions = ">=3.7.0"
 files = [
     {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"},
@@ -3823,7 +3823,7 @@ ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"
 name = "regex"
 version = "2024.9.11"
 description = "Alternative regular expression module, to replace re."
-optional = true
+optional = false
 python-versions = ">=3.8"
 files = [
     {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1494fa8725c285a81d01dc8c06b55287a1ee5e0e382d8413adc0a9197aac6408"},
@@ -3926,7 +3926,7 @@ files = [
 name = "requests"
 version = "2.32.3"
 description = "Python HTTP for Humans."
-optional = true
+optional = false
 python-versions = ">=3.8"
 files = [
     {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
@@ -4460,6 +4460,53 @@ files = [
     {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"},
 ]
 
+[[package]]
+name = "tiktoken"
+version = "0.8.0"
+description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "tiktoken-0.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b07e33283463089c81ef1467180e3e00ab00d46c2c4bbcef0acab5f771d6695e"},
+    {file = "tiktoken-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9269348cb650726f44dd3bbb3f9110ac19a8dcc8f54949ad3ef652ca22a38e21"},
+    {file = "tiktoken-0.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e13f37bc4ef2d012731e93e0fef21dc3b7aea5bb9009618de9a4026844e560"},
+    {file = "tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f13d13c981511331eac0d01a59b5df7c0d4060a8be1e378672822213da51e0a2"},
+    {file = "tiktoken-0.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6b2ddbc79a22621ce8b1166afa9f9a888a664a579350dc7c09346a3b5de837d9"},
+    {file = "tiktoken-0.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d8c2d0e5ba6453a290b86cd65fc51fedf247e1ba170191715b049dac1f628005"},
+    {file = "tiktoken-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d622d8011e6d6f239297efa42a2657043aaed06c4f68833550cac9e9bc723ef1"},
+    {file = "tiktoken-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2efaf6199717b4485031b4d6edb94075e4d79177a172f38dd934d911b588d54a"},
+    {file = "tiktoken-0.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5637e425ce1fc49cf716d88df3092048359a4b3bbb7da762840426e937ada06d"},
+    {file = "tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fb0e352d1dbe15aba082883058b3cce9e48d33101bdaac1eccf66424feb5b47"},
+    {file = "tiktoken-0.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:56edfefe896c8f10aba372ab5706b9e3558e78db39dd497c940b47bf228bc419"},
+    {file = "tiktoken-0.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:326624128590def898775b722ccc327e90b073714227175ea8febbc920ac0a99"},
+    {file = "tiktoken-0.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:881839cfeae051b3628d9823b2e56b5cc93a9e2efb435f4cf15f17dc45f21586"},
+    {file = "tiktoken-0.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fe9399bdc3f29d428f16a2f86c3c8ec20be3eac5f53693ce4980371c3245729b"},
+    {file = "tiktoken-0.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a58deb7075d5b69237a3ff4bb51a726670419db6ea62bdcd8bd80c78497d7ab"},
+    {file = "tiktoken-0.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2908c0d043a7d03ebd80347266b0e58440bdef5564f84f4d29fb235b5df3b04"},
+    {file = "tiktoken-0.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:294440d21a2a51e12d4238e68a5972095534fe9878be57d905c476017bff99fc"},
"sha256:d8f3192733ac4d77977432947d563d7e1b310b96497acd3c196c9bddb36ed9db"}, + {file = "tiktoken-0.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:02be1666096aff7da6cbd7cdaa8e7917bfed3467cd64b38b1f112e96d3b06a24"}, + {file = "tiktoken-0.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c94ff53c5c74b535b2cbf431d907fc13c678bbd009ee633a2aca269a04389f9a"}, + {file = "tiktoken-0.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b231f5e8982c245ee3065cd84a4712d64692348bc609d84467c57b4b72dcbc5"}, + {file = "tiktoken-0.8.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4177faa809bd55f699e88c96d9bb4635d22e3f59d635ba6fd9ffedf7150b9953"}, + {file = "tiktoken-0.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5376b6f8dc4753cd81ead935c5f518fa0fbe7e133d9e25f648d8c4dabdd4bad7"}, + {file = "tiktoken-0.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:18228d624807d66c87acd8f25fc135665617cab220671eb65b50f5d70fa51f69"}, + {file = "tiktoken-0.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e17807445f0cf1f25771c9d86496bd8b5c376f7419912519699f3cc4dc5c12e"}, + {file = "tiktoken-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:886f80bd339578bbdba6ed6d0567a0d5c6cfe198d9e587ba6c447654c65b8edc"}, + {file = "tiktoken-0.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6adc8323016d7758d6de7313527f755b0fc6c72985b7d9291be5d96d73ecd1e1"}, + {file = "tiktoken-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b591fb2b30d6a72121a80be24ec7a0e9eb51c5500ddc7e4c2496516dd5e3816b"}, + {file = "tiktoken-0.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:845287b9798e476b4d762c3ebda5102be87ca26e5d2c9854002825d60cdb815d"}, + {file = "tiktoken-0.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:1473cfe584252dc3fa62adceb5b1c763c1874e04511b197da4e6de51d6ce5a02"}, + {file = "tiktoken-0.8.0.tar.gz", hash = "sha256:9ccbb2740f24542534369c5635cfd9b2b3c2490754a78ac8831d99f89f94eeb2"}, +] + +[package.dependencies] +regex = ">=2022.1.18" +requests = ">=2.26.0" + +[package.extras] +blobfile = ["blobfile (>=2)"] + [[package]] name = "tokenizers" version = "0.20.1" @@ -4865,7 +4912,7 @@ files = [ name = "urllib3" version = "2.2.3" description = "HTTP library with thread-safe connection pooling, file post, and more." -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, @@ -5144,4 +5191,4 @@ sql = ["sqlglot"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "81ae0f59ce7dd0be8236108c7aea47798bc7d0643041c96c125839bbafb33f7d" +content-hash = "7dcda5c8a866ba7b95c492beee6794219c74295f45c7b4b82c6dfdc10492bb26" diff --git a/pyproject.toml b/pyproject.toml index 79399d24..045396d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "aligned" -version = "0.0.110" +version = "0.0.111" description = "A data managment and lineage tool for ML applications." authors = ["Mats E. 
Mollestad "] license = "Apache-2.0" @@ -74,6 +74,7 @@ adlfs = { version = "^2024.4.1", optional = true } lancedb = { version = "^0.8.2", optional = true } deltalake = { version = "^0.18.1", optional = true } sentence-transformers = { version = "^3.2.0", optional = true } +tiktoken = { version = "^0.8.0", optional = true } [tool.poetry.extras] aws = ["aioaws", "connectorx"] @@ -91,7 +92,7 @@ deltalake = ["deltalake"] lancedb = ["lancedb"] pandas = ["pandas"] sentence-transformers = ["sentence-transformers"] -openai = ["openai"] +openai = ["openai", "tiktoken"] [tool.poetry.group.dev.dependencies] types-redis = "^4.2.6"