From 15ef90d097c4af364747e8337bd58fd8db90c4a5 Mon Sep 17 00:00:00 2001 From: "Mats E. Mollestad" Date: Tue, 12 Mar 2024 19:37:42 +0100 Subject: [PATCH] Filling missing columns in more sources --- aligned/data_source/batch_data_source.py | 12 ++- aligned/local/job.py | 28 ------ aligned/retrival_job.py | 58 +++++++---- aligned/schemas/feature_view.py | 15 ++- aligned/sources/tests/test_parquet.py | 40 +++++++- conftest.py | 3 - pyproject.toml | 2 +- test_data/credit_history.csv | 14 +-- test_data/credit_history_mater.parquet | Bin 980 -> 976 bytes test_data/feature-store.json | 2 +- test_data/loan.csv | 14 +-- test_data/test_model.csv | 8 +- test_data/test_model.parquet | Bin 586 -> 590 bytes test_data/titanic-sets.json | 2 +- test_data/titanic-test.csv | 42 ++++---- test_data/titanic-train.csv | 122 +++++++++++------------ test_data/titanic-validate.csv | 42 ++++---- 17 files changed, 217 insertions(+), 187 deletions(-) diff --git a/aligned/data_source/batch_data_source.py b/aligned/data_source/batch_data_source.py index 634609c3..b303a02c 100644 --- a/aligned/data_source/batch_data_source.py +++ b/aligned/data_source/batch_data_source.py @@ -301,7 +301,7 @@ def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: return CustomLazyPolarsJob( request=request, method=lambda: dill.loads(self.all_data_method)(request, limit) - ) + ).fill_missing_columns() def all_between_dates( self, request: RetrivalRequest, start_date: datetime, end_date: datetime @@ -312,7 +312,7 @@ def all_between_dates( return CustomLazyPolarsJob( request=request, method=lambda: dill.loads(self.all_between_dates_method)(request, start_date, end_date), - ) + ).fill_missing_columns() def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> RetrivalJob: from aligned.retrival_job import CustomLazyPolarsJob @@ -320,7 +320,7 @@ def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> Retrival return CustomLazyPolarsJob( request=request, method=lambda: dill.loads(self.features_for_method)(facts, request) - ) + ).fill_missing_columns() @classmethod def multi_source_features_for( @@ -619,6 +619,7 @@ def all_with_limit(self, limit: int | None) -> RetrivalJob: right_on=self.right_on, timestamp_unit=self.timestamp_unit, ) + .fill_missing_columns() ) def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: @@ -639,6 +640,7 @@ def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: timestamp_unit=self.timestamp_unit, ) .aggregate(request) + .fill_missing_columns() .derive_features([request]) ) @@ -661,6 +663,7 @@ def all_between_dates( right_on=self.right_on, ) .aggregate(request) + .fill_missing_columns() .derive_features([request]) ) @@ -729,6 +732,7 @@ def all_with_limit(self, limit: int | None) -> RetrivalJob: self.source.all_data(self.left_request, limit=limit) .derive_features([self.left_request]) .join(right_job, method=self.method, left_on=self.left_on, right_on=self.right_on) + .fill_missing_columns() ) def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: @@ -741,6 +745,7 @@ def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: self.source.all_data(self.left_request, limit=limit) .derive_features([self.left_request]) .join(right_job, method=self.method, left_on=self.left_on, right_on=self.right_on) + .fill_missing_columns() .aggregate(request) .derive_features([request]) ) @@ -763,6 +768,7 @@ def all_between_dates( left_on=self.left_on, right_on=self.right_on, ) + .fill_missing_columns() .aggregate(request) .derive_features([request]) ) diff --git a/aligned/local/job.py b/aligned/local/job.py index 0ea8d3b6..e8c854a4 100644 --- a/aligned/local/job.py +++ b/aligned/local/job.py @@ -240,34 +240,6 @@ def request_result(self) -> RequestResult: def retrival_requests(self) -> list[RetrivalRequest]: return [self.request] - def file_transformations(self, df: pd.DataFrame) -> pd.DataFrame: - from aligned.data_source.batch_data_source import ColumnFeatureMappable - - entity_names = self.request.entity_names - all_names = list(self.request.all_required_feature_names.union(entity_names)) - - request_features = all_names - if isinstance(self.source, ColumnFeatureMappable): - request_features = self.source.feature_identifier_for(all_names) - - df.rename( - columns=dict(zip(request_features, all_names)), - inplace=True, - ) - - if self.request.event_timestamp is None: - raise ValueError(f'Source {self.source} have no event timestamp to filter on') - - event_timestamp_column = self.request.event_timestamp.name - # Making sure it is in the correct format - df[event_timestamp_column] = pd.to_datetime( - df[event_timestamp_column], infer_datetime_format=True, utc=True - ) - - start_date_ts = pd.to_datetime(self.start_date, utc=True) - end_date_ts = pd.to_datetime(self.end_date, utc=True) - return df.loc[df[event_timestamp_column].between(start_date_ts, end_date_ts)] - def file_transform_polars(self, df: pl.LazyFrame) -> pl.LazyFrame: from aligned.data_source.batch_data_source import ColumnFeatureMappable diff --git a/aligned/retrival_job.py b/aligned/retrival_job.py index 7da924c9..43a4da60 100644 --- a/aligned/retrival_job.py +++ b/aligned/retrival_job.py @@ -285,6 +285,17 @@ async def to_pandas(self) -> SupervisedDataSet[pd.DataFrame]: data, entities, features, self.target_columns, self.job.request_result.event_timestamp ) + async def to_polars(self) -> SupervisedDataSet[pl.DataFrame]: + dataset = await self.to_lazy_polars() + + return SupervisedDataSet( + data=dataset.data.collect(), + entity_columns=dataset.entity_columns, + features=dataset.feature_columns, + target=dataset.target_columns, + event_timestamp_column=dataset.event_timestamp_column, + ) + async def to_lazy_polars(self) -> SupervisedDataSet[pl.LazyFrame]: data = await self.job.to_lazy_polars() if self.should_filter_out_null_targets: @@ -760,7 +771,7 @@ def copy_with(self: JobType, job: RetrivalJob) -> JobType: class CustomPolarsJob(RetrivalJob, ModificationJob): job: RetrivalJob - polars_method: Callable[[pl.LazyFrame], pl.LazyFrame] + polars_method: Callable[[pl.LazyFrame], pl.LazyFrame] # type: ignore async def to_lazy_polars(self) -> pl.LazyFrame: df = await self.job.to_lazy_polars() @@ -1389,38 +1400,41 @@ class FillMissingColumnsJob(RetrivalJob, ModificationJob): job: RetrivalJob async def to_pandas(self) -> pd.DataFrame: + from aligned.schemas.constraints import Optional + data = await self.job.to_pandas() for request in self.retrival_requests: - missing = request.all_required_feature_names - set(data.columns) - if not missing: - continue + optional_constraint = Optional() + for feature in request.features: + if ( + feature.constraints + and optional_constraint in feature.constraints + and feature.name not in data.columns + ): + data[feature] = None - logger.warn( - f""" -Some features is missing. -Will fill values with None, but it could be a potential problem: {missing} -""" - ) - for feature in missing: - data[feature] = None return data async def to_lazy_polars(self) -> pl.LazyFrame: + from aligned.schemas.constraints import Optional + data = await self.job.to_lazy_polars() + optional_constraint = Optional() + for request in self.retrival_requests: - missing = request.all_required_feature_names - set(data.columns) - if not missing: - continue + missing_columns = [ + feature.name + for feature in request.features + if feature.constraints + and optional_constraint in feature.constraints + and feature.name not in data.columns + ] + + if missing_columns: + data = data.with_columns([pl.lit(None).alias(feature) for feature in missing_columns]) - logger.warn( - f""" -Some features is missing. -Will fill values with None, but it could be a potential problem: {missing} -""" - ) - data = data.with_columns([pl.lit(None).alias(feature) for feature in missing]) return data diff --git a/aligned/schemas/feature_view.py b/aligned/schemas/feature_view.py index 31cb9d71..5b94d6d4 100644 --- a/aligned/schemas/feature_view.py +++ b/aligned/schemas/feature_view.py @@ -373,7 +373,12 @@ def multi_source_features_for( sub_source = source.view.materialized_source or source.view.source sub_request = source.sub_request(request) - sub_job = sub_source.all_data(sub_request, limit=None).derive_features() + sub_job = ( + sub_source.all_data(sub_request, limit=None) + .derive_features() + .with_request([request]) + .fill_missing_columns() + ) if request.aggregated_features: available_features = sub_job.aggregate(request).derive_features() @@ -388,10 +393,11 @@ def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: sub_req = self.sub_request(request) core_job = sub_source.all_data(sub_req, limit=limit) + if request.aggregated_features: job = core_job.aggregate(request) else: - job = core_job.derive_features() + job = core_job.derive_features().with_request([request]).fill_missing_columns() return job.derive_features([request]).rename(self.renames) @@ -402,11 +408,12 @@ def all_between_dates( sub_req = self.sub_request(request) - core_job = sub_source.all_between_dates(sub_req, start_date, end_date) + core_job = sub_source.all_between_dates(sub_req, start_date, end_date).fill_missing_columns() + if request.aggregated_features: job = core_job.aggregate(request) else: - job = core_job.derive_features() + job = core_job.derive_features().with_request([request]).fill_missing_columns() return job.derive_features([request]).rename(self.renames) def depends_on(self) -> set[FeatureLocation]: diff --git a/aligned/sources/tests/test_parquet.py b/aligned/sources/tests/test_parquet.py index 1e4d1314..7d999d42 100644 --- a/aligned/sources/tests/test_parquet.py +++ b/aligned/sources/tests/test_parquet.py @@ -126,7 +126,7 @@ async def test_read_csv(point_in_time_data_test: DataTest) -> None: batch_source=file_source, ) compiled = view.compile_instance() - assert compiled.source.path == file_source.path # type: ignore + assert compiled.source.path == file_source.path store.add_compiled_view(compiled) @@ -170,9 +170,43 @@ class Test: filled = b.fill_na(0) expected_df = df.with_columns(pl.lit(None).alias('b'), pl.lit(0).alias('filled')) - loaded = await Test.query().all().to_polars() + loaded = await Test.query().all().to_polars() # type: ignore assert loaded.equals(expected_df.select(loaded.columns)) - facts = await Test.query().features_for({'a': [2]}).to_polars() + facts = await Test.query().features_for({'a': [2]}).to_polars() # type: ignore + assert expected_df.filter(pl.col('a') == 2).equals(facts.select(expected_df.columns)) + + +@pytest.mark.asyncio +async def test_read_optional_view() -> None: + + source = FileSource.csv_at('test_data/optional_test.csv') + df = pl.DataFrame( + { + 'a': [1, 2, 3], + 'c': [1, 2, 3], + } + ) + await source.write_polars(df.lazy()) + + @feature_view(name='test_a', source=source) + class TestA: + a = Int32().as_entity() + c = Int32() + + @feature_view(name='test', source=TestA) # type: ignore + class Test: + a = Int32().as_entity() + b = Int32().is_optional() + c = Int32() + + filled = b.fill_na(0) + + expected_df = df.with_columns(pl.lit(None).alias('b'), pl.lit(0).alias('filled')) + loaded = await Test.query().all().to_polars() # type: ignore + + assert loaded.equals(expected_df.select(loaded.columns)) + + facts = await Test.query().features_for({'a': [2]}).to_polars() # type: ignore assert expected_df.filter(pl.col('a') == 2).equals(facts.select(expected_df.columns)) diff --git a/conftest.py b/conftest.py index 7b22d343..fb7cc72c 100644 --- a/conftest.py +++ b/conftest.py @@ -159,7 +159,6 @@ class BreastDiagnoseFeatureView(FeatureView): metadata = FeatureViewMetadata( name='breast_features', description='Features defining a scan and diagnose of potential cancer cells', - tags={}, source=scan_without_datetime, ) @@ -231,7 +230,6 @@ class BreastDiagnoseFeatureView(FeatureView): metadata = FeatureViewMetadata( name='breast_features', description='Features defining a scan and diagnose of potential cancer cells', - tags={}, source=scan_with_datetime, ) @@ -292,7 +290,6 @@ class BreastDiagnoseFeatureView(FeatureView): metadata = FeatureViewMetadata( name='breast_features', description='Features defining a scan and diagnose of potential cancer cells', - tags={}, source=scan_with_datetime, ) diff --git a/pyproject.toml b/pyproject.toml index db2147cb..536fa0a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "aligned" -version = "0.0.78" +version = "0.0.79" description = "A data managment and lineage tool for ML applications." authors = ["Mats E. Mollestad "] license = "Apache-2.0" diff --git a/test_data/credit_history.csv b/test_data/credit_history.csv index d6ce930a..81ff56ad 100644 --- a/test_data/credit_history.csv +++ b/test_data/credit_history.csv @@ -1,7 +1,7 @@ -credit_card_due,dob_ssn,due_sum,bankruptcies,student_loan_due,event_timestamp -8419,19530219_5179,30747,0,22328,1587924064746575 -2944,19520816_8737,5459,0,2515,1587924064746575 -833,19860413_2537,33833,0,33000,1587924064746575 -5936,19530219_5179,54891,0,48955,1588010464746575 -1575,19520816_8737,11076,0,9501,1588010464746575 -6263,19860413_2537,41773,0,35510,1588010464746575 +bankruptcies,event_timestamp,student_loan_due,credit_card_due,dob_ssn,due_sum +0,1587924064746575,22328,8419,19530219_5179,30747 +0,1587924064746575,2515,2944,19520816_8737,5459 +0,1587924064746575,33000,833,19860413_2537,33833 +0,1588010464746575,48955,5936,19530219_5179,54891 +0,1588010464746575,9501,1575,19520816_8737,11076 +0,1588010464746575,35510,6263,19860413_2537,41773 diff --git a/test_data/credit_history_mater.parquet b/test_data/credit_history_mater.parquet index 6bf6054f3713d28bde8c7e89b9e4ca9f3ed858e8..5bd897b355c89015675c2617c1e8e2ef9840d266 100644 GIT binary patch delta 568 zcmcb@eu3R2z%j^Blugt`)FcK-F@Q0PD1#sa12ek>0|O%?$Bx|&jO;oL%QreO06_!i z8HdR~7?rsa)Hpz@vqWPia!ILJm>HND8pj)%8k=(pDwrA>SQwhcTbPRqsu&v>8Cu4h z8k$>9p35k~(!r>Z7n5ibh$+e<>Epptl%HQB%ElndrOGt_b>FeuT6oYhmf(R#&prex;kZ^T$40d!31kpi`j$m`Vz7tqUGLY|-on+*dmTc@-3N^1BD4368pd(NSkAY?&1A%t9J32->0u2rUSrcjO ul@sNjV-R8Fmyw(slosU?mJ?y@XP6Y|Xv-kM7LcElSX9i=&&0q0N-hA9=ZC5Q delta 510 zcmcb>eubSYz%j^Blua~CG+|<*l$xorfsvtQys4qNC9j~0k%5JwS-ge0u{o!phJ~4d ziJ@`4k*Try#Q72|J&bA`6Zgmq8KggSVC2wYsMzkn00a+qKA8AcgYn$N-3r1DoM*sN z%Qu3h7^-2Z$f#ZkZWj>{peP X?r6&(!4{C8lUP*DFoB7I0Tk%~H2Q+x diff --git a/test_data/feature-store.json b/test_data/feature-store.json index b681fd2a..d4d62f26 100644 --- a/test_data/feature-store.json +++ b/test_data/feature-store.json @@ -1 +1 @@ -{"metadata": {"created_at": "2024-03-11T21:36:35.174367", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic", "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "derived_features": [{"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}, {"name": "titanic_parquet", "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "derived_features": [{"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": []}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": [], "acceptable_freshness": 86400.0, "unacceptable_freshness": 172800.0}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} +{"metadata": {"created_at": "2024-03-12T18:22:07.588714", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic", "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "updated_at", "dtype": {"name": "datetime"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}], "derived_features": [{"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}, {"name": "titanic_parquet", "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "lower_bound_inc", "value": 0.0}]}], "derived_features": [{"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": []}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": [], "acceptable_freshness": 86400.0, "unacceptable_freshness": 172800.0}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} diff --git a/test_data/loan.csv b/test_data/loan.csv index 7198b116..ee17382a 100644 --- a/test_data/loan.csv +++ b/test_data/loan.csv @@ -1,7 +1,7 @@ -loan_amount,loan_id,event_timestamp,personal_income,loan_status -35000,10000,1587924064746575,59000,True -1000,10001,1587924064746575,9600,False -5500,10002,1587924064746575,9600,True -35000,10000,1588010464746575,65500,True -35000,10001,1588010464746575,54400,True -2500,10002,1588010464746575,9900,True +loan_id,loan_amount,personal_income,loan_status,event_timestamp +10000,35000,59000,True,1587924064746575 +10001,1000,9600,False,1587924064746575 +10002,5500,9600,True,1587924064746575 +10000,35000,65500,True,1588010464746575 +10001,35000,54400,True,1588010464746575 +10002,2500,9900,True,1588010464746575 diff --git a/test_data/test_model.csv b/test_data/test_model.csv index decf3268..9bfa2bd0 100644 --- a/test_data/test_model.csv +++ b/test_data/test_model.csv @@ -1,7 +1,7 @@ -a,some_id -10,1 -14,2 -20,3 +some_id,a +1,10 +2,14 +3,20 1,1 2,2 3,3 diff --git a/test_data/test_model.parquet b/test_data/test_model.parquet index f63de52f4ee400e6ca841ace96471c1df9e7c33c..aa314b5337eb24f70b40bf557dc5f1cae818f945 100644 GIT binary patch delta 215 zcmX@ba*kz!oG>E;CnEzBC%Xd!3j+i&Ppp!j=q)461(M-ImJyj~C}lr~kwH>Mj6*a- z45%E)n!^ZTin2)Bc(4@Z=a-1GFo?3KGD$FHra%~si40;Kk}{GyY6UVA8$}d27{oR( ziHb0&wJ^y5wW%_&NlGv#iZY3Dh!u!ssLh$o!KlXB!=w)a<`$F98RxURIyyQ!WjId0 k%NQb?4rIIMWIHJ61?1->78NthWMp6na11g80Qj*a$N&HU delta 239 zcmX@da*AbwoG>2)CnJLhC%Xd!3j+jjO{|uh=p!S{1d?G!mSLP|C|y5`kwH>Mj6*a- z45%E)n#BlVin2)Bc(4@Z=a-1GFo?3KGD$EdLTIMU6b3O4Nf}8UwE`KSGF1jPNuZ=C zlNg6sfmntb2ZPuKCQ%UvwI(LGG*Ay*9%!T35hhVF2C)Z>Vv|J}r8qm7^g+PfWU@Kq qe0EPqM@OfEsL6L3LnH%%Y