From ada1578f037c91723d2eef0c10620098680c2cb6 Mon Sep 17 00:00:00 2001 From: "Mats E. Mollestad" Date: Tue, 26 Mar 2024 08:48:57 +0100 Subject: [PATCH] Fixed bug for azure delta source --- aligned/compiler/feature_factory.py | 1 + aligned/local/job.py | 52 +++++++---- aligned/schemas/feature.py | 4 +- test_data/credit_history.csv | 14 +-- test_data/credit_history_mater.parquet | Bin 991 -> 986 bytes test_data/data/csv_iso.csv | 6 +- test_data/data/csv_unix.csv | 6 +- test_data/data/parquet_iso.parquet | Bin 1137 -> 1136 bytes test_data/data/parquet_unix.parquet | Bin 1077 -> 1077 bytes test_data/feature-store.json | 2 +- test_data/loan.csv | 14 +-- test_data/test_model.parquet | Bin 598 -> 598 bytes test_data/titanic-sets.json | 2 +- test_data/titanic-test.csv | 42 ++++----- test_data/titanic-train.csv | 122 ++++++++++++------------- test_data/titanic-validate.csv | 42 ++++----- 16 files changed, 161 insertions(+), 146 deletions(-) diff --git a/aligned/compiler/feature_factory.py b/aligned/compiler/feature_factory.py index 9f7fa163..cc1a9874 100644 --- a/aligned/compiler/feature_factory.py +++ b/aligned/compiler/feature_factory.py @@ -1180,6 +1180,7 @@ def event_timestamp(self) -> EventTimestampFeature: name=self.name, ttl=int(self.ttl.total_seconds()) if self.ttl else None, description=self._description, + dtype=self.dtype, ) diff --git a/aligned/local/job.py b/aligned/local/job.py index 4bd3f8de..a02bedfb 100644 --- a/aligned/local/job.py +++ b/aligned/local/job.py @@ -1,5 +1,6 @@ from dataclasses import dataclass, field +from pytz import timezone from datetime import datetime import pandas as pd @@ -127,37 +128,41 @@ async def aggregate(request: RetrivalRequest, core_data: pl.LazyFrame) -> pl.Laz def decode_timestamps(df: pl.LazyFrame, request: RetrivalRequest, formatter: DateFormatter) -> pl.LazyFrame: - columns: set[tuple[str, str | None]] = set() + decode_columns: dict[str, str | None] = {} + check_timezone_columns: dict[str, str | None] = {} + dtypes = dict(zip(df.columns, df.dtypes)) - for feature in request.all_features: - if ( - feature.dtype.is_datetime - and feature.name in df.columns - and not isinstance(dtypes[feature.name], pl.Datetime) - ): - columns.add((feature.name, feature.dtype.datetime_timezone)) + all_features = request.all_features + + if request.event_timestamp: + all_features.add(request.event_timestamp.as_feature()) - if ( - request.event_timestamp - and request.event_timestamp.name in df.columns - and not isinstance(dtypes[request.event_timestamp.name], pl.Datetime) - ): - columns.add((request.event_timestamp.name, request.event_timestamp.dtype.datetime_timezone)) + for feature in all_features: + if feature.dtype.is_datetime and feature.name in df.columns: - if not columns: - return df + if not isinstance(dtypes[feature.name], pl.Datetime): + decode_columns[feature.name] = feature.dtype.datetime_timezone + else: + check_timezone_columns[feature.name] = feature.dtype.datetime_timezone exprs = [] - for column, time_zone in columns: + for column, time_zone in decode_columns.items(): logger.info(f'Decoding column {column} using {formatter} with timezone {time_zone}') if time_zone is None: - exprs.append(formatter.decode_polars(column).alias(column)) + exprs.append(formatter.decode_polars(column).dt.replace_time_zone(None).alias(column)) else: exprs.append(formatter.decode_polars(column).dt.convert_time_zone(time_zone).alias(column)) + for column, time_zone in check_timezone_columns.items(): + logger.info(f'Checking timezone for column {column} with timezone {time_zone}') + if time_zone is None: + exprs.append(pl.col(column).dt.replace_time_zone(None).alias(column)) + else: + exprs.append(pl.col(column).dt.convert_time_zone(time_zone).alias(column)) + return df.with_columns(exprs) @@ -285,7 +290,16 @@ def file_transform_polars(self, df: pl.LazyFrame) -> pl.LazyFrame: event_timestamp_column = self.request.event_timestamp.name df = decode_timestamps(df, self.request, self.date_formatter) - return df.filter(pl.col(event_timestamp_column).is_between(self.start_date, self.end_date)) + time_zone = self.request.event_timestamp.dtype.datetime_timezone + if time_zone is None: + start_date = self.start_date.replace(tzinfo=None) + end_date = self.end_date.replace(tzinfo=None) + else: + tz = timezone(time_zone) + start_date = tz.localize(self.start_date) + end_date = tz.localize(self.end_date) + + return df.filter(pl.col(event_timestamp_column).is_between(start_date, end_date)) async def to_pandas(self) -> pd.DataFrame: return (await self.to_lazy_polars()).collect().to_pandas() diff --git a/aligned/schemas/feature.py b/aligned/schemas/feature.py index 7ce34670..f6bd0968 100644 --- a/aligned/schemas/feature.py +++ b/aligned/schemas/feature.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Literal from zoneinfo import ZoneInfo @@ -315,7 +315,7 @@ class EventTimestamp(Codable): ttl: int | None = None description: str | None = None tags: dict[str, str] | None = None - dtype: FeatureType = FeatureType.datetime() + dtype: FeatureType = field(default_factory=lambda: FeatureType.datetime()) def __hash__(self) -> int: return hash(self.name) diff --git a/test_data/credit_history.csv b/test_data/credit_history.csv index 321d44ad..093846bd 100644 --- a/test_data/credit_history.csv +++ b/test_data/credit_history.csv @@ -1,7 +1,7 @@ -due_sum,event_timestamp,student_loan_due,credit_card_due,dob_ssn,bankruptcies -30747,2020-04-26 18:01:04.746575+00:00,22328,8419,19530219_5179,0 -5459,2020-04-26 18:01:04.746575+00:00,2515,2944,19520816_8737,0 -33833,2020-04-26 18:01:04.746575+00:00,33000,833,19860413_2537,0 -54891,2020-04-27 18:01:04.746575+00:00,48955,5936,19530219_5179,0 -11076,2020-04-27 18:01:04.746575+00:00,9501,1575,19520816_8737,0 -41773,2020-04-27 18:01:04.746575+00:00,35510,6263,19860413_2537,0 +credit_card_due,due_sum,bankruptcies,event_timestamp,dob_ssn,student_loan_due +8419,30747,0,2020-04-26 18:01:04.746575+00:00,19530219_5179,22328 +2944,5459,0,2020-04-26 18:01:04.746575+00:00,19520816_8737,2515 +833,33833,0,2020-04-26 18:01:04.746575+00:00,19860413_2537,33000 +5936,54891,0,2020-04-27 18:01:04.746575+00:00,19530219_5179,48955 +1575,11076,0,2020-04-27 18:01:04.746575+00:00,19520816_8737,9501 +6263,41773,0,2020-04-27 18:01:04.746575+00:00,19860413_2537,35510 diff --git a/test_data/credit_history_mater.parquet b/test_data/credit_history_mater.parquet index 142bca967e2e40310320859c1475ef8f53e8c4af..cf685d5da85e15261a6c608da5bf156080b26aa5 100644 GIT binary patch delta 523 zcmcc5ev5sA(!~4h+zb`l9Tw=g$0 z=M>biFf%YQG>$hiow!hnrH4^%&&2%-!VR2f92hxt7?y7Y8^f?;x5MPQjH;$B%nXt; zVjQALVnD-ztUZjPJ&aIx3o}Gmltt3VgQX}xzeJRcL6l3CY4Tr2aS;Yl9#tj@_LTgj z_~PO`RR$j!28PL=jM5xpY?3mPHfmXuzcOlYaWIH&V-Xc$P&+aC45Ji3$T^G*Vy9R{ zB^cD6F;AYus3rt-g4iP#Q85OwYfNHnY8+x0ChIeaa{ge_2Z0LD$-Yc3Y@v=o5H@)_ zlYhNyIgsh@R~Z!^RFL9knH3q~l^tOeWEd4@S(OrQ;AkEI)K&>naMu7LcElSX3-& S0}M+DVwlFnz!2aVWC#HDPlSX3 delta 564 zcmcb`exH4U(!>WG+&gwVFo3}FjSiE)GpcfMFo?2=W{DLDcXp$Jv1|Vw(qi7!^l-9_YU1JhsQ{xc3#3TZ8n*@(z zP>_GPRdI4gYHp&$A0~Yes0f_w!{ovi=?DbjlczEH*SiG*nI3)yDd9mmNqIqON$!>z zNja9;DdAyhN#S7`N$Eg&|0J61?1->78Og{0K*r87-lgsFa$UT83F(a Cm616B diff --git a/test_data/data/csv_iso.csv b/test_data/data/csv_iso.csv index a0c92be3..1c26f8b9 100644 --- a/test_data/data/csv_iso.csv +++ b/test_data/data/csv_iso.csv @@ -1,4 +1,4 @@ id,other,et,timestamp -1,foo,2024-03-25T12:03:30.013947+UTC,2024-03-25T12:03:30.013952+UTC -2,bar,2024-03-24T12:03:30.013949+UTC,2024-03-26T12:03:30.013952+UTC -3,baz,2024-03-23T12:03:30.013951+UTC,2024-03-27T12:03:30.013953+UTC +1,foo,2024-03-26T07:48:28.878809+UTC,2024-03-26T07:48:28.878814+UTC +2,bar,2024-03-25T07:48:28.878811+UTC,2024-03-27T07:48:28.878814+UTC +3,baz,2024-03-24T07:48:28.878813+UTC,2024-03-28T07:48:28.878815+UTC diff --git a/test_data/data/csv_unix.csv b/test_data/data/csv_unix.csv index 39374a14..90e053dc 100644 --- a/test_data/data/csv_unix.csv +++ b/test_data/data/csv_unix.csv @@ -1,4 +1,4 @@ id,other,et,timestamp -1,foo,1711368210013947,1711368210013952 -2,bar,1711281810013949,1711454610013952 -3,baz,1711195410013951,1711541010013953 +1,foo,1711439308878809,1711439308878814 +2,bar,1711352908878811,1711525708878814 +3,baz,1711266508878813,1711612108878815 diff --git a/test_data/data/parquet_iso.parquet b/test_data/data/parquet_iso.parquet index 495950211a499ab6ecaac4513d27a085d312010c..8090ed741c8f440382671e5c1f84ed8abc495432 100644 GIT binary patch delta 248 zcmey!@quH)45kLg$=r;J&SoJ7=2j*aRz?C9W=ae!hK5c+ zriqmjgMcweMwCZVm4QuCf+@8`lu3+Dtbs{v!^Ho(s%XX-n!t@QZvq-%0W!dJav-Ck zMjH!*q>LDcXq*@W1Bf+=5zbt~B)WkSD%>`C7Go*ni^&2^%36O|Ma3A@t}x4Bvz<|G p2@`|ZBsNhA2DKHFW0~ZIfHsJ&0`bLmu!^y%aftP8?qhn)2mnLMF+BhP delta 247 zcmeys@sVS~492F3GnHISLkx|q42-Re4fG5QjV(>gwL?Rk<@A&oSWH69lo%K+oq&Wf zkR@Vj2$B-zkyK@1laydeEfHlBV-ss&65BNKpROvJIi^N1a~RB;fF_t*DKQ8bPYz&I z)aYPgkdzVQ5RDUKU;wcuF~XTkm_#=*LWMgf&txoRd^MS$Nm=V3tEd=*+BIewY_>D1 qEn#91o5Ch4!JxKkatxEa5YPs(H6Xs&4puQXH4d=}n|qlaGXenKTQTea diff --git a/test_data/data/parquet_unix.parquet b/test_data/data/parquet_unix.parquet index cc087d999dbb582aac22fcca0eb4132f95cb55d9..425648ec06de272db141e75d03b9e909ee19aa50 100644 GIT binary patch delta 58 zcmdnWv6W-O8qu4@{4L!gYz()BEz-(?^xe$~9?26Anuy+mO5D5rY~g&M#C_v^`!{dq IV$@~^0F84NO8@`> delta 58 zcmdnWv6W-O8qwePY`#^Aurd6tOTCc>r2otH?FyQB&_tBsK171SUPPoHD8V@Ywc+B; IT#VYx0KPpG^#A|> diff --git a/test_data/feature-store.json b/test_data/feature-store.json index 0ef43495..ce47b38c 100644 --- a/test_data/feature-store.json +++ b/test_data/feature-store.json @@ -1 +1 @@ -{"metadata": {"created_at": "2024-03-25T12:03:30.935743", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic_parquet", "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}, "date_formatter": {"name": "noop"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "derived_features": [{"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": []}, {"name": "titanic", "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "updated_at", "dtype": {"name": "datetime-UTC"}, "description": null, "tags": null, "constraints": null}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "derived_features": [{"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime-UTC"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": [], "acceptable_freshness": 86400.0, "unacceptable_freshness": 172800.0}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} +{"metadata": {"created_at": "2024-03-26T07:48:29.807389", "name": "feature_store_location.py", "repo_url": null, "github_url": null}, "feature_views": [{"name": "titanic_parquet", "source": {"mapping_keys": {}, "type_name": "parquet", "path": "test_data/titanic.parquet", "config": {"engine": "auto", "compression": "snappy", "should_write_index": false}, "date_formatter": {"name": "noop"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "derived_features": [{"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic_parquet", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": null, "stream_data_source": null, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": []}, {"name": "titanic", "source": {"mapping_keys": {"PassengerId": "passenger_id", "Age": "age", "Sex": "sex", "Survived": "survived", "SibSp": "sibsp", "UpdatedAt": "updated_at"}, "type_name": "csv", "path": "test_data/titanic_scd_data.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "updated_at", "dtype": {"name": "datetime-UTC"}, "description": null, "tags": null, "constraints": null}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "derived_features": [{"name": "double_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul_val", "dtype": {"name": "float"}, "key": "sibsp", "value": {"name": "int", "value": 2}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "square_sibsp", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "mul", "dtype": {"name": "float"}, "front": "sibsp", "behind": "sibsp"}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "word_vectoriser", "dtype": {"name": "embedding"}, "key": "name", "model": {"name": "gensim", "model_name": "glove-wiki-gigaword-50", "config": {"to_lowercase": false, "deaccent": false, "encoding": "utf8", "errors": "strict"}, "loaded_model": null}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}], "tags": null, "description": "Some features from the titanic dataset", "aggregated_features": [], "event_timestamp": {"name": "updated_at", "ttl": null, "description": null, "tags": null, "dtype": {"name": "datetime-UTC"}}, "stream_data_source": {"mapping_keys": {}, "name": "redis", "topic_name": "titanic_stream", "config": {"env_var": "REDIS_URL"}, "record_coder": {"coder_type": "json", "key": "json"}}, "application_source": null, "materialized_source": null, "acceptable_freshness": null, "unacceptable_freshness": null, "event_triggers": null, "contacts": null, "indexes": [{"location": {"name": "titanic", "location": "feature_view"}, "vector": {"name": "name_embedding", "dtype": {"name": "embedding"}, "description": null, "tags": null, "constraints": null}, "vector_dim": 50, "metadata": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}]}], "storage": {"type_name": "redis", "config": {"env_var": "REDIS_URL"}, "name": "name_embedding_index", "initial_cap": 10000, "distance_metric": "COSINE", "index_alogrithm": "FLAT", "embedding_type": "FLOAT32"}, "entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}]}]}], "combined_feature_views": [], "models": [{"name": "titanic", "features": {"default_version": "default", "versions": {"default": [{"name": "age", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "float"}}, {"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}, {"name": "has_siblings", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, {"name": "is_male", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}]}}, "predictions_view": {"entities": [], "features": [{"name": "probability", "dtype": {"name": "float"}, "description": "The probability of target named will_survive being 'True'.", "tags": null, "constraints": null}], "derived_features": [{"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "probability", "location": {"name": "titanic", "location": "model"}, "dtype": {"name": "float"}}], "transformation": {"name": "map_arg_max", "dtype": {"name": "bool"}, "column_mappings": {"probability": {"name": "bool", "value": true}}}, "depth": 1}], "model_version_column": null, "event_timestamp": null, "source": null, "application_source": null, "stream_source": null, "regression_targets": [], "classification_targets": [{"estimating": {"name": "survived", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "bool"}}, "feature": {"name": "will_survive", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null}, "on_ground_truth_event": null, "event_trigger": null, "class_probabilities": [{"outcome": {"name": "bool", "value": true}, "feature": {"name": "probability", "dtype": {"name": "float"}, "description": null, "tags": null, "constraints": null}}], "confidence": null}], "recommendation_targets": [], "acceptable_freshness": 86400.0, "unacceptable_freshness": 172800.0}, "description": "A model predicting if a passenger will survive", "contacts": null, "tags": null, "dataset_store": null, "exposed_at_url": null}], "enrichers": []} diff --git a/test_data/loan.csv b/test_data/loan.csv index 4c720313..b97ff1f5 100644 --- a/test_data/loan.csv +++ b/test_data/loan.csv @@ -1,7 +1,7 @@ -event_timestamp,personal_income,loan_amount,loan_status,loan_id -2020-04-26 18:01:04.746575+00:00,59000,35000,True,10000 -2020-04-26 18:01:04.746575+00:00,9600,1000,False,10001 -2020-04-26 18:01:04.746575+00:00,9600,5500,True,10002 -2020-04-27 18:01:04.746575+00:00,65500,35000,True,10000 -2020-04-27 18:01:04.746575+00:00,54400,35000,True,10001 -2020-04-27 18:01:04.746575+00:00,9900,2500,True,10002 +personal_income,loan_amount,event_timestamp,loan_id,loan_status +59000,35000,2020-04-26 18:01:04.746575+00:00,10000,True +9600,1000,2020-04-26 18:01:04.746575+00:00,10001,False +9600,5500,2020-04-26 18:01:04.746575+00:00,10002,True +65500,35000,2020-04-27 18:01:04.746575+00:00,10000,True +54400,35000,2020-04-27 18:01:04.746575+00:00,10001,True +9900,2500,2020-04-27 18:01:04.746575+00:00,10002,True diff --git a/test_data/test_model.parquet b/test_data/test_model.parquet index 51cec3ff78a1238ef84cf46cf0be3a54f64af537..047afa77633876b50c7c4dee8bc00f88968d1098 100644 GIT binary patch delta 53 ucmcb{a*btzoG=puCnEzRC%Xd!GXn&$Of)nX<^u_HfrLe1!W*k^GXem%dj@y_ delta 53 scmcb{a*btzoG>#3CnEzRC%Xd!3j+i&O*AwY76A!!Aq(?utiH_%0Jeb!cmMzZ diff --git a/test_data/titanic-sets.json b/test_data/titanic-sets.json index 6dec8ec0..af1fc2df 100644 --- a/test_data/titanic-sets.json +++ b/test_data/titanic-sets.json @@ -1 +1 @@ -{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}]}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} +{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} diff --git a/test_data/titanic-test.csv b/test_data/titanic-test.csv index 38aaa8df..482b99e5 100644 --- a/test_data/titanic-test.csv +++ b/test_data/titanic-test.csv @@ -1,21 +1,21 @@ -age,survived,has_siblings,passenger_id,is_mr,cabin,is_male,is_female,name,sibsp,sex -22.0,False,False,61,True,,True,False,"Sirayanian, Mr. Orsen",0,male -38.0,True,False,62,False,B28,False,True,"Icard, Miss. Amelie",0,female -45.0,False,True,63,True,C83,True,False,"Harris, Mr. Henry Birkhardt",1,male -4.0,False,True,64,False,,True,False,"Skoog, Master. Harald",3,male -,False,False,65,True,,True,False,"Stewart, Mr. Albert A",0,male -,True,True,66,False,,True,False,"Moubarek, Master. Gerios",1,male -29.0,True,False,67,True,F33,False,True,"Nye, Mrs. (Elizabeth Ramell)",0,female -19.0,False,False,68,True,,True,False,"Crease, Mr. Ernest James",0,male -17.0,True,True,69,False,,False,True,"Andersson, Miss. Erna Alexandra",4,female -26.0,False,True,70,True,,True,False,"Kink, Mr. Vincenz",2,male -32.0,False,False,71,True,,True,False,"Jenkin, Mr. Stephen Curnow",0,male -16.0,False,True,72,False,,False,True,"Goodwin, Miss. Lillian Amy",5,female -21.0,False,False,73,True,,True,False,"Hood, Mr. Ambrose Jr",0,male -26.0,False,True,74,True,,True,False,"Chronopoulos, Mr. Apostolos",1,male -32.0,True,False,75,True,,True,False,"Bing, Mr. Lee",0,male -25.0,False,False,76,True,F G73,True,False,"Moen, Mr. Sigurd Hansen",0,male -,False,False,77,True,,True,False,"Staneff, Mr. Ivan",0,male -,False,False,78,True,,True,False,"Moutal, Mr. Rahamin Haim",0,male -0.83,True,False,79,False,,True,False,"Caldwell, Master. Alden Gates",0,male -30.0,True,False,80,False,,False,True,"Dowdell, Miss. Elizabeth",0,female +sex,is_mr,name,age,cabin,sibsp,is_male,is_female,has_siblings,survived,passenger_id +male,True,"Sirayanian, Mr. Orsen",22.0,,0,True,False,False,False,61 +female,False,"Icard, Miss. Amelie",38.0,B28,0,False,True,False,True,62 +male,True,"Harris, Mr. Henry Birkhardt",45.0,C83,1,True,False,True,False,63 +male,False,"Skoog, Master. Harald",4.0,,3,True,False,True,False,64 +male,True,"Stewart, Mr. Albert A",,,0,True,False,False,False,65 +male,False,"Moubarek, Master. Gerios",,,1,True,False,True,True,66 +female,True,"Nye, Mrs. (Elizabeth Ramell)",29.0,F33,0,False,True,False,True,67 +male,True,"Crease, Mr. Ernest James",19.0,,0,True,False,False,False,68 +female,False,"Andersson, Miss. Erna Alexandra",17.0,,4,False,True,True,True,69 +male,True,"Kink, Mr. Vincenz",26.0,,2,True,False,True,False,70 +male,True,"Jenkin, Mr. Stephen Curnow",32.0,,0,True,False,False,False,71 +female,False,"Goodwin, Miss. Lillian Amy",16.0,,5,False,True,True,False,72 +male,True,"Hood, Mr. Ambrose Jr",21.0,,0,True,False,False,False,73 +male,True,"Chronopoulos, Mr. Apostolos",26.0,,1,True,False,True,False,74 +male,True,"Bing, Mr. Lee",32.0,,0,True,False,False,True,75 +male,True,"Moen, Mr. Sigurd Hansen",25.0,F G73,0,True,False,False,False,76 +male,True,"Staneff, Mr. Ivan",,,0,True,False,False,False,77 +male,True,"Moutal, Mr. Rahamin Haim",,,0,True,False,False,False,78 +male,False,"Caldwell, Master. Alden Gates",0.83,,0,True,False,False,True,79 +female,False,"Dowdell, Miss. Elizabeth",30.0,,0,False,True,False,True,80 diff --git a/test_data/titanic-train.csv b/test_data/titanic-train.csv index e05f4099..bbe11ef0 100644 --- a/test_data/titanic-train.csv +++ b/test_data/titanic-train.csv @@ -1,61 +1,61 @@ -age,survived,has_siblings,passenger_id,is_mr,cabin,is_male,is_female,name,sibsp,sex -22.0,False,True,1,True,,True,False,"Braund, Mr. Owen Harris",1,male -38.0,True,True,2,True,C85,False,True,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",1,female -26.0,True,False,3,False,,False,True,"Heikkinen, Miss. Laina",0,female -35.0,True,True,4,True,C123,False,True,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female -35.0,False,False,5,True,,True,False,"Allen, Mr. William Henry",0,male -,False,False,6,True,,True,False,"Moran, Mr. James",0,male -54.0,False,False,7,True,E46,False,False,"McCarthy, Mr. Timothy J",0,other -2.0,False,True,8,False,,True,False,"Palsson, Master. Gosta Leonard",3,male -27.0,True,False,9,True,,False,True,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,female -14.0,True,True,10,True,,False,True,"Nasser, Mrs. Nicholas (Adele Achem)",1,female -4.0,True,True,11,False,G6,False,True,"Sandstrom, Miss. Marguerite Rut",1,female -58.0,True,False,12,False,C103,False,True,"Bonnell, Miss. Elizabeth",0,female -20.0,False,False,13,True,,True,False,"Saundercock, Mr. William Henry",0,male -39.0,False,True,14,True,,True,False,"Andersson, Mr. Anders Johan",1,male -14.0,False,False,15,False,,False,True,"Vestrom, Miss. Hulda Amanda Adolfina",0,female -55.0,True,False,16,True,,False,True,"Hewlett, Mrs. (Mary D Kingcome) ",0,female -2.0,False,True,17,False,,True,False,"Rice, Master. Eugene",4,male -,True,False,18,True,,True,False,"Williams, Mr. Charles Eugene",0,male -31.0,False,True,19,True,,False,True,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",1,female -,True,False,20,True,,False,True,"Masselmani, Mrs. Fatima",0,female -35.0,False,False,21,True,,True,False,"Fynney, Mr. Joseph J",0,male -34.0,True,False,22,True,D56,True,False,"Beesley, Mr. Lawrence",0,male -15.0,True,False,23,False,,False,True,"McGowan, Miss. Anna ""Annie""",0,female -28.0,True,False,24,True,A6,True,False,"Sloper, Mr. William Thompson",0,male -8.0,False,True,25,False,,False,True,"Palsson, Miss. Torborg Danira",3,female -38.0,True,True,26,True,,False,True,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",1,female -,False,False,27,True,,True,False,"Emir, Mr. Farred Chehab",0,male -19.0,False,True,28,True,C23 C25 C27,True,False,"Fortune, Mr. Charles Alexander",3,male -,True,False,29,False,,False,True,"O'Dwyer, Miss. Ellen ""Nellie""",0,female -,False,False,30,True,,True,False,"Todoroff, Mr. Lalio",0,male -40.0,False,False,31,False,,True,False,"Uruchurtu, Don. Manuel E",0,male -,True,True,32,True,B78,False,True,"Spencer, Mrs. William Augustus (Marie Eugenie)",1,female -,True,False,33,False,,False,True,"Glynn, Miss. Mary Agatha",0,female -66.0,False,False,34,True,,True,False,"Wheadon, Mr. Edward H",0,male -28.0,False,True,35,True,,True,False,"Meyer, Mr. Edgar Joseph",1,male -42.0,False,True,36,True,,True,False,"Holverson, Mr. Alexander Oskar",1,male -,True,False,37,True,,True,False,"Mamee, Mr. Hanna",0,male -21.0,False,False,38,True,,True,False,"Cann, Mr. Ernest Charles",0,male -18.0,False,True,39,False,,False,True,"Vander Planke, Miss. Augusta Maria",2,female -14.0,True,True,40,False,,False,True,"Nicola-Yarred, Miss. Jamila",1,female -40.0,False,True,41,True,,False,True,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",1,female -27.0,False,True,42,True,,False,True,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",1,female -,False,False,43,True,,True,False,"Kraeff, Mr. Theodor",0,male -3.0,True,True,44,False,,False,True,"Laroche, Miss. Simonne Marie Anne Andree",1,female -19.0,True,False,45,False,,False,True,"Devaney, Miss. Margaret Delia",0,female -,False,False,46,True,,True,False,"Rogers, Mr. William John",0,male -,False,True,47,True,,True,False,"Lennon, Mr. Denis",1,male -,True,False,48,False,,False,True,"O'Driscoll, Miss. Bridget",0,female -,False,True,49,True,,True,False,"Samaan, Mr. Youssef",2,male -18.0,False,True,50,True,,False,True,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",1,female -7.0,False,True,51,False,,True,False,"Panula, Master. Juha Niilo",4,male -21.0,False,False,52,True,,True,False,"Nosworthy, Mr. Richard Cater",0,male -49.0,True,True,53,True,D33,False,True,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",1,female -29.0,True,True,54,True,,False,True,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",1,female -65.0,False,False,55,True,B30,True,False,"Ostby, Mr. Engelhart Cornelius",0,male -,True,False,56,True,C52,True,False,"Woolner, Mr. Hugh",0,male -21.0,True,False,57,False,,False,True,"Rugg, Miss. Emily",0,female -28.5,False,False,58,True,,True,False,"Novel, Mr. Mansouer",0,male -5.0,True,True,59,False,,False,True,"West, Miss. Constance Mirium",1,female -11.0,False,True,60,False,,True,False,"Goodwin, Master. William Frederick",5,male +sex,is_mr,name,age,cabin,sibsp,is_male,is_female,has_siblings,survived,passenger_id +male,True,"Braund, Mr. Owen Harris",22.0,,1,True,False,True,False,1 +female,True,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",38.0,C85,1,False,True,True,True,2 +female,False,"Heikkinen, Miss. Laina",26.0,,0,False,True,False,True,3 +female,True,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,C123,1,False,True,True,True,4 +male,True,"Allen, Mr. William Henry",35.0,,0,True,False,False,False,5 +male,True,"Moran, Mr. James",,,0,True,False,False,False,6 +other,True,"McCarthy, Mr. Timothy J",54.0,E46,0,False,False,False,False,7 +male,False,"Palsson, Master. Gosta Leonard",2.0,,3,True,False,True,False,8 +female,True,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,,0,False,True,False,True,9 +female,True,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,,1,False,True,True,True,10 +female,False,"Sandstrom, Miss. Marguerite Rut",4.0,G6,1,False,True,True,True,11 +female,False,"Bonnell, Miss. Elizabeth",58.0,C103,0,False,True,False,True,12 +male,True,"Saundercock, Mr. William Henry",20.0,,0,True,False,False,False,13 +male,True,"Andersson, Mr. Anders Johan",39.0,,1,True,False,True,False,14 +female,False,"Vestrom, Miss. Hulda Amanda Adolfina",14.0,,0,False,True,False,False,15 +female,True,"Hewlett, Mrs. (Mary D Kingcome) ",55.0,,0,False,True,False,True,16 +male,False,"Rice, Master. Eugene",2.0,,4,True,False,True,False,17 +male,True,"Williams, Mr. Charles Eugene",,,0,True,False,False,True,18 +female,True,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",31.0,,1,False,True,True,False,19 +female,True,"Masselmani, Mrs. Fatima",,,0,False,True,False,True,20 +male,True,"Fynney, Mr. Joseph J",35.0,,0,True,False,False,False,21 +male,True,"Beesley, Mr. Lawrence",34.0,D56,0,True,False,False,True,22 +female,False,"McGowan, Miss. Anna ""Annie""",15.0,,0,False,True,False,True,23 +male,True,"Sloper, Mr. William Thompson",28.0,A6,0,True,False,False,True,24 +female,False,"Palsson, Miss. Torborg Danira",8.0,,3,False,True,True,False,25 +female,True,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",38.0,,1,False,True,True,True,26 +male,True,"Emir, Mr. Farred Chehab",,,0,True,False,False,False,27 +male,True,"Fortune, Mr. Charles Alexander",19.0,C23 C25 C27,3,True,False,True,False,28 +female,False,"O'Dwyer, Miss. Ellen ""Nellie""",,,0,False,True,False,True,29 +male,True,"Todoroff, Mr. Lalio",,,0,True,False,False,False,30 +male,False,"Uruchurtu, Don. Manuel E",40.0,,0,True,False,False,False,31 +female,True,"Spencer, Mrs. William Augustus (Marie Eugenie)",,B78,1,False,True,True,True,32 +female,False,"Glynn, Miss. Mary Agatha",,,0,False,True,False,True,33 +male,True,"Wheadon, Mr. Edward H",66.0,,0,True,False,False,False,34 +male,True,"Meyer, Mr. Edgar Joseph",28.0,,1,True,False,True,False,35 +male,True,"Holverson, Mr. Alexander Oskar",42.0,,1,True,False,True,False,36 +male,True,"Mamee, Mr. Hanna",,,0,True,False,False,True,37 +male,True,"Cann, Mr. Ernest Charles",21.0,,0,True,False,False,False,38 +female,False,"Vander Planke, Miss. Augusta Maria",18.0,,2,False,True,True,False,39 +female,False,"Nicola-Yarred, Miss. Jamila",14.0,,1,False,True,True,True,40 +female,True,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",40.0,,1,False,True,True,False,41 +female,True,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",27.0,,1,False,True,True,False,42 +male,True,"Kraeff, Mr. Theodor",,,0,True,False,False,False,43 +female,False,"Laroche, Miss. Simonne Marie Anne Andree",3.0,,1,False,True,True,True,44 +female,False,"Devaney, Miss. Margaret Delia",19.0,,0,False,True,False,True,45 +male,True,"Rogers, Mr. William John",,,0,True,False,False,False,46 +male,True,"Lennon, Mr. Denis",,,1,True,False,True,False,47 +female,False,"O'Driscoll, Miss. Bridget",,,0,False,True,False,True,48 +male,True,"Samaan, Mr. Youssef",,,2,True,False,True,False,49 +female,True,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",18.0,,1,False,True,True,False,50 +male,False,"Panula, Master. Juha Niilo",7.0,,4,True,False,True,False,51 +male,True,"Nosworthy, Mr. Richard Cater",21.0,,0,True,False,False,False,52 +female,True,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",49.0,D33,1,False,True,True,True,53 +female,True,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",29.0,,1,False,True,True,True,54 +male,True,"Ostby, Mr. Engelhart Cornelius",65.0,B30,0,True,False,False,False,55 +male,True,"Woolner, Mr. Hugh",,C52,0,True,False,False,True,56 +female,False,"Rugg, Miss. Emily",21.0,,0,False,True,False,True,57 +male,True,"Novel, Mr. Mansouer",28.5,,0,True,False,False,False,58 +female,False,"West, Miss. Constance Mirium",5.0,,1,False,True,True,True,59 +male,False,"Goodwin, Master. William Frederick",11.0,,5,True,False,True,False,60 diff --git a/test_data/titanic-validate.csv b/test_data/titanic-validate.csv index e845825f..6bc56a4b 100644 --- a/test_data/titanic-validate.csv +++ b/test_data/titanic-validate.csv @@ -1,21 +1,21 @@ -age,survived,has_siblings,passenger_id,is_mr,cabin,is_male,is_female,name,sibsp,sex -22.0,False,False,81,True,,True,False,"Waelens, Mr. Achille",0,male -29.0,True,False,82,True,,True,False,"Sheerlinck, Mr. Jan Baptist",0,male -,True,False,83,False,,False,True,"McDermott, Miss. Brigdet Delia",0,female -28.0,False,False,84,True,,True,False,"Carrau, Mr. Francisco M",0,male -17.0,True,False,85,False,,False,True,"Ilett, Miss. Bertha",0,female -33.0,True,True,86,True,,False,True,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",3,female -16.0,False,True,87,True,,True,False,"Ford, Mr. William Neal",1,male -,False,False,88,True,,True,False,"Slocovski, Mr. Selman Francis",0,male -23.0,True,True,89,False,C23 C25 C27,False,True,"Fortune, Miss. Mabel Helen",3,female -24.0,False,False,90,True,,True,False,"Celotti, Mr. Francesco",0,male -29.0,False,False,91,True,,True,False,"Christmann, Mr. Emil",0,male -20.0,False,False,92,True,,True,False,"Andreasson, Mr. Paul Edvin",0,male -46.0,False,True,93,True,E31,True,False,"Chaffee, Mr. Herbert Fuller",1,male -26.0,False,True,94,True,,True,False,"Dean, Mr. Bertram Frank",1,male -59.0,False,False,95,True,,True,False,"Coxon, Mr. Daniel",0,male -,False,False,96,True,,True,False,"Shorney, Mr. Charles Joseph",0,male -71.0,False,False,97,True,A5,True,False,"Goldschmidt, Mr. George B",0,male -23.0,True,False,98,True,D10 D12,True,False,"Greenfield, Mr. William Bertram",0,male -34.0,True,False,99,True,,False,True,"Doling, Mrs. John T (Ada Julia Bone)",0,female -34.0,False,True,100,True,,True,False,"Kantor, Mr. Sinai",1,male +sex,is_mr,name,age,cabin,sibsp,is_male,is_female,has_siblings,survived,passenger_id +male,True,"Waelens, Mr. Achille",22.0,,0,True,False,False,False,81 +male,True,"Sheerlinck, Mr. Jan Baptist",29.0,,0,True,False,False,True,82 +female,False,"McDermott, Miss. Brigdet Delia",,,0,False,True,False,True,83 +male,True,"Carrau, Mr. Francisco M",28.0,,0,True,False,False,False,84 +female,False,"Ilett, Miss. Bertha",17.0,,0,False,True,False,True,85 +female,True,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",33.0,,3,False,True,True,True,86 +male,True,"Ford, Mr. William Neal",16.0,,1,True,False,True,False,87 +male,True,"Slocovski, Mr. Selman Francis",,,0,True,False,False,False,88 +female,False,"Fortune, Miss. Mabel Helen",23.0,C23 C25 C27,3,False,True,True,True,89 +male,True,"Celotti, Mr. Francesco",24.0,,0,True,False,False,False,90 +male,True,"Christmann, Mr. Emil",29.0,,0,True,False,False,False,91 +male,True,"Andreasson, Mr. Paul Edvin",20.0,,0,True,False,False,False,92 +male,True,"Chaffee, Mr. Herbert Fuller",46.0,E31,1,True,False,True,False,93 +male,True,"Dean, Mr. Bertram Frank",26.0,,1,True,False,True,False,94 +male,True,"Coxon, Mr. Daniel",59.0,,0,True,False,False,False,95 +male,True,"Shorney, Mr. Charles Joseph",,,0,True,False,False,False,96 +male,True,"Goldschmidt, Mr. George B",71.0,A5,0,True,False,False,False,97 +male,True,"Greenfield, Mr. William Bertram",23.0,D10 D12,0,True,False,False,True,98 +female,True,"Doling, Mrs. John T (Ada Julia Bone)",34.0,,0,False,True,False,True,99 +male,True,"Kantor, Mr. Sinai",34.0,,1,True,False,True,False,100