From 246ce6da0e0947d7547881bb3cf48c7c854c6df8 Mon Sep 17 00:00:00 2001 From: "Mats E. Mollestad" Date: Mon, 25 Mar 2024 13:05:55 +0100 Subject: [PATCH] Improved datetime encoding --- .../feature_view/tests/test_combined_view.py | 19 +-- aligned/local/job.py | 22 ++-- aligned/retrival_job.py | 44 +------ aligned/schemas/date_formatter.py | 33 ++++- aligned/sources/azure_blob_storage.py | 108 +++++++++++++--- aligned/sources/local.py | 68 +++++++++- aligned/tests/test_timestamp_decoding.py | 104 +++++++++++++++ pyproject.toml | 2 +- test_data/credit_history.csv | 14 +- test_data/credit_history_mater.parquet | Bin 988 -> 991 bytes test_data/data/csv.csv | 4 + test_data/data/csv_iso.csv | 4 + test_data/data/csv_unix.csv | 4 + test_data/data/parquet.parquet | Bin 0 -> 1077 bytes test_data/data/parquet_iso.parquet | Bin 0 -> 1137 bytes test_data/data/parquet_unix.parquet | Bin 0 -> 1077 bytes test_data/feature-store.json | 2 +- test_data/loan.csv | 14 +- test_data/test_model.csv | 8 +- test_data/test_model.parquet | Bin 594 -> 598 bytes test_data/titanic-sets.json | 2 +- test_data/titanic-test.csv | 42 +++--- test_data/titanic-train.csv | 122 +++++++++--------- test_data/titanic-validate.csv | 42 +++--- 24 files changed, 443 insertions(+), 215 deletions(-) create mode 100644 aligned/tests/test_timestamp_decoding.py create mode 100644 test_data/data/csv.csv create mode 100644 test_data/data/csv_iso.csv create mode 100644 test_data/data/csv_unix.csv create mode 100644 test_data/data/parquet.parquet create mode 100644 test_data/data/parquet_iso.parquet create mode 100644 test_data/data/parquet_unix.parquet diff --git a/aligned/feature_view/tests/test_combined_view.py b/aligned/feature_view/tests/test_combined_view.py index 3c15cd4a..3d58fa8e 100644 --- a/aligned/feature_view/tests/test_combined_view.py +++ b/aligned/feature_view/tests/test_combined_view.py @@ -1,6 +1,6 @@ import pytest -from aligned import FeatureStore, feature_view, Int32, FileSource +from aligned import FeatureStore, feature_view, Int32, Int64, FileSource @pytest.mark.asyncio @@ -44,19 +44,19 @@ async def test_new_combined_solution() -> None: @feature_view(name='test', source=FileSource.csv_at('test_data/test.csv')) class Test: - some_id = Int32().as_entity() + some_id = Int64().as_entity() - feature = Int32() + feature = Int64() derived_feature = feature * 10 @feature_view(name='other', source=FileSource.csv_at('test_data/other.csv')) class Other: - other_id = Int32().as_entity() - some_id = Int32() + other_id = Int64().as_entity() + some_id = Int64() - other_feature = Int32() + other_feature = Int64() test_feature = other_feature * 10 @@ -65,13 +65,14 @@ class Other: @feature_view(name='combined', source=Test.join(other, on=test.some_id)) # type: ignore class Combined: - some_id = Int32().as_entity() + some_id = Int64().as_entity() new_feature = test.derived_feature * other.test_feature result = await Combined.query().all().to_pandas() # type: ignore - result['new_feature'] = result['new_feature'].astype('int64') - assert result[expected_df.columns].equals(expected_df) + + new_df = result.sort_values('some_id', ascending=True)[expected_df.columns].reset_index(drop=True) + assert new_df.equals(expected_df) @pytest.mark.asyncio diff --git a/aligned/local/job.py b/aligned/local/job.py index 77d1e9bc..6d29fa3e 100644 --- a/aligned/local/job.py +++ b/aligned/local/job.py @@ -8,9 +8,12 @@ from aligned.request.retrival_request import AggregatedFeature, AggregateOver, RetrivalRequest from aligned.retrival_job import RequestResult, RetrivalJob from aligned.schemas.date_formatter import DateFormatter -from aligned.schemas.feature import Feature, FeatureType +from aligned.schemas.feature import Feature from aligned.sources.local import DataFileReference from aligned.schemas.constraints import Optional +import logging + +logger = logging.getLogger(__name__) class LiteralRetrivalJob(RetrivalJob): @@ -133,14 +136,14 @@ def decode_timestamps(df: pl.LazyFrame, request: RetrivalRequest, formatter: Dat and feature.name in df.columns and not isinstance(dtypes[feature.name], pl.Datetime) ): - columns.add((feature.name, None)) + columns.add((feature.name, feature.dtype.datetime_timezone)) if ( request.event_timestamp and request.event_timestamp.name in df.columns and not isinstance(dtypes[request.event_timestamp.name], pl.Datetime) ): - columns.add((request.event_timestamp.name, None)) + columns.add((request.event_timestamp.name, request.event_timestamp.dtype.datetime_timezone)) if not columns: return df @@ -148,6 +151,8 @@ def decode_timestamps(df: pl.LazyFrame, request: RetrivalRequest, formatter: Dat exprs = [] for column, time_zone in columns: + logger.info(f'Decoding column {column} with timezone {time_zone}') + if time_zone is None: exprs.append(formatter.decode_polars(column).alias(column)) else: @@ -379,7 +384,7 @@ async def file_transformations(self, df: pl.LazyFrame) -> pl.LazyFrame: did_rename_event_timestamp = True row_id_name = 'row_id' - result = result.with_row_count(row_id_name) + result = result.with_row_index(row_id_name) for request in self.requests: entity_names = request.entity_names @@ -468,11 +473,6 @@ async def file_transformations(self, df: pl.LazyFrame) -> pl.LazyFrame: field = request.event_timestamp.name ttl = request.event_timestamp.ttl - if new_result.select(field).dtypes[0] == pl.Utf8(): - new_result = new_result.with_columns( - pl.col(field).str.strptime(pl.Datetime, '%+').alias(field) - ) - if ttl: ttl_request = (pl.col(field) <= pl.col(event_timestamp_col)) & ( pl.col(field) >= pl.col(event_timestamp_col) - ttl @@ -484,9 +484,7 @@ async def file_transformations(self, df: pl.LazyFrame) -> pl.LazyFrame: ) new_result = new_result.sort(field, descending=True).select(pl.exclude(field)) elif request.event_timestamp: - new_result = new_result.sort( - [row_id_name, request.event_timestamp.name], descending=True - ).select(pl.exclude(request.event_timestamp.name)) + new_result = new_result.sort([row_id_name, request.event_timestamp.name], descending=True) unique = new_result.unique(subset=row_id_name, keep='first') column_selects.remove('row_id') diff --git a/aligned/retrival_job.py b/aligned/retrival_job.py index f2751717..e4fc46a1 100644 --- a/aligned/retrival_job.py +++ b/aligned/retrival_job.py @@ -6,7 +6,6 @@ import timeit from abc import ABC, abstractmethod from collections import defaultdict -from contextlib import suppress from dataclasses import dataclass, field from datetime import datetime from typing import TYPE_CHECKING, Callable, Union, TypeVar, Coroutine, Any @@ -1889,47 +1888,8 @@ def retrival_requests(self) -> list[RetrivalRequest]: return self.requests async def to_pandas(self) -> pd.DataFrame: - df = await self.job.to_pandas() - for request in self.requests: - features_to_check = request.all_required_features - - if request.aggregated_features: - features_to_check = {feature.derived_feature for feature in request.aggregated_features} - - for feature in features_to_check: - - mask = ~df[feature.name].isnull() - - with suppress(AttributeError, TypeError): - df[feature.name] = df[feature.name].mask( - ~mask, other=df.loc[mask, feature.name].str.strip('"') - ) - - if feature.dtype.is_datetime: - df[feature.name] = pd.to_datetime(df[feature.name], infer_datetime_format=True, utc=True) - elif feature.dtype == FeatureType.string(): - continue - elif (feature.dtype.is_array) or (feature.dtype == FeatureType.embedding()): - import json - - if df[feature.name].dtype == 'object': - df[feature.name] = df[feature.name].apply( - lambda x: json.loads(x) if isinstance(x, str) else x - ) - elif (feature.dtype == FeatureType.json()) or feature.dtype.is_datetime: - pass - else: - if feature.dtype.is_numeric: - df[feature.name] = pd.to_numeric(df[feature.name], errors='coerce').astype( - feature.dtype.pandas_type - ) - else: - df[feature.name] = df[feature.name].astype(feature.dtype.pandas_type) - - if request.event_timestamp and request.event_timestamp.name in df.columns: - feature = request.event_timestamp - df[feature.name] = pd.to_datetime(df[feature.name], infer_datetime_format=True, utc=True) - return df + df = await self.to_polars() + return df.to_pandas() async def to_lazy_polars(self) -> pl.LazyFrame: df = await self.job.to_lazy_polars() diff --git a/aligned/schemas/date_formatter.py b/aligned/schemas/date_formatter.py index c67e3c41..80d5b55d 100644 --- a/aligned/schemas/date_formatter.py +++ b/aligned/schemas/date_formatter.py @@ -16,10 +16,7 @@ class AllDateFormatters: @classmethod def shared(cls) -> AllDateFormatters: if cls._shared is None: - formatters = [ - Timestamp, - StringDateFormatter, - ] + formatters = [Timestamp, StringDateFormatter, NoopFormatter] cls._shared = AllDateFormatters({formatter.name(): formatter for formatter in formatters}) return cls._shared @@ -58,12 +55,34 @@ def string_format(format: str) -> StringDateFormatter: @staticmethod def iso_8601() -> StringDateFormatter: - return StringDateFormatter('yyyy-MM-ddTHH:mm:ssZ') + return StringDateFormatter('%Y-%m-%dT%H:%M:%S%.f+%Z') @staticmethod def unix_timestamp(time_unit: TimeUnit = 'us', time_zone: str | None = 'UTC') -> Timestamp: return Timestamp(time_unit, time_zone) + @staticmethod + def noop() -> DateFormatter: + return NoopFormatter() + + +@dataclass +class NoopFormatter(DateFormatter): + """ + A formatter that assumes that the underlying format can store timestamps. + Therefore, no decoding or encoding is necessary. + """ + + @classmethod + def name(cls) -> str: + return 'noop' + + def decode_polars(self, column: str) -> pl.Expr: + return pl.col(column) + + def encode_polars(self, column: str) -> pl.Expr: + return pl.col(column) + @dataclass class Timestamp(DateFormatter): @@ -97,8 +116,8 @@ def name(cls) -> str: def decode_polars(self, column: str) -> pl.Expr: return pl.col(column).str.to_datetime( - self.date_format, time_unit=self.time_unit, time_zone=self.time_zone + format=self.date_format, time_unit=self.time_unit, time_zone=self.time_zone ) def encode_polars(self, column: str) -> pl.Expr: - return pl.col(column).dt.strftime(self.date_format) + return pl.col(column).dt.to_string(self.date_format) diff --git a/aligned/sources/azure_blob_storage.py b/aligned/sources/azure_blob_storage.py index ae1583c7..4ac869b0 100644 --- a/aligned/sources/azure_blob_storage.py +++ b/aligned/sources/azure_blob_storage.py @@ -78,18 +78,41 @@ def to_markdown(self) -> str: def json_at(self, path: str) -> StorageFileReference: raise NotImplementedError(type(self)) - def parquet_at(self, path: str) -> AzureBlobParquetDataSource: - return AzureBlobParquetDataSource(self, path) + def parquet_at( + self, + path: str, + mapping_keys: dict[str, str] | None = None, + date_formatter: DateFormatter | None = None, + ) -> AzureBlobParquetDataSource: + return AzureBlobParquetDataSource( + self, path, mapping_keys=mapping_keys or {}, date_formatter=date_formatter or DateFormatter.noop() + ) - def csv_at(self, path: str) -> AzureBlobCsvDataSource: - return AzureBlobCsvDataSource(self, path) + def csv_at( + self, + path: str, + mapping_keys: dict[str, str] | None = None, + date_formatter: DateFormatter | None = None, + ) -> AzureBlobCsvDataSource: + return AzureBlobCsvDataSource( + self, + path, + mapping_keys=mapping_keys or {}, + date_formatter=date_formatter or DateFormatter.unix_timestamp(), + ) def delta_at( self, path: str, mapping_keys: dict[str, str] | None = None, + date_formatter: DateFormatter | None = None, ) -> AzureBlobDeltaDataSource: - return AzureBlobDeltaDataSource(self, path, mapping_keys=mapping_keys or {}) + return AzureBlobDeltaDataSource( + self, + path, + mapping_keys=mapping_keys or {}, + date_formatter=date_formatter or DateFormatter.unix_timestamp(), + ) def directory(self, path: str) -> AzureBlobDirectory: return AzureBlobDirectory(self, Path(path)) @@ -149,21 +172,36 @@ class AzureBlobDirectory(Directory): def json_at(self, path: str) -> StorageFileReference: return AzureBlobDataSource(self.config, (self.sub_path / path).as_posix()) - def parquet_at(self, path: str) -> AzureBlobParquetDataSource: + def parquet_at( + self, + path: str, + mapping_keys: dict[str, str] | None = None, + date_formatter: DateFormatter | None = None, + ) -> AzureBlobParquetDataSource: sub_path = self.sub_path / path - return self.config.parquet_at(sub_path.as_posix()) + return self.config.parquet_at( + sub_path.as_posix(), date_formatter=date_formatter or DateFormatter.noop() + ) - def csv_at(self, path: str) -> AzureBlobCsvDataSource: + def csv_at( + self, + path: str, + mapping_keys: dict[str, str] | None = None, + date_formatter: DateFormatter | None = None, + ) -> AzureBlobCsvDataSource: sub_path = self.sub_path / path - return self.config.csv_at(sub_path.as_posix()) + return self.config.csv_at( + sub_path.as_posix(), date_formatter=date_formatter or DateFormatter.unix_timestamp() + ) def delta_at( self, path: str, mapping_keys: dict[str, str] | None = None, + date_formatter: DateFormatter | None = None, ) -> AzureBlobDeltaDataSource: sub_path = self.sub_path / path - return self.config.delta_at(sub_path.as_posix(), mapping_keys) + return self.config.delta_at(sub_path.as_posix(), mapping_keys, date_formatter=date_formatter) def sub_directory(self, path: str) -> AzureBlobDirectory: return AzureBlobDirectory(self.config, self.sub_path / path) @@ -224,6 +262,7 @@ class AzureBlobCsvDataSource( path: str mapping_keys: dict[str, str] = field(default_factory=dict) csv_config: CsvConfig = field(default_factory=CsvConfig) + date_formatter: DateFormatter = field(default_factory=lambda: DateFormatter.unix_timestamp()) type_name: str = 'azure_blob_csv' @@ -294,6 +333,26 @@ async def write(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> None df = await job.to_polars() await self.write_polars(df.select(features)) + def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> RetrivalJob: + return FileFactualJob(self, [request], facts, date_formatter=self.date_formatter) + + def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: + return FileFullJob(self, request, limit, date_formatter=self.date_formatter) + + def all_between_dates( + self, + request: RetrivalRequest, + start_date: datetime, + end_date: datetime, + ) -> RetrivalJob: + return FileDateJob( + source=self, + request=request, + start_date=start_date, + end_date=end_date, + date_formatter=self.date_formatter, + ) + @dataclass class AzureBlobParquetDataSource( @@ -305,6 +364,7 @@ class AzureBlobParquetDataSource( path: str mapping_keys: dict[str, str] = field(default_factory=dict) parquet_config: ParquetConfig = field(default_factory=ParquetConfig) + date_formatter: DateFormatter = field(default_factory=lambda: DateFormatter.noop()) type_name: str = 'azure_blob_parquet' @property @@ -372,6 +432,26 @@ async def write_polars(self, df: pl.LazyFrame) -> None: creds = self.config.read_creds() df.collect().to_pandas().to_parquet(url, storage_options=creds) + def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> RetrivalJob: + return FileFactualJob(self, [request], facts, date_formatter=self.date_formatter) + + def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: + return FileFullJob(self, request, limit, date_formatter=self.date_formatter) + + def all_between_dates( + self, + request: RetrivalRequest, + start_date: datetime, + end_date: datetime, + ) -> RetrivalJob: + return FileDateJob( + source=self, + request=request, + start_date=start_date, + end_date=end_date, + date_formatter=self.date_formatter, + ) + @dataclass class AzureBlobDeltaDataSource( @@ -421,9 +501,6 @@ async def freshness(self, event_timestamp: EventTimestamp) -> datetime | None: logger.info(f"Failed to get freshness for {self.path}. {error} - returning None.") return None - def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> RetrivalJob: - return FileFactualJob(self, [request], facts, date_formatter=self.date_formatter) - async def schema(self) -> dict[str, FeatureType]: try: schema = (await self.to_lazy_polars()).schema @@ -434,6 +511,9 @@ async def schema(self) -> dict[str, FeatureType]: except HTTPStatusError as error: raise UnableToFindFileException() from error + def features_for(self, facts: RetrivalJob, request: RetrivalRequest) -> RetrivalJob: + return FileFactualJob(self, [request], facts, date_formatter=self.date_formatter) + def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: return FileFullJob(self, request, limit, date_formatter=self.date_formatter) @@ -556,8 +636,6 @@ async def upsert(self, job: RetrivalJob, requests: list[RetrivalRequest]) -> Non url = f"az://{self.path}" merge_on = set() - schemas = {} - for request in requests: merge_on.update(request.entity_names) diff --git a/aligned/sources/local.py b/aligned/sources/local.py index a9c1f7cd..db2f3435 100644 --- a/aligned/sources/local.py +++ b/aligned/sources/local.py @@ -306,6 +306,7 @@ class ParquetFileSource(BatchDataSource, ColumnFeatureMappable, DataFileReferenc path: str mapping_keys: dict[str, str] = field(default_factory=dict) config: ParquetConfig = field(default_factory=ParquetConfig) + date_formatter: DateFormatter = field(default_factory=lambda: DateFormatter.noop()) type_name: str = 'parquet' @@ -356,12 +357,18 @@ async def write_polars(self, df: pl.LazyFrame) -> None: df.collect().write_parquet(self.path, compression=self.config.compression) def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: - return FileFullJob(self, request, limit) + return FileFullJob(self, request, limit, date_formatter=self.date_formatter) def all_between_dates( self, request: RetrivalRequest, start_date: datetime, end_date: datetime ) -> RetrivalJob: - return FileDateJob(source=self, request=request, start_date=start_date, end_date=end_date) + return FileDateJob( + source=self, + request=request, + start_date=start_date, + end_date=end_date, + date_formatter=self.date_formatter, + ) @classmethod def multi_source_features_for( @@ -377,6 +384,7 @@ def multi_source_features_for( source=source, requests=[request for _, request in requests], facts=facts, + date_formatter=source.date_formatter, ) async def schema(self) -> dict[str, FeatureType]: @@ -414,6 +422,7 @@ class DeltaFileSource(BatchDataSource, ColumnFeatureMappable, DataFileReference, path: str mapping_keys: dict[str, str] = field(default_factory=dict) config: DeltaFileConfig = field(default_factory=DeltaFileConfig) + date_formatter: DateFormatter = field(default_factory=lambda: DateFormatter.noop()) type_name: str = 'delta' @@ -444,6 +453,37 @@ async def write_polars(self, df: pl.LazyFrame) -> None: self.path, mode=self.config.mode, overwrite_schema=self.config.overwrite_schema ) + def all_data(self, request: RetrivalRequest, limit: int | None) -> RetrivalJob: + return FileFullJob(self, request, limit, date_formatter=self.date_formatter) + + def all_between_dates( + self, request: RetrivalRequest, start_date: datetime, end_date: datetime + ) -> RetrivalJob: + return FileDateJob( + source=self, + request=request, + start_date=start_date, + end_date=end_date, + date_formatter=self.date_formatter, + ) + + @classmethod + def multi_source_features_for( + cls, facts: RetrivalJob, requests: list[tuple[DeltaFileSource, RetrivalRequest]] + ) -> RetrivalJob: + + source = requests[0][0] + if not isinstance(source, cls): + raise ValueError(f'Only {cls} is supported, recived: {source}') + + # Group based on config + return FileFactualJob( + source=source, + requests=[request for _, request in requests], + facts=facts, + date_formatter=source.date_formatter, + ) + async def schema(self) -> dict[str, FeatureType]: parquet_schema = pl.read_delta(self.path).schema return {name: FeatureType.from_polars(pl_type) for name, pl_type in parquet_schema.items()} @@ -613,15 +653,31 @@ def csv_at( @staticmethod def parquet_at( - path: str, mapping_keys: dict[str, str] | None = None, config: ParquetConfig | None = None + path: str, + mapping_keys: dict[str, str] | None = None, + config: ParquetConfig | None = None, + date_formatter: DateFormatter | None = None, ) -> ParquetFileSource: - return ParquetFileSource(path=path, mapping_keys=mapping_keys or {}, config=config or ParquetConfig()) + return ParquetFileSource( + path=path, + mapping_keys=mapping_keys or {}, + config=config or ParquetConfig(), + date_formatter=date_formatter or DateFormatter.noop(), + ) @staticmethod def delta_at( - path: str, mapping_keys: dict[str, str] | None = None, config: DeltaFileConfig | None = None + path: str, + mapping_keys: dict[str, str] | None = None, + config: DeltaFileConfig | None = None, + date_formatter: DateFormatter | None = None, ) -> DeltaFileSource: - return DeltaFileSource(path, mapping_keys or {}, config=config or DeltaFileConfig()) + return DeltaFileSource( + path, + mapping_keys or {}, + config=config or DeltaFileConfig(), + date_formatter=date_formatter or DateFormatter.noop(), + ) @staticmethod def directory(path: str) -> FileDirectory: diff --git a/aligned/tests/test_timestamp_decoding.py b/aligned/tests/test_timestamp_decoding.py new file mode 100644 index 00000000..fca629e9 --- /dev/null +++ b/aligned/tests/test_timestamp_decoding.py @@ -0,0 +1,104 @@ +from datetime import datetime, timezone, timedelta +import polars as pl +from aligned import feature_view, FileSource, String, Timestamp, EventTimestamp, Int32 +import pytest +from aligned.data_file import DataFileReference + +from aligned.schemas.date_formatter import DateFormatter + + +@feature_view( + name='timestamp', + source=FileSource.parquet_at('test_data/data/placeholder.parquet'), +) +class TimestampView: + id = Int32().as_entity() + other = String() + et = EventTimestamp() + timestamp = Timestamp() + + +data = pl.DataFrame( + { + 'id': [1, 2, 3], + 'other': ['foo', 'bar', 'baz'], + 'et': [ + datetime.now(timezone.utc), + datetime.now(timezone.utc) - timedelta(days=1), + datetime.now(timezone.utc) - timedelta(days=2), + ], + 'timestamp': [ + datetime.now(timezone.utc), + datetime.now(timezone.utc) + timedelta(days=1), + datetime.now(timezone.utc) + timedelta(days=2), + ], + } +) + + +@pytest.mark.asyncio +async def test_unix_timestamp() -> None: + + formatter = DateFormatter.unix_timestamp() + + sources_to_test: list[DataFileReference] = [ + FileSource.parquet_at('test_data/data/parquet_unix.parquet', date_formatter=formatter), + FileSource.csv_at('test_data/data/csv_unix.csv', date_formatter=formatter), + # FileSource.delta_at("test_data/data/delta_unix", date_formatter=formatter), + ] + + converted_data = data.with_columns([formatter.encode_polars('et'), formatter.encode_polars('timestamp')]) + + for source in sources_to_test: + await source.write_polars(converted_data.lazy()) + + TimestampView.metadata.source = source # type: ignore + + view = TimestampView.query() # type: ignore + assert view.view.source == source + + df = await view.all().to_polars() + + assert df.select(data.columns).equals(data) + + df = await view.features_for( + { + 'id': [1, 2], + } + ).to_polars() + + assert df.select(data.columns).equals(data.filter(pl.col('id').is_in([1, 2]))) + + +@pytest.mark.asyncio +async def test_iso_timestamp() -> None: + + formatter = DateFormatter.iso_8601() + + sources_to_test = [ + FileSource.csv_at('test_data/data/csv_iso.csv', date_formatter=formatter), + FileSource.parquet_at('test_data/data/parquet_iso.parquet', date_formatter=formatter), + # FileSource.delta_at("test_data/data/delta", date_formatter=formatter), + ] + + converted_data = data.with_columns([formatter.encode_polars('et'), formatter.encode_polars('timestamp')]) + + for source in sources_to_test: + await source.write_polars(converted_data.lazy()) # type: ignore + + TimestampView.metadata.source = source # type: ignore + + view = TimestampView.query() # type: ignore + assert view.view.source == source + + df = await view.all().to_polars() + + assert df.select(data.columns).equals(data) + + df = await view.features_for( + { + 'id': [1, 2], + } + ).to_polars() + + assert df.select(data.columns).equals(data.filter(pl.col('id').is_in([1, 2]))) diff --git a/pyproject.toml b/pyproject.toml index 7b31c77e..d8cc15d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "aligned" -version = "0.0.88" +version = "0.0.89" description = "A data managment and lineage tool for ML applications." authors = ["Mats E. Mollestad "] license = "Apache-2.0" diff --git a/test_data/credit_history.csv b/test_data/credit_history.csv index 28597fd6..321d44ad 100644 --- a/test_data/credit_history.csv +++ b/test_data/credit_history.csv @@ -1,7 +1,7 @@ -credit_card_due,due_sum,dob_ssn,bankruptcies,student_loan_due,event_timestamp -8419,30747,19530219_5179,0,22328,2020-04-26 18:01:04.746575+00:00 -2944,5459,19520816_8737,0,2515,2020-04-26 18:01:04.746575+00:00 -833,33833,19860413_2537,0,33000,2020-04-26 18:01:04.746575+00:00 -5936,54891,19530219_5179,0,48955,2020-04-27 18:01:04.746575+00:00 -1575,11076,19520816_8737,0,9501,2020-04-27 18:01:04.746575+00:00 -6263,41773,19860413_2537,0,35510,2020-04-27 18:01:04.746575+00:00 +due_sum,event_timestamp,student_loan_due,credit_card_due,dob_ssn,bankruptcies +30747,2020-04-26 18:01:04.746575+00:00,22328,8419,19530219_5179,0 +5459,2020-04-26 18:01:04.746575+00:00,2515,2944,19520816_8737,0 +33833,2020-04-26 18:01:04.746575+00:00,33000,833,19860413_2537,0 +54891,2020-04-27 18:01:04.746575+00:00,48955,5936,19530219_5179,0 +11076,2020-04-27 18:01:04.746575+00:00,9501,1575,19520816_8737,0 +41773,2020-04-27 18:01:04.746575+00:00,35510,6263,19860413_2537,0 diff --git a/test_data/credit_history_mater.parquet b/test_data/credit_history_mater.parquet index 4da87a096e5f588821c3e6f9f9ab1681a01e6464..142bca967e2e40310320859c1475ef8f53e8c4af 100644 GIT binary patch delta 378 zcmcb^exIEsz%j^h;$Au72F^1Mj2t=)J9axTfWY#N4wDZuDsv^NaWIIoiDro=P2`et zHZ?XdGPI00H8i*66x6UVGcYkUjyEzjHV5)djSMUd&EhT0jm<@QBvl#MBqi8W@{{6= zi}OU8#Ms29F^csus_mG#TS3Sm{UOM*2Rk1?EUVZ)@vVkI6ElOPj2MS#k{Hl_AZrJs zXdfe#-NXzL7G;t2@n9*+&o2>Wn;gj~BgCW11h+<&!AFLHVRA8}3S-vf1B@yrAjgaC zU=bBzP&>vXgU?lr3}R#a&>?9+vv}9wy(#h=1X_NOdi*w9k JVqgFTH~`G8SVsT= delta 404 zcmcc5euv#7z%j^Blua~CG$sZ}F@Q0PD1#sa12em^11|#ugQ2B`nSqI+alDbKu{o!p zf~k>#g`ruzg}JDpim`!_p=G?Op}FPcbBq$E9gJ!mAR|mfO)yQ9U|?WmfstK@ zVfjV}pwSEsoM#*+&t+6*PEb2FaleAlgPji?7&&wpDz<~A4bmS@{HWpbi-|!}MvOzW zgHa5q2gp3dD4GCe{$dhM0x?BdBz-(sit_VIMA;Zbc~qGs*i-V8;){#(R2h6^7#Jpd zGRiP>sWMIe%P7vpAjT#sBWa^HfpPL#MoB)fv&A;Bh>9_&tzw!yhf#$W=pIG}v14GN zJCnaKY6<}zA$AAE7rVwJ#-_$0c51RdQxaqTOSl9pn diff --git a/test_data/data/csv.csv b/test_data/data/csv.csv new file mode 100644 index 00000000..34acb5d7 --- /dev/null +++ b/test_data/data/csv.csv @@ -0,0 +1,4 @@ +id,other,et,timestamp +1,foo,1711364948362731,1711364948362744 +2,bar,1711278548362741,1711451348362744 +3,baz,1711192148362743,1711537748362745 diff --git a/test_data/data/csv_iso.csv b/test_data/data/csv_iso.csv new file mode 100644 index 00000000..a0c92be3 --- /dev/null +++ b/test_data/data/csv_iso.csv @@ -0,0 +1,4 @@ +id,other,et,timestamp +1,foo,2024-03-25T12:03:30.013947+UTC,2024-03-25T12:03:30.013952+UTC +2,bar,2024-03-24T12:03:30.013949+UTC,2024-03-26T12:03:30.013952+UTC +3,baz,2024-03-23T12:03:30.013951+UTC,2024-03-27T12:03:30.013953+UTC diff --git a/test_data/data/csv_unix.csv b/test_data/data/csv_unix.csv new file mode 100644 index 00000000..39374a14 --- /dev/null +++ b/test_data/data/csv_unix.csv @@ -0,0 +1,4 @@ +id,other,et,timestamp +1,foo,1711368210013947,1711368210013952 +2,bar,1711281810013949,1711454610013952 +3,baz,1711195410013951,1711541010013953 diff --git a/test_data/data/parquet.parquet b/test_data/data/parquet.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5612096f10ba670262d63b272c9dd73381c241c3 GIT binary patch literal 1077 zcmb_czi-n}5PtpmW1lzKvl zfr+6510yOEd;bGAMg;!@EG&qC7`S^jPAZj7JjL(3_wMey@6M53@UL)#xA@gA0+Wx< z*+s^*3pHI~Gs;bsWlPJE>t=%)W=-7_fy?W8xsbPyxI&yem^3GE(2Hvu7CaIo5n##R-&|iyLuWCOMORadZ?`BCMFH5d zluhz5=`NK(^mDy;CmJgXUQ+z*B$lm4Z}kA22Zfr}rEKYFU;-v9sr literal 0 HcmV?d00001 diff --git a/test_data/data/parquet_iso.parquet b/test_data/data/parquet_iso.parquet new file mode 100644 index 0000000000000000000000000000000000000000..495950211a499ab6ecaac4513d27a085d312010c GIT binary patch literal 1137 zcmbtUPiq=c5P!R_TLT_i@I6+L90d0gVszE`hvqPAjR{T3YE1$qgc|?EP{p82(|9U5 zc`TuyAmmhX%BkO=*Pi+zdg><#oq4XtVkne8+&A-Pe)F5(o0WB-n&di9bL$WdlaI#P zGsb*R3{7E6%5#-Xmd~?X3&)rqHq_H5SA{O#iX$^XmGQ?&xI=aAexJ;7eK0X29-NT) z`);Q-kUVdY=Sm@>?+U+E7WK5p%gA9Rtn_LkwQ6N*Nvn`Z6;g?KBALpkvnzYG;&MF7 zv~+Df%2?jUErnhvlcdzNrP^|3zmKZ8Q_LIH>*)S#!;Cd;$ks2hRkjdiPgC=qmfX`` zYv|T{XR_hFBCZv4tg&!J1o)A_%b<6q%-kE5-zi>rtnq8<h3{d*yyx+b>r5zMr+it0ic8r zG)3KWTYt9o{Pus>>+H{Axoc4dA@E{XuKCAGbYE*8C}IgD|l^d@{Gsb~JabiKdaw}#5{R8A;dCvD{NAkQ(3;WA#&ZSGQ%>}-H5Bg5ckD~Q=-f(XOVE9@;XUEuo?Dq$apy? znMl%|xG#C`AI92^jBNyreks1K+h%5>K@X4Ztiz5Uj4x4z_mUUTD74rfAg~ zE&w#~fyP5TKWdSQXL)rKU;qmMh%LD6qyON5{L28_!#Ik+u|nk3F3H2B+f)KE80x)6A*i>8NmG(O?-Nt^NDm^hxZMbMssB*w8rOdpUFYbRdaxvh sPyuhI7J6%kX#Mwj$XDQX_Wl@%D^Tm!I&t@B*r0DAeNyvXEFz)8fOoaJ_wjwOg3kn&+h8z=;)N;IQcGPh-x~J k?VgkE=xA%pAi)-ppOaWrEU6>IzyL%H3mF+00vv-30naEUMF0Q* delta 231 zcmcb{a*1Vv95)LCCnJLhC%Xd!*TiC_i9RwA5oVAG<3vs6`dN$&k}_f(q8VZg3?SAl zMhH`sMbgHDr6@nYM3jX=ltqc}uK01?AnMh1oe#~?!hg;yuM diff --git a/test_data/titanic-sets.json b/test_data/titanic-sets.json index 2d86894e..6dec8ec0 100644 --- a/test_data/titanic-sets.json +++ b/test_data/titanic-sets.json @@ -1 +1 @@ -{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 100.0}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}, {"name": "lower_bound_inc", "value": 0.0}]}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "yyyy-MM-ddTHH:mm:ssZ", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} +{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}]}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []} diff --git a/test_data/titanic-test.csv b/test_data/titanic-test.csv index 634a02a1..38aaa8df 100644 --- a/test_data/titanic-test.csv +++ b/test_data/titanic-test.csv @@ -1,21 +1,21 @@ -cabin,age,sibsp,name,is_mr,has_siblings,is_female,is_male,sex,survived,passenger_id -,22.0,0,"Sirayanian, Mr. Orsen",True,False,False,True,male,False,61 -B28,38.0,0,"Icard, Miss. Amelie",False,False,True,False,female,True,62 -C83,45.0,1,"Harris, Mr. Henry Birkhardt",True,True,False,True,male,False,63 -,4.0,3,"Skoog, Master. Harald",False,True,False,True,male,False,64 -,,0,"Stewart, Mr. Albert A",True,False,False,True,male,False,65 -,,1,"Moubarek, Master. Gerios",False,True,False,True,male,True,66 -F33,29.0,0,"Nye, Mrs. (Elizabeth Ramell)",True,False,True,False,female,True,67 -,19.0,0,"Crease, Mr. Ernest James",True,False,False,True,male,False,68 -,17.0,4,"Andersson, Miss. Erna Alexandra",False,True,True,False,female,True,69 -,26.0,2,"Kink, Mr. Vincenz",True,True,False,True,male,False,70 -,32.0,0,"Jenkin, Mr. Stephen Curnow",True,False,False,True,male,False,71 -,16.0,5,"Goodwin, Miss. Lillian Amy",False,True,True,False,female,False,72 -,21.0,0,"Hood, Mr. Ambrose Jr",True,False,False,True,male,False,73 -,26.0,1,"Chronopoulos, Mr. Apostolos",True,True,False,True,male,False,74 -,32.0,0,"Bing, Mr. Lee",True,False,False,True,male,True,75 -F G73,25.0,0,"Moen, Mr. Sigurd Hansen",True,False,False,True,male,False,76 -,,0,"Staneff, Mr. Ivan",True,False,False,True,male,False,77 -,,0,"Moutal, Mr. Rahamin Haim",True,False,False,True,male,False,78 -,0.83,0,"Caldwell, Master. Alden Gates",False,False,False,True,male,True,79 -,30.0,0,"Dowdell, Miss. Elizabeth",False,False,True,False,female,True,80 +age,survived,has_siblings,passenger_id,is_mr,cabin,is_male,is_female,name,sibsp,sex +22.0,False,False,61,True,,True,False,"Sirayanian, Mr. Orsen",0,male +38.0,True,False,62,False,B28,False,True,"Icard, Miss. Amelie",0,female +45.0,False,True,63,True,C83,True,False,"Harris, Mr. Henry Birkhardt",1,male +4.0,False,True,64,False,,True,False,"Skoog, Master. Harald",3,male +,False,False,65,True,,True,False,"Stewart, Mr. Albert A",0,male +,True,True,66,False,,True,False,"Moubarek, Master. Gerios",1,male +29.0,True,False,67,True,F33,False,True,"Nye, Mrs. (Elizabeth Ramell)",0,female +19.0,False,False,68,True,,True,False,"Crease, Mr. Ernest James",0,male +17.0,True,True,69,False,,False,True,"Andersson, Miss. Erna Alexandra",4,female +26.0,False,True,70,True,,True,False,"Kink, Mr. Vincenz",2,male +32.0,False,False,71,True,,True,False,"Jenkin, Mr. Stephen Curnow",0,male +16.0,False,True,72,False,,False,True,"Goodwin, Miss. Lillian Amy",5,female +21.0,False,False,73,True,,True,False,"Hood, Mr. Ambrose Jr",0,male +26.0,False,True,74,True,,True,False,"Chronopoulos, Mr. Apostolos",1,male +32.0,True,False,75,True,,True,False,"Bing, Mr. Lee",0,male +25.0,False,False,76,True,F G73,True,False,"Moen, Mr. Sigurd Hansen",0,male +,False,False,77,True,,True,False,"Staneff, Mr. Ivan",0,male +,False,False,78,True,,True,False,"Moutal, Mr. Rahamin Haim",0,male +0.83,True,False,79,False,,True,False,"Caldwell, Master. Alden Gates",0,male +30.0,True,False,80,False,,False,True,"Dowdell, Miss. Elizabeth",0,female diff --git a/test_data/titanic-train.csv b/test_data/titanic-train.csv index bc08c87a..e05f4099 100644 --- a/test_data/titanic-train.csv +++ b/test_data/titanic-train.csv @@ -1,61 +1,61 @@ -cabin,age,sibsp,name,is_mr,has_siblings,is_female,is_male,sex,survived,passenger_id -,22.0,1,"Braund, Mr. Owen Harris",True,True,False,True,male,False,1 -C85,38.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",True,True,True,False,female,True,2 -,26.0,0,"Heikkinen, Miss. Laina",False,False,True,False,female,True,3 -C123,35.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,True,True,False,female,True,4 -,35.0,0,"Allen, Mr. William Henry",True,False,False,True,male,False,5 -,,0,"Moran, Mr. James",True,False,False,True,male,False,6 -E46,54.0,0,"McCarthy, Mr. Timothy J",True,False,False,False,other,False,7 -,2.0,3,"Palsson, Master. Gosta Leonard",False,True,False,True,male,False,8 -,27.0,0,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",True,False,True,False,female,True,9 -,14.0,1,"Nasser, Mrs. Nicholas (Adele Achem)",True,True,True,False,female,True,10 -G6,4.0,1,"Sandstrom, Miss. Marguerite Rut",False,True,True,False,female,True,11 -C103,58.0,0,"Bonnell, Miss. Elizabeth",False,False,True,False,female,True,12 -,20.0,0,"Saundercock, Mr. William Henry",True,False,False,True,male,False,13 -,39.0,1,"Andersson, Mr. Anders Johan",True,True,False,True,male,False,14 -,14.0,0,"Vestrom, Miss. Hulda Amanda Adolfina",False,False,True,False,female,False,15 -,55.0,0,"Hewlett, Mrs. (Mary D Kingcome) ",True,False,True,False,female,True,16 -,2.0,4,"Rice, Master. Eugene",False,True,False,True,male,False,17 -,,0,"Williams, Mr. Charles Eugene",True,False,False,True,male,True,18 -,31.0,1,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",True,True,True,False,female,False,19 -,,0,"Masselmani, Mrs. Fatima",True,False,True,False,female,True,20 -,35.0,0,"Fynney, Mr. Joseph J",True,False,False,True,male,False,21 -D56,34.0,0,"Beesley, Mr. Lawrence",True,False,False,True,male,True,22 -,15.0,0,"McGowan, Miss. Anna ""Annie""",False,False,True,False,female,True,23 -A6,28.0,0,"Sloper, Mr. William Thompson",True,False,False,True,male,True,24 -,8.0,3,"Palsson, Miss. Torborg Danira",False,True,True,False,female,False,25 -,38.0,1,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",True,True,True,False,female,True,26 -,,0,"Emir, Mr. Farred Chehab",True,False,False,True,male,False,27 -C23 C25 C27,19.0,3,"Fortune, Mr. Charles Alexander",True,True,False,True,male,False,28 -,,0,"O'Dwyer, Miss. Ellen ""Nellie""",False,False,True,False,female,True,29 -,,0,"Todoroff, Mr. Lalio",True,False,False,True,male,False,30 -,40.0,0,"Uruchurtu, Don. Manuel E",False,False,False,True,male,False,31 -B78,,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",True,True,True,False,female,True,32 -,,0,"Glynn, Miss. Mary Agatha",False,False,True,False,female,True,33 -,66.0,0,"Wheadon, Mr. Edward H",True,False,False,True,male,False,34 -,28.0,1,"Meyer, Mr. Edgar Joseph",True,True,False,True,male,False,35 -,42.0,1,"Holverson, Mr. Alexander Oskar",True,True,False,True,male,False,36 -,,0,"Mamee, Mr. Hanna",True,False,False,True,male,True,37 -,21.0,0,"Cann, Mr. Ernest Charles",True,False,False,True,male,False,38 -,18.0,2,"Vander Planke, Miss. Augusta Maria",False,True,True,False,female,False,39 -,14.0,1,"Nicola-Yarred, Miss. Jamila",False,True,True,False,female,True,40 -,40.0,1,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",True,True,True,False,female,False,41 -,27.0,1,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",True,True,True,False,female,False,42 -,,0,"Kraeff, Mr. Theodor",True,False,False,True,male,False,43 -,3.0,1,"Laroche, Miss. Simonne Marie Anne Andree",False,True,True,False,female,True,44 -,19.0,0,"Devaney, Miss. Margaret Delia",False,False,True,False,female,True,45 -,,0,"Rogers, Mr. William John",True,False,False,True,male,False,46 -,,1,"Lennon, Mr. Denis",True,True,False,True,male,False,47 -,,0,"O'Driscoll, Miss. Bridget",False,False,True,False,female,True,48 -,,2,"Samaan, Mr. Youssef",True,True,False,True,male,False,49 -,18.0,1,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",True,True,True,False,female,False,50 -,7.0,4,"Panula, Master. Juha Niilo",False,True,False,True,male,False,51 -,21.0,0,"Nosworthy, Mr. Richard Cater",True,False,False,True,male,False,52 -D33,49.0,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",True,True,True,False,female,True,53 -,29.0,1,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",True,True,True,False,female,True,54 -B30,65.0,0,"Ostby, Mr. Engelhart Cornelius",True,False,False,True,male,False,55 -C52,,0,"Woolner, Mr. Hugh",True,False,False,True,male,True,56 -,21.0,0,"Rugg, Miss. Emily",False,False,True,False,female,True,57 -,28.5,0,"Novel, Mr. Mansouer",True,False,False,True,male,False,58 -,5.0,1,"West, Miss. Constance Mirium",False,True,True,False,female,True,59 -,11.0,5,"Goodwin, Master. William Frederick",False,True,False,True,male,False,60 +age,survived,has_siblings,passenger_id,is_mr,cabin,is_male,is_female,name,sibsp,sex +22.0,False,True,1,True,,True,False,"Braund, Mr. Owen Harris",1,male +38.0,True,True,2,True,C85,False,True,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",1,female +26.0,True,False,3,False,,False,True,"Heikkinen, Miss. Laina",0,female +35.0,True,True,4,True,C123,False,True,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female +35.0,False,False,5,True,,True,False,"Allen, Mr. William Henry",0,male +,False,False,6,True,,True,False,"Moran, Mr. James",0,male +54.0,False,False,7,True,E46,False,False,"McCarthy, Mr. Timothy J",0,other +2.0,False,True,8,False,,True,False,"Palsson, Master. Gosta Leonard",3,male +27.0,True,False,9,True,,False,True,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,female +14.0,True,True,10,True,,False,True,"Nasser, Mrs. Nicholas (Adele Achem)",1,female +4.0,True,True,11,False,G6,False,True,"Sandstrom, Miss. Marguerite Rut",1,female +58.0,True,False,12,False,C103,False,True,"Bonnell, Miss. Elizabeth",0,female +20.0,False,False,13,True,,True,False,"Saundercock, Mr. William Henry",0,male +39.0,False,True,14,True,,True,False,"Andersson, Mr. Anders Johan",1,male +14.0,False,False,15,False,,False,True,"Vestrom, Miss. Hulda Amanda Adolfina",0,female +55.0,True,False,16,True,,False,True,"Hewlett, Mrs. (Mary D Kingcome) ",0,female +2.0,False,True,17,False,,True,False,"Rice, Master. Eugene",4,male +,True,False,18,True,,True,False,"Williams, Mr. Charles Eugene",0,male +31.0,False,True,19,True,,False,True,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",1,female +,True,False,20,True,,False,True,"Masselmani, Mrs. Fatima",0,female +35.0,False,False,21,True,,True,False,"Fynney, Mr. Joseph J",0,male +34.0,True,False,22,True,D56,True,False,"Beesley, Mr. Lawrence",0,male +15.0,True,False,23,False,,False,True,"McGowan, Miss. Anna ""Annie""",0,female +28.0,True,False,24,True,A6,True,False,"Sloper, Mr. William Thompson",0,male +8.0,False,True,25,False,,False,True,"Palsson, Miss. Torborg Danira",3,female +38.0,True,True,26,True,,False,True,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",1,female +,False,False,27,True,,True,False,"Emir, Mr. Farred Chehab",0,male +19.0,False,True,28,True,C23 C25 C27,True,False,"Fortune, Mr. Charles Alexander",3,male +,True,False,29,False,,False,True,"O'Dwyer, Miss. Ellen ""Nellie""",0,female +,False,False,30,True,,True,False,"Todoroff, Mr. Lalio",0,male +40.0,False,False,31,False,,True,False,"Uruchurtu, Don. Manuel E",0,male +,True,True,32,True,B78,False,True,"Spencer, Mrs. William Augustus (Marie Eugenie)",1,female +,True,False,33,False,,False,True,"Glynn, Miss. Mary Agatha",0,female +66.0,False,False,34,True,,True,False,"Wheadon, Mr. Edward H",0,male +28.0,False,True,35,True,,True,False,"Meyer, Mr. Edgar Joseph",1,male +42.0,False,True,36,True,,True,False,"Holverson, Mr. Alexander Oskar",1,male +,True,False,37,True,,True,False,"Mamee, Mr. Hanna",0,male +21.0,False,False,38,True,,True,False,"Cann, Mr. Ernest Charles",0,male +18.0,False,True,39,False,,False,True,"Vander Planke, Miss. Augusta Maria",2,female +14.0,True,True,40,False,,False,True,"Nicola-Yarred, Miss. Jamila",1,female +40.0,False,True,41,True,,False,True,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",1,female +27.0,False,True,42,True,,False,True,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",1,female +,False,False,43,True,,True,False,"Kraeff, Mr. Theodor",0,male +3.0,True,True,44,False,,False,True,"Laroche, Miss. Simonne Marie Anne Andree",1,female +19.0,True,False,45,False,,False,True,"Devaney, Miss. Margaret Delia",0,female +,False,False,46,True,,True,False,"Rogers, Mr. William John",0,male +,False,True,47,True,,True,False,"Lennon, Mr. Denis",1,male +,True,False,48,False,,False,True,"O'Driscoll, Miss. Bridget",0,female +,False,True,49,True,,True,False,"Samaan, Mr. Youssef",2,male +18.0,False,True,50,True,,False,True,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",1,female +7.0,False,True,51,False,,True,False,"Panula, Master. Juha Niilo",4,male +21.0,False,False,52,True,,True,False,"Nosworthy, Mr. Richard Cater",0,male +49.0,True,True,53,True,D33,False,True,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",1,female +29.0,True,True,54,True,,False,True,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",1,female +65.0,False,False,55,True,B30,True,False,"Ostby, Mr. Engelhart Cornelius",0,male +,True,False,56,True,C52,True,False,"Woolner, Mr. Hugh",0,male +21.0,True,False,57,False,,False,True,"Rugg, Miss. Emily",0,female +28.5,False,False,58,True,,True,False,"Novel, Mr. Mansouer",0,male +5.0,True,True,59,False,,False,True,"West, Miss. Constance Mirium",1,female +11.0,False,True,60,False,,True,False,"Goodwin, Master. William Frederick",5,male diff --git a/test_data/titanic-validate.csv b/test_data/titanic-validate.csv index 4d1b80c4..e845825f 100644 --- a/test_data/titanic-validate.csv +++ b/test_data/titanic-validate.csv @@ -1,21 +1,21 @@ -cabin,age,sibsp,name,is_mr,has_siblings,is_female,is_male,sex,survived,passenger_id -,22.0,0,"Waelens, Mr. Achille",True,False,False,True,male,False,81 -,29.0,0,"Sheerlinck, Mr. Jan Baptist",True,False,False,True,male,True,82 -,,0,"McDermott, Miss. Brigdet Delia",False,False,True,False,female,True,83 -,28.0,0,"Carrau, Mr. Francisco M",True,False,False,True,male,False,84 -,17.0,0,"Ilett, Miss. Bertha",False,False,True,False,female,True,85 -,33.0,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",True,True,True,False,female,True,86 -,16.0,1,"Ford, Mr. William Neal",True,True,False,True,male,False,87 -,,0,"Slocovski, Mr. Selman Francis",True,False,False,True,male,False,88 -C23 C25 C27,23.0,3,"Fortune, Miss. Mabel Helen",False,True,True,False,female,True,89 -,24.0,0,"Celotti, Mr. Francesco",True,False,False,True,male,False,90 -,29.0,0,"Christmann, Mr. Emil",True,False,False,True,male,False,91 -,20.0,0,"Andreasson, Mr. Paul Edvin",True,False,False,True,male,False,92 -E31,46.0,1,"Chaffee, Mr. Herbert Fuller",True,True,False,True,male,False,93 -,26.0,1,"Dean, Mr. Bertram Frank",True,True,False,True,male,False,94 -,59.0,0,"Coxon, Mr. Daniel",True,False,False,True,male,False,95 -,,0,"Shorney, Mr. Charles Joseph",True,False,False,True,male,False,96 -A5,71.0,0,"Goldschmidt, Mr. George B",True,False,False,True,male,False,97 -D10 D12,23.0,0,"Greenfield, Mr. William Bertram",True,False,False,True,male,True,98 -,34.0,0,"Doling, Mrs. John T (Ada Julia Bone)",True,False,True,False,female,True,99 -,34.0,1,"Kantor, Mr. Sinai",True,True,False,True,male,False,100 +age,survived,has_siblings,passenger_id,is_mr,cabin,is_male,is_female,name,sibsp,sex +22.0,False,False,81,True,,True,False,"Waelens, Mr. Achille",0,male +29.0,True,False,82,True,,True,False,"Sheerlinck, Mr. Jan Baptist",0,male +,True,False,83,False,,False,True,"McDermott, Miss. Brigdet Delia",0,female +28.0,False,False,84,True,,True,False,"Carrau, Mr. Francisco M",0,male +17.0,True,False,85,False,,False,True,"Ilett, Miss. Bertha",0,female +33.0,True,True,86,True,,False,True,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",3,female +16.0,False,True,87,True,,True,False,"Ford, Mr. William Neal",1,male +,False,False,88,True,,True,False,"Slocovski, Mr. Selman Francis",0,male +23.0,True,True,89,False,C23 C25 C27,False,True,"Fortune, Miss. Mabel Helen",3,female +24.0,False,False,90,True,,True,False,"Celotti, Mr. Francesco",0,male +29.0,False,False,91,True,,True,False,"Christmann, Mr. Emil",0,male +20.0,False,False,92,True,,True,False,"Andreasson, Mr. Paul Edvin",0,male +46.0,False,True,93,True,E31,True,False,"Chaffee, Mr. Herbert Fuller",1,male +26.0,False,True,94,True,,True,False,"Dean, Mr. Bertram Frank",1,male +59.0,False,False,95,True,,True,False,"Coxon, Mr. Daniel",0,male +,False,False,96,True,,True,False,"Shorney, Mr. Charles Joseph",0,male +71.0,False,False,97,True,A5,True,False,"Goldschmidt, Mr. George B",0,male +23.0,True,False,98,True,D10 D12,True,False,"Greenfield, Mr. William Bertram",0,male +34.0,True,False,99,True,,False,True,"Doling, Mrs. John T (Ada Julia Bone)",0,female +34.0,False,True,100,True,,True,False,"Kantor, Mr. Sinai",1,male