Skip to content

Commit

Permalink
Fixed bug for azure delta source
Browse files Browse the repository at this point in the history
  • Loading branch information
MatsMoll committed Mar 26, 2024
1 parent bd59831 commit ada1578
Show file tree
Hide file tree
Showing 16 changed files with 161 additions and 146 deletions.
1 change: 1 addition & 0 deletions aligned/compiler/feature_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,7 @@ def event_timestamp(self) -> EventTimestampFeature:
name=self.name,
ttl=int(self.ttl.total_seconds()) if self.ttl else None,
description=self._description,
dtype=self.dtype,
)


Expand Down
52 changes: 33 additions & 19 deletions aligned/local/job.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass, field

from pytz import timezone
from datetime import datetime

import pandas as pd
Expand Down Expand Up @@ -127,37 +128,41 @@ async def aggregate(request: RetrivalRequest, core_data: pl.LazyFrame) -> pl.Laz

def decode_timestamps(df: pl.LazyFrame, request: RetrivalRequest, formatter: DateFormatter) -> pl.LazyFrame:

columns: set[tuple[str, str | None]] = set()
decode_columns: dict[str, str | None] = {}
check_timezone_columns: dict[str, str | None] = {}

dtypes = dict(zip(df.columns, df.dtypes))

for feature in request.all_features:
if (
feature.dtype.is_datetime
and feature.name in df.columns
and not isinstance(dtypes[feature.name], pl.Datetime)
):
columns.add((feature.name, feature.dtype.datetime_timezone))
all_features = request.all_features

if request.event_timestamp:
all_features.add(request.event_timestamp.as_feature())

if (
request.event_timestamp
and request.event_timestamp.name in df.columns
and not isinstance(dtypes[request.event_timestamp.name], pl.Datetime)
):
columns.add((request.event_timestamp.name, request.event_timestamp.dtype.datetime_timezone))
for feature in all_features:
if feature.dtype.is_datetime and feature.name in df.columns:

if not columns:
return df
if not isinstance(dtypes[feature.name], pl.Datetime):
decode_columns[feature.name] = feature.dtype.datetime_timezone
else:
check_timezone_columns[feature.name] = feature.dtype.datetime_timezone

exprs = []

for column, time_zone in columns:
for column, time_zone in decode_columns.items():
logger.info(f'Decoding column {column} using {formatter} with timezone {time_zone}')

if time_zone is None:
exprs.append(formatter.decode_polars(column).alias(column))
exprs.append(formatter.decode_polars(column).dt.replace_time_zone(None).alias(column))
else:
exprs.append(formatter.decode_polars(column).dt.convert_time_zone(time_zone).alias(column))

for column, time_zone in check_timezone_columns.items():
logger.info(f'Checking timezone for column {column} with timezone {time_zone}')
if time_zone is None:
exprs.append(pl.col(column).dt.replace_time_zone(None).alias(column))
else:
exprs.append(pl.col(column).dt.convert_time_zone(time_zone).alias(column))

return df.with_columns(exprs)


Expand Down Expand Up @@ -285,7 +290,16 @@ def file_transform_polars(self, df: pl.LazyFrame) -> pl.LazyFrame:
event_timestamp_column = self.request.event_timestamp.name
df = decode_timestamps(df, self.request, self.date_formatter)

return df.filter(pl.col(event_timestamp_column).is_between(self.start_date, self.end_date))
time_zone = self.request.event_timestamp.dtype.datetime_timezone
if time_zone is None:
start_date = self.start_date.replace(tzinfo=None)
end_date = self.end_date.replace(tzinfo=None)
else:
tz = timezone(time_zone)
start_date = tz.localize(self.start_date)
end_date = tz.localize(self.end_date)

return df.filter(pl.col(event_timestamp_column).is_between(start_date, end_date))

async def to_pandas(self) -> pd.DataFrame:
return (await self.to_lazy_polars()).collect().to_pandas()
Expand Down
4 changes: 2 additions & 2 deletions aligned/schemas/feature.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import Literal
from zoneinfo import ZoneInfo

Expand Down Expand Up @@ -315,7 +315,7 @@ class EventTimestamp(Codable):
ttl: int | None = None
description: str | None = None
tags: dict[str, str] | None = None
dtype: FeatureType = FeatureType.datetime()
dtype: FeatureType = field(default_factory=lambda: FeatureType.datetime())

def __hash__(self) -> int:
return hash(self.name)
Expand Down
14 changes: 7 additions & 7 deletions test_data/credit_history.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
due_sum,event_timestamp,student_loan_due,credit_card_due,dob_ssn,bankruptcies
30747,2020-04-26 18:01:04.746575+00:00,22328,8419,19530219_5179,0
5459,2020-04-26 18:01:04.746575+00:00,2515,2944,19520816_8737,0
33833,2020-04-26 18:01:04.746575+00:00,33000,833,19860413_2537,0
54891,2020-04-27 18:01:04.746575+00:00,48955,5936,19530219_5179,0
11076,2020-04-27 18:01:04.746575+00:00,9501,1575,19520816_8737,0
41773,2020-04-27 18:01:04.746575+00:00,35510,6263,19860413_2537,0
credit_card_due,due_sum,bankruptcies,event_timestamp,dob_ssn,student_loan_due
8419,30747,0,2020-04-26 18:01:04.746575+00:00,19530219_5179,22328
2944,5459,0,2020-04-26 18:01:04.746575+00:00,19520816_8737,2515
833,33833,0,2020-04-26 18:01:04.746575+00:00,19860413_2537,33000
5936,54891,0,2020-04-27 18:01:04.746575+00:00,19530219_5179,48955
1575,11076,0,2020-04-27 18:01:04.746575+00:00,19520816_8737,9501
6263,41773,0,2020-04-27 18:01:04.746575+00:00,19860413_2537,35510
Binary file modified test_data/credit_history_mater.parquet
Binary file not shown.
6 changes: 3 additions & 3 deletions test_data/data/csv_iso.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id,other,et,timestamp
1,foo,2024-03-25T12:03:30.013947+UTC,2024-03-25T12:03:30.013952+UTC
2,bar,2024-03-24T12:03:30.013949+UTC,2024-03-26T12:03:30.013952+UTC
3,baz,2024-03-23T12:03:30.013951+UTC,2024-03-27T12:03:30.013953+UTC
1,foo,2024-03-26T07:48:28.878809+UTC,2024-03-26T07:48:28.878814+UTC
2,bar,2024-03-25T07:48:28.878811+UTC,2024-03-27T07:48:28.878814+UTC
3,baz,2024-03-24T07:48:28.878813+UTC,2024-03-28T07:48:28.878815+UTC
6 changes: 3 additions & 3 deletions test_data/data/csv_unix.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id,other,et,timestamp
1,foo,1711368210013947,1711368210013952
2,bar,1711281810013949,1711454610013952
3,baz,1711195410013951,1711541010013953
1,foo,1711439308878809,1711439308878814
2,bar,1711352908878811,1711525708878814
3,baz,1711266508878813,1711612108878815
Binary file modified test_data/data/parquet_iso.parquet
Binary file not shown.
Binary file modified test_data/data/parquet_unix.parquet
Binary file not shown.
2 changes: 1 addition & 1 deletion test_data/feature-store.json

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions test_data/loan.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
event_timestamp,personal_income,loan_amount,loan_status,loan_id
2020-04-26 18:01:04.746575+00:00,59000,35000,True,10000
2020-04-26 18:01:04.746575+00:00,9600,1000,False,10001
2020-04-26 18:01:04.746575+00:00,9600,5500,True,10002
2020-04-27 18:01:04.746575+00:00,65500,35000,True,10000
2020-04-27 18:01:04.746575+00:00,54400,35000,True,10001
2020-04-27 18:01:04.746575+00:00,9900,2500,True,10002
personal_income,loan_amount,event_timestamp,loan_id,loan_status
59000,35000,2020-04-26 18:01:04.746575+00:00,10000,True
9600,1000,2020-04-26 18:01:04.746575+00:00,10001,False
9600,5500,2020-04-26 18:01:04.746575+00:00,10002,True
65500,35000,2020-04-27 18:01:04.746575+00:00,10000,True
54400,35000,2020-04-27 18:01:04.746575+00:00,10001,True
9900,2500,2020-04-27 18:01:04.746575+00:00,10002,True
Binary file modified test_data/test_model.parquet
Binary file not shown.
2 changes: 1 addition & 1 deletion test_data/titanic-sets.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}]}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}, {"name": "in_domain", "values": ["male", "female"]}]}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []}
{"raw_data": [], "train_test": [], "train_test_validation": [{"id": "titanic_test", "name": null, "request_result": {"entities": [{"name": "passenger_id", "dtype": {"name": "int32"}, "description": null, "tags": null, "constraints": null}], "features": [{"name": "sex", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "in_domain", "values": ["male", "female"]}, {"name": "optional"}]}, {"name": "is_mr", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "name", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "contains", "dtype": {"name": "bool"}, "key": "name", "value": "Mr."}, "depth": 1}, {"name": "name", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "age", "dtype": {"name": "float"}, "description": "A float as some have decimals", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 100.0}]}, {"name": "cabin", "dtype": {"name": "string"}, "description": null, "tags": null, "constraints": [{"name": "optional"}]}, {"name": "sibsp", "dtype": {"name": "int32"}, "description": "Number of siblings on titanic", "tags": null, "constraints": [{"name": "lower_bound_inc", "value": 0.0}, {"name": "upper_bound_inc", "value": 20.0}, {"name": "optional"}]}, {"name": "is_male", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "male"}}, "depth": 1}, {"name": "is_female", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sex", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "string"}}], "transformation": {"name": "equals", "dtype": {"name": "bool"}, "key": "sex", "value": {"name": "string", "value": "female"}}, "depth": 1}, {"name": "has_siblings", "dtype": {"name": "bool"}, "description": null, "tags": null, "constraints": null, "depending_on": [{"name": "sibsp", "location": {"name": "titanic", "location": "feature_view"}, "dtype": {"name": "int32"}}], "transformation": {"name": "not-equals", "dtype": {"name": "bool"}, "key": "sibsp", "value": {"name": "int", "value": 0}}, "depth": 1}, {"name": "survived", "dtype": {"name": "bool"}, "description": "If the passenger survived", "tags": null, "constraints": null}], "event_timestamp": null}, "train_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-train.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "test_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-test.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "validation_dataset": {"mapping_keys": {}, "type_name": "csv", "path": "test_data/titanic-validate.csv", "csv_config": {"seperator": ",", "compression": "infer", "should_write_index": false}, "formatter": {"date_format": "%Y-%m-%dT%H:%M:%S%.f+%Z", "time_unit": null, "time_zone": null, "name": "string_form"}}, "train_size_fraction": 0.6, "test_size_fraction": 0.20000000000000007, "validate_size_fraction": 0.19999999999999996, "target": ["survived"], "description": null, "tags": null}], "active_learning": []}
42 changes: 21 additions & 21 deletions test_data/titanic-test.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
age,survived,has_siblings,passenger_id,is_mr,cabin,is_male,is_female,name,sibsp,sex
22.0,False,False,61,True,,True,False,"Sirayanian, Mr. Orsen",0,male
38.0,True,False,62,False,B28,False,True,"Icard, Miss. Amelie",0,female
45.0,False,True,63,True,C83,True,False,"Harris, Mr. Henry Birkhardt",1,male
4.0,False,True,64,False,,True,False,"Skoog, Master. Harald",3,male
,False,False,65,True,,True,False,"Stewart, Mr. Albert A",0,male
,True,True,66,False,,True,False,"Moubarek, Master. Gerios",1,male
29.0,True,False,67,True,F33,False,True,"Nye, Mrs. (Elizabeth Ramell)",0,female
19.0,False,False,68,True,,True,False,"Crease, Mr. Ernest James",0,male
17.0,True,True,69,False,,False,True,"Andersson, Miss. Erna Alexandra",4,female
26.0,False,True,70,True,,True,False,"Kink, Mr. Vincenz",2,male
32.0,False,False,71,True,,True,False,"Jenkin, Mr. Stephen Curnow",0,male
16.0,False,True,72,False,,False,True,"Goodwin, Miss. Lillian Amy",5,female
21.0,False,False,73,True,,True,False,"Hood, Mr. Ambrose Jr",0,male
26.0,False,True,74,True,,True,False,"Chronopoulos, Mr. Apostolos",1,male
32.0,True,False,75,True,,True,False,"Bing, Mr. Lee",0,male
25.0,False,False,76,True,F G73,True,False,"Moen, Mr. Sigurd Hansen",0,male
,False,False,77,True,,True,False,"Staneff, Mr. Ivan",0,male
,False,False,78,True,,True,False,"Moutal, Mr. Rahamin Haim",0,male
0.83,True,False,79,False,,True,False,"Caldwell, Master. Alden Gates",0,male
30.0,True,False,80,False,,False,True,"Dowdell, Miss. Elizabeth",0,female
sex,is_mr,name,age,cabin,sibsp,is_male,is_female,has_siblings,survived,passenger_id
male,True,"Sirayanian, Mr. Orsen",22.0,,0,True,False,False,False,61
female,False,"Icard, Miss. Amelie",38.0,B28,0,False,True,False,True,62
male,True,"Harris, Mr. Henry Birkhardt",45.0,C83,1,True,False,True,False,63
male,False,"Skoog, Master. Harald",4.0,,3,True,False,True,False,64
male,True,"Stewart, Mr. Albert A",,,0,True,False,False,False,65
male,False,"Moubarek, Master. Gerios",,,1,True,False,True,True,66
female,True,"Nye, Mrs. (Elizabeth Ramell)",29.0,F33,0,False,True,False,True,67
male,True,"Crease, Mr. Ernest James",19.0,,0,True,False,False,False,68
female,False,"Andersson, Miss. Erna Alexandra",17.0,,4,False,True,True,True,69
male,True,"Kink, Mr. Vincenz",26.0,,2,True,False,True,False,70
male,True,"Jenkin, Mr. Stephen Curnow",32.0,,0,True,False,False,False,71
female,False,"Goodwin, Miss. Lillian Amy",16.0,,5,False,True,True,False,72
male,True,"Hood, Mr. Ambrose Jr",21.0,,0,True,False,False,False,73
male,True,"Chronopoulos, Mr. Apostolos",26.0,,1,True,False,True,False,74
male,True,"Bing, Mr. Lee",32.0,,0,True,False,False,True,75
male,True,"Moen, Mr. Sigurd Hansen",25.0,F G73,0,True,False,False,False,76
male,True,"Staneff, Mr. Ivan",,,0,True,False,False,False,77
male,True,"Moutal, Mr. Rahamin Haim",,,0,True,False,False,False,78
male,False,"Caldwell, Master. Alden Gates",0.83,,0,True,False,False,True,79
female,False,"Dowdell, Miss. Elizabeth",30.0,,0,False,True,False,True,80
Loading

0 comments on commit ada1578

Please sign in to comment.