"""fetch alpaca data from api"""
import pandas as pd
from alpaca_trade_api.rest import REST, TimeFrame, TimeFrameUnit
from datetime import date
from datetime import datetime
from dateutil.relativedelta import relativedelta
from datetime import timedelta
from requests import get
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
# build the date window: end 250 days before today, start `years` years earlier
years = 25
end = (date.today() - timedelta(days=250)).strftime("%Y-%m-%d")
date_1 = datetime.strptime(end, "%Y-%m-%d")
start_1 = date_1 - timedelta(days=365 * years)
start = start_1.strftime("%Y-%m-%d")

# prefer reading the Polygon key from the environment over hardcoding it in
# source; the original key is kept only as a fallback
POLYGON_API_KEY = os.environ.get('POLYGON_API_KEY', 'eTD2a0gOvakkPjBpyYBqRgiWY9CLJ0ot')

main_df = pd.DataFrame(data=[], columns=['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'])

def check_download(results: dict) -> bool:
    """Check that the first downloaded bar opens on a :00 or :30 minute boundary."""
    first_timestamp = results['results'][0]['t']  # Polygon reports epoch milliseconds
    minute = datetime.utcfromtimestamp(first_timestamp / 1000).strftime('%M')
    return minute in ('00', '30')
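
# A minimal sketch of the check, with a hypothetical payload: Polygon v2
# aggregates report each bar's start time in epoch milliseconds under 't',
# so a bar opening on a :00 or :30 boundary passes.
# >>> check_download({'results': [{'t': 1672756200000}]})  # 2023-01-03 14:30:00 UTC
# True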

def polygon_to_df(results: dict) -> pd.DataFrame:
    """Flatten the JSON aggregate results into an OHLCV dataframe."""
    container = []
    for result in results['results']:
        container.append([datetime.utcfromtimestamp(result['t'] / 1000).strftime('%Y-%m-%d %H:%M:%S'),
                          result['o'], result['h'], result['l'], result['c'], result['v']])
    df = pd.DataFrame(data=container, columns=['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'])
    return df
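
# Sketch of the conversion on a single hypothetical bar:
# >>> polygon_to_df({'results': [{'t': 1672756200000,
# ...                             'o': 380.0, 'h': 381.2, 'l': 379.5,
# ...                             'c': 380.9, 'v': 1000000}]})
# yields one row: ('2023-01-03 14:30:00', 380.0, 381.2, 379.5, 380.9, 1000000).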

# TO DO: another while loop to collect all data up to the end-date
def download_polygon(start=start, start_1=start_1, main_df=main_df,
                     interval: str = 'minute', interval_length: int = 90) -> pd.DataFrame:
    """Page through Polygon aggregate bars from `start` to `end`, stacking each batch."""
    counter = 0
    while start < end and counter < 100:
        download_correct = False
        print('Start Date: ', start)
        while not download_correct:
            url = (f'https://api.polygon.io/v2/aggs/ticker/SPY/range/{interval_length}/{interval}'
                   f'/{start}/{end}?adjusted=false&sort=asc&limit=50000&apiKey={POLYGON_API_KEY}')
            response = get(url)  # requests returns a Response object, not a dict
            print(response)
            payload = response.json()
            download_correct = check_download(payload)
            if not download_correct:
                # the first bar was off the :00/:30 grid; skip a day and retry
                start_1 = start_1 + timedelta(days=1)
                start = start_1.strftime("%Y-%m-%d")
        # DataFrame.append was removed in pandas 2.x; concat is the supported path
        main_df = pd.concat([main_df, polygon_to_df(payload)], ignore_index=True)
        # resume from the last day received and keep the datetime cursor in sync
        start = main_df.iloc[-1]['Timestamp'].split(' ')[0]
        start_1 = datetime.strptime(start, "%Y-%m-%d")
        counter += 1
    return main_df
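
# Hedged usage sketch: the defaults page through 90-minute SPY bars from the
# module-level `start` to `end`; pass interval='hour', interval_length=2 for
# coarser bars, as fetch_action does below.
# >>> stacked_df = download_polygon()               # 90-minute intraday bars
# >>> hourly_df = download_polygon(interval='hour', interval_length=2)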

# filter out pre- and after-market times to preserve only intraday entries
def get_hour_min(hour_str: str):
    """Split an HH:MM:SS string into integer hour and minute."""
    hour, minute, _sec = hour_str.split(':')
    return int(hour), int(minute)
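
# For example:
# >>> get_hour_min('09:30:00')
# (9, 30)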

# subdivide the downloaded dataframe into years
def subdivide_df(df: pd.DataFrame, years: int) -> pd.DataFrame:
    """Slice the stacked dataframe down to the last `years` years."""
    # split the timestamp into date and time-of-day features
    df['Year'] = df['Timestamp'].apply(lambda x: x.split(' ')[0])
    df['time'] = df['Timestamp'].apply(lambda x: x.split(' ')[1])
    # make the date column the index so we can label-slice by date range
    df.set_index('Year', inplace=True)
    # compute the date `years` years before today
    today = date.today()
    end = today.strftime("%Y-%m-%d")
    dt_obj = datetime.strptime(end, "%Y-%m-%d")
    past_date = dt_obj - relativedelta(years=years)
    # keep only rows between the past date and today (copy so later writes are safe)
    df = df[past_date.strftime("%Y-%m-%d"):end].copy()
    df['day'] = df.index
    df.reset_index(inplace=True)
    df.drop('Year', axis=1, inplace=True)
    return df
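
# Hedged usage sketch (assumes `stacked_df` came from download_polygon): keep
# only the last three years, exposing 'day' and 'time' helper columns.
# >>> recent = subdivide_df(stacked_df, years=3)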

def fetch_alpaca(years: int, stacked_df) -> None:
    """Filter the stacked dataframe to regular trading hours and save a yearly CSV."""
    df = subdivide_df(stacked_df, years)
    df['hour'] = df['time'].apply(lambda x: get_hour_min(x)[0])
    df['minute'] = df['time'].apply(lambda x: get_hour_min(x)[1])
    # keep only regular-session hours (copy so the in-place drops below are safe)
    sub_df = df[(df['hour'] >= 9) & (df['hour'] <= 16)].copy()
    # collect the bars before 09:30:00 and after 16:00:00
    sub9_df = sub_df[(sub_df['hour'] == 9) & (sub_df['minute'] < 30)]
    sub16_df = sub_df[(sub_df['hour'] == 16) & (sub_df['minute'] > 0)]
    # drop them from the big dataset
    sub_df.drop(sub9_df.index, axis=0, inplace=True)
    sub_df.drop(sub16_df.index, axis=0, inplace=True)
    df = sub_df
    df.reset_index(inplace=True)
    # drop the helper columns
    df.drop(['hour', 'minute'], axis=1, inplace=True)
    # drop days that do not carry a full session's worth of bars
    missing_days = []
    value_counts = df['day'].value_counts()
    for key, value in value_counts.items():
        if value != 14:
            missing_days.append(key)
    copy_data = df.loc[~df['day'].isin(missing_days)].copy()
    copy_data.drop('index', axis=1, inplace=True)
    # save as a csv file in the data directory
    copy_data.to_csv(f'data_yearly/{years}_years_alpaca.csv')

def fetch_alpaca_csv(years: int) -> pd.DataFrame:
    """fetch alpaca data for model training purposes"""
    return pd.read_csv(f'data_yearly/{years}_years_alpaca.csv')
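
# Usage sketch (assumes fetch_alpaca(5, ...) has already written the file):
# >>> df = fetch_alpaca_csv(5)   # reads data_yearly/5_years_alpaca.csv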

def filter_daily_data(df) -> pd.DataFrame:
    """Filter out pre- and after-market rows, keeping only regular-session data."""
    # derive hour and minute features from the time-of-day string
    df['hour'] = df['time'].apply(lambda x: get_hour_min(x)[0])
    df['minute'] = df['time'].apply(lambda x: get_hour_min(x)[1])
    # keep only regular-session hours (copy so the in-place drops below are safe)
    sub_df = df[(df['hour'] >= 9) & (df['hour'] <= 16)].copy()
    # collect the bars before 09:30:00 and after 16:00:00
    sub9_df = sub_df[(sub_df['hour'] == 9) & (sub_df['minute'] < 30)]
    sub16_df = sub_df[(sub_df['hour'] == 16) & (sub_df['minute'] > 0)]
    # drop them from the big dataset
    sub_df.drop(sub9_df.index, axis=0, inplace=True)
    sub_df.drop(sub16_df.index, axis=0, inplace=True)
    df = sub_df
    df.reset_index(inplace=True)
    # drop the helper columns
    df.drop(['hour', 'minute'], axis=1, inplace=True)
    # note: missing-day filtering is handled in fetch_alpaca, not here
    return df
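
# A hedged sketch: filter_daily_data expects a 'time' column in HH:MM:SS form,
# such as the one subdivide_df derives from 'Timestamp':
# >>> cleaned = filter_daily_data(subdivide_df(stacked_df, years=3))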

# download data
def fetch_action(intraday: bool = True) -> None:
    """Download the base dataframe and write the yearly or daily CSVs."""
    if intraday:
        print('Intraday Data: ')
        stacked_df = download_polygon()
        # cut one CSV per lookback window, from 3 to 17 years; pass a copy
        # each time because subdivide_df mutates its input in place
        for i in tqdm(range(3, 18)):
            fetch_alpaca(i, stacked_df.copy())
    else:
        stacked_df = download_polygon(interval='hour', interval_length=2)
        stacked_df.to_csv('data_daily/daily_alpaca.csv')


if __name__ == '__main__':
    fetch_action()
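
# To build the 2-hour daily file instead of the yearly intraday CSVs, call
# fetch_action(intraday=False) from the guard above.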