# q_planning_tiles.py (forked from intelligent-environments-lab/CityLearn)
import copy
from energy_models import HeatPump
import numpy as np
import random
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import signal
import sys
import utils
import logging
from tc import ValueFunctionWithTile
# logger = logging.getLogger('spam_application')
# logger.setLevel(logging.INFO)
# ch = logging.StreamHandler()
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)
np.random.seed(1)
class QPlanningTiles:
# All "Out tweak" comments are to note down the design choices we made for the algorithm. It will help
# us keep track of them and if we should change them. Plus, for noting in the final report.
def __init__(self, storage_capacity, elec_consump, gamma=0.9, alpha=0.1, level_cnt=9,
efficiency=1.0, loss_coefficient=0.0, parameterize_actions=True, use_adaptive_learning_rate=False): #0.19/24
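        """Tile-coded Q-learning planner for a single building's thermal storage.

        `storage_capacity` is the capacity of the cooling storage, `elec_consump` scales the
        initial tile weights, and `level_cnt` sets the number of discrete charge and action
        levels. With `parameterize_actions=True` a single tile coder takes the action value as
        an extra input dimension; otherwise one tile coder is kept per discrete action.
        """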
# TODO: Our Tweak -> Wrap around Generalize for the hour of the day in a circular fashion
# TODO: Our Tweak -> Start at specific tile weights
# Configurables
self.use_adaptive_learning_rate = use_adaptive_learning_rate
self.parameterize_actions = parameterize_actions
# State space - [Hour of the day, outside temperature, charge available, the action level]
if parameterize_actions:
self.state_low = [1, -6.4, 0.0, -0.5]
self.state_high = [24, 39.1, 1.0, 0.5]
self.tile_widths = [2, 1, 0.1, 0.1]
else:
self.state_low = [1, -6.4, 0.0]
self.state_high = [24, 39.1, 1.0]
self.tile_widths = [2, 2, 0.2]
self.level_cnt = level_cnt
self.num_tilings = 10
self.delta_term = 0.05
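        # Pessimistic initialization: tile weights are summed over num_tilings tilings, so an
        # unvisited Q(s, a) starts near -(elec_consump)^2, roughly the cost of one step of
        # electricity consumption under the squared-consumption reward used below.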
self.initial_weight_value = -1 * (elec_consump*elec_consump) / self.num_tilings
print("use_adaptive_learning_rate={0}, parameterize_actions={1}".format(self.use_adaptive_learning_rate, self.parameterize_actions))
print("num_tilings={0}, state_low={1}, state_high={2}, tile_widths={3}, initial_weight_value={4}".format(self.num_tilings,
self.state_low, self.state_high, self.tile_widths, self.initial_weight_value))
if not self.parameterize_actions:
self.Q_sa = []
for _ in range(level_cnt):
self.Q_sa.append(ValueFunctionWithTile(
self.state_low, self.state_high, num_tilings=self.num_tilings,
tile_width=self.tile_widths, initial_weight_value=self.initial_weight_value,
wrap_around=[False, False, False], use_standard_tile_coding=False))
else:
self.Q_sa = ValueFunctionWithTile(
self.state_low, self.state_high, num_tilings=self.num_tilings,
tile_width=self.tile_widths, initial_weight_value=self.initial_weight_value,
wrap_around=[False, False, False, False], use_standard_tile_coding=False)
# Learning method params
self.gamma = gamma
self.alpha = alpha
print("gamma={0}, alpha={1}, delta_term={2}, levels={3}".format(self.gamma, self.alpha, self.delta_term, self.level_cnt))
self.action_disc = utils.Discretizer(min_val=-0.5, max_val=0.5, level_cnt=level_cnt)
self.charge_disc = utils.Discretizer(min_val=0.0, max_val=1.0, level_cnt=level_cnt)
# Format of one state in the replay buffer -
# Hour of day, t_outdoor, cooling_demand. Below will have a list of these states.
self.replay_buffer = []
        # Note that we have not included the charge on the ES: we will be updating Q values for
        # states at all charge levels. In essence, we make updates for unseen states, which is
        # valid because the charge level at a time step is uncorrelated with the cooling demand
        # at that time step.
# Environment specific variables that are allowed to be used by the agent.
self.storage_capacity = storage_capacity
self.efficiency = efficiency
self.loss_coefficient = loss_coefficient
        self.num_visits = {}
        self.max_action_val_seen_till_now = 0.0
self.max_action_val_seen_pair = None
self.min_action_val_seen_till_now = float('inf')
self.min_action_val_seen_pair = None
self.got_stop_signal = False
def update_prev_cooling_demand(self, cooling_demand):
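        """Record the cooling demand actually observed for the state most recently pushed by select_action()."""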
self.replay_buffer[-1]["cooling_demand"] = abs(cooling_demand)
def get_max_action(self, state, action_val=None, Q_sa_copy=None):
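        """Return the greedy feasible action index and its Q value for `state`.

        Actions that would overcharge or over-discharge the storage given the current
        charge `state[2]` are skipped. If `Q_sa_copy` is provided, values are read from
        that frozen copy instead of the live tile coder.
        """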
max_action = 0
max_action_val = float('-inf')
        if Q_sa_copy is None:
            Q_sa_copy = self.Q_sa
for action in range(self.level_cnt):
if self.action_disc.get_val(action) > 0:
if self.action_disc.get_val(action) > 1 - state[2]:
continue
else:
if -1 * self.action_disc.get_val(action) > state[2]:
continue
# Testing Hack
if action_val is not None and action_val != self.action_disc.get_val(action):
continue
state_action = list(state)
state_action.append(action)
q_sa_val = None
if not self.parameterize_actions:
q_sa_val = Q_sa_copy[action](list(state))
else:
input_ = list(state)
input_.append(self.action_disc.get_val(action))
q_sa_val = Q_sa_copy(input_)
if q_sa_val > max_action_val:
max_action_val = q_sa_val
max_action = action
if q_sa_val > 0:
x = list(state)
x.append(self.action_disc.get_val(action))
# print("[Warning!] Qs, a > 0 = {0} for {1}".format(q_sa_val, x))
if self.max_action_val_seen_till_now < abs(q_sa_val):
self.max_action_val_seen_till_now = abs(q_sa_val)
self.max_action_val_seen_pair = list(state)
self.max_action_val_seen_pair.append(self.action_disc.get_val(action))
if self.min_action_val_seen_till_now > abs(q_sa_val):
self.min_action_val_seen_till_now = abs(q_sa_val)
self.min_action_val_seen_pair = list(state)
self.min_action_val_seen_pair.append(self.action_disc.get_val(action))
return max_action, max_action_val
def plan_on_replay_buffer(self, num_iterations=1, without_updates=False, rel_delta=True):
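        """Run Dyna-style planning sweeps over the replay buffer.

        For every consecutive pair of buffered states, Q-learning updates are applied for all
        discretized charge levels and all feasible actions, not just the transitions actually
        experienced. Sweeps repeat until a read-only evaluation pass reports a max delta ratio
        below `delta_term`, a stop signal arrives, or (with adaptive learning rates) alpha
        decays below 0.001. With `without_updates=True` a single read-only sweep is performed
        and its max delta ratio is returned.
        """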
delta = float('inf')
idx = 0
if without_updates:
num_iterations = 1
# if not without_updates:
# logger.debug("Replay buffer = {0}".format(self.replay_buffer))
alpha = self.alpha
# if not without_updates:
# alpha = float(input("What alpha value?"))
self.max_action_val_seen_till_now = 0.0
self.max_action_val_seen_pair = None
self.min_action_val_seen_till_now = float('inf')
self.min_action_val_seen_pair = None
# prev_delta = float('inf')
prev_delta_ratio = float('inf')
alpha_ceil = 1.0
if not without_updates:
self.got_stop_signal = False
self.num_times_delta_inc = 0
max_delta_ratio = 0.0
        while True:  # iterate until max_delta_ratio falls below delta_term, alpha bottoms out, or a stop signal arrives
if self.got_stop_signal and not without_updates:
break
self.max_action_val_seen_till_now = 0.0
idx += 1
prev_state = {}
delta = 0.0
Q_sa_copy = copy.deepcopy(self.Q_sa)
for state in self.replay_buffer:
                sys.stdout.flush()
if self.got_stop_signal and not without_updates:
break
if not prev_state:
prev_state = state
continue
# Our Tweak -> We can make updates for different states which we haven't even seen!
for charge_level in range(self.level_cnt): # int(self.level_cnt/2)
if self.got_stop_signal and not without_updates:
break
charge_val = self.charge_disc.get_val(charge_level)
# To account for losses when storing energy.
charge_val = charge_val*(1-self.loss_coefficient)
for action in range(self.level_cnt):
if self.got_stop_signal and not without_updates:
break
action_val = self.action_disc.get_val(action)
# Testing hack
# if charge_level != 0 or action_val != 0:
# continue
# print("Checking state {0} charge {1} action_val {2}".format(state, charge_val, action_val))
if action_val < 0 and -1 * action_val > charge_val:
continue
if action_val > 0 and action_val > 1 - charge_val:
continue
cooling_pump = HeatPump(nominal_power = 9e12, eta_tech = 0.22, t_target_heating = 45, t_target_cooling = 1)
cooling_pump.set_cop(t_source_cooling = prev_state["t_out"])
                        # TODO: Not handling cases where the cooling demand cannot be met without drawing on the ES charge.
assert(cooling_pump.get_max_cooling_power(t_source_cooling = prev_state["t_out"]) > prev_state["cooling_demand"])
cooling_power_avail = cooling_pump.get_max_cooling_power(t_source_cooling = prev_state["t_out"]) - prev_state["cooling_demand"]
                        # Skip actions that would charge more than the available cooling power allows, or
                        # discharge more than the demand requires. Not updating weights for these invalid
                        # actions keeps the representational power focused on valid actions only.
                        # TODO: Try removing this and confirm the benefit.
if action_val >= 0:
                            # Note the difference in action_val between this and DDP: in DDP an action means
                            # drawing energy from the pump to obtain that much charge, hence DDP uses
                            # /efficiency rather than *efficiency here.
if action_val*self.storage_capacity*self.efficiency > cooling_power_avail:
continue
cooling_energy_to_storage = action_val*self.storage_capacity*self.efficiency
else:
if -1 * action_val*self.storage_capacity*self.efficiency > prev_state["cooling_demand"]:
continue
cooling_energy_to_storage = action_val*self.storage_capacity*self.efficiency
next_charge_val = charge_val + cooling_energy_to_storage/self.storage_capacity
cooling_energy_drawn_from_heat_pump = cooling_energy_to_storage + prev_state["cooling_demand"]
elec_demand_cooling = cooling_pump.get_electric_consumption_cooling(cooling_supply = cooling_energy_drawn_from_heat_pump)
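                        # Q-learning target for this imagined transition: the one-step reward is the
                        # negative squared electric consumption, i.e.
                        # target = -(elec_demand_cooling)**2 + gamma * max_a' Q(s', a').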
q_val = None
if not self.parameterize_actions:
q_val = self.Q_sa[action]([prev_state["hour_of_day"], prev_state["t_out"], self.charge_disc.get_val(charge_level)])
else:
q_val = self.Q_sa([prev_state["hour_of_day"], prev_state["t_out"], self.charge_disc.get_val(charge_level), self.action_disc.get_val(action)])
max_action, max_action_val = self.get_max_action(
[state["hour_of_day"], state["t_out"], next_charge_val], action_val=None, Q_sa_copy=Q_sa_copy)
# The below condition means that the state-action pair has not been visited
# if max_action_val == self.initial_weight_value * self.num_tilings:
# logger.debug("Tiles not properly generalized, this error should fade in sometime, if it does not - change params!!! State {0} max_action {1} max_action_val {2}, coming from charge_val of prev state {3}".format(
# [state["hour_of_day"], state["t_out"], next_charge_val],
# self.action_disc.get_val(max_action),
# max_action_val, charge_val))
# print("Qs, a for {0} is {1}, Target {2}, Q*s' for {3} is {4} with action {5},{6}".format(
# [prev_state["hour_of_day"], prev_state["t_out"], self.charge_disc.get_val(charge_level), action], q_val,
# - 1 * (elec_demand_cooling*elec_demand_cooling) + self.gamma * max_action_val,
# [state["hour_of_day"], state["t_out"], next_charge_val], max_action_val, max_action,self.action_disc.get_val(max_action)))
curr_delta = abs(- 1 * (elec_demand_cooling*elec_demand_cooling) +
self.gamma * max_action_val - q_val)
delta = max(delta, curr_delta)
delta_ratio = curr_delta/abs(q_val)
max_delta_ratio = max(max_delta_ratio, delta_ratio)
prev_state_action = [prev_state["hour_of_day"], prev_state["t_out"], self.charge_disc.get_val(charge_level), action]
next_state_action = [state["hour_of_day"], state["t_out"], next_charge_val, max_action]
# Testing hack
# tupl = (prev_state["hour_of_day"], prev_state["t_out"], self.charge_disc.get_val(charge_level), action)
# if tupl not in self.num_visits:
# self.num_visits[tupl] = 0
# self.num_visits[tupl] += 1
if not without_updates:
if not self.parameterize_actions:
self.Q_sa[action].update(alpha, - 1 * (elec_demand_cooling*elec_demand_cooling) + self.gamma * max_action_val,
[prev_state["hour_of_day"], prev_state["t_out"], self.charge_disc.get_val(charge_level)])
else:
self.Q_sa.update(alpha, - 1 * (elec_demand_cooling*elec_demand_cooling) + self.gamma * max_action_val,
[prev_state["hour_of_day"], prev_state["t_out"], self.charge_disc.get_val(charge_level),
self.action_disc.get_val(action)])
prev_state = state
print("Done Planning iteration {0}: {1} with buffer size {2}, delta={3}, max_action_value={4}:{5} min_action_val={6}:{7} "
"delta_ratio={8}".format(
"Without updates" if without_updates else "normal",
idx, len(self.replay_buffer), delta,
self.max_action_val_seen_till_now,
self.max_action_val_seen_pair,
self.min_action_val_seen_till_now,
self.min_action_val_seen_pair,
max_delta_ratio))
# Testing hack
if without_updates:
return max_delta_ratio
max_delta_ratio = self.plan_on_replay_buffer(num_iterations=1, without_updates=True)
            sys.stdout.flush()
            if max_delta_ratio < self.delta_term:
                print("Breaking as max delta ratio {0} < delta_term {1}".format(max_delta_ratio, self.delta_term))
                break
if self.use_adaptive_learning_rate:
if max_delta_ratio > prev_delta_ratio:
self.num_times_delta_inc += 1
if self.num_times_delta_inc <= 3:
prev_delta_ratio = max_delta_ratio
continue
self.num_times_delta_inc = 0
print("Max Delta Ratio {0} > prev_delta_ratio {1}. Changing alpha {2} -> {3}".format(delta_ratio, prev_delta_ratio, alpha, alpha/2))
alpha /= 2
if alpha < 0.001:
break
alpha_ceil = alpha
else:
prev_alpha = alpha
alpha = min(2*alpha, alpha_ceil)
print("Max Delta Ratio {0} <= prev_delta_ratio {1}. Changing alpha {2} -> {3}".format(delta_ratio, prev_delta_ratio, prev_alpha, alpha))
prev_delta_ratio = max_delta_ratio
def select_action(self, state):
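        """Pick the greedy action for the first building's state and push the state into the replay buffer."""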
        # State - hour of day, temperature, charge on ES
self.current_state = state[0]
# TODO: Change below for multi building case later
max_action, max_action_val = self.get_max_action(self.current_state)
# print("Selecting action for state {0}, {1} -> {2}".format(state,
# max_action, max_action_val), flush=True)
self.chosen_action = max_action
# Maintain a replay buffer of just 25 hours for now.
if len(self.replay_buffer) >= 25:
self.replay_buffer = self.replay_buffer[1:]
self.replay_buffer.append(
{
"cooling_demand": -1,
"hour_of_day": self.current_state[0],
"t_out": self.current_state[1]
})
return [[self.action_disc.get_val(self.chosen_action)]]
def update_on_transition(self, r, s_next, done):
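        """Consume the reward and next state; trigger a planning pass once the replay buffer is full.

        The direct one-step TD update is currently commented out, so all learning happens
        inside plan_on_replay_buffer().
        """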
max_action_next_val = 0.0
# TODO: Change below for multi building case later
next_state = s_next[0]
max_action = -1
if not done:
max_action, max_action_next_val = self.get_max_action(next_state)
state_action = list(self.current_state)
state_action.append(self.chosen_action)
next_state_action = list(next_state)
next_state_action.append(max_action)
q_val = None
if not self.parameterize_actions:
q_val = self.Q_sa[self.chosen_action](list(self.current_state))
else:
input_ = list(self.current_state)
input_.append(self.action_disc.get_val(self.chosen_action))
q_val = self.Q_sa(input_)
delta = \
abs(
-1 * r * r + self.gamma * max_action_next_val - q_val)
# print("Update: {0}, {1} -> {2}, {3} Diff {4}".format(state_action,
# self.Q_sa[self.chosen_action](list(self.current_state)),
# next_state_action, max_action_next_val,
# delta))
# self.Q_sa[self.chosen_action].update(
# self.alpha, -1 * r * r + self.gamma * max_action_next_val, list(self.current_state))
if len(self.replay_buffer) >= 25:# or done:
self.plan_on_replay_buffer()
# TODO: Verify this change
# TODO: Plan on the past 25 hours, make it sliding window
# self.replay_buffer = [self.replay_buffer[-1]]
        sys.stdout.flush()
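# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original agent): how this planner
# could be driven by a single-building control loop. The environment object
# `env`, its reset()/step() signature, the way the realized cooling demand is
# read back, and the constructor arguments are all assumptions for illustration.
#
# agent = QPlanningTiles(storage_capacity=3.0, elec_consump=2.0)
# state = env.reset()                          # assumed to be [hour_of_day, t_out, charge_on_es]
# done = False
# while not done:
#     action = agent.select_action([state])    # single building -> [[action_value]]
#     next_state, reward, done, info = env.step(action)
#     agent.update_prev_cooling_demand(info["cooling_demand"])  # realized demand for the last step
#     agent.update_on_transition(reward, [next_state], done)
#     state = next_state
# ---------------------------------------------------------------------------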