Skip to content

Commit

Permalink
Merge branch 'release/v1.0.3'
Browse files Browse the repository at this point in the history
  • Loading branch information
RobertoChiosa committed Dec 27, 2024
2 parents 848f03b + 31f1f13 commit da5d7e5
Show file tree
Hide file tree
Showing 6 changed files with 794 additions and 684 deletions.
4 changes: 1 addition & 3 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
### Features:

- Fixed csv path issues
- Documentation updates
- Docker run support
- Fixed numpy
1,248 changes: 642 additions & 606 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ packages = [

[tool.poetry.dependencies]
python = "^3.11"
numpy = "^2.0.0"
pandas = "^2.2.2"
plotly = "^5.22.0"
scipy = "^1.14.0"
Expand All @@ -22,6 +21,7 @@ jinja2 = "^3.1.4"
argparse = "^1.4.0"
logger = "^1.4"
requests = "^2.32.3"
numpy = "^2.2.1"


[build-system]
Expand Down
6 changes: 5 additions & 1 deletion src/cmp/anomaly_detection_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright © Roberto Chiosa 2024.
# Email: [email protected]
# Last edited: 13/8/2024
# Last edited: 23/9/2024

import matplotlib.pyplot as plt
import numpy as np
Expand Down Expand Up @@ -36,6 +36,7 @@ def boxplot_fun(group, vector_ad):
ax.set_title('Notched box plot')
# close
plt.close(fig)
plt.close('all')

# get the outliers
outliers_both_whisker = [flier.get_ydata() for flier in bp["fliers"]]
Expand Down Expand Up @@ -91,6 +92,7 @@ def zscore_fun(group, vector_ad, upper_bound=2):
sns.kdeplot(data=zscore)
plt.axvline(x=upper_bound, linestyle='dashed', color='gray')
plt.close(fig)
plt.close('all')
# create an array of medians according cluster on yearly period
outliers = np.zeros(group.size)
j = 0
Expand Down Expand Up @@ -142,6 +144,7 @@ def elbow_fun(group, vector_ad):
anomaly_ticks = list(range(0, vector_ad.size, int(vector_ad.size / 5)))
anomaly_ticks.append(num_anomalies_to_show)
plt.xticks(anomaly_ticks)
plt.close('all')

# create an array of medians according cluster on yearly period
outliers = np.zeros(group.size)
Expand Down Expand Up @@ -260,6 +263,7 @@ def gesd_fun(group, vector_ad):
fig, ax = plt.subplots()
stats.probplot(vector_ad, plot=plt)
plt.close(fig)
plt.close('all')

n_outliers = gesd_esd_test(input_series=vector_ad, alpha=0.05, max_outliers=10)

Expand Down
129 changes: 97 additions & 32 deletions src/cmp/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# Copyright © Roberto Chiosa 2024.
# Email: [email protected]
# Last edited: 17/9/2024
# Last edited: 23/9/2024

import argparse
import datetime # data
from statistics import mean

import plotly.express as px
from scipy.stats import zscore

from src.cmp.anomaly_detection_functions import anomaly_detection, extract_vector_ad_temperature, \
extract_vector_ad_energy, extract_vector_ad_cmp
from src.cmp.utils import *
Expand Down Expand Up @@ -53,29 +54,6 @@
raw_data = download_data(args.input_file)
data, obs_per_day, obs_per_hour = process_data(raw_data, args.variable_name)

# print dataset main characteristics
summary = f''' \n*********************\n
DATASET: Electrical Load dataset from {args.variable_name}\n
- From\t{data.index[0]}\n
- To\t{data.index[len(data) - 1]}\n
- {len(data.index[::obs_per_day])}\tdays\n
- 1 \tobservations every 15 min\n
- {obs_per_day}\tobservations per day\n
- {obs_per_hour}\tobservations per hour\n
- {len(data)}observations
'''

# Visualise the data with plotly line plot
fig = px.line(data['value'])
fig.update_layout(xaxis_title=None, yaxis_title="Electrical Load [kW]", showlegend=False,
paper_bgcolor='rgba(0,0,0,0)')

report_content['summary'] = {
"title": "Dataset Summary",
"content": summary,
"plot": fig.to_html(full_html=False)
}

########################################################################################
# Define configuration for the Contextual Matrix Profile calculation.

Expand All @@ -86,7 +64,45 @@

# The context is defined as 1 hour before time window, to be consistent with other analysis,
# results are loaded from 'm_context.csv' file
m_context = 1
m_context = 1 # [h]

# todo perform cluster analysis
# Load Cluster results as boolean dataframe: each column represents a group
group_df = pd.read_csv(os.path.join(path_to_data, "group_cluster.csv"), index_col='timestamp', parse_dates=True)
# get number of groups/clusters
n_group = group_df.shape[1]
cluster_summary = (f'The dataset has been clustered into {n_group} groups using K-means algorithm and displayed '
f'in the following image. The clusters group similar daily '
f'load profiles for which the contextual matrix profile calculation will be performed.')

cluster_data_plot = data.copy()
cluster_data_plot.reset_index(inplace=True)
cluster_data_plot['date'] = cluster_data_plot['timestamp'].dt.date
cluster_data_plot['time'] = cluster_data_plot['timestamp'].dt.time
cluster_data_plot['cluster'] = 'no_cluster'

for cluster_id in range(n_group):
dates_plot = group_df.index[group_df[f'Cluster_{cluster_id + 1}'] == True]
# convert the timestamps to date objects so they can be matched against the 'date' column
dates_plot = [date.date() for date in dates_plot]
# add info to cluster column
cluster_data_plot = cluster_data_plot.assign(
cluster=lambda x: np.where(x['date'].isin(dates_plot), f'Cluster_{cluster_id + 1}', x['cluster'])
)

fig = px.line(cluster_data_plot, x='time', y='value', line_group='date', facet_col='cluster', color='cluster')
# use viridis palette
fig.update(layout=dict(
xaxis_title=None,
yaxis_title="Power [kW]",
paper_bgcolor='rgba(0,0,0,0)',
showlegend=False
))
report_content['clusters'] = {
"title": "Group definition",
"content": cluster_summary,
"plot": fig.to_html(full_html=False)
}

# Define output files as dataframe
# - df_anomaly_results -> in this file the anomaly results will be saved
Expand All @@ -95,6 +111,8 @@
df_contexts = pd.DataFrame(
columns=["from", "to", "context_string", "context_string_small", "duration", "observations"])

anomalies_table_overall = pd.DataFrame()

# begin for loop on the number of time windows
for id_tw in range(len(df_time_window)):

Expand Down Expand Up @@ -204,16 +222,11 @@
})

########################################################################################
# todo perform cluster analysis
# Load Cluster results as boolean dataframe: each column represents a group
group_df = pd.read_csv(os.path.join(path_to_data, "group_cluster.csv"), index_col='timestamp', parse_dates=True)
# initialize dataframe of results for context to be appended to the overall result
df_anomaly_context = group_df.astype(int)

# set labels
day_labels = data.index[::obs_per_day]
# get number of groups
n_group = group_df.shape[1]

# perform analysis of context on groups (clusters)
for id_cluster in range(n_group):
Expand Down Expand Up @@ -313,7 +326,7 @@
anomalies_table["Date"] = cmp_ad_score_dates
anomalies_table["Anomaly Score"] = cmp_ad_score[cmp_ad_score_index]
anomalies_table["Rank"] = anomalies_table.index + 1

anomalies_table_overall = pd.concat([anomalies_table_overall, anomalies_table])
# the number of anomalies is the number of non nan elements, count
num_anomalies_to_show = np.count_nonzero(~np.isnan(cmp_ad_score))

Expand Down Expand Up @@ -420,15 +433,67 @@
# remove redundant columns
df_anomaly_results = df_anomaly_results.loc[:, ~df_anomaly_results.columns.duplicated()]

print('\n*********************\n')
# at the end of loop on context save dataframe of results
df_anomaly_results.to_csv(os.path.join(path_to_data, "anomaly_results.csv"))
df_contexts.to_csv(os.path.join(path_to_data, "contexts.csv"), index=False)

# print summary with anomalies
# print dataset main characteristics
summary = f'''The dataset under analysis refers to the variable '<strong>{args.variable_name}</strong>':
<ul>
<li>From: {data.index[0]}</li>
<li>To: {data.index[len(data) - 1]}</li>
<li>{len(data.index[::obs_per_day])} days</li>
<li>1 observation every 15 minutes</li>
<li>{obs_per_day} observations per day</li>
<li>{obs_per_hour} observations per hour</li>
<li>{len(data)} total observations</li>
</ul>
The line plot represented in the following image represents the whole dataset.
In gray the days where no anomalies where found while in red are highlighted
the anomalous days, identified by the CMP proceed. Please mind that the identified anomalous days
may be anomalous only in certain sub daily sequences as further described in the analysis that follows.
'''

# the summary is built at the end
# Visualise the data with plotly line plot
df_summary_plot = data.copy()
df_summary_plot.reset_index(inplace=True)
df_summary_plot['date'] = df_summary_plot['timestamp'].dt.date
df_summary_plot['anomaly_score'] = 0

# Highlight only anomalous days
for anom in anomalies_table_overall.itertuples():
# get column that matches the date
df_summary_plot['date'] = df_summary_plot['timestamp'].dt.date
# convert to string object to compare properly
df_summary_plot['date'] = df_summary_plot['date'].astype(str)

index_anom_plot = list(df_summary_plot[df_summary_plot['date'] == anom[1]].index)
# assign the anomaly score to every row of the anomalous day
df_summary_plot.loc[index_anom_plot, 'anomaly_score'] = anom[2]

# color palette: gray followed by 8 shades of red

color_palette = ["#808080", "#A00000", "#C00000", "#E00000", "#FF0000", "#FF2020", "#FF4040", "#FF6060", "#FF8080"]

df_summary_plot['anomaly_score'] = df_summary_plot['anomaly_score'].astype(int)
fig = px.line(df_summary_plot, x='timestamp', y='value', color='anomaly_score',
line_group='date', color_discrete_sequence=color_palette)
fig.update_layout(xaxis_title=None, yaxis_title="Electrical Load [kW]", showlegend=False,
paper_bgcolor='rgba(0,0,0,0)')

report_content['summary'] = {
"title": "Dataset Summary",
"content": summary,
"plot": fig.to_html(full_html=False)
}

# print the execution time
total_time = datetime.datetime.now() - begin_time
hours, remainder = divmod(total_time.total_seconds(), 3600)
minutes, seconds = divmod(remainder, 60)
print('\n*********************\n')
logger.info(f"TOTAL {str(int(minutes))} min {str(int(seconds))} s")

save_report(report_content, args.output_file)
89 changes: 48 additions & 41 deletions src/cmp/templates/cmp.html
Original file line number Diff line number Diff line change
Expand Up @@ -26,54 +26,61 @@ <h2>{{ summary.title }}</h2>
{{ summary.plot }}
</div>

<!-- Clusters -->
<div class="p-0 mb-4">
<h2>{{ clusters.title }}</h2>
<p>{{ clusters.content }}</p>
{{ clusters.plot }}
</div>

<!-- Contexts -->
{% for context in contexts %}
<br>
<div class="h-100 p-5 bg-body-tertiary border rounded-3">
<div class="row align-items-md-stretch top">
<div class="col-md-7 p-5 mb-4">
<h2>{{ context.title }}</h2>
<p class="text-muted">{{ context.subtitle }}</p>
<p>{{ context.content }}</p>
</div>
<div class="col-md-5 p-0 mb-4">
{{ context.plot }}
<br>
<div class="h-100 p-5 bg-body-tertiary border rounded-3">
<div class="row align-items-md-stretch top">
<div class="col-md-7 p-5 mb-4">
<h2>{{ context.title }}</h2>
<p class="text-muted">{{ context.subtitle }}</p>
<p>{{ context.content }}</p>
</div>
<div class="col-md-5 p-0 mb-4">
{{ context.plot }}
</div>
</div>
</div>
{% for cluster in context.clusters %}
{% for cluster in context.clusters %}

{% if cluster.plot_anomalies %}
<div class="row align-items-md-stretch border-top">
<div class="col-md-12 pt-5 pb-0 pl-5 pr-5 mb-4 bg-body-tertiary">
<h3>{{ cluster.title }}</h3>
<p>{{ cluster.content }}</p>
{{cluster.table}}
</div>
</div>
<div class="row align-items-md-stretch">
<div class="col-md-7 p-0 mb-4">
{{ cluster.plot_anomalies }}
</div>
<div class="col-md-5 p-0 mb-4">
{{ cluster.plot }}
</div>
</div>
{% if cluster.plot_anomalies %}
<div class="row align-items-md-stretch border-top">
<div class="col-md-12 pt-5 pb-0 pl-5 pr-5 mb-4 bg-body-tertiary">
<h3>{{ cluster.title }}</h3>
<p>{{ cluster.content }}</p>
{{ cluster.table }}
</div>
</div>
<div class="row align-items-md-stretch">
<div class="col-md-7 p-0 mb-4">
{{ cluster.plot_anomalies }}
</div>
<div class="col-md-5 p-0 mb-4">
{{ cluster.plot }}
</div>
</div>


{% else %}
<div class="row align-items-md-stretch border-top">
<div class="col-md-7 p-5 mb-4 bg-body-tertiary">
<h3>{{ cluster.title }}</h3>
<p>{{ cluster.content }}</p>
</div>
<div class="col-md-5 p-0 mb-4">
{{ cluster.plot }}
</div>
</div>
{% endif %}
{% else %}
<div class="row align-items-md-stretch border-top">
<div class="col-md-7 p-5 mb-4 bg-body-tertiary">
<h3>{{ cluster.title }}</h3>
<p>{{ cluster.content }}</p>
</div>
<div class="col-md-5 p-0 mb-4">
{{ cluster.plot }}
</div>
</div>
{% endif %}

{% endfor %}
</div>
{% endfor %}
</div>
{% endfor %}


Expand Down

0 comments on commit da5d7e5

Please sign in to comment.