Skip to content

Commit

Permalink
Properly set Nullable fields in the DB. Simplify population of historical/recent downloads data.
Browse files Browse the repository at this point in the history
  • Loading branch information
vladsavelyev committed Feb 19, 2024
1 parent 8350e53 commit 2efc83e
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 77 deletions.
105 changes: 37 additions & 68 deletions app/db.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""Functions to interact with the database."""
from typing import Optional

import logging
import os

Expand Down Expand Up @@ -29,33 +31,33 @@ class VisitStats(SQLModel, table=True): # type: ignore # mypy doesn't like this
start: datetime.datetime = Field(primary_key=True)
end: datetime.datetime = Field(primary_key=True)
count: int
version_multiqc: str = Field(index=True)
version_python: str = Field(default=None, index=True)
operating_system: str = Field(default=None, index=True)
installation_method: str = Field(default=None, index=True)
ci_environment: str = Field(default=None, index=True)
version_multiqc: Optional[str] = Field(index=True, default=None)
version_python: Optional[str] = Field(default=None, index=True)
operating_system: Optional[str] = Field(default=None, index=True)
installation_method: Optional[str] = Field(default=None, index=True)
ci_environment: Optional[bool] = Field(default=None, index=True)


class DownloadStatsDaily(SQLModel, table=True):
    """Daily download statistics, one row per date.

    ``date`` is the primary key. Every counter is declared ``Optional[int]``
    with a ``None`` default, so a value that is missing for a given day can be
    stored as NULL. A plain ``int`` annotation with ``Field(default=None)``
    does not mark the column as nullable, which is why the ``Optional`` form
    is used throughout.
    """

    date: datetime.datetime = Field(primary_key=True)
    pip_new: Optional[int] = None
    pip_total: Optional[int] = None
    bioconda_total: Optional[int] = None
    bioconda_new: Optional[int] = None
    biocontainers_quay_new: Optional[int] = None
    biocontainers_quay_total: Optional[int] = None
    prs_new: Optional[int] = None
    contributors_pr: Optional[int] = None
    contributors_new: Optional[int] = None
    prs_total: Optional[int] = None
    contributors_total: Optional[int] = None
    modules_new: Optional[int] = None
    modules_total: Optional[int] = None
    biocontainers_aws_total: Optional[int] = None
    dockerhub_total: Optional[int] = None
    clones_total: Optional[int] = None


def create_db_and_tables() -> None:
Expand Down Expand Up @@ -100,53 +102,20 @@ def get_download_stats(
return session.exec(statement).all()


def insert_download_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Insert or update daily download statistics, one row per date.

    The frame's index is converted with ``pd.to_datetime`` into a ``date``
    column. For each row, an existing ``DownloadStatsDaily`` entry with the
    same date is updated field by field; if there is none, a new entry is
    added. The same code path therefore serves both the initial historical
    load and the periodic refresh of recent days.

    Args:
        df: Statistics indexed by date. The remaining columns are assumed to
            be named after ``DownloadStatsDaily`` fields (TODO confirm
            against the collector). Note that the ``date`` column is added
            to the passed frame in place.

    Returns:
        The frame with ``date`` as its first column.
    """
    df["date"] = pd.to_datetime(df.index)  # adding a separate field date with a type datetime
    df = df[["date"] + [c for c in df.columns if c != "date"]]  # place date first

    with Session(engine) as session:
        for _, row in df.iterrows():
            # Missing values (NaN/NaT) become None so they are written as NULL.
            row = row.where(pd.notna(row), None)
            existing_entry = session.exec(
                select(DownloadStatsDaily).where(DownloadStatsDaily.date == row["date"])
            ).first()
            if existing_entry:
                # Only the fields present in the row are overwritten.
                for key, value in row.items():
                    setattr(existing_entry, key, value)
            else:
                new_entry = DownloadStatsDaily(**row)
                session.add(new_entry)
        # Single commit for all inserted and updated rows.
        session.commit()
    return df
45 changes: 36 additions & 9 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ async def version(
"version_python": version_python,
"operating_system": operating_system,
"installation_method": installation_method,
"ci_environment": ci_environment,
"ci_environment": strtobool(ci_environment) if ci_environment is not None else None,
}
)
return models.VersionResponse(latest_release=app.latest_release)
Expand Down Expand Up @@ -153,6 +153,9 @@ def _summarize_visits() -> Response:
if len(minute_summary) == 0:
return PlainTextResponse(content="No new visits to summarize")

# Replace Unknown with None
minute_summary = minute_summary.replace("Unknown", None)

logger.debug(
f"Summarizing {len(df)} visits in {CSV_FILE_PATH} and writing {len(minute_summary)} rows to the DB"
)
Expand Down Expand Up @@ -191,22 +194,24 @@ def _update_download_stats():
"""
Update the daily download statistics in the database
"""
logger.info("Update download stats")
try:
existing_downloads = db.get_download_stats()
except ProgrammingError:
logger.error("The table does not exist, will create and populate with historical data")
existing_downloads = []
if len(existing_downloads) == 0: # first time, populate historical data
logger.info("Populating historical data...")
logger.info("Collecting historical data...")
df = daily.collect_daily_download_stats()
logger.debug(f"Adding {len(df)} historical entries to the table...")
db.insert_download_stats(df)
logger.info(f"Successfully populated {len(df)} historical entries")
else: # recent days only
n_days = 4
logger.info(f"Updating data for the last {n_days} days...")
df = daily.collect_daily_download_stats(days=n_days)
logger.debug(f"Adding {len(df)} recent entries to the table. Will update existing entries at the same date")
db.insert_download_stats(df)
logger.info(f"Successfully populated {len(df)} historical daily entries to the downloads table")
else: # recent 2 days
logger.info("Updating data for the last 2 days...")
df = daily.collect_daily_download_stats(days=4)
db.insert_download_stats(df, days=4)
logger.info("Successfully appended new daily download statistics")
logger.info(f"Successfully updated {len(df)} new daily download statistics")


@app.on_event("startup")
Expand Down Expand Up @@ -378,5 +383,27 @@ def plotly_image_response(plot, format: PlotlyImageFormats = PlotlyImageFormats.
return Response(content=plot)


def strtobool(val) -> bool:
    """
    Convert a string representation of truth to a bool.

    Replaces deprecated https://docs.python.org/3.9/distutils/apiref.html#distutils.util.strtobool
    The deprecation recommendation is to re-implement the function https://peps.python.org/pep-0632/
    ------------------------------------------------------------
    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
    are 'n', 'no', 'f', 'false', 'off', and '0'. Matching is case-insensitive
    and ignores leading/trailing whitespace. Non-string input is converted
    with ``str()`` first, so ``True``, ``False``, ``1`` and ``0`` are accepted.

    Unlike the distutils original, which returned 1 or 0, this returns a bool.

    Raises:
        ValueError: if 'val' is anything else.
    """
    # Strip so that values such as " true " are not rejected for stray whitespace.
    val_str = str(val).strip().lower()
    if val_str in ("y", "yes", "t", "true", "on", "1"):
        return True
    if val_str in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {val!r}")


# Script entry point: when this module is executed directly, serve `app` with
# uvicorn on host 0.0.0.0, port 8000.
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)

0 comments on commit 2efc83e

Please sign in to comment.