Skip to content

Commit

Permalink
delete hydro_utils and add hydroutils package in env files; move grdc…
Browse files Browse the repository at this point in the history
… to hydro_opendata
  • Loading branch information
OuyangWenyu committed Oct 15, 2023
1 parent 57e9a7f commit 9247b52
Show file tree
Hide file tree
Showing 11 changed files with 83 additions and 537 deletions.
6 changes: 5 additions & 1 deletion environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,8 @@ dependencies:
- pint-xarray

# request tools
- async-retriever
- async-retriever

- pip
- pip:
- hydroutils
54 changes: 38 additions & 16 deletions hydrodataset/camels.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Author: Wenyu Ouyang
Date: 2022-01-05 18:01:11
LastEditTime: 2023-09-20 16:01:22
LastEditTime: 2023-10-15 14:38:46
LastEditors: Wenyu Ouyang
Description: Read Camels Series ("AUStralia", "BRazil", "ChiLe", "GreatBritain", "UnitedStates") datasets
FilePath: \hydrodataset\hydrodataset\camels.py
Expand All @@ -21,14 +21,36 @@
from urllib.request import urlopen
from tqdm import tqdm
import xarray as xr
from hydrodataset import CACHE_DIR, hydro_utils, HydroDataset, CAMELS_REGIONS
from hydroutils import hydro_time, hydro_file
from hydrodataset import CACHE_DIR, HydroDataset, CAMELS_REGIONS

CAMELS_NO_DATASET_ERROR_LOG = (
"We cannot read this dataset now. Please check if you choose correctly:\n"
+ str(CAMELS_REGIONS)
)


def map_string_vars(ds):
    """Encode every string-typed variable of a Dataset as integer codes.

    Each data variable whose dtype is ``object`` (assumed to hold strings)
    is replaced by integer codes: the distinct values are sorted and each
    value becomes its index within that sorted order, so the encoding is
    deterministic across calls. Numeric variables are left untouched.

    Parameters
    ----------
    ds : xr.Dataset
        dataset possibly containing object-dtype (string) variables;
        modified in place

    Returns
    -------
    xr.Dataset
        the same dataset, with string variables replaced by integer codes
    """
    for name in ds.data_vars:
        # Skip everything that is not object-dtype (i.e. not string data).
        if ds[name].dtype != object:
            continue
        series = ds[name].to_series()
        # Sort the distinct strings first so code assignment is reproducible.
        codes = {value: idx for idx, value in enumerate(sorted(series.unique()))}
        # Replace the old variable with its integer-coded counterpart.
        ds[name] = xr.DataArray(series.map(codes))
    return ds


def time_intersect_dynamic_data(obs: np.array, date: np.array, t_range: list):
"""
chose data from obs in the t_range
Expand All @@ -47,7 +69,7 @@ def time_intersect_dynamic_data(obs: np.array, date: np.array, t_range: list):
np.array
the chosen data
"""
t_lst = hydro_utils.t_range_days(t_range)
t_lst = hydro_time.t_range_days(t_range)
nt = t_lst.shape[0]
if len(obs) != nt:
out = np.full([nt], np.nan)
Expand Down Expand Up @@ -419,10 +441,10 @@ def download_data_source(self) -> None:
for url in links
if not Path(self.data_source_dir, url.rsplit("/", 1)[1]).exists()
]
hydro_utils.download_zip_files(to_dl, self.data_source_dir)
hydro_file.download_zip_files(to_dl, self.data_source_dir)
else:
warnings.warn("We only provide downloading methods for CAMELS-US now")
hydro_utils.zip_extract(camels_config["CAMELS_DIR"])
hydro_file.zip_extract(camels_config["CAMELS_DIR"])

def read_site_info(self) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -653,7 +675,7 @@ def read_usgs_gage(self, usgs_id, t_range):
data_temp = pd.read_csv(usgs_file, sep=r"\s+", header=None)
obs = data_temp[4].values
obs[obs < 0] = np.nan
t_lst = hydro_utils.t_range_days(t_range)
t_lst = hydro_time.t_range_days(t_range)
nt = t_lst.shape[0]
return (
self._read_usgs_gage_for_some(nt, data_temp, t_lst, obs)
Expand Down Expand Up @@ -699,7 +721,7 @@ def read_camels_streamflow(self, usgs_id, t_range):
data_temp = pd.read_csv(usgs_file, sep=",", header=None, skiprows=1)
obs = data_temp[4].values
obs[obs < 0] = np.nan
t_lst = hydro_utils.t_range_days(t_range)
t_lst = hydro_time.t_range_days(t_range)
nt = t_lst.shape[0]
return (
self._read_usgs_gage_for_some(nt, data_temp, t_lst, obs)
Expand Down Expand Up @@ -811,14 +833,14 @@ def read_target_cols(
return np.array([])
else:
nf = len(target_cols)
t_range_list = hydro_utils.t_range_days(t_range)
t_range_list = hydro_time.t_range_days(t_range)
nt = t_range_list.shape[0]
y = np.full([len(gage_id_lst), nt, nf], np.nan)
if self.region == "US":
for k in tqdm(
range(len(gage_id_lst)), desc="Read streamflow data of CAMELS-US"
):
dt150101 = hydro_utils.t2str("2015-01-01")
dt150101 = hydro_time.t2str("2015-01-01")
if t_range_list[-1] > dt150101 and t_range_list[0] < dt150101:
# latest streamflow data in CAMELS is 2014/12/31
data_obs_after_2015 = self.read_camels_streamflow(
Expand Down Expand Up @@ -945,7 +967,7 @@ def read_camels_us_model_output_data(
forcing_type : str, optional
by default "daymet"
"""
t_range_list = hydro_utils.t_range_days(t_range)
t_range_list = hydro_time.t_range_days(t_range)
model_out_put_var_lst = [
"SWE",
"PRCP",
Expand Down Expand Up @@ -1109,7 +1131,7 @@ def read_relevant_cols(
np.array
forcing data
"""
t_range_list = hydro_utils.t_range_days(t_range)
t_range_list = hydro_time.t_range_days(t_range)
nt = t_range_list.shape[0]
x = np.full([len(gage_id_lst), nt, len(var_lst)], np.nan)
if self.region == "US":
Expand Down Expand Up @@ -1332,7 +1354,7 @@ def read_attr_all_yr(self):
gage_id_lst[k],
"attributes.json",
)
attr_data = hydro_utils.unserialize_json_ordered(attr_file)
attr_data = hydro_file.unserialize_json_ordered(attr_file)
for j in range(len(var_lst)):
c[k, j] = attr_data[var_lst[j]]
data_temp = pd.DataFrame(c, columns=var_lst)
Expand Down Expand Up @@ -1429,8 +1451,8 @@ def cache_forcing_np_json(self):
basins = self.sites["gauge_id"].values
daymet_t_range = ["1980-01-01", "2015-01-01"]
times = [
hydro_utils.t2str(tmp)
for tmp in hydro_utils.t_range_days(daymet_t_range).tolist()
hydro_time.t2str(tmp)
for tmp in hydro_time.t_range_days(daymet_t_range).tolist()
]
data_info = collections.OrderedDict(
{
Expand Down Expand Up @@ -1461,7 +1483,7 @@ def cache_streamflow_np_json(self):
basins = self.sites["gauge_id"].values
t_range = ["1980-01-01", "2015-01-01"]
times = [
hydro_utils.t2str(tmp) for tmp in hydro_utils.t_range_days(t_range).tolist()
hydro_time.t2str(tmp) for tmp in hydro_time.t_range_days(t_range).tolist()
]
data_info = collections.OrderedDict(
{
Expand Down Expand Up @@ -1689,6 +1711,6 @@ def read_attr_xrdataset(self, gage_id_lst=None, var_lst=None, **kwargs):
return None
attr = xr.open_dataset(CACHE_DIR.joinpath("camelsus_attributes.nc"))
if "all_number" in list(kwargs.keys()) and kwargs["all_number"]:
attr_num = hydro_utils.map_string_vars(attr)
attr_num = map_string_vars(attr)
return attr_num[var_lst].sel(basin=gage_id_lst)
return attr[var_lst].sel(basin=gage_id_lst)
19 changes: 6 additions & 13 deletions hydrodataset/caravan.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@
from urllib.request import urlopen
import pandas as pd
import numpy as np
from pandas.core.dtypes.common import is_string_dtype, is_numeric_dtype
from tqdm import tqdm

from hydrodataset import hydro_utils, HydroDataset
from hydroutils import hydro_file
from hydrodataset import HydroDataset


class Caravan(HydroDataset):
Expand Down Expand Up @@ -97,10 +95,10 @@ def download_data_source(self) -> None:
to_dl = []
if not Path(self.data_source_dir, url.rsplit("/", 1)[1]).exists():
to_dl.append(url)
hydro_utils.download_zip_files(to_dl, self.data_source_dir)
hydro_file.download_zip_files(to_dl, self.data_source_dir)
# It seems that there is sth. wrong with hysets_06444000.nc
try:
hydro_utils.zip_extract(dataset_config["DATASET_DIR"])
hydro_file.zip_extract(dataset_config["DATASET_DIR"])
except tarfile.ReadError:
Warning("Please manually unzip the file.")

Expand Down Expand Up @@ -198,9 +196,7 @@ def read_target_cols(
target_cols: Union[list, np.array] = None,
**kwargs,
) -> np.array:
return self._read_timeseries_data(
"FLOW_DIR", gage_id_lst, t_range, target_cols
)
return self._read_timeseries_data("FLOW_DIR", gage_id_lst, t_range, target_cols)

def read_relevant_cols(
self,
Expand Down Expand Up @@ -287,10 +283,7 @@ def read_constant_cols(
data = data.loc[gage_id_lst]
if var_lst is not None:
data = data.loc[:, var_lst]
if is_return_dict:
return data.to_dict("index")
else:
return data.values
return data.to_dict("index") if is_return_dict else data.values

def read_basin_area(self, object_ids) -> np.array:
return self.read_constant_cols(object_ids, ["area_calc"], is_return_dict=False)
Expand Down
Loading

0 comments on commit 9247b52

Please sign in to comment.