fix: remove duplication for downloading data and test data

GlacioHack · Jan 15, 2025 · 951e2a9 · 951e2a9
1 parent f0a24a7
commit 951e2a9
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 80 deletions.
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,69 +1,18 @@
 import os
-import tarfile
-import tempfile
-import urllib
-from distutils.dir_util import copy_tree
 from typing import Callable
 
 import pytest
 
-_TESTDATA_DIRECTORY = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "tests", "test_data"))
-
-# Define a URL to the xdem-data repository's test data
-_TESTDATA_REPO_URL = "https://github.com/vschaffn/xdem-data/tarball/2-richdem_gdal"
-_COMMIT_HASH = "31a7159c982cec4b352f0de82bd4e0be61db3afe"
-
-
-def download_test_data(overwrite: bool = False) -> None:
-    """
-    Download the entire test_data directory from the xdem-data repository.
-
-    :param overwrite: If True, re-downloads the data even if it already exists.
-    """
-    if not overwrite and os.path.exists(_TESTDATA_DIRECTORY) and os.listdir(_TESTDATA_DIRECTORY):
-        return  # Test data already exists
-
-    # Clear the directory if overwrite is True
-    if overwrite and os.path.exists(_TESTDATA_DIRECTORY):
-        for root, dirs, files in os.walk(_TESTDATA_DIRECTORY, topdown=False):
-            for name in files:
-                os.remove(os.path.join(root, name))
-            for name in dirs:
-                os.rmdir(os.path.join(root, name))
+from xdem.examples import download_and_extract_tarball
 
-    # Create a temporary directory to download the tarball
-    temp_dir = tempfile.TemporaryDirectory()
-    tar_path = os.path.join(temp_dir.name, "test_data.tar.gz")
-
-    # Construct the URL with the commit hash
-    url = f"{_TESTDATA_REPO_URL}#commit={_COMMIT_HASH}"
-
-    response = urllib.request.urlopen(url)
-    if response.getcode() == 200:
-        with open(tar_path, "wb") as outfile:
-            outfile.write(response.read())
-    else:
-        raise ValueError(f"Failed to download test data: {response.status_code}")
-
-    # Extract the tarball
-    with tarfile.open(tar_path) as tar:
-        tar.extractall(temp_dir.name)
-
-    # Copy the test_data directory to the target directory
-    extracted_dir = os.path.join(
-        temp_dir.name,
-        [dirname for dirname in os.listdir(temp_dir.name) if os.path.isdir(os.path.join(temp_dir.name, dirname))][0],
-        "test_data",
-    )
-
-    copy_tree(extracted_dir, _TESTDATA_DIRECTORY)
+_TESTDATA_DIRECTORY = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "tests", "test_data"))
 
 
 @pytest.fixture(scope="session")  # type: ignore
 def get_test_data_path() -> Callable[[str], str]:
     def _get_test_data_path(filename: str, overwrite: bool = False) -> str:
         """Get file from test_data"""
-        download_test_data(overwrite=overwrite)  # Ensure the test data is downloaded
+        download_and_extract_tarball(dir="test_data", target_dir=_TESTDATA_DIRECTORY, overwrite=overwrite)
         file_path = os.path.join(_TESTDATA_DIRECTORY, filename)
 
         if not os.path.exists(file_path):

diff --git a/xdem/examples.py b/xdem/examples.py
@@ -18,6 +18,7 @@
 
 """Utility functions to download and find example data."""
 import os
+import shutil
 import tarfile
 import tempfile
 import urllib.request
@@ -27,6 +28,10 @@
 
 import xdem
 
+_DATA_REPO_URL = "https://github.com/vschaffn/xdem-data/tarball/2-richdem_gdal"
+_COMMIT_HASH = "31a7159c982cec4b352f0de82bd4e0be61db3afe"
+
+
 _EXAMPLES_DIRECTORY = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "examples", "data"))
 # Absolute filepaths to the example files.
 _FILEPATHS_DATA = {
@@ -48,55 +53,61 @@
 available = list(_FILEPATHS_DATA.keys()) + list(_FILEPATHS_PROCESSED.keys())
 
 
-def download_longyearbyen_examples(overwrite: bool = False) -> None:
+def download_and_extract_tarball(dir: str, target_dir: str, overwrite: bool = False) -> None:
     """
-    Fetch the Longyearbyen example files.
+    Download the xdem-data repository tarball and copy one of its subdirectories into a target directory.
 
-    :param overwrite: Do not download the files again if they already exist.
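+    :param dir: Path of the directory to copy, relative to the top-level directory of the extracted tarball.
+    :param target_dir: The directory to copy the files into.
+    :param overwrite: If True, delete the existing target directory and download the data again.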
     """
-    if not overwrite and all(map(os.path.isfile, list(_FILEPATHS_DATA.values()))):
-        # print("Datasets exist")
-        return
 
-    # If we ask for overwrite, also remove the processed test data
-    if overwrite:
-        for fn in list(_FILEPATHS_PROCESSED.values()):
-            if os.path.exists(fn):
-                os.remove(fn)
+    # Return early if the target directory already contains files
+    if not overwrite and os.path.exists(target_dir) and os.listdir(target_dir):
+        return
 
-    # Static commit hash to be bumped every time it needs to be.
-    commit = "ff5ede952fc422ebd2a3c6340041a118850bc905"
-    # The URL from which to download the repository
-    url = f"https://github.com/GlacioHack/xdem-data/tarball/main#commit={commit}"
-    # To test new data from a user-branch before merging in xdem-data
-    # url = f"https://github.com/ameliefroessl/xdem-data/tarball/cog-files#commit={commit}"
+    if overwrite and os.path.exists(target_dir):
+        # Clear existing files
+        shutil.rmtree(target_dir)
 
-    # Create a temporary directory to extract the tarball in.
+    # Create a temporary directory to download the tarball
     temp_dir = tempfile.TemporaryDirectory()
     tar_path = os.path.join(temp_dir.name, "data.tar.gz")
 
+    # Construct the URL with the commit hash
+    url = f"{_DATA_REPO_URL}#commit={_COMMIT_HASH}"
+
+    # Download the tarball
     response = urllib.request.urlopen(url)
-    # If the response was right, download the tarball to the temporary directory
     if response.getcode() == 200:
         with open(tar_path, "wb") as outfile:
             outfile.write(response.read())
     else:
-        raise ValueError(f"Longyearbyen data fetch gave non-200 response: {response.status_code}.")
+        raise ValueError(f"Failed to download data: {response.status_code}")
 
     # Extract the tarball
     with tarfile.open(tar_path) as tar:
         tar.extractall(temp_dir.name)
 
-    # Find the first directory in the temp_dir (should only be one) and construct the Longyearbyen data dir path.
-    dir_name = os.path.join(
+    # Find the first directory inside the extracted tarball
+    extracted_dir = os.path.join(
         temp_dir.name,
         [dirname for dirname in os.listdir(temp_dir.name) if os.path.isdir(os.path.join(temp_dir.name, dirname))][0],
-        "data",
-        "Longyearbyen",
+        dir,
     )
 
-    # Copy the data to the examples directory.
-    copy_tree(dir_name, os.path.join(_EXAMPLES_DIRECTORY, "Longyearbyen", "data"))
+    # Copy the extracted data to the target directory
+    copy_tree(extracted_dir, target_dir)
+
+
+def download_longyearbyen_examples(overwrite: bool = False) -> None:
+    """
+    Fetch the Longyearbyen example files.
+
+    :param overwrite: If True, delete any existing files and download them again.
+    """
+    target_dir = os.path.join(_EXAMPLES_DIRECTORY, "Longyearbyen", "data")
+    download_and_extract_tarball(dir="data/Longyearbyen", target_dir=target_dir, overwrite=overwrite)
 
 
 def process_coregistered_examples(name: str, overwrite: bool = False) -> None: