Improve error messages with badly formatted @ids (#584)

mlcommons · Mar 7, 2024 · c851047 · c851047
1 parent 23837f7
commit c851047
Show file tree

Hide file tree

Showing 7 changed files with 143 additions and 5 deletions.
diff --git a/python/mlcroissant/mlcroissant/_src/core/issues.py b/python/mlcroissant/mlcroissant/_src/core/issues.py
@@ -3,6 +3,11 @@
 import dataclasses
 from typing import Any
 
+# Generic hint about well-formed `@id`s. Presumably meant to be appended to
+# `@id`-related error messages (no usage is visible in this hunk -- confirm).
+WRONG_ID_MSG = (
+    "Note that `@id`s should be valid URIs. If you wish to use reserved characters in"
+    " your `@id`s, you should URL-encode them."
+)
+
 
 class ValidationError(Exception):
     """Error during the validation of the format."""

diff --git a/python/mlcroissant/mlcroissant/_src/core/json_ld.py b/python/mlcroissant/mlcroissant/_src/core/json_ld.py
@@ -6,6 +6,7 @@
 """
 
 import json
+import re
 from typing import Any
 
 import rdflib
@@ -30,6 +31,7 @@
 from mlcroissant._src.core.rdf import make_context
 from mlcroissant._src.core.types import Json
 
+# Detects whitespace anywhere in an `@id`. Note the inverted sense: a match in
+# `check_valid_ids` means the `@id` is INVALID and gets reported as an error.
+_ID_REGEX = re.compile(r".*\s+.*")
 _DCTERMS_PREFIX = constants.DCTERMS
 _SCHEMA_ORG_PREFIX = constants.SCHEMA_ORG
 _WD_PREFIX = "https://www.wikidata.org/wiki/"
@@ -168,13 +170,31 @@ def recursively_populate_jsonld(entry_node: Json, id_to_node: dict[str, Json]) -
     return entry_node
 
 
+def check_valid_ids(data: Json, ctx: Context) -> None:
+    """Checks that the given json contains valid `@id`s.
+
+    Walks `data` recursively through dicts and lists at any nesting depth and adds
+    one error to `ctx.issues` for every string `@id` that contains whitespace.
+
+    Args:
+        data: The (possibly nested) JSON value to inspect. Scalars are ignored.
+        ctx: The context whose `issues` collects the reported errors.
+    """
+    if isinstance(data, list):
+        # Lists may appear at the top level or nested inside other lists.
+        for item in data:
+            check_valid_ids(item, ctx)
+        return
+    if not isinstance(data, dict):
+        return
+    for key, value in data.items():
+        # Only strings can be matched against the regex: a non-string `@id` (null,
+        # number, object...) is not reported by this check instead of raising a
+        # `TypeError` in the middle of the validation.
+        if key == "@id" and isinstance(value, str) and _ID_REGEX.match(value):
+            ctx.issues.add_error(
+                f"The dataset contains a wrong `@id`: '{value}'. Note that currently we"
+                " do not support `@id`s containing whitespaces (not even if"
+                " URL-escaped)."
+            )
+        # No-op for scalars; descends into nested dicts and lists.
+        check_valid_ids(value, ctx)
+
+
 def expand_jsonld(data: Json, ctx: Context) -> Json:
     """Expands a Croissant JSON to a nested JSON-LD with expanded.
 
     For this we use RDFLib. RDFLib expands the CURIE of the form "rdf:type" into their
     full expression, but RDFLib also flattens the JSON-LD in a list of nodes. We then
     need to reconstruct the hierarchy.
     """
+    check_valid_ids(data=data, ctx=ctx)
     context = get_context(data)
     if "@base" not in context:
         context["@base"] = constants.BASE_IRI

diff --git a/python/mlcroissant/mlcroissant/_src/datasets.py b/python/mlcroissant/mlcroissant/_src/datasets.py
@@ -90,10 +90,12 @@ def records(self, record_set: str) -> Records:
         """Accesses all records in `record_set` if it exists."""
         if not any(rs for rs in self.metadata.record_sets if rs.name == record_set):
             names = [record_set.name for record_set in self.metadata.record_sets]
-            raise ValueError(
-                f"did not find any record set with the name {record_set}. Possible"
-                f" RecordSets: {names}"
-            )
+            error_msg = f"did not find any record set with the name `{record_set}`. "
+            if not names:
+                error_msg += "This dataset declares no record sets."
+            else:
+                error_msg += f"Possible RecordSets: {names}"
+            raise ValueError(error_msg)
         return Records(self, record_set, debug=self.debug)
 
 

diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py
@@ -64,6 +64,20 @@ def test_static_analysis_0_8(folder):
     assert str(error_info.value) == get_error_msg(base_path / folder)
 
 
+# Static-analysis checks that only apply to 1.0 datasets.
+@pytest.mark.parametrize("folder", ["distribution_bad_id"])
+def test_static_analysis_1_0(folder):
+    """Loading `folder` raises a ValidationError equal to its `get_error_msg`."""
+    dataset_dir = epath.Path(__file__).parent / "tests/graphs/1.0" / folder
+    with pytest.raises(ValidationError) as raised:
+        datasets.Dataset(dataset_dir / "metadata.json")
+    assert str(raised.value) == get_error_msg(dataset_dir)
+
+
 def load_records_and_test_equality(
     version: str, dataset_name: str, record_set_name: str, num_records: int
 ):

diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/source.py
@@ -266,7 +266,7 @@ def from_jsonld(cls, ctx: Context, jsonld: Any) -> Source:
                         ]
                     ctx.issues.add_error(
                         f"Every {constants.ML_COMMONS_SOURCE(ctx)} should declare"
-                        f" either {' or '.join(mandatory_fields_in_source)}"
+                        f" either {' or '.join(mandatory_fields_in_source)}."
                     )
                 # Safely access and check "file_property" from JSON-LD.
                 file_property = data_extraction.get(

diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_id/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_id/metadata.json
@@ -0,0 +1,91 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "citeAs": "cr:citeAs",
+    "column": "cr:column",
+    "conformsTo": "dct:conformsTo",
+    "cr": "http://mlcommons.org/croissant/",
+    "data": {
+      "@id": "cr:data",
+      "@type": "@json"
+    },
+    "dataBiases": "cr:dataBiases",
+    "dataCollection": "cr:dataCollection",
+    "dataType": {
+      "@id": "cr:dataType",
+      "@type": "@vocab"
+    },
+    "dct": "http://purl.org/dc/terms/",
+    "extract": "cr:extract",
+    "field": "cr:field",
+    "fileProperty": "cr:fileProperty",
+    "fileObject": "cr:fileObject",
+    "fileSet": "cr:fileSet",
+    "format": "cr:format",
+    "includes": "cr:includes",
+    "isEnumeration": "cr:isEnumeration",
+    "isLiveDataset": "cr:isLiveDataset",
+    "jsonPath": "cr:jsonPath",
+    "key": "cr:key",
+    "md5": "cr:md5",
+    "parentField": "cr:parentField",
+    "path": "cr:path",
+    "personalSensitiveInformation": "cr:personalSensitiveInformation",
+    "recordSet": "cr:recordSet",
+    "references": "cr:references",
+    "regex": "cr:regex",
+    "repeated": "cr:repeated",
+    "replace": "cr:replace",
+    "sc": "https://schema.org/",
+    "separator": "cr:separator",
+    "source": "cr:source",
+    "subField": "cr:subField",
+    "transform": "cr:transform"
+  },
+  "@type": "sc:Dataset",
+  "name": "mydataset",
+  "description": "This is a description.",
+  "conformsTo": "http://mlcommons.org/croissant/1.0",
+  "datePublished": "1990-02-01",
+  "@language": "en",
+  "citeAs": "This is a citation.",
+  "license": "This is a license.",
+  "url": "https://www.google.com/dataset",
+  "version": "1.0.0",
+  "distribution": [
+    {
+      "@type": "cr:FileObject",
+      "@id": "ids should not contain spaces",
+      "name": "distribution name",
+      "contentSize": "117743 B",
+      "contentUrl": "https://www.google.com/data.csv",
+      "encodingFormat": "text/csv",
+      "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
+    }
+  ],
+  "recordSet": [
+    {
+      "@type": "cr:RecordSet",
+      "@id": "a-record-set",
+      "name": "a-record-set",
+      "description": "This is a record set.",
+      "field": [
+        {
+          "@id": "a-record-set/first-field",
+          "@type": "cr:Field",
+          "name": "first-field",
+          "dataType": "sc:Integer",
+          "source": {
+            "extract": {
+              "column": "a-column"
+            },
+            "fileObject": {
+              "@id": "ids should not contain spaces"
+            }
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_id/output.txt b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_id/output.txt
@@ -0,0 +1,6 @@
+Found the following 3 error(s) during the validation:
+  -  Every http://mlcommons.org/croissant/source should declare either http://mlcommons.org/croissant/field or http://mlcommons.org/croissant/fileObject or http://mlcommons.org/croissant/fileSet.
+  -  The dataset contains a wrong `@id`: 'ids should not contain spaces'. Note that currently we do not support `@id`s containing whitespaces (not even if URL-escaped).
+  -  [Metadata(mydataset) > RecordSet(a-record-set) > Field(first-field)] Node "a-record-set/first-field" is a field and has no source. Please, use http://mlcommons.org/croissant/source to specify the source.
+Found the following 1 warning(s) during the validation:
+  -  [Metadata(mydataset) > RecordSet(a-record-set) > Field(first-field)] Property "https://schema.org/description" is recommended, but does not exist.