Skip to content

Commit

Permalink
Improve error messages with badly formatted @ids (#584)
Browse files Browse the repository at this point in the history
  • Loading branch information
ccl-core authored Mar 7, 2024
1 parent 23837f7 commit c851047
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 5 deletions.
5 changes: 5 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
import dataclasses
from typing import Any

# Hint text about malformed `@id`s: it states that `@id`s should be valid URIs
# and that reserved characters should be URL-encoded.
WRONG_ID_MSG = (
"Note that `@id`s should be valid URIs. If you wish to use reserved characters in"
" your `@id`s, you should URL-encode them."
)


class ValidationError(Exception):
"""Error during the validation of the format."""
Expand Down
20 changes: 20 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/json_ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import json
import re
from typing import Any

import rdflib
Expand All @@ -30,6 +31,7 @@
from mlcroissant._src.core.rdf import make_context
from mlcroissant._src.core.types import Json

# Pattern used to detect `@id`s that contain at least one whitespace character.
_ID_REGEX = re.compile(r".*\s+.*")
# IRI prefixes; the first two come from `constants`, the last is the Wikidata
# wiki base URL.
_DCTERMS_PREFIX = constants.DCTERMS
_SCHEMA_ORG_PREFIX = constants.SCHEMA_ORG
_WD_PREFIX = "https://www.wikidata.org/wiki/"
Expand Down Expand Up @@ -168,13 +170,31 @@ def recursively_populate_jsonld(entry_node: Json, id_to_node: dict[str, Json]) -
return entry_node


def check_valid_ids(data: Json, ctx: Context) -> None:
    """Checks that the given json contains valid `@id`s.

    Walks `data` recursively through nested dicts and lists. Every string `@id`
    that contains whitespace is reported through `ctx.issues.add_error`; the
    function itself never raises for a malformed `@id`.

    Args:
        data: The JSON value to inspect. Values that are neither a dict nor a
            list are ignored.
        ctx: The context whose `issues` collects the reported errors.
    """
    if isinstance(data, list):
        # Lists may appear at the top level or nested inside other lists.
        for item in data:
            check_valid_ids(item, ctx)
        return
    if not isinstance(data, dict):
        return
    for key, value in data.items():
        # Only strings can be matched against the regex: a non-string `@id`
        # (number, null, object...) would make the match raise a TypeError.
        if key == "@id" and isinstance(value, str) and _ID_REGEX.match(value):
            ctx.issues.add_error(
                f"The dataset contains a wrong `@id`: '{value}'. Note that currently we"
                " do not support `@id`s containing whitespaces (not even if"
                " URL-escaped)."
            )
        if isinstance(value, (dict, list)):
            check_valid_ids(value, ctx)


def expand_jsonld(data: Json, ctx: Context) -> Json:
"""Expands a Croissant JSON to a nested JSON-LD with expanded.
For this we use RDFLib. RDFLib expands the CURIE of the form "rdf:type" into their
full expression, but RDFLib also flattens the JSON-LD in a list of nodes. We then
need to reconstruct the hierarchy.
"""
check_valid_ids(data=data, ctx=ctx)
context = get_context(data)
if "@base" not in context:
context["@base"] = constants.BASE_IRI
Expand Down
10 changes: 6 additions & 4 deletions python/mlcroissant/mlcroissant/_src/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,12 @@ def records(self, record_set: str) -> Records:
"""Accesses all records in `record_set` if it exists."""
if not any(rs for rs in self.metadata.record_sets if rs.name == record_set):
names = [record_set.name for record_set in self.metadata.record_sets]
raise ValueError(
f"did not find any record set with the name {record_set}. Possible"
f" RecordSets: {names}"
)
error_msg = f"did not find any record set with the name `{record_set}`. "
if not names:
error_msg += "This dataset declares no record sets."
else:
error_msg += f"Possible RecordSets: {names}"
raise ValueError(error_msg)
return Records(self, record_set, debug=self.debug)


Expand Down
14 changes: 14 additions & 0 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,20 @@ def test_static_analysis_0_8(folder):
assert str(error_info.value) == get_error_msg(base_path / folder)


# Tests for 1.0-datasets only.
@pytest.mark.parametrize(
    "folder",
    [
        "distribution_bad_id",
    ],
)
def test_static_analysis_1_0(folder):
    """Loading an invalid 1.0 dataset raises the expected validation message."""
    dataset_dir = epath.Path(__file__).parent / "tests/graphs/1.0" / folder
    with pytest.raises(ValidationError) as raised:
        datasets.Dataset(dataset_dir / "metadata.json")
    expected_message = get_error_msg(dataset_dir)
    assert str(raised.value) == expected_message


def load_records_and_test_equality(
version: str, dataset_name: str, record_set_name: str, num_records: int
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def from_jsonld(cls, ctx: Context, jsonld: Any) -> Source:
]
ctx.issues.add_error(
f"Every {constants.ML_COMMONS_SOURCE(ctx)} should declare"
f" either {' or '.join(mandatory_fields_in_source)}"
f" either {' or '.join(mandatory_fields_in_source)}."
)
# Safely access and check "file_property" from JSON-LD.
file_property = data_extraction.get(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isEnumeration": "cr:isEnumeration",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform"
},
"@type": "sc:Dataset",
"name": "mydataset",
"description": "This is a description.",
"conformsTo": "http://mlcommons.org/croissant/1.0",
"datePublished": "1990-02-01",
"@language": "en",
"citeAs": "This is a citation.",
"license": "This is a license.",
"url": "https://www.google.com/dataset",
"version": "1.0.0",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "ids should not contain spaces",
"name": "distribution name",
"contentSize": "117743 B",
"contentUrl": "https://www.google.com/data.csv",
"encodingFormat": "text/csv",
"sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
}
],
"recordSet": [
{
"@type": "cr:RecordSet",
"@id": "a-record-set",
"name": "a-record-set",
"description": "This is a record set.",
"field": [
{
"@id": "a-record-set/first-field",
"@type": "cr:Field",
"name": "first-field",
"dataType": "sc:Integer",
"source": {
"extract": {
"column": "a-column"
},
"fileObject": {
"@id": "ids should not contain spaces"
}
}
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Found the following 3 error(s) during the validation:
- Every http://mlcommons.org/croissant/source should declare either http://mlcommons.org/croissant/field or http://mlcommons.org/croissant/fileObject or http://mlcommons.org/croissant/fileSet.
- The dataset contains a wrong `@id`: 'ids should not contain spaces'. Note that currently we do not support `@id`s containing whitespaces (not even if URL-escaped).
- [Metadata(mydataset) > RecordSet(a-record-set) > Field(first-field)] Node "a-record-set/first-field" is a field and has no source. Please, use http://mlcommons.org/croissant/source to specify the source.
Found the following 1 warning(s) during the validation:
- [Metadata(mydataset) > RecordSet(a-record-set) > Field(first-field)] Property "https://schema.org/description" is recommended, but does not exist.

0 comments on commit c851047

Please sign in to comment.