diff --git a/python/mlcroissant/mlcroissant/_src/core/dataclasses.py b/python/mlcroissant/mlcroissant/_src/core/dataclasses.py index 54f8db85a..3e4563b9a 100644 --- a/python/mlcroissant/mlcroissant/_src/core/dataclasses.py +++ b/python/mlcroissant/mlcroissant/_src/core/dataclasses.py @@ -23,7 +23,7 @@ class Metadata(TypedDict): description: str from_jsonld: Callable[[Context, Json], Any] | None input_types: list[Any] - to_jsonld: Callable[[Context, Json], Any] | None + to_jsonld: Callable[[Context, Any], Json] | None required: bool url: term.URIRef | Callable[[Context], term.URIRef] diff --git a/python/mlcroissant/mlcroissant/_src/core/dataclasses_test.py b/python/mlcroissant/mlcroissant/_src/core/dataclasses_test.py new file mode 100644 index 000000000..a2d9630f1 --- /dev/null +++ b/python/mlcroissant/mlcroissant/_src/core/dataclasses_test.py @@ -0,0 +1,54 @@ +"""dataclasses_test module.""" + +from rdflib.namespace import SDO + +from mlcroissant._src.core import dataclasses as mlc_dataclasses +from mlcroissant._src.core.context import Context +from mlcroissant._src.core.context import CroissantVersion + + +def test_dataclass(): + url = lambda ctx: "http://foo.org" if ctx.is_v0() else "http://bar.org" + + @mlc_dataclasses.dataclass + class SomeNode: + field1: int = mlc_dataclasses.jsonld_field(description="The first field") + field2: str = mlc_dataclasses.jsonld_field( + cardinality="MANY", + description="The second field", + input_types=[SDO.Text], + required=True, + url=url, + ) + + node = SomeNode(field1=42, field2="foo") + for cls_or_instance in [node, SomeNode]: + field1, field2 = list(mlc_dataclasses.jsonld_fields(cls_or_instance)) + + # Field #1 + assert field1.name == "field1" + assert field1.cardinality == "ONE" + assert field1.description == "The first field" + assert field1.from_jsonld == None + assert field1.input_types == [] + assert field1.to_jsonld == None + assert field1.required == False + assert field1.url == None + + # Field #2 + assert 
field2.name == "field2" + assert field2.cardinality == "MANY" + assert field2.description == "The second field" + assert field2.from_jsonld == None + assert field2.input_types == [SDO.Text] + assert field2.to_jsonld == None + assert field2.required == True + assert field2.url == url + assert ( + field2.call_url(Context(conforms_to=CroissantVersion.V_0_8)) + == "http://foo.org" + ) + assert ( + field2.call_url(Context(conforms_to=CroissantVersion.V_1_0)) + == "http://bar.org" + ) diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py b/python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py index d1eaaf92d..7b49e814e 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py @@ -2,10 +2,11 @@ from __future__ import annotations -import abc import dataclasses import re -from typing import Any +from typing import Any, Callable + +from rdflib import term from mlcroissant._src.core import constants from mlcroissant._src.core import dataclasses as mlc_dataclasses @@ -27,7 +28,7 @@ @dataclasses.dataclass(eq=False, repr=False) -class Node(abc.ABC): +class Node: """Structure node in Croissant. This generic class will be inherited by the actual Croissant nodes: @@ -213,17 +214,6 @@ def issues(self) -> Issues: """Shortcut to access issues in node.""" return self.ctx.issues - @abc.abstractmethod - def to_json(self) -> Json: - """Converts the node to JSON.""" - ... - - @classmethod - @abc.abstractmethod - def from_jsonld(cls, *args, **kwargs) -> Any: - """Creates a node from JSON-LD.""" - ... - def validate_name(self): """Validates the name.""" name = self.name @@ -280,15 +270,11 @@ def __deepcopy__(self, memo): memo[id(self)] = copy return copy - -class NodeV2(Node): - """Extends Node. 
When the migration is complete, merge `Node` and `NodeV2`.""" - def to_json(self) -> Json: """Converts the Python class to JSON.""" cls = self.__class__ jsonld = { - "@type": self.ctx.rdf.shorten_value(cls._JSONLD_TYPE(self.ctx)), + "@type": self.ctx.rdf.shorten_value(cls._jsonld_type(self.ctx)), "@id": None if self.ctx.is_v0() else self.id, } for field in mlc_dataclasses.jsonld_fields(self): @@ -305,14 +291,14 @@ def to_json(self) -> Json: @classmethod def from_jsonld(cls, ctx: Context, jsonld: Json): """Creates a Python class from JSON-LD.""" - if cls._JSONLD_TYPE(ctx) == constants.SCHEMA_ORG_DATASET: + if cls._jsonld_type(ctx) == constants.SCHEMA_ORG_DATASET: # For `Metadata` node, insert the conforms_to in the context: ctx.conforms_to = CroissantVersion.from_jsonld( ctx, jsonld.get(constants.DCTERMS_CONFORMS_TO) ) if isinstance(jsonld, list): return [cls.from_jsonld(ctx, el) for el in jsonld] - check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx)) + check_expected_type(ctx.issues, jsonld, cls._jsonld_type(ctx)) kwargs = {} for field in mlc_dataclasses.jsonld_fields(cls): url = field.call_url(ctx) @@ -328,7 +314,14 @@ def from_jsonld(cls, ctx: Context, jsonld: Json): **kwargs, ) + JSONLD_TYPE: Callable[[Context], term.URIRef] | term.URIRef | None = None + @classmethod - def _JSONLD_TYPE(cls, ctx: Context): - del ctx - raise NotImplementedError("Output the right JSON-LD type.") + def _jsonld_type(cls, ctx: Context): + """Get the actual JSON-LD type according to the ctx.""" + if cls.JSONLD_TYPE is None: + raise NotImplementedError("Output the right JSON-LD type.") + elif callable(cls.JSONLD_TYPE): + return cls.JSONLD_TYPE(ctx) + else: + return cls.JSONLD_TYPE diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py index 3012e4758..66724e70f 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py +++ 
b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py @@ -14,7 +14,7 @@ from mlcroissant._src.core.data_types import EXPECTED_DATA_TYPES from mlcroissant._src.core.json_ld import remove_empty_values from mlcroissant._src.core.types import Json -from mlcroissant._src.structure_graph.base_node import NodeV2 +from mlcroissant._src.structure_graph.base_node import Node from mlcroissant._src.structure_graph.nodes.source import Source @@ -62,7 +62,7 @@ def to_json(self, ctx: Context) -> Json: @mlc_dataclasses.dataclass -class Field(NodeV2): +class Field(Node): """Nodes to describe a dataset Field.""" description: str | None = mlc_dataclasses.jsonld_field( @@ -152,9 +152,7 @@ def __post_init__(self): self.source.check_source(self.add_error) self._standardize_data_types() - @classmethod - def _JSONLD_TYPE(cls, ctx: Context): - return constants.ML_COMMONS_FIELD_TYPE(ctx) + JSONLD_TYPE = constants.ML_COMMONS_FIELD_TYPE def _standardize_data_types(self): """Converts data_types to a list of rdflib.URIRef.""" diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py index d587342d2..792c1a550 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py @@ -7,15 +7,14 @@ from mlcroissant._src.core import constants from mlcroissant._src.core import dataclasses as mlc_dataclasses from mlcroissant._src.core.constants import ML_COMMONS -from mlcroissant._src.core.context import Context from mlcroissant._src.core.uuid import formatted_uuid_to_json from mlcroissant._src.core.uuid import uuid_from_jsonld -from mlcroissant._src.structure_graph.base_node import NodeV2 +from mlcroissant._src.structure_graph.base_node import Node from mlcroissant._src.structure_graph.nodes.source import Source @mlc_dataclasses.dataclass -class FileObject(NodeV2): +class FileObject(Node): 
"""Nodes to describe a dataset FileObject (distribution).""" content_url: str | None = mlc_dataclasses.jsonld_field( @@ -114,7 +113,4 @@ def __post_init__(self): if self.ctx and not self.ctx.is_live_dataset: self.assert_has_exclusive_properties(["md5", "sha256"]) - @classmethod - def _JSONLD_TYPE(cls, ctx: Context): - """Gets the class' JSON-LD @type.""" - return constants.SCHEMA_ORG_FILE_OBJECT(ctx) + JSONLD_TYPE = constants.SCHEMA_ORG_FILE_OBJECT diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py index 8103c134f..23540680b 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py @@ -6,14 +6,13 @@ from mlcroissant._src.core import constants from mlcroissant._src.core import dataclasses as mlc_dataclasses -from mlcroissant._src.core.context import Context from mlcroissant._src.core.uuid import formatted_uuid_to_json from mlcroissant._src.core.uuid import uuid_from_jsonld -from mlcroissant._src.structure_graph.base_node import NodeV2 +from mlcroissant._src.structure_graph.base_node import Node @mlc_dataclasses.dataclass -class FileSet(NodeV2): +class FileSet(Node): """Nodes to describe a dataset FileSet (distribution).""" contained_in: list[str] | None = mlc_dataclasses.jsonld_field( @@ -75,7 +74,4 @@ def __post_init__(self): self.validate_name() self.assert_has_mandatory_properties("includes", "encoding_format", uuid_field) - @classmethod - def _JSONLD_TYPE(cls, ctx: Context): - """Gets the class' JSON-LD @type.""" - return constants.SCHEMA_ORG_FILE_SET(ctx) + JSONLD_TYPE = constants.SCHEMA_ORG_FILE_SET diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py index 90feb35f5..f13987be9 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py 
+++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/metadata.py @@ -24,7 +24,7 @@ from mlcroissant._src.core.rdf import Rdf from mlcroissant._src.core.types import Json from mlcroissant._src.core.url import is_url -from mlcroissant._src.structure_graph.base_node import NodeV2 +from mlcroissant._src.structure_graph.base_node import Node from mlcroissant._src.structure_graph.graph import from_file_to_json from mlcroissant._src.structure_graph.graph import from_nodes_to_graph from mlcroissant._src.structure_graph.nodes.field import Field @@ -118,7 +118,7 @@ def _distribution_to_json(ctx: Context, distribution: list[FileObject | FileSet] @mlc_dataclasses.dataclass -class Metadata(NodeV2): +class Metadata(Node): """Nodes to describe a dataset metadata.""" cite_as: str | None = mlc_dataclasses.jsonld_field( @@ -407,10 +407,7 @@ def __post_init__(self): self.ctx, self.ctx.conforms_to ) - @classmethod - def _JSONLD_TYPE(cls, ctx: Context): - del ctx - return constants.SCHEMA_ORG_DATASET + JSONLD_TYPE = constants.SCHEMA_ORG_DATASET def to_json(self) -> Json: """Converts the `Metadata` to JSON.""" @@ -441,9 +438,9 @@ def file_sets(self) -> list[FileSet]: file_set for file_set in self.distribution if isinstance(file_set, FileSet) ] - def nodes(self) -> list[NodeV2]: + def nodes(self) -> list[Node]: """List all nodes in metadata.""" - nodes: list[NodeV2] = [self] + nodes: list[Node] = [self] nodes.extend(self.distribution) nodes.extend(self.record_sets) for record_set in self.record_sets: diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py index ddfc06247..c18aaf875 100644 --- a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py +++ b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/record_set.py @@ -11,7 +11,7 @@ from mlcroissant._src.core import dataclasses as mlc_dataclasses from mlcroissant._src.core.context import Context 
from mlcroissant._src.core.types import Json -from mlcroissant._src.structure_graph.base_node import NodeV2 +from mlcroissant._src.structure_graph.base_node import Node from mlcroissant._src.structure_graph.nodes.field import Field @@ -28,7 +28,7 @@ def data_from_jsonld(ctx: Context, data) -> Json | None: @mlc_dataclasses.dataclass -class RecordSet(NodeV2): +class RecordSet(Node): """Nodes to describe a dataset RecordSet.""" data: list[Json] | None = mlc_dataclasses.jsonld_field( @@ -147,10 +147,7 @@ def check_joins_in_fields(self): " documentation for more information." ) - @classmethod - def _JSONLD_TYPE(cls, ctx: Context): - """Gets the class' JSON-LD @type.""" - return constants.ML_COMMONS_RECORD_SET_TYPE(ctx) + JSONLD_TYPE = constants.ML_COMMONS_RECORD_SET_TYPE def get_parent_uuid(ctx: Context, uuid: str) -> str: