Skip to content

Commit

Permalink
Cleaning after implementing PEP 0681. (#589)
Browse files Browse the repository at this point in the history
  • Loading branch information
marcenacp authored Mar 8, 2024
1 parent c851047 commit 7cb999e
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 58 deletions.
2 changes: 1 addition & 1 deletion python/mlcroissant/mlcroissant/_src/core/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Metadata(TypedDict):
description: str
from_jsonld: Callable[[Context, Json], Any] | None
input_types: list[Any]
to_jsonld: Callable[[Context, Json], Any] | None
to_jsonld: Callable[[Context, Any], Json] | None
required: bool
url: term.URIRef | Callable[[Context], term.URIRef]

Expand Down
54 changes: 54 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/dataclasses_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""dataclasses_test module."""

from rdflib.namespace import SDO

from mlcroissant._src.core import dataclasses as mlc_dataclasses
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.context import CroissantVersion


def test_dataclass():
    """Checks that `jsonld_fields` exposes the metadata given to `jsonld_field`.

    The fields are inspected both on an instance and on the class itself, as
    `jsonld_fields` is called with either in the code base.
    """

    def url(ctx):
        # The URL depends on the Croissant version carried by the context.
        return "http://foo.org" if ctx.is_v0() else "http://bar.org"

    @mlc_dataclasses.dataclass
    class SomeNode:
        field1: int = mlc_dataclasses.jsonld_field(description="The first field")
        field2: str = mlc_dataclasses.jsonld_field(
            cardinality="MANY",
            description="The second field",
            input_types=[SDO.Text],
            required=True,
            url=url,
        )

    node = SomeNode(field1=42, field2="foo")
    for cls_or_instance in [node, SomeNode]:
        field1, field2 = mlc_dataclasses.jsonld_fields(cls_or_instance)

        # Field #1: only `description` is set, everything else is a default.
        assert field1.name == "field1"
        assert field1.cardinality == "ONE"
        assert field1.description == "The first field"
        assert field1.from_jsonld is None
        assert field1.input_types == []
        assert field1.to_jsonld is None
        assert not field1.required
        assert field1.url is None

        # Field #2: explicit metadata, including a context-dependent URL.
        assert field2.name == "field2"
        assert field2.cardinality == "MANY"
        assert field2.description == "The second field"
        assert field2.from_jsonld is None
        assert field2.input_types == [SDO.Text]
        assert field2.to_jsonld is None
        assert field2.required
        assert field2.url == url
        assert (
            field2.call_url(Context(conforms_to=CroissantVersion.V_0_8))
            == "http://foo.org"
        )
        assert (
            field2.call_url(Context(conforms_to=CroissantVersion.V_1_0))
            == "http://bar.org"
        )
41 changes: 17 additions & 24 deletions python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

from __future__ import annotations

import abc
import dataclasses
import re
from typing import Any
from typing import Any, Callable

from rdflib import term

from mlcroissant._src.core import constants
from mlcroissant._src.core import dataclasses as mlc_dataclasses
Expand All @@ -27,7 +28,7 @@


@dataclasses.dataclass(eq=False, repr=False)
class Node(abc.ABC):
class Node:
"""Structure node in Croissant.
This generic class will be inherited by the actual Croissant nodes:
Expand Down Expand Up @@ -213,17 +214,6 @@ def issues(self) -> Issues:
"""Shortcut to access issues in node."""
return self.ctx.issues

@abc.abstractmethod
def to_json(self) -> Json:
"""Converts the node to JSON."""
...

@classmethod
@abc.abstractmethod
def from_jsonld(cls, *args, **kwargs) -> Any:
"""Creates a node from JSON-LD."""
...

def validate_name(self):
"""Validates the name."""
name = self.name
Expand Down Expand Up @@ -280,15 +270,11 @@ def __deepcopy__(self, memo):
memo[id(self)] = copy
return copy


class NodeV2(Node):
"""Extends Node. When the migration is complete, merge `Node` and `NodeV2`."""

def to_json(self) -> Json:
"""Converts the Python class to JSON."""
cls = self.__class__
jsonld = {
"@type": self.ctx.rdf.shorten_value(cls._JSONLD_TYPE(self.ctx)),
"@type": self.ctx.rdf.shorten_value(cls._jsonld_type(self.ctx)),
"@id": None if self.ctx.is_v0() else self.id,
}
for field in mlc_dataclasses.jsonld_fields(self):
Expand All @@ -305,14 +291,14 @@ def to_json(self) -> Json:
@classmethod
def from_jsonld(cls, ctx: Context, jsonld: Json):
"""Creates a Python class from JSON-LD."""
if cls._JSONLD_TYPE(ctx) == constants.SCHEMA_ORG_DATASET:
if cls._jsonld_type(ctx) == constants.SCHEMA_ORG_DATASET:
# For `Metadata` node, insert the conforms_to in the context:
ctx.conforms_to = CroissantVersion.from_jsonld(
ctx, jsonld.get(constants.DCTERMS_CONFORMS_TO)
)
if isinstance(jsonld, list):
return [cls.from_jsonld(ctx, el) for el in jsonld]
check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx))
check_expected_type(ctx.issues, jsonld, cls._jsonld_type(ctx))
kwargs = {}
for field in mlc_dataclasses.jsonld_fields(cls):
url = field.call_url(ctx)
Expand All @@ -328,7 +314,14 @@ def from_jsonld(cls, ctx: Context, jsonld: Json):
**kwargs,
)

JSONLD_TYPE: Callable[[Context], term.URIRef] | term.URIRef | None = None

@classmethod
def _JSONLD_TYPE(cls, ctx: Context):
del ctx
raise NotImplementedError("Output the right JSON-LD type.")
def _jsonld_type(cls, ctx: Context) -> term.URIRef:
    """Gets the actual JSON-LD type according to the ctx.

    `JSONLD_TYPE` may be declared on the class either as a constant or as a
    callable taking the context.

    Args:
        ctx: The context passed to `JSONLD_TYPE` when it is a callable.

    Returns:
        The value of `JSONLD_TYPE`, or the result of calling it with `ctx`.

    Raises:
        NotImplementedError: If the class does not define `JSONLD_TYPE`.
    """
    jsonld_type = cls.JSONLD_TYPE
    if jsonld_type is None:
        raise NotImplementedError("Output the right JSON-LD type.")
    if callable(jsonld_type):
        return jsonld_type(ctx)
    return jsonld_type
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from mlcroissant._src.core.data_types import EXPECTED_DATA_TYPES
from mlcroissant._src.core.json_ld import remove_empty_values
from mlcroissant._src.core.types import Json
from mlcroissant._src.structure_graph.base_node import NodeV2
from mlcroissant._src.structure_graph.base_node import Node
from mlcroissant._src.structure_graph.nodes.source import Source


Expand Down Expand Up @@ -62,7 +62,7 @@ def to_json(self, ctx: Context) -> Json:


@mlc_dataclasses.dataclass
class Field(NodeV2):
class Field(Node):
"""Nodes to describe a dataset Field."""

description: str | None = mlc_dataclasses.jsonld_field(
Expand Down Expand Up @@ -152,9 +152,7 @@ def __post_init__(self):
self.source.check_source(self.add_error)
self._standardize_data_types()

@classmethod
def _JSONLD_TYPE(cls, ctx: Context):
return constants.ML_COMMONS_FIELD_TYPE(ctx)
JSONLD_TYPE = constants.ML_COMMONS_FIELD_TYPE

def _standardize_data_types(self):
"""Converts data_types to a list of rdflib.URIRef."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@
from mlcroissant._src.core import constants
from mlcroissant._src.core import dataclasses as mlc_dataclasses
from mlcroissant._src.core.constants import ML_COMMONS
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.uuid import formatted_uuid_to_json
from mlcroissant._src.core.uuid import uuid_from_jsonld
from mlcroissant._src.structure_graph.base_node import NodeV2
from mlcroissant._src.structure_graph.base_node import Node
from mlcroissant._src.structure_graph.nodes.source import Source


@mlc_dataclasses.dataclass
class FileObject(NodeV2):
class FileObject(Node):
"""Nodes to describe a dataset FileObject (distribution)."""

content_url: str | None = mlc_dataclasses.jsonld_field(
Expand Down Expand Up @@ -114,7 +113,4 @@ def __post_init__(self):
if self.ctx and not self.ctx.is_live_dataset:
self.assert_has_exclusive_properties(["md5", "sha256"])

@classmethod
def _JSONLD_TYPE(cls, ctx: Context):
"""Gets the class' JSON-LD @type."""
return constants.SCHEMA_ORG_FILE_OBJECT(ctx)
JSONLD_TYPE = constants.SCHEMA_ORG_FILE_OBJECT
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@

from mlcroissant._src.core import constants
from mlcroissant._src.core import dataclasses as mlc_dataclasses
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.uuid import formatted_uuid_to_json
from mlcroissant._src.core.uuid import uuid_from_jsonld
from mlcroissant._src.structure_graph.base_node import NodeV2
from mlcroissant._src.structure_graph.base_node import Node


@mlc_dataclasses.dataclass
class FileSet(NodeV2):
class FileSet(Node):
"""Nodes to describe a dataset FileSet (distribution)."""

contained_in: list[str] | None = mlc_dataclasses.jsonld_field(
Expand Down Expand Up @@ -75,7 +74,4 @@ def __post_init__(self):
self.validate_name()
self.assert_has_mandatory_properties("includes", "encoding_format", uuid_field)

@classmethod
def _JSONLD_TYPE(cls, ctx: Context):
"""Gets the class' JSON-LD @type."""
return constants.SCHEMA_ORG_FILE_SET(ctx)
JSONLD_TYPE = constants.SCHEMA_ORG_FILE_SET
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from mlcroissant._src.core.rdf import Rdf
from mlcroissant._src.core.types import Json
from mlcroissant._src.core.url import is_url
from mlcroissant._src.structure_graph.base_node import NodeV2
from mlcroissant._src.structure_graph.base_node import Node
from mlcroissant._src.structure_graph.graph import from_file_to_json
from mlcroissant._src.structure_graph.graph import from_nodes_to_graph
from mlcroissant._src.structure_graph.nodes.field import Field
Expand Down Expand Up @@ -118,7 +118,7 @@ def _distribution_to_json(ctx: Context, distribution: list[FileObject | FileSet]


@mlc_dataclasses.dataclass
class Metadata(NodeV2):
class Metadata(Node):
"""Nodes to describe a dataset metadata."""

cite_as: str | None = mlc_dataclasses.jsonld_field(
Expand Down Expand Up @@ -407,10 +407,7 @@ def __post_init__(self):
self.ctx, self.ctx.conforms_to
)

@classmethod
def _JSONLD_TYPE(cls, ctx: Context):
del ctx
return constants.SCHEMA_ORG_DATASET
JSONLD_TYPE = constants.SCHEMA_ORG_DATASET

def to_json(self) -> Json:
"""Converts the `Metadata` to JSON."""
Expand Down Expand Up @@ -441,9 +438,9 @@ def file_sets(self) -> list[FileSet]:
file_set for file_set in self.distribution if isinstance(file_set, FileSet)
]

def nodes(self) -> list[NodeV2]:
def nodes(self) -> list[Node]:
"""List all nodes in metadata."""
nodes: list[NodeV2] = [self]
nodes: list[Node] = [self]
nodes.extend(self.distribution)
nodes.extend(self.record_sets)
for record_set in self.record_sets:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from mlcroissant._src.core import dataclasses as mlc_dataclasses
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.types import Json
from mlcroissant._src.structure_graph.base_node import NodeV2
from mlcroissant._src.structure_graph.base_node import Node
from mlcroissant._src.structure_graph.nodes.field import Field


Expand All @@ -28,7 +28,7 @@ def data_from_jsonld(ctx: Context, data) -> Json | None:


@mlc_dataclasses.dataclass
class RecordSet(NodeV2):
class RecordSet(Node):
"""Nodes to describe a dataset RecordSet."""

data: list[Json] | None = mlc_dataclasses.jsonld_field(
Expand Down Expand Up @@ -147,10 +147,7 @@ def check_joins_in_fields(self):
" documentation for more information."
)

@classmethod
def _JSONLD_TYPE(cls, ctx: Context):
"""Gets the class' JSON-LD @type."""
return constants.ML_COMMONS_RECORD_SET_TYPE(ctx)
JSONLD_TYPE = constants.ML_COMMONS_RECORD_SET_TYPE


def get_parent_uuid(ctx: Context, uuid: str) -> str:
Expand Down

0 comments on commit 7cb999e

Please sign in to comment.