Add automatic quality checks in GitHub Actions for the editor. (#321)
```
pip install --upgrade black isort
make format
```
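As a rough, hypothetical sketch (the actual `make format` target is not part of this commit and may differ), the command above presumably reformats both packages with the same Black and isort options that the CI checks below enforce:

```python
# Hypothetical sketch of what `make format` presumably runs; the real Makefile
# target is not shown in this commit and may differ.
import subprocess

for target in ("python/mlcroissant", "wizard"):
    # Reformat with Black using the line length and preview mode enforced by CI.
    subprocess.run(["black", "--line-length", "88", "--preview", target], check=True)
    # Sort imports with the google profile enforced by CI.
    subprocess.run(["isort", "--profile", "google", "--line-length", "88", target], check=True)
```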
marcenacp authored Nov 10, 2023
1 parent 29b98a5 commit 54df95d
Showing 34 changed files with 333 additions and 253 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -1,4 +1,4 @@
-name: Continuous Integration (CI)
+name: Continuous Integration - mlcroissant (CI)

on:
  push:
@@ -92,7 +92,7 @@ jobs:
          python-version: '3.11'
      - uses: psf/black@stable
        with:
-          options: --line-length 88 --preview --experimental-string-processing
+          options: --check --line-length 88 --preview
          src: './python/mlcroissant'
      - uses: isort/isort-action@master
        with:
29 changes: 29 additions & 0 deletions .github/workflows/editor.yml
@@ -0,0 +1,29 @@
name: Continuous Integration - Editor (CI)

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  merge_group:

jobs:
  python-format:
    name: Python format
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: '3.11'
      - uses: psf/black@stable
        with:
          options: --check --line-length 88 --preview
          src: './wizard'
      - uses: isort/isort-action@master
        with:
          sort-paths: './wizard'
          configuration: --profile google --line-length 88 --use-parentheses --project mlcroissant --project components --project core --project views --project state --project utils --multi-line 3 --thirdparty datasets
          requirements-files: 'wizard/requirements.txt'
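For context, the isort configuration above declares `mlcroissant`, `components`, `core`, `views`, `state`, and `utils` as first-party packages and `datasets` as third-party. In a hypothetical wizard module (illustration only, not a file from this commit), imports would therefore end up grouped roughly like this:

```python
# Standard-library imports come first.
import json

# Third-party imports come second ("datasets" is declared via --thirdparty).
import datasets
import pandas as pd

# First-party imports (the --project packages) come last, sorted within the section.
from core.state import Metadata  # hypothetical module and symbol
import mlcroissant as mlc
from utils import init_state  # hypothetical module and symbol
```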
15 changes: 13 additions & 2 deletions .vscode/settings.json
@@ -26,17 +26,28 @@
  "black-formatter.args": [
    "--line-length",
    "88",
-    "--preview",
-    "--experimental-string-processing"
+    "--preview"
  ],
  "isort.args": [
    "--profile",
    "google",
    "--line-length",
    "88",
    "--use-parentheses",
+    // For python/mlcroissant/
    "--project",
    "mlcroissant",
+    // For wizard/
+    "--project",
+    "components",
+    "--project",
+    "core",
+    "--project",
+    "views",
+    "--project",
+    "state",
+    "--project",
+    "utils",
    "--multi-line",
    "3",
    "--thirdparty",
10 changes: 6 additions & 4 deletions python/mlcroissant/mlcroissant/_src/core/git.py
@@ -37,7 +37,9 @@ def download_git_lfs_file(file: Path):
    try:
        repo.execute(["git", "lfs", "pull", "--include", fullpath])
    except deps.git.exc.GitCommandError as ex:
-        raise RuntimeError("Problem when launching `git lfs`. "
-                           "Possible problems: Have you installed git lfs "
-                           f"locally? Is '{fullpath}' a valid `git lfs` "
-                           "repository?") from ex
+        raise RuntimeError(
+            "Problem when launching `git lfs`. "
+            "Possible problems: Have you installed git lfs "
+            f"locally? Is '{fullpath}' a valid `git lfs` "
+            "repository?"
+        ) from ex
14 changes: 5 additions & 9 deletions python/mlcroissant/mlcroissant/_src/core/git_test.py
@@ -11,19 +11,15 @@
from mlcroissant._src.core.optional import deps
from mlcroissant._src.core.path import Path

-_GIT_LFS_CONTENT = (
-    lambda: """version https://git-lfs.github.com/spec/v1
+_GIT_LFS_CONTENT = lambda: """version https://git-lfs.github.com/spec/v1
oid sha256:5e2785fcd9098567a49d6e62e328923d955b307b6dcd0492f6234e96e670772a
size 309207547
"""
-)

-_NON_GIT_LFS_CONTENT = (
-    lambda: """name,age
+_NON_GIT_LFS_CONTENT = lambda: """name,age
a,1
b,2
c,3"""
-)

_NON_ASCII_CONTENT = lambda: (255).to_bytes(1, byteorder="big")

@@ -54,6 +50,6 @@ def test_download_git_lfs_file():
    with mock.patch.object(git, "Git", autospec=True) as git_mock:
        download_git_lfs_file(file)
        git_mock.assert_called_once_with("/tmp/full/")
-        git_mock.return_value.execute.assert_called_once_with(
-            ["git", "lfs", "pull", "--include", "path.json"]
-        )
+        git_mock.return_value.execute.assert_called_once_with([
+            "git", "lfs", "pull", "--include", "path.json"
+        ])
14 changes: 10 additions & 4 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
@@ -113,10 +113,16 @@ def test_hermetic_loading(dataset_name, record_set_name, num_records):
@pytest.mark.parametrize(
    ["dataset_name", "record_set_name", "num_records"],
    [
-        ["flores-200/metadata.json",
-         "language_translations_train_data_with_metadata", 10],
-        ["flores-200/metadata.json",
-         "language_translations_test_data_with_metadata", 10],
+        [
+            "flores-200/metadata.json",
+            "language_translations_train_data_with_metadata",
+            10,
+        ],
+        [
+            "flores-200/metadata.json",
+            "language_translations_test_data_with_metadata",
+            10,
+        ],
        ["gpt-3/metadata.json", "default", 10],
        ["huggingface-c4/metadata.json", "en", 1],
        ["huggingface-mnist/metadata.json", "default", 10],
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/nodes.py
@@ -1,4 +1,5 @@
"""Defines the public interface to the `mlcroissant.nodes` package."""
+
from mlcroissant._src.structure_graph.nodes.field import Field
from mlcroissant._src.structure_graph.nodes.file_object import FileObject
from mlcroissant._src.structure_graph.nodes.file_set import FileSet
6 changes: 3 additions & 3 deletions python/mlcroissant/mlcroissant/_src/operation_graph/graph.py
@@ -94,9 +94,9 @@ def _add_operations_for_file_object(
                >> Concatenate(operations=operations, node=successor)
            ]
        if node.encoding_format and not should_extract(node.encoding_format):
-            fields = tuple(
-                [field for field in node.successors if isinstance(field, Field)]
-            )
+            fields = tuple([
+                field for field in node.successors if isinstance(field, Field)
+            ])
            operation >> Read(
                operations=operations,
                node=node,
Original file line number Diff line number Diff line change
@@ -21,10 +21,8 @@ def __call__(self, *args: list[Path]) -> pd.DataFrame:
        """See class' docstring."""
        assert len(args) > 0, "No dataframe to merge."
        files = [file for files in args for file in files]
-        return pd.DataFrame(
-            {
-                FileProperty.filepath: [os.fspath(file.filepath) for file in files],
-                FileProperty.filename: [file.filename for file in files],
-                FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
-            }
-        )
+        return pd.DataFrame({
+            FileProperty.filepath: [os.fspath(file.filepath) for file in files],
+            FileProperty.filename: [file.filename for file in files],
+            FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
+        })
Original file line number Diff line number Diff line change
@@ -64,9 +64,9 @@ def _extract_lines(row: pd.Series) -> pd.Series:
    """Reads a file line-by-line and outputs a named pd.Series of the lines."""
    path = epath.Path(row[FileProperty.filepath])
    lines = path.open("rb").read().splitlines()
-    return pd.Series(
-        {**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))}
-    )
+    return pd.Series({
+        **row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))
+    })


def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame:
Original file line number Diff line number Diff line change
@@ -97,11 +97,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
                return parse_json_content(json_content, self.fields)
            else:
                # Raw files are returned as a one-line pd.DataFrame.
-                return pd.DataFrame(
-                    {
-                        FileProperty.content: [json_content],
-                    }
-                )
+                return pd.DataFrame({
+                    FileProperty.content: [json_content],
+                })
        elif encoding_format == EncodingFormat.JSON_LINES:
            return pd.read_json(file, lines=True)
        elif encoding_format == EncodingFormat.PARQUET:
@@ -119,11 +117,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
                    filepath, header=None, names=[FileProperty.lines]
                )
            else:
-                return pd.DataFrame(
-                    {
-                        FileProperty.content: [file.read()],
-                    }
-                )
+                return pd.DataFrame({
+                    FileProperty.content: [file.read()],
+                })
        else:
            raise ValueError(
                f"Unsupported encoding format for file: {encoding_format}"
36 changes: 16 additions & 20 deletions python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py
@@ -41,12 +41,10 @@ def from_jsonld(cls, issues: Issues, json_) -> ParentField | None:

    def to_json(self) -> Json:
        """Converts the `ParentField` to JSON."""
-        return remove_empty_values(
-            {
-                "references": self.references.to_json() if self.references else None,
-                "source": self.source.to_json() if self.source else None,
-            }
-        )
+        return remove_empty_values({
+            "references": self.references.to_json() if self.references else None,
+            "source": self.source.to_json() if self.source else None,
+        })


@dataclasses.dataclass(eq=False, repr=False)
@@ -128,20 +126,18 @@ def to_json(self) -> Json:
            self.rdf.shorten_value(data_type) for data_type in self.data_types
        ]
        parent_field = self.parent_field.to_json() if self.parent_field else None
-        return remove_empty_values(
-            {
-                "@type": "ml:Field",
-                "name": self.name,
-                "description": self.description,
-                "dataType": data_types[0] if len(data_types) == 1 else data_types,
-                "isEnumeration": self.is_enumeration,
-                "parentField": parent_field,
-                "repeated": self.repeated,
-                "references": self.references.to_json() if self.references else None,
-                "source": self.source.to_json() if self.source else None,
-                "subField": [sub_field.to_json() for sub_field in self.sub_fields],
-            }
-        )
+        return remove_empty_values({
+            "@type": "ml:Field",
+            "name": self.name,
+            "description": self.description,
+            "dataType": data_types[0] if len(data_types) == 1 else data_types,
+            "isEnumeration": self.is_enumeration,
+            "parentField": parent_field,
+            "repeated": self.repeated,
+            "references": self.references.to_json() if self.references else None,
+            "source": self.source.to_json() if self.source else None,
+            "subField": [sub_field.to_json() for sub_field in self.sub_fields],
+        })

    @classmethod
    def from_jsonld(
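The `remove_empty_values` helper reformatted throughout these `to_json` methods is not itself shown in this diff; judging from its usage, it presumably drops empty entries before serialization. A minimal hypothetical sketch of that behavior (not the repository's actual implementation):

```python
from typing import Any

Json = dict[str, Any]


def remove_empty_values(json_: Json) -> Json:
    """Hypothetical behavior: keep only entries whose values are non-empty."""
    return {key: value for key, value in json_.items() if value not in (None, [], {}, "")}
```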
Original file line number Diff line number Diff line change
@@ -45,20 +45,18 @@ def to_json(self) -> Json:
            contained_in: str | list[str] = self.contained_in[0]
        else:
            contained_in = self.contained_in
-        return remove_empty_values(
-            {
-                "@type": "sc:FileObject",
-                "name": self.name,
-                "description": self.description,
-                "contentSize": self.content_size,
-                "contentUrl": self.content_url,
-                "containedIn": contained_in,
-                "encodingFormat": self.encoding_format,
-                "md5": self.md5,
-                "sha256": self.sha256,
-                "source": self.source.to_json() if self.source else None,
-            }
-        )
+        return remove_empty_values({
+            "@type": "sc:FileObject",
+            "name": self.name,
+            "description": self.description,
+            "contentSize": self.content_size,
+            "contentUrl": self.content_url,
+            "containedIn": contained_in,
+            "encodingFormat": self.encoding_format,
+            "md5": self.md5,
+            "sha256": self.sha256,
+            "source": self.source.to_json() if self.source else None,
+        })

    @classmethod
    def from_jsonld(
Original file line number Diff line number Diff line change
@@ -25,9 +25,9 @@ def test_checks_are_performed():
        Node, "assert_has_exclusive_properties"
    ) as exclusive_mock:
        create_test_node(FileObject)
-        mandatory_mock.assert_has_calls(
-            [mock.call("encoding_format", "name"), mock.call("content_url")]
-        )
+        mandatory_mock.assert_has_calls([
+            mock.call("encoding_format", "name"), mock.call("content_url")
+        ])
        exclusive_mock.assert_called_once_with(["md5", "sha256"])
        validate_name_mock.assert_called_once()

Original file line number Diff line number Diff line change
@@ -37,16 +37,14 @@ def to_json(self) -> Json:
            contained_in: str | list[str] = self.contained_in[0]
        else:
            contained_in = self.contained_in
-        return remove_empty_values(
-            {
-                "@type": "sc:FileSet",
-                "name": self.name,
-                "description": self.description,
-                "containedIn": contained_in,
-                "encodingFormat": self.encoding_format,
-                "includes": self.includes,
-            }
-        )
+        return remove_empty_values({
+            "@type": "sc:FileSet",
+            "name": self.name,
+            "description": self.description,
+            "containedIn": contained_in,
+            "encodingFormat": self.encoding_format,
+            "includes": self.includes,
+        })

    @classmethod
    def from_jsonld(
Original file line number Diff line number Diff line change
@@ -64,19 +64,17 @@ def __post_init__(self):

    def to_json(self) -> Json:
        """Converts the `Metadata` to JSON."""
-        return remove_empty_values(
-            {
-                "@context": self.rdf.context,
-                "@type": "sc:Dataset",
-                "name": self.name,
-                "description": self.description,
-                "citation": self.citation,
-                "license": self.license,
-                "url": self.url,
-                "distribution": [f.to_json() for f in self.distribution],
-                "recordSet": [record_set.to_json() for record_set in self.record_sets],
-            }
-        )
+        return remove_empty_values({
+            "@context": self.rdf.context,
+            "@type": "sc:Dataset",
+            "name": self.name,
+            "description": self.description,
+            "citation": self.citation,
+            "license": self.license,
+            "url": self.url,
+            "distribution": [f.to_json() for f in self.distribution],
+            "recordSet": [record_set.to_json() for record_set in self.record_sets],
+        })

    @property
    def file_objects(self) -> list[FileObject]:
Original file line number Diff line number Diff line change
@@ -68,17 +68,15 @@ def __post_init__(self):

    def to_json(self) -> Json:
        """Converts the `RecordSet` to JSON."""
-        return remove_empty_values(
-            {
-                "@type": "ml:RecordSet",
-                "name": self.name,
-                "description": self.description,
-                "isEnumeration": self.is_enumeration,
-                "key": self.key,
-                "field": [field.to_json() for field in self.fields],
-                "data": self.data,
-            }
-        )
+        return remove_empty_values({
+            "@type": "ml:RecordSet",
+            "name": self.name,
+            "description": self.description,
+            "isEnumeration": self.is_enumeration,
+            "key": self.key,
+            "field": [field.to_json() for field in self.fields],
+            "data": self.data,
+        })

    @classmethod
    def from_jsonld(