Add automatic quality checks in GitHub Actions for the editor. (#321)
```
pip install --upgrade black isort
make format
```
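As a rough, hypothetical sketch (the actual `make format` target is not part of this commit and may differ), the command above presumably reformats both packages with the same Black and isort options that the CI checks below enforce:

```python
# Hypothetical sketch of what `make format` presumably runs; the real Makefile
# target is not shown in this commit and may differ.
import subprocess

for target in ("python/mlcroissant", "wizard"):
    # Reformat with Black using the line length and preview mode enforced by CI.
    subprocess.run(["black", "--line-length", "88", "--preview", target], check=True)
    # Sort imports with the google profile enforced by CI.
    subprocess.run(["isort", "--profile", "google", "--line-length", "88", target], check=True)
```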
marcenacp authored Nov 10, 2023
1 parent 29b98a5 commit 54df95d
Showing 34 changed files with 333 additions and 253 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -1,4 +1,4 @@
-name: Continuous Integration (CI)
+name: Continuous Integration - mlcroissant (CI)

on:
  push:
@@ -92,7 +92,7 @@ jobs:
          python-version: '3.11'
      - uses: psf/black@stable
        with:
-          options: --line-length 88 --preview --experimental-string-processing
+          options: --check --line-length 88 --preview
          src: './python/mlcroissant'
      - uses: isort/isort-action@master
        with:
29 changes: 29 additions & 0 deletions .github/workflows/editor.yml
@@ -0,0 +1,29 @@
name: Continuous Integration - Editor (CI)

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  merge_group:

jobs:
  python-format:
    name: Python format
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: '3.11'
      - uses: psf/black@stable
        with:
          options: --check --line-length 88 --preview
          src: './wizard'
      - uses: isort/isort-action@master
        with:
          sort-paths: './wizard'
          configuration: --profile google --line-length 88 --use-parentheses --project mlcroissant --project components --project core --project views --project state --project utils --multi-line 3 --thirdparty datasets
          requirements-files: 'wizard/requirements.txt'
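For context, the isort configuration above declares `mlcroissant`, `components`, `core`, `views`, `state`, and `utils` as first-party packages and `datasets` as third-party. In a hypothetical wizard module (illustration only, not a file from this commit), imports would therefore end up grouped roughly like this:

```python
# Standard-library imports come first.
import json

# Third-party imports come second ("datasets" is declared via --thirdparty).
import datasets
import pandas as pd

# First-party imports (the --project packages) come last, sorted within the section.
from core.state import Metadata  # hypothetical module and symbol
import mlcroissant as mlc
from utils import init_state  # hypothetical module and symbol
```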
15 changes: 13 additions & 2 deletions .vscode/settings.json
@@ -26,17 +26,28 @@
  "black-formatter.args": [
    "--line-length",
    "88",
-    "--preview",
-    "--experimental-string-processing"
+    "--preview"
  ],
  "isort.args": [
    "--profile",
    "google",
    "--line-length",
    "88",
    "--use-parentheses",
+    // For python/mlcroissant/
    "--project",
    "mlcroissant",
+    // For wizard/
+    "--project",
+    "components",
+    "--project",
+    "core",
+    "--project",
+    "views",
+    "--project",
+    "state",
+    "--project",
+    "utils",
    "--multi-line",
    "3",
    "--thirdparty",
10 changes: 6 additions & 4 deletions python/mlcroissant/mlcroissant/_src/core/git.py
@@ -37,7 +37,9 @@ def download_git_lfs_file(file: Path):
    try:
        repo.execute(["git", "lfs", "pull", "--include", fullpath])
    except deps.git.exc.GitCommandError as ex:
-        raise RuntimeError("Problem when launching `git lfs`. "
-                           "Possible problems: Have you installed git lfs "
-                           f"locally? Is '{fullpath}' a valid `git lfs` "
-                           "repository?") from ex
+        raise RuntimeError(
+            "Problem when launching `git lfs`. "
+            "Possible problems: Have you installed git lfs "
+            f"locally? Is '{fullpath}' a valid `git lfs` "
+            "repository?"
+        ) from ex
14 changes: 5 additions & 9 deletions python/mlcroissant/mlcroissant/_src/core/git_test.py
@@ -11,19 +11,15 @@
from mlcroissant._src.core.optional import deps
from mlcroissant._src.core.path import Path

-_GIT_LFS_CONTENT = (
-    lambda: """version https://git-lfs.github.com/spec/v1
+_GIT_LFS_CONTENT = lambda: """version https://git-lfs.github.com/spec/v1
oid sha256:5e2785fcd9098567a49d6e62e328923d955b307b6dcd0492f6234e96e670772a
size 309207547
"""
-)

-_NON_GIT_LFS_CONTENT = (
-    lambda: """name,age
+_NON_GIT_LFS_CONTENT = lambda: """name,age
a,1
b,2
c,3"""
-)

_NON_ASCII_CONTENT = lambda: (255).to_bytes(1, byteorder="big")

@@ -54,6 +50,6 @@ def test_download_git_lfs_file():
    with mock.patch.object(git, "Git", autospec=True) as git_mock:
        download_git_lfs_file(file)
        git_mock.assert_called_once_with("/tmp/full/")
-        git_mock.return_value.execute.assert_called_once_with(
-            ["git", "lfs", "pull", "--include", "path.json"]
-        )
+        git_mock.return_value.execute.assert_called_once_with([
+            "git", "lfs", "pull", "--include", "path.json"
+        ])
14 changes: 10 additions & 4 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
@@ -113,10 +113,16 @@ def test_hermetic_loading(dataset_name, record_set_name, num_records):
@pytest.mark.parametrize(
    ["dataset_name", "record_set_name", "num_records"],
    [
-        ["flores-200/metadata.json",
-         "language_translations_train_data_with_metadata", 10],
-        ["flores-200/metadata.json",
-         "language_translations_test_data_with_metadata", 10],
+        [
+            "flores-200/metadata.json",
+            "language_translations_train_data_with_metadata",
+            10,
+        ],
+        [
+            "flores-200/metadata.json",
+            "language_translations_test_data_with_metadata",
+            10,
+        ],
        ["gpt-3/metadata.json", "default", 10],
        ["huggingface-c4/metadata.json", "en", 1],
        ["huggingface-mnist/metadata.json", "default", 10],
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/nodes.py
@@ -1,4 +1,5 @@
"""Defines the public interface to the `mlcroissant.nodes` package."""
+
from mlcroissant._src.structure_graph.nodes.field import Field
from mlcroissant._src.structure_graph.nodes.file_object import FileObject
from mlcroissant._src.structure_graph.nodes.file_set import FileSet
6 changes: 3 additions & 3 deletions python/mlcroissant/mlcroissant/_src/operation_graph/graph.py
@@ -94,9 +94,9 @@ def _add_operations_for_file_object(
                >> Concatenate(operations=operations, node=successor)
            ]
        if node.encoding_format and not should_extract(node.encoding_format):
-            fields = tuple(
-                [field for field in node.successors if isinstance(field, Field)]
-            )
+            fields = tuple([
+                field for field in node.successors if isinstance(field, Field)
+            ])
            operation >> Read(
                operations=operations,
                node=node,
Original file line number Diff line number Diff line change
@@ -21,10 +21,8 @@ def __call__(self, *args: list[Path]) -> pd.DataFrame:
        """See class' docstring."""
        assert len(args) > 0, "No dataframe to merge."
        files = [file for files in args for file in files]
-        return pd.DataFrame(
-            {
-                FileProperty.filepath: [os.fspath(file.filepath) for file in files],
-                FileProperty.filename: [file.filename for file in files],
-                FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
-            }
-        )
+        return pd.DataFrame({
+            FileProperty.filepath: [os.fspath(file.filepath) for file in files],
+            FileProperty.filename: [file.filename for file in files],
+            FileProperty.fullpath: [os.fspath(file.fullpath) for file in files],
+        })
Original file line number Diff line number Diff line change
@@ -64,9 +64,9 @@ def _extract_lines(row: pd.Series) -> pd.Series:
    """Reads a file line-by-line and outputs a named pd.Series of the lines."""
    path = epath.Path(row[FileProperty.filepath])
    lines = path.open("rb").read().splitlines()
-    return pd.Series(
-        {**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))}
-    )
+    return pd.Series({
+        **row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))
+    })


def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame:
Original file line number Diff line number Diff line change
@@ -97,11 +97,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
                return parse_json_content(json_content, self.fields)
            else:
                # Raw files are returned as a one-line pd.DataFrame.
-                return pd.DataFrame(
-                    {
-                        FileProperty.content: [json_content],
-                    }
-                )
+                return pd.DataFrame({
+                    FileProperty.content: [json_content],
+                })
        elif encoding_format == EncodingFormat.JSON_LINES:
            return pd.read_json(file, lines=True)
        elif encoding_format == EncodingFormat.PARQUET:
@@ -119,11 +117,9 @@ def _read_file_content(self, encoding_format: str, file: Path) -> pd.DataFrame:
                    filepath, header=None, names=[FileProperty.lines]
                )
            else:
-                return pd.DataFrame(
-                    {
-                        FileProperty.content: [file.read()],
-                    }
-                )
+                return pd.DataFrame({
+                    FileProperty.content: [file.read()],
+                })
        else:
            raise ValueError(
                f"Unsupported encoding format for file: {encoding_format}"
36 changes: 16 additions & 20 deletions python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py
@@ -41,12 +41,10 @@ def from_jsonld(cls, issues: Issues, json_) -> ParentField | None:

    def to_json(self) -> Json:
        """Converts the `ParentField` to JSON."""
-        return remove_empty_values(
-            {
-                "references": self.references.to_json() if self.references else None,
-                "source": self.source.to_json() if self.source else None,
-            }
-        )
+        return remove_empty_values({
+            "references": self.references.to_json() if self.references else None,
+            "source": self.source.to_json() if self.source else None,
+        })


@dataclasses.dataclass(eq=False, repr=False)
@@ -128,20 +126,18 @@ def to_json(self) -> Json:
            self.rdf.shorten_value(data_type) for data_type in self.data_types
        ]
        parent_field = self.parent_field.to_json() if self.parent_field else None
-        return remove_empty_values(
-            {
-                "@type": "ml:Field",
-                "name": self.name,
-                "description": self.description,
-                "dataType": data_types[0] if len(data_types) == 1 else data_types,
-                "isEnumeration": self.is_enumeration,
-                "parentField": parent_field,
-                "repeated": self.repeated,
-                "references": self.references.to_json() if self.references else None,
-                "source": self.source.to_json() if self.source else None,
-                "subField": [sub_field.to_json() for sub_field in self.sub_fields],
-            }
-        )
+        return remove_empty_values({
+            "@type": "ml:Field",
+            "name": self.name,
+            "description": self.description,
+            "dataType": data_types[0] if len(data_types) == 1 else data_types,
+            "isEnumeration": self.is_enumeration,
+            "parentField": parent_field,
+            "repeated": self.repeated,
+            "references": self.references.to_json() if self.references else None,
+            "source": self.source.to_json() if self.source else None,
+            "subField": [sub_field.to_json() for sub_field in self.sub_fields],
+        })

    @classmethod
    def from_jsonld(
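The `remove_empty_values` helper reformatted throughout these `to_json` methods is not itself shown in this diff; judging from its usage, it presumably drops empty entries before serialization. A minimal hypothetical sketch of that behavior (not the repository's actual implementation):

```python
from typing import Any

Json = dict[str, Any]


def remove_empty_values(json_: Json) -> Json:
    """Hypothetical behavior: keep only entries whose values are non-empty."""
    return {key: value for key, value in json_.items() if value not in (None, [], {}, "")}
```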
Original file line number Diff line number Diff line change
@@ -45,20 +45,18 @@ def to_json(self) -> Json:
            contained_in: str | list[str] = self.contained_in[0]
        else:
            contained_in = self.contained_in
-        return remove_empty_values(
-            {
-                "@type": "sc:FileObject",
-                "name": self.name,
-                "description": self.description,
-                "contentSize": self.content_size,
-                "contentUrl": self.content_url,
-                "containedIn": contained_in,
-                "encodingFormat": self.encoding_format,
-                "md5": self.md5,
-                "sha256": self.sha256,
-                "source": self.source.to_json() if self.source else None,
-            }
-        )
+        return remove_empty_values({
+            "@type": "sc:FileObject",
+            "name": self.name,
+            "description": self.description,
+            "contentSize": self.content_size,
+            "contentUrl": self.content_url,
+            "containedIn": contained_in,
+            "encodingFormat": self.encoding_format,
+            "md5": self.md5,
+            "sha256": self.sha256,
+            "source": self.source.to_json() if self.source else None,
+        })

    @classmethod
    def from_jsonld(
Original file line number Diff line number Diff line change
@@ -25,9 +25,9 @@ def test_checks_are_performed():
        Node, "assert_has_exclusive_properties"
    ) as exclusive_mock:
        create_test_node(FileObject)
-        mandatory_mock.assert_has_calls(
-            [mock.call("encoding_format", "name"), mock.call("content_url")]
-        )
+        mandatory_mock.assert_has_calls([
+            mock.call("encoding_format", "name"), mock.call("content_url")
+        ])
        exclusive_mock.assert_called_once_with(["md5", "sha256"])
        validate_name_mock.assert_called_once()

Original file line number Diff line number Diff line change
@@ -37,16 +37,14 @@ def to_json(self) -> Json:
            contained_in: str | list[str] = self.contained_in[0]
        else:
            contained_in = self.contained_in
-        return remove_empty_values(
-            {
-                "@type": "sc:FileSet",
-                "name": self.name,
-                "description": self.description,
-                "containedIn": contained_in,
-                "encodingFormat": self.encoding_format,
-                "includes": self.includes,
-            }
-        )
+        return remove_empty_values({
+            "@type": "sc:FileSet",
+            "name": self.name,
+            "description": self.description,
+            "containedIn": contained_in,
+            "encodingFormat": self.encoding_format,
+            "includes": self.includes,
+        })

    @classmethod
    def from_jsonld(
Original file line number Diff line number Diff line change
@@ -64,19 +64,17 @@ def __post_init__(self):

    def to_json(self) -> Json:
        """Converts the `Metadata` to JSON."""
-        return remove_empty_values(
-            {
-                "@context": self.rdf.context,
-                "@type": "sc:Dataset",
-                "name": self.name,
-                "description": self.description,
-                "citation": self.citation,
-                "license": self.license,
-                "url": self.url,
-                "distribution": [f.to_json() for f in self.distribution],
-                "recordSet": [record_set.to_json() for record_set in self.record_sets],
-            }
-        )
+        return remove_empty_values({
+            "@context": self.rdf.context,
+            "@type": "sc:Dataset",
+            "name": self.name,
+            "description": self.description,
+            "citation": self.citation,
+            "license": self.license,
+            "url": self.url,
+            "distribution": [f.to_json() for f in self.distribution],
+            "recordSet": [record_set.to_json() for record_set in self.record_sets],
+        })

    @property
    def file_objects(self) -> list[FileObject]:
Original file line number Diff line number Diff line change
@@ -68,17 +68,15 @@ def __post_init__(self):

    def to_json(self) -> Json:
        """Converts the `RecordSet` to JSON."""
-        return remove_empty_values(
-            {
-                "@type": "ml:RecordSet",
-                "name": self.name,
-                "description": self.description,
-                "isEnumeration": self.is_enumeration,
-                "key": self.key,
-                "field": [field.to_json() for field in self.fields],
-                "data": self.data,
-            }
-        )
+        return remove_empty_values({
+            "@type": "ml:RecordSet",
+            "name": self.name,
+            "description": self.description,
+            "isEnumeration": self.is_enumeration,
+            "key": self.key,
+            "field": [field.to_json() for field in self.fields],
+            "data": self.data,
+        })

    @classmethod
    def from_jsonld(