Skip to content

Commit

Permalink
Mv validate methods & soft deprecate Validator
Browse files Browse the repository at this point in the history
  • Loading branch information
pierrecamilleri committed Jan 27, 2025
1 parent f20aab1 commit bc27f9b
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 234 deletions.
2 changes: 2 additions & 0 deletions frictionless/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@
from .table import Lookup as Lookup
from .table import Row as Row
from .transformer import Transformer as Transformer

# Deprecated
from .validator import Validator as Validator
69 changes: 59 additions & 10 deletions frictionless/package/package.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
from __future__ import annotations

from multiprocessing import Pool
from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union

import attrs
from typing_extensions import Self

from .. import errors, fields, helpers, settings
from ..checklist import Checklist
from ..exception import FrictionlessException
from ..metadata import Metadata
from ..platform import platform
from ..report import Report
from ..resource import Resource
from ..system import system
from ..transformer import Transformer
from ..validator import Validator
from .factory import Factory

if TYPE_CHECKING:
from .. import types
from ..catalog import Dataset
from ..checklist import Checklist
from ..detector import Detector
from ..dialect import Control, Dialect
from ..indexer import IOnProgress, IOnRow
Expand Down Expand Up @@ -483,14 +484,54 @@ def validate(
Report: validation report
"""
validator = Validator()
return validator.validate_package(
self,
checklist=checklist,
name=name,
parallel=parallel,
limit_rows=limit_rows,
limit_errors=limit_errors,
# Create state
timer = helpers.Timer()
reports: List[Report] = []
resources = self.resources if name is None else [self.get_resource(name)]
with_foreign_keys = any(
res.schema and res.schema.foreign_keys for res in resources
)

# Prepare checklist
checklist = checklist or Checklist()

# Validate metadata
try:
self.to_descriptor(validate=True)
except FrictionlessException as exception:
return Report.from_validation(time=timer.time, errors=exception.to_errors())

# Validate sequential
if not parallel or with_foreign_keys:
for resource in resources:
report = resource.validate(
checklist=checklist,
limit_errors=limit_errors,
limit_rows=limit_rows,
)
reports.append(report)

# Validate parallel
else:
with Pool() as pool:
options_pool: List[Dict[str, Any]] = []
for resource in resources:
options: Any = {}
options["resource"] = {}
options["resource"]["descriptor"] = resource.to_descriptor()
options["resource"]["basepath"] = resource.basepath
options["validate"] = {}
options["validate"]["limit_rows"] = limit_rows
options["validate"]["limit_errors"] = limit_errors
options_pool.append(options)
report_descriptors = pool.map(_validate_parallel, options_pool)
for report_descriptor in report_descriptors:
reports.append(Report.from_descriptor(report_descriptor))

# Return report
return Report.from_validation_reports(
time=timer.time,
reports=reports,
)

# Convert
Expand Down Expand Up @@ -707,3 +748,11 @@ def metadata_export(self): # type: ignore
# descriptor = {"$frictionless": "package/v2", **descriptor}

return descriptor


def _validate_parallel(options: types.IDescriptor) -> types.IDescriptor:
resource_options = options["resource"]
validate_options = options["validate"]
resource = Resource.from_descriptor(**resource_options)
report = resource.validate(**validate_options)
return report.to_descriptor()
114 changes: 109 additions & 5 deletions frictionless/resource/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,22 @@
from typing_extensions import Self

from .. import errors, fields, helpers, settings
from ..checklist import Checklist
from ..detector import Detector
from ..dialect import Control, Dialect
from ..exception import FrictionlessException
from ..metadata import Metadata
from ..platform import platform
from ..report import Report
from ..schema import Schema
from ..system import system
from ..validator import Validator
from .factory import Factory
from .stats import ResourceStats

if TYPE_CHECKING:
from .. import types
from ..checklist import Checklist
from ..error import Error
from ..package import Package
from ..report import Report
from ..system import Loader


Expand Down Expand Up @@ -619,8 +619,112 @@ def validate(
Report: validation report
"""
validator = Validator()
return validator.validate_resource(self, checklist=checklist)
# Create state
partial = False
timer = helpers.Timer()
labels: List[str] = []
errors: List[Error] = []
warnings: List[str] = []

# Prepare checklist
checklist = checklist or Checklist()
checks = checklist.connect(self)

# Validate metadata
try:
self.to_descriptor(validate=True)
except FrictionlessException as exception:
return Report.from_validation_task(
self, time=timer.time, errors=exception.to_errors()
)

# TODO: remove in next version
# Ignore not-supported hashings
if self.hash:
algorithm, _ = helpers.parse_resource_hash_v1(self.hash)
if algorithm not in ["md5", "sha256"]:
warning = "hash is ignored; supported algorithms: md5/sha256"
warnings.append(warning)

# Prepare resource
if self.closed:
try:
self.open()
except FrictionlessException as exception:
self.close()
return Report.from_validation_task(
self, time=timer.time, errors=exception.to_errors()
)

# Validate data
with self:
# Validate start
for index, check in enumerate(checks):
for error in check.validate_start():
if error.type == "check-error":
del checks[index]
if checklist.match(error):
errors.append(error)

# Validate file
if not isinstance(self, platform.frictionless_resources.TableResource):
if self.hash is not None or self.bytes is not None:
helpers.pass_through(self.byte_stream)

# Validate table
else:
row_count = 0
labels = self.labels
while True:
row_count += 1

# Emit row
try:
row = next(self.row_stream) # type: ignore
except FrictionlessException as exception:
errors.append(exception.error)
continue
except StopIteration:
break

# Validate row
for check in checks:
for error in check.validate_row(row):
if checklist.match(error):
errors.append(error)

# Callback row
if on_row:
on_row(row)

# Limit rows
if limit_rows:
if row_count >= limit_rows:
warning = f"reached row limit: {limit_rows}"
warnings.append(warning)
partial = True
break

# Limit errors
if limit_errors:
if len(errors) >= limit_errors:
errors = errors[:limit_errors]
warning = f"reached error limit: {limit_errors}"
warnings.append(warning)
partial = True
break

# Validate end
if not partial:
for check in checks:
for error in check.validate_end():
if checklist.match(error):
errors.append(error)

# Return report
return Report.from_validation_task(
self, time=timer.time, labels=labels, errors=errors, warnings=warnings
)

# Export

Expand Down
25 changes: 1 addition & 24 deletions frictionless/resources/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from frictionless.schema.field import Field

from .. import errors, helpers, settings
from .. import errors, helpers
from ..analyzer import Analyzer
from ..dialect import Dialect
from ..exception import FrictionlessException
Expand All @@ -17,11 +17,9 @@
from ..system import system
from ..table import Header, Lookup, Row, Table
from ..transformer import Transformer
from ..validator import Validator

if TYPE_CHECKING:
from .. import types
from ..checklist import Checklist
from ..indexer import IOnProgress, IOnRow
from ..pipeline import Pipeline
from ..system import Loader, Parser
Expand Down Expand Up @@ -626,27 +624,6 @@ def transform(self, pipeline: Pipeline):
transformer = Transformer()
return transformer.transform_table_resource(self, pipeline)

# Validate

def validate(
self,
checklist: Optional[Checklist] = None,
*,
name: Optional[str] = None,
on_row: Optional[types.ICallbackFunction] = None,
parallel: bool = False,
limit_rows: Optional[int] = None,
limit_errors: int = settings.DEFAULT_LIMIT_ERRORS,
):
validator = Validator()
return validator.validate_resource(
self,
checklist=checklist,
on_row=on_row,
limit_rows=limit_rows,
limit_errors=limit_errors,
)

# Export

def to_view(self, type: str = "look", **options: Any):
Expand Down
Loading

0 comments on commit bc27f9b

Please sign in to comment.