diff --git a/README.md b/README.md index 0f11023..920cd5f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # sip-validator The SIP (Submission Information Package) Validator is a Python script designed to validate the structure and content of a SIP directory. A SIP typically consists of digital objects, metadata files, supporting information, a manifest, and a Readme file. The script checks for the presence of expected folders and files, validates CSV files' structure, and ensures data integrity. +## References +- [DLP Metadata Application Profile](https://docs.google.com/document/d/1WZUe6fnmbQNRX5crx3YAcHNb6tyuHvfPyBLWoJbAwhM/edit?usp=sharing) +- Virginia Tech Digital Library Submission Information Package # Prerequisites - Python 3.x - Standard Python libraries: os, csv, re, datetime @@ -13,18 +16,21 @@ python3 sip-validator.py The script verifies the existence of the following folders and files in the SIP: - Data: Files - Manifest: checksumsha1 -- Metadata: collection_metadata.csv, item_metadata.csv. -- README: (read me of what is in the folder). +- Metadata: collection_metadata.csv, item_metadata.csv +- README (optional): readme.txt or readme.md, located either in the README folder or in the SIP root folder +- Supporting Information 2. CSV File Validation The script checks the structure and content of collection_metadata.csv and item_metadata.csv: -collection_metadata.csv -- Required Fields: identifier, title, description, visibility, rightsholder, rights. -- Unique Identifier Check: Ensures each identifier is unique. -- Special Character Check: Looks for special characters in each field. -item_metadata.csv -- Required Fields: identifier, title, description, creator, display_date, start_date, end_date, rights_holder, rights, tags, coverage, type, language, visibility, location. -- Unique Identifier Check: Ensures each identifier is unique. -- Special Character Check: Looks for special characters in each field. 
+- collection_metadata.csv + - Required Fields: identifier, title, visibility. + - Unique Identifier Check: Ensures each identifier is unique. + - Special Character Check: Identifier field allowing only alphanumeric characters, underscores _, and hyphens -. + - Controlled Value Check: Looks for controlled values in rights or license. +- item_metadata.csv + - Required Fields: identifier, title, rights or license, visibility. + - Unique Identifier Check: Ensures each identifier is unique. + - Special Character Check: Identifier field allowing only alphanumeric characters, underscores _, and hyphens -. + - Controlled Value Check: Looks for controlled values in rights or license. 3. Validation Results Validation results are saved in a receipt file (validation_receipt.txt) within the SIP root folder: - Validation Errors: Details errors encountered during validation. diff --git a/sip-validator.py b/sip-validator.py index 49fb5fa..515f688 100644 --- a/sip-validator.py +++ b/sip-validator.py @@ -1,43 +1,70 @@ import os import csv import re -from datetime import datetime from collections import OrderedDict -def validate_date_format(date_text): - try: - datetime.strptime(date_text, '%Y/%m/%d') - return True - except ValueError: - return False +def validate_special_characters(string, allow_spaces=False): + """ + Validate special characters. + If allow_spaces is True, allow spaces for folder names. + """ + if allow_spaces: + # Accept alphanumeric characters, _ (underscore), - (hyphen), and spaces + return bool(re.match(r'^[a-z0-9 _-]+$', string.lower())) + else: + # Accept only alphanumeric characters, _ (underscore), and - (hyphen) + return bool(re.match(r'^[a-z0-9_-]+$', string.lower())) -def validate_special_characters(string): - return bool(re.match(r'^[a-zA-Z0-9@#$%&*/!\']+$', string.lower())) +def validate_visibility(value): + """ + Validate the visibility field. Accepted values are 'true' or 'false', case insensitive. 
+ """ + return value.lower() in ['true', 'false'] def check_directory_structure(root_path): - required_folders = ['Data', 'Manifest', 'Metadata'] + # Add "Supporting Information" and "readme" to the required folders + required_folders = ['Data', 'Manifest', 'Metadata', 'Supporting Information', 'readme'] errors = OrderedDict() + try: existing_folders = os.listdir(root_path) except Exception as e: errors[f"Error accessing directory {root_path}: {str(e)}"] = None return errors - extra_folders = [folder for folder in existing_folders if folder not in required_folders and os.path.isdir(os.path.join(root_path, folder))] + # Check for special characters in folder names (allow spaces for folder names) + for folder in existing_folders: + folder_path = os.path.join(root_path, folder) + if os.path.isdir(folder_path): # Only check folders for special characters + if not validate_special_characters(folder, allow_spaces=True): + errors[f"Folder name contains invalid special characters: {folder}"] = None + + # Check for required folders without checking capitalization + extra_folders = [folder for folder in existing_folders if os.path.isdir(os.path.join(root_path, folder)) and folder.lower() not in [f.lower() for f in required_folders]] for folder in required_folders: - matched_folders = [f for f in existing_folders if re.match(rf'{folder}[^a-zA-Z0-9]*$', f, re.IGNORECASE)] - if not matched_folders: + if not any(f.lower() == folder.lower() for f in existing_folders if os.path.isdir(os.path.join(root_path, f))): errors[f"Missing required folder: {folder}"] = None - else: - for matched_folder in matched_folders: - if matched_folder != folder: - errors[f"Folder name should be {folder} but found {matched_folder}"] = None - elif not validate_special_characters(matched_folder): - errors[f"Folder name contains special characters: {matched_folder}"] = None + + # Check for README.md or README.txt in the root directory or 'readme' folder + readme_exists = False + for fname in 
os.listdir(root_path): + if fname.lower() in ['readme.md', 'readme.txt']: + readme_exists = True + break + + # Also check in a 'readme' folder, if it exists + readme_folder = os.path.join(root_path, 'readme') + if os.path.exists(readme_folder) and os.path.isdir(readme_folder): + for fname in os.listdir(readme_folder): + if fname.lower() in ['readme.md', 'readme.txt']: + readme_exists = True + break + + if not readme_exists: + errors["Missing README file with .txt or .md extension"] = None + if extra_folders: errors[f"Extra folders found: {', '.join(extra_folders)}"] = None - if not any(fname.startswith('README') and fname.split('.')[-1] in ['txt', 'md'] for fname in existing_folders): - errors["Missing README file with .txt or .md extension"] = None if not errors: errors["Directory structure is valid."] = None return errors @@ -109,13 +136,13 @@ def read_csv_file(file_path): def validate_metadata_files(root_path): metadata_path = os.path.join(root_path, 'Metadata') receipt = OrderedDict() - required_fields = ['identifier', 'title', 'description', 'visibility', 'rights_holder'] + required_fields = ['identifier', 'title', 'visibility'] if not os.path.exists(metadata_path): receipt["Missing required folder: Metadata"] = None return receipt - # Scan for all metadata files that match the patterns + # Ensure either 'rights' or 'license' is present only for item_metadata.csv for file_name in os.listdir(metadata_path): if re.search(r'(collection_metadata\.csv|item_metadata\.csv)$', file_name.lower()): file_path = os.path.join(metadata_path, file_name) @@ -141,33 +168,31 @@ def validate_metadata_files(root_path): if matched_field != field: receipt[f"Validation error in {file_name}: Field name should be {field} but found {matched_field}"] = None missing_field = True - elif field != 'rights_holder' and not validate_special_characters(matched_field): - receipt[f"Validation error in {file_name}: Field name contains special characters: {matched_field}"] = None - missing_field = 
True + elif field == 'visibility' and not validate_visibility(rows[0]['visibility']): + receipt[f"Validation error in {file_name}: Invalid value for visibility: {rows[0]['visibility']} (expected 'true' or 'false')"] = None - # If any required field is missing or incorrect, skip row validation - if missing_field: - continue + # Skip rights/license validation for collection_metadata.csv + if 'collection_metadata.csv' in file_name.lower(): + continue # Skip rights/license check for collection_metadata - # Check each row for content - for row in rows: - identifier = row.get('identifier', 'unknown').strip() - - # Validate identifier - if not validate_special_characters(identifier): - receipt[f"Validation error in {file_name} (identifier {identifier}): Invalid identifier."] = None - - # Validate date format - date = row.get('date', '').strip() - if date and not validate_date_format(date): - receipt[f"Validation error in {file_name} (identifier {identifier}): Invalid date format."] = None - - # Required fields check - for field in required_fields: - if field not in row or not row[field].strip(): - receipt[f"Validation error in {file_name} (identifier {identifier}): Missing or invalid {field}."] = None + # Validate rights and license fields for item_metadata.csv + if 'item_metadata.csv' in file_name.lower(): + for row in rows: + identifier = row.get('identifier', 'unknown').strip() + + # Validate identifier + if not validate_special_characters(identifier): + receipt[f"Validation error in {file_name} (identifier {identifier}): Invalid identifier."] = None + + # Validate rights or license + if 'rights' in row and not re.match(r'https?://rightsstatements.org/vocab/(InC|InC-OW-EU|InC-EDU|InC-NC|InC-RUU|NoC-CR|NoC-NC|NoC-OKLR|NoC-US|CNE|UND|NKC)/1.0/', row['rights'], re.IGNORECASE): + receipt[f"Validation error in {file_name} (identifier {identifier}): Invalid value for rights: {row['rights']}"] = None + elif 'license' in row and not 
re.match(r'https?://creativecommons.org/licenses/(by/2.0|by/4.0|by-sa/4.0|by-nd/4.0|by-nc/4.0|by-nc-sa/4.0|by-nc-nd/4.0)/', row['license'], re.IGNORECASE): + receipt[f"Validation error in {file_name} (identifier {identifier}): Invalid value for license: {row['license']}"] = None + elif 'rights' not in row and 'license' not in row: + receipt[f"Validation error in {file_name} (identifier {identifier}): Missing required field: either 'rights' or 'license'"] = None - return receipt + return receipt # Always return the receipt, even if empty def write_validation_receipt(receipt, root_path): receipt_path = os.path.join(root_path, 'validation_receipt.txt')