Move 9 init/seed scripts into scripts/seed/ and 7 validation scripts (+ validators/ subfolder) into scripts/validate/ to reduce clutter in the root scripts/ directory. Update all references across Makefile, CI/CD configs, pre-commit hooks, docs (~40 files), and Python imports. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
544 lines
21 KiB
Python
544 lines
21 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
IT Internal Audit Validator
|
|
|
|
Validates code against internal audit rules defined in .audit-rules/
|
|
Focuses on governance, compliance, and control requirements.
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from base_validator import BaseValidator
|
|
|
|
|
|
class AuditValidator(BaseValidator):
    """Validator for IT internal audit rules defined under .audit-rules/.

    Covers governance, compliance, and control requirements: audit trail,
    access control, data governance, compliance evidence, change management,
    third-party dependencies, and documentation.
    """

    def __init__(self, project_root: Path | None = None):
        """Bind this validator to the .audit-rules rule directory."""
        super().__init__(".audit-rules", project_root)
|
|
|
|
def validate(self) -> bool:
|
|
"""Run all audit validations."""
|
|
self._validate_audit_trail()
|
|
self._validate_access_control()
|
|
self._validate_data_governance()
|
|
self._validate_compliance()
|
|
self._validate_change_management()
|
|
self._validate_third_party()
|
|
self._validate_documentation()
|
|
return len(self.errors) == 0
|
|
|
|
# ==================
|
|
# AUDIT TRAIL
|
|
# ==================
|
|
|
|
def _validate_audit_trail(self) -> None:
|
|
"""Validate audit trail requirements."""
|
|
# Check authentication logging
|
|
auth_files = [
|
|
self.project_root / "app" / "api" / "v1" / "auth.py",
|
|
self.project_root / "app" / "routes" / "admin.py",
|
|
]
|
|
|
|
for file in auth_files:
|
|
if file.exists():
|
|
content = file.read_text()
|
|
if "logger" not in content:
|
|
self.add_error(
|
|
"AUDIT-LOG-001",
|
|
"Authentication operations must include logging",
|
|
str(file),
|
|
)
|
|
|
|
# Check service layer logging
|
|
services_path = self.project_root / "app" / "services"
|
|
if services_path.exists():
|
|
for file in services_path.glob("*.py"):
|
|
if file.name == "__init__.py":
|
|
continue
|
|
content = file.read_text()
|
|
# Services that modify data should have logging
|
|
if re.search(r"def (create|update|delete)", content):
|
|
if "logger" not in content:
|
|
self.add_warning(
|
|
"AUDIT-LOG-002",
|
|
"Service with data modifications should include logging",
|
|
str(file),
|
|
)
|
|
|
|
# Check for audit timestamp fields in models
|
|
# Models can have timestamps directly or inherit from BaseModel/TimestampMixin
|
|
models_path = self.project_root / "models" / "database"
|
|
if models_path.exists():
|
|
for file in models_path.glob("*.py"):
|
|
# audit_log.py uses timestamp field instead of created_at/updated_at
|
|
if file.name in ("__init__.py", "base.py", "audit_log.py"):
|
|
continue
|
|
content = file.read_text()
|
|
if "class " in content: # Has model definition
|
|
# Check if timestamps are present directly or via inheritance
|
|
has_timestamps = (
|
|
"created_at" in content
|
|
or "updated_at" in content
|
|
or "BaseModel" in content # Inherits from BaseModel
|
|
or "TimestampMixin" in content # Uses TimestampMixin
|
|
)
|
|
if not has_timestamps:
|
|
self.add_warning(
|
|
"AUDIT-FIELD-001",
|
|
"Database model should include audit timestamp fields",
|
|
str(file),
|
|
)
|
|
|
|
# Check for forbidden log modification patterns
|
|
self._check_forbidden_patterns(
|
|
paths=["app/**/*.py"],
|
|
patterns=[
|
|
r"os\.remove.*\.log",
|
|
r"truncate.*log",
|
|
r"open.*\.log.*['\"]w['\"]",
|
|
],
|
|
rule_id="AUDIT-INT-001",
|
|
message="Application must not modify or delete log files",
|
|
)
|
|
|
|
# ==================
|
|
# ACCESS CONTROL
|
|
# ==================
|
|
|
|
def _validate_access_control(self) -> None:
|
|
"""Validate access control requirements."""
|
|
# Check API endpoints have authentication
|
|
api_path = self.project_root / "app" / "api" / "v1"
|
|
if api_path.exists():
|
|
for file in api_path.glob("*.py"):
|
|
# Skip endpoints that are intentionally unauthenticated
|
|
if file.name in ("__init__.py", "health.py", "metrics.py"):
|
|
continue
|
|
content = file.read_text()
|
|
# Check for authentication dependency
|
|
if "@router" in content:
|
|
if not re.search(
|
|
r"CurrentUser|Depends.*get_current_user|AdminUser", content
|
|
):
|
|
# auth.py handles its own auth
|
|
if file.name != "auth.py":
|
|
self.add_warning(
|
|
"ACCESS-AUTH-001",
|
|
"API endpoint should require authentication",
|
|
str(file),
|
|
)
|
|
|
|
# Check admin routes verify admin role
|
|
admin_route = self.project_root / "app" / "routes" / "admin.py"
|
|
if admin_route.exists():
|
|
content = admin_route.read_text()
|
|
if "is_admin" not in content and "admin_required" not in content:
|
|
self.add_warning(
|
|
"ACCESS-AUTH-002",
|
|
"Admin routes should verify admin privileges",
|
|
str(admin_route),
|
|
)
|
|
|
|
# Check password hashing
|
|
security_file = self.project_root / "app" / "core" / "security.py"
|
|
if security_file.exists():
|
|
content = security_file.read_text()
|
|
if not re.search(r"bcrypt|argon2|scrypt|pbkdf2", content, re.IGNORECASE):
|
|
self.add_error(
|
|
"ACCESS-ACCT-003",
|
|
"Passwords must use approved hashing algorithms",
|
|
str(security_file),
|
|
)
|
|
|
|
# Check password not in API responses
|
|
# Note: Only flag if a class with "Response" in name directly defines password_hash
|
|
# Internal schemas (like UserInDB) are not flagged as they're not API responses
|
|
schema_path = self.project_root / "models" / "schema"
|
|
if schema_path.exists():
|
|
for file in schema_path.glob("*.py"):
|
|
content = file.read_text()
|
|
# Check for Response classes that directly define password_hash
|
|
# Split by class definitions and check each
|
|
class_blocks = re.split(r"(?=^class\s)", content, flags=re.MULTILINE)
|
|
for block in class_blocks:
|
|
# Check if this class is a Response class
|
|
class_match = re.match(r"class\s+(\w*Response\w*)", block)
|
|
if class_match:
|
|
# Check if password_hash is defined in this class (not inherited)
|
|
if "password_hash:" in block or "password_hash =" in block:
|
|
if "exclude" not in block.lower():
|
|
self.add_error(
|
|
"ACCESS-PRIV-002",
|
|
f"Password hash must be excluded from {class_match.group(1)}",
|
|
str(file),
|
|
)
|
|
|
|
# ==================
|
|
# DATA GOVERNANCE
|
|
# ==================
|
|
|
|
def _validate_data_governance(self) -> None:
|
|
"""Validate data governance requirements."""
|
|
# Check PII not logged
|
|
# Note: Patterns detect actual password values, not descriptive usage like "Password reset"
|
|
# We look for patterns that suggest password values are being logged:
|
|
# - password= or password: followed by a variable
|
|
# - %s or {} after password indicating interpolation of password value
|
|
self._check_forbidden_patterns(
|
|
paths=["app/**/*.py", "middleware/**/*.py"],
|
|
patterns=[
|
|
r"logger\.\w+\(.*password\s*[=:]\s*['\"]?%", # password=%s
|
|
r"logger\.\w+\(.*password\s*[=:]\s*\{", # password={var}
|
|
r"logging\.\w+\(.*password\s*[=:]\s*['\"]?%", # password=%s
|
|
r"print\(.*password\s*=", # print(password=xxx)
|
|
r"logger.*credit.*card.*\d", # credit card with numbers
|
|
r"logger.*\bssn\b.*\d", # SSN with numbers
|
|
],
|
|
rule_id="DATA-PII-003",
|
|
message="PII/sensitive data must not be logged",
|
|
)
|
|
|
|
# Check input validation (Pydantic)
|
|
schema_path = self.project_root / "models" / "schema"
|
|
if schema_path.exists():
|
|
has_validation = False
|
|
for file in schema_path.glob("*.py"):
|
|
content = file.read_text()
|
|
if re.search(r"Field|validator|field_validator", content):
|
|
has_validation = True
|
|
break
|
|
if not has_validation:
|
|
self.add_error(
|
|
"DATA-INT-001",
|
|
"Pydantic validation required for data integrity",
|
|
str(schema_path),
|
|
)
|
|
|
|
# Check user data access endpoint exists (GDPR)
|
|
users_api = self.project_root / "app" / "api" / "v1" / "users.py"
|
|
if users_api.exists():
|
|
content = users_api.read_text()
|
|
if "/me" not in content and "current" not in content.lower():
|
|
self.add_warning(
|
|
"DATA-PRIV-001",
|
|
"Endpoint for users to access their own data required (GDPR Art. 15)",
|
|
str(users_api),
|
|
)
|
|
|
|
# ==================
|
|
# COMPLIANCE
|
|
# ==================
|
|
|
|
def _validate_compliance(self) -> None:
|
|
"""Validate compliance requirements."""
|
|
# Check HTTPS configuration
|
|
config_files = [
|
|
self.project_root / "app" / "core" / "config.py",
|
|
self.project_root / "main.py",
|
|
]
|
|
https_configured = False
|
|
for file in config_files:
|
|
if file.exists():
|
|
content = file.read_text()
|
|
if re.search(r"https|SSL|TLS|SECURE", content, re.IGNORECASE):
|
|
https_configured = True
|
|
break
|
|
if not https_configured:
|
|
self.add_warning(
|
|
"COMP-REG-002",
|
|
"HTTPS configuration should be documented",
|
|
"app/core/config.py",
|
|
)
|
|
|
|
# Check version control
|
|
if not (self.project_root / ".git").exists():
|
|
self.add_error(
|
|
"COMP-EVID-003",
|
|
"Version control (Git) is required",
|
|
str(self.project_root),
|
|
)
|
|
|
|
# Check CI/CD exists (GitHub or GitLab)
|
|
github_ci = self.project_root / ".github" / "workflows" / "ci.yml"
|
|
gitlab_ci = self.project_root / ".gitlab-ci.yml"
|
|
if not github_ci.exists() and not gitlab_ci.exists():
|
|
self.add_warning(
|
|
"COMP-EVID-001",
|
|
"CI workflow for automated testing recommended",
|
|
".gitlab-ci.yml or .github/workflows/ci.yml",
|
|
)
|
|
|
|
# Check code review process (GitHub or GitLab)
|
|
github_pr_template = self.project_root / ".github" / "PULL_REQUEST_TEMPLATE.md"
|
|
gitlab_mr_templates = self.project_root / ".gitlab" / "merge_request_templates"
|
|
has_mr_template = github_pr_template.exists() or (
|
|
gitlab_mr_templates.exists() and any(gitlab_mr_templates.iterdir())
|
|
)
|
|
if not has_mr_template:
|
|
self.add_warning(
|
|
"COMP-POL-001",
|
|
"Merge request template recommended for code review",
|
|
".gitlab/merge_request_templates/ or .github/PULL_REQUEST_TEMPLATE.md",
|
|
)
|
|
|
|
# ==================
|
|
# CHANGE MANAGEMENT
|
|
# ==================
|
|
|
|
def _validate_change_management(self) -> None:
|
|
"""Validate change management requirements."""
|
|
# Check .gitignore exists and excludes secrets
|
|
gitignore = self.project_root / ".gitignore"
|
|
if gitignore.exists():
|
|
content = gitignore.read_text()
|
|
required_exclusions = [".env", "*.pem", "*.key"]
|
|
for pattern in required_exclusions:
|
|
# Simplified check - just look for the pattern
|
|
if pattern.replace("*", "") not in content:
|
|
self.add_warning(
|
|
"CHANGE-VC-003",
|
|
f"Secret pattern '{pattern}' should be in .gitignore",
|
|
str(gitignore),
|
|
)
|
|
else:
|
|
self.add_error(
|
|
"CHANGE-VC-002",
|
|
".gitignore file required",
|
|
str(self.project_root),
|
|
)
|
|
|
|
# Check database migrations
|
|
alembic_dir = self.project_root / "alembic"
|
|
if not alembic_dir.exists():
|
|
self.add_warning(
|
|
"CHANGE-ROLL-001",
|
|
"Database migration tool (Alembic) recommended",
|
|
"alembic/",
|
|
)
|
|
else:
|
|
# Check for downgrade functions
|
|
versions_dir = alembic_dir / "versions"
|
|
if versions_dir.exists():
|
|
for file in versions_dir.glob("*.py"):
|
|
content = file.read_text()
|
|
if "def upgrade" in content and "def downgrade" not in content:
|
|
self.add_warning(
|
|
"CHANGE-ROLL-002",
|
|
"Migration should include downgrade function",
|
|
str(file),
|
|
)
|
|
|
|
# Check environment separation
|
|
config_file = self.project_root / "app" / "core" / "config.py"
|
|
if config_file.exists():
|
|
content = config_file.read_text()
|
|
if not re.search(r"ENVIRONMENT|development|staging|production", content):
|
|
self.add_warning(
|
|
"CHANGE-DEP-001",
|
|
"Environment separation configuration recommended",
|
|
str(config_file),
|
|
)
|
|
|
|
# ==================
|
|
# THIRD PARTY
|
|
# ==================
|
|
|
|
def _validate_third_party(self) -> None:
|
|
"""Validate third-party dependency management."""
|
|
# Check dependency lock file exists
|
|
lock_files = ["uv.lock", "poetry.lock", "Pipfile.lock", "requirements.lock"]
|
|
has_lock = any((self.project_root / f).exists() for f in lock_files)
|
|
if not has_lock:
|
|
self.add_warning(
|
|
"THIRD-DEP-001",
|
|
"Dependency lock file recommended for reproducible builds",
|
|
"uv.lock or similar",
|
|
)
|
|
|
|
# Check dependency manifest exists
|
|
manifest_files = ["pyproject.toml", "requirements.txt", "Pipfile"]
|
|
has_manifest = any((self.project_root / f).exists() for f in manifest_files)
|
|
if not has_manifest:
|
|
self.add_error(
|
|
"THIRD-DEP-002",
|
|
"Dependency manifest file required",
|
|
"pyproject.toml",
|
|
)
|
|
|
|
# Check for dependency scanning (GitHub Dependabot or GitLab)
|
|
dependabot = self.project_root / ".github" / "dependabot.yml"
|
|
gitlab_ci = self.project_root / ".gitlab-ci.yml"
|
|
has_dep_scanning = dependabot.exists()
|
|
if not has_dep_scanning and gitlab_ci.exists():
|
|
# Check if GitLab CI includes dependency scanning
|
|
ci_content = gitlab_ci.read_text()
|
|
has_dep_scanning = "dependency_scanning" in ci_content.lower()
|
|
if not has_dep_scanning:
|
|
self.add_info(
|
|
"THIRD-VULN-002",
|
|
"Consider enabling dependency scanning for security updates",
|
|
".gitlab-ci.yml (include dependency_scanning) or .github/dependabot.yml",
|
|
)
|
|
|
|
# Check for insecure package sources
|
|
pyproject = self.project_root / "pyproject.toml"
|
|
if pyproject.exists():
|
|
content = pyproject.read_text()
|
|
if "http://" in content and "https://" not in content:
|
|
self.add_error(
|
|
"THIRD-VEND-001",
|
|
"Only HTTPS sources allowed for packages",
|
|
str(pyproject),
|
|
)
|
|
|
|
# ==================
|
|
# DOCUMENTATION
|
|
# ==================
|
|
|
|
def _validate_documentation(self) -> None:
|
|
"""Validate documentation requirements."""
|
|
# Check README exists
|
|
readme_files = ["README.md", "README.rst", "README.txt"]
|
|
has_readme = any((self.project_root / f).exists() for f in readme_files)
|
|
if not has_readme:
|
|
self.add_error(
|
|
"DOC-PROJ-001",
|
|
"Project README required",
|
|
"README.md",
|
|
)
|
|
else:
|
|
# Check README has setup instructions
|
|
for readme in readme_files:
|
|
readme_path = self.project_root / readme
|
|
if readme_path.exists():
|
|
content = readme_path.read_text().lower()
|
|
has_setup = any(
|
|
term in content
|
|
for term in [
|
|
"install",
|
|
"setup",
|
|
"quick start",
|
|
"getting started",
|
|
]
|
|
)
|
|
if not has_setup:
|
|
self.add_warning(
|
|
"DOC-PROJ-002",
|
|
"README should include setup instructions",
|
|
str(readme_path),
|
|
)
|
|
break
|
|
|
|
# Check security policy exists
|
|
security_files = ["SECURITY.md", ".github/SECURITY.md"]
|
|
has_security = any((self.project_root / f).exists() for f in security_files)
|
|
if not has_security:
|
|
self.add_warning(
|
|
"DOC-SEC-001",
|
|
"Security policy (SECURITY.md) recommended",
|
|
"SECURITY.md",
|
|
)
|
|
|
|
# Check API documentation
|
|
docs_api = self.project_root / "docs" / "api"
|
|
if not docs_api.exists() or not list(docs_api.glob("*.md")):
|
|
self.add_warning(
|
|
"DOC-API-003",
|
|
"API documentation recommended",
|
|
"docs/api/",
|
|
)
|
|
|
|
# Check authentication documentation
|
|
auth_doc = self.project_root / "docs" / "api" / "authentication.md"
|
|
if not auth_doc.exists():
|
|
self.add_warning(
|
|
"DOC-SEC-002",
|
|
"Authentication documentation recommended",
|
|
"docs/api/authentication.md",
|
|
)
|
|
|
|
# Check architecture documentation
|
|
arch_docs = self.project_root / "docs" / "architecture"
|
|
if not arch_docs.exists() or not list(arch_docs.glob("*.md")):
|
|
self.add_warning(
|
|
"DOC-ARCH-001",
|
|
"Architecture documentation recommended",
|
|
"docs/architecture/",
|
|
)
|
|
|
|
# Check deployment documentation
|
|
deploy_doc = self.project_root / "docs" / "deployment" / "index.md"
|
|
if not deploy_doc.exists():
|
|
self.add_warning(
|
|
"DOC-OPS-001",
|
|
"Deployment documentation recommended",
|
|
"docs/deployment/index.md",
|
|
)
|
|
|
|
# ==================
|
|
# HELPERS
|
|
# ==================
|
|
|
|
def _check_forbidden_patterns(
|
|
self,
|
|
paths: list[str],
|
|
patterns: list[str],
|
|
rule_id: str,
|
|
message: str,
|
|
) -> None:
|
|
"""Check for forbidden patterns in files."""
|
|
for path_pattern in paths:
|
|
if "**" in path_pattern:
|
|
base, pattern = path_pattern.split("**", 1)
|
|
base_path = self.project_root / base.rstrip("/")
|
|
if base_path.exists():
|
|
files = base_path.rglob(pattern.lstrip("/"))
|
|
else:
|
|
continue
|
|
else:
|
|
files = [self.project_root / path_pattern]
|
|
|
|
for file in files:
|
|
if not file.exists() or not file.is_file():
|
|
continue
|
|
try:
|
|
content = file.read_text()
|
|
for pattern in patterns:
|
|
if re.search(pattern, content, re.IGNORECASE):
|
|
self.add_error(rule_id, message, str(file))
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: run the audit validator and return an exit code.

    Returns:
        0 when validation recorded no errors, 1 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Validate IT internal audit rules")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument(
        "--format",
        choices=["text", "json"],
        default="text",
        help="Output format",
    )
    # NOTE(review): the parsed flags are not consumed anywhere below; parsing
    # is kept for --help and argument validation. Wire --verbose/--format
    # into print_results or drop the options — TODO confirm intent.
    parser.parse_args()

    validator = AuditValidator()
    validator.load_rules()
    success = validator.validate()
    validator.print_results()

    return 0 if success else 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate the validator's exit code to the shell.
    raise SystemExit(main())
|