"""
Test suite for field-level pseudonymisation operations on unstructured data.

This test suite validates the pseudonymisation of unstructured text with PII
detection, covering the following Acceptance Criteria:

## Test Coverage Summary

### Acceptance Criteria Coverage:
- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests
- AC2 (Invalid Execution Handling): 5 tests
- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests
- AC4 (Execution Audit & Logging - Negative Scenario): 3 tests
- Additional Coverage: 3 tests

### Test Pattern:
- Each test uses build_op_context with config_to_dagster_dict_unstructured for configuration
- Tests validate dual outputs (data, metrics)
- Vault access is mocked for isolation
- Tests validate Scrubadub automatic PII detection
- Tests ensure placeholder replacement for unconfigured PII
"""

import re
from unittest.mock import MagicMock, patch

import pytest
from dagster import build_op_context

from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
    AnonymisePseudonymizeUnstructuredConfig,
    EncryptConfig,
    RetainConfig,
    PseudoTechniqueConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models import (
    PIIEntityEnum,
    LanguageEnum,
)
from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import (
    anonymize_pseudonymize_unstructured,
)

from .conftest import clear_vault_key


def config_to_dagster_dict_unstructured(config):
    """
    Convert unstructured config to Dagster format.

    Args:
        config: Config object exposing ``language`` (an enum) and
            ``used_function`` (an iterable of entries with a ``technique``).

    Returns:
        dict of the form
        ``{"language": <str>, "used_function": [{"technique": {<type>: {...}}}]}``
        where each inner dict is the technique's ``model_dump()`` without its
        ``type`` key, and any ``pii`` list holds enum member *names*.
    """
    config_dict = {"language": config.language.value, "used_function": []}

    for func_config in config.used_function:
        technique = func_config.technique
        technique_dict = technique.model_dump()

        # The op config is given enum member names rather than the dumped values.
        if "pii" in technique_dict:
            technique_dict["pii"] = [pii_enum.name for pii_enum in technique.pii]

        # The technique type becomes the selector key, so drop it from the payload.
        technique_dict_without_type = {k: v for k, v in technique_dict.items() if k != "type"}
        config_dict["used_function"].append(
            {"technique": {technique.type: technique_dict_without_type}}
        )

    return config_dict


def run_unstructured_op(config, text):
    """
    Helper to run unstructured pseudonymisation op.

    Args:
        config: Unstructured pseudonymisation config to convert into op config.
        text: Input text handed to the op.

    Returns:
        tuple: (result_text: str, metrics_markdown: str)
    """
    context = build_op_context(op_config=config_to_dagster_dict_unstructured(config))
    result_text, metrics = anonymize_pseudonymize_unstructured(context, text=text)
    # Extract actual values from Output objects
    return result_text.value, metrics.value


def parse_metrics_markdown(metrics_md: str) -> dict:
    """
    Parse markdown metrics into structured dict for easier testing.

    Table rows and technique bullets are accepted with or without a trailing
    newline, so the final entry is not lost when the markdown ends without one.

    Args:
        metrics_md: Markdown metrics string from op output

    Returns:
        dict with keys: total_pii_detected, pii_by_type, techniques_applied, language

    Raises:
        ValueError: If a two-column table row has a non-integer count.
    """
    result = {
        "total_pii_detected": 0,
        "pii_by_type": {},
        "techniques_applied": {},
        "language": "",
    }

    # Extract total PII detected
    total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md)
    if total_match:
        result["total_pii_detected"] = int(total_match.group(1))

    # Extract language
    lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md)
    if lang_match:
        result["language"] = lang_match.group(1)

    # Extract PII by type from table; `(?:\n|$)` lets the last row end the string.
    pii_table_section = re.search(
        r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+(?:\n|$))+)",
        metrics_md,
    )
    if pii_table_section:
        for line in pii_table_section.group(1).strip().split("\n"):
            parts = [p.strip() for p in line.split("|") if p.strip()]
            if len(parts) == 2:
                entity_type, count = parts
                result["pii_by_type"][entity_type] = int(count)

    # Extract techniques applied; same end-of-string tolerance as the table.
    techniques_section = re.search(
        r"### Techniques Applied\n((?:- \*\*[^\n]+(?:\n|$))+)", metrics_md
    )
    if techniques_section:
        for line in techniques_section.group(1).strip().split("\n"):
            tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", line)
            if tech_match:
                pii_type, technique = tech_match.groups()
                result["techniques_applied"][pii_type] = technique

    return result


# -------------------------------- Fixtures ----------------------------------------
@pytest.fixture
def sample_text_en():
    """Provide an English passage mentioning a name, email, phone, address and SSN."""
    text = """ John Smith works at Acme Corporation. His email is john.smith@example.com and his phone number is +1-555-123-4567. He lives in New York City at 123 Main Street, Apartment 4B. His SSN is 123-45-6789. """
    return text


@pytest.fixture
def sample_text_multi_person():
    """Provide a passage that names five different people."""
    text = """ The meeting included Alice Johnson, Bob Williams, and Charlie Brown. They discussed the project with Maria Garcia and David Wilson. """
    return text


@pytest.fixture
def sample_text_mixed_pii():
    """Provide a contact card with name, email, phone and URL for the AC1 tests."""
    text = """ Contact Information: Name: Dr. Emily Watson Email: emily.watson@hospital.com Phone: +44-20-7946-0958 Website: https://patient-portal.hospital.com/records """
    return text


@pytest.fixture
def encrypt_person_config():
    """Provide a config whose only technique encrypts PERSON entities."""
    encrypt_person = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_key",
    )
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )


@pytest.fixture
def retain_person_config():
    """Provide a config whose only technique retains PERSON entities."""
    retain_person = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=retain_person)],
    )


@pytest.fixture
def mixed_technique_config():
    """Provide a config that encrypts PERSON/EMAIL and retains PHONE_NUMBERS."""
    encrypt_identity = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_mixed_key",
    )
    retain_phone = RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS])
    # Order matters for readers of the metrics: encrypt first, then retain.
    techniques = [
        PseudoTechniqueConfig(technique=encrypt_identity),
        PseudoTechniqueConfig(technique=retain_phone),
    ]
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=techniques,
    )


# ================================================================================================
# AC1: Pseudonymisation and Retention Are
# Applied Correctly
# ================================================================================================

# Dotted prefix used as the patch target for names looked up in the ops module.
# NOTE(review): the op under test is imported via
# `template_code_location.field_level_pseudo_anonymisation.unstructured_ops`;
# confirm this shorter path resolves to the same module object, otherwise the
# patches below will not affect the op.
_OPS_MODULE = "field_level_pseudo_anonymisation.unstructured_ops"


def _encrypt_config(key_name, pii=(PIIEntityEnum.PERSON,), language=LanguageEnum.en):
    """Build a single-technique config that encrypts ``pii`` with ``key_name``."""
    return AnonymisePseudonymizeUnstructuredConfig(
        language=language,
        used_function=[
            PseudoTechniqueConfig(
                technique=EncryptConfig(
                    type="encrypt",
                    pii=list(pii),
                    key_name=key_name,
                )
            )
        ],
    )


def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config):
    """AC1: Test that configured PII types are encrypted correctly."""
    clear_vault_key("test_person_key")

    result_text, metrics_md = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii)
    metrics = parse_metrics_markdown(metrics_md)

    # Verify person name is encrypted (not in plaintext)
    assert "Emily Watson" not in result_text, "Configured PERSON PII should be encrypted"

    # Verify encryption token is present
    assert "{encrypt:" in result_text, "Encrypted token should be present in result"

    # Verify PII was detected and processed
    assert metrics["total_pii_detected"] > 0, "System should detect PII entities"
    assert "PERSON" in metrics["pii_by_type"], "PERSON type should be in detected PII"

    # Verify text structure is preserved (surrounding text intact)
    assert "Contact Information:" in result_text, "Non-PII text structure should be preserved"


def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person, retain_person_config):
    """AC1: Test that PII types marked for retention remain unchanged."""
    result_text, metrics_md = run_unstructured_op(retain_person_config, sample_text_multi_person)
    metrics = parse_metrics_markdown(metrics_md)

    # Verify retained PII types remain in plaintext
    assert "Alice Johnson" in result_text, "Retained PERSON PII should remain unchanged"
    assert "Bob Williams" in result_text, "Retained PERSON PII should remain unchanged"

    # Verify technique applied is 'retain'
    assert (
        "retain" in metrics["techniques_applied"].get("PERSON", "").lower()
    ), "Retain technique should be recorded for PERSON type"


def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii):
    """AC1: Test that unconfigured PII types are replaced with placeholders."""
    encrypt_person_only = _encrypt_config("test_person_only_key")
    clear_vault_key("test_person_only_key")

    result_text, _ = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)

    # Verify person is encrypted (configured)
    assert "Emily Watson" not in result_text, "Configured PERSON should be encrypted"

    # Verify unconfigured PII types have placeholders
    assert (
        "{{" in result_text and "}}" in result_text
    ), "Unconfigured PII should be replaced with placeholders"

    # Verify original unconfigured PII values are not in result
    assert (
        "emily.watson@hospital.com" not in result_text
    ), "Unconfigured EMAIL should be replaced with placeholder"

    # Verify placeholder format
    assert (
        "{{EMAIL}}" in result_text or "{{URL}}" in result_text
    ), "Placeholders should indicate entity type"


def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config):
    """AC1: Test that multiple techniques (encrypt, retain) are applied correctly."""
    clear_vault_key("test_mixed_key")

    result_text, metrics_md = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii)
    metrics = parse_metrics_markdown(metrics_md)

    # Verify encrypted PII types (PERSON, EMAIL)
    assert "Emily Watson" not in result_text, "Configured PERSON should be encrypted"
    assert "emily.watson@hospital.com" not in result_text, "Configured EMAIL should be encrypted"

    # Verify retained PII type (PHONE_NUMBERS)
    assert "+44-20-7946-0958" in result_text, "Configured PHONE_NUMBERS should be retained"

    # Verify metrics reflect different techniques
    assert (
        "encrypt" in metrics["techniques_applied"].get("PERSON", "").lower()
    ), "Encrypt technique should be applied to PERSON"
    assert (
        "encrypt" in metrics["techniques_applied"].get("EMAIL", "").lower()
    ), "Encrypt technique should be applied to EMAIL"
    assert (
        "retain" in metrics["techniques_applied"].get("PHONE_NUMBERS", "").lower()
    ), "Retain technique should be applied to PHONE_NUMBERS"


def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config):
    """AC1: Test that all instances of a configured PII type are processed."""
    clear_vault_key("test_person_key")

    result_text, metrics_md = run_unstructured_op(encrypt_person_config, sample_text_multi_person)
    metrics = parse_metrics_markdown(metrics_md)

    # Verify all person names are encrypted
    person_names = [
        "Alice Johnson",
        "Bob Williams",
        "Charlie Brown",
        "Maria Garcia",
        "David Wilson",
    ]
    for name in person_names:
        assert name not in result_text, f"All PERSON instances should be encrypted: {name}"

    # Verify metrics count multiple instances
    assert metrics["pii_by_type"].get("PERSON", 0) >= len(
        person_names
    ), f"Should detect at least {len(person_names)} PERSON entities"


def test_ac1_empty_text_returns_empty(encrypt_person_config):
    """AC1: Test that an empty text input is rejected with a ValueError.

    The test name is historical; the asserted behaviour is an error, not an
    empty return value.
    """
    clear_vault_key("test_person_key")

    with pytest.raises(ValueError) as exc_info:
        run_unstructured_op(encrypt_person_config, "")

    assert "empty" in str(exc_info.value).lower(), "Error should indicate empty input"


def test_ac1_text_without_pii_remains_unchanged():
    """AC1: Test that text without any PII remains unchanged after processing."""
    no_pii_text = """ The weather today is sunny with a high of 25 degrees Celsius. The conference starts at 9:00 AM in Room 301. """
    config = _encrypt_config("test_no_pii_key")
    clear_vault_key("test_no_pii_key")

    result_text, metrics_md = run_unstructured_op(config, no_pii_text)
    metrics = parse_metrics_markdown(metrics_md)

    assert result_text.strip() == no_pii_text.strip(), "Text without PII should remain unchanged"
    assert metrics["total_pii_detected"] == 0, "No PII should be detected"


def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii):
    """AC1: Test that placeholders for unconfigured PII indicate the entity type."""
    encrypt_person_only = _encrypt_config("test_placeholder_key")
    clear_vault_key("test_placeholder_key")

    result_text, metrics_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)
    metrics = parse_metrics_markdown(metrics_md)

    # Verify placeholder format (scrubadub uses {{TYPE}} format)
    placeholder_pattern = r"\{\{[A-Z_]+\}\}"
    placeholders = re.findall(placeholder_pattern, result_text)
    assert (
        len(placeholders) > 0
    ), "Result should contain entity-type placeholders for unconfigured PII"

    # Verify metrics track which PII types were detected
    assert len(metrics["pii_by_type"]) > 0, "Metrics should list detected PII types"


# ================================================================================================
# AC2: Invalid Execution Handling
# ================================================================================================


def test_ac2_graceful_abort_on_scrubadub_failure():
    """AC2: Test graceful abort when the PII detection engine (Scrubadub) fails."""
    text = "Test user John Smith with email john@example.com"
    config = _encrypt_config("test_abort_key")
    clear_vault_key("test_abort_key")

    # Mock Scrubadub to fail at the right import path
    with patch(f"{_OPS_MODULE}.scrubadub.Scrubber") as mock_scrubber_class:
        mock_scrubber = MagicMock()
        mock_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error")
        mock_scrubber_class.return_value = mock_scrubber

        with pytest.raises(RuntimeError) as exc_info:
            run_unstructured_op(config, text)

    error_msg = str(exc_info.value).lower()
    assert (
        "pii" in error_msg
        or "detection" in error_msg
        or "scrubadub" in error_msg
        or "failed" in error_msg
    ), "Error message should indicate PII detection failure"


def test_ac2_graceful_abort_on_encryption_failure(sample_text_en):
    """AC2: Test graceful abort when an encryption technique fails during execution."""
    config = _encrypt_config("test_encrypt_fail_key")
    clear_vault_key("test_encrypt_fail_key")

    # Mock encrypt function at correct path - it's imported from techniques module
    encrypt_path = (
        "field_level_pseudo_anonymisation"
        ".techniques.anonymisation_pseudonymisation_techniques.encrypt"
    )
    with patch(encrypt_path, side_effect=Exception("Encryption algorithm failure")):
        with pytest.raises(RuntimeError) as exc_info:
            run_unstructured_op(config, sample_text_en)

    error_msg = str(exc_info.value).lower()
    assert (
        "encrypt" in error_msg or "failed" in error_msg or "technique" in error_msg
    ), "Error message should indicate encryption failure"


def test_ac2_null_text_input_raises_error(encrypt_person_config):
    """AC2: Test that a null (None) text input is rejected with an error."""
    clear_vault_key("test_person_key")

    # Dagster will raise DagsterTypeCheckDidNotPass before op executes
    from dagster import DagsterTypeCheckDidNotPass

    with pytest.raises((ValueError, DagsterTypeCheckDidNotPass, TypeError)):
        run_unstructured_op(encrypt_person_config, None)


def test_ac2_invalid_language_configuration():
    """AC2: Test that an unsupported language in the config raises a validation error."""
    # This should fail at config creation due to Pydantic validation
    with pytest.raises((ValueError, TypeError)):
        _encrypt_config("test_key", language="invalid_lang")


def test_ac2_very_large_text_processing():
    """AC2: Test that very large text inputs are processed successfully without memory errors."""
    # Create large text with repeated PII patterns
    large_text = (
        """ John Smith works at company. Email: john.smith@example.com. """ * 1000
    )  # ~60KB of text with repeated PII

    config = _encrypt_config(
        "test_large_text_key",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
    )
    clear_vault_key("test_large_text_key")

    result_text, metrics_md = run_unstructured_op(config, large_text)
    metrics = parse_metrics_markdown(metrics_md)

    # Verify processing completed
    assert result_text is not None, "Large text should be processed successfully"
    assert len(result_text) > 0, "Result should not be empty"
    assert metrics["total_pii_detected"] > 0, "PII should be detected in large text"


# ================================================================================================
# AC3: Execution Audit & Logging - Positive Scenario
# ================================================================================================


def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config):
    """AC3: Test that successful execution context contains a run ID for logging."""
    clear_vault_key("test_person_key")

    op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config)
    context = build_op_context(op_config=op_config_dict)

    # Capture run context
    run_id = context.run_id

    # Execute operation
    result_text, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_en)

    # Verify run identifier is available for logging
    assert run_id is not None, "Run ID must be available for audit logging"

    # Verify outputs are returned (for Dagster to log)
    assert result_text is not None, "Result text should be available for logging"
    assert metrics is not None, "Metrics should be available for logging"


def test_ac3_successful_execution_logs_configuration_parameters(
    sample_text_en, mixed_technique_config
):
    """AC3: Test that the used configuration is accessible for logging on success."""
    clear_vault_key("test_mixed_key")

    op_config_dict = config_to_dagster_dict_unstructured(mixed_technique_config)
    context = build_op_context(op_config=op_config_dict)

    _, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_en)

    # Verify configuration is captured and accessible
    assert "used_function" in op_config_dict, "Configuration must be accessible for logging"
    assert len(op_config_dict["used_function"]) == 2, "Multiple techniques should be captured"

    # Verify techniques are logged
    techniques = [func["technique"] for func in op_config_dict["used_function"]]
    assert any(
        "encrypt" in str(tech) for tech in techniques
    ), "Encrypt technique should be in configuration"
    assert any(
        "retain" in str(tech) for tech in techniques
    ), "Retain technique should be in configuration"

    # Verify metrics contain technique information (in markdown string)
    metrics_str = metrics.value
    assert (
        "Techniques Applied" in metrics_str
    ), "Applied techniques should be in metrics for logging"


def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config):
    """AC3: Test that logs and metrics from a successful run do not contain raw PII."""
    clear_vault_key("test_person_key")

    op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config)
    context = build_op_context(op_config=op_config_dict)

    _, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_mixed_pii)

    # Verify raw PII values are not in metrics
    metrics_str = metrics.value
    sensitive_values = ["Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958"]
    for pii_value in sensitive_values:
        assert (
            pii_value not in metrics_str
        ), f"Raw PII value should not appear in metrics: {pii_value}"

    # Verify configuration logs do not contain raw PII
    config_str = str(op_config_dict)
    for pii_value in sensitive_values:
        assert (
            pii_value not in config_str
        ), f"Raw PII value should not appear in configuration logs: {pii_value}"


# ================================================================================================
# AC4: Execution Audit & Logging - Negative Scenario
# ================================================================================================


def test_ac4_failed_execution_logs_error_details():
    """AC4: Negative execution should surface clear error details (encryption key failure)."""
    text = "Test user John Smith"
    config = _encrypt_config("test_fail_log_key")
    clear_vault_key("test_fail_log_key")

    ctx = build_op_context(op_config=config_to_dagster_dict_unstructured(config))

    # Patch the key retrieval used inside unstructured_ops to force failure
    with patch(
        f"{_OPS_MODULE}.create_get_encryption_key",
        side_effect=RuntimeError("Encryption key retrieval failed"),
    ):
        with pytest.raises(RuntimeError) as exc_info:
            # Consume the generator to trigger execution and raise the exception
            list(anonymize_pseudonymize_unstructured(ctx, text=text))

    msg = str(exc_info.value).lower()
    assert "key" in msg and "failed" in msg, "Error message should mention key failure"


def test_ac4_failed_execution_logs_configuration_used():
    """AC4: Test that the attempted configuration is available for logging on failure."""
    text = "Test data with person John Doe"
    config = _encrypt_config("test_config_fail_key")
    clear_vault_key("test_config_fail_key")

    op_config_dict = config_to_dagster_dict_unstructured(config)
    context = build_op_context(op_config=op_config_dict)

    # Mock _initialize_scrubber to fail
    with patch(
        f"{_OPS_MODULE}._initialize_scrubber",
        side_effect=Exception("Scrubber module not available"),
    ):
        # Exception already covers RuntimeError, so the raw mock error and a
        # wrapped RuntimeError are both accepted here.
        with pytest.raises(Exception) as exc_info:
            list(anonymize_pseudonymize_unstructured(context, text=text))

    # Verify configuration is still accessible despite failure
    assert op_config_dict is not None, "Configuration must be accessible for failure audit"
    assert (
        "used_function" in op_config_dict
    ), "Technique configuration should be available for diagnosis"

    # Verify error was raised with proper message
    error_msg = str(exc_info.value).lower()
    assert (
        "pii" in error_msg
        or "detection" in error_msg
        or "failed" in error_msg
        or "scrubber" in error_msg
        or "module" in error_msg
    ), "Error should indicate detection/processing failed"


def test_ac4_failed_execution_logs_failure_reason():
    """AC4: Test that the reason for a failure is clearly indicated in the error message."""
    text = "User: Alice Smith, Email: alice@example.com"
    config = _encrypt_config(
        "test_failure_reason_key",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
    )
    clear_vault_key("test_failure_reason_key")

    # Mock key retrieval function to fail
    with patch(
        f"{_OPS_MODULE}.create_get_encryption_key",
        side_effect=RuntimeError("Vault connection timeout"),
    ):
        with pytest.raises(RuntimeError) as exc_info:
            run_unstructured_op(config, text)

    # Verify failure reason is in error message
    error_msg = str(exc_info.value).lower()
    assert (
        "encrypt" in error_msg
        or "key" in error_msg
        or "timeout" in error_msg
        or "failed" in error_msg
    ), "Error should indicate key retrieval/encryption failure"


# ================================================================================================
# Additional Tests - Edge Cases and Integration
# ================================================================================================


def test_multi_language_support_italian():
    """Additional test: Verify that Italian text is processed correctly."""
    italian_text = """ Il dottor Marco Rossi lavora presso l'ospedale. Email: marco.rossi@ospedale.it Telefono: +39-06-12345678 """
    config = _encrypt_config("test_italian_key", language=LanguageEnum.it)
    clear_vault_key("test_italian_key")

    result_text, metrics_md = run_unstructured_op(config, italian_text)
    metrics = parse_metrics_markdown(metrics_md)

    # Verify processing occurred
    assert result_text != italian_text, "Italian text should be processed"
    assert metrics["total_pii_detected"] > 0, "PII should be detected in Italian text"


def test_special_characters_in_text():
    """Additional test: Verify handling of text with special Unicode characters."""
    special_text = """ User: João da Silva 🇧🇷 Email: joão@empresa.com.br Message: "Hello, World!" — Testing special chars: €, £, ¥, ©, ® """
    config = _encrypt_config(
        "test_special_chars_key",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        language=LanguageEnum.pt,
    )
    clear_vault_key("test_special_chars_key")

    result_text, _ = run_unstructured_op(config, special_text)

    # Verify processing completed without encoding errors
    assert result_text is not None, "Special characters should not cause processing failure"
    assert len(result_text) > 0, "Result should not be empty"


def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config):
    """Additional test: Verify encryption format consistency across runs.

    Only the token format and count are compared between the two runs; the
    token values themselves are not required to be equal.
    """
    clear_vault_key("test_person_key")

    result1, metrics_md1 = run_unstructured_op(encrypt_person_config, sample_text_en)
    result2, metrics_md2 = run_unstructured_op(encrypt_person_config, sample_text_en)

    # Both should have encryption tokens
    assert "{encrypt:" in result1, "First run should produce encrypted tokens"
    assert "{encrypt:" in result2, "Second run should produce encrypted tokens"

    # Verify consistent PII detection
    metrics1 = parse_metrics_markdown(metrics_md1)
    metrics2 = parse_metrics_markdown(metrics_md2)
    assert (
        metrics1["total_pii_detected"] == metrics2["total_pii_detected"]
    ), "PII detection should be consistent across runs"

    # Verify token format is consistent (Fernet base64 pattern)
    token_pattern = r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}"
    tokens1 = re.findall(token_pattern, result1)
    tokens2 = re.findall(token_pattern, result2)
    assert len(tokens1) == len(tokens2), "Same number of encryption tokens should be generated"