feat(SIMPL-24642): migrate tests from 3 source repos with updated imports

2026-04-24 18:42:07 +02:00
parent 4e0b216410
commit d14b2dfac4
26 changed files with 6280 additions and 0 deletions
--- a/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py
+++ b/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py
@@ -0,0 +1,853 @@
+"""
+Test suite for field-level pseudonymisation operations on unstructured data.
+
+This test suite validates the pseudonymisation of unstructured text with PII detection,
+covering the following Acceptance Criteria:
+
+## Test Coverage Summary
+
+### Acceptance Criteria Coverage:
+- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests
+- AC2 (Invalid Execution Handling): 5 tests
+- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests
+- AC4 (Execution Audit & Logging - Negative Scenario): 3 tests
+- Additional Coverage: 3 tests
+
+### Test Pattern:
+- Each test uses build_op_context with config_to_dagster_dict_unstructured for configuration
+- Tests validate dual outputs (data, metrics)
+- Vault access is mocked for isolation
+- Tests validate Scrubadub automatic PII detection
+- Tests ensure placeholder replacement for unconfigured PII
+"""
+
+import pytest
+import re
+from dagster import build_op_context
+from unittest.mock import patch, MagicMock
+
+from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
+    AnonymisePseudonymizeUnstructuredConfig,
+    EncryptConfig,
+    RetainConfig,
+    PseudoTechniqueConfig,
+)
+from template_code_location.field_level_pseudo_anonymisation.config_models import PIIEntityEnum, LanguageEnum
+from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import (
+    anonymize_pseudonymize_unstructured,
+)
+
+from .conftest import clear_vault_key
+
+
+def config_to_dagster_dict_unstructured(config):
+    """Translate an unstructured-op config object into a Dagster ``op_config`` dict.
+
+    Each entry of ``config.used_function`` becomes
+    ``{"technique": {<technique.type>: <dumped fields without "type">}}``;
+    a dumped ``pii`` field is replaced by the enum member names.
+    """
+
+    def _as_selector(technique):
+        # The technique type is used as the selector key, so it is left out of the payload.
+        technique_type = technique.type
+        payload = {}
+        for field_name, field_value in technique.model_dump().items():
+            if field_name == "type":
+                continue
+            if field_name == "pii":
+                # Emit enum member names instead of the dumped values.
+                field_value = [member.name for member in technique.pii]
+            payload[field_name] = field_value
+        return {"technique": {technique_type: payload}}
+
+    language = config.language.value
+    entries = [_as_selector(item.technique) for item in config.used_function]
+    return {"language": language, "used_function": entries}
+
+
+def run_unstructured_op(config, text):
+    """
+    Invoke the unstructured pseudonymisation op once and unwrap its two outputs.
+
+    Args:
+        config: Config object accepted by ``config_to_dagster_dict_unstructured``.
+        text: Input text handed to the op.
+
+    Returns:
+        tuple: (``.value`` of the first output, ``.value`` of the second output),
+        i.e. the processed text and the metrics markdown.
+    """
+    op_config = config_to_dagster_dict_unstructured(config)
+    op_context = build_op_context(op_config=op_config)
+
+    # The op hands back two Output objects; callers only need their payloads.
+    text_output, metrics_output = anonymize_pseudonymize_unstructured(op_context, text=text)
+    return text_output.value, metrics_output.value
+
+
+def parse_metrics_markdown(metrics_md: str) -> dict:
+    """
+    Parse markdown metrics into structured dict for easier testing.
+
+    Sections that are missing from the markdown keep their default value
+    (``0``, ``{}`` or ``""``). The last table row / technique bullet is parsed
+    even when the markdown does not end with a newline.
+
+    Args:
+        metrics_md: Markdown metrics string from op output
+
+    Returns:
+        dict with keys: total_pii_detected, pii_by_type, techniques_applied, language
+
+    Raises:
+        ValueError: if a two-column table row has a non-integer count.
+    """
+    result = {
+        "total_pii_detected": 0,
+        "pii_by_type": {},
+        "techniques_applied": {},
+        "language": "",
+    }
+
+    # Extract total PII detected
+    total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md)
+    if total_match:
+        result["total_pii_detected"] = int(total_match.group(1))
+
+    # Extract language
+    lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md)
+    if lang_match:
+        result["language"] = lang_match.group(1)
+
+    # Extract PII by type from table. Each data row may be terminated by a
+    # newline or by the end of the string, so a final row is not lost.
+    pii_table_section = re.search(
+        r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+(?:\n|\Z))+)",
+        metrics_md,
+    )
+    if pii_table_section:
+        for line in pii_table_section.group(1).strip().split("\n"):
+            # Splitting on "|" leaves empty edge cells; keep only the real ones.
+            parts = [p.strip() for p in line.split("|") if p.strip()]
+            if len(parts) == 2:
+                entity_type, count = parts
+                result["pii_by_type"][entity_type] = int(count)
+
+    # Extract techniques applied (same end-of-string tolerance as the table).
+    techniques_section = re.search(
+        r"### Techniques Applied\n((?:- \*\*[^\n]+(?:\n|\Z))+)", metrics_md
+    )
+    if techniques_section:
+        for line in techniques_section.group(1).strip().split("\n"):
+            tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", line)
+            if tech_match:
+                pii_type, technique = tech_match.groups()
+                result["techniques_applied"][pii_type] = technique
+
+    return result
+
+
+# -------------------------------- Fixtures ----------------------------------------
+
+
+@pytest.fixture
+def sample_text_en():
+    """Provide an English paragraph with a name, email, phone number, address and SSN."""
+    paragraph = """
+    John Smith works at Acme Corporation. His email is john.smith@example.com
+    and his phone number is +1-555-123-4567. He lives in New York City at
+    123 Main Street, Apartment 4B. His SSN is 123-45-6789.
+    """
+    return paragraph
+
+
+@pytest.fixture
+def sample_text_multi_person():
+    """Provide a short text that mentions five different person names."""
+    paragraph = """
+    The meeting included Alice Johnson, Bob Williams, and Charlie Brown.
+    They discussed the project with Maria Garcia and David Wilson.
+    """
+    return paragraph
+
+
+@pytest.fixture
+def sample_text_mixed_pii():
+    """Provide a contact card with a name, email, phone number and URL (used by AC1 tests)."""
+    contact_card = """
+    Contact Information:
+    Name: Dr. Emily Watson
+    Email: emily.watson@hospital.com
+    Phone: +44-20-7946-0958
+    Website: https://patient-portal.hospital.com/records
+    """
+    return contact_card
+
+
+@pytest.fixture
+def encrypt_person_config():
+    """Provide an English config that encrypts PERSON entities with ``test_person_key``."""
+    person_encryption = EncryptConfig(
+        type="encrypt",
+        pii=[PIIEntityEnum.PERSON],
+        key_name="test_person_key",
+    )
+    return AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
+    )
+
+
+@pytest.fixture
+def retain_person_config():
+    """Provide an English config that applies the ``retain`` technique to PERSON entities."""
+    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
+    return AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[PseudoTechniqueConfig(technique=person_retention)],
+    )
+
+
+@pytest.fixture
+def mixed_technique_config():
+    """Provide an English config: encrypt PERSON and EMAIL, retain PHONE_NUMBERS (AC1)."""
+    encrypt_step = PseudoTechniqueConfig(
+        technique=EncryptConfig(
+            type="encrypt",
+            pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
+            key_name="test_mixed_key",
+        )
+    )
+    retain_step = PseudoTechniqueConfig(
+        technique=RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS])
+    )
+    return AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[encrypt_step, retain_step],
+    )
+
+
+# ================================================================================================
+# AC1: Pseudonymisation and Retention Are Applied Correctly
+# ================================================================================================
+
+
+def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config):
+    """AC1: A PERSON entity configured for encryption is replaced by an encrypt token."""
+    clear_vault_key("test_person_key")
+
+    output, raw_metrics = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii)
+    parsed = parse_metrics_markdown(raw_metrics)
+
+    # The plaintext name must be gone ...
+    assert "Emily Watson" not in output, "Configured PERSON PII should be encrypted"
+
+    # ... and an encrypt token must be present.
+    assert "{encrypt:" in output, "Encrypted token should be present in result"
+
+    # The metrics must report the detection, including the PERSON type.
+    assert parsed["total_pii_detected"] > 0, "System should detect PII entities"
+    assert "PERSON" in parsed["pii_by_type"], "PERSON type should be in detected PII"
+
+    # Text around the PII must be left alone.
+    assert "Contact Information:" in output, "Non-PII text structure should be preserved"
+
+
+def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person):
+    """AC1: PERSON entities configured with ``retain`` stay in plaintext."""
+    keep_person = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
+    retain_config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[PseudoTechniqueConfig(technique=keep_person)],
+    )
+
+    output, raw_metrics = run_unstructured_op(retain_config, sample_text_multi_person)
+    parsed = parse_metrics_markdown(raw_metrics)
+
+    # Retained names must still be readable in the output.
+    assert "Alice Johnson" in output, "Retained PERSON PII should remain unchanged"
+    assert "Bob Williams" in output, "Retained PERSON PII should remain unchanged"
+
+    # The metrics must attribute the retain technique to PERSON.
+    person_technique = parsed["techniques_applied"].get("PERSON", "").lower()
+    assert "retain" in person_technique, "Retain technique should be recorded for PERSON type"
+
+
+def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii):
+    """AC1: PII types without a configured technique end up as ``{{TYPE}}`` placeholders."""
+    person_encryption = EncryptConfig(
+        type="encrypt",
+        pii=[PIIEntityEnum.PERSON],
+        key_name="test_person_only_key",
+    )
+    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
+    )
+
+    clear_vault_key("test_person_only_key")
+
+    # Only the processed text matters here; the metrics output is ignored.
+    output, _ = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)
+
+    # PERSON is configured, so the name must not survive.
+    assert "Emily Watson" not in output, "Configured PERSON should be encrypted"
+
+    # Placeholder delimiters must show up for the unconfigured types.
+    has_placeholder_delimiters = "{{" in output and "}}" in output
+    assert has_placeholder_delimiters, "Unconfigured PII should be replaced with placeholders"
+
+    # The raw email (EMAIL is not configured) must be gone.
+    email_removed = "emily.watson@hospital.com" not in output
+    assert email_removed, "Unconfigured EMAIL should be replaced with placeholder"
+
+    # At least one of the expected typed placeholders must be present.
+    typed_placeholder_present = "{{EMAIL}}" in output or "{{URL}}" in output
+    assert typed_placeholder_present, "Placeholders should indicate entity type"
+
+
+def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config):
+    """AC1: Encrypt (PERSON, EMAIL) and retain (PHONE_NUMBERS) are each applied to their types."""
+    clear_vault_key("test_mixed_key")
+
+    output, raw_metrics = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii)
+    applied = parse_metrics_markdown(raw_metrics)["techniques_applied"]
+
+    # Encrypted types must no longer appear in plaintext.
+    assert "Emily Watson" not in output, "Configured PERSON should be encrypted"
+    assert "emily.watson@hospital.com" not in output, "Configured EMAIL should be encrypted"
+
+    # The retained phone number must be left as-is.
+    assert "+44-20-7946-0958" in output, "Configured PHONE_NUMBERS should be retained"
+
+    # The metrics must record the technique used for each type.
+    person_technique = applied.get("PERSON", "").lower()
+    assert "encrypt" in person_technique, "Encrypt technique should be applied to PERSON"
+    email_technique = applied.get("EMAIL", "").lower()
+    assert "encrypt" in email_technique, "Encrypt technique should be applied to EMAIL"
+    phone_technique = applied.get("PHONE_NUMBERS", "").lower()
+    assert "retain" in phone_technique, "Retain technique should be applied to PHONE_NUMBERS"
+
+
+def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config):
+    """AC1: Every occurrence of a configured PII type is processed, not just the first."""
+    clear_vault_key("test_person_key")
+
+    output, raw_metrics = run_unstructured_op(encrypt_person_config, sample_text_multi_person)
+    parsed = parse_metrics_markdown(raw_metrics)
+
+    # None of the five names from the fixture may remain in plaintext.
+    person_names = [
+        "Alice Johnson",
+        "Bob Williams",
+        "Charlie Brown",
+        "Maria Garcia",
+        "David Wilson",
+    ]
+    for name in person_names:
+        assert name not in output, f"All PERSON instances should be encrypted: {name}"
+
+    # The PERSON count in the metrics must cover all of them.
+    detected_persons = parsed["pii_by_type"].get("PERSON", 0)
+    assert detected_persons >= len(
+        person_names
+    ), f"Should detect at least {len(person_names)} PERSON entities"
+
+
+def test_ac1_empty_text_returns_empty(encrypt_person_config):
+    """AC1: An empty input string is rejected with a ValueError that mentions "empty"."""
+    clear_vault_key("test_person_key")
+
+    with pytest.raises(ValueError) as raised:
+        run_unstructured_op(encrypt_person_config, "")
+
+    message = str(raised.value).lower()
+    assert "empty" in message, "Error should indicate empty input"
+
+
+def test_ac1_text_without_pii_remains_unchanged():
+    """AC1: Text that contains no PII comes back unchanged and reports zero detections."""
+    no_pii_text = """
+    The weather today is sunny with a high of 25 degrees Celsius.
+    The conference starts at 9:00 AM in Room 301.
+    """
+
+    person_encryption = EncryptConfig(
+        type="encrypt",
+        pii=[PIIEntityEnum.PERSON],
+        key_name="test_no_pii_key",
+    )
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
+    )
+
+    clear_vault_key("test_no_pii_key")
+
+    output, raw_metrics = run_unstructured_op(config, no_pii_text)
+    parsed = parse_metrics_markdown(raw_metrics)
+
+    # Compare ignoring leading/trailing whitespace only.
+    assert output.strip() == no_pii_text.strip(), "Text without PII should remain unchanged"
+    assert parsed["total_pii_detected"] == 0, "No PII should be detected"
+
+
+def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii):
+    """AC1: Placeholders for unconfigured PII follow the upper-case ``{{TYPE}}`` shape."""
+    person_encryption = EncryptConfig(
+        type="encrypt",
+        pii=[PIIEntityEnum.PERSON],
+        key_name="test_placeholder_key",
+    )
+    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
+    )
+
+    clear_vault_key("test_placeholder_key")
+
+    output, raw_metrics = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)
+    parsed = parse_metrics_markdown(raw_metrics)
+
+    # Collect every {{UPPER_CASE}} placeholder in the output.
+    placeholders = re.findall(r"\{\{[A-Z_]+\}\}", output)
+    assert (
+        len(placeholders) > 0
+    ), "Result should contain entity-type placeholders for unconfigured PII"
+
+    # The metrics must name at least one detected PII type.
+    detected_types = parsed["pii_by_type"]
+    assert len(detected_types) > 0, "Metrics should list detected PII types"
+
+
+# ================================================================================================
+# AC2: Invalid Execution Handling
+# ================================================================================================
+
+
+def test_ac2_graceful_abort_on_scrubadub_failure():
+    """AC2: Test graceful abort when the PII detection engine (Scrubadub) fails.
+
+    ``Scrubber`` is replaced by a mock whose ``iter_filth`` raises; the op is
+    expected to surface a ``RuntimeError`` whose message points at the failure.
+    """
+    text = "Test user John Smith with email john@example.com"
+
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[
+            PseudoTechniqueConfig(
+                technique=EncryptConfig(
+                    type="encrypt",
+                    pii=[PIIEntityEnum.PERSON],
+                    key_name="test_abort_key",
+                )
+            )
+        ],
+    )
+
+    clear_vault_key("test_abort_key")
+
+    # Patch through the same package path the op is imported from at the top of
+    # this file (template_code_location.…). The pre-migration path refers to a
+    # different (or missing) module, so the mock would not reach the op under test.
+    with patch(
+        "template_code_location.field_level_pseudo_anonymisation.unstructured_ops.scrubadub.Scrubber"
+    ) as mock_scrubber_class:
+        mock_scrubber = MagicMock()
+        mock_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error")
+        mock_scrubber_class.return_value = mock_scrubber
+
+        with pytest.raises(RuntimeError) as exc_info:
+            run_unstructured_op(config, text)
+
+        # Any of these keywords is accepted as a sign of a detection failure.
+        error_msg = str(exc_info.value).lower()
+        assert (
+            "pii" in error_msg
+            or "detection" in error_msg
+            or "scrubadub" in error_msg
+            or "failed" in error_msg
+        ), "Error message should indicate PII detection failure"
+
+
+def test_ac2_graceful_abort_on_encryption_failure(sample_text_en):
+    """AC2: Test graceful abort when an encryption technique fails during execution.
+
+    The ``encrypt`` technique is replaced by a mock that raises; the op is
+    expected to surface a ``RuntimeError`` describing the failure.
+    """
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[
+            PseudoTechniqueConfig(
+                technique=EncryptConfig(
+                    type="encrypt",
+                    pii=[PIIEntityEnum.PERSON],
+                    key_name="test_encrypt_fail_key",
+                )
+            )
+        ],
+    )
+
+    clear_vault_key("test_encrypt_fail_key")
+
+    # Patch through the same package path the op is imported from at the top of
+    # this file (template_code_location.…); the pre-migration path would target
+    # a different (or missing) module.
+    # NOTE(review): this patches ``encrypt`` in the techniques module. If
+    # unstructured_ops binds it via ``from ... import encrypt``, the target
+    # should be the unstructured_ops attribute instead — confirm.
+    encrypt_path = (
+        "template_code_location.field_level_pseudo_anonymisation"
+        ".techniques.anonymisation_pseudonymisation_techniques.encrypt"
+    )
+    with patch(encrypt_path) as mock_encrypt:
+        mock_encrypt.side_effect = Exception("Encryption algorithm failure")
+
+        with pytest.raises(RuntimeError) as exc_info:
+            run_unstructured_op(config, sample_text_en)
+
+        # Any of these keywords is accepted as a sign of an encryption failure.
+        error_msg = str(exc_info.value).lower()
+        assert (
+            "encrypt" in error_msg or "failed" in error_msg or "technique" in error_msg
+        ), "Error message should indicate encryption failure"
+
+
+def test_ac2_null_text_input_raises_error(encrypt_person_config):
+    """AC2: Passing ``None`` as the text input is rejected with an error."""
+    clear_vault_key("test_person_key")
+
+    # Dagster's type check may reject the input before the op body runs, so its
+    # exception is accepted alongside ValueError and TypeError.
+    from dagster import DagsterTypeCheckDidNotPass
+
+    accepted_errors = (ValueError, DagsterTypeCheckDidNotPass, TypeError)
+    with pytest.raises(accepted_errors):
+        run_unstructured_op(encrypt_person_config, None)
+
+
+def test_ac2_invalid_language_configuration():
+    """AC2: Building a config with an unsupported language value raises a validation error."""
+    accepted_errors = (ValueError, TypeError)
+
+    # The whole construction stays inside the context manager, as the failure is
+    # expected at config creation time (Pydantic validation).
+    with pytest.raises(accepted_errors):
+        technique = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name="test_key")
+        AnonymisePseudonymizeUnstructuredConfig(
+            language="invalid_lang",  # not a LanguageEnum value
+            used_function=[PseudoTechniqueConfig(technique=technique)],
+        )
+
+
+def test_ac2_very_large_text_processing():
+    """AC2: A large input (one PII-bearing snippet repeated 1000 times) is processed."""
+    snippet = """
+    John Smith works at company. Email: john.smith@example.com.
+    """
+    # Repeating the snippet yields tens of kilobytes of text with recurring PII.
+    large_text = snippet * 1000
+
+    encryption = EncryptConfig(
+        type="encrypt",
+        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
+        key_name="test_large_text_key",
+    )
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[PseudoTechniqueConfig(technique=encryption)],
+    )
+
+    clear_vault_key("test_large_text_key")
+
+    output, raw_metrics = run_unstructured_op(config, large_text)
+    parsed = parse_metrics_markdown(raw_metrics)
+
+    # The run must complete with a non-empty result and at least one detection.
+    assert output is not None, "Large text should be processed successfully"
+    assert len(output) > 0, "Result should not be empty"
+    assert parsed["total_pii_detected"] > 0, "PII should be detected in large text"
+
+
+# ================================================================================================
+# AC3: Execution Audit & Logging - Positive Scenario
+# ================================================================================================
+
+
+def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config):
+    """AC3: On success the op context exposes a run ID and both outputs are returned."""
+    clear_vault_key("test_person_key")
+
+    context = build_op_context(
+        op_config=config_to_dagster_dict_unstructured(encrypt_person_config)
+    )
+
+    # Read the run id up front, before the op is invoked.
+    run_id = context.run_id
+
+    text_output, metrics_output = anonymize_pseudonymize_unstructured(
+        context, text=sample_text_en
+    )
+
+    # A run identifier must exist for audit logging.
+    assert run_id is not None, "Run ID must be available for audit logging"
+
+    # Both outputs must be handed back so they can be logged.
+    assert text_output is not None, "Result text should be available for logging"
+    assert metrics_output is not None, "Metrics should be available for logging"
+
+
+def test_ac3_successful_execution_logs_configuration_parameters(
+    sample_text_en, mixed_technique_config
+):
+    """AC3: After a successful run the op config and the applied techniques are inspectable."""
+    clear_vault_key("test_mixed_key")
+
+    op_config_dict = config_to_dagster_dict_unstructured(mixed_technique_config)
+    context = build_op_context(op_config=op_config_dict)
+
+    # Only the metrics output is inspected below.
+    _, metrics_output = anonymize_pseudonymize_unstructured(context, text=sample_text_en)
+
+    # The config dict must carry both configured techniques.
+    assert "used_function" in op_config_dict, "Configuration must be accessible for logging"
+    assert len(op_config_dict["used_function"]) == 2, "Multiple techniques should be captured"
+
+    # Each technique name must appear somewhere in the technique entries.
+    technique_reprs = [str(entry["technique"]) for entry in op_config_dict["used_function"]]
+    assert any(
+        "encrypt" in rendered for rendered in technique_reprs
+    ), "Encrypt technique should be in configuration"
+    assert any(
+        "retain" in rendered for rendered in technique_reprs
+    ), "Retain technique should be in configuration"
+
+    # The metrics markdown must contain the techniques section heading.
+    metrics_markdown = metrics_output.value
+    assert (
+        "Techniques Applied" in metrics_markdown
+    ), "Applied techniques should be in metrics for logging"
+
+
+def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config):
+    """AC3: Neither the metrics markdown nor the op config contains raw PII values."""
+    clear_vault_key("test_person_key")
+
+    op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config)
+    context = build_op_context(op_config=op_config_dict)
+
+    # Only the metrics output is inspected below.
+    _, metrics_output = anonymize_pseudonymize_unstructured(context, text=sample_text_mixed_pii)
+    metrics_markdown = metrics_output.value
+
+    # Raw values taken from the input fixture.
+    sensitive_values = ["Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958"]
+
+    # None of them may leak into the metrics ...
+    for pii_value in sensitive_values:
+        assert (
+            pii_value not in metrics_markdown
+        ), f"Raw PII value should not appear in metrics: {pii_value}"
+
+    # ... nor into the stringified configuration.
+    rendered_config = str(op_config_dict)
+    for pii_value in sensitive_values:
+        assert (
+            pii_value not in rendered_config
+        ), f"Raw PII value should not appear in configuration logs: {pii_value}"
+
+
+# ================================================================================================
+# AC4: Execution Audit & Logging - Negative Scenario
+# ================================================================================================
+
+
+def test_ac4_failed_execution_logs_error_details():
+    """AC4: Negative execution should surface clear error details (encryption key failure).
+
+    Key retrieval is patched to raise; the resulting ``RuntimeError`` message is
+    expected to mention both "key" and "failed".
+    """
+    text = "Test user John Smith"
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[
+            PseudoTechniqueConfig(
+                technique=EncryptConfig(
+                    type="encrypt",
+                    pii=[PIIEntityEnum.PERSON],
+                    key_name="test_fail_log_key",
+                )
+            )
+        ],
+    )
+    clear_vault_key("test_fail_log_key")
+    ctx = build_op_context(op_config=config_to_dagster_dict_unstructured(config))
+
+    # Patch the key retrieval used inside unstructured_ops to force failure.
+    # The target uses the same package path the op is imported from at the top
+    # of this file (template_code_location.…); the pre-migration path would
+    # target a different (or missing) module.
+    with patch(
+        "template_code_location.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key",
+        side_effect=RuntimeError("Encryption key retrieval failed"),
+    ):
+        with pytest.raises(RuntimeError) as exc_info:
+            # Consume the generator to trigger execution and raise the exception
+            list(anonymize_pseudonymize_unstructured(ctx, text=text))
+
+        msg = str(exc_info.value).lower()
+        assert "key" in msg and "failed" in msg, "Error message should mention key failure"
+
+
+def test_ac4_failed_execution_logs_configuration_used():
+    """AC4: Test that the attempted configuration is available for logging on failure.
+
+    Scrubber initialisation is patched to raise; the test then checks that the
+    op config dict is still usable and that the error message names the failure.
+    """
+    text = "Test data with person John Doe"
+
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[
+            PseudoTechniqueConfig(
+                technique=EncryptConfig(
+                    type="encrypt",
+                    pii=[PIIEntityEnum.PERSON],
+                    key_name="test_config_fail_key",
+                )
+            )
+        ],
+    )
+
+    clear_vault_key("test_config_fail_key")
+
+    op_config_dict = config_to_dagster_dict_unstructured(config)
+    context = build_op_context(op_config=op_config_dict)
+
+    # Mock _initialize_scrubber to fail. The target uses the same package path
+    # the op is imported from at the top of this file (template_code_location.…);
+    # the pre-migration path would target a different (or missing) module.
+    with patch(
+        "template_code_location.field_level_pseudo_anonymisation.unstructured_ops._initialize_scrubber"
+    ) as mock_init_scrubber:
+        mock_init_scrubber.side_effect = Exception("Scrubber module not available")
+
+        # Exception already covers RuntimeError, so a single type is enough.
+        with pytest.raises(Exception) as exc_info:
+            list(anonymize_pseudonymize_unstructured(context, text=text))
+
+        # Verify configuration is still accessible despite failure
+        assert op_config_dict is not None, "Configuration must be accessible for failure audit"
+        assert (
+            "used_function" in op_config_dict
+        ), "Technique configuration should be available for diagnosis"
+
+        # Verify error was raised with proper message
+        error_msg = str(exc_info.value).lower()
+        assert (
+            "pii" in error_msg
+            or "detection" in error_msg
+            or "failed" in error_msg
+            or "scrubber" in error_msg
+            or "module" in error_msg
+        ), "Error should indicate detection/processing failed"
+
+
+def test_ac4_failed_execution_logs_failure_reason():
+    """AC4: Test that the reason for a failure is clearly indicated in the error message.
+
+    Key retrieval is patched to raise a timeout-style ``RuntimeError``; the
+    error surfaced by the op is expected to hint at the key/encryption failure.
+    """
+    text = "User: Alice Smith, Email: alice@example.com"
+
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.en,
+        used_function=[
+            PseudoTechniqueConfig(
+                technique=EncryptConfig(
+                    type="encrypt",
+                    pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
+                    key_name="test_failure_reason_key",
+                )
+            )
+        ],
+    )
+
+    clear_vault_key("test_failure_reason_key")
+
+    # Mock key retrieval function to fail. The target uses the same package path
+    # the op is imported from at the top of this file (template_code_location.…);
+    # the pre-migration path would target a different (or missing) module.
+    with patch(
+        "template_code_location.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
+    ) as mock_get_key:
+        mock_get_key.side_effect = RuntimeError("Vault connection timeout")
+
+        with pytest.raises(RuntimeError) as exc_info:
+            run_unstructured_op(config, text)
+
+        # Verify failure reason is in error message
+        error_msg = str(exc_info.value).lower()
+        assert (
+            "encrypt" in error_msg
+            or "key" in error_msg
+            or "timeout" in error_msg
+            or "failed" in error_msg
+        ), "Error should indicate key retrieval/encryption failure"
+
+
+# ================================================================================================
+# Additional Tests - Edge Cases and Integration
+# ================================================================================================
+
+
+def test_multi_language_support_italian():
+    """Additional test: Italian input is modified by the op and reports detected PII."""
+    italian_text = """
+    Il dottor Marco Rossi lavora presso l'ospedale.
+    Email: marco.rossi@ospedale.it
+    Telefono: +39-06-12345678
+    """
+
+    person_encryption = EncryptConfig(
+        type="encrypt",
+        pii=[PIIEntityEnum.PERSON],
+        key_name="test_italian_key",
+    )
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.it,
+        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
+    )
+
+    clear_vault_key("test_italian_key")
+
+    output, raw_metrics = run_unstructured_op(config, italian_text)
+    parsed = parse_metrics_markdown(raw_metrics)
+
+    # The text must have changed and at least one entity must be counted.
+    assert output != italian_text, "Italian text should be processed"
+    assert parsed["total_pii_detected"] > 0, "PII should be detected in Italian text"
+
+
+def test_special_characters_in_text():
+    """Additional test: Input with accents, emoji and symbols is processed without failing."""
+    special_text = """
+    User: João da Silva 🇧🇷
+    Email: joão@empresa.com.br
+    Message: "Hello, World!" — Testing special chars: €, £, ¥, ©, ®
+    """
+
+    encryption = EncryptConfig(
+        type="encrypt",
+        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
+        key_name="test_special_chars_key",
+    )
+    config = AnonymisePseudonymizeUnstructuredConfig(
+        language=LanguageEnum.pt,
+        used_function=[PseudoTechniqueConfig(technique=encryption)],
+    )
+
+    clear_vault_key("test_special_chars_key")
+
+    # Only the processed text is checked; the metrics output is ignored.
+    output, _ = run_unstructured_op(config, special_text)
+
+    # Reaching this point means no encoding error was raised.
+    assert output is not None, "Special characters should not cause processing failure"
+    assert len(output) > 0, "Result should not be empty"
+
+
+def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config):
+    """Additional test: Two runs on the same input agree on detection and token counts.
+
+    The token values themselves are not compared, only their presence, the
+    detected-PII totals and the number of tokens matching the expected shape.
+    """
+    clear_vault_key("test_person_key")
+
+    first_text, first_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)
+    second_text, second_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)
+
+    # Each run must emit at least one encrypt token.
+    assert "{encrypt:" in first_text, "First run should produce encrypted tokens"
+    assert "{encrypt:" in second_text, "Second run should produce encrypted tokens"
+
+    # Detection totals must agree between the runs.
+    first_metrics = parse_metrics_markdown(first_metrics_md)
+    second_metrics = parse_metrics_markdown(second_metrics_md)
+    assert (
+        first_metrics["total_pii_detected"] == second_metrics["total_pii_detected"]
+    ), "PII detection should be consistent across runs"
+
+    # Count tokens that match the expected shape (Fernet base64 pattern).
+    token_regex = re.compile(r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}")
+    first_tokens = token_regex.findall(first_text)
+    second_tokens = token_regex.findall(second_text)
+
+    assert len(first_tokens) == len(
+        second_tokens
+    ), "Same number of encryption tokens should be generated"