feat(SIMPL-24642): migrate tests from 3 source repos with updated imports
This commit is contained in:
1
tests/field_level_pseudo_anonymisation/__init__.py
Normal file
1
tests/field_level_pseudo_anonymisation/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
444
tests/field_level_pseudo_anonymisation/conftest.py
Normal file
444
tests/field_level_pseudo_anonymisation/conftest.py
Normal file
@@ -0,0 +1,444 @@
|
||||
"""
|
||||
Shared pytest fixtures and helpers for field-level pseudonymisation tests.
|
||||
|
||||
This module provides:
|
||||
- Mock Vault client for testing without real Vault connections
|
||||
- Sample data fixtures
|
||||
- Configuration fixtures for encryption/decryption operations
|
||||
- Helper functions for running ops and managing test Vault storage
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from dagster import build_op_context
|
||||
from cryptography.fernet import Fernet
|
||||
from hvac.exceptions import InvalidPath, Forbidden
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
|
||||
AnonymisePseudonymizeStructuredConfig,
|
||||
DepseudonymizeStructuredConfig,
|
||||
EncryptConfig,
|
||||
DecryptConfig,
|
||||
PseudoTechniqueConfig,
|
||||
DepseudoTechniqueConfig,
|
||||
)
|
||||
from template_code_location.field_level_pseudo_anonymisation.ops import (
|
||||
anonymize_pseudonymize_structured,
|
||||
depseudonymize_structured,
|
||||
)
|
||||
|
||||
|
||||
# -------------------------------- Mock Vault Storage ----------------------------------------
|
||||
|
||||
# In-memory Vault simulation for tests
|
||||
# Backing store for the fake Vault, keyed by "<mount_point>/<path>". Reset by the mock_vault_client
# fixture before each test. Values are presumably Fernet keys as str (get_vault_key calls .encode()).
_test_vault_storage: dict[str, str] = {}
|
||||
_test_vault_access_control: dict[str, bool] = {}  # Full secret path -> access flag; a falsy entry makes mocked reads raise Forbidden
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_vault_client():
    """
    Auto-use fixture that replaces ``hvac.Client`` with an in-memory fake.

    The fake serves the KV v2 calls ``read_secret_version``,
    ``create_or_update_secret`` and ``delete_metadata_and_all_versions`` from the
    module-level ``_test_vault_storage`` dict, and consults
    ``_test_vault_access_control`` to simulate denied reads (AC3).

    Both dicts are emptied before each test.

    Yields:
        MagicMock: the instance returned whenever ``hvac.Client(...)`` is called
        while the patch is active.
    """
    # Empty the shared dicts in place instead of rebinding the module globals, so any
    # reference to them captured elsewhere keeps pointing at the live storage.
    _test_vault_storage.clear()
    _test_vault_access_control.clear()

    def mock_read_secret(path, mount_point):
        """Mock reading secret from Vault with access control"""
        full_path = f"{mount_point}/{path}"

        # Access control is checked first: only an explicit falsy entry denies the read,
        # paths without an entry are readable.
        if not _test_vault_access_control.get(full_path, True):
            raise Forbidden(f"Access denied to secret: {full_path}")

        if full_path not in _test_vault_storage:
            raise InvalidPath(f"Secret not found: {full_path}")
        return {"data": {"data": {"value": _test_vault_storage[full_path]}}}

    def mock_create_or_update_secret(path, mount_point, secret):
        """Mock creating/updating secret in Vault"""
        full_path = f"{mount_point}/{path}"
        _test_vault_storage[full_path] = secret["value"]

    def mock_delete_metadata(path, mount_point):
        """Mock deleting secret from Vault"""
        full_path = f"{mount_point}/{path}"
        # Deleting a secret also drops its access-control entry; missing paths are ignored.
        _test_vault_storage.pop(full_path, None)
        _test_vault_access_control.pop(full_path, None)

    with patch("hvac.Client") as mock_client_class:
        mock_instance = MagicMock()
        kv_v2 = mock_instance.secrets.kv.v2
        kv_v2.read_secret_version.side_effect = mock_read_secret
        kv_v2.create_or_update_secret.side_effect = mock_create_or_update_secret
        kv_v2.delete_metadata_and_all_versions.side_effect = mock_delete_metadata
        mock_client_class.return_value = mock_instance
        yield mock_instance
|
||||
|
||||
|
||||
# -------------------------------- Sample Data Fixtures ----------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def sample_df():
    """
    Provide a five-row DataFrame mixing PII-like and non-PII columns.

    Columns, in order: id, name, email, ssn, age, salary, department.
    """
    names = [
        "Alice Smith",
        "Bob Jones",
        "Charlie Brown",
        "David Wilson",
        "Eva Garcia",
    ]
    emails = [
        "alice@example.com",
        "bob@example.com",
        "charlie@example.com",
        "david@example.com",
        "eva@example.com",
    ]
    ssns = [
        "123-45-6789",
        "234-56-7890",
        "345-67-8901",
        "456-78-9012",
        "567-89-0123",
    ]

    # Dict insertion order fixes the column order of the resulting frame.
    columns = {
        "id": [1, 2, 3, 4, 5],
        "name": names,
        "email": emails,
        "ssn": ssns,
        "age": [25, 30, 35, 40, 45],
        "salary": [50000.0, 60000.0, 70000.0, 80000.0, 90000.0],
        "department": ["HR", "IT", "Finance", "IT", "HR"],
    }
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
# -------------------------------- Configuration Fixtures ----------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def encrypt_config_single_field():
    """
    Build an encryption config that targets only the ``email`` column.

    Uses the key name ``test_restoration_key_single``.
    """
    email_encryption = EncryptConfig(
        type="encrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    step = PseudoTechniqueConfig(technique=email_encryption)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
|
||||
|
||||
|
||||
@pytest.fixture
def decrypt_config_single_field():
    """
    Build a decryption config that targets only the ``email`` column.

    Uses the key name ``test_restoration_key_single``.
    """
    email_decryption = DecryptConfig(
        type="decrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    step = DepseudoTechniqueConfig(technique=email_decryption)
    return DepseudonymizeStructuredConfig(used_function=[step])
|
||||
|
||||
|
||||
@pytest.fixture
def encrypt_config_multiple_fields():
    """
    Build an encryption config covering the ``name``, ``email`` and ``ssn`` columns.

    Uses the key name ``test_restoration_key_multi``.
    """
    multi_encryption = EncryptConfig(
        type="encrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    step = PseudoTechniqueConfig(technique=multi_encryption)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
|
||||
|
||||
|
||||
@pytest.fixture
def decrypt_config_multiple_fields():
    """
    Build a decryption config covering the ``name``, ``email`` and ``ssn`` columns.

    Uses the key name ``test_restoration_key_multi``.
    """
    multi_decryption = DecryptConfig(
        type="decrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    step = DepseudoTechniqueConfig(technique=multi_decryption)
    return DepseudonymizeStructuredConfig(used_function=[step])
|
||||
|
||||
|
||||
@pytest.fixture
def encrypt_config_partial_fields():
    """
    Build an encryption config covering only the ``email`` and ``ssn`` columns.

    Uses the key name ``test_restoration_key_partial``.
    """
    partial_encryption = EncryptConfig(
        type="encrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    step = PseudoTechniqueConfig(technique=partial_encryption)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
|
||||
|
||||
|
||||
@pytest.fixture
def decrypt_config_partial_fields():
    """
    Build a decryption config covering only the ``email`` and ``ssn`` columns.

    Uses the key name ``test_restoration_key_partial``.
    """
    partial_decryption = DecryptConfig(
        type="decrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    step = DepseudoTechniqueConfig(technique=partial_decryption)
    return DepseudonymizeStructuredConfig(used_function=[step])
|
||||
|
||||
|
||||
@pytest.fixture
def authorized_multi_key_scenario():
    """
    Prepare two keys in the simulated Vault: one readable, one denied.

    Both keys are freshly generated Fernet keys. Access to ``unauthorized_key``
    is denied via ``deny_vault_access``. Both entries are removed again once the
    test has finished.

    Yields:
        dict: ``{"authorized": "authorized_key", "unauthorized": "unauthorized_key"}``
    """
    scenario_keys = ("authorized_key", "unauthorized_key")

    # Start from a clean slate for both key names.
    for key_name in scenario_keys:
        clear_vault_key(key_name)

    # Readable key.
    readable_secret = Fernet.generate_key().decode()
    set_vault_key("authorized_key", readable_secret)

    # Key that exists but whose reads are denied.
    denied_secret = Fernet.generate_key().decode()
    set_vault_key("unauthorized_key", denied_secret)
    deny_vault_access("unauthorized_key")

    yield {"authorized": "authorized_key", "unauthorized": "unauthorized_key"}

    # Teardown: remove both keys together with their access-control entries.
    for key_name in scenario_keys:
        clear_vault_key(key_name)
|
||||
|
||||
|
||||
@pytest.fixture
def large_dataset():
    """
    Provide a 10,000-row DataFrame of synthetic records.

    Every column is derived from the row number ``i`` (1..10000). Columns, in
    order: id, email, name, ssn, age, salary, department.
    """
    row_numbers = range(1, 10001)
    department_cycle = ["HR", "IT", "Finance", "Sales"]

    columns = {
        "id": row_numbers,
        "email": [f"user{i}@example.com" for i in row_numbers],
        "name": [f"User {i}" for i in row_numbers],
        "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in row_numbers],
        "age": [20 + (i % 50) for i in row_numbers],
        "salary": [30000.0 + (i * 10) for i in row_numbers],
        "department": [department_cycle[i % 4] for i in row_numbers],
    }
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def vault_test_keys():
    """
    Generate ten Fernet keys once per test session.

    The keys are only returned; this fixture does not write them to the
    simulated Vault storage.

    Returns:
        dict: ``test_key_0`` .. ``test_key_9`` mapped to a Fernet key decoded to str.
    """
    generated = {}
    for i in range(10):
        generated[f"test_key_{i}"] = Fernet.generate_key().decode()
    return generated
|
||||
|
||||
|
||||
@pytest.fixture
def cleanup_test_keys(request):
    """
    Remove test keys from the simulated Vault after the test has run.

    Use with: @pytest.mark.usefixtures("cleanup_test_keys")

    Every storage entry whose full path contains ``"test_"`` is removed, together
    with any access-control entry recorded for the same path. This matches what
    ``clear_vault_key`` does for a single key.

    Args:
        request: pytest request object; accepted for compatibility, not used here.
    """
    yield

    # Snapshot the matching paths first so the dict is not mutated while iterating it.
    stale_paths = [path for path in _test_vault_storage if "test_" in path]
    for path in stale_paths:
        _test_vault_storage.pop(path, None)
        # Also drop a deny/allow flag left behind for the removed key.
        _test_vault_access_control.pop(path, None)
|
||||
|
||||
|
||||
# -------------------------------- Helper Functions ----------------------------------------
|
||||
|
||||
|
||||
def config_to_dagster_dict(config):
    """
    Convert a Pydantic config instance into the dict handed to Dagster as op config.

    For AnonymisePseudonymizeStructuredConfig, each ``technique`` entry dumped as
        {'type': 'encrypt', 'columns': [...], 'key_name': '...'}
    is re-nested under its discriminator value:
        {'encrypt': {'columns': [...], 'key_name': '...'}}

    Any other config type (e.g. DepseudonymizeStructuredConfig) is returned exactly
    as ``model_dump()`` produced it, i.e. flat and still carrying the 'type' field.

    Args:
        config: Pydantic config instance
            (AnonymisePseudonymizeStructuredConfig or
            DepseudonymizeStructuredConfig)

    Returns:
        dict: Dagster-compatible configuration dictionary
    """
    dumped = config.model_dump()

    # Only the anonymise/pseudonymise config needs its techniques re-nested.
    if not isinstance(config, AnonymisePseudonymizeStructuredConfig):
        return dumped
    if "used_function" not in dumped:
        return dumped

    for entry in dumped["used_function"]:
        if "technique" not in entry:
            continue
        flat = entry["technique"]
        if not (isinstance(flat, dict) and "type" in flat):
            continue
        # Pull out the discriminator and nest the remaining fields beneath it.
        discriminator = flat["type"]
        remaining = {field: value for field, value in flat.items() if field != "type"}
        entry["technique"] = {discriminator: remaining}

    return dumped
|
||||
|
||||
|
||||
def run_encrypt_op(config, df):
    """
    Run the anonymize_pseudonymize_structured op directly against a DataFrame.

    Args:
        config: AnonymisePseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the ``.value`` of each of the op's two outputs
    """
    op_config = config_to_dagster_dict(config)
    context = build_op_context(op_config=op_config)
    df_output, metrics_output = anonymize_pseudonymize_structured(context, df=df)
    return df_output.value, metrics_output.value
|
||||
|
||||
|
||||
def run_decrypt_op(config, df):
    """
    Run the depseudonymize_structured op directly against a DataFrame.

    Args:
        config: DepseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the ``.value`` of each of the op's two outputs
    """
    op_config = config_to_dagster_dict(config)
    context = build_op_context(op_config=op_config)
    df_output, metrics_output = depseudonymize_structured(context, df=df)
    return df_output.value, metrics_output.value
|
||||
|
||||
|
||||
def clear_vault_key(key_name: str):
    """
    Remove a key, and any access-control entry for it, from the simulated Vault.

    Missing entries are ignored.

    Args:
        key_name: Name of the key to delete from Vault
    """
    secret_path = f"secret/PseudonymKeys/{key_name}"
    for registry in (_test_vault_storage, _test_vault_access_control):
        registry.pop(secret_path, None)
|
||||
|
||||
|
||||
def set_vault_key(key_name: str, key_value: str):
    """
    Store a key in the simulated Vault, overwriting any existing value.

    Args:
        key_name: Name of the key
        key_value: Value of the key (Fernet key as string)
    """
    secret_path = f"secret/PseudonymKeys/{key_name}"
    _test_vault_storage.update({secret_path: key_value})
|
||||
|
||||
|
||||
def deny_vault_access(key_name: str):
    """
    Mark a key as access-denied for authorization testing (AC3).

    Records ``False`` for the key's path in the access-control dict.

    Args:
        key_name: Name of the key to deny access to
    """
    secret_path = f"secret/PseudonymKeys/{key_name}"
    _test_vault_access_control.update({secret_path: False})
|
||||
|
||||
|
||||
def get_vault_key(key_name: str) -> bytes:
    """
    Read a key straight from the simulated Vault storage.

    The access-control dict is not consulted here.

    Args:
        key_name: Name of the key to retrieve

    Returns:
        bytes: The stored key value, encoded with ``str.encode()``

    Raises:
        InvalidPath: If no value is stored for the key
    """
    secret_path = f"secret/PseudonymKeys/{key_name}"
    if secret_path in _test_vault_storage:
        return _test_vault_storage[secret_path].encode()
    raise InvalidPath(f"Key not found: {key_name}")
|
||||
@@ -0,0 +1,633 @@
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
|
||||
AnonymisePseudonymizeStructuredConfig,
|
||||
DepseudonymizeStructuredConfig,
|
||||
PseudoTechniqueConfig,
|
||||
DepseudoTechniqueConfig,
|
||||
HashConfig,
|
||||
EncryptConfig,
|
||||
RedactConfig,
|
||||
ReplaceConfig,
|
||||
DecryptConfig,
|
||||
)
|
||||
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
|
||||
AnonymisePseudonymizeUnstructuredConfig,
|
||||
DepseudonymizeUnstructuredConfig,
|
||||
PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig,
|
||||
DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig,
|
||||
HashConfig as UnstructuredHashConfig,
|
||||
EncryptConfig as UnstructuredEncryptConfig,
|
||||
RedactConfig as UnstructuredRedactConfig,
|
||||
ReplaceConfig as UnstructuredReplaceConfig,
|
||||
RetainConfig,
|
||||
DecryptConfig as UnstructuredDecryptConfig,
|
||||
)
|
||||
from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum
|
||||
from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum
|
||||
|
||||
|
||||
# ==================== Structured Config Tests ====================
|
||||
|
||||
class TestStructuredConfigValidators:
    """Tests for the validators and helper methods of structured_config.py."""

    def test_ensure_unique_columns_valid_single_technique(self):
        """A single technique targeting a single column is accepted."""
        email_encryption = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1"),
        )
        config = AnonymisePseudonymizeStructuredConfig(used_function=[email_encryption])
        assert config is not None
        assert len(config.used_function) == 1

    def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self):
        """Two techniques that target different columns are accepted."""
        email_encryption = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1"),
        )
        ssn_hashing = PseudoTechniqueConfig(
            technique=HashConfig(columns=["ssn"], algorithm="sha256"),
        )
        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[email_encryption, ssn_hashing],
        )
        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_columns_duplicate_columns_same_technique(self):
        """The same column listed under two techniques (encrypt and hash) is rejected."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(columns=["email"], key_name="key1"),
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(columns=["email"], algorithm="sha256"),
                    ),
                ]
            )
        message = str(exc_info.value)
        assert "Duplicate column" in message
        assert "email" in message

    def test_ensure_unique_columns_multiple_duplicates(self):
        """The error message names every column that is duplicated."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(columns=["email", "phone"], key_name="key1"),
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(columns=["email", "phone"], algorithm="sha256"),
                    ),
                ]
            )
        error_msg = str(exc_info.value)
        for expected_fragment in ("Duplicate column", "email", "phone"):
            assert expected_fragment in error_msg

    def test_collect_column_to_techniques_single_technique(self):
        """_collect_column_to_techniques maps each column to its technique names."""
        two_column_encryption = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email", "phone"], key_name="key1"),
        )
        config = AnonymisePseudonymizeStructuredConfig(used_function=[two_column_encryption])
        expected = {"email": ["encrypt"], "phone": ["encrypt"]}
        assert config._collect_column_to_techniques() == expected

    def test_extract_technique_and_columns_dict_with_type_field(self):
        """A flat technique dict carrying a 'type' field is understood."""
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "columns": ["email", "ssn"],
                "key_name": "test_key",
            }
        }
        config = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = config._extract_technique_and_columns(raw_entry)
        assert technique_type == "encrypt"
        assert columns == ["email", "ssn"]

    def test_extract_technique_and_columns_dict_with_variant_mapping(self):
        """A variant-key mapping such as {'encrypt': {...}} is understood."""
        raw_entry = {
            "technique": {
                "encrypt": {
                    "columns": ["ssn"],
                    "key_name": "test_key",
                }
            }
        }
        config = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = config._extract_technique_and_columns(raw_entry)
        assert technique_type == "encrypt"
        assert columns == ["ssn"]

    def test_extract_technique_and_columns_model_instance(self):
        """A PseudoTechniqueConfig model instance (redact) is understood."""
        pseudo_config = PseudoTechniqueConfig(technique=RedactConfig(columns=["address"]))
        config = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = config._extract_technique_and_columns(pseudo_config)
        assert technique_type == "redact"
        assert columns == ["address"]

    def test_extract_technique_and_columns_empty_dict(self):
        """An empty technique dict yields no technique type and no columns."""
        config = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = config._extract_technique_and_columns({"technique": {}})
        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_none_technique(self):
        """A None technique yields no technique type and no columns."""
        config = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = config._extract_technique_and_columns({"technique": None})
        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_missing_columns_key(self):
        """A technique dict without a 'columns' key yields an empty column list."""
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "key_name": "test_key",
            }
        }
        config = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = config._extract_technique_and_columns(raw_entry)
        assert technique_type == "encrypt"
        assert columns == []

    def test_extract_technique_and_columns_model_without_columns_attr(self):
        """A ReplaceConfig model instance yields 'replace' and the columns it was built with."""
        pseudo_config = PseudoTechniqueConfig(
            technique=ReplaceConfig(columns=["old_value"], new_value="NEW"),
        )
        config = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = config._extract_technique_and_columns(pseudo_config)
        assert technique_type == "replace"
        assert columns == ["old_value"]
|
||||
|
||||
|
||||
class TestStructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeStructuredConfig."""

    def test_depseudonymize_config_normalize_used_function_with_dict(self):
        """A plain-dict entry in used_function becomes a DepseudoTechniqueConfig."""
        raw_entry = {
            "technique": {
                "type": "decrypt",
                "columns": ["email"],
                "key_name": "key1",
            }
        }
        config = DepseudonymizeStructuredConfig(used_function=[raw_entry])
        assert len(config.used_function) == 1
        normalized = config.used_function[0]
        assert isinstance(normalized, DepseudoTechniqueConfig)
        assert normalized.technique.type == "decrypt"

    def test_depseudonymize_config_normalize_used_function_with_model(self):
        """A model instance in used_function is kept as the very same object."""
        depseudo_tech = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1"),
        )
        config = DepseudonymizeStructuredConfig(used_function=[depseudo_tech])
        assert len(config.used_function) == 1
        assert config.used_function[0] is depseudo_tech

    def test_depseudonymize_config_ensure_unique_columns_no_op(self):
        """Two decrypt entries naming the same column are accepted for depseudonymize."""
        first_decrypt = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1"),
        )
        second_decrypt = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key2"),
        )
        # Construction must not raise even though both entries target "email".
        config = DepseudonymizeStructuredConfig(
            used_function=[first_decrypt, second_decrypt],
        )
        assert config is not None
|
||||
|
||||
|
||||
# ==================== Unstructured Config Tests ====================
|
||||
|
||||
class TestUnstructuredConfigValidators:
    """Tests for the validators and helper methods of unstructured_config.py."""

    def test_normalize_used_function_with_dict(self):
        """A plain-dict entry using a variant-key mapping is accepted in used_function."""
        raw_entry = {
            "technique": {
                "encrypt": {
                    "pii": [PIIEntityEnum.EMAIL.value],
                    "key_name": "key1",
                }
            }
        }
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[raw_entry],
        )
        assert len(config.used_function) == 1

    def test_normalize_used_function_with_model(self):
        """A model instance is accepted in used_function."""
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value],
                key_name="key1",
            ),
        )
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[pseudo_tech],
        )
        assert len(config.used_function) == 1

    def test_ensure_unique_pii_valid_different_pii_types(self):
        """Two techniques that target different PII types are accepted."""
        email_encryption = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value],
                key_name="key1",
            ),
        )
        person_hashing = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredHashConfig(
                pii=[PIIEntityEnum.PERSON.value],
                algorithm="sha256",
            ),
        )
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[email_encryption, person_hashing],
        )
        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_pii_duplicate_pii_types(self):
        """The same PII type listed under two techniques is rejected."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeUnstructuredConfig(
                language=LanguageEnum.en,
                used_function=[
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredEncryptConfig(
                            pii=[PIIEntityEnum.EMAIL.value],
                            key_name="key1",
                        ),
                    ),
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredHashConfig(
                            pii=[PIIEntityEnum.EMAIL.value],
                            algorithm="sha256",
                        ),
                    ),
                ],
            )
        message = str(exc_info.value)
        assert "Duplicate PII" in message
        # Only the member name is checked; per the original author the message shows
        # the enum repr (PIIEntityEnum.EMAIL) rather than the enum value.
        assert "EMAIL" in message

    def test_collect_pii_to_techniques_single_technique(self):
        """_collect_pii_to_techniques maps each PII type to its technique names."""
        two_pii_encryption = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value, PIIEntityEnum.PERSON.value],
                key_name="key1",
            ),
        )
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[two_pii_encryption],
        )
        expected = {
            PIIEntityEnum.EMAIL.value: ["encrypt"],
            PIIEntityEnum.PERSON.value: ["encrypt"],
        }
        assert config._collect_pii_to_techniques() == expected

    def test_extract_technique_and_pii_dict_with_type_field(self):
        """A flat technique dict carrying a 'type' field is understood."""
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "pii": [PIIEntityEnum.EMAIL.value],
                "key_name": "test_key",
            }
        }
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = config._extract_technique_and_pii(raw_entry)
        assert technique_type == "encrypt"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_dict_with_variant_mapping(self):
        """A variant-key mapping such as {'hash': {...}} is understood."""
        raw_entry = {
            "technique": {
                "hash": {
                    "pii": [PIIEntityEnum.PERSON.value],
                    "algorithm": "sha256",
                }
            }
        }
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = config._extract_technique_and_pii(raw_entry)
        assert technique_type == "hash"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_dict_fallback_to_columns(self):
        """When 'pii' is absent from the dict, the 'columns' value is returned instead."""
        raw_entry = {
            "technique": {
                "type": "redact",
                "columns": ["fallback_col"],
            }
        }
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = config._extract_technique_and_pii(raw_entry)
        assert technique_type == "redact"
        assert piis == ["fallback_col"]

    def test_extract_technique_and_pii_model_instance(self):
        """A model instance (redact) is understood."""
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredRedactConfig(pii=[PIIEntityEnum.EMAIL.value]),
        )
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = config._extract_technique_and_pii(pseudo_tech)
        assert technique_type == "redact"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_model_with_getattr_fallback(self):
        """A RetainConfig model instance yields 'retain' and the pii list it was built with."""
        # NOTE(review): the technique is constructed with an explicit pii list, so the
        # "fallback to columns" path named in the test title is presumably not exercised
        # here - confirm against _extract_technique_and_pii.
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value]),
        )
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = config._extract_technique_and_pii(pseudo_tech)
        assert technique_type == "retain"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_empty_dict(self):
        """An empty technique dict yields no technique type and no PII types."""
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = config._extract_technique_and_pii({"technique": {}})
        assert technique_type is None
        assert piis == []

    def test_extract_technique_and_pii_missing_pii_key(self):
        """A technique dict without a 'pii' key yields an empty PII list."""
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "key_name": "test_key",
            }
        }
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = config._extract_technique_and_pii(raw_entry)
        assert technique_type == "encrypt"
        assert piis == []
|
||||
|
||||
|
||||
class TestUnstructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeUnstructuredConfig."""

    def test_depseudonymize_unstructured_config_default(self):
        """A config built without arguments has at least one used_function entry."""
        config = DepseudonymizeUnstructuredConfig()
        assert config is not None
        assert len(config.used_function) >= 1

    def test_depseudonymize_unstructured_config_with_custom_function(self):
        """An explicitly supplied decrypt technique is kept, including its key name."""
        custom_decrypt = UnstructuredDepseudoTechniqueConfig(
            technique=UnstructuredDecryptConfig(key_name="custom_key"),
        )
        config = DepseudonymizeUnstructuredConfig(used_function=[custom_decrypt])
        assert len(config.used_function) == 1
        assert config.used_function[0].technique.key_name == "custom_key"
|
||||
|
||||
|
||||
class TestLanguageSupport:
    """Checks for the ``language`` setting of the unstructured config."""

    def test_all_supported_languages(self):
        """Every language listed below can be set and is read back unchanged."""
        languages = (
            LanguageEnum.hr,
            LanguageEnum.da,
            LanguageEnum.nl,
            LanguageEnum.en,
            LanguageEnum.fi,
            LanguageEnum.fr,
            LanguageEnum.de,
            LanguageEnum.el,
            LanguageEnum.it,
            LanguageEnum.lt,
            LanguageEnum.pl,
            LanguageEnum.pt,
            LanguageEnum.ro,
            LanguageEnum.sl,
            LanguageEnum.es,
            LanguageEnum.sv,
        )

        for language in languages:
            built = AnonymisePseudonymizeUnstructuredConfig(language=language)
            assert built.language == language

    def test_default_language_is_english(self):
        """Omitting ``language`` leaves the config set to English."""
        built = AnonymisePseudonymizeUnstructuredConfig()

        assert built.language == LanguageEnum.en
|
||||
|
||||
|
||||
class TestTechniqueConfigDefaults:
    """Checks the values each technique config reports when built without arguments."""

    def test_hash_config_default_algorithm(self):
        """``HashConfig()`` uses algorithm ``sha256`` and type ``hash``."""
        hash_defaults = HashConfig()

        assert hash_defaults.algorithm == "sha256"
        assert hash_defaults.type == "hash"

    def test_encrypt_config_defaults(self):
        """``EncryptConfig()`` uses type ``encrypt`` and key name ``my_key``."""
        encrypt_defaults = EncryptConfig()

        assert encrypt_defaults.type == "encrypt"
        assert encrypt_defaults.key_name == "my_key"

    def test_redact_config_defaults(self):
        """``RedactConfig()`` uses type ``redact``."""
        redact_defaults = RedactConfig()

        assert redact_defaults.type == "redact"

    def test_replace_config_defaults(self):
        """``ReplaceConfig()`` uses type ``replace`` and new value ``REPLACED``."""
        replace_defaults = ReplaceConfig()

        assert replace_defaults.type == "replace"
        assert replace_defaults.new_value == "REPLACED"

    def test_decrypt_config_defaults(self):
        """``DecryptConfig()`` uses type ``decrypt`` and key name ``my_key``."""
        decrypt_defaults = DecryptConfig()

        assert decrypt_defaults.type == "decrypt"
        assert decrypt_defaults.key_name == "my_key"

    def test_unstructured_retain_config_defaults(self):
        """``RetainConfig()`` uses type ``retain``."""
        retain_defaults = RetainConfig()

        assert retain_defaults.type == "retain"
|
||||
|
||||
|
||||
class TestPseudoTechniqueConfigDefaults:
    """Checks that technique wrappers fall back to the hash technique."""

    def test_pseudo_technique_default_to_hash(self):
        """``PseudoTechniqueConfig()`` defaults its technique to hash."""
        wrapper = PseudoTechniqueConfig()

        # The technique may surface either as a model or as a dict in the
        # discriminator layout, so both shapes are accepted.
        if not isinstance(wrapper.technique, dict):
            assert wrapper.technique.type == "hash"
            return

        assert "hash" in wrapper.technique or wrapper.technique.get("type") == "hash"

    def test_unstructured_pseudo_technique_default_to_hash(self):
        """``UnstructuredPseudoTechniqueConfig()`` defaults its technique to hash."""
        wrapper = UnstructuredPseudoTechniqueConfig()

        # Same dual handling as above: model attribute or dict entry.
        if not isinstance(wrapper.technique, dict):
            assert wrapper.technique.type == "hash"
            return

        assert "hash" in wrapper.technique or wrapper.technique.get("type") == "hash"
|
||||
|
||||
|
||||
class TestConfigModelIntegration:
    """Checks that configs accept one entry per technique type side by side."""

    def test_structured_config_with_all_technique_types(self):
        """A structured config holds hash, encrypt, redact and replace together."""
        techniques = [
            HashConfig(columns=["col1"]),
            EncryptConfig(columns=["col2"], key_name="k1"),
            RedactConfig(columns=["col3"]),
            ReplaceConfig(columns=["col4"], new_value="X"),
        ]

        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[PseudoTechniqueConfig(technique=item) for item in techniques]
        )

        assert len(config.used_function) == 4
        seen_types = {entry.technique.type for entry in config.used_function}
        assert seen_types == {"hash", "encrypt", "redact", "replace"}

    def test_unstructured_config_with_all_technique_types(self):
        """An unstructured config holds all five technique types together."""
        techniques = [
            UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value]),
            UnstructuredEncryptConfig(pii=[PIIEntityEnum.PERSON.value], key_name="k1"),
            UnstructuredRedactConfig(pii=[PIIEntityEnum.PHONE_NUMBERS.value]),
            UnstructuredReplaceConfig(pii=[PIIEntityEnum.CREDIT_CARD.value], new_value="X"),
            RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value]),
        ]

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                UnstructuredPseudoTechniqueConfig(technique=item) for item in techniques
            ],
        )

        assert len(config.used_function) == 5
        seen_types = {entry.technique.type for entry in config.used_function}
        assert seen_types == {"hash", "encrypt", "redact", "replace", "retain"}
|
||||
1090
tests/field_level_pseudo_anonymisation/test_decrypt_structured.py
Normal file
1090
tests/field_level_pseudo_anonymisation/test_decrypt_structured.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,288 @@
|
||||
"""
|
||||
Test suite for data restoration (depseudonymisation) of unstructured text.
|
||||
|
||||
## Test Coverage Summary
|
||||
|
||||
### Acceptance Criteria Coverage:
|
||||
- AC1 (Data Restoration with Valid Key): 2 tests
|
||||
- AC2 (Restoration Denial - Missing Key): 1 test
|
||||
- AC3 (Restoration Denial - Unauthorized Access): 1 test
|
||||
- AC4 (Restoration Denial - Invalid Key): 1 test
|
||||
- Additional Coverage: 2 tests (edge cases)
|
||||
|
||||
### Test Pattern:
|
||||
- Each test uses build_op_context with .model_dump() for configuration
|
||||
- Tests validate dual outputs (data, metrics)
|
||||
- Tests verify complete restoration of original text
|
||||
- Tests validate security controls and error handling
|
||||
- Tests use descriptive names mapping to AC scenarios
|
||||
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
from cryptography.fernet import Fernet
|
||||
from dagster import build_op_context
|
||||
|
||||
from src.field_level_pseudo_anonymisation.unstructured_ops import (
|
||||
depseudonymize_unstructured,
|
||||
)
|
||||
from src.field_level_pseudo_anonymisation.config_models.unstructured_config import (
|
||||
DepseudonymizeUnstructuredConfig,
|
||||
DecryptConfig,
|
||||
DepseudoTechniqueConfig,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def fernet_key() -> bytes:
    """Provide a freshly generated Fernet key for the tests in this module."""
    generated = Fernet.generate_key()
    return generated
|
||||
|
||||
|
||||
@pytest.fixture
def encrypted_text_data(fernet_key: bytes) -> dict:
    """
    Build a plaintext sentence together with its encrypted counterpart.

    The name and the email address are each encrypted with ``fernet_key`` and
    embedded in the sentence as ``{encrypt:<token>}`` markers.

    Returns a dict with:
    - original_text: the sentence before encryption
    - encrypted_text: the sentence with both values replaced by markers
    """
    cipher = Fernet(fernet_key)
    # Each token is the Fernet ciphertext decoded to ``str``.
    name_token = cipher.encrypt(b"John Doe").decode()
    email_token = cipher.encrypt(b"john.doe@example.com").decode()

    return {
        "original_text": "My name is John Doe and my email is john.doe@example.com.",
        "encrypted_text": (
            f"My name is {{encrypt:{name_token}}} and my email is {{encrypt:{email_token}}}."
        ),
    }
|
||||
|
||||
|
||||
# ---------------------- AC1: Data Restoration with Valid Key --------------------------------
|
||||
|
||||
|
||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_encrypted_pii_entities_with_valid_key(
    mock_create_get_key, fernet_key: bytes, encrypted_text_data: dict
):
    """AC1: encrypted PII is restored when the key lookup returns the matching key.

    NOTE(review): this module imports and patches under the ``src.`` package
    root, while the unstructured pseudonymisation tests import from
    ``template_code_location.`` -- confirm which root is correct.
    """
    # Arrange: the patched key lookup hands back the key used for encryption.
    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)

    # Act: the op is consumed as a generator, data first and metrics second.
    outputs = depseudonymize_unstructured(
        context, input_text=encrypted_text_data["encrypted_text"]
    )
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert: the text matches the original sentence exactly.
    restored = data_output.value
    assert restored == encrypted_text_data["original_text"], "Original text should be fully restored"

    # Assert: the first output carries the expected name.
    assert data_output.output_name == "data", "Output should be named 'data'"

    # Assert: the metrics count both restored entities.
    restored_count = metrics_output.value["total_depseudo_count"]
    assert restored_count == 2, "Should restore 2 encrypted entities (name and email)"

    # Assert: the key lookup ran exactly once with the configured key name.
    mock_create_get_key.assert_called_once_with("decrypt", "test_key")
|
||||
|
||||
|
||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_multiple_pii_types_with_valid_key(mock_create_get_key, fernet_key: bytes):
    """AC1: a name, an email and a phone number are all restored with a valid key."""
    # Arrange: encrypt three values and embed them as {encrypt:...} markers.
    plain_sentence = "Contact John Doe at john.doe@example.com or call 555-1234."
    cipher = Fernet(fernet_key)
    name_token = cipher.encrypt(b"John Doe").decode()
    email_token = cipher.encrypt(b"john.doe@example.com").decode()
    phone_token = cipher.encrypt(b"555-1234").decode()
    marked_sentence = (
        f"Contact {{encrypt:{name_token}}} at "
        f"{{encrypt:{email_token}}} or call {{encrypt:{phone_token}}}."
    )

    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="multi_pii_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)

    # Act
    outputs = depseudonymize_unstructured(context, input_text=marked_sentence)
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    assert data_output.value == plain_sentence, "All PII types should be restored"
    restored_count = metrics_output.value["total_depseudo_count"]
    assert restored_count == 3, "Should restore 3 encrypted entities (name, email, phone)"
    mock_create_get_key.assert_called_once_with("decrypt", "multi_pii_key")
|
||||
|
||||
|
||||
# ------------------- AC2: Restoration Denial when Key is Missing ----------------------------
|
||||
|
||||
|
||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac2_restoration_denial_when_key_missing(mock_create_get_key, encrypted_text_data: dict):
    """AC2: restoration fails with a ``ValueError`` when the key lookup reports a missing key.

    The patched key lookup raises the "not found in Vault" error; the test
    checks that the error surfaces from the op with its message intact and
    that the lookup was attempted exactly once.
    """
    # Local import: this module does not import ``re`` at file level.
    import re

    # Arrange - the key lookup reports that the key is missing.
    missing_key_message = "Fernet key 'non_existent_key' not found in Vault for decrypt."
    mock_create_get_key.side_effect = ValueError(missing_key_message)
    config = DepseudonymizeUnstructuredConfig(
        used_function=[
            DepseudoTechniqueConfig(
                technique=DecryptConfig(type="decrypt", key_name="non_existent_key")
            )
        ]
    )
    context = build_op_context(op_config=config.model_dump())

    # Act & Assert - ``match`` is a regular expression, so the message is
    # escaped to make its trailing "." match literally instead of any character.
    with pytest.raises(ValueError, match=re.escape(missing_key_message)) as exc_info:
        list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"]))

    # Verify error message is clear and actionable
    assert "not found in Vault" in str(
        exc_info.value
    ), "Error message should indicate key is missing from Vault"

    # Verify the key lookup was attempted exactly once
    mock_create_get_key.assert_called_once_with("decrypt", "non_existent_key")
|
||||
|
||||
|
||||
# ------------- AC3: Restoration Denial when Access is Unauthorized --------------------------
|
||||
|
||||
|
||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac3_restoration_denial_when_unauthorized_access(
    mock_create_get_key, encrypted_text_data: dict
):
    """AC3: restoration fails with a ``ValueError`` when the key lookup denies access."""
    # Arrange: the patched key lookup raises an access-denied error.
    mock_create_get_key.side_effect = ValueError("Access denied to secret: unauthorized_key")
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="unauthorized_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)
    marked_text = encrypted_text_data["encrypted_text"]

    # Act & Assert: draining the generator must surface the error.
    with pytest.raises(ValueError, match="Access denied to secret: unauthorized_key") as exc_info:
        list(depseudonymize_unstructured(context, input_text=marked_text))

    # The message must state that access was denied.
    raised_message = str(exc_info.value)
    assert "Access denied" in raised_message, "Error message should clearly indicate access was denied"

    # The key lookup was attempted exactly once with the configured key name.
    mock_create_get_key.assert_called_once_with("decrypt", "unauthorized_key")
|
||||
|
||||
|
||||
# ------------------- AC4: Restoration Denial when Key is Invalid ----------------------------
|
||||
|
||||
|
||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac4_restoration_denial_when_key_invalid(mock_create_get_key, encrypted_text_data: dict):
    """AC4: restoration fails when the returned key does not match the encrypted data."""
    # Arrange: hand back a newly generated key instead of the one used to encrypt.
    mismatched_key = Fernet.generate_key()
    mock_create_get_key.return_value = mismatched_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="wrong_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)
    marked_text = encrypted_text_data["encrypted_text"]

    # Act & Assert: draining the generator must surface the decryption failure.
    with pytest.raises(ValueError, match="Invalid Fernet token") as exc_info:
        list(depseudonymize_unstructured(context, input_text=marked_text))

    # The message must point at the token/key mismatch.
    raised_message = str(exc_info.value)
    assert (
        "Invalid Fernet token" in raised_message
    ), "Error message should indicate the key is invalid for this data"

    # The key was fetched once, i.e. decryption was actually attempted.
    mock_create_get_key.assert_called_once_with("decrypt", "wrong_key")
|
||||
|
||||
|
||||
# -------------------------------- Additional Edge Cases ----------------------------------------
|
||||
|
||||
|
||||
def test_depseudonymize_unstructured_no_decrypt_config():
    """Edge case: with an empty technique list the text passes through untouched."""
    # Arrange: the text contains a marker, but no technique is configured.
    untouched_text = "This text has no {encrypt:values} to decrypt."
    op_config = DepseudonymizeUnstructuredConfig(used_function=[]).model_dump()
    context = build_op_context(op_config=op_config)

    # Act
    outputs = depseudonymize_unstructured(context, input_text=untouched_text)
    result_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    returned_text = result_output.value
    assert (
        returned_text == untouched_text
    ), "Text should remain unchanged when no decryption is configured"
    decrypt_count = metrics_output.value["total_depseudo_count"]
    assert decrypt_count == 0, "Should report zero decryptions performed"
|
||||
|
||||
|
||||
def test_depseudonymize_unstructured_empty_text():
    """Edge case: an empty input string comes back empty with a zero decryption count."""
    # Arrange
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)
    key_lookup_target = (
        "src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
    )

    # Act: the op is a generator, so both outputs are pulled while the patch is active.
    with patch(key_lookup_target) as mock_key:
        mock_key.return_value = Fernet.generate_key()
        outputs = depseudonymize_unstructured(context, input_text="")
        result_output = next(outputs)
        metrics_output = next(outputs)

    # Assert
    assert result_output.value == "", "Empty text should remain empty"
    decrypt_count = metrics_output.value["total_depseudo_count"]
    assert decrypt_count == 0, "Should report zero decryptions for empty text"
|
||||
1119
tests/field_level_pseudo_anonymisation/test_encrypt_structured.py
Normal file
1119
tests/field_level_pseudo_anonymisation/test_encrypt_structured.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,853 @@
|
||||
"""
|
||||
Test suite for field-level pseudonymisation operations on unstructured data.
|
||||
|
||||
This test suite validates the pseudonymisation of unstructured text with PII detection,
|
||||
covering the following Acceptance Criteria:
|
||||
|
||||
## Test Coverage Summary
|
||||
|
||||
### Acceptance Criteria Coverage:
|
||||
- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests
|
||||
- AC2 (Invalid Execution Handling): 5 tests
|
||||
- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests
|
||||
- AC4 (Execution Audit & Logging - Negative Scenario): 4 tests
|
||||
- Additional Coverage: 3 tests
|
||||
|
||||
### Test Pattern:
|
||||
- Each test uses build_op_context with config_to_dagster_dict_unstructured for configuration
|
||||
- Tests validate dual outputs (data, metrics)
|
||||
- Vault access is mocked for isolation
|
||||
- Tests validate Scrubadub automatic PII detection
|
||||
- Tests ensure placeholder replacement for unconfigured PII
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import re
|
||||
from dagster import build_op_context
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
|
||||
AnonymisePseudonymizeUnstructuredConfig,
|
||||
EncryptConfig,
|
||||
RetainConfig,
|
||||
PseudoTechniqueConfig,
|
||||
)
|
||||
from template_code_location.field_level_pseudo_anonymisation.config_models import PIIEntityEnum, LanguageEnum
|
||||
from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import (
|
||||
anonymize_pseudonymize_unstructured,
|
||||
)
|
||||
|
||||
from .conftest import clear_vault_key
|
||||
|
||||
|
||||
def config_to_dagster_dict_unstructured(config):
    """Translate an unstructured config object into the dict used as op config.

    Args:
        config: Object exposing ``language.value`` and ``used_function``; each
            entry of ``used_function`` has a ``technique`` with ``type``,
            ``model_dump()`` and, when present in the dump, ``pii``.

    Returns:
        dict: ``{"language": ..., "used_function": [...]}`` where every entry
        is ``{"technique": {<type>: <dumped fields without "type">}}`` and any
        ``pii`` list holds the members' ``name`` values.
    """
    language = config.language.value
    entries = []

    for wrapper in config.used_function:
        technique = wrapper.technique
        selector = technique.type
        fields = technique.model_dump()

        # PII members are emitted by name rather than in their dumped form.
        if "pii" in fields:
            fields["pii"] = [entity.name for entity in technique.pii]

        # The type becomes the selector key, so it is dropped from the payload.
        payload = {key: value for key, value in fields.items() if key != "type"}
        entries.append({"technique": {selector: payload}})

    return {"language": language, "used_function": entries}
|
||||
|
||||
|
||||
def run_unstructured_op(config, text):
    """
    Run the unstructured pseudonymisation op for a config and an input text.

    The config is converted with ``config_to_dagster_dict_unstructured`` and
    passed to ``build_op_context`` before the op is invoked.

    Returns:
        tuple: the ``.value`` of the op's first and second outputs
        (result text, metrics markdown)
    """
    op_config = config_to_dagster_dict_unstructured(config)
    context = build_op_context(op_config=op_config)

    outputs = anonymize_pseudonymize_unstructured(context, text=text)
    text_output, metrics_output = outputs

    # Callers work with the payloads, not the Output wrappers.
    return text_output.value, metrics_output.value
|
||||
|
||||
|
||||
def parse_metrics_markdown(metrics_md: str) -> dict:
    """
    Parse markdown metrics into a structured dict for easier testing.

    Sections that are absent from the markdown leave their default value in
    the result. Table rows and technique lines are recognised whether or not
    the markdown ends with a trailing newline.

    Args:
        metrics_md: Markdown metrics string from op output

    Returns:
        dict with keys: total_pii_detected, pii_by_type, techniques_applied, language

    Raises:
        ValueError: if a two-cell row of the "PII by Type" table has a count
            that is not an integer.
    """
    result = {
        "total_pii_detected": 0,
        "pii_by_type": {},
        "techniques_applied": {},
        "language": "",
    }

    # Extract total PII detected
    total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md)
    if total_match:
        result["total_pii_detected"] = int(total_match.group(1))

    # Extract language
    lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md)
    if lang_match:
        result["language"] = lang_match.group(1)

    # Extract PII by type from table. Each data row may end with a newline or
    # with the end of the input, so the last row is kept even when the
    # markdown has no trailing newline.
    pii_table_section = re.search(
        r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+(?:\n|$))+)",
        metrics_md,
    )
    if pii_table_section:
        for line in pii_table_section.group(1).strip().split("\n"):
            # Empty cells (including the ones outside the outer pipes) are dropped.
            parts = [p.strip() for p in line.split("|") if p.strip()]
            if len(parts) == 2:
                entity_type, count = parts
                result["pii_by_type"][entity_type] = int(count)

    # Extract techniques applied; the same end-of-input allowance applies.
    techniques_section = re.search(
        r"### Techniques Applied\n((?:- \*\*[^\n]+(?:\n|$))+)", metrics_md
    )
    if techniques_section:
        for line in techniques_section.group(1).strip().split("\n"):
            tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", line)
            if tech_match:
                pii_type, technique = tech_match.groups()
                result["techniques_applied"][pii_type] = technique

    return result
|
||||
|
||||
|
||||
# -------------------------------- Fixtures ----------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def sample_text_en():
    """Provide an English paragraph containing a name, email, phone number, address and SSN."""
    paragraph = """
John Smith works at Acme Corporation. His email is john.smith@example.com
and his phone number is +1-555-123-4567. He lives in New York City at
123 Main Street, Apartment 4B. His SSN is 123-45-6789.
"""
    return paragraph
|
||||
|
||||
|
||||
@pytest.fixture
def sample_text_multi_person():
    """Provide a short paragraph that mentions five different person names."""
    paragraph = """
The meeting included Alice Johnson, Bob Williams, and Charlie Brown.
They discussed the project with Maria Garcia and David Wilson.
"""
    return paragraph
|
||||
|
||||
|
||||
@pytest.fixture
def sample_text_mixed_pii():
    """Provide a contact card with a name, email, phone number and URL for the AC1 tests."""
    contact_card = """
Contact Information:
Name: Dr. Emily Watson
Email: emily.watson@hospital.com
Phone: +44-20-7946-0958
Website: https://patient-portal.hospital.com/records
"""
    return contact_card
|
||||
|
||||
|
||||
@pytest.fixture
def encrypt_person_config():
    """Provide an English config that encrypts PERSON entities with ``test_person_key``."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_key",
    )
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
|
||||
|
||||
|
||||
@pytest.fixture
def retain_person_config():
    """Provide an English config whose only technique retains PERSON entities."""
    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_retention)],
    )
|
||||
|
||||
|
||||
@pytest.fixture
def mixed_technique_config():
    """Provide an English config that encrypts PERSON and EMAIL and retains PHONE_NUMBERS."""
    encrypt_identity = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_mixed_key",
    )
    retain_phone = RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS])

    # Order matches the original: encryption entry first, retention second.
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(technique=encrypt_identity),
            PseudoTechniqueConfig(technique=retain_phone),
        ],
    )
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# AC1: Pseudonymisation and Retention Are Applied Correctly
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config):
    """AC1: a PERSON entity configured for encryption no longer appears in plaintext."""
    clear_vault_key("test_person_key")

    outcome = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii)
    result_text, metrics_md = outcome
    metrics = parse_metrics_markdown(metrics_md)

    # The name is gone from the output ...
    assert "Emily Watson" not in result_text, "Configured PERSON PII should be encrypted"

    # ... and an encryption marker is present.
    assert "{encrypt:" in result_text, "Encrypted token should be present in result"

    # The metrics report detected PII, including the PERSON type.
    detected_total = metrics["total_pii_detected"]
    assert detected_total > 0, "System should detect PII entities"
    detected_types = metrics["pii_by_type"]
    assert "PERSON" in detected_types, "PERSON type should be in detected PII"

    # Text outside the PII spans is left in place.
    assert "Contact Information:" in result_text, "Non-PII text structure should be preserved"
|
||||
|
||||
|
||||
def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person):
    """AC1: PERSON entities configured for retention stay in plaintext."""
    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    retain_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_retention)],
    )

    outcome = run_unstructured_op(retain_config, sample_text_multi_person)
    result_text, metrics_md = outcome
    metrics = parse_metrics_markdown(metrics_md)

    # Two of the names are checked for presence in the output.
    assert "Alice Johnson" in result_text, "Retained PERSON PII should remain unchanged"
    assert "Bob Williams" in result_text, "Retained PERSON PII should remain unchanged"

    # The metrics record "retain" as the technique for PERSON.
    person_technique = metrics["techniques_applied"].get("PERSON", "")
    assert (
        "retain" in person_technique.lower()
    ), "Retain technique should be recorded for PERSON type"
|
||||
|
||||
|
||||
def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii):
    """AC1: PII types without a configured technique are swapped for placeholders."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_only_key",
    )
    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )

    clear_vault_key("test_person_only_key")

    # Only the text output is inspected here; the metrics are not needed.
    result_text, _ = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)

    # The configured PERSON entity is no longer readable.
    assert "Emily Watson" not in result_text, "Configured PERSON should be encrypted"

    # Placeholder delimiters appear in the output.
    has_opening = "{{" in result_text
    has_closing = "}}" in result_text
    assert has_opening and has_closing, "Unconfigured PII should be replaced with placeholders"

    # The original email value is gone.
    assert (
        "emily.watson@hospital.com" not in result_text
    ), "Unconfigured EMAIL should be replaced with placeholder"

    # At least one placeholder names its entity type.
    assert (
        "{{EMAIL}}" in result_text or "{{URL}}" in result_text
    ), "Placeholders should indicate entity type"
|
||||
|
||||
|
||||
def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config):
    """AC1: encryption and retention are each applied to their configured PII types."""
    clear_vault_key("test_mixed_key")

    outcome = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii)
    result_text, metrics_md = outcome
    metrics = parse_metrics_markdown(metrics_md)

    # PERSON and EMAIL are configured for encryption, so neither stays readable.
    assert "Emily Watson" not in result_text, "Configured PERSON should be encrypted"
    assert "emily.watson@hospital.com" not in result_text, "Configured EMAIL should be encrypted"

    # PHONE_NUMBERS is configured for retention, so the number is still there.
    assert "+44-20-7946-0958" in result_text, "Configured PHONE_NUMBERS should be retained"

    # The metrics name the technique recorded for each type.
    applied = metrics["techniques_applied"]
    person_technique = applied.get("PERSON", "").lower()
    assert "encrypt" in person_technique, "Encrypt technique should be applied to PERSON"
    email_technique = applied.get("EMAIL", "").lower()
    assert "encrypt" in email_technique, "Encrypt technique should be applied to EMAIL"
    phone_technique = applied.get("PHONE_NUMBERS", "").lower()
    assert "retain" in phone_technique, "Retain technique should be applied to PHONE_NUMBERS"
|
||||
|
||||
|
||||
def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config):
    """AC1: All instances of a configured PII type are processed and counted."""
    clear_vault_key("test_person_key")

    processed, raw_metrics = run_unstructured_op(encrypt_person_config, sample_text_multi_person)
    metrics = parse_metrics_markdown(raw_metrics)

    # None of the listed names may remain readable in the output.
    expected_people = (
        "Alice Johnson",
        "Bob Williams",
        "Charlie Brown",
        "Maria Garcia",
        "David Wilson",
    )
    for person in expected_people:
        assert person not in processed, f"All PERSON instances should be encrypted: {person}"

    # The PERSON count in the metrics must cover at least every listed name.
    detected_people = metrics["pii_by_type"].get("PERSON", 0)
    minimum_expected = len(expected_people)
    assert (
        detected_people >= minimum_expected
    ), f"Should detect at least {minimum_expected} PERSON entities"
|
||||
|
||||
|
||||
def test_ac1_empty_text_returns_empty(encrypt_person_config):
    """AC1: An empty text input is rejected with a ValueError whose message mentions "empty"."""
    clear_vault_key("test_person_key")

    with pytest.raises(ValueError) as raised:
        run_unstructured_op(encrypt_person_config, "")

    message = str(raised.value).lower()
    assert "empty" in message, "Error should indicate empty input"
|
||||
|
||||
|
||||
def test_ac1_text_without_pii_remains_unchanged():
    """AC1: Text without PII is returned unchanged and the metrics report zero detections."""
    plain_text = (
        "\n"
        "The weather today is sunny with a high of 25 degrees Celsius.\n"
        "The conference starts at 9:00 AM in Room 301.\n"
    )

    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_no_pii_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )

    clear_vault_key("test_no_pii_key")

    processed, raw_metrics = run_unstructured_op(op_config, plain_text)
    metrics = parse_metrics_markdown(raw_metrics)

    # Surrounding whitespace is ignored when comparing input and output.
    assert processed.strip() == plain_text.strip(), "Text without PII should remain unchanged"
    assert metrics["total_pii_detected"] == 0, "No PII should be detected"
|
||||
|
||||
|
||||
def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii):
    """AC1: Unconfigured PII is replaced by ``{{TYPE}}``-style placeholders naming the entity."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_placeholder_key",
    )
    person_only_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )

    clear_vault_key("test_placeholder_key")

    processed, raw_metrics = run_unstructured_op(person_only_config, sample_text_mixed_pii)
    metrics = parse_metrics_markdown(raw_metrics)

    # Placeholders are upper-case entity names wrapped in double braces
    # (the {{TYPE}} format attributed to scrubadub by the original test).
    placeholder_re = re.compile(r"\{\{[A-Z_]+\}\}")
    found_placeholders = placeholder_re.findall(processed)

    assert (
        len(found_placeholders) > 0
    ), "Result should contain entity-type placeholders for unconfigured PII"

    # The metrics must list at least one detected PII type.
    assert len(metrics["pii_by_type"]) > 0, "Metrics should list detected PII types"
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# AC2: Invalid Execution Handling
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_ac2_graceful_abort_on_scrubadub_failure():
    """AC2: A failure in the PII detection engine (Scrubadub) surfaces as a RuntimeError."""
    sample = "Test user John Smith with email john@example.com"

    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_abort_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )

    clear_vault_key("test_abort_key")

    # Stand-in scrubber whose filth iteration always blows up.
    failing_scrubber = MagicMock()
    failing_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error")

    # NOTE(review): the patch target uses the bare ``field_level_pseudo_anonymisation``
    # package path; confirm it still resolves after the migration to
    # ``template_code_location.field_level_pseudo_anonymisation``.
    with patch(
        "field_level_pseudo_anonymisation.unstructured_ops.scrubadub.Scrubber",
        return_value=failing_scrubber,
    ):
        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(op_config, sample)

    # Any one of these keywords is accepted as evidence of a detection failure.
    error_msg = str(raised.value).lower()
    accepted_keywords = ("pii", "detection", "scrubadub", "failed")
    assert any(
        keyword in error_msg for keyword in accepted_keywords
    ), "Error message should indicate PII detection failure"
|
||||
|
||||
|
||||
def test_ac2_graceful_abort_on_encryption_failure(sample_text_en):
    """AC2: A failing encryption technique aborts the run with a RuntimeError."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_encrypt_fail_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )

    clear_vault_key("test_encrypt_fail_key")

    # NOTE(review): the patch target uses the bare ``field_level_pseudo_anonymisation``
    # package path; confirm it still resolves after the migration to
    # ``template_code_location.field_level_pseudo_anonymisation``.
    encrypt_target = (
        "field_level_pseudo_anonymisation.techniques"
        ".anonymisation_pseudonymisation_techniques.encrypt"
    )

    # Replace the encrypt function with a mock that raises on every call.
    with patch(encrypt_target, side_effect=Exception("Encryption algorithm failure")):
        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(op_config, sample_text_en)

    error_msg = str(raised.value).lower()
    accepted_keywords = ("encrypt", "failed", "technique")
    assert any(
        keyword in error_msg for keyword in accepted_keywords
    ), "Error message should indicate encryption failure"
|
||||
|
||||
|
||||
def test_ac2_null_text_input_raises_error(encrypt_person_config):
    """AC2: Passing ``None`` as the text input is rejected with an error."""
    # Imported locally, as in the original test.
    from dagster import DagsterTypeCheckDidNotPass

    clear_vault_key("test_person_key")

    # Dagster's type check may reject the input before the op body runs,
    # so its exception is accepted alongside ValueError and TypeError.
    acceptable_errors = (ValueError, DagsterTypeCheckDidNotPass, TypeError)
    with pytest.raises(acceptable_errors):
        run_unstructured_op(encrypt_person_config, None)
|
||||
|
||||
|
||||
def test_ac2_invalid_language_configuration():
    """AC2: Building a config with an unsupported language raises a validation error."""
    expected_errors = (ValueError, TypeError)

    # The whole construction stays inside the ``raises`` block; the failure is
    # expected at config-creation time because of the invalid ``language`` value.
    with pytest.raises(expected_errors):
        AnonymisePseudonymizeUnstructuredConfig(
            language="invalid_lang",
            used_function=[
                PseudoTechniqueConfig(
                    technique=EncryptConfig(
                        type="encrypt",
                        pii=[PIIEntityEnum.PERSON],
                        key_name="test_key",
                    )
                )
            ],
        )
|
||||
|
||||
|
||||
def test_ac2_very_large_text_processing():
    """AC2: A large input with repeated PII is processed to a non-empty result with detections."""
    # One PII-bearing line repeated 1000 times.
    repeated_line = "\nJohn Smith works at company. Email: john.smith@example.com.\n"
    large_text = repeated_line * 1000

    person_and_email_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_large_text_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_and_email_encryption)],
    )

    clear_vault_key("test_large_text_key")

    processed, raw_metrics = run_unstructured_op(op_config, large_text)
    metrics = parse_metrics_markdown(raw_metrics)

    # The run must finish and produce both output text and detections.
    assert processed is not None, "Large text should be processed successfully"
    assert len(processed) > 0, "Result should not be empty"
    assert metrics["total_pii_detected"] > 0, "PII should be detected in large text"
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# AC3: Execution Audit & Logging - Positive Scenario
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config):
    """AC3: A successful run has a run ID on its context and returns both outputs."""
    clear_vault_key("test_person_key")

    op_context = build_op_context(
        op_config=config_to_dagster_dict_unstructured(encrypt_person_config)
    )

    # Read the run identifier before invoking the op.
    captured_run_id = op_context.run_id

    result_text, metrics = anonymize_pseudonymize_unstructured(op_context, text=sample_text_en)

    # A run identifier must exist so the execution can be audited.
    assert captured_run_id is not None, "Run ID must be available for audit logging"

    # Both outputs must be present so that Dagster can log them.
    assert result_text is not None, "Result text should be available for logging"
    assert metrics is not None, "Metrics should be available for logging"
|
||||
|
||||
|
||||
def test_ac3_successful_execution_logs_configuration_parameters(
    sample_text_en, mixed_technique_config
):
    """AC3: After a successful run the configuration and applied techniques are inspectable."""
    clear_vault_key("test_mixed_key")

    dagster_config = config_to_dagster_dict_unstructured(mixed_technique_config)
    op_context = build_op_context(op_config=dagster_config)

    result_text, metrics = anonymize_pseudonymize_unstructured(op_context, text=sample_text_en)

    # The serialised configuration must expose both configured techniques.
    assert "used_function" in dagster_config, "Configuration must be accessible for logging"
    configured_functions = dagster_config["used_function"]
    assert len(configured_functions) == 2, "Multiple techniques should be captured"

    # Each technique is matched by name in its string representation.
    technique_reprs = [str(entry["technique"]) for entry in configured_functions]
    assert any(
        "encrypt" in text for text in technique_reprs
    ), "Encrypt technique should be in configuration"
    assert any(
        "retain" in text for text in technique_reprs
    ), "Retain technique should be in configuration"

    # The metrics output carries a markdown string in ``.value``.
    metrics_markdown = metrics.value
    assert (
        "Techniques Applied" in metrics_markdown
    ), "Applied techniques should be in metrics for logging"
|
||||
|
||||
|
||||
def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config):
    """AC3: Neither the metrics nor the serialised configuration contain raw PII values."""
    clear_vault_key("test_person_key")

    dagster_config = config_to_dagster_dict_unstructured(encrypt_person_config)
    op_context = build_op_context(op_config=dagster_config)

    result_text, metrics = anonymize_pseudonymize_unstructured(
        op_context, text=sample_text_mixed_pii
    )

    raw_pii_values = ("Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958")

    # First pass: the metrics markdown must be free of raw PII.
    metrics_markdown = metrics.value
    for raw_value in raw_pii_values:
        assert (
            raw_value not in metrics_markdown
        ), f"Raw PII value should not appear in metrics: {raw_value}"

    # Second pass: so must the stringified op configuration.
    config_repr = str(dagster_config)
    for raw_value in raw_pii_values:
        assert (
            raw_value not in config_repr
        ), f"Raw PII value should not appear in configuration logs: {raw_value}"
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# AC4: Execution Audit & Logging - Negative Scenario
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_ac4_failed_execution_logs_error_details():
    """AC4: A key-retrieval failure surfaces as a RuntimeError mentioning the key failure."""
    sample = "Test user John Smith"

    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_fail_log_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_fail_log_key")
    op_context = build_op_context(op_config=config_to_dagster_dict_unstructured(op_config))

    # NOTE(review): the patch target uses the bare ``field_level_pseudo_anonymisation``
    # package path; confirm it still resolves after the migration to
    # ``template_code_location.field_level_pseudo_anonymisation``.
    key_lookup_patch = patch(
        "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key",
        side_effect=RuntimeError("Encryption key retrieval failed"),
    )

    with key_lookup_patch, pytest.raises(RuntimeError) as raised:
        # Wrapping in list() consumes the op's result so the exception is triggered.
        list(anonymize_pseudonymize_unstructured(op_context, text=sample))

    error_msg = str(raised.value).lower()
    assert "key" in error_msg and "failed" in error_msg, "Error message should mention key failure"
|
||||
|
||||
|
||||
def test_ac4_failed_execution_logs_configuration_used():
    """AC4: When the run fails, the attempted configuration is still available for diagnosis."""
    sample = "Test data with person John Doe"

    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_config_fail_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )

    clear_vault_key("test_config_fail_key")

    dagster_config = config_to_dagster_dict_unstructured(op_config)
    op_context = build_op_context(op_config=dagster_config)

    # NOTE(review): the patch target uses the bare ``field_level_pseudo_anonymisation``
    # package path; confirm it still resolves after the migration to
    # ``template_code_location.field_level_pseudo_anonymisation``.
    scrubber_init_patch = patch(
        "field_level_pseudo_anonymisation.unstructured_ops._initialize_scrubber",
        side_effect=Exception("Scrubber module not available"),
    )

    # ``Exception`` already covers RuntimeError, so a single class accepts both
    # a wrapped and an unwrapped failure.
    with scrubber_init_patch, pytest.raises(Exception) as raised:
        list(anonymize_pseudonymize_unstructured(op_context, text=sample))

    # The configuration dict must survive the failed run.
    assert dagster_config is not None, "Configuration must be accessible for failure audit"
    assert (
        "used_function" in dagster_config
    ), "Technique configuration should be available for diagnosis"

    # Any one of these keywords is accepted as evidence of a detection failure.
    error_msg = str(raised.value).lower()
    accepted_keywords = ("pii", "detection", "failed", "scrubber", "module")
    assert any(
        keyword in error_msg for keyword in accepted_keywords
    ), "Error should indicate detection/processing failed"
|
||||
|
||||
|
||||
def test_ac4_failed_execution_logs_failure_reason():
    """AC4: The error raised on a key-retrieval failure names the reason for the failure."""
    sample = "User: Alice Smith, Email: alice@example.com"

    person_and_email_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_failure_reason_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_and_email_encryption)],
    )

    clear_vault_key("test_failure_reason_key")

    # NOTE(review): the patch target uses the bare ``field_level_pseudo_anonymisation``
    # package path; confirm it still resolves after the migration to
    # ``template_code_location.field_level_pseudo_anonymisation``.
    key_lookup_patch = patch(
        "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key",
        side_effect=RuntimeError("Vault connection timeout"),
    )

    with key_lookup_patch, pytest.raises(RuntimeError) as raised:
        run_unstructured_op(op_config, sample)

    # Any one of these keywords is accepted as a stated failure reason.
    error_msg = str(raised.value).lower()
    accepted_keywords = ("encrypt", "key", "timeout", "failed")
    assert any(
        keyword in error_msg for keyword in accepted_keywords
    ), "Error should indicate key retrieval/encryption failure"
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# Additional Tests - Edge Cases and Integration
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def test_multi_language_support_italian():
    """Additional test: Italian text is altered by processing and yields PII detections."""
    italian_text = (
        "\n"
        "Il dottor Marco Rossi lavora presso l'ospedale.\n"
        "Email: marco.rossi@ospedale.it\n"
        "Telefono: +39-06-12345678\n"
    )

    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_italian_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.it,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )

    clear_vault_key("test_italian_key")

    processed, raw_metrics = run_unstructured_op(op_config, italian_text)
    metrics = parse_metrics_markdown(raw_metrics)

    # The output must differ from the input and at least one entity must be detected.
    assert processed != italian_text, "Italian text should be processed"
    assert metrics["total_pii_detected"] > 0, "PII should be detected in Italian text"
|
||||
|
||||
|
||||
def test_special_characters_in_text():
    """Additional test: Text with special Unicode characters is processed to a non-empty result."""
    special_text = (
        "\n"
        "User: João da Silva 🇧🇷\n"
        "Email: joão@empresa.com.br\n"
        'Message: "Hello, World!" — Testing special chars: €, £, ¥, ©, ®\n'
    )

    person_and_email_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_special_chars_key",
    )
    op_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.pt,
        used_function=[PseudoTechniqueConfig(technique=person_and_email_encryption)],
    )

    clear_vault_key("test_special_chars_key")

    # Only the text output is checked here; the metrics are not inspected.
    processed, _raw_metrics = run_unstructured_op(op_config, special_text)

    assert processed is not None, "Special characters should not cause processing failure"
    assert len(processed) > 0, "Result should not be empty"
|
||||
|
||||
|
||||
def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config):
    """Additional test: Two runs on the same input agree on token count and PII count."""
    clear_vault_key("test_person_key")

    first_text, first_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)
    second_text, second_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)

    # Each run must emit at least one encryption marker.
    assert "{encrypt:" in first_text, "First run should produce encrypted tokens"
    assert "{encrypt:" in second_text, "Second run should produce encrypted tokens"

    # The number of detected PII items must match between runs.
    first_total = parse_metrics_markdown(first_metrics_md)["total_pii_detected"]
    second_total = parse_metrics_markdown(second_metrics_md)["total_pii_detected"]
    assert first_total == second_total, "PII detection should be consistent across runs"

    # Count tokens matching the expected shape (Fernet base64 pattern per the original test);
    # only the counts are compared, not the token values.
    token_re = re.compile(r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}")
    first_tokens = token_re.findall(first_text)
    second_tokens = token_re.findall(second_text)

    assert len(first_tokens) == len(
        second_tokens
    ), "Same number of encryption tokens should be generated"
|
||||
58
tests/field_level_pseudo_anonymisation/test_jobs.py
Normal file
58
tests/field_level_pseudo_anonymisation/test_jobs.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from template_code_location.field_level_pseudo_anonymisation.jobs import (
|
||||
anonymize_pseudonymize_structured_job,
|
||||
anonymize_pseudonymize_structured_job_s3,
|
||||
depseudonymize_structured_job,
|
||||
depseudonymize_structured_job_s3,
|
||||
anonymize_pseudonymize_unstructured_job_s3,
|
||||
anonymize_pseudonymize_unstructured_job,
|
||||
depseudonymize_unstructured_job_s3,
|
||||
depseudonymize_unstructured_job
|
||||
)
|
||||
|
||||
|
||||
def test_anonymize_pseudonymize_structured_job_is_callable():
    """anonymize_pseudonymize_structured_job is callable and exposes ``execute_in_process``."""
    job = anonymize_pseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
|
||||
|
||||
def test_anonymize_pseudonymize_structured_job_s3_is_callable():
    """anonymize_pseudonymize_structured_job_s3 is callable and exposes ``execute_in_process``."""
    job = anonymize_pseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
|
||||
|
||||
def test_depseudonymize_structured_job_is_callable():
    """depseudonymize_structured_job is callable and exposes ``execute_in_process``."""
    job = depseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
|
||||
|
||||
def test_depseudonymize_structured_job_s3_is_callable():
    """depseudonymize_structured_job_s3 is callable and exposes ``execute_in_process``."""
    job = depseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
|
||||
|
||||
def test_anonymize_pseudonymize_unstructured_job_is_callable():
    """anonymize_pseudonymize_unstructured_job is callable and exposes ``execute_in_process``."""
    job = anonymize_pseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
|
||||
|
||||
def test_anonymize_pseudonymize_unstructured_job_s3_is_callable():
    """anonymize_pseudonymize_unstructured_job_s3 is callable and exposes ``execute_in_process``."""
    job = anonymize_pseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
|
||||
|
||||
def test_depseudonymize_unstructured_job_is_callable():
    """depseudonymize_unstructured_job is callable and exposes ``execute_in_process``."""
    job = depseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
|
||||
|
||||
def test_depseudonymize_unstructured_job_s3_is_callable():
    """depseudonymize_unstructured_job_s3 is callable and exposes ``execute_in_process``."""
    job = depseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
||||
Reference in New Issue
Block a user