Files
template-code-location/tests/field_level_pseudo_anonymisation/test_config_models_coverage.py

634 lines
25 KiB
Python

import pytest
from pydantic import ValidationError
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
AnonymisePseudonymizeStructuredConfig,
DepseudonymizeStructuredConfig,
PseudoTechniqueConfig,
DepseudoTechniqueConfig,
HashConfig,
EncryptConfig,
RedactConfig,
ReplaceConfig,
DecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
AnonymisePseudonymizeUnstructuredConfig,
DepseudonymizeUnstructuredConfig,
PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig,
DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig,
HashConfig as UnstructuredHashConfig,
EncryptConfig as UnstructuredEncryptConfig,
RedactConfig as UnstructuredRedactConfig,
ReplaceConfig as UnstructuredReplaceConfig,
RetainConfig,
DecryptConfig as UnstructuredDecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum
from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum
# ==================== Structured Config Tests ====================
class TestStructuredConfigValidators:
    """Tests for the validators and extraction helpers in structured_config.py."""

    @staticmethod
    def _extract(entry):
        """Run ``_extract_technique_and_columns`` on *entry* via a default config.

        Returns the ``(technique_type, columns)`` pair produced by the helper.
        """
        config = AnonymisePseudonymizeStructuredConfig()
        return config._extract_technique_and_columns(entry)

    def test_ensure_unique_columns_valid_single_technique(self):
        """Test that single technique with single column passes validation."""
        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[
                PseudoTechniqueConfig(
                    technique=EncryptConfig(
                        columns=["email"],
                        key_name="key1",
                    )
                )
            ]
        )
        assert config is not None
        assert len(config.used_function) == 1

    def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self):
        """Test that multiple techniques with different columns passes validation."""
        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[
                PseudoTechniqueConfig(
                    technique=EncryptConfig(
                        columns=["email"],
                        key_name="key1",
                    )
                ),
                PseudoTechniqueConfig(
                    technique=HashConfig(
                        columns=["ssn"],
                        algorithm="sha256",
                    )
                ),
            ]
        )
        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_columns_duplicate_columns_same_technique(self):
        """Test that one column listed under two technique entries raises an error."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(
                            columns=["email"],
                            key_name="key1",
                        )
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(
                            columns=["email"],
                            algorithm="sha256",
                        )
                    ),
                ]
            )
        # Checked after the ``with`` block: statements following the raising
        # call inside the block would never run.
        error_msg = str(exc_info.value)
        assert "Duplicate column" in error_msg
        assert "email" in error_msg

    def test_ensure_unique_columns_multiple_duplicates(self):
        """Test error message with multiple duplicate columns."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(
                            columns=["email", "phone"],
                            key_name="key1",
                        )
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(
                            columns=["email", "phone"],
                            algorithm="sha256",
                        )
                    ),
                ]
            )
        # Every duplicated column is expected to be named in the message.
        error_msg = str(exc_info.value)
        assert "Duplicate column" in error_msg
        assert "email" in error_msg
        assert "phone" in error_msg

    def test_collect_column_to_techniques_single_technique(self):
        """Test _collect_column_to_techniques with single technique."""
        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[
                PseudoTechniqueConfig(
                    technique=EncryptConfig(
                        columns=["email", "phone"],
                        key_name="key1",
                    )
                )
            ]
        )
        mapping = config._collect_column_to_techniques()
        assert mapping == {
            "email": ["encrypt"],
            "phone": ["encrypt"],
        }

    def test_extract_technique_and_columns_dict_with_type_field(self):
        """Test _extract_technique_and_columns with dict containing 'type' field."""
        technique_type, columns = self._extract(
            {
                "technique": {
                    "type": "encrypt",
                    "columns": ["email", "ssn"],
                    "key_name": "test_key",
                }
            }
        )
        assert technique_type == "encrypt"
        assert columns == ["email", "ssn"]

    def test_extract_technique_and_columns_dict_with_variant_mapping(self):
        """Test _extract_technique_and_columns with variant-key mapping {'encrypt': {...}}."""
        technique_type, columns = self._extract(
            {
                "technique": {
                    "encrypt": {
                        "columns": ["ssn"],
                        "key_name": "test_key",
                    }
                }
            }
        )
        assert technique_type == "encrypt"
        assert columns == ["ssn"]

    def test_extract_technique_and_columns_model_instance(self):
        """Test _extract_technique_and_columns with PseudoTechniqueConfig model instance."""
        pseudo_config = PseudoTechniqueConfig(
            technique=RedactConfig(columns=["address"])
        )
        technique_type, columns = self._extract(pseudo_config)
        assert technique_type == "redact"
        assert columns == ["address"]

    def test_extract_technique_and_columns_empty_dict(self):
        """Test _extract_technique_and_columns with empty dict."""
        technique_type, columns = self._extract({"technique": {}})
        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_none_technique(self):
        """Test _extract_technique_and_columns with None technique."""
        technique_type, columns = self._extract({"technique": None})
        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_missing_columns_key(self):
        """Test _extract_technique_and_columns when 'columns' key is missing."""
        technique_type, columns = self._extract(
            {
                "technique": {
                    "type": "encrypt",
                    "key_name": "test_key",
                }
            }
        )
        assert technique_type == "encrypt"
        assert columns == []

    def test_extract_technique_and_columns_model_without_columns_attr(self):
        """Test _extract_technique_and_columns with a ReplaceConfig-backed model instance.

        NOTE(review): the test name refers to a model without a ``columns``
        attribute, but the ReplaceConfig built here is given ``columns``, so
        this presumably does not exercise a missing-attribute path - confirm
        the intended scenario.
        """
        pseudo_config = PseudoTechniqueConfig(
            technique=ReplaceConfig(columns=["old_value"], new_value="NEW")
        )
        technique_type, columns = self._extract(pseudo_config)
        assert technique_type == "replace"
        assert columns == ["old_value"]
class TestStructuredDepseudonymizeConfig:
    """Construction checks for DepseudonymizeStructuredConfig."""

    def test_depseudonymize_config_normalize_used_function_with_dict(self):
        """A raw dict entry in used_function is exposed as a DepseudoTechniqueConfig."""
        raw_entry = {
            "technique": {
                "type": "decrypt",
                "columns": ["email"],
                "key_name": "key1",
            }
        }
        cfg = DepseudonymizeStructuredConfig(used_function=[raw_entry])

        assert len(cfg.used_function) == 1
        first = cfg.used_function[0]
        assert isinstance(first, DepseudoTechniqueConfig)
        assert first.technique.type == "decrypt"

    def test_depseudonymize_config_normalize_used_function_with_model(self):
        """A model instance passed in used_function is kept as the very same object."""
        decrypt_settings = DecryptConfig(columns=["email"], key_name="key1")
        depseudo_tech = DepseudoTechniqueConfig(technique=decrypt_settings)

        cfg = DepseudonymizeStructuredConfig(used_function=[depseudo_tech])

        assert len(cfg.used_function) == 1
        assert cfg.used_function[0] is depseudo_tech

    def test_depseudonymize_config_ensure_unique_columns_no_op(self):
        """Two decrypt entries sharing a column are accepted for depseudonymize."""
        # Depseudonymize has no per-column uniqueness constraint, so the same
        # column may appear under both keys.
        entries = [
            DepseudoTechniqueConfig(
                technique=DecryptConfig(columns=["email"], key_name=key)
            )
            for key in ("key1", "key2")
        ]
        cfg = DepseudonymizeStructuredConfig(used_function=entries)
        # Reaching this point means construction did not raise.
        assert cfg is not None
# ==================== Unstructured Config Tests ====================
class TestUnstructuredConfigValidators:
    """Tests for unstructured_config.py validators."""

    @staticmethod
    def _extract(entry):
        """Run ``_extract_technique_and_pii`` on *entry* via an English-language config.

        Returns the ``(technique_type, piis)`` pair produced by the helper.
        """
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        return config._extract_technique_and_pii(entry)

    def test_normalize_used_function_with_dict(self):
        """Test _normalize_used_function with dict input."""
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                {
                    "technique": {
                        "encrypt": {
                            "pii": [PIIEntityEnum.EMAIL.value],
                            "key_name": "key1",
                        }
                    }
                }
            ],
        )
        assert len(config.used_function) == 1

    def test_normalize_used_function_with_model(self):
        """Test _normalize_used_function with model instance."""
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value],
                key_name="key1",
            )
        )
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[pseudo_tech],
        )
        assert len(config.used_function) == 1

    def test_ensure_unique_pii_valid_different_pii_types(self):
        """Test that different PII types pass validation."""
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                UnstructuredPseudoTechniqueConfig(
                    technique=UnstructuredEncryptConfig(
                        pii=[PIIEntityEnum.EMAIL.value],
                        key_name="key1",
                    )
                ),
                UnstructuredPseudoTechniqueConfig(
                    technique=UnstructuredHashConfig(
                        pii=[PIIEntityEnum.PERSON.value],
                        algorithm="sha256",
                    )
                ),
            ],
        )
        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_pii_duplicate_pii_types(self):
        """Test that duplicate PII types raise error."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeUnstructuredConfig(
                language=LanguageEnum.en,
                used_function=[
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredEncryptConfig(
                            pii=[PIIEntityEnum.EMAIL.value],
                            key_name="key1",
                        )
                    ),
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredHashConfig(
                            pii=[PIIEntityEnum.EMAIL.value],
                            algorithm="sha256",
                        )
                    ),
                ],
            )
        # Checked after the ``with`` block: statements following the raising
        # call inside the block would never run.
        error_msg = str(exc_info.value)
        assert "Duplicate PII" in error_msg
        # Per the original author's note, the message shows the enum repr
        # (PIIEntityEnum.EMAIL) rather than the value, hence the match on "EMAIL".
        assert "EMAIL" in error_msg

    def test_collect_pii_to_techniques_single_technique(self):
        """Test _collect_pii_to_techniques with single technique."""
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                UnstructuredPseudoTechniqueConfig(
                    technique=UnstructuredEncryptConfig(
                        pii=[PIIEntityEnum.EMAIL.value, PIIEntityEnum.PERSON.value],
                        key_name="key1",
                    )
                )
            ],
        )
        mapping = config._collect_pii_to_techniques()
        assert mapping == {
            PIIEntityEnum.EMAIL.value: ["encrypt"],
            PIIEntityEnum.PERSON.value: ["encrypt"],
        }

    def test_extract_technique_and_pii_dict_with_type_field(self):
        """Test _extract_technique_and_pii with dict containing 'type' field."""
        technique_type, piis = self._extract(
            {
                "technique": {
                    "type": "encrypt",
                    "pii": [PIIEntityEnum.EMAIL.value],
                    "key_name": "test_key",
                }
            }
        )
        assert technique_type == "encrypt"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_dict_with_variant_mapping(self):
        """Test _extract_technique_and_pii with variant-key mapping {'hash': {...}}."""
        technique_type, piis = self._extract(
            {
                "technique": {
                    "hash": {
                        "pii": [PIIEntityEnum.PERSON.value],
                        "algorithm": "sha256",
                    }
                }
            }
        )
        assert technique_type == "hash"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_dict_fallback_to_columns(self):
        """Test _extract_technique_and_pii fallback to 'columns' key when 'pii' is missing."""
        technique_type, piis = self._extract(
            {
                "technique": {
                    "type": "redact",
                    "columns": ["fallback_col"],
                }
            }
        )
        assert technique_type == "redact"
        assert piis == ["fallback_col"]

    def test_extract_technique_and_pii_model_instance(self):
        """Test _extract_technique_and_pii with model instance."""
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredRedactConfig(
                pii=[PIIEntityEnum.EMAIL.value]
            )
        )
        technique_type, piis = self._extract(pseudo_tech)
        assert technique_type == "redact"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_model_with_getattr_fallback(self):
        """Test _extract_technique_and_pii with a RetainConfig-backed model instance.

        NOTE(review): the test name mentions a getattr fallback to columns,
        but the RetainConfig built here is given an explicit ``pii`` list, so
        this presumably does not exercise the fallback - confirm the intended
        scenario.
        """
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value])
        )
        technique_type, piis = self._extract(pseudo_tech)
        assert technique_type == "retain"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_empty_dict(self):
        """Test _extract_technique_and_pii with empty dict."""
        technique_type, piis = self._extract({"technique": {}})
        assert technique_type is None
        assert piis == []

    def test_extract_technique_and_pii_missing_pii_key(self):
        """Test _extract_technique_and_pii when 'pii' key is missing."""
        technique_type, piis = self._extract(
            {
                "technique": {
                    "type": "encrypt",
                    "key_name": "test_key",
                }
            }
        )
        assert technique_type == "encrypt"
        assert piis == []
class TestUnstructuredDepseudonymizeConfig:
    """Construction checks for DepseudonymizeUnstructuredConfig."""

    def test_depseudonymize_unstructured_config_default(self):
        """A config built with no arguments carries at least one used_function entry."""
        default_cfg = DepseudonymizeUnstructuredConfig()
        assert default_cfg is not None
        assert len(default_cfg.used_function) >= 1

    def test_depseudonymize_unstructured_config_with_custom_function(self):
        """An explicitly supplied decrypt entry is kept, including its key name."""
        decrypt_settings = UnstructuredDecryptConfig(key_name="custom_key")
        custom_entry = UnstructuredDepseudoTechniqueConfig(technique=decrypt_settings)

        cfg = DepseudonymizeUnstructuredConfig(used_function=[custom_entry])

        assert len(cfg.used_function) == 1
        assert cfg.used_function[0].technique.key_name == "custom_key"
class TestLanguageSupport:
    """Checks on the language field of the unstructured config."""

    def test_all_supported_languages(self):
        """Each listed language is accepted and read back unchanged."""
        candidates = (
            LanguageEnum.hr,
            LanguageEnum.da,
            LanguageEnum.nl,
            LanguageEnum.en,
            LanguageEnum.fi,
            LanguageEnum.fr,
            LanguageEnum.de,
            LanguageEnum.el,
            LanguageEnum.it,
            LanguageEnum.lt,
            LanguageEnum.pl,
            LanguageEnum.pt,
            LanguageEnum.ro,
            LanguageEnum.sl,
            LanguageEnum.es,
            LanguageEnum.sv,
        )
        for language in candidates:
            cfg = AnonymisePseudonymizeUnstructuredConfig(language=language)
            assert cfg.language == language

    def test_default_language_is_english(self):
        """A config built without a language reports LanguageEnum.en."""
        cfg = AnonymisePseudonymizeUnstructuredConfig()
        assert cfg.language == LanguageEnum.en
class TestTechniqueConfigDefaults:
    """Default field values of the individual technique configs."""

    def test_hash_config_default_algorithm(self):
        """HashConfig() reports algorithm 'sha256' and type 'hash'."""
        hash_defaults = HashConfig()
        assert hash_defaults.algorithm == "sha256"
        assert hash_defaults.type == "hash"

    def test_encrypt_config_defaults(self):
        """EncryptConfig() reports type 'encrypt' and key_name 'my_key'."""
        encrypt_defaults = EncryptConfig()
        assert encrypt_defaults.type == "encrypt"
        assert encrypt_defaults.key_name == "my_key"

    def test_redact_config_defaults(self):
        """RedactConfig() reports type 'redact'."""
        redact_defaults = RedactConfig()
        assert redact_defaults.type == "redact"

    def test_replace_config_defaults(self):
        """ReplaceConfig() reports type 'replace' and new_value 'REPLACED'."""
        replace_defaults = ReplaceConfig()
        assert replace_defaults.type == "replace"
        assert replace_defaults.new_value == "REPLACED"

    def test_decrypt_config_defaults(self):
        """DecryptConfig() reports type 'decrypt' and key_name 'my_key'."""
        decrypt_defaults = DecryptConfig()
        assert decrypt_defaults.type == "decrypt"
        assert decrypt_defaults.key_name == "my_key"

    def test_unstructured_retain_config_defaults(self):
        """RetainConfig() reports type 'retain'."""
        retain_defaults = RetainConfig()
        assert retain_defaults.type == "retain"
class TestPseudoTechniqueConfigDefaults:
    """Default technique of the structured and unstructured PseudoTechniqueConfig."""

    @staticmethod
    def _assert_hash_technique(technique):
        """Assert that *technique* denotes the hash technique.

        Handles both shapes the tests allow for: a model exposing ``type``,
        or a dict keyed by the variant name or carrying a ``"type"`` entry.
        """
        if not isinstance(technique, dict):
            assert technique.type == "hash"
            return
        # Dict shape: either {'hash': {...}} or {'type': 'hash', ...}.
        assert "hash" in technique or technique.get("type") == "hash"

    def test_pseudo_technique_default_to_hash(self):
        """PseudoTechniqueConfig() falls back to the hash technique."""
        default_entry = PseudoTechniqueConfig()
        # For Dagster Config, technique may be a dict with the discriminator structure
        self._assert_hash_technique(default_entry.technique)

    def test_unstructured_pseudo_technique_default_to_hash(self):
        """UnstructuredPseudoTechniqueConfig() falls back to the hash technique."""
        default_entry = UnstructuredPseudoTechniqueConfig()
        # For Dagster Config, technique may be a dict with the discriminator structure
        self._assert_hash_technique(default_entry.technique)
class TestConfigModelIntegration:
    """End-to-end construction of configs that combine several techniques."""

    def test_structured_config_with_all_technique_types(self):
        """A structured config holding hash, encrypt, redact and replace entries."""
        technique_settings = [
            HashConfig(columns=["col1"]),
            EncryptConfig(columns=["col2"], key_name="k1"),
            RedactConfig(columns=["col3"]),
            ReplaceConfig(columns=["col4"], new_value="X"),
        ]
        entries = [
            PseudoTechniqueConfig(technique=settings)
            for settings in technique_settings
        ]
        cfg = AnonymisePseudonymizeStructuredConfig(used_function=entries)

        assert len(cfg.used_function) == 4
        seen_types = set()
        for entry in cfg.used_function:
            seen_types.add(entry.technique.type)
        assert seen_types == {"hash", "encrypt", "redact", "replace"}

    def test_unstructured_config_with_all_technique_types(self):
        """An unstructured config holding hash, encrypt, redact, replace and retain entries."""
        technique_settings = [
            UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value]),
            UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.PERSON.value],
                key_name="k1",
            ),
            UnstructuredRedactConfig(pii=[PIIEntityEnum.PHONE_NUMBERS.value]),
            UnstructuredReplaceConfig(
                pii=[PIIEntityEnum.CREDIT_CARD.value],
                new_value="X",
            ),
            RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value]),
        ]
        entries = [
            UnstructuredPseudoTechniqueConfig(technique=settings)
            for settings in technique_settings
        ]
        cfg = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=entries,
        )

        assert len(cfg.used_function) == 5
        seen_types = set()
        for entry in cfg.used_function:
            seen_types.add(entry.technique.type)
        assert seen_types == {"hash", "encrypt", "redact", "replace", "retain"}