import pytest
from pydantic import ValidationError

from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
    AnonymisePseudonymizeStructuredConfig,
    DepseudonymizeStructuredConfig,
    PseudoTechniqueConfig,
    DepseudoTechniqueConfig,
    HashConfig,
    EncryptConfig,
    RedactConfig,
    ReplaceConfig,
    DecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
    AnonymisePseudonymizeUnstructuredConfig,
    DepseudonymizeUnstructuredConfig,
    PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig,
    DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig,
    HashConfig as UnstructuredHashConfig,
    EncryptConfig as UnstructuredEncryptConfig,
    RedactConfig as UnstructuredRedactConfig,
    ReplaceConfig as UnstructuredReplaceConfig,
    RetainConfig,
    DecryptConfig as UnstructuredDecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum
from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum


# ==================== Structured Config Tests ====================


class TestStructuredConfigValidators:
    """Validation and extraction-helper tests for the structured pseudonymise config."""

    def test_ensure_unique_columns_valid_single_technique(self):
        """One encrypt technique on a single column builds successfully."""
        encrypt_step = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1")
        )
        config = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_step])

        assert config is not None
        assert len(config.used_function) == 1

    def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self):
        """Two techniques that target different columns build successfully."""
        encrypt_step = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1")
        )
        hash_step = PseudoTechniqueConfig(
            technique=HashConfig(columns=["ssn"], algorithm="sha256")
        )
        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[encrypt_step, hash_step]
        )

        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_columns_duplicate_columns_same_technique(self):
        """The same column under two techniques raises and names the column."""
        with pytest.raises(ValueError) as exc_info:
            # Everything is built inside the context manager so that any
            # ValueError raised along the way is captured, as before.
            encrypt_step = PseudoTechniqueConfig(
                technique=EncryptConfig(columns=["email"], key_name="key1")
            )
            hash_step = PseudoTechniqueConfig(
                technique=HashConfig(columns=["email"], algorithm="sha256")
            )
            AnonymisePseudonymizeStructuredConfig(
                used_function=[encrypt_step, hash_step]
            )

        message = str(exc_info.value)
        assert "Duplicate column" in message
        assert "email" in message

    def test_ensure_unique_columns_multiple_duplicates(self):
        """With two clashing columns, the error message names both of them."""
        with pytest.raises(ValueError) as exc_info:
            encrypt_step = PseudoTechniqueConfig(
                technique=EncryptConfig(columns=["email", "phone"], key_name="key1")
            )
            hash_step = PseudoTechniqueConfig(
                technique=HashConfig(columns=["email", "phone"], algorithm="sha256")
            )
            AnonymisePseudonymizeStructuredConfig(
                used_function=[encrypt_step, hash_step]
            )

        message = str(exc_info.value)
        assert "Duplicate column" in message
        assert "email" in message
        assert "phone" in message

    def test_collect_column_to_techniques_single_technique(self):
        """_collect_column_to_techniques maps each column to its technique names."""
        encrypt_step = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email", "phone"], key_name="key1")
        )
        config = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_step])

        column_map = config._collect_column_to_techniques()

        expected = {"email": ["encrypt"], "phone": ["encrypt"]}
        assert column_map == expected

    def test_extract_technique_and_columns_dict_with_type_field(self):
        """A dict technique carrying a 'type' key yields that type and its columns."""
        config = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "columns": ["email", "ssn"],
                "key_name": "test_key",
            }
        }

        kind, cols = config._extract_technique_and_columns(raw_entry)

        assert kind == "encrypt"
        assert cols == ["email", "ssn"]

    def test_extract_technique_and_columns_dict_with_variant_mapping(self):
        """A variant-key mapping such as {'encrypt': {...}} yields the key as the type."""
        config = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {
            "technique": {
                "encrypt": {
                    "columns": ["ssn"],
                    "key_name": "test_key",
                }
            }
        }

        kind, cols = config._extract_technique_and_columns(raw_entry)

        assert kind == "encrypt"
        assert cols == ["ssn"]

    def test_extract_technique_and_columns_model_instance(self):
        """A PseudoTechniqueConfig instance yields its technique type and columns."""
        redact_step = PseudoTechniqueConfig(technique=RedactConfig(columns=["address"]))
        config = AnonymisePseudonymizeStructuredConfig()

        kind, cols = config._extract_technique_and_columns(redact_step)

        assert kind == "redact"
        assert cols == ["address"]

    def test_extract_technique_and_columns_empty_dict(self):
        """An empty technique dict yields no type and no columns."""
        config = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {"technique": {}}

        kind, cols = config._extract_technique_and_columns(raw_entry)

        assert kind is None
        assert cols == []

    def test_extract_technique_and_columns_none_technique(self):
        """A None technique yields no type and no columns."""
        config = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {"technique": None}

        kind, cols = config._extract_technique_and_columns(raw_entry)

        assert kind is None
        assert cols == []

    def test_extract_technique_and_columns_missing_columns_key(self):
        """A typed technique dict without 'columns' yields an empty column list."""
        config = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "key_name": "test_key",
            }
        }

        kind, cols = config._extract_technique_and_columns(raw_entry)

        assert kind == "encrypt"
        assert cols == []

    def test_extract_technique_and_columns_model_without_columns_attr(self):
        """A ReplaceConfig-backed model instance yields 'replace' and its columns.

        NOTE(review): despite the test name, the technique built here does
        define ``columns``; the assertions check that those are returned.
        """
        replace_step = PseudoTechniqueConfig(
            technique=ReplaceConfig(columns=["old_value"], new_value="NEW")
        )
        config = AnonymisePseudonymizeStructuredConfig()

        kind, cols = config._extract_technique_and_columns(replace_step)

        assert kind == "replace"
        assert cols == ["old_value"]


class TestStructuredDepseudonymizeConfig:
    """Construction tests for DepseudonymizeStructuredConfig."""

    def test_depseudonymize_config_normalize_used_function_with_dict(self):
        """A dict entry in used_function becomes a DepseudoTechniqueConfig (decrypt)."""
        raw_entry = {
            "technique": {
                "type": "decrypt",
                "columns": ["email"],
                "key_name": "key1",
            }
        }

        config = DepseudonymizeStructuredConfig(used_function=[raw_entry])

        assert len(config.used_function) == 1
        first = config.used_function[0]
        assert isinstance(first, DepseudoTechniqueConfig)
        assert config.used_function[0].technique.type == "decrypt"

    def test_depseudonymize_config_normalize_used_function_with_model(self):
        """A DepseudoTechniqueConfig passed in used_function is kept as the same object."""
        decrypt_step = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1")
        )

        config = DepseudonymizeStructuredConfig(used_function=[decrypt_step])

        assert len(config.used_function) == 1
        assert config.used_function[0] is decrypt_step

    def test_depseudonymize_config_ensure_unique_columns_no_op(self):
        """The same column under two decrypt techniques does not raise."""
        # Depseudonymize has no per-column uniqueness constraint, so repeating
        # "email" with a different key is expected to be accepted.
        first_step = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1")
        )
        second_step = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key2")
        )

        config = DepseudonymizeStructuredConfig(
            used_function=[first_step, second_step]
        )

        # Reaching this point means construction did not raise.
        assert config is not None


# ==================== Unstructured Config Tests ====================


class TestUnstructuredConfigValidators:
    """Validation and extraction-helper tests for the unstructured pseudonymise config."""

    def test_normalize_used_function_with_dict(self):
        """A variant-key dict entry in used_function is accepted."""
        raw_entry = {
            "technique": {
                "encrypt": {
                    "pii": [PIIEntityEnum.EMAIL.value],
                    "key_name": "key1",
                }
            }
        }

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[raw_entry],
        )

        assert len(config.used_function) == 1

    def test_normalize_used_function_with_model(self):
        """A model-instance entry in used_function is accepted."""
        encrypt_step = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value],
                key_name="key1",
            )
        )

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[encrypt_step],
        )

        assert len(config.used_function) == 1

    def test_ensure_unique_pii_valid_different_pii_types(self):
        """Two techniques that target different PII entities build successfully."""
        encrypt_step = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value],
                key_name="key1",
            )
        )
        hash_step = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredHashConfig(
                pii=[PIIEntityEnum.PERSON.value],
                algorithm="sha256",
            )
        )

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[encrypt_step, hash_step],
        )

        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_pii_duplicate_pii_types(self):
        """The same PII entity under two techniques raises and names the entity."""
        with pytest.raises(ValueError) as exc_info:
            # Built inside the context manager so any ValueError raised while
            # constructing the pieces is captured, as before.
            encrypt_step = UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredEncryptConfig(
                    pii=[PIIEntityEnum.EMAIL.value],
                    key_name="key1",
                )
            )
            hash_step = UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredHashConfig(
                    pii=[PIIEntityEnum.EMAIL.value],
                    algorithm="sha256",
                )
            )
            AnonymisePseudonymizeUnstructuredConfig(
                language=LanguageEnum.en,
                used_function=[encrypt_step, hash_step],
            )

        message = str(exc_info.value)
        assert "Duplicate PII" in message
        # The message shows PIIEntityEnum.EMAIL (the enum repr) rather than the
        # value, so only the member name is checked.
        assert "EMAIL" in message

    def test_collect_pii_to_techniques_single_technique(self):
        """_collect_pii_to_techniques maps each PII entity to its technique names."""
        encrypt_step = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value, PIIEntityEnum.PERSON.value],
                key_name="key1",
            )
        )
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[encrypt_step],
        )

        pii_map = config._collect_pii_to_techniques()

        expected = {
            PIIEntityEnum.EMAIL.value: ["encrypt"],
            PIIEntityEnum.PERSON.value: ["encrypt"],
        }
        assert pii_map == expected

    def test_extract_technique_and_pii_dict_with_type_field(self):
        """A dict technique carrying a 'type' key yields that type and its PII list."""
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "pii": [PIIEntityEnum.EMAIL.value],
                "key_name": "test_key",
            }
        }

        kind, entities = config._extract_technique_and_pii(raw_entry)

        assert kind == "encrypt"
        assert entities == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_dict_with_variant_mapping(self):
        """A variant-key mapping such as {'hash': {...}} yields the key as the type."""
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {
            "technique": {
                "hash": {
                    "pii": [PIIEntityEnum.PERSON.value],
                    "algorithm": "sha256",
                }
            }
        }

        kind, entities = config._extract_technique_and_pii(raw_entry)

        assert kind == "hash"
        assert entities == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_dict_fallback_to_columns(self):
        """Without a 'pii' key, the values under 'columns' are returned instead."""
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {
            "technique": {
                "type": "redact",
                "columns": ["fallback_col"],
            }
        }

        kind, entities = config._extract_technique_and_pii(raw_entry)

        assert kind == "redact"
        assert entities == ["fallback_col"]

    def test_extract_technique_and_pii_model_instance(self):
        """A model instance yields its technique type and PII list."""
        redact_step = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredRedactConfig(pii=[PIIEntityEnum.EMAIL.value])
        )
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        kind, entities = config._extract_technique_and_pii(redact_step)

        assert kind == "redact"
        assert entities == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_model_with_getattr_fallback(self):
        """A RetainConfig-backed model instance yields 'retain' and its PII list.

        NOTE(review): the test name refers to a getattr fallback to columns,
        but the technique built here sets ``pii`` explicitly; whether the
        fallback path is exercised cannot be told from this test alone.
        """
        retain_step = UnstructuredPseudoTechniqueConfig(
            technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value])
        )
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        kind, entities = config._extract_technique_and_pii(retain_step)

        assert kind == "retain"
        assert entities == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_empty_dict(self):
        """An empty technique dict yields no type and no PII entities."""
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {"technique": {}}

        kind, entities = config._extract_technique_and_pii(raw_entry)

        assert kind is None
        assert entities == []

    def test_extract_technique_and_pii_missing_pii_key(self):
        """A typed technique dict without 'pii' yields an empty PII list."""
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "key_name": "test_key",
            }
        }

        kind, entities = config._extract_technique_and_pii(raw_entry)

        assert kind == "encrypt"
        assert entities == []


class TestUnstructuredDepseudonymizeConfig:
    """Construction tests for DepseudonymizeUnstructuredConfig."""

    def test_depseudonymize_unstructured_config_default(self):
        """The default config builds and has at least one used_function entry."""
        config = DepseudonymizeUnstructuredConfig()

        assert config is not None
        assert len(config.used_function) >= 1

    def test_depseudonymize_unstructured_config_with_custom_function(self):
        """A custom decrypt entry is kept and exposes the key name it was given."""
        decrypt_step = UnstructuredDepseudoTechniqueConfig(
            technique=UnstructuredDecryptConfig(key_name="custom_key")
        )

        config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])

        assert len(config.used_function) == 1
        assert config.used_function[0].technique.key_name == "custom_key"


class TestLanguageSupport:
    """Tests for the language field of the unstructured config."""

    def test_all_supported_languages(self):
        """Each listed LanguageEnum member can be set and is read back unchanged."""
        languages = [
            LanguageEnum.hr,
            LanguageEnum.da,
            LanguageEnum.nl,
            LanguageEnum.en,
            LanguageEnum.fi,
            LanguageEnum.fr,
            LanguageEnum.de,
            LanguageEnum.el,
            LanguageEnum.it,
            LanguageEnum.lt,
            LanguageEnum.pl,
            LanguageEnum.pt,
            LanguageEnum.ro,
            LanguageEnum.sl,
            LanguageEnum.es,
            LanguageEnum.sv,
        ]

        for language in languages:
            built = AnonymisePseudonymizeUnstructuredConfig(language=language)
            assert built.language == language

    def test_default_language_is_english(self):
        """Omitting the language gives LanguageEnum.en."""
        built = AnonymisePseudonymizeUnstructuredConfig()

        assert built.language == LanguageEnum.en


class TestTechniqueConfigDefaults:
    """Default-value tests for the individual technique configs."""

    def test_hash_config_default_algorithm(self):
        """HashConfig defaults to the sha256 algorithm and type 'hash'."""
        hash_config = HashConfig()

        assert hash_config.algorithm == "sha256"
        assert hash_config.type == "hash"

    def test_encrypt_config_defaults(self):
        """EncryptConfig defaults to type 'encrypt' and key name 'my_key'."""
        encrypt_config = EncryptConfig()

        assert encrypt_config.type == "encrypt"
        assert encrypt_config.key_name == "my_key"

    def test_redact_config_defaults(self):
        """RedactConfig defaults to type 'redact'."""
        redact_config = RedactConfig()

        assert redact_config.type == "redact"

    def test_replace_config_defaults(self):
        """ReplaceConfig defaults to type 'replace' and new value 'REPLACED'."""
        replace_config = ReplaceConfig()

        assert replace_config.type == "replace"
        assert replace_config.new_value == "REPLACED"

    def test_decrypt_config_defaults(self):
        """DecryptConfig defaults to type 'decrypt' and key name 'my_key'."""
        decrypt_config = DecryptConfig()

        assert decrypt_config.type == "decrypt"
        assert decrypt_config.key_name == "my_key"

    def test_unstructured_retain_config_defaults(self):
        """RetainConfig defaults to type 'retain'."""
        retain_config = RetainConfig()

        assert retain_config.type == "retain"


def _assert_hash_technique(technique):
    """Assert that *technique* denotes the hash technique.

    For Dagster Config the value may be a dict with the discriminator
    structure rather than a model instance, so both shapes are accepted: a
    dict must contain a ``"hash"`` key or have ``"type" == "hash"``; anything
    else must expose ``.type == "hash"``.
    """
    if not isinstance(technique, dict):
        assert technique.type == "hash"
        return
    assert "hash" in technique or technique.get("type") == "hash"


class TestPseudoTechniqueConfigDefaults:
    """Default-technique tests for the PseudoTechniqueConfig wrappers."""

    def test_pseudo_technique_default_to_hash(self):
        """The structured PseudoTechniqueConfig defaults to the hash technique."""
        default_step = PseudoTechniqueConfig()

        _assert_hash_technique(default_step.technique)

    def test_unstructured_pseudo_technique_default_to_hash(self):
        """The unstructured PseudoTechniqueConfig defaults to the hash technique."""
        default_step = UnstructuredPseudoTechniqueConfig()

        _assert_hash_technique(default_step.technique)


class TestConfigModelIntegration:
    """Integration tests that combine every technique type in one config."""

    def test_structured_config_with_all_technique_types(self):
        """Hash, encrypt, redact and replace coexist when their columns differ."""
        steps = [
            PseudoTechniqueConfig(technique=HashConfig(columns=["col1"])),
            PseudoTechniqueConfig(
                technique=EncryptConfig(columns=["col2"], key_name="k1")
            ),
            PseudoTechniqueConfig(technique=RedactConfig(columns=["col3"])),
            PseudoTechniqueConfig(
                technique=ReplaceConfig(columns=["col4"], new_value="X")
            ),
        ]

        config = AnonymisePseudonymizeStructuredConfig(used_function=steps)

        assert len(config.used_function) == 4
        seen_types = set()
        for step in config.used_function:
            seen_types.add(step.technique.type)
        assert seen_types == {"hash", "encrypt", "redact", "replace"}

    def test_unstructured_config_with_all_technique_types(self):
        """Hash, encrypt, redact, replace and retain coexist on distinct PII entities."""
        steps = [
            UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value])
            ),
            UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredEncryptConfig(
                    pii=[PIIEntityEnum.PERSON.value],
                    key_name="k1",
                )
            ),
            UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredRedactConfig(
                    pii=[PIIEntityEnum.PHONE_NUMBERS.value]
                )
            ),
            UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredReplaceConfig(
                    pii=[PIIEntityEnum.CREDIT_CARD.value],
                    new_value="X",
                )
            ),
            UnstructuredPseudoTechniqueConfig(
                technique=RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value])
            ),
        ]

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=steps,
        )

        assert len(config.used_function) == 5
        seen_types = set()
        for step in config.used_function:
            seen_types.add(step.technique.type)
        assert seen_types == {"hash", "encrypt", "redact", "replace", "retain"}