""" Test suite for data restoration (depseudonymization) operations. This test suite validates the data restoration feature against the following Acceptance Criteria: ## Test Coverage Summary ### Acceptance Criteria Coverage: - AC1 (Data Restoration with Valid Key): 7 tests - AC2 (Restoration Denial - Missing Key): 3 tests - AC3 (Restoration Denial - Unauthorized Access): 2 tests - AC4 (Restoration Denial - Invalid Key): 3 tests - Additional Coverage: 3 tests ### Test Pattern: - Each test uses build_op_context with .model_dump() for configuration - Tests validate dual outputs (data, metrics) - Tests verify complete restoration of original values - Tests validate security controls and error handling """ import pandas as pd import pytest from cryptography.fernet import Fernet from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( AnonymisePseudonymizeStructuredConfig, DepseudonymizeStructuredConfig, EncryptConfig, DecryptConfig, PseudoTechniqueConfig, DepseudoTechniqueConfig, ) # Import helper functions (fixtures are auto-discovered by pytest) from .conftest import ( run_encrypt_op, run_decrypt_op, clear_vault_key, set_vault_key, deny_vault_access, get_vault_key, ) # -------------------------------- Test Markers Configuration -------------------------------- # Register custom markers pytest.mark.slow = pytest.mark.slow pytest.mark.security = pytest.mark.security pytest.mark.edge_case = pytest.mark.edge_case pytest.mark.integration = pytest.mark.integration # ---------------------- AC1: Data Restoration with Valid Key -------------------------------- def test_ac1_restore_single_encrypted_field_with_valid_key( sample_df, encrypt_config_single_field, decrypt_config_single_field ): """ AC1: Data Restoration using Secret Management Tool-Stored Decryption Key Scenario: Restore encrypted field with a valid key Given: A pseudonymised dataset with encrypted email field And: A valid decryption key stored in secret management tool And: The participant provided the field that needs to be restored (email) And: The participant is authorized When: The participant requests data restoration And: Provides the correct key name Then: The system retrieves the key from secret management tool And: Decrypts the dataset accurately And: All original values are restored And: A success message is presented to the user (via successful return) And: The result is presented to the user """ # Clear any existing test key clear_vault_key("test_restoration_key_single") # Step 1: Encrypt the data (pseudonymisation phase) encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Verify encryption occurred assert not encrypted_df["email"].equals(sample_df["email"]), "Email field should be encrypted" # Verify key was created in Vault key = get_vault_key("test_restoration_key_single") assert key is not None, "Encryption key should exist in Vault" # Step 2: Restore the data (depseudonymisation phase) restored_df, metrics = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) # Verify restoration succeeded assert restored_df is not None, "Restored DataFrame should not be None" assert metrics is not None, "Metrics should not be None" # Verify all original values are restored exactly assert restored_df["email"].equals( sample_df["email"] ), "Email field should be restored to original values" # Verify each individual value for idx, (original, restored) in enumerate(zip(sample_df["email"], restored_df["email"])): assert ( original == restored ), f"Row {idx}: Original '{original}' should match restored '{restored}'" # Verify row count preserved assert len(restored_df) == len(sample_df), "Row count should be preserved during restoration" # Verify non-encrypted columns remain unchanged assert restored_df["name"].equals( sample_df["name"] ), "Non-encrypted fields should remain unchanged" assert restored_df["age"].equals( sample_df["age"] ), "Non-encrypted fields should remain unchanged" assert restored_df["department"].equals( sample_df["department"] ), "Non-encrypted fields should remain unchanged" def test_ac1_restore_multiple_encrypted_fields_with_valid_key( sample_df, encrypt_config_multiple_fields, decrypt_config_multiple_fields ): """ AC1: Data Restoration of multiple encrypted fields with a valid key Scenario: Restore multiple encrypted fields (name, email, ssn) with a valid key Given: A pseudonymised dataset with multiple encrypted fields And: A valid decryption key stored in secret management tool And: The participant provided the fields that need to be restored When: The participant requests data restoration Then: All specified fields are decrypted accurately And: All original values are restored """ clear_vault_key("test_restoration_key_multi") # Encrypt multiple fields encrypted_df, _ = run_encrypt_op(encrypt_config_multiple_fields, sample_df.copy()) # Verify all specified fields were encrypted assert not encrypted_df["name"].equals(sample_df["name"]), "Name should be encrypted" assert not encrypted_df["email"].equals(sample_df["email"]), "Email should be encrypted" assert not encrypted_df["ssn"].equals(sample_df["ssn"]), "SSN should be encrypted" # Restore all encrypted fields restored_df, _ = run_decrypt_op(decrypt_config_multiple_fields, encrypted_df.copy()) # Verify all fields restored to original values assert restored_df["name"].equals( sample_df["name"] ), "Name field should be restored to original values" assert restored_df["email"].equals( sample_df["email"] ), "Email field should be restored to original values" assert restored_df["ssn"].equals( sample_df["ssn"] ), "SSN field should be restored to original values" # Verify non-encrypted columns remain unchanged assert restored_df["age"].equals( sample_df["age"] ), "Non-encrypted fields should remain unchanged" assert restored_df["salary"].equals( sample_df["salary"] ), "Non-encrypted fields should remain unchanged" def test_ac1_restore_partial_fields_leaves_others_encrypted( sample_df, encrypt_config_multiple_fields ): """ AC1: Partial restoration - participant specifies only some fields to restore Scenario: Restore only selected fields while leaving others encrypted Given: A pseudonymised dataset with multiple encrypted fields (name, email, ssn) And: The participant specifies only some fields to restore (e.g., only email) When: The participant requests partial restoration Then: Only the specified fields are decrypted And: Other encrypted fields remain encrypted """ clear_vault_key("test_restoration_key_multi") # Encrypt multiple fields encrypted_df, _ = run_encrypt_op(encrypt_config_multiple_fields, sample_df.copy()) # Create config to restore only email field partial_decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], # Only restore email key_name="test_restoration_key_multi", ) ) ] ) # Restore only email field restored_df, _ = run_decrypt_op(partial_decrypt_config, encrypted_df.copy()) # Verify email is restored assert restored_df["email"].equals( sample_df["email"] ), "Email field should be restored to original values" # Verify other fields remain encrypted (different from original) assert not restored_df["name"].equals(sample_df["name"]), "Name field should remain encrypted" assert not restored_df["ssn"].equals(sample_df["ssn"]), "SSN field should remain encrypted" def test_ac1_restore_preserves_data_types(sample_df): """ AC1: Data restoration preserves original data types for all fields Scenario: Restore encrypted numeric and string fields Given: A dataset with mixed data types (strings, integers, floats) When: Fields are encrypted and then restored Then: Original data types are preserved after restoration """ # Create config to encrypt mixed types encrypt_config = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig( type="encrypt", columns=["name", "age", "salary"], key_name="test_restoration_types", ) ) ] ) decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["name", "age", "salary"], key_name="test_restoration_types", ) ) ] ) clear_vault_key("test_restoration_types") # Encrypt and restore encrypted_df, _ = run_encrypt_op(encrypt_config, sample_df.copy()) restored_df, _ = run_decrypt_op(decrypt_config, encrypted_df.copy()) # Verify values are restored (as strings due to encryption/decryption) # Note: Fernet encryption/decryption converts everything to strings # This is expected behavior - original types are preserved via string representation assert ( restored_df["name"].tolist() == sample_df["name"].tolist() ), "String values should be restored" assert ( restored_df["age"].tolist() == sample_df["age"].astype(str).tolist() ), "Integer values should be restored as strings" assert ( restored_df["salary"].tolist() == sample_df["salary"].astype(str).tolist() ), "Float values should be restored as strings" def test_ac1_restore_empty_dataframe(encrypt_config_single_field, decrypt_config_single_field): """ AC1: Edge case - restore an empty dataset Scenario: Attempt to restore an empty pseudonymised dataset Given: An empty DataFrame with correct schema When: Restoration is attempted Then: Operation completes successfully without errors And: Returns an empty DataFrame """ clear_vault_key("test_restoration_key_single") # Create empty DataFrame with same schema empty_df = pd.DataFrame(columns=["id", "name", "email", "ssn", "age", "salary", "department"]) # Encrypt (should handle empty DataFrame) encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, empty_df.copy()) # Restore (should also handle empty DataFrame) restored_df, metrics = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) # Verify empty DataFrame returned assert len(restored_df) == 0, "Restored DataFrame should be empty" assert list(restored_df.columns) == list(empty_df.columns), "Column schema should be preserved" def test_ac1_restore_with_special_characters( encrypt_config_single_field, decrypt_config_single_field ): """ AC1: Data restoration with special characters and edge case values Scenario: Restore data containing special characters, unicode, etc. Given: A dataset with special characters in string fields When: Data is encrypted and then restored Then: All special characters are preserved accurately """ clear_vault_key("test_restoration_key_single") # Create DataFrame with special characters special_df = pd.DataFrame( { "id": [1, 2, 3, 4], "name": ["José García", "François Müller", "李明", "O'Brien"], "email": [ "josé@example.com", "françois@example.com", "li@example.cn", "o'brien@example.ie", ], "ssn": ["123-45-6789", "234-56-7890", "345-67-8901", "456-78-9012"], "age": [25, 30, 35, 40], "salary": [50000.0, 60000.0, 70000.0, 80000.0], "department": ["HR", "IT", "Finance", "IT"], } ) # Encrypt and restore encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, special_df.copy()) restored_df, _ = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) # Verify special characters preserved assert restored_df["email"].equals( special_df["email"] ), "Special characters should be preserved during restoration" for idx, (original, restored) in enumerate(zip(special_df["email"], restored_df["email"])): assert ( original == restored ), f"Row {idx}: Special characters in '{original}' should be preserved" # ------------------- AC2: Restoration Denial when Key is Missing ---------------------------- def test_ac2_restore_fails_when_key_missing(sample_df, encrypt_config_single_field): """ AC2: Restoration Denial when Decryption Key is missing Scenario: Attempt to restore encrypted fields when decryption key is missing Given: A pseudonymised dataset And: The decryption key is missing from Vault And: The participant provides the correct key name When: The participant attempts to restore the data Then: The system fails the restoration request And: Logs the failed key retrieval for auditing (via exception) And: An error message is presented to the user """ clear_vault_key("test_restoration_key_single") # Encrypt data first encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Delete the key from Vault to simulate missing key clear_vault_key("test_restoration_key_single") # Create decrypt config with missing key decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="test_restoration_key_single", ) ) ] ) # Attempt restoration - should fail with clear error with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config, encrypted_df.copy()) # Verify error message is informative error_message = str(exc_info.value) assert ( "not found" in error_message.lower() or "decrypt" in error_message.lower() ), "Error message should indicate key not found for decrypt operation" assert ( "test_restoration_key_single" in error_message ), "Error message should include the key name for auditing" def test_ac2_restore_fails_with_nonexistent_key_name(sample_df, encrypt_config_single_field): """ AC2: Restoration fails when using a key name that never existed Scenario: Attempt to restore with a key name that was never created Given: A pseudonymised dataset And: A key name that does not exist in Vault When: The participant attempts to restore the data Then: The system fails the restoration request with appropriate error """ clear_vault_key("test_restoration_key_single") # Encrypt data with one key encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Try to decrypt with a different, non-existent key decrypt_config_wrong_key = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="nonexistent_key_name" ) ) ] ) # Attempt restoration - should fail with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config_wrong_key, encrypted_df.copy()) error_message = str(exc_info.value) assert "not found" in error_message.lower(), "Error message should indicate key not found" def test_ac2_restore_fails_when_key_corrupted(sample_df, encrypt_config_single_field): """ AC2: Restoration Denial when Decryption Key is corrupted Scenario: Attempt to restore when key is corrupted in Vault Given: A pseudonymised dataset And: The decryption key is corrupted (invalid format) When: The participant attempts to restore the data Then: The system fails the restoration request And: An appropriate error message is presented """ clear_vault_key("test_restoration_key_single") # Encrypt data first encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Corrupt the key by replacing it with invalid data set_vault_key("test_restoration_key_single", "corrupted_invalid_key_data") # Create decrypt config decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="test_restoration_key_single", ) ) ] ) # Attempt restoration - should fail due to corrupted key with pytest.raises(Exception) as exc_info: run_decrypt_op(decrypt_config, encrypted_df.copy()) # Should raise either ValueError or Fernet-related exception assert "Fernet" in str(type(exc_info.value)) or "ValueError" in str( type(exc_info.value) ), "Should raise Fernet or ValueError for corrupted key" # ------------- AC3: Restoration Denial when Access is Unauthorized -------------------------- def test_ac3_restore_fails_when_access_unauthorized(sample_df, encrypt_config_single_field): """ AC3: Restoration Denial when Decryption Key access is unauthorized Scenario: Attempt to restore encrypted fields without authorization Given: A pseudonymised dataset And: A decryption key in secret management tool And: The participant is not authorized to access the key When: The participant attempts to restore the data Then: The system denies the participant access to the key And: The system denies the initiation of the restoration process And: The system logs the unauthorized access attempt (via exception) And: An appropriate error message is presented to the user """ clear_vault_key("test_restoration_key_single") # Encrypt data first encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Set access control to deny access deny_vault_access("test_restoration_key_single") # Create decrypt config decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="test_restoration_key_single", ) ) ] ) # Attempt restoration - should fail with ValueError (wrapping Forbidden) with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config, encrypted_df.copy()) # Verify error indicates access denial error_message = str(exc_info.value) assert ( "access denied" in error_message.lower() or "error while reading" in error_message.lower() ), "Error message should indicate access denial or error reading key" assert ( "test_restoration_key_single" in error_message ), "Error message should include the key name for auditing" def test_ac3_restore_multiple_keys_with_mixed_authorization(sample_df): """ AC3: Restoration with mixed authorization - some keys authorized, others not Scenario: Attempt to restore multiple fields where user has access to some keys but not others Given: A pseudonymised dataset with multiple encrypted fields using different keys And: The participant is authorized for some keys but not others When: The participant attempts to restore all fields Then: The system denies access when unauthorized key is encountered """ # Encrypt email with one key, ssn with another encrypt_config_multi_keys = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig( type="encrypt", columns=["email"], key_name="authorized_key" ) ) ] ) clear_vault_key("authorized_key") clear_vault_key("unauthorized_key") # Encrypt data encrypted_df, _ = run_encrypt_op(encrypt_config_multi_keys, sample_df.copy()) # Manually encrypt another field with different key (simulating separate encryption) encrypt_config_ssn = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig( type="encrypt", columns=["ssn"], key_name="unauthorized_key" ) ) ] ) encrypted_df, _ = run_encrypt_op(encrypt_config_ssn, encrypted_df.copy()) # Deny access to unauthorized_key deny_vault_access("unauthorized_key") # Try to decrypt both fields decrypt_config_both = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="authorized_key" ) ), DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["ssn"], key_name="unauthorized_key" ) ), ] ) # Should fail when trying to access unauthorized_key with ValueError (wrapping Forbidden) with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config_both, encrypted_df.copy()) # Verify error indicates access issue with unauthorized key error_message = str(exc_info.value) assert ( "access denied" in error_message.lower() or "error while reading" in error_message.lower() ), "Error message should indicate access denial" assert "unauthorized_key" in error_message, "Error message should mention the unauthorized key" # ------------------- AC4: Restoration Denial when Key is Invalid ---------------------------- def test_ac4_restore_fails_with_wrong_key(sample_df): """ AC4: Restoration Denial when Decryption Key is invalid Scenario: Attempt to restore encrypted fields with a key that doesn't match the encryption key Given: A pseudonymised dataset encrypted with key A And: A different valid decryption key B is stored in secret management tool And: The participant provides key B (which is not the correct key) And: Key B does not correspond to the fields to be restored When: The participant attempts to restore the data Then: The system fails the restoration request And: Logs the failed decryption attempt for auditing (via exception) And: An error message is presented to the user """ # Encrypt with one key encrypt_config_key_a = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig( type="encrypt", columns=["email"], key_name="encryption_key_a" ) ) ] ) clear_vault_key("encryption_key_a") clear_vault_key("encryption_key_b") # Encrypt data with key A encrypted_df, _ = run_encrypt_op(encrypt_config_key_a, sample_df.copy()) # Generate a different valid key B in Vault different_key = Fernet.generate_key().decode() set_vault_key("encryption_key_b", different_key) # Try to decrypt with key B (wrong key) decrypt_config_key_b = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="encryption_key_b" ) ) ] ) # Attempt restoration - should fail with InvalidToken or ValueError with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config_key_b, encrypted_df.copy()) # Verify error message indicates decryption failure error_message = str(exc_info.value) assert ( "invalid" in error_message.lower() or "token" in error_message.lower() ), "Error message should indicate invalid token or decryption failure" assert ( "encryption_key_b" in error_message ), "Error message should include the key name for auditing" def test_ac4_restore_fails_with_key_from_different_field(sample_df): """ AC4: Restoration fails when using a key intended for a different field Scenario: Attempt to restore field A using the key for field B Given: A dataset with multiple fields encrypted with different keys And: The participant provides the key for field B to decrypt field A When: The participant attempts to restore field A Then: The system fails the restoration request """ # Encrypt email and ssn with different keys encrypt_config_email = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig(type="encrypt", columns=["email"], key_name="email_key") ) ] ) encrypt_config_ssn = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig(type="encrypt", columns=["ssn"], key_name="ssn_key") ) ] ) clear_vault_key("email_key") clear_vault_key("ssn_key") # Encrypt both fields encrypted_df, _ = run_encrypt_op(encrypt_config_email, sample_df.copy()) encrypted_df, _ = run_encrypt_op(encrypt_config_ssn, encrypted_df.copy()) # Try to decrypt email field using ssn_key decrypt_config_wrong_field = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], # Trying to decrypt email key_name="ssn_key", # But using ssn's key ) ) ] ) # Should fail with InvalidToken with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config_wrong_field, encrypted_df.copy()) error_message = str(exc_info.value) assert ( "invalid" in error_message.lower() or "token" in error_message.lower() ), "Error message should indicate invalid token" def test_ac4_restore_fails_with_tampered_encrypted_data(sample_df, encrypt_config_single_field): """ AC4: Restoration fails when encrypted data has been tampered with Scenario: Attempt to restore encrypted data that has been modified Given: A pseudonymised dataset And: Some encrypted values have been tampered with And: The correct decryption key is provided When: The participant attempts to restore the data Then: The system fails the restoration for tampered values And: An appropriate error message is presented """ clear_vault_key("test_restoration_key_single") # Encrypt data encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Tamper with encrypted data (modify one encrypted value) encrypted_df.loc[0, "email"] = "tampered_invalid_encrypted_data" # Create decrypt config decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="test_restoration_key_single", ) ) ] ) # Attempt restoration - should fail on tampered data with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config, encrypted_df.copy()) error_message = str(exc_info.value) assert ( "invalid" in error_message.lower() or "token" in error_message.lower() ), "Error message should indicate invalid token due to tampering" # ---------------- Additional Edge Cases and Integration Tests ------------------------------- def test_integration_full_cycle_encrypt_decrypt_multiple_operations(sample_df): """ Integration test: Full cycle of multiple encrypt/decrypt operations Scenario: Complex workflow with multiple encryption and restoration operations Given: A dataset When: Multiple fields are encrypted at different times And: Fields are restored in different orders Then: All operations complete successfully And: Final restored data matches original """ # Phase 1: Encrypt email encrypt_config_1 = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key_1") ) ] ) clear_vault_key("key_1") encrypted_df_1, _ = run_encrypt_op(encrypt_config_1, sample_df.copy()) # Phase 2: Encrypt name and ssn encrypt_config_2 = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig(type="encrypt", columns=["name", "ssn"], key_name="key_2") ) ] ) clear_vault_key("key_2") encrypted_df_2, _ = run_encrypt_op(encrypt_config_2, encrypted_df_1.copy()) # Phase 3: Restore email first decrypt_config_1 = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig(type="decrypt", columns=["email"], key_name="key_1") ) ] ) restored_df_1, _ = run_decrypt_op(decrypt_config_1, encrypted_df_2.copy()) assert restored_df_1["email"].equals(sample_df["email"]), "Email should be restored" # Phase 4: Restore name and ssn decrypt_config_2 = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig(type="decrypt", columns=["name", "ssn"], key_name="key_2") ) ] ) restored_df_2, _ = run_decrypt_op(decrypt_config_2, restored_df_1.copy()) # Verify all fields restored assert restored_df_2["email"].equals(sample_df["email"]), "Email should remain restored" assert restored_df_2["name"].equals(sample_df["name"]), "Name should be restored" assert restored_df_2["ssn"].equals(sample_df["ssn"]), "SSN should be restored" def test_restore_with_null_values(encrypt_config_single_field, decrypt_config_single_field): """ Edge case: Restoration of dataset with null/NaN values Scenario: Dataset contains null values in encrypted fields Given: A dataset with null values in fields to be encrypted When: Data is encrypted and then restored Then: Null values are handled appropriately """ clear_vault_key("test_restoration_key_single") # Create DataFrame with null values df_with_nulls = pd.DataFrame( { "id": [1, 2, 3, 4], "name": ["Alice", "Bob", None, "David"], "email": [ "alice@example.com", None, "charlie@example.com", "david@example.com", ], "ssn": ["123-45-6789", "234-56-7890", "345-67-8901", None], "age": [25, 30, 35, 40], "salary": [50000.0, 60000.0, 70000.0, 80000.0], "department": ["HR", "IT", "Finance", "IT"], } ) # Note: Encryption of NaN/None values will convert them to string "nan" or "None" # This is expected behavior - Fernet encryption requires string input encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, df_with_nulls.copy()) restored_df, _ = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) # Verify non-null values are restored correctly assert restored_df.loc[0, "email"] == "alice@example.com" assert restored_df.loc[2, "email"] == "charlie@example.com" assert restored_df.loc[3, "email"] == "david@example.com" def test_restore_large_dataset_performance(): """ Performance test: Restoration of large dataset Scenario: Restore a large dataset with many rows Given: A large dataset with 10,000 rows When: Data is encrypted and then restored Then: Operation completes without errors or timeout And: All values are restored correctly """ # Create large dataset large_df = pd.DataFrame( { "id": range(1, 10001), "email": [f"user{i}@example.com" for i in range(1, 10001)], "name": [f"User {i}" for i in range(1, 10001)], "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in range(1, 10001)], "age": [20 + (i % 50) for i in range(1, 10001)], "salary": [30000 + (i * 10) for i in range(1, 10001)], "department": [["HR", "IT", "Finance", "Sales"][i % 4] for i in range(1, 10001)], } ) encrypt_config = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig( type="encrypt", columns=["email"], key_name="test_large_dataset" ) ) ] ) decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="test_large_dataset" ) ) ] ) clear_vault_key("test_large_dataset") # Encrypt and restore encrypted_df, _ = run_encrypt_op(encrypt_config, large_df.copy()) restored_df, _ = run_decrypt_op(decrypt_config, encrypted_df.copy()) # Verify sample of values assert len(restored_df) == 10000, "Should restore all 10,000 rows" assert restored_df["email"].equals(large_df["email"]), "All emails should be restored" # Spot check specific values assert restored_df.loc[0, "email"] == "user1@example.com" assert restored_df.loc[5000, "email"] == "user5001@example.com" assert restored_df.loc[9999, "email"] == "user10000@example.com" @pytest.mark.edge_case @pytest.mark.security def test_restore_after_key_rotation(sample_df, encrypt_config_single_field): """ AC4: Restoration fails after key rotation (key changed in Vault) Scenario: Key is rotated in Vault after encryption Given: Data encrypted with key version 1 And: Key is rotated to version 2 in Vault When: Participant attempts to restore using new key version Then: Restoration fails with clear error message """ clear_vault_key("test_restoration_key_single") # Encrypt with original key encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Rotate key (replace with new key) new_key = Fernet.generate_key().decode() set_vault_key("test_restoration_key_single", new_key) decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="test_restoration_key_single", ) ) ] ) # Should fail - key mismatch with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config, encrypted_df.copy()) assert ( "invalid" in str(exc_info.value).lower() or "decrypt" in str(exc_info.value).lower() ), "Should indicate invalid token due to key rotation" @pytest.mark.edge_case def test_restore_partially_encrypted_column(sample_df, encrypt_config_single_field): """ Edge case: Attempt to restore column where only some rows are encrypted Scenario: Column has mixed encrypted/plaintext values (data corruption scenario) """ clear_vault_key("test_restoration_key_single") # Encrypt data encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Corrupt by replacing some encrypted values with plaintext encrypted_df.loc[0, "email"] = "plaintext@example.com" encrypted_df.loc[2, "email"] = "another_plaintext@example.com" decrypt_config = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig( type="decrypt", columns=["email"], key_name="test_restoration_key_single", ) ) ] ) # Should fail on plaintext values with pytest.raises(ValueError) as exc_info: run_decrypt_op(decrypt_config, encrypted_df.copy()) assert ( "invalid" in str(exc_info.value).lower() or "decrypt" in str(exc_info.value).lower() ), "Should indicate invalid token for plaintext values" @pytest.mark.edge_case def test_restore_with_missing_column_in_encrypted_data( sample_df, encrypt_config_single_field, decrypt_config_single_field ): """ AC2: Restoration fails when specified column doesn't exist in encrypted dataset """ clear_vault_key("test_restoration_key_single") # First encrypt the sample data to create the key encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) # Create encrypted DataFrame missing the 'email' column incomplete_df = pd.DataFrame( { "id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], # Missing 'email' column that decrypt config expects "age": [25, 30, 35], "salary": [50000.0, 60000.0, 70000.0], "department": ["HR", "IT", "Finance"], } ) with pytest.raises((ValueError, KeyError)) as exc_info: run_decrypt_op(decrypt_config_single_field, incomplete_df) error_msg = str(exc_info.value) assert ( "email" in error_msg or "not present" in error_msg or "not found" in error_msg ), f"Error should indicate missing column, got: {error_msg}" @pytest.mark.integration def test_restore_with_multiple_encryption_keys(sample_df): """ Integration test: Restore data encrypted with multiple different keys Scenario: Different fields encrypted with different keys Given: name encrypted with key_a, email encrypted with key_b When: Participant provides both keys for restoration Then: Both fields are restored correctly """ clear_vault_key("key_a") clear_vault_key("key_b") # Encrypt name with key_a encrypt_config_name = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig(type="encrypt", columns=["name"], key_name="key_a") ) ] ) # Encrypt email with key_b encrypt_config_email = AnonymisePseudonymizeStructuredConfig( used_function=[ PseudoTechniqueConfig( technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key_b") ) ] ) # Encrypt both fields df_encrypted = sample_df.copy() df_encrypted, _ = run_encrypt_op(encrypt_config_name, df_encrypted) df_encrypted, _ = run_encrypt_op(encrypt_config_email, df_encrypted) # Decrypt name with key_a decrypt_config_name = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig(type="decrypt", columns=["name"], key_name="key_a") ) ] ) # Decrypt email with key_b decrypt_config_email = DepseudonymizeStructuredConfig( used_function=[ DepseudoTechniqueConfig( technique=DecryptConfig(type="decrypt", columns=["email"], key_name="key_b") ) ] ) # Restore both fields df_restored = df_encrypted.copy() df_restored, _ = run_decrypt_op(decrypt_config_name, df_restored) df_restored, _ = run_decrypt_op(decrypt_config_email, df_restored) # Verify both fields restored assert df_restored["name"].equals(sample_df["name"]), "Name field should be restored with key_a" assert df_restored["email"].equals( sample_df["email"] ), "Email field should be restored with key_b"