feat(SIMPL-24642): migrate tests from 3 source repos with updated imports

This commit is contained in:
ILay
2026-04-24 18:42:07 +02:00
parent 4e0b216410
commit d14b2dfac4
26 changed files with 6280 additions and 0 deletions

1
tests/__init__.py Normal file
View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,53 @@
"""Pytest configuration and shared fixtures."""
import pytest
import pandas as pd
from unittest.mock import MagicMock, patch
import sys
from dagster import build_op_context
# Mock external dependencies that might not be available in test environment.
# The entry is written before any test module is imported, so a later
# `import spellchecker` resolves to this MagicMock instead of a real package.
# NOTE(review): the assignment is unconditional, so the real library is shadowed
# even when it is installed - confirm that this is intended.
sys.modules['spellchecker'] = MagicMock()
@pytest.fixture
def mock_context():
    """Provide an op context created by Dagster's ``build_op_context`` with no arguments."""
    return build_op_context()
@pytest.fixture
def sample_dataframe():
    """Return a five-row frame with repeated rows, mixed casing and one missing age."""
    names = ['John Doe', 'jane smith', 'John Doe', 'bob johnson', 'John Doe']
    ages = [25, 30, 25, None, 25]
    cities = ['New York', 'los angeles', 'New York', 'chicago', 'New York']
    statuses = ['Active', 'INACTIVE', 'Active', 'penDing', 'Active']
    columns = {'Name': names, 'Age': ages, 'City': cities, 'Status': statuses}
    return pd.DataFrame(columns)
@pytest.fixture
def sample_dataframe_with_typos():
    """Return a three-row frame whose text columns contain misspelled words."""
    misspelled_names = ['jon doe', 'jane smith', 'bob jonson']
    misspelled_roles = ['developer', 'analst', 'enginer']
    columns = {'Name': misspelled_names, 'Description': misspelled_roles}
    return pd.DataFrame(columns)
@pytest.fixture
def empty_dataframe():
    """Return a DataFrame that has neither rows nor columns."""
    frame = pd.DataFrame()
    return frame
@pytest.fixture
def dataframe_with_missing_values():
    """Return a frame with a partly-missing numeric column, a partly-missing text column and an all-missing column."""
    partly_numeric = [1, None, 3, None, 5]
    partly_text = ['a', 'b', None, 'd', None]
    all_missing = [None, None, None, None, None]
    columns = {
        'Column1': partly_numeric,
        'Column2': partly_text,
        'Column3': all_missing,
    }
    return pd.DataFrame(columns)

View File

@@ -0,0 +1,7 @@
"""Configuration utilities for testing."""
import os
import sys
# Put the sibling ``src`` directory (one level above this file's directory) at the
# front of the import path so that imports resolve against it first.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(__file__), os.pardir, 'src'),
)

View File

@@ -0,0 +1,202 @@
"""Unit tests for configuration models."""
import pytest
from pydantic import ValidationError
from template_code_location.data_processing.config_models import (
FillMissingConfiguration,
ColumnsSelectConfiguration,
SpellCheckConfiguration,
AggregationConfiguration
)
class TestColumnsSelectConfiguration:
    """Checks on the ``columns`` field of ColumnsSelectConfiguration."""

    def test_default_columns(self):
        """Without arguments the model selects only the 'Name' column."""
        assert ColumnsSelectConfiguration().columns == ['Name']

    def test_custom_columns(self):
        """An explicit list of three columns is stored unchanged."""
        cfg = ColumnsSelectConfiguration(columns=['Col1', 'Col2', 'Col3'])
        assert cfg.columns == ['Col1', 'Col2', 'Col3']

    def test_empty_columns_list(self):
        """An empty list is accepted and kept empty."""
        cfg = ColumnsSelectConfiguration(columns=[])
        assert cfg.columns == []

    def test_single_column(self):
        """A one-element list is stored unchanged."""
        cfg = ColumnsSelectConfiguration(columns=['SingleCol'])
        assert cfg.columns == ['SingleCol']

    def test_columns_with_special_characters(self):
        """Names containing '-', '_' and '.' are stored unchanged."""
        cfg = ColumnsSelectConfiguration(columns=['Col-1', 'Col_2', 'Col.3'])
        assert cfg.columns == ['Col-1', 'Col_2', 'Col.3']

    def test_duplicate_columns_are_removed(self):
        """Duplicates are dropped while the first-seen order is kept."""
        cfg = ColumnsSelectConfiguration(columns=['A', 'B', 'A', 'C', 'B'])
        assert cfg.columns == ['A', 'B', 'C']

    def test_duplicate_default_behavior(self):
        """A list made only of one repeated name collapses to a single entry."""
        cfg = ColumnsSelectConfiguration(columns=['Name', 'Name', 'Name'])
        assert cfg.columns == ['Name']
class TestFillMissingConfiguration:
    """Checks on the ``fill_map`` field of FillMissingConfiguration."""

    def test_default_fill_map(self):
        """Without arguments the map fills 'Age' with 'UNKNOWN_AGE'."""
        assert FillMissingConfiguration().fill_map == {'Age': 'UNKNOWN_AGE'}

    def test_custom_fill_map(self):
        """A map with several columns is stored as given."""
        replacements = {'Age': '0', 'Name': 'UNKNOWN', 'City': 'N/A'}
        cfg = FillMissingConfiguration(fill_map=replacements)
        assert cfg.fill_map == replacements

    def test_empty_fill_map(self):
        """An empty map is accepted and kept empty."""
        cfg = FillMissingConfiguration(fill_map={})
        assert cfg.fill_map == {}

    def test_fill_map_with_numeric_values(self):
        """Replacement values that are numeric strings are stored as given."""
        replacements = {'Age': '0', 'Score': '-1', 'Count': '999'}
        cfg = FillMissingConfiguration(fill_map=replacements)
        assert cfg.fill_map == replacements

    def test_fill_map_with_string_values(self):
        """Plain text replacement values are stored as given."""
        replacements = {'Name': 'Unknown', 'Email': 'no-email'}
        cfg = FillMissingConfiguration(fill_map=replacements)
        assert cfg.fill_map == replacements

    def test_fill_map_mixed_types(self):
        """Strings that look like an int, a word and a float coexist in one map."""
        replacements = {'IntCol': '0', 'StrCol': 'Unknown', 'FloatCol': '0.0'}
        cfg = FillMissingConfiguration(fill_map=replacements)
        assert cfg.fill_map == replacements
class TestSpellCheckConfiguration:
    """Checks on the ``columns`` and ``language`` fields of SpellCheckConfiguration."""

    def test_default_spell_check_config(self):
        """Defaults are the 'Name' column and the 'en' language."""
        cfg = SpellCheckConfiguration()
        assert cfg.columns == ['Name']
        assert cfg.language == 'en'

    def test_custom_spell_check_config(self):
        """Explicit columns and language are both stored."""
        cfg = SpellCheckConfiguration(columns=['Description', 'Notes'], language='es')
        assert cfg.columns == ['Description', 'Notes']
        assert cfg.language == 'es'

    def test_spell_check_all_languages(self):
        """Each language code in the list below is accepted."""
        for code in ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl']:
            cfg = SpellCheckConfiguration(language=code)
            assert cfg.language == code

    def test_spell_check_invalid_language(self):
        """An unknown language code is rejected with a ValidationError."""
        with pytest.raises(ValidationError):
            SpellCheckConfiguration(language='invalid')

    def test_spell_check_multiple_columns(self):
        """A list of four columns is stored as given."""
        wanted = ['Col1', 'Col2', 'Col3', 'Col4']
        cfg = SpellCheckConfiguration(columns=wanted)
        assert cfg.columns == wanted

    def test_spell_check_empty_columns(self):
        """An empty column list is accepted; the language stays at 'en'."""
        cfg = SpellCheckConfiguration(columns=[])
        assert cfg.columns == []
        assert cfg.language == 'en'

    def test_spell_check_inheritance(self):
        """Instances are ColumnsSelectConfiguration objects exposing both fields."""
        cfg = SpellCheckConfiguration()
        assert isinstance(cfg, ColumnsSelectConfiguration)
        assert hasattr(cfg, 'columns')
        assert hasattr(cfg, 'language')

    @pytest.mark.parametrize("language", ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl'])
    def test_spell_check_languages_parametrized(self, language):
        """One test case per language code: the given code is stored."""
        cfg = SpellCheckConfiguration(language=language)
        assert cfg.language == language
class TestAggregationConfiguration:
    """Checks on the ``columns`` and ``operation`` fields of AggregationConfiguration."""

    def test_aggregation_default_config(self):
        """Defaults are the 'Name' column and the 'sum' operation."""
        cfg = AggregationConfiguration()
        assert cfg.columns == ['Name']
        assert cfg.operation == 'sum'

    @pytest.mark.parametrize("op", ["sum", "mean", "min", "max", "count"])
    def test_aggregation_valid_operations(self, op):
        """Each listed operation name is accepted and stored."""
        cfg = AggregationConfiguration(operation=op)
        assert cfg.operation == op

    def test_aggregation_invalid_operation(self):
        """An unknown operation raises a ValidationError that names the bad value."""
        with pytest.raises(ValidationError) as excinfo:
            AggregationConfiguration(operation="invalid_op")
        # The message check sits after the ``with`` block: inside it, the line would
        # never run because the constructor call above raises first.
        message = str(excinfo.value)
        assert "Invalid aggregation operation 'invalid_op'" in message

    def test_aggregation_custom_columns(self):
        """Custom columns and a non-default operation are both stored."""
        cfg = AggregationConfiguration(columns=['Price', 'Quantity'], operation='mean')
        assert cfg.columns == ['Price', 'Quantity']
        assert cfg.operation == 'mean'

    def test_aggregation_inheritance(self):
        """Instances are ColumnsSelectConfiguration objects exposing both fields."""
        cfg = AggregationConfiguration()
        assert isinstance(cfg, ColumnsSelectConfiguration)
        assert hasattr(cfg, 'columns')
        assert hasattr(cfg, 'operation')

    def test_aggregation_model_dump(self):
        """``model_dump`` exposes the 'columns' and 'operation' entries."""
        dumped = AggregationConfiguration(columns=['Value'], operation='max').model_dump()
        assert dumped['columns'] == ['Value']
        assert dumped['operation'] == 'max'

View File

@@ -0,0 +1,185 @@
"""Integration tests for data processing jobs."""
import pytest
import pandas as pd
from unittest.mock import patch, MagicMock
from template_code_location.data_processing.ops import (
remove_duplicates,
fill_missing_values,
standardize_categorical_values,
correct_typos
)
from template_code_location.data_processing.config_models import (
FillMissingConfiguration,
ColumnsSelectConfiguration,
SpellCheckConfiguration
)
class TestPipelineIntegration:
    """Integration checks that chain several data processing ops on one frame."""

    def test_pipeline_remove_duplicates_then_standardize(self, mock_context):
        """Deduplicate first, then standardize the 'Name' and 'City' columns."""
        raw = pd.DataFrame({
            'Name': [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson'],
            'City': ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago'],
        })

        # Stage 1: the repeated JOHN DOE / NEW YORK row leaves three rows.
        deduped = remove_duplicates(mock_context, raw)
        assert deduped.shape[0] == 3

        # Stage 2: standardize both text columns.
        cfg = ColumnsSelectConfiguration(columns=['Name', 'City'])
        standardized = standardize_categorical_values(mock_context, cfg, deduped)
        assert standardized['Name'].iloc[0] == 'john doe'
        assert standardized['City'].iloc[0] == 'new york'

    def test_pipeline_fill_missing_then_standardize(self, mock_context):
        """Fill the gaps in 'Value', then standardize 'Category'."""
        raw = pd.DataFrame({
            'Category': [' ACTIVE ', None, ' PENDING '],
            'Value': ['1', '2', None],
        })

        # Stage 1: replace the missing 'Value' entry.
        fill_cfg = FillMissingConfiguration(fill_map={'Value': '0'})
        filled = fill_missing_values(mock_context, fill_cfg, raw)

        # Stage 2: standardize the category labels.
        std_cfg = ColumnsSelectConfiguration(columns=['Category'])
        standardized = standardize_categorical_values(mock_context, std_cfg, filled)

        assert standardized['Category'].iloc[0] == 'active'
        assert filled['Value'].iloc[2] == '0'

    def test_pipeline_all_operations(self, mock_context):
        """Run deduplication, gap filling and standardization back to back."""
        frame = pd.DataFrame({
            'Name': [' john doe ', 'JANE SMITH', ' john doe ', None],
            'Value': ['1', None, '1', '2'],
        })

        # Stage 1: the first and third rows are identical.
        frame = remove_duplicates(mock_context, frame)
        assert frame.shape[0] == 3

        # Stage 2: no missing 'Value' entries may remain.
        fill_cfg = FillMissingConfiguration(fill_map={'Value': '0'})
        frame = fill_missing_values(mock_context, fill_cfg, frame)
        assert frame['Value'].isna().sum() == 0

        # Stage 3: standardize the names.
        std_cfg = ColumnsSelectConfiguration(columns=['Name'])
        frame = standardize_categorical_values(mock_context, std_cfg, frame)
        assert frame['Name'].iloc[0] == 'john doe'

    def test_pipeline_with_large_dataset(self, mock_context):
        """Deduplicate 1100 rows of which the last 100 repeat the first 100."""
        size = 1000
        base = pd.DataFrame({
            'ID': list(range(size)),
            'Name': ['User_' + str(i % 50) for i in range(size)],
            'Status': ['ACTIVE', 'INACTIVE', 'PENDING'] * (size // 3) + ['ACTIVE'] * (size % 3),
            'Score': [i % 100 for i in range(size)],
        })

        # 'ID' is unique per row, so the only duplicates are the appended head rows.
        with_repeats = pd.concat([base, base.head(100)], ignore_index=True)

        cleaned = remove_duplicates(mock_context, with_repeats)
        row_count, column_count = cleaned.shape[0], cleaned.shape[1]
        assert row_count == 1000
        assert column_count == 4
class TestErrorHandling:
    """Edge-case checks: special floats, custom indexes, unicode and multi-column fills."""

    def test_operation_with_corrupted_data(self, mock_context):
        """Deduplication returns at least one row for a column holding NaN and infinities."""
        frame = pd.DataFrame({
            'Col': [float('nan'), float('inf'), -float('inf'), 0, 1, 2],
        })
        out = remove_duplicates(mock_context, frame)
        assert out.shape[0] > 0

    def test_operation_preserves_index(self, mock_context):
        """Deduplicating a frame with a string index leaves three rows."""
        frame = pd.DataFrame({'Col': [1, 2, 1, 3]}, index=['a', 'b', 'c', 'd'])
        out = remove_duplicates(mock_context, frame)
        # Only the row count is checked; the resulting index is not asserted on.
        assert out.shape[0] == 3

    def test_standardize_with_unicode_characters(self, mock_context):
        """Standardizing accented names keeps all three rows."""
        frame = pd.DataFrame({'Name': ['José', 'François', 'Müller']})
        cfg = ColumnsSelectConfiguration(columns=['Name'])
        out = standardize_categorical_values(mock_context, cfg, frame)
        assert out.shape[0] == 3

    def test_fill_with_same_key_multiple_times(self, mock_context):
        """Each column listed in the fill map receives its own replacement value."""
        frame = pd.DataFrame({
            'A': ['1', None, '3'],
            'B': [None, None, 'c'],
            'C': [None, '2', None],
        })
        replacements = {'A': '-1', 'B': 'EMPTY', 'C': '0'}
        cfg = FillMissingConfiguration(fill_map=replacements)
        out = fill_missing_values(mock_context, cfg, frame)
        assert out.loc[1, 'A'] == '-1'
        assert out.loc[0, 'B'] == 'EMPTY'
        assert out.loc[0, 'C'] == '0'
class TestDataTypePreservation:
    """Checks that ops leave column dtypes as they were in the input."""

    def test_remove_duplicates_preserves_dtypes(self, mock_context):
        """The int32 and float64 columns keep their dtype after deduplication."""
        frame = pd.DataFrame({
            'int32': pd.array([1, 2, 1], dtype='int32'),
            'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'),
            'str': ['a', 'b', 'a'],
        })
        out = remove_duplicates(mock_context, frame)
        assert out['int32'].dtype == frame['int32'].dtype
        assert out['float64'].dtype == frame['float64'].dtype

    def test_fill_missing_preserves_column_types_where_possible(self, mock_context):
        """Filling 'A' replaces its gap and leaves the dtype of 'B' unchanged."""
        frame = pd.DataFrame({
            'A': pd.array(['1', None, '3'], dtype='string'),
            'B': ['x', 'y', 'z'],
        })
        cfg = FillMissingConfiguration(fill_map={'A': '0'})
        out = fill_missing_values(mock_context, cfg, frame)
        assert out['A'].loc[1] == '0'
        assert out['B'].dtype == frame['B'].dtype

View File

@@ -0,0 +1,56 @@
from template_code_location.data_processing.jobs import (
remove_duplicates_job_s3,
fill_missing_values_job_s3,
standardize_categorical_values_job_s3,
correct_typos_job_s3,
normalize_numeric_min_max_job_s3,
normalize_datetime_job_s3,
normalize_coordinates_job_s3,
add_global_aggregations_job_s3
)
def test_remove_duplicates_job_s3_is_callable():
    """remove_duplicates_job_s3 is callable and exposes ``execute_in_process``."""
    job = remove_duplicates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_fill_missing_values_job_s3_is_callable():
    """fill_missing_values_job_s3 is callable and exposes ``execute_in_process``."""
    job = fill_missing_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_standardize_categorical_values_job_s3_is_callable():
    """standardize_categorical_values_job_s3 is callable and exposes ``execute_in_process``."""
    job = standardize_categorical_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_correct_typos_job_s3_is_callable():
    """correct_typos_job_s3 is callable and exposes ``execute_in_process``."""
    job = correct_typos_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_normalize_numeric_min_max_job_s3_is_callable():
    """normalize_numeric_min_max_job_s3 is callable and exposes ``execute_in_process``."""
    job = normalize_numeric_min_max_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_normalize_datetime_job_s3_is_callable():
    """normalize_datetime_job_s3 is callable and exposes ``execute_in_process``."""
    job = normalize_datetime_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_normalize_coordinates_job_s3_is_callable():
    """normalize_coordinates_job_s3 is callable and exposes ``execute_in_process``."""
    job = normalize_coordinates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_add_global_aggregations_job_s3_is_callable():
    """add_global_aggregations_job_s3 is callable and exposes ``execute_in_process``."""
    job = add_global_aggregations_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')

View File

@@ -0,0 +1,700 @@
"""Unit tests for data processing operations."""
import pytest
import pandas as pd
from template_code_location.data_processing.ops import (
remove_duplicates,
fill_missing_values,
standardize_categorical_values,
correct_typos,
normalize_datetime,
normalize_numeric_min_max,
normalize_coordinates,
add_global_aggregations
)
from template_code_location.data_processing.config_models import (
FillMissingConfiguration,
ColumnsSelectConfiguration,
SpellCheckConfiguration,
AggregationConfiguration,
CoordinatesNormalizationConfiguration
)
class TestRemoveDuplicates:
    """Tests for the remove_duplicates operation.

    Every test calls the op as ``remove_duplicates(context, frame)`` and inspects
    the returned frame.
    """

    def test_remove_duplicates_basic(self, mock_context, sample_dataframe):
        """Repeated rows in the shared sample frame collapse to three unique rows."""
        result = remove_duplicates(mock_context, sample_dataframe)
        # Should have 3 unique rows (john doe appears 3x, jane smith 1x, bob johnson 1x)
        assert result.shape[0] == 3
        assert len(result) < len(sample_dataframe)

    def test_remove_duplicates_no_duplicates(self, mock_context):
        """A frame without repeated rows comes back equal to the input."""
        df = pd.DataFrame({
            'A': [1, 2, 3],
            'B': ['x', 'y', 'z'],
        })
        # Keep an untouched copy as the expectation: comparing the result with the
        # very object handed to the op would pass trivially if the op modified its
        # argument in place and returned it.
        expected = df.copy()
        result = remove_duplicates(mock_context, df)
        assert result.shape[0] == 3
        pd.testing.assert_frame_equal(result, expected)

    def test_remove_duplicates_all_duplicates(self, mock_context):
        """Three identical rows reduce to a single row."""
        df = pd.DataFrame({
            'A': [1, 1, 1],
            'B': ['x', 'x', 'x'],
        })
        result = remove_duplicates(mock_context, df)
        assert result.shape[0] == 1

    def test_remove_duplicates_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields a result with no rows and no columns."""
        result = remove_duplicates(mock_context, empty_dataframe)
        assert result.shape[0] == 0
        assert result.shape[1] == 0

    def test_remove_duplicates_preserves_data_types(self, mock_context):
        """Integer, string and float columns keep their dtype after deduplication."""
        df = pd.DataFrame({
            'int_col': [1, 2, 1],
            'str_col': ['a', 'b', 'a'],
            'float_col': [1.5, 2.5, 1.5],
        })
        result = remove_duplicates(mock_context, df)
        assert result['int_col'].dtype == df['int_col'].dtype
        assert result['str_col'].dtype == df['str_col'].dtype
        assert result['float_col'].dtype == df['float_col'].dtype
class TestFillMissingValues:
    """Tests for the fill_missing_values operation.

    Every test calls the op as ``fill_missing_values(context, config, frame)``
    with a FillMissingConfiguration that maps column names to replacement strings.
    """

    def test_fill_missing_values_basic(self, mock_context, dataframe_with_missing_values):
        """Columns named in the fill map contain no missing entries afterwards."""
        config = FillMissingConfiguration(fill_map={'Column1': '0', 'Column2': 'N/A'})
        result = fill_missing_values(mock_context, config, dataframe_with_missing_values)
        # Check that no NaN values remain
        assert result['Column1'].isna().sum() == 0
        assert result['Column2'].isna().sum() == 0

    def test_fill_missing_values_with_different_values(self, mock_context):
        """Each column receives the replacement configured for it."""
        df = pd.DataFrame({
            'A': [1, None, 3],
            'B': [None, 'b', 'c'],
        })
        config = FillMissingConfiguration(fill_map={'A': '-1', 'B': 'UNKNOWN'})
        result = fill_missing_values(mock_context, config, df)
        assert result.loc[1, 'A'] == '-1'
        assert result.loc[0, 'B'] == 'UNKNOWN'

    def test_fill_missing_values_partial_columns(self, mock_context):
        """Only columns present in the fill map are touched."""
        df = pd.DataFrame({
            'A': [1, None, 3],
            'B': [None, 'b', 'c'],
        })
        config = FillMissingConfiguration(fill_map={'A': '999'})
        result = fill_missing_values(mock_context, config, df)
        assert result.loc[1, 'A'] == '999'
        assert pd.isna(result.loc[0, 'B'])  # B should still have NaN

    def test_fill_missing_values_no_missing(self, mock_context):
        """A frame without gaps comes back equal to the input."""
        df = pd.DataFrame({
            'A': ['1', '2', '3'],
            'B': ['a', 'b', 'c'],
        })
        # Keep an untouched copy as the expectation: comparing the result with the
        # very object handed to the op would pass trivially if the op modified its
        # argument in place and returned it.
        expected = df.copy()
        config = FillMissingConfiguration(fill_map={'A': '0'})
        result = fill_missing_values(mock_context, config, df)
        pd.testing.assert_frame_equal(result, expected)

    def test_fill_missing_values_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame with an empty fill map yields a result with no rows."""
        config = FillMissingConfiguration(fill_map={})
        result = fill_missing_values(mock_context, config, empty_dataframe)
        assert result.shape[0] == 0
class TestStandardizeCategoricalValues:
    """Checks for the standardize_categorical_values op (lower-cased, stripped text)."""

    def test_standardize_categorical_basic(self, mock_context, sample_dataframe):
        """Values in the three text columns of the sample frame come back lower-cased."""
        cfg = ColumnsSelectConfiguration(columns=['Name', 'City', 'Status'])
        out = standardize_categorical_values(mock_context, cfg, sample_dataframe)
        assert out['Name'].iloc[0] == 'john doe'
        assert out['City'].iloc[1] == 'los angeles'
        assert out['Status'].iloc[1] == 'inactive'

    def test_standardize_categorical_single_column(self, mock_context):
        """Padding and upper case are both removed from a single column."""
        frame = pd.DataFrame({'City': [' NEW YORK ', 'LOS ANGELES', ' chicago ']})
        cfg = ColumnsSelectConfiguration(columns=['City'])
        cities = standardize_categorical_values(mock_context, cfg, frame)['City']
        assert cities.iloc[0] == 'new york'
        assert cities.iloc[1] == 'los angeles'
        assert cities.iloc[2] == 'chicago'

    def test_standardize_categorical_missing_column(self, mock_context, sample_dataframe):
        """A configured column absent from the frame does not stop 'Name' being processed."""
        cfg = ColumnsSelectConfiguration(columns=['NonExistent', 'Name'])
        out = standardize_categorical_values(mock_context, cfg, sample_dataframe)
        assert out['Name'].iloc[0] == 'john doe'

    def test_standardize_categorical_with_missing_values(self, mock_context):
        """A missing entry becomes an empty string; its neighbours are standardized."""
        frame = pd.DataFrame({'Category': [' ACTIVE ', None, ' pending ']})
        cfg = ColumnsSelectConfiguration(columns=['Category'])
        categories = standardize_categorical_values(mock_context, cfg, frame)['Category']
        assert categories.iloc[0] == 'active'
        assert categories.iloc[1] == ''
        assert categories.iloc[2] == 'pending'

    def test_standardize_categorical_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields a result with no rows."""
        cfg = ColumnsSelectConfiguration(columns=['A', 'B'])
        out = standardize_categorical_values(mock_context, cfg, empty_dataframe)
        assert out.shape[0] == 0

    def test_standardize_categorical_numeric_columns(self, mock_context):
        """Values of a numeric column come back as strings."""
        frame = pd.DataFrame({'NumCol': [1, 2, 3]})
        cfg = ColumnsSelectConfiguration(columns=['NumCol'])
        out = standardize_categorical_values(mock_context, cfg, frame)
        first_value = out['NumCol'].iloc[0]
        assert first_value == '1'
        assert isinstance(first_value, str)
class TestCorrectTypos:
    """Tests for the correct_typos operation.

    Every test calls the op as ``correct_typos(context, config, frame)`` with a
    SpellCheckConfiguration naming the columns and the language. Where a frame is
    reused or compared with the result, the op receives a copy so that in-place
    edits cannot leak between calls or hide behind an aliased comparison.
    """

    def test_correct_typos_basic(self, mock_context):
        """Correcting a column of short names keeps all three rows."""
        df = pd.DataFrame({
            'Name': ['jon', 'jayne', 'bob'],
        })
        config = SpellCheckConfiguration(columns=['Name'], language='en')
        result = correct_typos(mock_context, config, df)
        # Result should have corrections applied
        assert result.shape[0] == 3

    def test_correct_typos_missing_column(self, mock_context):
        """A configured column that is absent is skipped and the frame is unchanged."""
        df = pd.DataFrame({
            'Name': ['jon', 'jayne'],
        })
        config = SpellCheckConfiguration(columns=['NonExistent'], language='en')
        # Hand the op a copy so the comparison below is made against data the op
        # never had access to.
        result = correct_typos(mock_context, config, df.copy())
        # Should not raise error, just skip
        pd.testing.assert_frame_equal(result, df)

    def test_correct_typos_with_missing_values(self, mock_context):
        """An empty string in the corrected column stays an empty string."""
        df = pd.DataFrame({
            'Text': ['helo', '', 'wrld'],
        })
        config = SpellCheckConfiguration(columns=['Text'], language='en')
        result = correct_typos(mock_context, config, df)
        # Empty strings should be preserved
        assert result.loc[1, 'Text'] == ''

    def test_correct_typos_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields a result with no rows."""
        config = SpellCheckConfiguration(columns=['A'], language='en')
        result = correct_typos(mock_context, config, empty_dataframe)
        assert result.shape[0] == 0

    def test_correct_typos_different_languages(self, mock_context):
        """The op runs for each of 'en', 'es' and 'it' and keeps both rows."""
        df = pd.DataFrame({
            'Text': ['ciao', 'mondo'],
        })
        for lang in ['en', 'es', 'it']:
            config = SpellCheckConfiguration(columns=['Text'], language=lang)
            # Each language starts from the same pristine text instead of the
            # output of the previous iteration.
            result = correct_typos(mock_context, config, df.copy())
            # Should process without error
            assert result.shape[0] == 2

    def test_correct_typos_numeric_values(self, mock_context):
        """Correcting a numeric column keeps all three rows."""
        df = pd.DataFrame({
            'Values': [123, 456, 789],
        })
        config = SpellCheckConfiguration(columns=['Values'], language='en')
        result = correct_typos(mock_context, config, df)
        # Only the row count is asserted; the corrected values are not inspected.
        assert result.shape[0] == 3
class TestNormalizeDatetime:
    """Checks for the normalize_datetime op, which adds ``<column>_iso`` columns."""

    def test_normalize_datetime_basic(self, mock_context):
        """Space- and 'T'-separated timestamps both become ISO strings ending in 'Z'."""
        frame = pd.DataFrame({
            'date_col': ['2023-01-01 10:00:00', '2023-12-31T23:59:59'],
        })
        cfg = ColumnsSelectConfiguration(columns=['date_col'])
        out = normalize_datetime(mock_context, cfg, frame.copy())
        assert 'date_col_iso' in out.columns
        iso = out['date_col_iso']
        assert iso.iloc[0] == '2023-01-01T10:00:00Z'
        assert iso.iloc[1] == '2023-12-31T23:59:59Z'

    def test_normalize_datetime_missing_column(self, mock_context, sample_dataframe):
        """A configured column absent from the frame leaves the frame unchanged."""
        cfg = ColumnsSelectConfiguration(columns=['non_existent_column'])
        out = normalize_datetime(mock_context, cfg, sample_dataframe.copy())
        pd.testing.assert_frame_equal(out, sample_dataframe)

    def test_normalize_datetime_unparseable_values(self, mock_context):
        """A column with no parseable dates gets no ``_iso`` companion."""
        frame = pd.DataFrame({
            'invalid_col': ['not-a-date', 'completely-random-text'],
        })
        cfg = ColumnsSelectConfiguration(columns=['invalid_col'])
        out = normalize_datetime(mock_context, cfg, frame.copy())
        assert 'invalid_col_iso' not in out.columns

    def test_normalize_datetime_mixed_and_nulls(self, mock_context):
        """Valid dates are converted; missing and invalid entries become empty strings."""
        frame = pd.DataFrame({
            'mixed_col': ['2023-05-01', None, 'invalid-date'],
        })
        cfg = ColumnsSelectConfiguration(columns=['mixed_col'])
        out = normalize_datetime(mock_context, cfg, frame.copy())
        assert 'mixed_col_iso' in out.columns
        iso = out['mixed_col_iso']
        assert iso.iloc[0] == '2023-05-01T00:00:00Z'
        assert iso.iloc[1] == ""
        assert iso.iloc[2] == ""

    def test_normalize_datetime_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields an empty result."""
        cfg = ColumnsSelectConfiguration(columns=['some_col'])
        out = normalize_datetime(mock_context, cfg, empty_dataframe)
        assert out.empty

    def test_normalize_datetime_epoch_only(self, mock_context, capsys):
        """Zero-like values produce no ``_iso`` column and a warning on stderr."""
        frame = pd.DataFrame({
            'weird_col': ['0', 0, '0000', ''],
        })
        cfg = ColumnsSelectConfiguration(columns=['weird_col'])
        out = normalize_datetime(mock_context, cfg, frame.copy())
        assert 'weird_col_iso' not in out.columns
        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text

    def test_normalize_datetime_all_1970_skipped(self, mock_context, capsys):
        """Timestamps that all fall on 1970-01-01 produce no ``_iso`` column and a warning."""
        frame = pd.DataFrame({
            'ts_col': ['1970-01-01 05:30:00', '1970-01-01 12:00:00'],
        })
        cfg = ColumnsSelectConfiguration(columns=['ts_col'])
        out = normalize_datetime(mock_context, cfg, frame.copy())
        assert 'ts_col_iso' not in out.columns
        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text

    def test_normalize_datetime_integer_age_column_skipped(self, mock_context, capsys):
        """A column of small integers produces no ``_iso`` column and the same warning."""
        frame = pd.DataFrame({
            'age': [66, 45, 40, 43, 20, 26, 69, 21, 46],
        })
        cfg = ColumnsSelectConfiguration(columns=['age'])
        out = normalize_datetime(mock_context, cfg, frame.copy())
        assert 'age_iso' not in out.columns
        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text
class TestNormalizeNumericMinMax:
    """Checks for the normalize_numeric_min_max op, which adds ``<column>_norm`` columns."""

    def test_normalize_numeric_basic(self, mock_context):
        """Evenly spaced scores map onto 0.0 .. 1.0 with the middle value at 0.5."""
        frame = pd.DataFrame({'score': [10, 20, 30, 40, 50]})
        cfg = ColumnsSelectConfiguration(columns=['score'])
        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())
        assert 'score_norm' in out.columns
        scaled = out['score_norm']
        assert scaled.min() == 0.0
        assert scaled.max() == 1.0
        assert scaled.iloc[2] == 0.5

    def test_normalize_numeric_missing_column(self, mock_context):
        """A configured column absent from the frame adds no ``_norm`` column."""
        frame = pd.DataFrame({'existing': [1, 2, 3]})
        cfg = ColumnsSelectConfiguration(columns=['missing_col'])
        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())
        assert 'missing_col_norm' not in out.columns

    def test_normalize_numeric_constant_values(self, mock_context):
        """A column whose minimum equals its maximum adds no ``_norm`` column."""
        frame = pd.DataFrame({'constant': [10, 10, 10]})
        cfg = ColumnsSelectConfiguration(columns=['constant'])
        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())
        assert 'constant_norm' not in out.columns

    def test_normalize_numeric_with_nans(self, mock_context):
        """Present values are scaled to 0.0 and 1.0; the missing one stays missing."""
        frame = pd.DataFrame({'with_nans': [10, None, 50]})
        cfg = ColumnsSelectConfiguration(columns=['with_nans'])
        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())
        assert 'with_nans_norm' in out.columns
        scaled = out['with_nans_norm']
        assert scaled.iloc[0] == 0.0
        assert scaled.iloc[2] == 1.0
        assert pd.isna(scaled.iloc[1])

    def test_normalize_numeric_multiple_columns(self, mock_context):
        """Two configured columns each receive their own ``_norm`` column."""
        frame = pd.DataFrame({'A': [1, 2], 'B': [10, 20]})
        cfg = ColumnsSelectConfiguration(columns=['A', 'B'])
        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())
        assert 'A_norm' in out.columns
        assert 'B_norm' in out.columns
class TestNormalizeCoordinates:
"""Tests for the normalize_coordinates operation."""
def test_normalize_coordinates_basic(self, mock_context):
    """In-range coordinates are kept and rounded to four decimal places."""
    frame = pd.DataFrame({
        'lat': [45.123456, 46.0],
        'lon': [9.123456, 10.0],
    })
    cfg = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
    out = normalize_coordinates(mock_context, cfg, frame.copy())
    first_row = out.iloc[0]
    assert first_row['lat'] == 45.1235
    assert first_row['lon'] == 9.1235
    assert len(out) == 2
def test_normalize_coordinates_filtering(self, mock_context):
    """Rows with an out-of-range latitude or longitude are dropped."""
    frame = pd.DataFrame({
        'lat': [45.0, 100.0, -91.0, 0.0],  # 100 and -91 are out of range
        'lon': [9.0, 0.0, 0.0, 200.0],  # 200 is out of range
    })
    cfg = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
    out = normalize_coordinates(mock_context, cfg, frame.copy())
    # Only the first row has both values in range.
    assert len(out) == 1
    assert out['lat'].iloc[0] == 45.0
def test_normalize_coordinates_invalid_types(self, mock_context):
    """Numeric strings are converted to floats; unparseable or missing rows are dropped."""
    frame = pd.DataFrame({
        'lat': ["45.5", "invalid", None],
        'lon': ["9.5", "10.0", "11.0"],
    })
    cfg = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
    out = normalize_coordinates(mock_context, cfg, frame.copy())
    assert len(out) == 1
    surviving_lat = out['lat'].iloc[0]
    assert isinstance(surviving_lat, float)
def test_normalize_coordinates_empty_df(self, mock_context, empty_dataframe):
    """A frame with 'lat'/'lon' columns but no rows yields an empty result."""
    # The frame is built locally because the op is given both coordinate columns.
    no_rows = pd.DataFrame(columns=['lat', 'lon'])
    cfg = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
    out = normalize_coordinates(mock_context, cfg, no_rows)
    assert len(out) == 0
    assert out.empty
def test_normalize_coordinates_default_config(self, mock_context):
"""Test that normalize_coordinates uses default 'lat'/'lon' columns when no config is provided."""
df = pd.DataFrame({
'lat': [45.123456, 46.0],
'lon': [9.123456, 10.0]
})
config = CoordinatesNormalizationConfiguration()
result = normalize_coordinates(mock_context, config, df.copy())
assert result['lat'].iloc[0] == 45.1235
assert result['lon'].iloc[0] == 9.1235
assert len(result) == 2
def test_normalize_coordinates_null_config_values(self, mock_context):
"""Test that null lat/lon column names fall back to defaults ('lat'/'lon')."""
df = pd.DataFrame({
'lat': [45.123456, 46.0],
'lon': [9.123456, 10.0]
})
config = CoordinatesNormalizationConfiguration(latColumn=None, lonColumn=None)
assert config.latColumn == "lat"
assert config.lonColumn == "lon"
result = normalize_coordinates(mock_context, config, df.copy())
assert result['lat'].iloc[0] == 45.1235
assert result['lon'].iloc[0] == 9.1235
assert len(result) == 2
def test_normalize_coordinates_dms_degree_symbol(self, mock_context):
"""Test DMS parsing with degree/minute/second symbols like 40°26'46\"N."""
df = pd.DataFrame({
'lat': ["40°26'46\"N", "51°30'26\"N"],
'lon': ["79°58'56\"W", "0°7'39\"W"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 2
# 40°26'46"N ≈ 40.4461
assert abs(result['lat'].iloc[0] - 40.4461) < 0.001
# 79°58'56"W ≈ -79.9822
assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001
def test_normalize_coordinates_dms_spaced_format(self, mock_context):
"""Test DMS parsing with space-separated format like '40 26 46 N'."""
df = pd.DataFrame({
'lat': ["40 26 46 N"],
'lon': ["79 58 56 W"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 1
assert abs(result['lat'].iloc[0] - 40.4461) < 0.001
assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001
def test_normalize_coordinates_dms_already_decimal(self, mock_context):
"""Test that string columns with decimal values are auto-parsed correctly."""
df = pd.DataFrame({
'lat': ["45.5", "46.0"],
'lon': ["9.5", "10.0"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 2
assert result['lat'].iloc[0] == 45.5
assert result['lon'].iloc[0] == 9.5
def test_normalize_coordinates_dms_mixed_valid_invalid(self, mock_context):
"""Test auto-detection with a mix of valid DMS, valid decimal, and unparseable values."""
df = pd.DataFrame({
'lat': ["40°26'46\"N", "not_a_coord", "51.5"],
'lon': ["79°58'56\"W", "10.0", "0.1"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
# Row with "not_a_coord" for lat should be dropped (NaN lat)
assert len(result) == 2
def test_normalize_coordinates_dms_out_of_range(self, mock_context):
"""Test that DMS-parsed coordinates outside valid range are filtered out."""
df = pd.DataFrame({
'lat': ["91°0'0\"N", "45°0'0\"N"],
'lon': ["0°0'0\"E", "9°0'0\"E"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
# First row has lat=91° which is out of [-90, 90]
assert len(result) == 1
assert abs(result['lat'].iloc[0] - 45.0) < 0.001
def test_normalize_coordinates_dms_south_and_east(self, mock_context):
"""Test DMS parsing with south latitude and east longitude."""
df = pd.DataFrame({
'lat': ["33°51'54\"S"],
'lon': ["151°12'36\"E"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 1
# 33°51'54"S ≈ -33.865
assert result['lat'].iloc[0] < 0
assert abs(result['lat'].iloc[0] - (-33.865)) < 0.001
# 151°12'36"E ≈ 151.21
assert result['lon'].iloc[0] > 0
assert abs(result['lon'].iloc[0] - 151.21) < 0.01
def test_normalize_coordinates_autodetect_numeric_vs_dms(self, mock_context):
"""Test that numeric columns are coerced directly while string columns are parsed as DMS."""
# Numeric columns — should go through pd.to_numeric path
df_numeric = pd.DataFrame({
'lat': [45.123456, 46.0],
'lon': [9.123456, 10.0]
})
config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
result_numeric = normalize_coordinates(mock_context, config, df_numeric.copy())
assert result_numeric['lat'].iloc[0] == 45.1235
assert len(result_numeric) == 2
# String DMS columns — should go through _parse_dms_to_decimal path
df_dms = pd.DataFrame({
'lat': ["40°26'46\"N"],
'lon': ["79°58'56\"W"]
})
result_dms = normalize_coordinates(mock_context, config, df_dms.copy())
assert len(result_dms) == 1
assert abs(result_dms['lat'].iloc[0] - 40.4461) < 0.001
class TestAddGlobalAggregations:
    """Tests for the add_global_aggregations operation."""

    def test_add_global_aggregations_success(self, mock_context):
        """Check a group-by with a 'sum' aggregation."""
        source = pd.DataFrame({
            'category': ['A', 'A', 'B'],
            'value': [10, 20, 100],
            'ignored_str': ['x', 'y', 'z'],
        })
        settings = AggregationConfiguration(columns=['category'], operation='sum')
        aggregated = add_global_aggregations(mock_context, settings, source.copy())

        def value_for(category):
            """Return the aggregated 'value' of the given category row."""
            return aggregated.loc[aggregated['category'] == category, 'value'].values[0]

        assert len(aggregated) == 2
        assert value_for('A') == 30
        assert value_for('B') == 100
        assert 'ignored_str' not in aggregated.columns
        mock_context.log.info.assert_called()

    def test_add_global_aggregations_missing_column(self, mock_context):
        """Check the warning logged for a grouping column absent from the frame."""
        source = pd.DataFrame({'value': [1, 2, 3]})
        settings = AggregationConfiguration(columns=['missing_col'], operation='count')
        aggregated = add_global_aggregations(mock_context, settings, source.copy())
        mock_context.log.warning.assert_any_call(
            "Column 'missing_col' not found, skipping aggregation."
        )
        assert len(aggregated) == 1

    def test_add_global_aggregations_unsupported_op(self, mock_context):
        """Check the failure and warning for an unsupported operation name."""
        source = pd.DataFrame({'category': ['A'], 'value': [1]})
        settings = AggregationConfiguration(columns=['category'], operation='unsupported')
        with pytest.raises(Exception):
            add_global_aggregations(mock_context, settings, source.copy())
        # Checked after the raises block: a statement following the raising
        # call inside the block would never run.
        mock_context.log.warning.assert_any_call("Unsupported aggregation 'unsupported'")

    def test_add_global_aggregations_only_numeric_kept(self, mock_context):
        """Check that text columns outside the grouping key are dropped."""
        source = pd.DataFrame({
            'group': ['A', 'A'],
            'num': [1, 2],
            'text': ['hello', 'world'],
        })
        settings = AggregationConfiguration(columns=['group'], operation='mean')
        aggregated = add_global_aggregations(mock_context, settings, source.copy())
        remaining = aggregated.columns
        assert 'text' not in remaining
        assert 'num' in remaining
        assert 'group' in remaining

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,54 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.base_config import BaseConfiguration
def test_valid_configuration_with_overrides():
    """Explicitly supplied values are readable unchanged from BaseConfiguration."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 10.0,
        "generalisation_hierarchies": {"age": "age_hierarchy"},
    }
    config = BaseConfiguration(**overrides)
    assert config.ident == ["id"]
    assert config.quasi_identifiers == ["age"]
    assert config.supp_level == 10.0
    assert config.generalisation_hierarchies == {"age": "age_hierarchy"}
def test_default_values_are_loaded():
    """A BaseConfiguration built without arguments exposes the expected defaults."""
    config = BaseConfiguration()
    expected_defaults = (
        ("ident", ["Name"]),
        ("quasi_identifiers", ["Age"]),
        ("supp_level", 50.0),
        ("generalisation_hierarchies", {"Age": "simpl_age"}),
    )
    for field_name, expected in expected_defaults:
        assert getattr(config, field_name) == expected
def test_missing_ident_raises_error():
    """An empty ident list is rejected with a ValidationError."""
    invalid_kwargs = {"ident": []}
    with pytest.raises(ValidationError):
        BaseConfiguration(**invalid_kwargs)
def test_missing_quasi_ident_raises_error():
    """An empty quasi_identifiers list is rejected with a ValidationError."""
    invalid_kwargs = {"quasi_identifiers": []}
    with pytest.raises(ValidationError):
        BaseConfiguration(**invalid_kwargs)
def test_overlap_between_ident_and_quasi_identifiers():
    """Listing the same column as identifier and quasi-identifier is rejected."""
    shared_column = ["age"]
    with pytest.raises(ValidationError):
        BaseConfiguration(ident=list(shared_column), quasi_identifiers=list(shared_column))
def test_supp_level_bounds():
    """A supp_level of 150.0 is rejected with a ValidationError."""
    out_of_range = 150.0  # out of range
    with pytest.raises(ValidationError):
        BaseConfiguration(supp_level=out_of_range)

View File

@@ -0,0 +1,48 @@
from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import (
simpl_age,
simpl_age2,
simpl_gender,
get_all_hierarchies,
)
def test_simpl_age_structure():
    """simpl_age is a dict whose level 0 lists the ages 0 through 99."""
    assert isinstance(simpl_age, dict)
    assert 0 in simpl_age
    base_level = simpl_age[0]
    assert isinstance(base_level, list)
    # The first level is expected to contain 100 ages.
    assert len(base_level) == 100
    youngest, oldest = base_level[0], base_level[-1]
    assert youngest == 0
    assert oldest == 99
def test_simpl_age2_structure():
    """simpl_age2 is a dict with list-valued levels 0 and 1."""
    assert isinstance(simpl_age2, dict)
    levels = (0, 1)
    for level in levels:
        assert level in simpl_age2
    for level in levels:
        assert isinstance(simpl_age2[level], list)
def test_simpl_gender_structure():
    """simpl_gender maps level 0 to the raw values and level 1 to wildcards."""
    assert isinstance(simpl_gender, dict)
    assert 0 in simpl_gender
    assert 1 in simpl_gender
    raw_values = simpl_gender[0]
    generalised = simpl_gender[1]
    assert raw_values == ["M", "F", "O"]
    assert generalised == ["*", "*", "*"]
def test_get_all_hierarchies():
    """get_all_hierarchies returns a dict exposing the module's hierarchy objects."""
    registry = get_all_hierarchies()
    # The return value itself must be a dict.
    assert isinstance(registry, dict)
    # All three known hierarchies must be registered by name.
    for expected_name in ("simpl_age", "simpl_age2", "simpl_gender"):
        assert expected_name in registry
    # Entries must be the very same objects as the module-level dicts.
    assert registry["simpl_age"] is simpl_age
    assert registry["simpl_gender"] is simpl_gender

View File

@@ -0,0 +1,41 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.k_anonymity_configuration import (
KAnonymityConfiguration,
)
def test_valid_k_anonymity_config_with_overrides():
    """Explicit k, sensitive attributes and hierarchies are stored as given.

    k is set to 5 rather than 3: this file's defaults test expects k == 3 when
    k is omitted, so overriding with 3 could not reveal an ignored override.
    """
    cfg = KAnonymityConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        supp_level=5.0,
        generalisation_hierarchies={"age": "age_hier"},
        k=5,
        sensitive_attributes=["disease"],
    )
    assert cfg.k == 5
    assert cfg.sensitive_attributes == ["disease"]
    assert cfg.generalisation_hierarchies == {"age": "age_hier"}
def test_default_values_are_loaded():
    """Omitting k and sensitive_attributes yields k == 3 and ["Disease"]."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }
    config = KAnonymityConfiguration(**required_only)
    assert config.k == 3
    assert config.sensitive_attributes == ["Disease"]
def test_invalid_k_value_raises_error():
    """k=1 is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 1,  # invalid, must be >= 2
        "sensitive_attributes": ["disease"],
    }
    with pytest.raises(ValidationError):
        KAnonymityConfiguration(**invalid_kwargs)

View File

@@ -0,0 +1,44 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.l_diversity_configuration import (
LDiversityConfiguration,
)
def test_valid_l_diversity_config_with_overrides():
    """Explicit k, l and sensitive_attribute are stored as given."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "l": 2,
        "sensitive_attribute": "disease",
    }
    config = LDiversityConfiguration(**overrides)
    assert config.k == 3
    assert config.l == 2
    assert config.sensitive_attribute == "disease"
def test_default_values_are_loaded():
    """Omitting k, l and sensitive_attribute yields 2, 3 and "Disease"."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }
    config = LDiversityConfiguration(**required_only)
    assert config.k == 2
    assert config.l == 3
    assert config.sensitive_attribute == "Disease"
def test_invalid_l_value_raises_error():
    """l=0 is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "l": 0,  # invalid, must be >= 1
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        LDiversityConfiguration(**invalid_kwargs)

View File

@@ -0,0 +1,56 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.t_closeness_configuration import (
TClosenessConfiguration,
)
def test_valid_t_closeness_config_with_overrides():
    """Explicit k, t and sensitive_attribute are stored as given."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": 0.4,
        "sensitive_attribute": "disease",
    }
    config = TClosenessConfiguration(**overrides)
    assert config.k == 3
    assert config.t == 0.4
    assert config.sensitive_attribute == "disease"
def test_default_values_are_loaded():
    """Omitting k, t and sensitive_attribute yields 2, 0.5 and "Disease"."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }
    config = TClosenessConfiguration(**required_only)
    assert config.k == 2
    assert config.t == 0.5
    assert config.sensitive_attribute == "Disease"
def test_invalid_t_value_low():
    """A negative t is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": -0.1,  # invalid
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        TClosenessConfiguration(**invalid_kwargs)
def test_invalid_t_value_high():
    """A t above 1 is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": 2.0,  # invalid > 1
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        TClosenessConfiguration(**invalid_kwargs)

View File

@@ -0,0 +1,44 @@
from template_code_location.dataframe_level_anonymisation.jobs import (
k_anonymity_job,
l_diversity_job,
t_closeness_job,
k_anonymity_job_s3,
l_diversity_job_s3,
t_closeness_job_s3
)
def test_k_anonymity_job_is_callable():
    """k_anonymity_job is callable and exposes execute_in_process."""
    job = k_anonymity_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_l_diversity_job_is_callable():
    """l_diversity_job is callable and exposes execute_in_process."""
    job = l_diversity_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_t_closeness_job_is_callable():
    """t_closeness_job is callable and exposes execute_in_process."""
    job = t_closeness_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_k_anonymity_job_s3_is_callable():
    """k_anonymity_job_s3 is callable and exposes execute_in_process."""
    job = k_anonymity_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_l_diversity_job_s3_is_callable():
    """l_diversity_job_s3 is callable and exposes execute_in_process."""
    job = l_diversity_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_t_closeness_job_s3_is_callable():
    """t_closeness_job_s3 is callable and exposes execute_in_process."""
    job = t_closeness_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')

View File

@@ -0,0 +1,230 @@
import pytest
import pandas as pd
from unittest.mock import patch
from dagster import DagsterInvalidInvocationError, build_op_context
from template_code_location.dataframe_level_anonymisation.ops import (
apply_k_anonymity,
apply_l_diversity,
apply_t_closeness,
)
from template_code_location.dataframe_level_anonymisation.config_models import (
KAnonymityConfiguration,
LDiversityConfiguration,
TClosenessConfiguration,
)
# ---------------------------
# Fixtures
# ---------------------------
@pytest.fixture
def fake_df():
    """Two-row DataFrame with an 'id' and an 'age' column."""
    columns = {"id": [1, 2], "age": [30, 40]}
    return pd.DataFrame(columns)
@pytest.fixture
def k_config():
    """KAnonymityConfiguration with k=2 over the 'id'/'age' test columns."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attributes": ["age"],
        "k": 2,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return KAnonymityConfiguration(**settings)
@pytest.fixture
def l_config():
    """LDiversityConfiguration with k=2, l=1 over the 'id'/'age' test columns."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "l": 1,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return LDiversityConfiguration(**settings)
@pytest.fixture
def t_config():
    """TClosenessConfiguration with k=2, t=0.5 over the 'id'/'age' test columns."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "t": 0.5,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return TClosenessConfiguration(**settings)
@pytest.fixture
def op_context():
    """Dagster op context built with default arguments."""
    context = build_op_context()
    return context
# ---------------------------
# Helper for patching external functions
# ---------------------------
@pytest.fixture(autouse=True)
def patch_external_ops():
    """Replace the hierarchy lookup and anonymisation calls used by the ops.

    The patch targets name the module the ops are imported from in this test
    file (``template_code_location.dataframe_level_anonymisation.ops``).
    Patching the pre-migration path ``dataframe_level_anonymisation.ops`` would
    either fail to import or patch a different module object, leaving the
    functions under test unmocked.
    """
    ops_module = "template_code_location.dataframe_level_anonymisation.ops"

    def anonymised_frame():
        """Return a fresh copy of the canned result for each patched function."""
        return pd.DataFrame({"id": [1, 2], "age": [30, 40]})

    with (
        patch(
            f"{ops_module}.get_all_hierarchies",
            return_value={"simpl_age": {0: [30, 40]}},
        ),
        patch(f"{ops_module}.k_anonymity", return_value=anonymised_frame()),
        patch(f"{ops_module}.l_diversity", return_value=anonymised_frame()),
        patch(f"{ops_module}.t_closeness", return_value=anonymised_frame()),
    ):
        yield
# ---------------------------
# Tests for apply_k_anonymity
# ---------------------------
def test_apply_k_anonymity_outputs(op_context, k_config, fake_df):
    """apply_k_anonymity yields a DataFrame output followed by a metrics dict."""
    outputs = list(apply_k_anonymity(op_context, k_config, fake_df))
    assert len(outputs) == 2
    data_output, metrics_output = (item.value for item in outputs)
    # Check types
    assert isinstance(data_output, pd.DataFrame)
    assert isinstance(metrics_output, dict)
    for metric_name in ("k_anon", "l_div", "t_clos"):
        assert metric_name in metrics_output
# ---------------------------
# Tests for apply_l_diversity
# ---------------------------
def test_apply_l_diversity_outputs(op_context, l_config, fake_df):
    """apply_l_diversity yields a DataFrame output followed by a metrics dict."""
    outputs = list(apply_l_diversity(op_context, l_config, fake_df))
    assert len(outputs) == 2
    data_output, metrics_output = (item.value for item in outputs)
    assert isinstance(data_output, pd.DataFrame)
    assert isinstance(metrics_output, dict)
    for metric_name in ("k_anon", "l_div", "t_clos"):
        assert metric_name in metrics_output
def test_apply_l_diversity_empty_raises(op_context, l_config):
    """An empty result from l_diversity makes the op raise.

    The patch target uses the module path the op is imported from in this
    file; the pre-migration path would not affect the function under test.
    """
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.l_diversity",
        return_value=pd.DataFrame(),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_l_diversity(op_context, l_config, single_row))
# ---------------------------
# Tests for apply_t_closeness
# ---------------------------
def test_apply_t_closeness_outputs(op_context, t_config, fake_df):
    """apply_t_closeness yields a DataFrame output followed by a metrics dict."""
    outputs = list(apply_t_closeness(op_context, t_config, fake_df))
    assert len(outputs) == 2
    data_output, metrics_output = (item.value for item in outputs)
    assert isinstance(data_output, pd.DataFrame)
    assert isinstance(metrics_output, dict)
    for metric_name in ("k_anon", "l_div", "t_clos"):
        assert metric_name in metrics_output
def test_apply_t_closeness_empty_raises(op_context, t_config):
    """An empty result from t_closeness makes the op raise.

    The patch target uses the module path the op is imported from in this
    file; the pre-migration path would not affect the function under test.
    """
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        return_value=pd.DataFrame(),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, single_row))
# ---------------------------
# Additional tests for _validate_and_get_hierarchies
# ---------------------------
def test_validate_hierarchies_dataset_too_small(k_config):
    """A one-row frame combined with the k=2 fixture config is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import (
        _validate_and_get_hierarchies,
    )

    single_row = pd.DataFrame({"id": [1], "age": [30]})
    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, single_row)
def test_validate_hierarchies_missing_hierarchy(k_config, fake_df):
    """A config whose generalisation_hierarchies mapping is empty is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import (
        _validate_and_get_hierarchies,
    )

    without_hierarchies = k_config.model_copy(update={"generalisation_hierarchies": {}})
    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(without_hierarchies, fake_df)
def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df):
    """A hierarchy name missing from get_all_hierarchies() is rejected.

    The patch target uses the module path the helper is imported from in this
    file; the pre-migration path would not affect the function under test.
    """
    from template_code_location.dataframe_level_anonymisation.ops import (
        _validate_and_get_hierarchies,
    )

    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.get_all_hierarchies",
        return_value={},
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            _validate_and_get_hierarchies(k_config, fake_df)
# ---------------------------
# Additional tests for _calc_dataframe_metrics
# ---------------------------
def test_calc_dataframe_metrics_basic():
    """_calc_dataframe_metrics reports the values returned by the anonymity helpers.

    The patch targets go through the module path the helper is imported from
    in this file; the pre-migration path ``dataframe_level_anonymisation.ops``
    does not name that module.
    """
    from template_code_location.dataframe_level_anonymisation.ops import (
        _calc_dataframe_metrics,
    )

    anonymity_module = "template_code_location.dataframe_level_anonymisation.ops.anonymity"
    df_org = pd.DataFrame({"age": [30, 40], "id": [1, 2]})
    df_anon = df_org.copy()
    with (
        patch(f"{anonymity_module}.k_anonymity", return_value=2),
        patch(f"{anonymity_module}.l_diversity", return_value=1),
        patch(f"{anonymity_module}.t_closeness", return_value=0.1),
    ):
        report, metrics = _calc_dataframe_metrics(df_anon, df_org, ["age"], ["age"])
    assert "k-anonymity" in report
    assert metrics["k_anon"] == 2
    assert metrics["l_div"] == 1
    assert metrics["t_clos"] == 0.1
# ---------------------------
# Tests for apply_t_closeness exception branches
# ---------------------------
def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df):
    """Covers the branch where ValueError contains 'Cannot be quasi-identifiers'.

    The patch target uses the module path the op is imported from in this
    file; the pre-migration path would not affect the function under test.
    """
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        side_effect=ValueError("Cannot be quasi-identifiers invalid"),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))
def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df):
    """Covers the branch where ValueError is raised with a different message.

    The patch target uses the module path the op is imported from in this
    file; the pre-migration path would not affect the function under test.
    """
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        side_effect=ValueError("Some other error"),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))

View File

@@ -0,0 +1,70 @@
import numpy as np
from template_code_location.dataframe_level_anonymisation.utils import (
parse_value_list,
normalize_hierarchy_levels,
)
# ------------------------------------
# Tests for parse_value_list
# ------------------------------------
def test_parse_value_list_all_strings_digits():
    """A list of digit strings is returned as a list of ints."""
    digit_strings = ["1", "2", "3"]
    parsed = parse_value_list(digit_strings)
    assert parsed == [1, 2, 3]
def test_parse_value_list_mixed_values():
    """Digit strings become ints while other entries are passed through."""
    mixed = ["1", 2, "abc", "5"]
    parsed = parse_value_list(mixed)
    assert parsed == [1, 2, "abc", 5]
def test_parse_value_list_no_digits():
    """A list without digit strings is returned with equal contents."""
    letters = ["a", "b", "c"]
    parsed = parse_value_list(letters)
    assert parsed == ["a", "b", "c"]
# ------------------------------------
# Tests for normalize_hierarchy_levels
# ------------------------------------
def test_normalize_hierarchy_levels_level_0_converted_to_numpy_array():
    """Level "0" becomes an integer-keyed numpy array; level 1 stays a list."""
    raw = {"age": {"0": ["1", "2", "3"], "1": ["0-10", "11-20"]}}
    normalized = normalize_hierarchy_levels(raw)
    assert "age" in normalized
    age_levels = normalized["age"]
    assert 0 in age_levels
    assert isinstance(age_levels[0], np.ndarray)
    assert age_levels[0].tolist() == [1, 2, 3]  # converted via parse_value_list
    assert age_levels[1] == ["0-10", "11-20"]  # untouched
def test_normalize_hierarchy_levels_multiple_columns():
    """Each column of a multi-column hierarchy is normalized."""
    raw = {"age": {"0": ["10", "20"]}, "gender": {"0": ["M", "F"], "1": ["*"]}}
    normalized = normalize_hierarchy_levels(raw)
    # First column
    age_base = normalized["age"][0]
    assert isinstance(age_base, np.ndarray)
    assert age_base.tolist() == [10, 20]
    # Second column
    gender_levels = normalized["gender"]
    assert isinstance(gender_levels[0], np.ndarray)
    assert gender_levels[0].tolist() == ["M", "F"]
    assert gender_levels[1] == ["*"]
def test_normalize_hierarchy_levels_mixed_digit_non_digit_at_level_0():
    """A level-0 list mixing digit and non-digit strings yields a string array."""
    raw = {"test": {"0": ["1", "x", "3"]}}
    normalized = normalize_hierarchy_levels(raw)
    base_level = normalized["test"][0]
    assert isinstance(base_level, np.ndarray)
    assert base_level.tolist() == ["1", "x", "3"]
def test_normalize_hierarchy_levels_empty_mapping():
    """A column with no levels is returned with an empty level mapping."""
    raw = {"col": {}}
    normalized = normalize_hierarchy_levels(raw)
    assert normalized == {"col": {}}

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,444 @@
"""
Shared pytest fixtures and helpers for field-level pseudonymisation tests.
This module provides:
- Mock Vault client for testing without real Vault connections
- Sample data fixtures
- Configuration fixtures for encryption/decryption operations
- Helper functions for running ops and managing test Vault storage
"""
import pandas as pd
import pytest
from dagster import build_op_context
from cryptography.fernet import Fernet
from hvac.exceptions import InvalidPath, Forbidden
from unittest.mock import patch, MagicMock
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
AnonymisePseudonymizeStructuredConfig,
DepseudonymizeStructuredConfig,
EncryptConfig,
DecryptConfig,
PseudoTechniqueConfig,
DepseudoTechniqueConfig,
)
from template_code_location.field_level_pseudo_anonymisation.ops import (
anonymize_pseudonymize_structured,
depseudonymize_structured,
)
# -------------------------------- Mock Vault Storage ----------------------------------------
# In-memory Vault simulation for tests
# Secret values written through the mocked Vault client, keyed by
# "<mount_point>/<path>"; rebound to a fresh dict by mock_vault_client.
_test_vault_storage = {}
# Per-path access flags consulted by the mocked read: a falsy entry makes the
# read raise Forbidden; paths without an entry are readable.
_test_vault_access_control = {}  # For simulating access control
@pytest.fixture(autouse=True)
def mock_vault_client():
    """
    Auto-use fixture that replaces hvac.Client with an in-memory fake.

    The module-level storage and access-control dicts are rebound to fresh
    dicts before each test. Reads, writes and deletes on the KV v2 API of the
    mocked client are routed to those dicts, including the access control
    simulation for AC3. Yields the mocked client instance.
    """
    global _test_vault_storage, _test_vault_access_control
    _test_vault_storage = {}  # Reset storage before each test
    _test_vault_access_control = {}  # Reset access control

    def _secret_key(mount_point, path):
        """Compose the storage key used for a mount point and secret path."""
        return f"{mount_point}/{path}"

    def fake_read(path, mount_point):
        """Return the stored secret in the KV v2 response shape, or raise."""
        full_path = _secret_key(mount_point, path)
        # Access control is checked before existence; only an explicit falsy
        # entry denies access.
        if not _test_vault_access_control.get(full_path, True):
            raise Forbidden(f"Access denied to secret: {full_path}")
        if full_path not in _test_vault_storage:
            raise InvalidPath(f"Secret not found: {full_path}")
        stored = _test_vault_storage[full_path]
        return {"data": {"data": {"value": stored}}}

    def fake_write(path, mount_point, secret):
        """Store the 'value' entry of the secret under its full path."""
        _test_vault_storage[_secret_key(mount_point, path)] = secret["value"]

    def fake_delete(path, mount_point):
        """Forget both the secret and any access-control entry for the path."""
        full_path = _secret_key(mount_point, path)
        _test_vault_storage.pop(full_path, None)
        _test_vault_access_control.pop(full_path, None)

    with patch("hvac.Client") as mock_client_class:
        mock_instance = MagicMock()
        kv_v2 = mock_instance.secrets.kv.v2
        kv_v2.read_secret_version.side_effect = fake_read
        kv_v2.create_or_update_secret.side_effect = fake_write
        kv_v2.delete_metadata_and_all_versions.side_effect = fake_delete
        mock_client_class.return_value = mock_instance
        yield mock_instance
# -------------------------------- Sample Data Fixtures ----------------------------------------
@pytest.fixture
def sample_df():
    """
    Fixture providing a five-row structured dataset with PII-like columns.

    Columns: id, name, email, ssn, age, salary, department.
    """
    names = ["Alice Smith", "Bob Jones", "Charlie Brown", "David Wilson", "Eva Garcia"]
    emails = [
        "alice@example.com",
        "bob@example.com",
        "charlie@example.com",
        "david@example.com",
        "eva@example.com",
    ]
    ssns = ["123-45-6789", "234-56-7890", "345-67-8901", "456-78-9012", "567-89-0123"]
    columns = {
        "id": [1, 2, 3, 4, 5],
        "name": names,
        "email": emails,
        "ssn": ssns,
        "age": [25, 30, 35, 40, 45],
        "salary": [50000.0, 60000.0, 70000.0, 80000.0, 90000.0],
        "department": ["HR", "IT", "Finance", "IT", "HR"],
    }
    return pd.DataFrame(columns)
# -------------------------------- Configuration Fixtures ----------------------------------------
@pytest.fixture
def encrypt_config_single_field():
    """
    Configuration that encrypts the 'email' column only.

    Uses the key name "test_restoration_key_single".
    """
    technique = EncryptConfig(
        type="encrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    step = PseudoTechniqueConfig(technique=technique)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def decrypt_config_single_field():
    """
    Configuration that decrypts the 'email' column only.

    Uses the key name "test_restoration_key_single".
    """
    technique = DecryptConfig(
        type="decrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    step = DepseudoTechniqueConfig(technique=technique)
    return DepseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def encrypt_config_multiple_fields():
    """
    Configuration that encrypts the 'name', 'email' and 'ssn' columns.

    Uses the key name "test_restoration_key_multi".
    """
    technique = EncryptConfig(
        type="encrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    step = PseudoTechniqueConfig(technique=technique)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def decrypt_config_multiple_fields():
    """
    Configuration that decrypts the 'name', 'email' and 'ssn' columns.

    Uses the key name "test_restoration_key_multi".
    """
    technique = DecryptConfig(
        type="decrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    step = DepseudoTechniqueConfig(technique=technique)
    return DepseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def encrypt_config_partial_fields():
    """
    Configuration that encrypts only the 'email' and 'ssn' columns.

    Uses the key name "test_restoration_key_partial".
    """
    technique = EncryptConfig(
        type="encrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    step = PseudoTechniqueConfig(technique=technique)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def decrypt_config_partial_fields():
    """
    Configuration that decrypts only the 'email' and 'ssn' columns.

    Uses the key name "test_restoration_key_partial".
    """
    technique = DecryptConfig(
        type="decrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    step = DepseudoTechniqueConfig(technique=technique)
    return DepseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def authorized_multi_key_scenario():
    """
    Fixture for multi-key authorization scenarios.

    Stores two freshly generated Fernet keys, marks the second as denied, and
    yields a dict mapping "authorized"/"unauthorized" to the key names. Both
    keys are cleared before setup and again on teardown.
    """
    allowed_name = "authorized_key"
    denied_name = "unauthorized_key"
    for key_name in (allowed_name, denied_name):
        clear_vault_key(key_name)
    # Authorized key: generated and stored, access left untouched.
    set_vault_key(allowed_name, Fernet.generate_key().decode())
    # Unauthorized key: generated and stored, then access is denied.
    set_vault_key(denied_name, Fernet.generate_key().decode())
    deny_vault_access(denied_name)
    yield {"authorized": allowed_name, "unauthorized": denied_name}
    # Cleanup
    for key_name in (allowed_name, denied_name):
        clear_vault_key(key_name)
@pytest.fixture
def large_dataset():
    """
    Fixture providing a 10,000-row dataset for performance testing.

    Every column is derived from the row id, which runs from 1 to 10000.
    """
    ids = range(1, 10001)
    departments = ["HR", "IT", "Finance", "Sales"]
    columns = {
        "id": ids,
        "email": [f"user{i}@example.com" for i in ids],
        "name": [f"User {i}" for i in ids],
        "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in ids],
        "age": [20 + (i % 50) for i in ids],
        "salary": [30000.0 + (i * 10) for i in ids],
        "department": [departments[i % 4] for i in ids],
    }
    return pd.DataFrame(columns)
@pytest.fixture(scope="session")
def vault_test_keys():
    """
    Session-scoped fixture returning ten pre-generated Fernet keys.

    The result maps "test_key_0" .. "test_key_9" to decoded key strings, so
    tests can reuse them instead of generating keys themselves.
    """
    keys = {}
    for index in range(10):
        keys[f"test_key_{index}"] = Fernet.generate_key().decode()
    return keys
@pytest.fixture
def cleanup_test_keys(request):
    """
    Fixture that removes test keys from the mock Vault after the test.

    Use with: @pytest.mark.usefixtures("cleanup_test_keys")
    """
    yield
    # Snapshot the matching entries first so the dict is not mutated while
    # it is being iterated.
    stale_entries = [name for name in _test_vault_storage if "test_" in name]
    for name in stale_entries:
        _test_vault_storage.pop(name, None)
# -------------------------------- Helper Functions ----------------------------------------
def config_to_dagster_dict(config):
    """
    Convert a Pydantic config into the dictionary passed as Dagster op config.

    For AnonymisePseudonymizeStructuredConfig, each technique dumped as
        {'type': 'encrypt', 'columns': [...], 'key_name': '...'}
    is rewritten to
        {'encrypt': {'columns': [...], 'key_name': '...'}}
    i.e. the 'type' value becomes the single key wrapping the other fields.

    Any other config (e.g. DepseudonymizeStructuredConfig) is returned exactly
    as produced by model_dump(), keeping the flat structure with 'type'.

    Args:
        config: Pydantic config instance
            (AnonymisePseudonymizeStructuredConfig or
            DepseudonymizeStructuredConfig)

    Returns:
        dict: Dagster-compatible configuration dictionary
    """
    from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
        AnonymisePseudonymizeStructuredConfig,
    )

    config_dict = config.model_dump()
    # Only the pseudonymisation config needs its techniques re-nested.
    if not isinstance(config, AnonymisePseudonymizeStructuredConfig):
        return config_dict

    for func_config in config_dict.get("used_function", ()):
        technique = func_config.get("technique")
        # Skip entries that carry no flat technique dict with a 'type' field.
        if not (isinstance(technique, dict) and "type" in technique):
            continue
        # Split the type value from the remaining fields ...
        remaining_fields = dict(technique)
        technique_type = remaining_fields.pop("type")
        # ... and nest those fields under it.
        func_config["technique"] = {technique_type: remaining_fields}
    return config_dict
def run_encrypt_op(config, df):
    """
    Invoke the anonymize_pseudonymize_structured op directly and unwrap its outputs.

    Args:
        config: AnonymisePseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the ``.value`` of the two outputs, in that order
    """
    op_config = config_to_dagster_dict(config)
    context = build_op_context(op_config=op_config)
    data_output, metrics_output = anonymize_pseudonymize_structured(context, df=df)
    return data_output.value, metrics_output.value
def run_decrypt_op(config, df):
    """
    Invoke the depseudonymize_structured op directly and unwrap its outputs.

    Args:
        config: DepseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the ``.value`` of the two outputs, in that order
    """
    op_config = config_to_dagster_dict(config)
    context = build_op_context(op_config=op_config)
    data_output, metrics_output = depseudonymize_structured(context, df=df)
    return data_output.value, metrics_output.value
def clear_vault_key(key_name: str):
    """
    Remove a key from the simulated Vault so tests stay isolated from each other.

    Both the stored value and any access-control entry for the key are dropped;
    a key that is not present is ignored.

    Args:
        key_name: Name of the key to delete from Vault
    """
    vault_path = f"secret/PseudonymKeys/{key_name}"
    for registry in (_test_vault_storage, _test_vault_access_control):
        registry.pop(vault_path, None)
def set_vault_key(key_name: str, key_value: str):
    """
    Store a key in the simulated Vault under the PseudonymKeys path.

    Args:
        key_name: Name of the key
        key_value: Value of the key (Fernet key as string)
    """
    _test_vault_storage[f"secret/PseudonymKeys/{key_name}"] = key_value
def deny_vault_access(key_name: str):
    """
    Flag a key as not accessible, for the authorization tests (AC3).

    Args:
        key_name: Name of the key to deny access to
    """
    _test_vault_access_control[f"secret/PseudonymKeys/{key_name}"] = False
def get_vault_key(key_name: str) -> bytes:
    """
    Look up a key in the simulated Vault storage.

    Args:
        key_name: Name of the key to retrieve

    Returns:
        bytes: The stored key string, encoded to bytes

    Raises:
        InvalidPath: If no value is stored for the key
    """
    vault_path = f"secret/PseudonymKeys/{key_name}"
    if vault_path in _test_vault_storage:
        return _test_vault_storage[vault_path].encode()
    raise InvalidPath(f"Key not found: {key_name}")

View File

@@ -0,0 +1,633 @@
import pytest
from pydantic import ValidationError
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
AnonymisePseudonymizeStructuredConfig,
DepseudonymizeStructuredConfig,
PseudoTechniqueConfig,
DepseudoTechniqueConfig,
HashConfig,
EncryptConfig,
RedactConfig,
ReplaceConfig,
DecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
AnonymisePseudonymizeUnstructuredConfig,
DepseudonymizeUnstructuredConfig,
PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig,
DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig,
HashConfig as UnstructuredHashConfig,
EncryptConfig as UnstructuredEncryptConfig,
RedactConfig as UnstructuredRedactConfig,
ReplaceConfig as UnstructuredReplaceConfig,
RetainConfig,
DecryptConfig as UnstructuredDecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum
from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum
# ==================== Structured Config Tests ====================
class TestStructuredConfigValidators:
    """Tests for the validators and extraction helpers of structured_config.py."""

    def test_ensure_unique_columns_valid_single_technique(self):
        """One technique over one column is accepted."""
        encrypt_email = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1")
        )

        config = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_email])

        assert config is not None
        assert len(config.used_function) == 1

    def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self):
        """Two techniques are accepted when they target different columns."""
        encrypt_email = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1")
        )
        hash_ssn = PseudoTechniqueConfig(
            technique=HashConfig(columns=["ssn"], algorithm="sha256")
        )

        config = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_email, hash_ssn])

        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_columns_duplicate_columns_same_technique(self):
        """The same column listed under two techniques (encrypt and hash) raises."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(columns=["email"], key_name="key1")
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(columns=["email"], algorithm="sha256")
                    ),
                ]
            )

        message = str(exc_info.value)
        assert "Duplicate column" in message
        assert "email" in message

    def test_ensure_unique_columns_multiple_duplicates(self):
        """The error message names every column that is duplicated."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(columns=["email", "phone"], key_name="key1")
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(columns=["email", "phone"], algorithm="sha256")
                    ),
                ]
            )

        error_msg = str(exc_info.value)
        assert "Duplicate column" in error_msg
        assert "email" in error_msg
        assert "phone" in error_msg

    def test_collect_column_to_techniques_single_technique(self):
        """_collect_column_to_techniques maps each column of a single technique to its type."""
        encrypt_two = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email", "phone"], key_name="key1")
        )
        config = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_two])

        mapping = config._collect_column_to_techniques()

        assert mapping == {"email": ["encrypt"], "phone": ["encrypt"]}

    def test_extract_technique_and_columns_dict_with_type_field(self):
        """A flat dict with a 'type' field yields that type and its columns."""
        flat_entry = {
            "technique": {
                "type": "encrypt",
                "columns": ["email", "ssn"],
                "key_name": "test_key",
            }
        }
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns(flat_entry)

        assert technique_type == "encrypt"
        assert columns == ["email", "ssn"]

    def test_extract_technique_and_columns_dict_with_variant_mapping(self):
        """A variant-key mapping ({'encrypt': {...}}) yields the key as type plus its columns."""
        variant_entry = {
            "technique": {"encrypt": {"columns": ["ssn"], "key_name": "test_key"}}
        }
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns(variant_entry)

        assert technique_type == "encrypt"
        assert columns == ["ssn"]

    def test_extract_technique_and_columns_model_instance(self):
        """A PseudoTechniqueConfig model instance (redact) is handled."""
        pseudo_config = PseudoTechniqueConfig(technique=RedactConfig(columns=["address"]))
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns(pseudo_config)

        assert technique_type == "redact"
        assert columns == ["address"]

    def test_extract_technique_and_columns_empty_dict(self):
        """An empty technique dict yields no type and no columns."""
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns({"technique": {}})

        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_none_technique(self):
        """A None technique yields no type and no columns."""
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns({"technique": None})

        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_missing_columns_key(self):
        """A flat dict without a 'columns' key yields the type and an empty column list."""
        no_columns_entry = {"technique": {"type": "encrypt", "key_name": "test_key"}}
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns(no_columns_entry)

        assert technique_type == "encrypt"
        assert columns == []

    def test_extract_technique_and_columns_model_without_columns_attr(self):
        """A PseudoTechniqueConfig wrapping ReplaceConfig yields 'replace' and its columns."""
        pseudo_config = PseudoTechniqueConfig(
            technique=ReplaceConfig(columns=["old_value"], new_value="NEW")
        )
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns(pseudo_config)

        assert technique_type == "replace"
        assert columns == ["old_value"]
class TestStructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeStructuredConfig."""

    def test_depseudonymize_config_normalize_used_function_with_dict(self):
        """A dict entry in used_function ends up as a DepseudoTechniqueConfig (decrypt)."""
        raw_entry = {
            "technique": {"type": "decrypt", "columns": ["email"], "key_name": "key1"}
        }

        config = DepseudonymizeStructuredConfig(used_function=[raw_entry])

        functions = config.used_function
        assert len(functions) == 1
        assert isinstance(functions[0], DepseudoTechniqueConfig)
        assert functions[0].technique.type == "decrypt"

    def test_depseudonymize_config_normalize_used_function_with_model(self):
        """A model instance in used_function is kept as the very same object."""
        depseudo_tech = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1")
        )

        config = DepseudonymizeStructuredConfig(used_function=[depseudo_tech])

        functions = config.used_function
        assert len(functions) == 1
        assert functions[0] is depseudo_tech

    def test_depseudonymize_config_ensure_unique_columns_no_op(self):
        """Listing the same column in two decrypt entries does not raise."""
        # Depseudonymize has no per-column uniqueness constraint, so both entries
        # may target "email" (here with different keys).
        decrypt_entries = [
            DepseudoTechniqueConfig(
                technique=DecryptConfig(columns=["email"], key_name=key_name)
            )
            for key_name in ("key1", "key2")
        ]

        config = DepseudonymizeStructuredConfig(used_function=decrypt_entries)

        # Construction not raising is the behaviour under test.
        assert config is not None
# ==================== Unstructured Config Tests ====================
class TestUnstructuredConfigValidators:
    """Tests for the validators and extraction helpers of unstructured_config.py."""

    def test_normalize_used_function_with_dict(self):
        """A variant-keyed dict entry in used_function is accepted."""
        variant_entry = {
            "technique": {
                "encrypt": {"pii": [PIIEntityEnum.EMAIL.value], "key_name": "key1"}
            }
        }

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en, used_function=[variant_entry]
        )

        assert len(config.used_function) == 1

    def test_normalize_used_function_with_model(self):
        """A model-instance entry in used_function is accepted."""
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(pii=[PIIEntityEnum.EMAIL.value], key_name="key1")
        )

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en, used_function=[pseudo_tech]
        )

        assert len(config.used_function) == 1

    def test_ensure_unique_pii_valid_different_pii_types(self):
        """Two techniques are accepted when they target different PII types."""
        encrypt_email = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(pii=[PIIEntityEnum.EMAIL.value], key_name="key1")
        )
        hash_person = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredHashConfig(pii=[PIIEntityEnum.PERSON.value], algorithm="sha256")
        )

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en, used_function=[encrypt_email, hash_person]
        )

        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_pii_duplicate_pii_types(self):
        """The same PII type listed under two techniques raises."""
        with pytest.raises(ValueError) as exc_info:
            AnonymisePseudonymizeUnstructuredConfig(
                language=LanguageEnum.en,
                used_function=[
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredEncryptConfig(
                            pii=[PIIEntityEnum.EMAIL.value], key_name="key1"
                        )
                    ),
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredHashConfig(
                            pii=[PIIEntityEnum.EMAIL.value], algorithm="sha256"
                        )
                    ),
                ],
            )

        message = str(exc_info.value)
        assert "Duplicate PII" in message
        # The message carries the enum repr (PIIEntityEnum.EMAIL) rather than the value,
        # so only the member name is checked.
        assert "EMAIL" in message

    def test_collect_pii_to_techniques_single_technique(self):
        """_collect_pii_to_techniques maps each PII type of a single technique to its type."""
        email = PIIEntityEnum.EMAIL.value
        person = PIIEntityEnum.PERSON.value
        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                UnstructuredPseudoTechniqueConfig(
                    technique=UnstructuredEncryptConfig(pii=[email, person], key_name="key1")
                )
            ],
        )

        mapping = config._collect_pii_to_techniques()

        assert mapping == {
            PIIEntityEnum.EMAIL.value: ["encrypt"],
            PIIEntityEnum.PERSON.value: ["encrypt"],
        }

    def test_extract_technique_and_pii_dict_with_type_field(self):
        """A flat dict with a 'type' field yields that type and its PII list."""
        flat_entry = {
            "technique": {
                "type": "encrypt",
                "pii": [PIIEntityEnum.EMAIL.value],
                "key_name": "test_key",
            }
        }
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        technique_type, piis = config._extract_technique_and_pii(flat_entry)

        assert technique_type == "encrypt"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_dict_with_variant_mapping(self):
        """A variant-key mapping ({'hash': {...}}) yields the key as type plus its PII list."""
        variant_entry = {
            "technique": {
                "hash": {"pii": [PIIEntityEnum.PERSON.value], "algorithm": "sha256"}
            }
        }
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        technique_type, piis = config._extract_technique_and_pii(variant_entry)

        assert technique_type == "hash"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_dict_fallback_to_columns(self):
        """When the dict has no 'pii' key, the 'columns' value is returned instead."""
        columns_only_entry = {"technique": {"type": "redact", "columns": ["fallback_col"]}}
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        technique_type, piis = config._extract_technique_and_pii(columns_only_entry)

        assert technique_type == "redact"
        assert piis == ["fallback_col"]

    def test_extract_technique_and_pii_model_instance(self):
        """A model instance wrapping a redact technique is handled."""
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredRedactConfig(pii=[PIIEntityEnum.EMAIL.value])
        )
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        technique_type, piis = config._extract_technique_and_pii(pseudo_tech)

        assert technique_type == "redact"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_model_with_getattr_fallback(self):
        """A model instance wrapping RetainConfig yields 'retain' and its PII list."""
        # The RetainConfig built here does carry a pii list; this exercises the
        # model-instance path with the retain technique.
        pseudo_tech = UnstructuredPseudoTechniqueConfig(
            technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value])
        )
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        technique_type, piis = config._extract_technique_and_pii(pseudo_tech)

        assert technique_type == "retain"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_empty_dict(self):
        """An empty technique dict yields no type and no PII."""
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        technique_type, piis = config._extract_technique_and_pii({"technique": {}})

        assert technique_type is None
        assert piis == []

    def test_extract_technique_and_pii_missing_pii_key(self):
        """A flat dict without a 'pii' key yields the type and an empty PII list."""
        no_pii_entry = {"technique": {"type": "encrypt", "key_name": "test_key"}}
        config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)

        technique_type, piis = config._extract_technique_and_pii(no_pii_entry)

        assert technique_type == "encrypt"
        assert piis == []
class TestUnstructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeUnstructuredConfig."""

    def test_depseudonymize_unstructured_config_default(self):
        """The default config can be built and has at least one used_function entry."""
        config = DepseudonymizeUnstructuredConfig()

        assert config is not None
        assert len(config.used_function) >= 1

    def test_depseudonymize_unstructured_config_with_custom_function(self):
        """An explicitly supplied decrypt technique is kept, key name included."""
        custom_decrypt = UnstructuredDepseudoTechniqueConfig(
            technique=UnstructuredDecryptConfig(key_name="custom_key")
        )

        config = DepseudonymizeUnstructuredConfig(used_function=[custom_decrypt])

        functions = config.used_function
        assert len(functions) == 1
        assert functions[0].technique.key_name == "custom_key"
class TestLanguageSupport:
    """Tests for the language setting of the unstructured config."""

    def test_all_supported_languages(self):
        """Every language listed here can be set and is stored on the config."""
        supported_languages = (
            LanguageEnum.hr,
            LanguageEnum.da,
            LanguageEnum.nl,
            LanguageEnum.en,
            LanguageEnum.fi,
            LanguageEnum.fr,
            LanguageEnum.de,
            LanguageEnum.el,
            LanguageEnum.it,
            LanguageEnum.lt,
            LanguageEnum.pl,
            LanguageEnum.pt,
            LanguageEnum.ro,
            LanguageEnum.sl,
            LanguageEnum.es,
            LanguageEnum.sv,
        )

        for language in supported_languages:
            config = AnonymisePseudonymizeUnstructuredConfig(language=language)
            assert config.language == language

    def test_default_language_is_english(self):
        """Without an explicit language the config uses LanguageEnum.en."""
        config = AnonymisePseudonymizeUnstructuredConfig()

        assert config.language == LanguageEnum.en
class TestTechniqueConfigDefaults:
    """Tests for the default field values of the individual technique configs."""

    def test_hash_config_default_algorithm(self):
        """HashConfig defaults to sha256 and type 'hash'."""
        defaults = HashConfig()

        assert defaults.algorithm == "sha256"
        assert defaults.type == "hash"

    def test_encrypt_config_defaults(self):
        """EncryptConfig defaults to type 'encrypt' and key name 'my_key'."""
        defaults = EncryptConfig()

        assert defaults.type == "encrypt"
        assert defaults.key_name == "my_key"

    def test_redact_config_defaults(self):
        """RedactConfig defaults to type 'redact'."""
        defaults = RedactConfig()

        assert defaults.type == "redact"

    def test_replace_config_defaults(self):
        """ReplaceConfig defaults to type 'replace' and new value 'REPLACED'."""
        defaults = ReplaceConfig()

        assert defaults.type == "replace"
        assert defaults.new_value == "REPLACED"

    def test_decrypt_config_defaults(self):
        """DecryptConfig defaults to type 'decrypt' and key name 'my_key'."""
        defaults = DecryptConfig()

        assert defaults.type == "decrypt"
        assert defaults.key_name == "my_key"

    def test_unstructured_retain_config_defaults(self):
        """RetainConfig defaults to type 'retain'."""
        defaults = RetainConfig()

        assert defaults.type == "retain"
class TestPseudoTechniqueConfigDefaults:
    """Tests for the default technique of the PseudoTechniqueConfig models."""

    @staticmethod
    def _assert_hash_technique(technique):
        """Assert that ``technique`` describes the hash technique.

        The value may be a dict in the discriminator structure (either keyed by
        'hash' or carrying type == 'hash') or a model exposing ``.type``.
        """
        if not isinstance(technique, dict):
            assert technique.type == "hash"
            return
        assert "hash" in technique or technique.get("type") == "hash"

    def test_pseudo_technique_default_to_hash(self):
        """The structured PseudoTechniqueConfig defaults to the hash technique."""
        config = PseudoTechniqueConfig()

        self._assert_hash_technique(config.technique)

    def test_unstructured_pseudo_technique_default_to_hash(self):
        """The unstructured PseudoTechniqueConfig defaults to the hash technique."""
        config = UnstructuredPseudoTechniqueConfig()

        self._assert_hash_technique(config.technique)
class TestConfigModelIntegration:
    """Integration tests combining several technique types in one config."""

    def test_structured_config_with_all_technique_types(self):
        """A structured config can hold hash, encrypt, redact and replace side by side."""
        technique_models = [
            HashConfig(columns=["col1"]),
            EncryptConfig(columns=["col2"], key_name="k1"),
            RedactConfig(columns=["col3"]),
            ReplaceConfig(columns=["col4"], new_value="X"),
        ]

        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[PseudoTechniqueConfig(technique=model) for model in technique_models]
        )

        assert len(config.used_function) == 4
        seen_types = {entry.technique.type for entry in config.used_function}
        assert seen_types == {"hash", "encrypt", "redact", "replace"}

    def test_unstructured_config_with_all_technique_types(self):
        """An unstructured config can hold hash, encrypt, redact, replace and retain."""
        technique_models = [
            UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value]),
            UnstructuredEncryptConfig(pii=[PIIEntityEnum.PERSON.value], key_name="k1"),
            UnstructuredRedactConfig(pii=[PIIEntityEnum.PHONE_NUMBERS.value]),
            UnstructuredReplaceConfig(pii=[PIIEntityEnum.CREDIT_CARD.value], new_value="X"),
            RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value]),
        ]

        config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                UnstructuredPseudoTechniqueConfig(technique=model) for model in technique_models
            ],
        )

        assert len(config.used_function) == 5
        seen_types = {entry.technique.type for entry in config.used_function}
        assert seen_types == {"hash", "encrypt", "redact", "replace", "retain"}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,288 @@
"""
Test suite for data restoration (depseudonymisation) of unstructured text.
## Test Coverage Summary
### Acceptance Criteria Coverage:
- AC1 (Data Restoration with Valid Key): 2 tests
- AC2 (Restoration Denial - Missing Key): 1 test
- AC3 (Restoration Denial - Unauthorized Access): 1 test
- AC4 (Restoration Denial - Invalid Key): 1 test
- Additional Coverage: 2 tests (edge cases)
### Test Pattern:
- Each test uses build_op_context with .model_dump() for configuration
- Tests validate dual outputs (data, metrics)
- Tests verify complete restoration of original text
- Tests validate security controls and error handling
- Tests use descriptive names mapping to AC scenarios
"""
import pytest
from unittest.mock import patch
from cryptography.fernet import Fernet
from dagster import build_op_context
from src.field_level_pseudo_anonymisation.unstructured_ops import (
depseudonymize_unstructured,
)
from src.field_level_pseudo_anonymisation.config_models.unstructured_config import (
DepseudonymizeUnstructuredConfig,
DecryptConfig,
DepseudoTechniqueConfig,
)
@pytest.fixture
def fernet_key() -> bytes:
    """Provide a freshly generated Fernet key for the encryption done in these tests."""
    generated_key = Fernet.generate_key()
    return generated_key
@pytest.fixture
def encrypted_text_data(fernet_key: bytes) -> dict:
    """
    Provide a sentence in both plain and encrypted form for the decryption tests.

    The returned dict has two keys:
    - original_text: the unencrypted sentence
    - encrypted_text: the same sentence with name and email replaced by
      {encrypt:...} placeholders holding Fernet tokens made with ``fernet_key``
    """
    cipher = Fernet(fernet_key)
    encrypted_name = cipher.encrypt(b"John Doe").decode()
    encrypted_email = cipher.encrypt(b"john.doe@example.com").decode()

    encrypted_text = (
        f"My name is {{encrypt:{encrypted_name}}} and my email is {{encrypt:{encrypted_email}}}."
    )
    return dict(
        original_text="My name is John Doe and my email is john.doe@example.com.",
        encrypted_text=encrypted_text,
    )
# ---------------------- AC1: Data Restoration with Valid Key --------------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_encrypted_pii_entities_with_valid_key(
    mock_create_get_key, fernet_key: bytes, encrypted_text_data: dict
):
    """AC1: Encrypted PII entities are restored when the key lookup returns the valid key."""
    # NOTE(review): this module imports and patches under the `src.` package prefix,
    # while the other migrated test modules import from `template_code_location.`;
    # confirm the `src.` path still resolves after the migration.

    # Arrange - the patched key lookup returns the key the fixture encrypted with.
    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)

    # Act - run the op and pull its two outputs in order.
    outputs = depseudonymize_unstructured(
        context, input_text=encrypted_text_data["encrypted_text"]
    )
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    # 1. The text is restored exactly.
    restored_text = data_output.value
    assert restored_text == encrypted_text_data["original_text"], (
        "Original text should be fully restored"
    )
    # 2. The first output is the one named 'data'.
    assert data_output.output_name == "data", "Output should be named 'data'"
    # 3. The metrics count both restored entities.
    restored_count = metrics_output.value["total_depseudo_count"]
    assert restored_count == 2, "Should restore 2 encrypted entities (name and email)"
    # 4. The key lookup was called exactly once, with the configured key name.
    mock_create_get_key.assert_called_once_with("decrypt", "test_key")
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_multiple_pii_types_with_valid_key(mock_create_get_key, fernet_key: bytes):
    """AC1: Name, email and phone placeholders are all restored with the valid key."""
    # Arrange - encrypt three values and embed them as {encrypt:...} placeholders.
    cipher = Fernet(fernet_key)
    encrypted_name = cipher.encrypt(b"John Doe").decode()
    encrypted_email = cipher.encrypt(b"john.doe@example.com").decode()
    encrypted_phone = cipher.encrypt(b"555-1234").decode()
    encrypted_text = (
        f"Contact {{encrypt:{encrypted_name}}} at "
        f"{{encrypt:{encrypted_email}}} or call {{encrypt:{encrypted_phone}}}."
    )
    original_text = "Contact John Doe at john.doe@example.com or call 555-1234."

    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="multi_pii_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)

    # Act
    outputs = depseudonymize_unstructured(context, input_text=encrypted_text)
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    assert data_output.value == original_text, "All PII types should be restored"
    restored_count = metrics_output.value["total_depseudo_count"]
    assert restored_count == 3, "Should restore 3 encrypted entities (name, email, phone)"
    mock_create_get_key.assert_called_once_with("decrypt", "multi_pii_key")
# ------------------- AC2: Restoration Denial when Key is Missing ----------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac2_restoration_denial_when_key_missing(mock_create_get_key, encrypted_text_data: dict):
    """AC2: Restoration fails with a ValueError when the key lookup reports a missing key."""
    # Arrange - the patched key lookup raises as if the key were absent.
    mock_create_get_key.side_effect = ValueError(
        "Fernet key 'non_existent_key' not found in Vault for decrypt."
    )
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="non_existent_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)
    encrypted_input = encrypted_text_data["encrypted_text"]

    # Act & Assert - draining the op's outputs must surface the lookup error.
    with pytest.raises(
        ValueError,
        match="Fernet key 'non_existent_key' not found in Vault for decrypt.",
    ) as exc_info:
        list(depseudonymize_unstructured(context, input_text=encrypted_input))

    # The message states that the key is missing.
    raised_message = str(exc_info.value)
    assert "not found in Vault" in raised_message, (
        "Error message should indicate key is missing from Vault"
    )
    # The key lookup was attempted exactly once.
    mock_create_get_key.assert_called_once_with("decrypt", "non_existent_key")
# ------------- AC3: Restoration Denial when Access is Unauthorized --------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac3_restoration_denial_when_unauthorized_access(
    mock_create_get_key, encrypted_text_data: dict
):
    """AC3: Restoration fails with a ValueError when the key lookup denies access."""
    # Arrange - the patched key lookup raises an access-denied error.
    mock_create_get_key.side_effect = ValueError("Access denied to secret: unauthorized_key")
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="unauthorized_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)
    encrypted_input = encrypted_text_data["encrypted_text"]

    # Act & Assert - draining the op's outputs must surface the denial.
    with pytest.raises(ValueError, match="Access denied to secret: unauthorized_key") as exc_info:
        list(depseudonymize_unstructured(context, input_text=encrypted_input))

    # The message states that access was denied.
    raised_message = str(exc_info.value)
    assert "Access denied" in raised_message, (
        "Error message should clearly indicate access was denied"
    )
    # The key lookup was attempted exactly once.
    mock_create_get_key.assert_called_once_with("decrypt", "unauthorized_key")
# ------------------- AC4: Restoration Denial when Key is Invalid ----------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac4_restoration_denial_when_key_invalid(mock_create_get_key, encrypted_text_data: dict):
    """AC4: Restoration fails when the returned key is not the one the data was encrypted with."""
    # Arrange - the patched key lookup returns a freshly generated, unrelated key.
    invalid_key = Fernet.generate_key()
    mock_create_get_key.return_value = invalid_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="wrong_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)
    encrypted_input = encrypted_text_data["encrypted_text"]

    # Act & Assert - draining the op's outputs must fail on the first token.
    with pytest.raises(ValueError, match="Invalid Fernet token") as exc_info:
        list(depseudonymize_unstructured(context, input_text=encrypted_input))

    # The message points at the decryption failure.
    raised_message = str(exc_info.value)
    assert "Invalid Fernet token" in raised_message, (
        "Error message should indicate the key is invalid for this data"
    )
    # The key was looked up exactly once before decryption was attempted.
    mock_create_get_key.assert_called_once_with("decrypt", "wrong_key")
# -------------------------------- Additional Edge Cases ----------------------------------------
def test_depseudonymize_unstructured_no_decrypt_config():
    """Edge case: with an empty used_function list the text comes back unchanged."""
    # Arrange - no techniques configured at all.
    original_text = "This text has no {encrypt:values} to decrypt."
    op_config = DepseudonymizeUnstructuredConfig(used_function=[]).model_dump()
    context = build_op_context(op_config=op_config)

    # Act
    outputs = depseudonymize_unstructured(context, input_text=original_text)
    result_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    assert result_output.value == original_text, (
        "Text should remain unchanged when no decryption is configured"
    )
    decrypt_count = metrics_output.value["total_depseudo_count"]
    assert decrypt_count == 0, "Should report zero decryptions performed"
def test_depseudonymize_unstructured_empty_text():
    """Edge case: empty input text comes back empty and the decryption count is zero."""
    # Arrange
    empty_text = ""
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step]).model_dump()
    context = build_op_context(op_config=op_config)

    # Act - the outputs are pulled while the patch is still active, because the
    # op result is consumed lazily through next().
    with patch(
        "src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
    ) as mock_key:
        mock_key.return_value = Fernet.generate_key()
        outputs = depseudonymize_unstructured(context, input_text=empty_text)
        result_output = next(outputs)
        metrics_output = next(outputs)

    # Assert
    assert result_output.value == "", "Empty text should remain empty"
    decrypt_count = metrics_output.value["total_depseudo_count"]
    assert decrypt_count == 0, "Should report zero decryptions for empty text"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,853 @@
"""
Test suite for field-level pseudonymisation operations on unstructured data.
This test suite validates the pseudonymisation of unstructured text with PII detection,
covering the following Acceptance Criteria:
## Test Coverage Summary
### Acceptance Criteria Coverage:
- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests
- AC2 (Invalid Execution Handling): 5 tests
- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests
- AC4 (Execution Audit & Logging - Negative Scenario): 3 tests
- Additional Coverage: 3 tests
### Test Pattern:
- Tests build the op context with build_op_context and config_to_dagster_dict_unstructured
- Tests validate dual outputs (data, metrics)
- Vault access is mocked for isolation
- Tests validate Scrubadub automatic PII detection
- Tests ensure placeholder replacement for unconfigured PII
"""
import pytest
import re
from dagster import build_op_context
from unittest.mock import patch, MagicMock
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
AnonymisePseudonymizeUnstructuredConfig,
EncryptConfig,
RetainConfig,
PseudoTechniqueConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models import PIIEntityEnum, LanguageEnum
from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import (
anonymize_pseudonymize_unstructured,
)
from .conftest import clear_vault_key
def config_to_dagster_dict_unstructured(config):
    """Translate an unstructured-op config object into the nested dict passed as op_config.

    Each configured technique becomes ``{"technique": {<type>: {...fields...}}}``:
    the ``type`` field is lifted out to act as the selector key, and any ``pii``
    entries are emitted as enum member names.

    Args:
        config: Object exposing ``language.value`` and ``used_function``, where each
            entry has a ``technique`` with ``type``, ``model_dump()`` and
            (optionally) ``pii``.

    Returns:
        dict with keys ``"language"`` and ``"used_function"``.
    """
    language_code = config.language.value
    dagster_functions = []

    for entry in config.used_function:
        tech = entry.technique
        selector = tech.type
        fields = tech.model_dump()

        if "pii" in fields:
            # Emit enum member names rather than the dumped values.
            fields["pii"] = [member.name for member in tech.pii]

        # The type is carried by the selector key, not as a field.
        fields.pop("type", None)
        dagster_functions.append({"technique": {selector: fields}})

    return {"language": language_code, "used_function": dagster_functions}
def run_unstructured_op(config, text):
    """
    Run the unstructured pseudonymisation op once and unwrap its two outputs.

    Args:
        config: Unstructured config object, converted with
            config_to_dagster_dict_unstructured.
        text: Input text handed to the op.

    Returns:
        tuple: (result_text: str, metrics_markdown: str)
    """
    op_config = config_to_dagster_dict_unstructured(config)
    op_context = build_op_context(op_config=op_config)

    # The op yields exactly two Output objects: processed text, then metrics.
    text_output, metrics_output = anonymize_pseudonymize_unstructured(op_context, text=text)
    return text_output.value, metrics_output.value
def parse_metrics_markdown(metrics_md: str) -> dict:
    """
    Parse markdown metrics into a structured dict for easier testing.

    Sections that are absent from the markdown keep their default value. The
    last row of the PII table and the last technique bullet are parsed whether
    or not the markdown ends with a newline.

    Args:
        metrics_md: Markdown metrics string from op output

    Returns:
        dict with keys: total_pii_detected (int), pii_by_type (dict[str, int]),
        techniques_applied (dict[str, str]), language (str)

    Raises:
        ValueError: if a count cell in the PII table is not an integer.
    """
    result = {
        "total_pii_detected": 0,
        "pii_by_type": {},
        "techniques_applied": {},
        "language": "",
    }

    # Extract total PII detected
    total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md)
    if total_match:
        result["total_pii_detected"] = int(total_match.group(1))

    # Extract language
    lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md)
    if lang_match:
        result["language"] = lang_match.group(1)

    # Extract PII by type from table: header row, separator row, then data rows.
    # Each data row ends at a newline OR at end of input, so a table that closes
    # the document without a trailing newline does not lose its last row.
    pii_table_section = re.search(
        r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+(?:\n|\Z))+)",
        metrics_md,
    )
    if pii_table_section:
        for line in pii_table_section.group(1).strip().split("\n"):
            parts = [p.strip() for p in line.split("|") if p.strip()]
            if len(parts) == 2:
                entity_type, count = parts
                result["pii_by_type"][entity_type] = int(count)

    # Extract techniques applied (bullets of the form "- **TYPE**: technique"),
    # using the same end-of-input tolerance as the table.
    techniques_section = re.search(
        r"### Techniques Applied\n((?:- \*\*[^\n]+(?:\n|\Z))+)", metrics_md
    )
    if techniques_section:
        for line in techniques_section.group(1).strip().split("\n"):
            tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", line)
            if tech_match:
                pii_type, technique = tech_match.groups()
                result["techniques_applied"][pii_type] = technique

    return result
# -------------------------------- Fixtures ----------------------------------------
@pytest.fixture
def sample_text_en():
    """English sample paragraph containing a name, email, phone number, address and SSN."""
    text = """
John Smith works at Acme Corporation. His email is john.smith@example.com
and his phone number is +1-555-123-4567. He lives in New York City at
123 Main Street, Apartment 4B. His SSN is 123-45-6789.
"""
    return text
@pytest.fixture
def sample_text_multi_person():
    """Sample text that mentions five different person names."""
    text = """
The meeting included Alice Johnson, Bob Williams, and Charlie Brown.
They discussed the project with Maria Garcia and David Wilson.
"""
    return text
@pytest.fixture
def sample_text_mixed_pii():
    """Contact card mixing a name, email, phone number and URL (used by the AC1 tests)."""
    text = """
Contact Information:
Name: Dr. Emily Watson
Email: emily.watson@hospital.com
Phone: +44-20-7946-0958
Website: https://patient-portal.hospital.com/records
"""
    return text
@pytest.fixture
def encrypt_person_config():
    """English-language config with a single technique: encrypt PERSON entities."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_key",
    )
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
@pytest.fixture
def retain_person_config():
    """English-language config with a single technique: retain PERSON entities as-is."""
    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_retention)],
    )
@pytest.fixture
def mixed_technique_config():
    """English-language config: encrypt PERSON and EMAIL, retain PHONE_NUMBERS (AC1)."""
    encrypt_names_and_emails = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_mixed_key",
    )
    keep_phone_numbers = RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS])
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(technique=encrypt_names_and_emails),
            PseudoTechniqueConfig(technique=keep_phone_numbers),
        ],
    )
# ================================================================================================
# AC1: Pseudonymisation and Retention Are Applied Correctly
# ================================================================================================
def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config):
    """AC1: a PII type configured for encryption is replaced by an encrypt token."""
    clear_vault_key("test_person_key")

    processed, raw_metrics = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii)
    parsed = parse_metrics_markdown(raw_metrics)

    # The plaintext name is gone and an encrypt token took its place
    assert "Emily Watson" not in processed, "Configured PERSON PII should be encrypted"
    assert "{encrypt:" in processed, "Encrypted token should be present in result"

    # Metrics report the detection
    assert parsed["total_pii_detected"] > 0, "System should detect PII entities"
    assert "PERSON" in parsed["pii_by_type"], "PERSON type should be in detected PII"

    # Text around the PII is left alone
    assert "Contact Information:" in processed, "Non-PII text structure should be preserved"
def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person):
    """AC1: PII types configured with the retain technique stay in plaintext."""
    keep_persons = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    retain_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=keep_persons)],
    )

    processed, raw_metrics = run_unstructured_op(retain_config, sample_text_multi_person)
    parsed = parse_metrics_markdown(raw_metrics)

    # Retained names are still readable
    assert "Alice Johnson" in processed, "Retained PERSON PII should remain unchanged"
    assert "Bob Williams" in processed, "Retained PERSON PII should remain unchanged"

    # Metrics record 'retain' as the technique used for PERSON
    person_technique = parsed["techniques_applied"].get("PERSON", "")
    assert (
        "retain" in person_technique.lower()
    ), "Retain technique should be recorded for PERSON type"
def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii):
    """AC1: detected PII with no configured technique is swapped for a {{TYPE}} placeholder."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_only_key",
    )
    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_person_only_key")

    # Only the processed text matters here; the metrics output is ignored.
    processed, _metrics_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)

    # Configured type: encrypted
    assert "Emily Watson" not in processed, "Configured PERSON should be encrypted"

    # Unconfigured types: placeholder delimiters are present
    has_open = "{{" in processed
    has_close = "}}" in processed
    assert has_open and has_close, "Unconfigured PII should be replaced with placeholders"

    # ...and the original value is gone
    assert (
        "emily.watson@hospital.com" not in processed
    ), "Unconfigured EMAIL should be replaced with placeholder"

    # Placeholder names the entity type
    assert (
        "{{EMAIL}}" in processed or "{{URL}}" in processed
    ), "Placeholders should indicate entity type"
def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config):
    """AC1: encrypt and retain techniques configured together each affect only their own PII types."""
    clear_vault_key("test_mixed_key")

    processed, raw_metrics = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii)
    applied = parse_metrics_markdown(raw_metrics)["techniques_applied"]

    # Encrypted types (PERSON, EMAIL) no longer appear in plaintext
    assert "Emily Watson" not in processed, "Configured PERSON should be encrypted"
    assert "emily.watson@hospital.com" not in processed, "Configured EMAIL should be encrypted"

    # Retained type (PHONE_NUMBERS) is still present verbatim
    assert "+44-20-7946-0958" in processed, "Configured PHONE_NUMBERS should be retained"

    # Metrics list the technique used per type
    assert (
        "encrypt" in applied.get("PERSON", "").lower()
    ), "Encrypt technique should be applied to PERSON"
    assert (
        "encrypt" in applied.get("EMAIL", "").lower()
    ), "Encrypt technique should be applied to EMAIL"
    assert (
        "retain" in applied.get("PHONE_NUMBERS", "").lower()
    ), "Retain technique should be applied to PHONE_NUMBERS"
def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config):
    """AC1: every occurrence of a configured PII type is processed, not just the first."""
    clear_vault_key("test_person_key")

    processed, raw_metrics = run_unstructured_op(encrypt_person_config, sample_text_multi_person)
    parsed = parse_metrics_markdown(raw_metrics)

    # All five names from the fixture text
    person_names = [
        "Alice Johnson",
        "Bob Williams",
        "Charlie Brown",
        "Maria Garcia",
        "David Wilson",
    ]
    for name in person_names:
        assert name not in processed, f"All PERSON instances should be encrypted: {name}"

    # The PERSON count in the metrics covers at least those names
    detected_persons = parsed["pii_by_type"].get("PERSON", 0)
    assert detected_persons >= len(
        person_names
    ), f"Should detect at least {len(person_names)} PERSON entities"
def test_ac1_empty_text_returns_empty(encrypt_person_config):
    """AC1: an empty input string is rejected with a ValueError that mentions "empty".

    Despite the function name, nothing is returned: the op is expected to raise.
    """
    clear_vault_key("test_person_key")

    with pytest.raises(ValueError) as exc_info:
        run_unstructured_op(encrypt_person_config, "")

    message = str(exc_info.value).lower()
    assert "empty" in message, "Error should indicate empty input"
def test_ac1_text_without_pii_remains_unchanged():
    """AC1: text with no detectable PII comes back identical, modulo surrounding whitespace."""
    no_pii_text = """
The weather today is sunny with a high of 25 degrees Celsius.
The conference starts at 9:00 AM in Room 301.
"""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_no_pii_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_no_pii_key")

    processed, raw_metrics = run_unstructured_op(config, no_pii_text)
    parsed = parse_metrics_markdown(raw_metrics)

    # Compare stripped text so leading/trailing whitespace does not matter
    assert processed.strip() == no_pii_text.strip(), "Text without PII should remain unchanged"
    assert parsed["total_pii_detected"] == 0, "No PII should be detected"
def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii):
    """AC1: placeholders for unconfigured PII use the {{TYPE}} form, TYPE being upper-case."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_placeholder_key",
    )
    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_placeholder_key")

    processed, raw_metrics = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)
    parsed = parse_metrics_markdown(raw_metrics)

    # Placeholder format (scrubadub uses {{TYPE}} format)
    found_placeholders = re.findall(r"\{\{[A-Z_]+\}\}", processed)
    assert (
        len(found_placeholders) > 0
    ), "Result should contain entity-type placeholders for unconfigured PII"

    # Metrics list at least one detected PII type
    assert len(parsed["pii_by_type"]) > 0, "Metrics should list detected PII types"
# ================================================================================================
# AC2: Invalid Execution Handling
# ================================================================================================
def test_ac2_graceful_abort_on_scrubadub_failure():
    """AC2: Test graceful abort when the PII detection engine (Scrubadub) fails.

    The Scrubber class is replaced by a mock whose ``iter_filth`` raises, and the
    op is expected to surface a RuntimeError describing the detection failure.
    """
    text = "Test user John Smith with email john@example.com"
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(
                technique=EncryptConfig(
                    type="encrypt",
                    pii=[PIIEntityEnum.PERSON],
                    key_name="test_abort_key",
                )
            )
        ],
    )
    clear_vault_key("test_abort_key")

    # Patch through the module path this file imports the op from. The previous
    # target omitted the ``template_code_location`` package prefix and so did not
    # point at the module under test.
    scrubber_target = (
        "template_code_location.field_level_pseudo_anonymisation"
        ".unstructured_ops.scrubadub.Scrubber"
    )
    with patch(scrubber_target) as mock_scrubber_class:
        mock_scrubber = MagicMock()
        mock_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error")
        mock_scrubber_class.return_value = mock_scrubber

        with pytest.raises(RuntimeError) as exc_info:
            run_unstructured_op(config, text)

    error_msg = str(exc_info.value).lower()
    assert (
        "pii" in error_msg
        or "detection" in error_msg
        or "scrubadub" in error_msg
        or "failed" in error_msg
    ), "Error message should indicate PII detection failure"
def test_ac2_graceful_abort_on_encryption_failure(sample_text_en):
    """AC2: Test graceful abort when an encryption technique fails during execution.

    The ``encrypt`` technique is patched to raise; the op is expected to wrap the
    failure in a RuntimeError whose message points at the encryption step.
    """
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(
                technique=EncryptConfig(
                    type="encrypt",
                    pii=[PIIEntityEnum.PERSON],
                    key_name="test_encrypt_fail_key",
                )
            )
        ],
    )
    clear_vault_key("test_encrypt_fail_key")

    # Mock the encrypt function in the techniques module. The path carries the
    # ``template_code_location`` prefix so it agrees with the package this file
    # imports from; the previous target predated the import migration.
    encrypt_path = (
        "template_code_location.field_level_pseudo_anonymisation"
        ".techniques.anonymisation_pseudonymisation_techniques.encrypt"
    )
    with patch(encrypt_path) as mock_encrypt:
        mock_encrypt.side_effect = Exception("Encryption algorithm failure")

        with pytest.raises(RuntimeError) as exc_info:
            run_unstructured_op(config, sample_text_en)

    error_msg = str(exc_info.value).lower()
    assert (
        "encrypt" in error_msg or "failed" in error_msg or "technique" in error_msg
    ), "Error message should indicate encryption failure"
def test_ac2_null_text_input_raises_error(encrypt_person_config):
    """AC2: passing None as the text is rejected with an error."""
    # Imported locally, as in the original; only this test needs it.
    from dagster import DagsterTypeCheckDidNotPass

    clear_vault_key("test_person_key")

    # Dagster's type check may fire before the op body runs, so accept any of these.
    acceptable_errors = (ValueError, DagsterTypeCheckDidNotPass, TypeError)
    with pytest.raises(acceptable_errors):
        run_unstructured_op(encrypt_person_config, None)
def test_ac2_invalid_language_configuration():
    """AC2: constructing the config with an unsupported language value fails validation."""
    # The failure is expected at config construction time; no op is run.
    expected_errors = (ValueError, TypeError)
    with pytest.raises(expected_errors):
        AnonymisePseudonymizeUnstructuredConfig(
            language="invalid_lang",  # Should fail Pydantic validation
            used_function=[
                PseudoTechniqueConfig(
                    technique=EncryptConfig(
                        type="encrypt", pii=[PIIEntityEnum.PERSON], key_name="test_key"
                    )
                )
            ],
        )
def test_ac2_very_large_text_processing():
    """AC2: a large input (one short PII-bearing snippet repeated 1000 times) is processed."""
    repeated_snippet = """
John Smith works at company. Email: john.smith@example.com.
"""
    large_text = repeated_snippet * 1000  # ~60KB of text with repeated PII

    name_and_email_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_large_text_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=name_and_email_encryption)],
    )
    clear_vault_key("test_large_text_key")

    processed, raw_metrics = run_unstructured_op(config, large_text)
    parsed = parse_metrics_markdown(raw_metrics)

    # Processing finished and produced output
    assert processed is not None, "Large text should be processed successfully"
    assert len(processed) > 0, "Result should not be empty"
    assert parsed["total_pii_detected"] > 0, "PII should be detected in large text"
# ================================================================================================
# AC3: Execution Audit & Logging - Positive Scenario
# ================================================================================================
def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config):
    """AC3: on a successful run the op context exposes a run ID and both outputs are produced."""
    clear_vault_key("test_person_key")

    context = build_op_context(
        op_config=config_to_dagster_dict_unstructured(encrypt_person_config)
    )

    # Read the run identifier before executing
    captured_run_id = context.run_id

    text_output, metrics_output = anonymize_pseudonymize_unstructured(
        context, text=sample_text_en
    )

    assert captured_run_id is not None, "Run ID must be available for audit logging"
    assert text_output is not None, "Result text should be available for logging"
    assert metrics_output is not None, "Metrics should be available for logging"
def test_ac3_successful_execution_logs_configuration_parameters(
    sample_text_en, mixed_technique_config
):
    """AC3: after a successful run, the op config and the metrics both name the techniques."""
    clear_vault_key("test_mixed_key")

    op_config_dict = config_to_dagster_dict_unstructured(mixed_technique_config)
    context = build_op_context(op_config=op_config_dict)
    _text_output, metrics_output = anonymize_pseudonymize_unstructured(
        context, text=sample_text_en
    )

    # The config dict carries both configured techniques
    assert "used_function" in op_config_dict, "Configuration must be accessible for logging"
    assert len(op_config_dict["used_function"]) == 2, "Multiple techniques should be captured"

    rendered_techniques = [str(func["technique"]) for func in op_config_dict["used_function"]]
    assert any(
        "encrypt" in rendered for rendered in rendered_techniques
    ), "Encrypt technique should be in configuration"
    assert any(
        "retain" in rendered for rendered in rendered_techniques
    ), "Retain technique should be in configuration"

    # The metrics markdown includes the techniques section
    assert (
        "Techniques Applied" in metrics_output.value
    ), "Applied techniques should be in metrics for logging"
def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config):
    """AC3: neither the metrics output nor the op config contains raw PII values from the input."""
    clear_vault_key("test_person_key")

    op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config)
    context = build_op_context(op_config=op_config_dict)
    _text_output, metrics_output = anonymize_pseudonymize_unstructured(
        context, text=sample_text_mixed_pii
    )

    sensitive_values = ["Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958"]

    # Metrics markdown must be free of the raw values
    metrics_str = metrics_output.value
    for pii_value in sensitive_values:
        assert (
            pii_value not in metrics_str
        ), f"Raw PII value should not appear in metrics: {pii_value}"

    # Same for the stringified configuration
    config_str = str(op_config_dict)
    for pii_value in sensitive_values:
        assert (
            pii_value not in config_str
        ), f"Raw PII value should not appear in configuration logs: {pii_value}"
# ================================================================================================
# AC4: Execution Audit & Logging - Negative Scenario
# ================================================================================================
def test_ac4_failed_execution_logs_error_details():
    """AC4: Negative execution should surface clear error details (encryption key failure).

    Key retrieval is patched to raise, and the resulting RuntimeError message is
    expected to mention both "key" and "failed".
    """
    text = "Test user John Smith"
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(
                technique=EncryptConfig(
                    type="encrypt",
                    pii=[PIIEntityEnum.PERSON],
                    key_name="test_fail_log_key",
                )
            )
        ],
    )
    clear_vault_key("test_fail_log_key")
    ctx = build_op_context(op_config=config_to_dagster_dict_unstructured(config))

    # Patch the key retrieval used inside unstructured_ops to force failure. The
    # target uses the ``template_code_location`` package path this file imports
    # the op from; the previous target lacked that prefix.
    key_lookup_target = (
        "template_code_location.field_level_pseudo_anonymisation"
        ".unstructured_ops.create_get_encryption_key"
    )
    with patch(
        key_lookup_target,
        side_effect=RuntimeError("Encryption key retrieval failed"),
    ):
        with pytest.raises(RuntimeError) as exc_info:
            # Consume the generator to trigger execution and raise the exception
            list(anonymize_pseudonymize_unstructured(ctx, text=text))

    msg = str(exc_info.value).lower()
    assert "key" in msg and "failed" in msg, "Error message should mention key failure"
def test_ac4_failed_execution_logs_configuration_used():
    """AC4: Test that the attempted configuration is available for logging on failure.

    Scrubber initialisation is patched to raise; the test then checks that the op
    config dict is still intact and that the raised error describes the failure.
    """
    text = "Test data with person John Doe"
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(
                technique=EncryptConfig(
                    type="encrypt",
                    pii=[PIIEntityEnum.PERSON],
                    key_name="test_config_fail_key",
                )
            )
        ],
    )
    clear_vault_key("test_config_fail_key")
    op_config_dict = config_to_dagster_dict_unstructured(config)
    context = build_op_context(op_config=op_config_dict)

    # Mock _initialize_scrubber to fail. The target uses the
    # ``template_code_location`` package path this file imports the op from; the
    # previous target lacked that prefix.
    init_scrubber_target = (
        "template_code_location.field_level_pseudo_anonymisation"
        ".unstructured_ops._initialize_scrubber"
    )
    with patch(init_scrubber_target) as mock_init_scrubber:
        mock_init_scrubber.side_effect = Exception("Scrubber module not available")

        # Exception already covers RuntimeError, so a tuple is unnecessary: the op
        # may either wrap the failure or let it propagate unchanged.
        with pytest.raises(Exception) as exc_info:
            list(anonymize_pseudonymize_unstructured(context, text=text))

    # Verify configuration is still accessible despite failure
    assert op_config_dict is not None, "Configuration must be accessible for failure audit"
    assert (
        "used_function" in op_config_dict
    ), "Technique configuration should be available for diagnosis"

    # Verify error was raised with proper message
    error_msg = str(exc_info.value).lower()
    assert (
        "pii" in error_msg
        or "detection" in error_msg
        or "failed" in error_msg
        or "scrubber" in error_msg
        or "module" in error_msg
    ), "Error should indicate detection/processing failed"
def test_ac4_failed_execution_logs_failure_reason():
    """AC4: Test that the reason for a failure is clearly indicated in the error message.

    Key retrieval is patched to raise a timeout-style RuntimeError, and the error
    that reaches the caller must hint at the key/encryption failure.
    """
    text = "User: Alice Smith, Email: alice@example.com"
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(
                technique=EncryptConfig(
                    type="encrypt",
                    pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
                    key_name="test_failure_reason_key",
                )
            )
        ],
    )
    clear_vault_key("test_failure_reason_key")

    # Mock key retrieval function to fail. The target uses the
    # ``template_code_location`` package path this file imports the op from; the
    # previous target lacked that prefix.
    key_lookup_target = (
        "template_code_location.field_level_pseudo_anonymisation"
        ".unstructured_ops.create_get_encryption_key"
    )
    with patch(key_lookup_target) as mock_get_key:
        mock_get_key.side_effect = RuntimeError("Vault connection timeout")

        with pytest.raises(RuntimeError) as exc_info:
            run_unstructured_op(config, text)

    # Verify failure reason is in error message
    error_msg = str(exc_info.value).lower()
    assert (
        "encrypt" in error_msg
        or "key" in error_msg
        or "timeout" in error_msg
        or "failed" in error_msg
    ), "Error should indicate key retrieval/encryption failure"
# ================================================================================================
# Additional Tests - Edge Cases and Integration
# ================================================================================================
def test_multi_language_support_italian():
    """Additional test: Italian input with LanguageEnum.it is altered and yields detections."""
    italian_text = """
Il dottor Marco Rossi lavora presso l'ospedale.
Email: marco.rossi@ospedale.it
Telefono: +39-06-12345678
"""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_italian_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.it,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_italian_key")

    processed, raw_metrics = run_unstructured_op(config, italian_text)
    parsed = parse_metrics_markdown(raw_metrics)

    # Something was replaced, and the metrics say so
    assert processed != italian_text, "Italian text should be processed"
    assert parsed["total_pii_detected"] > 0, "PII should be detected in Italian text"
def test_special_characters_in_text():
    """Additional test: text with accents, emoji and currency symbols is processed without error."""
    special_text = """
User: João da Silva 🇧🇷
Email: joão@empresa.com.br
Message: "Hello, World!" — Testing special chars: €, £, ¥, ©, ®
"""
    name_and_email_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_special_chars_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.pt,
        used_function=[PseudoTechniqueConfig(technique=name_and_email_encryption)],
    )
    clear_vault_key("test_special_chars_key")

    # Only the processed text is checked; the metrics output is ignored.
    processed, _metrics_md = run_unstructured_op(config, special_text)

    assert processed is not None, "Special characters should not cause processing failure"
    assert len(processed) > 0, "Result should not be empty"
def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config):
    """Additional test: two runs over the same input agree on token presence, counts and format.

    Token values themselves are not compared, only the number of tokens and the
    number of detected PII entities.
    """
    clear_vault_key("test_person_key")

    first_text, first_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)
    second_text, second_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)

    # Both runs emit encrypt tokens
    assert "{encrypt:" in first_text, "First run should produce encrypted tokens"
    assert "{encrypt:" in second_text, "Second run should produce encrypted tokens"

    # Detection totals agree between runs
    first_metrics = parse_metrics_markdown(first_metrics_md)
    second_metrics = parse_metrics_markdown(second_metrics_md)
    assert (
        first_metrics["total_pii_detected"] == second_metrics["total_pii_detected"]
    ), "PII detection should be consistent across runs"

    # Token format is consistent (Fernet base64 pattern)
    token_pattern = r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}"
    first_tokens = re.findall(token_pattern, first_text)
    second_tokens = re.findall(token_pattern, second_text)
    assert len(first_tokens) == len(
        second_tokens
    ), "Same number of encryption tokens should be generated"

View File

@@ -0,0 +1,58 @@
from template_code_location.field_level_pseudo_anonymisation.jobs import (
anonymize_pseudonymize_structured_job,
anonymize_pseudonymize_structured_job_s3,
depseudonymize_structured_job,
depseudonymize_structured_job_s3,
anonymize_pseudonymize_unstructured_job_s3,
anonymize_pseudonymize_unstructured_job,
depseudonymize_unstructured_job_s3,
depseudonymize_unstructured_job
)
def test_anonymize_pseudonymize_structured_job_is_callable():
    """Smoke test: the structured anonymise/pseudonymise job is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_anonymize_pseudonymize_structured_job_s3_is_callable():
    """Smoke test: the S3 structured anonymise/pseudonymise job is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_depseudonymize_structured_job_is_callable():
    """Smoke test: the structured depseudonymise job is callable and exposes execute_in_process."""
    job = depseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_depseudonymize_structured_job_s3_is_callable():
    """Smoke test: the S3 structured depseudonymise job is callable and exposes execute_in_process."""
    job = depseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_anonymize_pseudonymize_unstructured_job_is_callable():
    """Smoke test: the unstructured anonymise/pseudonymise job is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_anonymize_pseudonymize_unstructured_job_s3_is_callable():
    """Smoke test: the S3 unstructured anonymise/pseudonymise job is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_depseudonymize_unstructured_job_is_callable():
    """Smoke test: the unstructured depseudonymise job is callable and exposes execute_in_process."""
    job = depseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_depseudonymize_unstructured_job_s3_is_callable():
    """Smoke test: the S3 unstructured depseudonymise job is callable and exposes execute_in_process."""
    job = depseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')