from unittest.mock import patch

import pandas as pd
import pytest
from dagster import DagsterInvalidInvocationError, build_op_context

from template_code_location.dataframe_level_anonymisation.config_models import (
    KAnonymityConfiguration,
    LDiversityConfiguration,
    TClosenessConfiguration,
)
from template_code_location.dataframe_level_anonymisation.ops import (
    apply_k_anonymity,
    apply_l_diversity,
    apply_t_closeness,
)

# Dotted path of the module under test. ``patch`` has to target the very module
# the ops are imported from above; patching a differently named module leaves
# the names looked up by the ops untouched.
_OPS_MODULE = "template_code_location.dataframe_level_anonymisation.ops"


def _two_row_frame():
    """Return a fresh two-row frame with ``id`` and ``age`` columns."""
    return pd.DataFrame({"id": [1, 2], "age": [30, 40]})


def _assert_data_and_metrics_outputs(results):
    """Assert that an op yielded a DataFrame output followed by a metrics dict.

    Args:
        results: The list of outputs collected from invoking one of the ops.
    """
    assert len(results) == 2
    data_output = results[0].value
    metrics_output = results[1].value

    assert isinstance(data_output, pd.DataFrame)
    assert isinstance(metrics_output, dict)
    for metric_key in ("k_anon", "l_div", "t_clos"):
        assert metric_key in metrics_output


# ---------------------------
# Fixtures
# ---------------------------
@pytest.fixture
def fake_df():
    """Two-row input frame shared by the op tests."""
    return _two_row_frame()


@pytest.fixture
def k_config():
    """k-anonymity configuration with ``k=2`` and ``age`` as quasi-identifier."""
    return KAnonymityConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        sensitive_attributes=["age"],
        k=2,
        supp_level=0.0,
        generalisation_hierarchies={"age": "simpl_age"},
    )


@pytest.fixture
def l_config():
    """l-diversity configuration with ``k=2`` and ``l=1``."""
    return LDiversityConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        sensitive_attribute="age",
        k=2,
        l=1,
        supp_level=0.0,
        generalisation_hierarchies={"age": "simpl_age"},
    )


@pytest.fixture
def t_config():
    """t-closeness configuration with ``k=2`` and ``t=0.5``."""
    return TClosenessConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        sensitive_attribute="age",
        k=2,
        t=0.5,
        supp_level=0.0,
        generalisation_hierarchies={"age": "simpl_age"},
    )


@pytest.fixture
def op_context():
    """Dagster op context created with ``build_op_context()`` defaults."""
    return build_op_context()


# ---------------------------
# Helper for patching external functions
# ---------------------------
@pytest.fixture(autouse=True)
def patch_external_ops():
    """Stub the hierarchy lookup and the three anonymisation routines.

    Applied automatically to every test in this module. Individual tests may
    layer their own ``patch`` on top to override a single stub.
    """
    with (
        patch(
            f"{_OPS_MODULE}.get_all_hierarchies",
            return_value={"simpl_age": {0: [30, 40]}},
        ),
        # Each stub gets its own frame so no test can leak state through a
        # shared return value.
        patch(f"{_OPS_MODULE}.k_anonymity", return_value=_two_row_frame()),
        patch(f"{_OPS_MODULE}.l_diversity", return_value=_two_row_frame()),
        patch(f"{_OPS_MODULE}.t_closeness", return_value=_two_row_frame()),
    ):
        yield


# ---------------------------
# Tests for apply_k_anonymity
# ---------------------------
def test_apply_k_anonymity_outputs(op_context, k_config, fake_df):
    """``apply_k_anonymity`` yields the data frame and the metrics dict."""
    results = list(apply_k_anonymity(op_context, k_config, fake_df))
    _assert_data_and_metrics_outputs(results)


# ---------------------------
# Tests for apply_l_diversity
# ---------------------------
def test_apply_l_diversity_outputs(op_context, l_config, fake_df):
    """``apply_l_diversity`` yields the data frame and the metrics dict."""
    results = list(apply_l_diversity(op_context, l_config, fake_df))
    _assert_data_and_metrics_outputs(results)


def test_apply_l_diversity_empty_raises(op_context, l_config):
    """An empty ``l_diversity`` result is reported as an invocation error."""
    # NOTE(review): the input has one row while the config asks for k=2, so the
    # op may already reject it during validation, before the stub is reached.
    # Confirm this exercises the empty-result branch as intended.
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    with patch(f"{_OPS_MODULE}.l_diversity", return_value=pd.DataFrame()):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_l_diversity(op_context, l_config, single_row))


# ---------------------------
# Tests for apply_t_closeness
# ---------------------------
def test_apply_t_closeness_outputs(op_context, t_config, fake_df):
    """``apply_t_closeness`` yields the data frame and the metrics dict."""
    results = list(apply_t_closeness(op_context, t_config, fake_df))
    _assert_data_and_metrics_outputs(results)


def test_apply_t_closeness_empty_raises(op_context, t_config):
    """An empty ``t_closeness`` result is reported as an invocation error."""
    # NOTE(review): the input has one row while the config asks for k=2, so the
    # op may already reject it during validation, before the stub is reached.
    # Confirm this exercises the empty-result branch as intended.
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    with patch(f"{_OPS_MODULE}.t_closeness", return_value=pd.DataFrame()):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, single_row))
# ---------------------------
# Additional tests for _validate_and_get_hierarchies
# ---------------------------
def test_validate_hierarchies_dataset_too_small(k_config):
    """A one-row frame is rejected when the configuration asks for ``k=2``."""
    from template_code_location.dataframe_level_anonymisation.ops import (
        _validate_and_get_hierarchies,
    )

    small_df = pd.DataFrame({"id": [1], "age": [30]})
    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, small_df)


def test_validate_hierarchies_missing_hierarchy(k_config, fake_df):
    """A configuration with no generalisation hierarchies is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import (
        _validate_and_get_hierarchies,
    )

    bad_config = k_config.model_copy(update={"generalisation_hierarchies": {}})
    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(bad_config, fake_df)


def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df):
    """A configured hierarchy name missing from the lookup is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import (
        _validate_and_get_hierarchies,
    )

    # Patch the module the function is imported from; a different module path
    # would leave the lookup used by the function under test unchanged.
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.get_all_hierarchies",
        return_value={},
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            _validate_and_get_hierarchies(k_config, fake_df)


# ---------------------------
# Additional tests for _calc_dataframe_metrics
# ---------------------------
def test_calc_dataframe_metrics_basic():
    """Stubbed metric values are passed through to the report and metrics."""
    from template_code_location.dataframe_level_anonymisation.ops import (
        _calc_dataframe_metrics,
    )

    df_org = pd.DataFrame({"age": [30, 40], "id": [1, 2]})
    df_anon = df_org.copy()

    # The metric functions are reached through the ``anonymity`` name of the
    # ops module, so that is where they are replaced.
    anonymity_path = (
        "template_code_location.dataframe_level_anonymisation.ops.anonymity"
    )
    with (
        patch(f"{anonymity_path}.k_anonymity", return_value=2),
        patch(f"{anonymity_path}.l_diversity", return_value=1),
        patch(f"{anonymity_path}.t_closeness", return_value=0.1),
    ):
        report, metrics = _calc_dataframe_metrics(df_anon, df_org, ["age"], ["age"])

    assert "k-anonymity" in report
    assert metrics["k_anon"] == 2
    assert metrics["l_div"] == 1
    assert metrics["t_clos"] == 0.1


# ---------------------------
# Tests for apply_t_closeness exception branches
# ---------------------------
def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df):
    """Covers the branch where ValueError contains 'Cannot be quasi-identifiers'."""
    # Patch the module ``apply_t_closeness`` is imported from so the op
    # actually sees the failing stub.
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        side_effect=ValueError("Cannot be quasi-identifiers invalid"),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))


def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df):
    """Covers the branch where ValueError is raised but message does NOT contain that substring."""
    # Patch the module ``apply_t_closeness`` is imported from so the op
    # actually sees the failing stub.
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        side_effect=ValueError("Some other error"),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))