# Unit tests for the dataframe-level anonymisation ops (k-anonymity, l-diversity, t-closeness).
import pytest
|
|
import pandas as pd
|
|
from unittest.mock import patch
|
|
from dagster import DagsterInvalidInvocationError, build_op_context
|
|
|
|
from template_code_location.dataframe_level_anonymisation.ops import (
|
|
apply_k_anonymity,
|
|
apply_l_diversity,
|
|
apply_t_closeness,
|
|
)
|
|
from template_code_location.dataframe_level_anonymisation.config_models import (
|
|
KAnonymityConfiguration,
|
|
LDiversityConfiguration,
|
|
TClosenessConfiguration,
|
|
)
|
|
|
|
|
|
# ---------------------------
# Fixtures
# ---------------------------
|
|
@pytest.fixture
def fake_df():
    """Provide a two-row frame with an ``id`` column and an ``age`` column."""
    columns = {"id": [1, 2], "age": [30, 40]}
    return pd.DataFrame(columns)
|
|
|
|
|
|
@pytest.fixture
def k_config():
    """Build the ``KAnonymityConfiguration`` shared by the k-anonymity tests."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attributes": ["age"],
        "k": 2,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return KAnonymityConfiguration(**settings)
|
|
|
|
|
|
@pytest.fixture
def l_config():
    """Build the ``LDiversityConfiguration`` shared by the l-diversity tests."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "l": 1,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return LDiversityConfiguration(**settings)
|
|
|
|
|
|
@pytest.fixture
def t_config():
    """Build the ``TClosenessConfiguration`` shared by the t-closeness tests."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "t": 0.5,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return TClosenessConfiguration(**settings)
|
|
|
|
|
|
@pytest.fixture
def op_context():
    """Provide an op context created by ``build_op_context`` with no arguments."""
    context = build_op_context()
    return context
|
|
|
|
|
|
# ---------------------------
# Helper for patching external functions
# ---------------------------
|
|
@pytest.fixture(autouse=True)
def patch_external_ops():
    """Stub the hierarchy lookup and the three anonymisation calls for every test.

    The fixture is ``autouse``, so the patches are active for the whole
    duration of each test in this module and are undone afterwards.

    The targets are addressed through the same module path the tests import
    the ops from: ``mock.patch`` has to replace a name in the namespace where
    it is looked up, and a different import path may resolve to a separate
    module object that the functions under test never consult.
    """
    ops_module = "template_code_location.dataframe_level_anonymisation.ops"

    def stub_frame():
        # A fresh frame per patch so the stubs never share a mutable object.
        return pd.DataFrame({"id": [1, 2], "age": [30, 40]})

    with (
        patch(
            f"{ops_module}.get_all_hierarchies",
            return_value={"simpl_age": {0: [30, 40]}},
        ),
        patch(f"{ops_module}.k_anonymity", return_value=stub_frame()),
        patch(f"{ops_module}.l_diversity", return_value=stub_frame()),
        patch(f"{ops_module}.t_closeness", return_value=stub_frame()),
    ):
        yield
|
|
|
|
|
|
# ---------------------------
# Tests for apply_k_anonymity
# ---------------------------
|
|
def test_apply_k_anonymity_outputs(op_context, k_config, fake_df):
    """The k-anonymity op yields exactly two outputs: a frame, then a metrics dict."""
    outputs = list(apply_k_anonymity(op_context, k_config, fake_df))
    assert len(outputs) == 2

    frame, metrics = (item.value for item in outputs)

    # First output is the data, second is the metrics mapping.
    assert isinstance(frame, pd.DataFrame)
    assert isinstance(metrics, dict)
    for expected_key in ("k_anon", "l_div", "t_clos"):
        assert expected_key in metrics
|
|
|
|
|
|
# ---------------------------
# Tests for apply_l_diversity
# ---------------------------
|
|
def test_apply_l_diversity_outputs(op_context, l_config, fake_df):
    """The l-diversity op yields exactly two outputs: a frame, then a metrics dict."""
    outputs = list(apply_l_diversity(op_context, l_config, fake_df))
    assert len(outputs) == 2

    frame, metrics = (item.value for item in outputs)

    assert isinstance(frame, pd.DataFrame)
    assert isinstance(metrics, dict)
    for expected_key in ("k_anon", "l_div", "t_clos"):
        assert expected_key in metrics
|
|
|
|
|
|
def test_apply_l_diversity_empty_raises(op_context, l_config, fake_df):
    """An empty frame returned by ``l_diversity`` must raise ``DagsterInvalidInvocationError``."""
    # Patch the name on the module path the op is imported from, so the stub
    # is the object the op actually looks up.
    target = "template_code_location.dataframe_level_anonymisation.ops.l_diversity"

    # NOTE(review): the two-row fixture is used instead of a one-row frame.
    # The dataset-too-small test in this module shows a one-row frame with
    # k=2 is rejected by validation; presumably the op runs that validation
    # first, in which case a one-row input would raise before the patched
    # call and the empty-result branch would never be exercised. Confirm
    # against the op implementation.
    with (
        patch(target, return_value=pd.DataFrame()),
        pytest.raises(DagsterInvalidInvocationError),
    ):
        list(apply_l_diversity(op_context, l_config, fake_df))
|
|
|
|
|
|
# ---------------------------
# Tests for apply_t_closeness
# ---------------------------
|
|
def test_apply_t_closeness_outputs(op_context, t_config, fake_df):
    """The t-closeness op yields exactly two outputs: a frame, then a metrics dict."""
    outputs = list(apply_t_closeness(op_context, t_config, fake_df))
    assert len(outputs) == 2

    frame, metrics = (item.value for item in outputs)

    assert isinstance(frame, pd.DataFrame)
    assert isinstance(metrics, dict)
    for expected_key in ("k_anon", "l_div", "t_clos"):
        assert expected_key in metrics
|
|
|
|
|
|
def test_apply_t_closeness_empty_raises(op_context, t_config, fake_df):
    """An empty frame returned by ``t_closeness`` must raise ``DagsterInvalidInvocationError``."""
    # Patch the name on the module path the op is imported from, so the stub
    # is the object the op actually looks up.
    target = "template_code_location.dataframe_level_anonymisation.ops.t_closeness"

    # NOTE(review): the two-row fixture is used instead of a one-row frame.
    # The dataset-too-small test in this module shows a one-row frame with
    # k=2 is rejected by validation; presumably the op runs that validation
    # first, in which case a one-row input would raise before the patched
    # call and the empty-result branch would never be exercised. Confirm
    # against the op implementation.
    with (
        patch(target, return_value=pd.DataFrame()),
        pytest.raises(DagsterInvalidInvocationError),
    ):
        list(apply_t_closeness(op_context, t_config, fake_df))
|
|
|
|
|
|
# ---------------------------
# Additional tests for _validate_and_get_hierarchies
# ---------------------------
|
|
def test_validate_hierarchies_dataset_too_small(k_config):
    """A one-row frame with the k=2 configuration is rejected by the validator."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    single_row = pd.DataFrame({"id": [1], "age": [30]})

    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, single_row)
|
|
|
|
|
|
def test_validate_hierarchies_missing_hierarchy(k_config, fake_df):
    """A configuration with no generalisation hierarchies is rejected by the validator."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    # Copy the fixture config, blanking out the hierarchy mapping only.
    overrides = {"generalisation_hierarchies": {}}
    config_without_hierarchies = k_config.model_copy(update=overrides)

    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(config_without_hierarchies, fake_df)
|
|
|
|
|
|
def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df):
    """A configured hierarchy name that the hierarchy lookup does not return is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    # Patch the lookup on the same module path the validator is imported
    # from; patching another import path may leave this module untouched.
    target = "template_code_location.dataframe_level_anonymisation.ops.get_all_hierarchies"

    with patch(target, return_value={}), pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, fake_df)
|
|
|
|
|
|
# ---------------------------
# Additional tests for _calc_dataframe_metrics
# ---------------------------
|
|
def test_calc_dataframe_metrics_basic():
    """``_calc_dataframe_metrics`` reports the values returned by the stubbed metric functions."""
    from template_code_location.dataframe_level_anonymisation.ops import _calc_dataframe_metrics

    original = pd.DataFrame({"age": [30, 40], "id": [1, 2]})
    anonymised = original.copy()

    # Resolve the patch targets through the same module path used for the
    # import above, so the test does not depend on the package also being
    # importable under a second, top-level name.
    metrics_module = "template_code_location.dataframe_level_anonymisation.ops.anonymity"

    with (
        patch(f"{metrics_module}.k_anonymity", return_value=2),
        patch(f"{metrics_module}.l_diversity", return_value=1),
        patch(f"{metrics_module}.t_closeness", return_value=0.1),
    ):
        report, metrics = _calc_dataframe_metrics(anonymised, original, ["age"], ["age"])

    assert "k-anonymity" in report
    assert metrics["k_anon"] == 2
    assert metrics["l_div"] == 1
    assert metrics["t_clos"] == 0.1
|
|
|
|
|
|
# ---------------------------
# Tests for apply_t_closeness exception branches
# ---------------------------
|
|
def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df):
    """Covers the branch where ValueError contains 'Cannot be quasi-identifiers'."""
    # Patch on the module path the op is imported from, so the raising stub
    # is the object the op actually calls.
    target = "template_code_location.dataframe_level_anonymisation.ops.t_closeness"
    failure = ValueError("Cannot be quasi-identifiers invalid")

    with patch(target, side_effect=failure), pytest.raises(DagsterInvalidInvocationError):
        list(apply_t_closeness(op_context, t_config, fake_df))
|
|
|
|
|
|
def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df):
    """Covers the branch where ValueError is raised but message does NOT contain that substring."""
    # Patch on the module path the op is imported from, so the raising stub
    # is the object the op actually calls.
    target = "template_code_location.dataframe_level_anonymisation.ops.t_closeness"
    failure = ValueError("Some other error")

    with patch(target, side_effect=failure), pytest.raises(DagsterInvalidInvocationError):
        list(apply_t_closeness(op_context, t_config, fake_df))
|