feat(SIMPL-24642): migrate tests from 3 source repos with updated imports

This commit is contained in:
ILay
2026-04-24 18:42:07 +02:00
parent 4e0b216410
commit d14b2dfac4
26 changed files with 6280 additions and 0 deletions

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,54 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.base_config import BaseConfiguration
def test_valid_configuration_with_overrides():
    """Explicit keyword values must be readable back unchanged from the model."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 10.0,
        "generalisation_hierarchies": {"age": "age_hierarchy"},
    }
    config = BaseConfiguration(**overrides)

    assert config.ident == ["id"]
    assert config.quasi_identifiers == ["age"]
    assert config.supp_level == 10.0
    assert config.generalisation_hierarchies == {"age": "age_hierarchy"}
def test_default_values_are_loaded():
    """A no-argument ``BaseConfiguration`` must expose the expected defaults."""
    expected_defaults = {
        "ident": ["Name"],
        "quasi_identifiers": ["Age"],
        "supp_level": 50.0,
        "generalisation_hierarchies": {"Age": "simpl_age"},
    }
    config = BaseConfiguration()

    # Dict order is insertion order, so fields are checked in the listed sequence.
    for field_name, expected in expected_defaults.items():
        assert getattr(config, field_name) == expected
def test_missing_ident_raises_error():
    """Passing an empty ``ident`` list is expected to raise ``ValidationError``."""
    no_identifiers = []
    with pytest.raises(ValidationError):
        BaseConfiguration(ident=no_identifiers)
def test_missing_quasi_ident_raises_error():
    """Passing an empty ``quasi_identifiers`` list is expected to raise ``ValidationError``."""
    no_quasi_identifiers = []
    with pytest.raises(ValidationError):
        BaseConfiguration(quasi_identifiers=no_quasi_identifiers)
def test_overlap_between_ident_and_quasi_identifiers():
    """Listing one column as both identifier and quasi-identifier must fail validation."""
    shared_column = "age"
    with pytest.raises(ValidationError):
        BaseConfiguration(
            ident=[shared_column],
            quasi_identifiers=[shared_column],
        )
def test_supp_level_bounds():
    """A ``supp_level`` of 150.0 is expected to raise ``ValidationError``."""
    too_high = 150.0  # out of range
    with pytest.raises(ValidationError):
        BaseConfiguration(supp_level=too_high)

View File

@@ -0,0 +1,48 @@
from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import (
simpl_age,
simpl_age2,
simpl_gender,
get_all_hierarchies,
)
def test_simpl_age_structure():
    """``simpl_age`` must be a dict whose level 0 is a 100-item list from 0 to 99."""
    assert isinstance(simpl_age, dict)
    assert 0 in simpl_age

    base_level = simpl_age[0]
    assert isinstance(base_level, list)
    # Level 0 should hold 100 entries, the first being 0 and the last 99.
    assert len(base_level) == 100
    assert base_level[0] == 0
    assert base_level[-1] == 99
def test_simpl_age2_structure():
    """``simpl_age2`` must be a dict with list-valued levels 0 and 1."""
    assert isinstance(simpl_age2, dict)

    levels = (0, 1)
    # Presence of both levels is checked first, then their types.
    for level in levels:
        assert level in simpl_age2
    for level in levels:
        assert isinstance(simpl_age2[level], list)
def test_simpl_gender_structure():
    """``simpl_gender`` must map level 0 to M/F/O and level 1 to three ``*`` values."""
    assert isinstance(simpl_gender, dict)
    for level in (0, 1):
        assert level in simpl_gender

    raw_values = simpl_gender[0]
    masked_values = simpl_gender[1]
    assert raw_values == ["M", "F", "O"]
    assert masked_values == ["*", "*", "*"]
def test_get_all_hierarchies():
    """``get_all_hierarchies`` must return a dict exposing the module-level hierarchies."""
    registry = get_all_hierarchies()

    # The registry itself is a dict.
    assert isinstance(registry, dict)

    # All three known hierarchies are registered under their variable names.
    for hierarchy_name in ("simpl_age", "simpl_age2", "simpl_gender"):
        assert hierarchy_name in registry

    # Entries are the very same objects as the module-level dicts, not copies.
    assert registry["simpl_age"] is simpl_age
    assert registry["simpl_gender"] is simpl_gender

View File

@@ -0,0 +1,41 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.k_anonymity_configuration import (
KAnonymityConfiguration,
)
def test_valid_k_anonymity_config_with_overrides():
    """Explicitly supplied k-anonymity settings must be stored as given."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "sensitive_attributes": ["disease"],
    }
    config = KAnonymityConfiguration(**settings)

    assert config.k == 3
    assert config.sensitive_attributes == ["disease"]
    assert config.generalisation_hierarchies == {"age": "age_hier"}
def test_default_values_are_loaded():
    """Omitting ``k`` and ``sensitive_attributes`` must yield the expected defaults."""
    config = KAnonymityConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        generalisation_hierarchies={"age": "age_hier"},
    )

    default_k = config.k
    assert default_k == 3
    default_sensitive = config.sensitive_attributes
    assert default_sensitive == ["Disease"]
def test_invalid_k_value_raises_error():
    """A ``k`` of 1 is expected to raise ``ValidationError``."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 1,  # invalid, must be >= 2
        "sensitive_attributes": ["disease"],
    }
    with pytest.raises(ValidationError):
        KAnonymityConfiguration(**settings)

View File

@@ -0,0 +1,44 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.l_diversity_configuration import (
LDiversityConfiguration,
)
def test_valid_l_diversity_config_with_overrides():
    """Explicitly supplied l-diversity settings must be stored as given."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "l": 2,
        "sensitive_attribute": "disease",
    }
    config = LDiversityConfiguration(**settings)

    assert config.k == 3
    assert config.l == 2
    assert config.sensitive_attribute == "disease"
def test_default_values_are_loaded():
    """Omitting ``k``, ``l`` and ``sensitive_attribute`` must yield the expected defaults."""
    config = LDiversityConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        generalisation_hierarchies={"age": "age_hier"},
    )

    expected_defaults = {"k": 2, "l": 3, "sensitive_attribute": "Disease"}
    # Fields are checked in insertion order: k, l, sensitive_attribute.
    for field_name, expected in expected_defaults.items():
        assert getattr(config, field_name) == expected
def test_invalid_l_value_raises_error():
    """An ``l`` of 0 is expected to raise ``ValidationError``."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "l": 0,  # invalid, must be >= 1
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        LDiversityConfiguration(**settings)

View File

@@ -0,0 +1,56 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.t_closeness_configuration import (
TClosenessConfiguration,
)
def test_valid_t_closeness_config_with_overrides():
    """Explicitly supplied t-closeness settings must be stored as given."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": 0.4,
        "sensitive_attribute": "disease",
    }
    config = TClosenessConfiguration(**settings)

    assert config.k == 3
    assert config.t == 0.4
    assert config.sensitive_attribute == "disease"
def test_default_values_are_loaded():
    """Omitting ``k``, ``t`` and ``sensitive_attribute`` must yield the expected defaults."""
    config = TClosenessConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        generalisation_hierarchies={"age": "age_hier"},
    )

    expected_defaults = {"k": 2, "t": 0.5, "sensitive_attribute": "Disease"}
    # Fields are checked in insertion order: k, t, sensitive_attribute.
    for field_name, expected in expected_defaults.items():
        assert getattr(config, field_name) == expected
def test_invalid_t_value_low():
    """A negative ``t`` (-0.1) is expected to raise ``ValidationError``."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": -0.1,  # invalid
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        TClosenessConfiguration(**settings)
def test_invalid_t_value_high():
    """A ``t`` of 2.0 is expected to raise ``ValidationError``."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": 2.0,  # invalid > 1
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        TClosenessConfiguration(**settings)

View File

@@ -0,0 +1,44 @@
from template_code_location.dataframe_level_anonymisation.jobs import (
k_anonymity_job,
l_diversity_job,
t_closeness_job,
k_anonymity_job_s3,
l_diversity_job_s3,
t_closeness_job_s3
)
def test_k_anonymity_job_is_callable():
    """Check that k_anonymity_job is callable and exposes ``execute_in_process``."""
    job = k_anonymity_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_l_diversity_job_is_callable():
    """Check that l_diversity_job is callable and exposes ``execute_in_process``."""
    job = l_diversity_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_t_closeness_job_is_callable():
    """Check that t_closeness_job is callable and exposes ``execute_in_process``."""
    job = t_closeness_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_k_anonymity_job_s3_is_callable():
    """Check that k_anonymity_job_s3 is callable and exposes ``execute_in_process``."""
    job = k_anonymity_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_l_diversity_job_s3_is_callable():
    """Check that l_diversity_job_s3 is callable and exposes ``execute_in_process``."""
    job = l_diversity_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_t_closeness_job_s3_is_callable():
    """Check that t_closeness_job_s3 is callable and exposes ``execute_in_process``."""
    job = t_closeness_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")

View File

@@ -0,0 +1,230 @@
import pytest
import pandas as pd
from unittest.mock import patch
from dagster import DagsterInvalidInvocationError, build_op_context
from template_code_location.dataframe_level_anonymisation.ops import (
apply_k_anonymity,
apply_l_diversity,
apply_t_closeness,
)
from template_code_location.dataframe_level_anonymisation.config_models import (
KAnonymityConfiguration,
LDiversityConfiguration,
TClosenessConfiguration,
)
# ---------------------------
# Fixtures
# ---------------------------
@pytest.fixture
def fake_df():
    """Provide a two-row DataFrame with ``id`` and ``age`` columns."""
    columns = {"id": [1, 2], "age": [30, 40]}
    return pd.DataFrame(columns)
@pytest.fixture
def k_config():
    """Provide a ``KAnonymityConfiguration`` with k=2 over the ``age`` column."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attributes": ["age"],
        "k": 2,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return KAnonymityConfiguration(**settings)
@pytest.fixture
def l_config():
    """Provide an ``LDiversityConfiguration`` with k=2, l=1 over the ``age`` column."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "l": 1,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return LDiversityConfiguration(**settings)
@pytest.fixture
def t_config():
    """Provide a ``TClosenessConfiguration`` with k=2, t=0.5 over the ``age`` column."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "t": 0.5,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return TClosenessConfiguration(**settings)
@pytest.fixture
def op_context():
    """Provide an op context created by ``build_op_context()`` with no arguments."""
    context = build_op_context()
    return context
# ---------------------------
# Helper for patching external functions
# ---------------------------
@pytest.fixture(autouse=True)
def patch_external_ops():
    """Replace the hierarchy lookup and anonymisation functions used by the ops module.

    Applied automatically to every test in this file. The mocks stay active
    until the test finishes (the fixture yields inside the ``with`` block).

    The patch targets must name the module the ops are actually imported
    from in this file (``template_code_location.dataframe_level_anonymisation.ops``);
    patching the pre-migration path would leave the real functions in place
    or fail to resolve the module.
    """
    ops_module = "template_code_location.dataframe_level_anonymisation.ops"
    with (
        patch(
            f"{ops_module}.get_all_hierarchies",
            return_value={"simpl_age": {0: [30, 40]}},
        ),
        patch(
            f"{ops_module}.k_anonymity",
            return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}),
        ),
        patch(
            f"{ops_module}.l_diversity",
            return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}),
        ),
        patch(
            f"{ops_module}.t_closeness",
            return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}),
        ),
    ):
        yield
# ---------------------------
# Tests for apply_k_anonymity
# ---------------------------
def test_apply_k_anonymity_outputs(op_context, k_config, fake_df):
    """``apply_k_anonymity`` must yield a DataFrame output followed by a metrics dict."""
    outputs = list(apply_k_anonymity(op_context, k_config, fake_df))
    assert len(outputs) == 2

    data_event, metrics_event = outputs
    anonymised = data_event.value
    metrics = metrics_event.value

    # Check types
    assert isinstance(anonymised, pd.DataFrame)
    assert isinstance(metrics, dict)
    for metric_key in ("k_anon", "l_div", "t_clos"):
        assert metric_key in metrics
# ---------------------------
# Tests for apply_l_diversity
# ---------------------------
def test_apply_l_diversity_outputs(op_context, l_config, fake_df):
    """``apply_l_diversity`` must yield a DataFrame output followed by a metrics dict."""
    outputs = list(apply_l_diversity(op_context, l_config, fake_df))
    assert len(outputs) == 2

    data_event, metrics_event = outputs
    anonymised = data_event.value
    metrics = metrics_event.value

    assert isinstance(anonymised, pd.DataFrame)
    assert isinstance(metrics, dict)
    for metric_key in ("k_anon", "l_div", "t_clos"):
        assert metric_key in metrics
def test_apply_l_diversity_empty_raises(op_context, l_config):
    """An empty result from ``l_diversity`` must surface as ``DagsterInvalidInvocationError``.

    The mock is installed on the module the op is imported from
    (``template_code_location...ops``) so that the op really sees the empty frame.
    """
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.l_diversity",
        return_value=pd.DataFrame(),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_l_diversity(op_context, l_config, single_row))
# ---------------------------
# Tests for apply_t_closeness
# ---------------------------
def test_apply_t_closeness_outputs(op_context, t_config, fake_df):
    """``apply_t_closeness`` must yield a DataFrame output followed by a metrics dict."""
    outputs = list(apply_t_closeness(op_context, t_config, fake_df))
    assert len(outputs) == 2

    data_event, metrics_event = outputs
    anonymised = data_event.value
    metrics = metrics_event.value

    assert isinstance(anonymised, pd.DataFrame)
    assert isinstance(metrics, dict)
    for metric_key in ("k_anon", "l_div", "t_clos"):
        assert metric_key in metrics
def test_apply_t_closeness_empty_raises(op_context, t_config):
    """An empty result from ``t_closeness`` must surface as ``DagsterInvalidInvocationError``.

    The mock is installed on the module the op is imported from
    (``template_code_location...ops``) so that the op really sees the empty frame.
    """
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        return_value=pd.DataFrame(),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, single_row))
# ---------------------------
# Additional tests for _validate_and_get_hierarchies
# ---------------------------
def test_validate_hierarchies_dataset_too_small(k_config):
    """A single-row DataFrame must be rejected with ``DagsterInvalidInvocationError``."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    # One row only; the k_config fixture asks for k=2.
    one_row_df = pd.DataFrame({"id": [1], "age": [30]})
    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, one_row_df)
def test_validate_hierarchies_missing_hierarchy(k_config, fake_df):
    """A config with no generalisation hierarchies must raise ``DagsterInvalidInvocationError``."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    # Copy the fixture config but blank out its hierarchy mapping.
    without_hierarchies = k_config.model_copy(update={"generalisation_hierarchies": {}})
    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(without_hierarchies, fake_df)
def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df):
    """A configured hierarchy name absent from the registry must raise.

    ``get_all_hierarchies`` is mocked to return an empty dict on the module
    that ``_validate_and_get_hierarchies`` is imported from, so the name
    configured in ``k_config`` cannot be found.
    """
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.get_all_hierarchies",
        return_value={},
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            _validate_and_get_hierarchies(k_config, fake_df)
# ---------------------------
# Additional tests for _calc_dataframe_metrics
# ---------------------------
def test_calc_dataframe_metrics_basic():
    """``_calc_dataframe_metrics`` must report the values returned by the metric functions.

    The three ``anonymity`` functions are mocked through the ``anonymity``
    attribute of the ops module that the helper is imported from, so the
    helper under test is the one that sees the mocked return values.
    """
    from template_code_location.dataframe_level_anonymisation.ops import _calc_dataframe_metrics

    df_org = pd.DataFrame({"age": [30, 40], "id": [1, 2]})
    df_anon = df_org.copy()
    anonymity_path = "template_code_location.dataframe_level_anonymisation.ops.anonymity"
    with (
        patch(f"{anonymity_path}.k_anonymity", return_value=2),
        patch(f"{anonymity_path}.l_diversity", return_value=1),
        patch(f"{anonymity_path}.t_closeness", return_value=0.1),
    ):
        report, metrics = _calc_dataframe_metrics(df_anon, df_org, ["age"], ["age"])
        assert "k-anonymity" in report
        assert metrics["k_anon"] == 2
        assert metrics["l_div"] == 1
        assert metrics["t_clos"] == 0.1
# ---------------------------
# Tests for apply_t_closeness exception branches
# ---------------------------
def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df):
    """Covers the branch where ValueError contains 'Cannot be quasi-identifiers'.

    ``t_closeness`` is mocked on the module the op is imported from so the
    op actually receives the raised ``ValueError``.
    """
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        side_effect=ValueError("Cannot be quasi-identifiers invalid"),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))
def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df):
    """Covers the branch where ValueError is raised but message does NOT contain that substring.

    ``t_closeness`` is mocked on the module the op is imported from so the
    op actually receives the raised ``ValueError``.
    """
    with patch(
        "template_code_location.dataframe_level_anonymisation.ops.t_closeness",
        side_effect=ValueError("Some other error"),
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))

View File

@@ -0,0 +1,70 @@
import numpy as np
from template_code_location.dataframe_level_anonymisation.utils import (
parse_value_list,
normalize_hierarchy_levels,
)
# ------------------------------------
# Tests for parse_value_list
# ------------------------------------
def test_parse_value_list_all_strings_digits():
    """Digit-only strings are expected to come back as ints."""
    parsed = parse_value_list(["1", "2", "3"])
    assert parsed == [1, 2, 3]
def test_parse_value_list_mixed_values():
    """Digit strings become ints while ints and non-digit strings pass through."""
    parsed = parse_value_list(["1", 2, "abc", "5"])
    assert parsed == [1, 2, "abc", 5]
def test_parse_value_list_no_digits():
    """Non-digit strings are expected to be returned unchanged."""
    parsed = parse_value_list(["a", "b", "c"])
    assert parsed == ["a", "b", "c"]
# ------------------------------------
# Tests for normalize_hierarchy_levels
# ------------------------------------
def test_normalize_hierarchy_levels_level_0_converted_to_numpy_array():
    """Level "0" must come back under int key 0 as a numpy array of ints."""
    raw = {"age": {"0": ["1", "2", "3"], "1": ["0-10", "11-20"]}}
    result = normalize_hierarchy_levels(raw)

    assert "age" in result
    assert 0 in result["age"]
    assert isinstance(result["age"][0], np.ndarray)
    # Digit strings at level 0 are expected as ints.
    assert result["age"][0].tolist() == [1, 2, 3]
    # Level 1 keeps its original list of strings.
    assert result["age"][1] == ["0-10", "11-20"]
def test_normalize_hierarchy_levels_multiple_columns():
    """Each column's level 0 must become a numpy array, independently of the others."""
    raw = {"age": {"0": ["10", "20"]}, "gender": {"0": ["M", "F"], "1": ["*"]}}
    result = normalize_hierarchy_levels(raw)

    # Numeric column
    age_base = result["age"][0]
    assert isinstance(age_base, np.ndarray)
    assert age_base.tolist() == [10, 20]

    # Categorical column
    gender_base = result["gender"][0]
    assert isinstance(gender_base, np.ndarray)
    assert gender_base.tolist() == ["M", "F"]
    assert result["gender"][1] == ["*"]
def test_normalize_hierarchy_levels_mixed_digit_non_digit_at_level_0():
    """A level 0 mixing digit and non-digit strings must list back as all strings."""
    raw = {"test": {"0": ["1", "x", "3"]}}
    result = normalize_hierarchy_levels(raw)

    base_level = result["test"][0]
    assert isinstance(base_level, np.ndarray)
    assert base_level.tolist() == ["1", "x", "3"]
def test_normalize_hierarchy_levels_empty_mapping():
    """A column with no levels must be returned as an empty mapping."""
    raw = {"col": {}}
    result = normalize_hierarchy_levels(raw)
    assert result == {"col": {}}