change to import from modules

This commit is contained in:
ILay
2026-05-06 10:58:17 +02:00
parent 733a38e128
commit 004bcd5c01
61 changed files with 10 additions and 8258 deletions

View File

@@ -41,6 +41,10 @@ dependencies = [
"cryptography>=42.0.0", "cryptography>=42.0.0",
# Util services — resolved via [tool.uv.sources] (git) # Util services — resolved via [tool.uv.sources] (git)
"util-services", "util-services",
# Code location packages — resolved via [tool.uv.sources] (git)
"data-processing",
"dataframe-level-anonymisation",
"field-level-pseudo-anonymisation",
] ]
[tool.uv] [tool.uv]
@@ -49,6 +53,9 @@ exclude-dependencies = ["transformers", "spacy-transformers"]
[tool.uv.sources] [tool.uv.sources]
torch = { index = "pytorch-cpu" } torch = { index = "pytorch-cpu" }
util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.5.0" } util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.5.0" }
data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", branch = "feature/SIMPL-24642" }
dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", branch = "feature/SIMPL-24642" }
field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", branch = "feature/SIMPL-24642" }
[[tool.uv.index]] [[tool.uv.index]]
name = "pytorch-cpu" name = "pytorch-cpu"

View File

@@ -1,18 +0,0 @@
"""Configuration models for data processing."""
from .columns_select_configuration import ColumnsSelectConfiguration
from .fill_missing_config import FillMissingConfiguration
from .spell_check_configuration import SpellCheckConfiguration
from .coordinates_normalization_configuration import CoordinatesNormalizationConfiguration
from .aggregation_configuration import AggregationConfiguration
from .filter_configuration import DatasetFilterConfiguration, FilterCondition
__all__ = [
"ColumnsSelectConfiguration",
"FillMissingConfiguration",
"SpellCheckConfiguration",
"CoordinatesNormalizationConfiguration",
"AggregationConfiguration",
"FilterCondition",
"DatasetFilterConfiguration"
]

View File

@@ -1,25 +0,0 @@
from typing import List
from pydantic import Field, field_validator
from .columns_select_configuration import ColumnsSelectConfiguration
class AggregationConfiguration(ColumnsSelectConfiguration):
    """Column selection extended with the aggregation operation to apply."""

    operation: str = Field(
        description="Aggregation operations: sum, mean, min, max, count",
        default="sum",
    )

    @field_validator("operation")
    @classmethod
    def validate_operations(cls, value):
        """Return ``value`` unchanged when it is a supported operation.

        Raises:
            ValueError: if ``value`` is not one of sum, mean, min, max, count.
        """
        supported = {"sum", "mean", "min", "max", "count"}
        if value in supported:
            return value
        message = (
            f"Invalid aggregation operation '{value}'. "
            f"Allowed values: {supported}"
        )
        raise ValueError(message)

View File

@@ -1,17 +0,0 @@
from typing import List
from pydantic import Field,field_validator
from dagster import Config
class ColumnsSelectConfiguration(Config):
    """Configuration naming the DataFrame columns an op should process."""

    columns: List[str] = Field(
        description="List of columns to process.", default=["Name"]
    )

    @field_validator("columns")
    @classmethod
    def ensure_unique_columns(cls, v: List[str]) -> List[str]:
        """Drop repeated column names, keeping the first occurrence of each."""
        already_seen = set()
        deduplicated = []
        for name in v:
            if name not in already_seen:
                already_seen.add(name)
                deduplicated.append(name)
        return deduplicated

View File

@@ -1,22 +0,0 @@
from typing import Optional
from pydantic import Field, model_validator
from dagster import Config
class CoordinatesNormalizationConfiguration(Config):
    """Names of the latitude and longitude columns used for coordinate normalization."""

    latColumn: Optional[str] = Field(
        default="lat", description="Latitude column name"
    )
    lonColumn: Optional[str] = Field(
        default="lon", description="Longitude column name"
    )

    @model_validator(mode="before")
    @classmethod
    def replace_nulls_with_defaults(cls, values):
        """Substitute the default column names for missing or ``None`` entries.

        Runs before field validation, so ``values`` is the raw input. Mapping-like
        input is copied first so the caller's object is never modified; anything
        without a ``get`` method is handed back untouched for pydantic to validate.
        """
        if not hasattr(values, "get"):
            return values
        patched = dict(values)
        for field_name, fallback in (("latColumn", "lat"), ("lonColumn", "lon")):
            if patched.get(field_name) is None:
                patched[field_name] = fallback
        return patched

View File

@@ -1,9 +0,0 @@
from typing import Dict
from dagster import Config
from pydantic import Field
class FillMissingConfiguration(Config):
    """Configuration mapping column names to the value used for their missing entries."""

    fill_map: Dict[str, str] = Field(
        description="Missing values filling map.",
        default={"Age": "UNKNOWN_AGE"},
    )

View File

@@ -1,52 +0,0 @@
from enum import Enum
import operator
from typing import List, Literal, Callable
from pydantic import Field, model_validator
from dagster import Config
import pandas as pd
class FilterOperator(str, Enum):
    """Comparison operators accepted in a filter condition, valued by their symbol."""

    EQ = "=="
    NE = "!="
    LT = "<"
    LE = "<="
    GT = ">"
    GE = ">="

    @property
    def function(self) -> Callable:
        """Return the ``operator`` module function that implements this comparison."""
        # Member names mirror the operator module: EQ -> operator.eq, LE -> operator.le, ...
        return getattr(operator, self.name.lower())
class FilterCondition(Config):
    """A single column comparison used to filter a dataset."""

    column: str = Field(..., description="Name of the column to filter")
    type: Literal["string", "numeric"] = Field(..., description="Column type (string or numeric)")
    value: str = Field(..., description="Value to compare against")
    op: FilterOperator = Field(
        description="Operator to apply (string supports only EQ and NE)",
        default=FilterOperator.EQ,
    )

    @model_validator(mode="after")
    def check_operator_compatibility(self) -> "FilterCondition":
        """Reject ordering operators on string-typed conditions.

        Raises:
            ValueError: if ``type`` is "string" and ``op`` is neither EQ nor NE.
        """
        is_equality_op = self.op in [FilterOperator.EQ, FilterOperator.NE]
        if self.type != "string" or is_equality_op:
            return self
        raise ValueError(
            f"Invalid operator '{self.op.name}' for type 'string'. "
            "Only EQ (==) and NE (!=) are allowed."
        )

    def apply(self, df: pd.DataFrame) -> pd.Series:
        """Compare ``df[self.column]`` against the configured value.

        Numeric conditions convert the configured value with ``float`` first;
        string conditions compare against the raw text.
        """
        if self.type == "numeric":
            operand = float(self.value)
        else:
            operand = self.value
        compare = self.op.function
        return compare(df[self.column], operand)
class DatasetFilterConfiguration(Config):
    """Configuration holding the list of filter conditions for a dataset."""

    conditions: List[FilterCondition] = Field(
        description=(
            "List of filter conditions to apply on the dataset. "
            "String columns support only 'EQ' and 'NE', numeric columns also support 'LT', 'LE', 'GT' and 'GE'."
        ),
        default=[],
    )

View File

@@ -1,8 +0,0 @@
from typing import Literal
from pydantic import Field
from .columns_select_configuration import ColumnsSelectConfiguration
class SpellCheckConfiguration(ColumnsSelectConfiguration):
    """Column selection extended with the language used for spell checking."""

    language: Literal["en", "es", "it", "fr", "pt", "de", "nl"] = Field(
        description="Language to use in the SpellChecker module.",
        default="en",
    )

View File

@@ -1,119 +0,0 @@
from dagster import job
from util_services.util_ops import (
preview_dataframe,
read_structured_from_s3,
write_df_to_s3,
)
from .ops import (
remove_duplicates,
fill_missing_values,
standardize_categorical_values,
correct_typos,
normalize_numeric_min_max,
normalize_datetime,
normalize_coordinates,
add_global_aggregations,
filter_dataset
)
# Job graph: read_structured_from_s3 -> remove_duplicates -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def remove_duplicates_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the de-duplicated frame, not an anonymised one.
    anon_df = remove_duplicates(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> fill_missing_values -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def fill_missing_values_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the frame with missing values filled.
    anon_df = fill_missing_values(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> standardize_categorical_values -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def standardize_categorical_values_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the standardized frame, not an anonymised one.
    anon_df = standardize_categorical_values(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> correct_typos -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def correct_typos_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the spell-corrected frame, not an anonymised one.
    anon_df = correct_typos(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> normalize_numeric_min_max -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def normalize_numeric_min_max_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the frame with the added `_norm` columns.
    anon_df = normalize_numeric_min_max(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> normalize_datetime -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def normalize_datetime_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the frame with the added `_iso` columns.
    anon_df = normalize_datetime(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> normalize_coordinates -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def normalize_coordinates_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the coordinate-normalized frame, not an anonymised one.
    anon_df = normalize_coordinates(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> add_global_aggregations -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def add_global_aggregations_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the aggregated frame, not an anonymised one.
    anon_df = add_global_aggregations(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> filter_dataset -> write_df_to_s3.
# preview_dataframe is invoked on both the input frame and the processed frame.
@job(tags={
    "business_operation": "PROCESSING",
    "resource_type": "RD_DATA"
})
def filter_dataset_job_s3():
    org_df = read_structured_from_s3()
    # NOTE: despite its name, `anon_df` is the filtered frame, not an anonymised one.
    anon_df = filter_dataset(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)

View File

@@ -1,256 +0,0 @@
import pandas as pd
from dagster import Out, op
from spellchecker import SpellChecker
from template_code_location.data_processing.config_models import (
AggregationConfiguration,
ColumnsSelectConfiguration,
CoordinatesNormalizationConfiguration,
FillMissingConfiguration,
SpellCheckConfiguration,
DatasetFilterConfiguration
)
def _parse_dms_to_decimal(value):
    """Convert a coordinate given as DMS text (or a plain number) to decimal degrees.

    Parsing is delegated to PyGeodesy's ``parseDMS``; when that raises, the
    text is tried as a plain float instead.

    Examples of inputs the original author lists as supported:
        - 40°26'46"N
        - 40 26 46 N
        - 40:26:46N
        - 40d26m46sN
        - -40.446 (already decimal)

    Returns ``None`` for missing or blank input and when neither parser succeeds.
    """
    from pygeodesy.dms import parseDMS

    if pd.isna(value):
        return None
    text = str(value).strip()
    if text == "":
        return None
    # Try the DMS parser first, then fall back to a plain float conversion.
    for parser in (parseDMS, float):
        try:
            return float(parser(text))
        except (ValueError, TypeError):
            continue
    return None
@op(out={"data": Out()})
def remove_duplicates(context, df: pd.DataFrame):
    """Return the DataFrame without duplicated rows and log how many were dropped."""
    deduplicated = df.drop_duplicates()
    dropped = df.shape[0] - deduplicated.shape[0]
    context.log.info(f"Removed {dropped} duplicate rows")
    return deduplicated
@op(out={"data": Out()})
def fill_missing_values(context, config: FillMissingConfiguration, df: pd.DataFrame):
    """Return a frame whose missing values are filled per the configured column-to-value map."""
    fill_map = config.fill_map
    context.log.info(f"Filling missing values: {fill_map}")
    return df.fillna(fill_map)
@op(out={"data": Out()})
def standardize_categorical_values(context, config: ColumnsSelectConfiguration, df: pd.DataFrame):
    """Trim whitespace and lowercase the text of each configured column.

    Missing values become empty strings. Columns absent from the frame are
    skipped with a warning. The frame is modified in place and returned.
    """
    log = context.log
    for name in config.columns:
        if name in df.columns:
            before = df[name]
            cleaned = before.fillna("").astype(str).str.strip().str.lower()
            n_changed = (before != cleaned).sum()
            df[name] = cleaned
            log.info(f"Standardized '{name}' column {n_changed} values modified")
        else:
            log.warning(f"Column '{name}' not found in DataFrame, skipping.")
    return df
@op(out={"data": Out()})
def correct_typos(context, config: SpellCheckConfiguration, df: pd.DataFrame):
    """Correct spelling mistakes in the configured text columns.

    Each cell is converted to ``str`` and passed as a whole to
    ``SpellChecker.correction``. When the checker has no suggestion (it returns
    ``None``), the original text is kept instead of being replaced by ``None``.
    Columns absent from the frame are skipped with a warning. The frame is
    modified in place and returned.
    """
    logger = context.log
    # Built on first use and shared by all columns: the language is the same for
    # every column, so one dictionary load is enough.
    spell = None

    def _fix(text):
        # Empty strings are passed through untouched, as before.
        if not text:
            return text
        suggestion = spell.correction(text)
        return text if suggestion is None else suggestion

    for column in config.columns:
        if column not in df.columns:
            logger.warning(f"Column '{column}' not found in DataFrame, skipping.")
            continue
        if spell is None:
            spell = SpellChecker(language=config.language)
        original = df[column].astype(str)
        corrected = original.apply(_fix)
        changed_count = (original != corrected).sum()
        logger.info(f"Corrected typos in '{column}' {changed_count} values modified")
        df[column] = corrected
    return df
@op(out={"data": Out()})
def normalize_datetime(context, config: ColumnsSelectConfiguration, df: pd.DataFrame):
    """Add a ``<column>_iso`` column holding UTC timestamps as ``YYYY-MM-DDTHH:MM:SSZ``.

    Values that cannot be parsed become empty strings. A column is skipped
    (with a warning) when it is absent, when nothing in it parses, or when
    every parsed value falls on 1970-01-01. The frame is modified in place
    and returned.
    """
    log = context.log
    for name in config.columns:
        if name not in df.columns:
            log.warning(f"Column '{name}' not found, skipping normalization.")
            continue

        parsed = pd.to_datetime(
            df[name], utc=True, format="mixed", dayfirst=True, errors="coerce"
        )
        if not parsed.notna().any():
            log.warning(
                f"Column '{name}' has no normalizable datetime values, skipping."
            )
            continue

        iso_strings = parsed.dt.strftime("%Y-%m-%dT%H:%M:%SZ").fillna("")
        populated = iso_strings[iso_strings != ""]
        all_epoch = len(populated) > 0 and populated.str.startswith("1970-01-01").all()
        if all_epoch:
            log.warning(
                f"Column '{name}' all normalized values are '1970-01-01', likely bad input — skipping."
            )
            continue

        target = f"{name}_iso"
        df[target] = iso_strings
        log.info(f"Normalized datetime column '{name}' into '{target}'")
    return df
@op(out={"data": Out()})
def normalize_numeric_min_max(context, config: ColumnsSelectConfiguration, df: pd.DataFrame):
    """Add a ``<column>_norm`` column with the min-max scaled values of each configured column.

    Columns that are absent or whose minimum equals their maximum are skipped
    with a warning. The frame is modified in place and returned.
    """
    log = context.log
    for name in config.columns:
        if name not in df.columns:
            log.warning(f"Column '{name}' not found, skipping normalization.")
            continue
        series = df[name]
        lowest, highest = series.min(), series.max()
        if lowest == highest:
            log.warning(f"Column '{name}' has constant values, skipping normalization.")
            continue
        span = highest - lowest
        df[name + "_norm"] = (series - lowest) / span
        log.info(f"Normalized numeric column '{name}'")
    return df
@op(out={"data": Out()})
def normalize_coordinates(context, config: CoordinatesNormalizationConfiguration, df: pd.DataFrame):
    """Convert the latitude/longitude columns to decimal degrees and drop out-of-range rows.

    Numeric columns are coerced with ``pd.to_numeric``; other columns are parsed
    cell by cell with ``_parse_dms_to_decimal``. Values are rounded to 4 decimals
    and only rows with latitude in [-90, 90] and longitude in [-180, 180] are
    kept (rows with unparseable coordinates are dropped by the same filter).

    Raises:
        KeyError: if a configured coordinate column is not present in ``df``.
    """
    logger = context.log
    lat = config.latColumn
    lon = config.lonColumn

    missing = [col for col in (lat, lon) if col not in df.columns]
    if missing:
        raise KeyError(f"Coordinate column(s) not found in DataFrame: {missing}")

    # Work on a shallow copy so the column replacements below land on the copy
    # instead of the frame handed in by the caller (which other ops may still read).
    df = df.copy(deep=False)

    for col in [lat, lon]:
        if pd.api.types.is_numeric_dtype(df[col]):
            logger.info(f"Column '{col}' is numeric — coercing directly")
            df[col] = pd.to_numeric(df[col], errors="coerce")
        else:
            logger.info(f"Column '{col}' is non-numeric — parsing as DMS with PyGeodesy")
            df[col] = df[col].apply(_parse_dms_to_decimal)

    invalid_lat = df[lat].isnull().sum()
    invalid_lon = df[lon].isnull().sum()
    logger.info(f"Found {invalid_lat} invalid latitudes and {invalid_lon} invalid longitudes")

    df[lat] = df[lat].round(4)
    df[lon] = df[lon].round(4)

    before_filter_rows = len(df)
    df = df[(df[lat].between(-90, 90)) & (df[lon].between(-180, 180))]
    after_filter_rows = len(df)
    logger.info(f"Filtered coordinates out of range: removed {before_filter_rows - after_filter_rows} rows")
    logger.info(f"Coordinate normalization completed: resulting dataframe has {after_filter_rows} rows")
    return df
@op(out={"data": Out()})
def add_global_aggregations(context, config: AggregationConfiguration, df: pd.DataFrame):
    """Group by the configured columns and aggregate every numeric column.

    Configured columns missing from ``df`` are skipped with a warning. The
    result holds the grouping columns followed by the remaining numeric
    columns in their original frame order.

    Raises:
        ValueError: if none of the configured grouping columns exists in ``df``.
    """
    logger = context.log
    group_by_cols = []
    for col in config.columns:
        if col not in df.columns:
            logger.warning(f"Column '{col}' not found, skipping aggregation.")
            continue
        group_by_cols.append(col)

    if not group_by_cols:
        # pandas would fail later with the opaque "No group keys passed!".
        raise ValueError(
            f"None of the configured grouping columns {config.columns} exist in the DataFrame."
        )

    # Defensive only: the configuration model already validates the operation.
    if config.operation not in {"sum", "mean", "min", "max", "count"}:
        logger.warning(f"Unsupported aggregation '{config.operation}'")

    wanted = set(group_by_cols)
    wanted.update(df.select_dtypes(include=["number"]).columns)
    # Keep the frame's own column order; going through a bare set made the output
    # column order depend on string hashing and differ between runs.
    ordered = list(dict.fromkeys(c for c in df.columns if c in wanted))

    df = df[ordered]
    df = df.groupby(group_by_cols).agg(config.operation).reset_index()
    return df
@op(out={"data": Out()})
def filter_dataset(context, config: DatasetFilterConfiguration, df: pd.DataFrame):
    """Keep only the rows that satisfy every applicable configured condition.

    A condition is skipped with a warning when its column is absent or entirely
    NaN. A condition that raises while being applied is logged as an error and
    left out of the combined mask.
    """
    log = context.log
    rows_in = len(df)
    log.info(f"Starting dataset filtering: initial dataframe has {rows_in} rows")

    keep = pd.Series([True] * rows_in, index=df.index)
    for cond in config.conditions:
        name = cond.column
        if name not in df.columns:
            log.warning(f"Column '{name}' not found, skipping filtering.")
        elif df[name].isna().all():
            log.warning(f"Column '{name}' is empty (all NaN), skipping filtering.")
        else:
            try:
                keep &= cond.apply(df)
                log.info(f"Applied filter: {name} {cond.op.value} '{cond.value}'")
            except Exception as e:
                log.error(f"Error applying filter on column '{name}': {e}")

    filtered_df = df[keep]
    rows_out = len(filtered_df)
    log.info(
        f"Filtering completed: {rows_out} rows remain "
        f"(removed {rows_in - rows_out} rows in total)"
    )
    return filtered_df

View File

@@ -1,13 +0,0 @@
"""Configuration models for dataframe-level anonymization."""
from .k_anonymity_configuration import KAnonymityConfiguration
from .l_diversity_configuration import LDiversityConfiguration
from .t_closeness_configuration import TClosenessConfiguration
from .base_config import BaseConfiguration
__all__ = [
"BaseConfiguration",
"KAnonymityConfiguration",
"LDiversityConfiguration",
"TClosenessConfiguration",
]

View File

@@ -1,33 +0,0 @@
from typing import Dict, List
from dagster import Config
from pydantic import Field, field_validator, model_validator
class BaseConfiguration(Config):
    """Settings shared by the k-anonymity, l-diversity and t-closeness configurations."""

    ident: List[str] = Field(default=["Name"], description="List of identifier column names.")
    quasi_identifiers: List[str] = Field(default=["Age"], description="List of quasi-identifier column names.")
    supp_level: float = Field(default=50.0, ge=0.0, le=100.0, description="Max suppression allowed (0-100).")
    generalisation_hierarchies: Dict[str, str] = Field(
        default={"Age": "simpl_age"}, description="Hierarchies used to generalize quasi-identifiers."
    )

    @field_validator("quasi_identifiers")
    @classmethod
    def validate_quasi_identifiers(cls, value):
        """Require at least one quasi-identifier; raise ``ValueError`` otherwise."""
        if not value:
            raise ValueError("At least one quasi-identifier must be provided.")
        return value

    @field_validator("ident")
    @classmethod
    def validate_ident(cls, value):
        """Require at least one identifier; raise ``ValueError`` otherwise."""
        if not value:
            raise ValueError("At least one identifier must be provided.")
        return value

    @model_validator(mode="after")
    def check_no_overlap(self):
        """Reject configurations where a column is both an identifier and a quasi-identifier."""
        ident = set(self.ident)
        quasi = set(self.quasi_identifiers)
        overlap = ident & quasi
        if overlap:
            raise ValueError(f"Fields cannot be both identifiers and quasi-identifiers: {overlap}")
        return self

View File

@@ -1,18 +0,0 @@
from anjana.anonymity.utils import utils
# Generalisation hierarchies, keyed by level. Level 0 holds the raw values; the
# higher age levels are built by anjana's `generate_intervals` from the ages 0-99
# with a final argument of 5, 10, 20 and 100 (presumably the interval width).
simpl_age = {
    0: list(range(0, 100)),
    **{
        level: utils.generate_intervals(list(range(0, 100)), 0, 100, width)
        for level, width in enumerate((5, 10, 20, 100), start=1)
    },
}

# Shallower age hierarchy: raw values plus the 5-wide level only.
simpl_age2 = {
    0: list(range(0, 100)),
    1: utils.generate_intervals(list(range(0, 100)), 0, 100, 5),
}

# Gender hierarchy: level 1 replaces every value with "*".
simpl_gender = {0: ["M", "F", "O"], 1: ["*"] * 3}
def get_all_hierarchies():
    """Return every public module-level dict of this module, keyed by its variable name.

    Names starting with an underscore are left out. Without that filter the
    interpreter-provided ``__builtins__`` dict (and ``__annotations__`` when
    present) would be reported as a hierarchy and pass name validation.
    """
    return {
        name: obj
        for name, obj in globals().items()
        if isinstance(obj, dict) and not name.startswith("_")
    }

View File

@@ -1,11 +0,0 @@
from typing import List
from pydantic import Field
from .base_config import BaseConfiguration
class KAnonymityConfiguration(BaseConfiguration):
    """Base anonymisation settings plus the k level and the sensitive attribute columns."""

    k: int = Field(description="Desired level of k-anonymity (must be >= 2).", ge=2, default=3)
    sensitive_attributes: List[str] = Field(
        description="List of sensitive attribute column names.",
        default=["Disease"],
    )

View File

@@ -1,8 +0,0 @@
from pydantic import Field
from .base_config import BaseConfiguration
class LDiversityConfiguration(BaseConfiguration):
    """Base anonymisation settings plus the k and l levels and one sensitive attribute."""

    k: int = Field(description="Desired level of k-anonymity (must be >= 2).", ge=2, default=2)
    l: int = Field(description="L-diversity level (must be >= 1)", ge=1, default=3)
    sensitive_attribute: str = Field(description="Sensitive attribute name.", default="Disease")

View File

@@ -1,8 +0,0 @@
from pydantic import Field
from .base_config import BaseConfiguration
class TClosenessConfiguration(BaseConfiguration):
    """Base anonymisation settings plus the k level, the t threshold and one sensitive attribute."""

    k: int = Field(description="Desired level of k-anonymity (must be >= 2).", ge=2, default=2)
    t: float = Field(description="Maximum t-distance threshold.", ge=0.0, le=1.0, default=0.5)
    sensitive_attribute: str = Field(description="Sensitive attribute name.", default="Disease")

View File

@@ -1,86 +0,0 @@
from dagster import job
from util_services.util_ops import (
preview_dataframe,
read_structured_to_df,
write_df_to_local,
read_structured_from_s3,
write_df_to_s3,
write_semistructured_to_s3,
read_semistructured_from_s3
)
from .ops import apply_k_anonymity, apply_l_diversity, apply_t_closeness
# Job graph: read_structured_to_df -> apply_k_anonymity -> write_df_to_local.
# preview_dataframe is invoked on both the original and the anonymised frame.
@job(tags={
    "business_operation": "ANONYMISATION"
})
def k_anonymity_job():
    org_df = read_structured_to_df()
    # The op's second output ("metrics") is not consumed by this job.
    anon_df, _ = apply_k_anonymity(org_df)
    preview_dataframe(org_df)
    write_df_to_local(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_to_df -> apply_l_diversity -> write_df_to_local.
# preview_dataframe is invoked on both the original and the anonymised frame.
@job(tags={
    "business_operation": "ANONYMISATION"
})
def l_diversity_job():
    org_df = read_structured_to_df()
    # The op's second output ("metrics") is not consumed by this job.
    anon_df, _ = apply_l_diversity(org_df)
    preview_dataframe(org_df)
    write_df_to_local(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_to_df -> apply_t_closeness -> write_df_to_local.
# preview_dataframe is invoked on both the original and the anonymised frame.
@job(tags={
    "business_operation": "ANONYMISATION"
})
def t_closeness_job():
    org_df = read_structured_to_df()
    # The op's second output ("metrics") is not consumed by this job.
    anon_df, _ = apply_t_closeness(org_df)
    preview_dataframe(org_df)
    write_df_to_local(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> apply_k_anonymity -> write_df_to_s3.
# preview_dataframe is invoked on both the original and the anonymised frame.
@job(tags={
    "business_operation": "ANONYMISATION",
    "resource_type": "RD_DATA"
})
def k_anonymity_job_s3():
    org_df = read_structured_from_s3()
    # The op's second output ("metrics") is not consumed by this job.
    anon_df, _ = apply_k_anonymity(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> apply_l_diversity -> write_df_to_s3.
# preview_dataframe is invoked on both the original and the anonymised frame.
@job(tags={
    "business_operation": "ANONYMISATION",
    "resource_type": "RD_DATA"
})
def l_diversity_job_s3():
    org_df = read_structured_from_s3()
    # The op's second output ("metrics") is not consumed by this job.
    anon_df, _ = apply_l_diversity(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_structured_from_s3 -> apply_t_closeness -> write_df_to_s3.
# preview_dataframe is invoked on both the original and the anonymised frame.
@job(tags={
    "business_operation": "ANONYMISATION",
    "resource_type": "RD_DATA"
})
def t_closeness_job_s3():
    org_df = read_structured_from_s3()
    # The op's second output ("metrics") is not consumed by this job.
    anon_df, _ = apply_t_closeness(org_df)
    preview_dataframe(org_df)
    write_df_to_s3(anon_df)
    preview_dataframe(anon_df)
# Job graph: read_semistructured_from_s3 -> write_semistructured_to_s3, with no
# processing op in between. Unlike the other jobs in this file it carries no tags.
@job()
def read_write_semistructured_job_s3():
    semistruct_data = read_semistructured_from_s3()
    write_semistructured_to_s3(semistruct_data)

View File

@@ -1,187 +0,0 @@
import json
from textwrap import dedent
import pandas as pd
from anjana.anonymity import k_anonymity, l_diversity, t_closeness
from dagster import (
DagsterInvalidInvocationError,
MarkdownMetadataValue,
Out,
Output,
get_dagster_logger,
op,
)
from pycanon import anonymity
from template_code_location.dataframe_level_anonymisation.config_models import (
KAnonymityConfiguration,
LDiversityConfiguration,
TClosenessConfiguration,
)
from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import get_all_hierarchies
def _calc_dataframe_metrics(df_anon, df_org, quasi_identifiers, sensitive_atttributes):
    """Compute privacy and utility metrics for an anonymised frame.

    Returns:
        A ``(report, metrics)`` tuple: ``report`` is a Markdown table and
        ``metrics`` a dict with the keys ``k_anon``, ``l_div``, ``t_clos``,
        ``supp_rate`` and ``mean_equivalence_class``.
    """
    # Privacy metrics, computed by pycanon on the anonymised frame.
    k_value = anonymity.k_anonymity(df_anon, quasi_identifiers)
    l_value = anonymity.l_diversity(df_anon, quasi_identifiers, sensitive_atttributes, True)
    t_value = anonymity.t_closeness(df_anon, quasi_identifiers, sensitive_atttributes, True)

    # Utility metrics: share of rows lost and average equivalence-class size.
    rows_kept = len(df_anon)
    suppression = 1 - rows_kept / len(df_org)
    class_count = len(df_anon.groupby(quasi_identifiers))
    mean_class_size = rows_kept / class_count if class_count else 0

    # flake8: noqa
    anon_report = dedent(
        f"""
        ### Anonymization & Data Utilization Metrics
        | Metric | Value | Description |
        |--------|-------|-------------|
        | **k-anonymity** | `k = {k_value}` | Minimum number of records sharing the same quasi-identifier values. |
        | **l-diversity** | `l = {l_value}` | Diversity of sensitive attributes within each equivalence class. |
        | **t-closeness** | `t = {round(t_value, 2)}` | Distance between sensitive attribute distribution in a group and the overall dataset. |
        | **Suppression rate** | `{round(suppression, 2)}` | Fraction of records or attributes suppressed to meet privacy requirements. |
        | **Mean equivalence class size** | `{round(mean_class_size, 2)}` | Average size of equivalence classes for quasi-identifiers, indicates data grouping. |
        """
    )
    # flake8: enable

    metrics = dict(
        k_anon=k_value,
        l_div=l_value,
        t_clos=t_value,
        supp_rate=suppression,
        mean_equivalence_class=mean_class_size,
    )
    return anon_report, metrics
def _validate_and_get_hierarchies(config, df: pd.DataFrame):
    """Check the anonymisation preconditions and resolve each quasi-identifier's hierarchy.

    Returns:
        A dict mapping every quasi-identifier to its hierarchy object.

    Raises:
        DagsterInvalidInvocationError: if the frame has fewer rows than ``config.k``,
            if a quasi-identifier has no hierarchy name configured, or if the
            configured name is not among the hierarchies defined in code.
    """
    available = get_all_hierarchies()

    row_count = len(df)
    if row_count < config.k:
        raise DagsterInvalidInvocationError(
            f"Cannot apply k-anonymity: dataset has {row_count} records, but k={config.k}"
        )

    resolved = {}
    for qi in config.quasi_identifiers:
        # A missing key and an empty name are both treated as "not configured".
        hierarchy_name = config.generalisation_hierarchies.get(qi)
        if not hierarchy_name:
            raise DagsterInvalidInvocationError(
                f"Generalisation hierarchy for quasi-identifier '{qi}' is missing or incomplete"
            )
        if hierarchy_name not in available:
            raise DagsterInvalidInvocationError(
                f"Generalisation hierarchy '{hierarchy_name}' is missing in the code basis"
            )
        resolved[qi] = available[hierarchy_name]
    return resolved
@op(out={"data": Out(), "metrics": Out()})
def apply_k_anonymity(context, config: KAnonymityConfiguration, df: pd.DataFrame):
    """Anonymise ``df`` to the configured k level and emit the frame plus its metrics.

    Yields:
        Output "data": the anonymised frame, with the metric report attached as metadata.
        Output "metrics": the metrics dict.

    Raises:
        DagsterInvalidInvocationError: if validation fails or the anonymisation
            returns an empty frame.
    """
    hier = _validate_and_get_hierarchies(config, df)
    data_anon = k_anonymity(
        df, config.ident, config.quasi_identifiers, config.k, config.supp_level, hier
    )
    # Same guard as the l-diversity and t-closeness ops: fail with a clear message
    # instead of an obscure error from the metric computation below.
    if data_anon.empty:
        raise DagsterInvalidInvocationError(
            "Could not transform the data to k-anonymity, empty dataset returned!"
        )
    # Drop an "index" column only when the anonymisation introduced it.
    if "index" in data_anon.columns and "index" not in df.columns:
        data_anon = data_anon.drop(columns="index")
    anon_report, metrics = _calc_dataframe_metrics(
        data_anon, df, config.quasi_identifiers, config.sensitive_attributes
    )
    yield Output(
        value=data_anon,
        metadata={
            "metric_report": MarkdownMetadataValue(anon_report),
            "metric_json": json.dumps(metrics),
        },
        output_name="data",
    )
    yield Output(value=metrics, output_name="metrics")
@op(out={"data": Out(), "metrics": Out()})
def apply_l_diversity(context, config: LDiversityConfiguration, df: pd.DataFrame):
    """Anonymise ``df`` to the configured l-diversity level and emit the frame plus its metrics.

    Yields:
        Output "data": the anonymised frame, with the metric report attached as metadata.
        Output "metrics": the metrics dict.

    Raises:
        DagsterInvalidInvocationError: if validation fails or the anonymisation
            returns an empty frame.
    """
    hierarchies = _validate_and_get_hierarchies(config, df)
    anonymised = l_diversity(
        df,
        config.ident,
        config.quasi_identifiers,
        config.sensitive_attribute,
        config.k,
        config.l,
        config.supp_level,
        hierarchies,
    )
    if anonymised.empty:
        raise DagsterInvalidInvocationError(
            "Could not tranform the data to l-diversity, empty dataset returned!"
        )

    sensitive = [config.sensitive_attribute]
    report, metrics = _calc_dataframe_metrics(
        anonymised, df, config.quasi_identifiers, sensitive
    )
    data_metadata = {
        "metric_report": MarkdownMetadataValue(report),
        "metric_json": json.dumps(metrics),
    }
    yield Output(value=anonymised, metadata=data_metadata, output_name="data")
    yield Output(value=metrics, output_name="metrics")
@op(out={"data": Out(), "metrics": Out()})
def apply_t_closeness(context, config: TClosenessConfiguration, df: pd.DataFrame):
    """Anonymise ``df`` to the configured t-closeness level and emit the frame plus its metrics.

    Yields:
        Output "data": the anonymised frame, with the metric report attached as metadata.
        Output "metrics": the metrics dict.

    Raises:
        DagsterInvalidInvocationError: if validation fails, if the anonymisation
            raises ``ValueError`` (chained as the cause), or if it returns an
            empty frame.
    """
    hier = _validate_and_get_hierarchies(config, df)
    try:
        data_anon = t_closeness(
            df,
            config.ident,
            config.quasi_identifiers,
            config.sensitive_attribute,
            config.k,
            config.t,
            config.supp_level,
            hier,
        )
    except ValueError as e:
        # Chain the original error so the traceback keeps the underlying cause.
        if "Cannot be quasi-identifiers" in str(e):
            raise DagsterInvalidInvocationError(
                f"T-closeness failed: k-anonymity parameter = {config.k} is too small "
                f"for existing hierarchies of {config.quasi_identifiers} in inner k-anonymity call."
            ) from e
        # Any other ValueError is re-raised with context.
        raise DagsterInvalidInvocationError(f"T-closeness failed with error: {str(e)}") from e
    if data_anon.empty:
        raise DagsterInvalidInvocationError(
            f"Could not transform the data to t-closeness, empty dataset returned! "
            f"This may indicate that the t-closeness constraint (t={config.t}) is too strict for the given data."
        )
    anon_report, metrics = _calc_dataframe_metrics(
        data_anon, df, config.quasi_identifiers, [config.sensitive_attribute]
    )
    yield Output(
        value=data_anon,
        metadata={
            "metric_report": MarkdownMetadataValue(anon_report),
            "metric_json": json.dumps(metrics),
        },
        output_name="data",
    )
    yield Output(value=metrics, output_name="metrics")

View File

@@ -1,19 +0,0 @@
import numpy as np
def parse_value_list(values):
    """Return ``values`` with every decimal-digit string converted to ``int``.

    Everything else (non-strings, signed or fractional numbers, other text) is
    passed through unchanged. ``str.isdecimal`` is used rather than
    ``str.isdigit`` because the latter also accepts characters such as "²"
    that ``int()`` rejects with a ``ValueError``.
    """
    return [int(v) if isinstance(v, str) and v.isdecimal() else v for v in values]
# Hierarchy normalization for Anjana
def normalize_hierarchy_levels(hierarchy_dict):
    """Return a copy of ``hierarchy_dict`` with integer level keys.

    For each column, level keys are converted with ``int``. The level-0 entry is
    run through ``parse_value_list`` and wrapped in a NumPy array; the other
    levels keep their mapping list as given.
    """
    normalized = {}
    for column, levels in hierarchy_dict.items():
        per_level = {}
        for raw_level, mapping_list in levels.items():
            level = int(raw_level)
            per_level[level] = (
                np.array(parse_value_list(mapping_list)) if level == 0 else mapping_list
            )
        normalized[column] = per_level
    return normalized

View File

@@ -1,28 +0,0 @@
from .structured_config import ( # noqa: F401
HashConfig,
EncryptConfig,
RedactConfig,
ReplaceConfig,
PseudoTechniqueConfig,
AnonymisePseudonymizeStructuredConfig,
DecryptConfig,
DepseudoTechniqueConfig,
DepseudonymizeStructuredConfig,
)
from .unstructured_config import ( # noqa: F401, F811
HashConfig,
EncryptConfig,
RedactConfig,
ReplaceConfig,
RetainConfig,
PseudoTechniqueConfig,
AnonymisePseudonymizeUnstructuredConfig,
DecryptConfig,
DepseudoTechniqueConfig,
DepseudonymizeUnstructuredConfig,
)
from .languages import SupportedLanguages, LanguageEnum # noqa: F401
from .pii_entities import PIIEntityEnum, PII_MAPPING # noqa: F401

View File

@@ -1,72 +0,0 @@
from enum import Enum
from typing import ClassVar
class SupportedLanguages:
    """Lookup tables from two-letter language codes to locale identifiers and language model names."""

    LANGUAGES: ClassVar[dict[str, str]] = {
        "hr": "hr_HR",  # Croatian
        "da": "da_DK",  # Danish
        "nl": "nl_NL",  # Dutch
        "en": "en_US",  # English
        "fi": "fi_FI",  # Finnish
        "fr": "fr_FR",  # French
        "de": "de_DE",  # German
        "el": "el_GR",  # Greek
        "it": "it_IT",  # Italian
        "lt": "lt_LT",  # Lithuanian
        "pl": "pl_PL",  # Polish
        "pt": "pt_PT",  # Portuguese
        "ro": "ro_RO",  # Romanian
        "sl": "sl_SI",  # Slovenian
        "es": "es_ES",  # Spanish
        "sv": "sv_SE",  # Swedish
    }

    LANGUAGE_MODELS = {
        "en": "en_core_web_sm",
        "it": "it_core_news_sm",
        "de": "de_core_news_sm",
        "fr": "fr_core_news_sm",
        "es": "es_core_news_sm",
        "nl": "nl_core_news_sm",
        "da": "da_core_news_sm",
        "sv": "sv_core_news_sm",
        "fi": "fi_core_news_sm",
        "pl": "pl_core_news_sm",
        "el": "el_core_news_sm",
        "hr": "hr_core_news_sm",
        "lt": "lt_core_news_sm",
        "pt": "pt_core_news_sm",
        "ro": "ro_core_news_sm",
        "sl": "sl_core_news_sm",
    }

    @classmethod
    def codes(cls) -> list[str]:
        """Return the supported language codes in declaration order."""
        return list(cls.LANGUAGES)

    @classmethod
    def get_locale(cls, code: str) -> str:
        """Return the locale for ``code``; raises ``KeyError`` for an unknown code."""
        locale = cls.LANGUAGES[code]
        return locale

    @classmethod
    def get_language_model(cls, code: str) -> str:
        """Return the model name for ``code``; raises ``KeyError`` for an unknown code."""
        model_name = cls.LANGUAGE_MODELS[code]
        return model_name
class LanguageEnum(str, Enum):
    """String-valued enumeration of language codes; each member's value equals its name."""

    # The members are the same codes, in the same order, as the keys of
    # SupportedLanguages.LANGUAGES; keep the two in sync when adding a language.
    hr = "hr"
    da = "da"
    nl = "nl"
    en = "en"
    fi = "fi"
    fr = "fr"
    de = "de"
    el = "el"
    it = "it"
    lt = "lt"
    pl = "pl"
    pt = "pt"
    ro = "ro"
    sl = "sl"
    es = "es"
    sv = "sv"

View File

@@ -1,24 +0,0 @@
from enum import Enum
class PIIEntityEnum(str, Enum):
    """String-valued enumeration of PII entity kinds; values are human-readable labels."""

    PERSON = "Person"
    EMAIL = "Email"
    CREDIT_CARD = "Credit card"
    DATE_OF_BIRTH = "Date of birth"
    URL = "URLs"
    PHONE_NUMBERS = "Phone numbers"
    CREDENTIALS = "Credentials"
    # NOTE(review): "formally" is probably meant to be "formerly". The text is the
    # member's runtime value, so changing it would alter what callers match on.
    X_SOCIAL = "X (formally known as Twitter) username"
# Maps every PIIEntityEnum member to a filth-type name string.
# NOTE(review): the values look like scrubadub Filth class names — confirm
# against the code that consumes this mapping.
PII_MAPPING: dict[PIIEntityEnum, str] = {
    PIIEntityEnum.PERSON: "NameFilth",
    PIIEntityEnum.EMAIL: "EmailFilth",
    PIIEntityEnum.CREDIT_CARD: "CreditCardFilth",
    PIIEntityEnum.DATE_OF_BIRTH: "DateOfBirthFilth",
    PIIEntityEnum.URL: "UrlFilth",
    PIIEntityEnum.PHONE_NUMBERS: "PhoneFilth",
    PIIEntityEnum.CREDENTIALS: "CredentialFilth",
    PIIEntityEnum.X_SOCIAL: "TwitterFilth",
}

View File

@@ -1,110 +0,0 @@
from typing import List, Literal, Optional, Union
from dagster import Config
from pydantic import Field as PydanticField, model_validator, field_validator
class HashConfig(Config):
    """Technique settings for hashing columns; tagged with ``type == "hash"``."""

    type: Literal["hash"] = "hash"
    columns: List[str] = PydanticField(description="Columns to hash", default=["example_column"])
    algorithm: str = PydanticField(description="Hashing algorithm", default="sha256")
class EncryptConfig(Config):
    """Technique settings for encrypting columns; tagged with ``type == "encrypt"``."""

    type: Literal["encrypt"] = "encrypt"
    columns: List[str] = PydanticField(description="Columns to encrypt", default=["example_column"])
    key_name: str = PydanticField(description="Key identifier used for encryption", default="my_key")
class RedactConfig(Config):
    """Technique settings for redacting columns; tagged with ``type == "redact"``."""

    type: Literal["redact"] = "redact"
    columns: List[str] = PydanticField(description="Columns to redact", default=["example_column"])
class ReplaceConfig(Config):
    """Technique settings for replacing column values; tagged with ``type == "replace"``."""

    type: Literal["replace"] = "replace"
    columns: List[str] = PydanticField(description="Columns to replace", default=["example_column"])
    new_value: str = PydanticField(description="Replacement value", default="REPLACED")
class PseudoTechniqueConfig(Config):
    # Wrapper holding exactly one technique configuration. The union is
    # discriminated on each variant's `type` literal ("hash", "encrypt",
    # "redact" or "replace").
    technique: Union[HashConfig, EncryptConfig, RedactConfig, ReplaceConfig] = PydanticField(
        # Default: HashConfig's default field values (without `type`), nested
        # under the key "hash".
        default={"hash": HashConfig().model_dump(exclude={"type"})},
        discriminator="type"
    )
# Op config for structured anonymisation/pseudonymisation: an ordered list of
# techniques, each targeting its own set of columns.
class AnonymisePseudonymizeStructuredConfig(Config):
    used_function: List[PseudoTechniqueConfig] = PydanticField(
        default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}],
        description=("List of functions to be used on column"),
    )

    @model_validator(mode="after")
    def ensure_unique_columns(self):
        """Reject configs in which a column is listed more than once across ``used_function``."""
        clashes = [
            f"{column} -> {', '.join(names)}"
            for column, names in self._collect_column_to_techniques().items()
            if len(names) > 1
        ]
        if clashes:
            formatted = "; ".join(clashes)
            raise ValueError(f"Duplicate column(s) across techniques not allowed:\n{formatted}")
        return self

    def _collect_column_to_techniques(self):
        """Return ``{column: [technique type, ...]}`` built from every ``used_function`` entry."""
        techniques_by_column = {}
        for entry in self.used_function:
            technique_type, columns = self._extract_technique_and_columns(entry)
            for column in columns:
                if column not in techniques_by_column:
                    techniques_by_column[column] = []
                techniques_by_column[column].append(technique_type)
        return techniques_by_column

    def _extract_technique_and_columns(self, item):
        """Return ``(technique type, columns)`` for one entry given as a raw dict or a model instance."""
        if not isinstance(item, dict):
            # item is a PseudoTechniqueConfig instance
            technique = item.technique
            return technique.type, getattr(technique, "columns", [])

        tech = item.get("technique") or {}
        if not isinstance(tech, dict):
            return None, []
        if "type" in tech:
            # flat form: {"type": "hash", "columns": [...]}
            return tech.get("type"), tech.get("columns") or []
        if len(tech) == 1:
            # variant-key mapping: {'hash': {...}}
            variant, payload = next(iter(tech.items()))
            return variant, payload.get("columns") or []
        return None, []
# Structured-data technique: decrypt the listed columns with a named key.
class DecryptConfig(Config):
    # Discriminator tag, fixed to "decrypt".
    type: Literal["decrypt"] = "decrypt"
    columns: List[str] = PydanticField(
        default=["example_column"],
        description="Columns to decrypt",
    )
    key_name: str = PydanticField(
        default="my_key",
        description="Key identifier used for decryption",
    )
# Wrapper for the single de-pseudonymisation technique on offer (decrypt).
class DepseudoTechniqueConfig(Config):
    # Default: the DecryptConfig defaults with the "type" tag written explicitly.
    technique: DecryptConfig = PydanticField(
        default={"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})},
    )
# Op config for structured de-pseudonymisation: a list of decrypt entries.
class DepseudonymizeStructuredConfig(Config):
    used_function: List[DepseudoTechniqueConfig] = PydanticField(
        default=[{"technique": {"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}}],
        description=("Decryption functions to be used on column"),
    )

    @field_validator("used_function", mode="before")
    def _normalize_depseudo_used_function(cls, v):
        """Turn raw dict entries into ``DepseudoTechniqueConfig`` objects; pass others through."""
        return [
            DepseudoTechniqueConfig.model_validate(entry) if isinstance(entry, dict) else entry
            for entry in v
        ]

    @model_validator(mode="after")
    def ensure_unique_columns(self):
        # For depseudonymize, we don't have per-column uniqueness constraints,
        # but keep a no-op validator to preserve API parity.
        return self

View File

@@ -1,115 +0,0 @@
from typing import List, Literal, Optional, Union
from dagster import Config
from pydantic import Field as PydanticField, model_validator, field_validator
from .languages import LanguageEnum
from .pii_entities import PIIEntityEnum
# Unstructured-text technique: hash every detected entity of the listed PII types.
class HashConfig(Config):
    # Discriminator tag, fixed to "hash".
    type: Literal["hash"] = "hash"
    # The default is given as the enum member *name* ("EMAIL"), not its value.
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to hash",
    )
    algorithm: str = PydanticField(
        default="sha256",
        description="Hashing algorithm",
    )
# Unstructured-text technique: encrypt detected entities with a named key.
class EncryptConfig(Config):
    # Discriminator tag, fixed to "encrypt".
    type: Literal["encrypt"] = "encrypt"
    # The default is given as the enum member *name* ("EMAIL"), not its value.
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to encrypt",
    )
    key_name: str = PydanticField(
        default="my_key",
        description="Key identifier used for encryption",
    )
# Unstructured-text technique: redact detected entities of the listed PII types.
class RedactConfig(Config):
    # Discriminator tag, fixed to "redact".
    type: Literal["redact"] = "redact"
    # The default is given as the enum member *name* ("EMAIL"), not its value.
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to redact",
    )
# Unstructured-text technique: swap detected entities for a fixed value.
class ReplaceConfig(Config):
    # Discriminator tag, fixed to "replace".
    type: Literal["replace"] = "replace"
    # The default is given as the enum member *name* ("EMAIL"), not its value.
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to replace",
    )
    new_value: str = PydanticField(
        default="REPLACED",
        description="Replacement value",
    )
# Unstructured-text technique: leave detected entities of the listed types as they are.
class RetainConfig(Config):
    # Discriminator tag, fixed to "retain".
    type: Literal["retain"] = "retain"
    # The default is given as the enum member *name* ("EMAIL"), not its value.
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to retain",
    )
# Wrapper holding exactly one unstructured-text technique, selected by the
# ``type`` discriminator of the union members.
class PseudoTechniqueConfig(Config):
    # The default is written in variant-key form ({"hash": {...}}), i.e. the
    # HashConfig defaults minus the "type" tag.
    technique: Union[HashConfig, EncryptConfig, RedactConfig, ReplaceConfig, RetainConfig] = PydanticField(
        default={"hash": HashConfig().model_dump(exclude={"type"})},
        discriminator="type",
    )
# Op config for unstructured anonymisation/pseudonymisation: the text language
# plus the techniques to apply, each bound to a set of PII types.
class AnonymisePseudonymizeUnstructuredConfig(Config):
    language: LanguageEnum = PydanticField(
        default=LanguageEnum.en,
        description="Language code (must be one of: hr, da, nl, en, fi, fr, de, el, it, lt, pl, pt, ro, sl, es, sv)",
    )
    used_function: List[PseudoTechniqueConfig] = PydanticField(
        default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}],
        description=("List of functions to be used on PIIs"),
    )

    @field_validator("used_function", mode="before")
    def _normalize_used_function(cls, v):
        """Turn raw dict entries into ``PseudoTechniqueConfig`` objects; pass others through."""
        return [
            PseudoTechniqueConfig.model_validate(entry) if isinstance(entry, dict) else entry
            for entry in v
        ]

    @model_validator(mode="after")
    def ensure_unique_pii(self):
        """Reject configs in which a PII type is listed more than once across ``used_function``."""
        clashes = [
            f"{pii} -> {', '.join(names)}"
            for pii, names in self._collect_pii_to_techniques().items()
            if len(names) > 1
        ]
        if clashes:
            formatted = "; ".join(clashes)
            raise ValueError(f"Duplicate PII(s) across techniques not allowed:\n{formatted}")
        return self

    def _collect_pii_to_techniques(self):
        """Return ``{pii: [technique type, ...]}`` built from every ``used_function`` entry."""
        techniques_by_pii = {}
        for entry in self.used_function:
            technique_type, piis = self._extract_technique_and_pii(entry)
            for pii in piis:
                if pii not in techniques_by_pii:
                    techniques_by_pii[pii] = []
                techniques_by_pii[pii].append(technique_type)
        return techniques_by_pii

    def _extract_technique_and_pii(self, item):
        """Return ``(technique type, PII list)`` for one entry given as a raw dict or a model instance."""
        if not isinstance(item, dict):
            # item is a PseudoTechniqueConfig instance
            technique_type = item.technique.type
            piis = getattr(item.technique, "pii", []) or getattr(item.technique, "columns", [])
            return technique_type, piis

        tech = item.get("technique") or {}
        if not isinstance(tech, dict):
            return None, []
        if "type" in tech:
            # flat form: {"type": "hash", "pii": [...]}
            return tech.get("type"), tech.get("pii") or tech.get("columns") or []
        if len(tech) == 1:
            # variant-key mapping: {'hash': {...}}
            variant, payload = next(iter(tech.items()))
            return variant, payload.get("pii") or payload.get("columns") or []
        return None, []
# Unstructured-text technique: decrypt with a named key (no PII list needed).
class DecryptConfig(Config):
    # Discriminator tag, fixed to "decrypt".
    type: Literal["decrypt"] = "decrypt"
    key_name: str = PydanticField(
        default="my_key",
        description="Key identifier used for decryption",
    )
# Wrapper for the single de-pseudonymisation technique on offer (decrypt).
class DepseudoTechniqueConfig(Config):
    # Default: the DecryptConfig defaults with the "type" tag written explicitly.
    technique: DecryptConfig = PydanticField(
        default={
            "type": "decrypt",
            **DecryptConfig().model_dump(exclude={"type"}),
        },
    )
# Op config for unstructured de-pseudonymisation: a list of decrypt entries.
class DepseudonymizeUnstructuredConfig(Config):
    used_function: List[DepseudoTechniqueConfig] = PydanticField(
        default=[
            {"technique": {"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}}
        ],
        description=("Decryption function"),
    )

View File

@@ -1,126 +0,0 @@
from dagster import job
from util_services.util_ops import (
preview_dataframe,
read_structured_to_df,
write_df_to_local,
write_string_to_unstructured,
read_unstructured_to_string,
preview_unstructured,
read_structured_from_s3,
write_df_to_s3,
read_unstructured_from_s3,
write_unstructured_to_s3,
)
from .ops import (
anonymize_pseudonymize_structured,
depseudonymize_structured,
)
from .unstructured_ops import (
anonymize_pseudonymize_unstructured,
depseudonymize_unstructured,
)
# Job graph: read_structured_to_df -> preview -> anonymize_pseudonymize_structured
# -> preview -> write_df_to_local. The op's "metrics" output is not consumed.
@job(tags={"business_operation": "ANONYMISATION_PSEUDONYMISATION"})
def anonymise_pseudonymise_structured_job():
    source_df = read_structured_to_df()
    preview_dataframe(source_df)
    processed_df, _metrics = anonymize_pseudonymize_structured(source_df)
    preview_dataframe(processed_df)
    write_df_to_local(processed_df)
# Job graph: read_structured_from_s3 -> preview -> anonymize_pseudonymize_structured
# -> preview -> write_df_to_s3. The op's "metrics" output is not consumed.
@job(
    tags={
        "business_operation": "ANONYMISATION_PSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def anonymise_pseudonymise_structured_job_s3():
    source_df = read_structured_from_s3()
    preview_dataframe(source_df)
    processed_df, _metrics = anonymize_pseudonymize_structured(source_df)
    preview_dataframe(processed_df)
    write_df_to_s3(processed_df)
# Job graph: read_structured_to_df -> preview -> depseudonymize_structured
# -> preview -> write_df_to_local. The op's "metrics" output is not consumed.
@job(tags={"business_operation": "DEPSEUDONYMISATION"})
def depseudonymise_structured_job():
    source_df = read_structured_to_df()
    preview_dataframe(source_df)
    restored_df, _metrics = depseudonymize_structured(source_df)
    preview_dataframe(restored_df)
    write_df_to_local(restored_df)
# Job graph: read_structured_from_s3 -> preview -> depseudonymize_structured
# -> preview -> write_df_to_s3. The op's "metrics" output is not consumed.
@job(
    tags={
        "business_operation": "DEPSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def depseudonymise_structured_job_s3():
    source_df = read_structured_from_s3()
    preview_dataframe(source_df)
    restored_df, _metrics = depseudonymize_structured(source_df)
    preview_dataframe(restored_df)
    write_df_to_s3(restored_df)
# Round-trip job graph: read_structured_to_df -> preview ->
# anonymize_pseudonymize_structured -> preview -> depseudonymize_structured
# -> preview. Nothing is written out; results are only previewed.
@job(tags={"business_operation": "ANONYMISATION_PSEUDONYMISATION"})
def anonymise_pseudonymise_depseudonymise_structured_job():
    source_df = read_structured_to_df()
    preview_dataframe(source_df)
    pseudonymised_df, _pseudo_metrics = anonymize_pseudonymize_structured(source_df)
    preview_dataframe(pseudonymised_df)
    restored_df, _depseudo_metrics = depseudonymize_structured(pseudonymised_df)
    preview_dataframe(restored_df)
# Job graph: read_unstructured_to_string -> preview ->
# anonymize_pseudonymize_unstructured -> preview text -> preview metrics
# -> write_string_to_unstructured.
@job(tags={"business_operation": "ANONYMISATION_PSEUDONYMISATION"})
def anonymise_pseudonymise_unstructured_job():
    raw_text = read_unstructured_to_string()
    preview_unstructured(raw_text)
    processed_text, report = anonymize_pseudonymize_unstructured(raw_text)
    preview_unstructured(processed_text)
    preview_unstructured(report)
    write_string_to_unstructured(processed_text)
# Job graph: read_unstructured_from_s3 -> preview ->
# anonymize_pseudonymize_unstructured -> preview text -> preview metrics
# -> write_unstructured_to_s3.
@job(
    tags={
        "business_operation": "ANONYMISATION_PSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def anonymise_pseudonymise_unstructured_job_s3():
    raw_text = read_unstructured_from_s3()
    preview_unstructured(raw_text)
    processed_text, report = anonymize_pseudonymize_unstructured(raw_text)
    preview_unstructured(processed_text)
    preview_unstructured(report)
    write_unstructured_to_s3(processed_text)
# Job graph: read_unstructured_to_string -> preview -> depseudonymize_unstructured
# -> preview -> write_string_to_unstructured. The "metrics" output is not consumed.
@job(tags={"business_operation": "DEPSEUDONYMISATION"})
def depseudonymise_unstructured_job():
    raw_text = read_unstructured_to_string()
    preview_unstructured(raw_text)
    restored_text, _metrics = depseudonymize_unstructured(raw_text)
    preview_unstructured(restored_text)
    write_string_to_unstructured(restored_text)
# Job graph: read_unstructured_from_s3 -> preview -> depseudonymize_unstructured
# -> preview -> write_unstructured_to_s3. The "metrics" output is not consumed.
@job(
    tags={
        "business_operation": "DEPSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def depseudonymise_unstructured_job_s3():
    raw_text = read_unstructured_from_s3()
    preview_unstructured(raw_text)
    restored_text, _metrics = depseudonymize_unstructured(raw_text)
    preview_unstructured(restored_text)
    write_unstructured_to_s3(restored_text)

View File

@@ -1,77 +0,0 @@
import pandas as pd
import numpy as np
from dagster import Out, Output, op
from cryptography.fernet import InvalidToken
from template_code_location.field_level_pseudo_anonymisation.config_models import (
AnonymisePseudonymizeStructuredConfig,
DepseudonymizeStructuredConfig,
)
from template_code_location.field_level_pseudo_anonymisation.techniques import (
anonymisation_pseudonymisation_techniques as anon_pseudo_funcs,
)
import template_code_location.field_level_pseudo_anonymisation.techniques.depseudonymisation_techniques as depseudo_funcs
from .utils import create_get_encryption_key
def _apply_column_wise_function(config, df, funcs):
    """
    Apply every technique in ``config.used_function`` to its target columns.

    For each entry the callable named by ``technique.type`` is looked up on
    ``funcs`` and called element-wise on each listed column; the remaining
    technique fields are passed as keyword arguments. For "encrypt"/"decrypt"
    the ``key_name`` field is swapped for the actual key under ``key``.

    The DataFrame is modified in place and also returned.

    Raises:
        ValueError: if a listed column is absent from ``df``, or if a Fernet
            token turns out to be invalid.
    """
    for entry in config.used_function:
        technique = entry.technique
        func_name = technique.type
        columns = technique.columns
        func = getattr(funcs, func_name)

        # Everything except the routing fields becomes a keyword argument.
        params = technique.model_dump()
        params.pop("type")
        params.pop("columns")
        if func_name in ["encrypt", "decrypt"]:
            key_name = technique.key_name
            params.pop("key_name")
            params["key"] = create_get_encryption_key(func_name, key_name)

        missing = [name for name in columns if name not in df.columns]
        if missing:
            raise ValueError(
                f"The following columns required by technique '{func_name}' "
                f"are not present in the DataFrame: {', '.join(missing)}"
            )

        # Skip processing if DataFrame is empty
        if len(df) == 0:
            continue

        for column in columns:
            try:
                apply_to_cells = np.vectorize(lambda cell: func(cell, **params))
                df[column] = apply_to_cells(df[column].to_numpy())
            except InvalidToken:
                raise ValueError(
                    f"Invalid Fernet token while decrypting column '{column}' "
                    f"using key '{key_name}'. The data may not be encrypted "
                    f"or the key may be incorrect. "
                )
    return df
# Dagster op: run the configured anonymisation/pseudonymisation techniques over
# the DataFrame. Emits the DataFrame on "data" and an empty dict on "metrics".
@op(out={"data": Out(), "metrics": Out()})
def anonymize_pseudonymize_structured(
    context, config: AnonymisePseudonymizeStructuredConfig, df: pd.DataFrame
):
    processed = _apply_column_wise_function(config, df, anon_pseudo_funcs)
    yield Output(value=processed, metadata={}, output_name="data")
    yield Output(value={}, output_name="metrics")
# Dagster op: run the configured de-pseudonymisation techniques over the
# DataFrame. Emits the DataFrame on "data" and an empty dict on "metrics".
@op(out={"data": Out(), "metrics": Out()})
def depseudonymize_structured(context, config: DepseudonymizeStructuredConfig, df: pd.DataFrame):
    restored = _apply_column_wise_function(config, df, depseudo_funcs)
    yield Output(value=restored, metadata={}, output_name="data")
    yield Output(value={}, output_name="metrics")

View File

@@ -1,3 +0,0 @@
from .anonymisation_pseudonymisation_techniques import hash, redact, replace, encrypt # noqa: F401
from .depseudonymisation_techniques import decrypt # noqa: F401

View File

@@ -1,42 +0,0 @@
import hashlib
from cryptography.fernet import Fernet
def hash(value: str, algorithm: str = "sha256") -> str:
    """
    Return the hex digest of ``value`` under ``algorithm`` (default: SHA-256).

    ``value`` is converted with ``str()`` and UTF-8 encoded before hashing.
    """
    text = str(value)
    return hashlib.new(algorithm, text.encode("utf-8")).hexdigest()
def redact(value: str) -> str:
    """
    Discard ``value`` and return an empty string in its place.
    """
    redacted = ""
    return redacted
def replace(value: str, new_value) -> str:
    """
    Ignore ``value`` and return ``new_value`` unchanged.
    """
    substitute = new_value
    return substitute
def encrypt(value: str, key: bytes) -> str:
    """
    Encrypt ``str(value)`` with the given Fernet key and return the token as text.
    """
    plaintext = str(value)
    cipher = Fernet(key)
    token = cipher.encrypt(plaintext.encode())
    return token.decode()
def retain(value: str) -> str:
    """
    Identity technique: hand ``value`` back exactly as received.
    """
    unchanged = value
    return unchanged

View File

@@ -1,9 +0,0 @@
from cryptography.fernet import Fernet
def decrypt(value: str, key: bytes) -> str:
    """
    Decrypt the Fernet token held in ``value`` with ``key`` and return the plaintext.
    """
    cipher = Fernet(key)
    plaintext = cipher.decrypt(value.encode())
    return plaintext.decode()

View File

@@ -1,428 +0,0 @@
import importlib
import importlib.abc
import importlib.machinery
import re
import sys
import types
# ---------------------------------------------------------------------------
# Stub out the `transformers` and `spacy_transformers` packages before any
# other import triggers spaCy's entry-point scan or scrubadub_spacy's runtime
# import of spacy_transformers.pipeline_component.
# ---------------------------------------------------------------------------
_STUB_PACKAGES = ("transformers", "spacy_transformers")
class _StubModule(types.ModuleType):
    """Placeholder module: any attribute not found normally resolves to a fresh empty class."""

    def __getattr__(self, name: str):
        # Only reached when normal lookup fails; builds a new class named after
        # the attribute on every access.
        placeholder = type(name, (), {})
        return placeholder
class _StubFinder(importlib.abc.MetaPathFinder):
    """Meta-path finder that claims every import of a stubbed package or its submodules."""

    def find_spec(self, fullname, path=None, target=None):  # noqa: ANN001
        is_stubbed = any(
            fullname == pkg or fullname.startswith(pkg + ".") for pkg in _STUB_PACKAGES
        )
        if is_stubbed:
            return importlib.machinery.ModuleSpec(fullname, _StubLoader())
        # Not ours: returning None lets the remaining finders handle it.
        return None
class _StubLoader(importlib.abc.Loader):
    """Loader that materialises a ``_StubModule`` for the requested spec and executes nothing."""

    def create_module(self, spec):  # noqa: ANN001
        stub = _StubModule(spec.name)
        stub.__spec__ = spec
        stub.__path__ = []  # mark as package
        return stub

    def exec_module(self, module):  # noqa: ANN001
        # Intentionally empty: a stub has no code to run.
        return None
# Register the stub finder at the front of sys.meta_path, before scrubadub /
# spacy are imported; skipped when a _StubFinder is already registered.
if all(not isinstance(finder, _StubFinder) for finder in sys.meta_path):
    sys.meta_path.insert(0, _StubFinder())
# ---------------------------------------------------------------------------
import scrubadub # noqa: E402
import scrubadub_spacy # noqa: E402
from cryptography.fernet import InvalidToken # noqa: E402
from dagster import Out, Output, get_dagster_logger, op # noqa: E402
from scrubadub.detectors import RegexDetector # noqa: E402
from scrubadub.filth import CredentialFilth, NameFilth # noqa: E402
from template_code_location.field_level_pseudo_anonymisation.techniques import (
anonymisation_pseudonymisation_techniques as anon_pseudo_funcs,
)
from template_code_location.field_level_pseudo_anonymisation.techniques import (
depseudonymisation_techniques as depseudo_funcs,
)
from .config_models import (
PII_MAPPING,
AnonymisePseudonymizeUnstructuredConfig,
DepseudonymizeUnstructuredConfig,
PIIEntityEnum,
PseudoTechniqueConfig,
SupportedLanguages,
)
from .utils import create_get_encryption_key
# Build a scrubadub Scrubber for ``language``:
#   * locale and spaCy model name are resolved through SupportedLanguages,
#   * a spaCy entity detector is added with person/organisation labels,
#   * a date-of-birth detector is added for "en" and "de" only,
#   * scrubadub's CredentialDetector is swapped for SIMPLCredentialDetector.
# The order of the add/remove calls is kept exactly as written.
def _initialize_scrubber(language: str) -> scrubadub.Scrubber:
# Regex detector for "username ... password" pairs; instantiated at the end of this function.
class SIMPLCredentialDetector(RegexDetector):
"""
Remove username/password combinations from dirty ``text``.
"""
filth_cls = CredentialFilth
name = "credential"
autoload = True
# Verbose pattern: a username/login/u: label and its value, then up to 500
# characters of anything (lazy), then a password/pw/p: label and its value.
regex = re.compile(
r"""
(?:username|login|u:)\s*(?::\s*)?
(?P<username>[\w.\-@+]+)
[\s\S]{0,500}?
(?:password|pw|p:)\s*(?::\s*)?
(?P<password>[^\s]+)
""",
re.MULTILINE | re.VERBOSE | re.IGNORECASE,
)
locale = SupportedLanguages.get_locale(language)
scrubber = scrubadub.Scrubber(locale=locale)
model_name = SupportedLanguages.get_language_model(language)
spacy_detector = scrubadub_spacy.detectors.SpacyEntityDetector(model=model_name)
spacy_detector.named_entities = {
"PERSON",
"PER",
"ORG",
"persName",
"PRS",
} # Need to set it after the constructor because scrubadub_spacy uses upper on all entries
spacy_detector.filth_cls_map["persName"] = NameFilth # Required because PL uses persName
spacy_detector.filth_cls_map["PRS"] = NameFilth # Required for swedish that uses PRS
scrubber.add_detector(spacy_detector)
if language in ["en", "de"]:
scrubber.add_detector(
scrubadub.detectors.DateOfBirthDetector
) # add optional data of birth detector
scrubber.remove_detector(
scrubadub.detectors.CredentialDetector
) # remove the not so great credentials detector and replace with custom SIMPL one
scrubber.add_detector(SIMPLCredentialDetector())
return scrubber
def _map_filth_to_pii_enum(filth) -> PIIEntityEnum | None:
    """Return the PII category whose mapped filth class name equals ``filth``'s class name, else None."""
    cls_name = filth.__class__.__name__
    return next(
        (member for member, filth_name in PII_MAPPING.items() if filth_name == cls_name),
        None,
    )
# Render ``metrics_dict`` as a Markdown report string.
# Keys read: total_pii_detected, text_length_original, text_length_anonymised,
# pii_by_type (type -> count) and techniques_applied (pii -> technique).
# ``language`` is only echoed in the summary section.
def _get_metrics(metrics_dict: dict, language: str) -> str:
# Format metrics as Markdown table
metrics_report = f"""
## PII Anonymization Report
### Summary
- **Total PII Detected**: {metrics_dict['total_pii_detected']}
- **Original Length**: {metrics_dict['text_length_original']} chars
- **Anonymized Length**: {metrics_dict['text_length_anonymised']} chars
- **Language**: {language}
### PII by Type
| Entity Type | Count |
|-------------|-------|
"""
# One table row per detected PII type.
for pii_type, count in metrics_dict["pii_by_type"].items():
metrics_report += f"| {pii_type} | {count} |\n"
# Followed by a bullet list of the technique configured for each PII type.
metrics_report += "\n### Techniques Applied\n"
for pii, technique in metrics_dict["techniques_applied"].items():
metrics_report += f"- **{pii}**: {technique}\n"
return metrics_report
def _build_metrics_dict(
    pii_counts: dict[str, int],
    text: str,
    anon_text: str,
    technique_map: dict[PIIEntityEnum, PseudoTechniqueConfig],
) -> dict:
    """Collect run statistics: totals, per-type counts, text lengths and the technique per PII type."""
    techniques_applied = {
        pii.name: cfg.technique.type for pii, cfg in technique_map.items()
    }
    return {
        "total_pii_detected": sum(pii_counts.values()),
        "pii_by_type": pii_counts,
        "text_length_original": len(text),
        "text_length_anonymised": len(anon_text),
        "techniques_applied": techniques_applied,
    }
# Dagster op: detect PII in free text and substitute every hit according to
# the technique configured for its PII type (a "{{TYPE}}" placeholder when no
# technique is configured). Emits the rewritten text on "data" and a Markdown
# report on "metrics". Raises ValueError for None/blank input and RuntimeError
# when detection fails.
@op(out={"data": Out(), "metrics": Out()})
def anonymize_pseudonymize_unstructured(
    context, config: AnonymisePseudonymizeUnstructuredConfig, text: str
):
    logger = get_dagster_logger()

    if text is None or not text.strip():
        raise ValueError("Input text cannot be None or empty")

    logger.debug(
        f"Starting unstructured PII anonymization | lang={config.language.value} "
        f"| input_chars={len(text)}"
    )

    # --- Filth detection ---
    try:
        scrubber = _initialize_scrubber(config.language.value)
        detected = list(scrubber.iter_filth(text))
        logger.info(f"Detected {len(detected)} potential PII entities before filtering.")
    except Exception as e:
        logger.error(f"Scrubber initialization/detection failed | lang={config.language.value}")
        raise RuntimeError(f"PII detection failed for language '{config.language.value}'") from e

    # --- Build technique routing map ---
    technique_map = _build_technique_map(config)
    routing_summary = ", ".join(
        f"{pii.name}->{cfg.technique.type}" for pii, cfg in technique_map.items()
    )
    logger.debug("Technique map constructed: " + routing_summary)

    replacements = []
    key_cache = {}
    pii_counts = {}

    # --- Process filths ---
    for idx, filth in enumerate(detected, start=1):
        pii_enum = _map_filth_to_pii_enum(filth)
        if pii_enum is None:
            logger.debug(f"[{idx}] Skipping unknown filth class={filth.__class__.__name__}")
            continue

        start_idx, end_idx = _extract_span(filth, logger, idx)
        if start_idx is None:
            continue

        original_value = text[start_idx:end_idx]
        technique_cfg = technique_map.get(pii_enum)

        # No technique configured
        if technique_cfg is None:
            _handle_missing_technique(
                pii_enum, start_idx, end_idx, text, pii_counts, replacements, logger, idx
            )
            continue

        # Apply configured technique
        technique = technique_cfg.technique
        params = _prepare_params(technique, key_cache, idx, logger)
        replacement = _apply_technique(
            original_value, technique.type, params, pii_enum, idx, logger
        )
        replacements.append((start_idx, end_idx, replacement))
        pii_counts[pii_enum.name] = pii_counts.get(pii_enum.name, 0) + 1

    # --- Apply replacements ---
    anon_text = _apply_replacements(text, replacements, logger)
    logger.info(f"Anonymisation completed, total PII counts: {pii_counts}")

    metrics_dict = _build_metrics_dict(pii_counts, text, anon_text, technique_map)
    metrics_report = _get_metrics(metrics_dict, config.language.value)

    yield Output(value=anon_text, output_name="data")
    yield Output(value=metrics_report, output_name="metrics")
# Dagster op: reverse pseudonymised segments in the text with the configured
# techniques. Emits the restored text on "data" and the helper's metrics
# dict on "metrics".
@op(out={"data": Out(), "metrics": Out()})
def depseudonymize_unstructured(context, config: DepseudonymizeUnstructuredConfig, input_text: str):
    # The helper is a generator yielding exactly two items, hence the unpacking.
    restored_text, metrics = _apply_depseudonimisation_function(
        config, input_text, depseudo_funcs
    )
    yield Output(value=restored_text, metadata={}, output_name="data")
    yield Output(value=metrics, output_name="metrics")
def _apply_depseudonimisation_function(config, input_text: str, funcs_module):
    """
    Search and de-pseudonymise text segments formatted as
    ``{technique:pseudonymized_value}``.

    For every entry in ``config.used_function`` the callable named by
    ``technique.type`` is looked up on ``funcs_module`` and applied to each
    matching segment; the remaining technique fields are passed as keyword
    arguments. For "decrypt" the ``key_name`` field is swapped for the actual
    key and the segments searched for are ``{encrypt:...}``.

    This is a generator: it yields the restored text first and then a dict
    ``{"total_depseudo_count": <int>}``, so callers unpack it into two values.

    Raises:
        ValueError: if a segment is not a valid Fernet token for the key.
        RuntimeError: if the technique fails for any other reason.
    """
    total_depseudo_count = 0
    depseudonimized_text = input_text  # Initialize with input text

    # Loop through each depseudonymisation technique defined in the config
    for used_function in config.used_function:
        func_name = used_function.technique.type
        func = getattr(funcs_module, func_name)
        pseudo_anon_func = ""
        # Bound for every technique so the error message below can never hit
        # an unbound local.
        key_name = None

        # Prepare parameters
        params = used_function.technique.model_dump()
        del params["type"]
        if func_name == "decrypt":
            key_name = used_function.technique.key_name
            del params["key_name"]
            pseudo_anon_func = "encrypt"
            params["key"] = create_get_encryption_key(func_name, key_name)

        # Regex pattern for this technique, e.g. {encrypt:...}
        pattern = rf"\{{{pseudo_anon_func}:([^}}]+)\}}"

        def replace_match(match):
            nonlocal total_depseudo_count
            pseudovalue = match.group(1)
            total_depseudo_count += 1
            try:
                return func(pseudovalue, **params)
            except InvalidToken as exc:
                raise ValueError(
                    f"Invalid Fernet token while decrypting value using key '{key_name}'. "
                    f"The data may not be encrypted or the key may be incorrect."
                ) from exc
            except Exception as e:
                # Keep the original failure attached as the explicit cause.
                raise RuntimeError(
                    f"Error during depseudonymisation with '{func_name}': {e}"
                ) from e

        # Apply replacements for this technique
        depseudonimized_text = re.sub(pattern, replace_match, depseudonimized_text)

    yield depseudonimized_text
    yield {"total_depseudo_count": total_depseudo_count}
def _build_technique_map(config):
    """Return ``{pii: used_function entry}``; when a PII type appears in several entries the last one wins."""
    return {
        pii: func_cfg
        for func_cfg in config.used_function
        for pii in func_cfg.technique.pii
    }
def _extract_span(filth, logger, idx):
    """
    Return ``(start, end)`` of ``filth`` in the text, or ``(None, None)`` if unavailable.

    The start comes from ``filth.beg`` when that attribute exists, otherwise
    from ``filth.start``; the end comes from ``filth.end``.
    """
    fallback_start = getattr(filth, "start", None)
    start_idx = getattr(filth, "beg", fallback_start)
    end_idx = getattr(filth, "end", None)
    if start_idx is not None and end_idx is not None:
        return start_idx, end_idx
    logger.debug(f"[{idx}] Filth missing span attributes; skipping.")
    return None, None
def _handle_missing_technique(
    pii_enum, start_idx, end_idx, text, pii_counts, replacements, logger, idx
):
    """
    Record a placeholder replacement for a PII hit that has no configured technique.

    Appends ``(start_idx, end_idx, "{{<PII NAME>}}")`` to ``replacements`` and
    increments ``pii_counts[pii_enum.name]``; both containers are mutated in
    place and nothing is returned.

    ``text`` is kept in the signature for existing callers but is no longer
    read: the raw matched value used to be written to the debug log, which
    put the very PII being removed into the logs. Only the span and its
    length are logged now.
    """
    logger.debug(
        f"[{idx}] PII={pii_enum.name} span=({start_idx},{end_idx}) "
        f"length={end_idx - start_idx} "
        f"- No technique configured, using placeholder"
    )
    # Quadrupled braces render as a literal "{{NAME}}".
    placeholder = f"{{{{{pii_enum.name}}}}}"
    replacements.append((start_idx, end_idx, placeholder))
    pii_counts[pii_enum.name] = pii_counts.get(pii_enum.name, 0) + 1
def _prepare_params(t, key_cache, idx, logger):
    """
    Build the keyword arguments for technique ``t``.

    Starts from ``t.model_dump()`` without the routing fields "type" and
    "pii". For "encrypt", ``key_name`` is replaced by the key itself under
    "key"; keys are fetched once per name and memoised in ``key_cache``.

    Raises:
        RuntimeError: if anything goes wrong while preparing the encryption key.
    """
    params = t.model_dump()
    params.pop("type")
    params.pop("pii")

    if t.type != "encrypt":
        return params

    try:
        if t.key_name not in key_cache:
            logger.debug(
                f"[{idx}] Retrieving/generating Vault key name={t.key_name} for encryption"
            )
            key_cache[t.key_name] = create_get_encryption_key("encrypt", t.key_name)
        params["key"] = key_cache[t.key_name]
        params.pop("key_name")
        logger.debug(f"[{idx}] Encryption key prepared")
    except Exception as e:
        raise RuntimeError(
            f"Encryption key retrieval failed for key '{t.key_name}': {type(e).__name__}"
        ) from e
    return params
def _apply_technique(original_value, t_type, params, pii_enum, idx, logger):
    """
    Apply technique ``t_type`` to ``original_value`` and return the replacement text.

    The callable is looked up by name on the technique module. If no such
    callable exists, a warning is logged and a ``{UNIMPL_<type>_<PII>}``
    placeholder is returned. Results of "encrypt" are wrapped as
    ``{encrypt:<token>}``.

    Raises:
        RuntimeError: if the technique itself fails. This now includes an
            AttributeError raised inside the technique, which used to be
            mistaken for "technique not recognized" and silently turned into
            a placeholder.
    """
    # Only the lookup may legitimately fail with AttributeError.
    try:
        func = getattr(anon_pseudo_funcs, t_type)
    except AttributeError:
        logger.warning(f"[{idx}] Technique '{t_type}' not recognized; inserting placeholder.")
        return f"{{UNIMPL_{t_type}_{pii_enum.name}}}"

    try:
        replacement = func(original_value, **params)
        if t_type == "encrypt":
            replacement = f"{{encrypt:{replacement}}}"
        logger.debug(f"[{idx}] {t_type.capitalize()} complete")
        return replacement
    except Exception as e:
        raise RuntimeError(
            f"Technique '{t_type}' failed for PII type '{pii_enum.name}': {type(e).__name__}"
        ) from e
def _apply_replacements(text, replacements, logger):
    """
    Return ``text`` with every ``(start, end, replacement)`` span substituted.

    ``replacements`` is sorted in place by start offset. Overlaps are resolved
    in favour of the earlier span:

    * a span that partly overlaps the previous one is trimmed to begin where
      the previous one ended;
    * a span lying entirely inside already-replaced text is dropped.

    Dropping contained spans fixes a leak: they used to move the output cursor
    backwards, so original characters (including PII) that the earlier span
    had already covered were written out again.
    """
    if not replacements:
        logger.info("No PII detected; returning original text.")
        return text

    logger.debug(f"Applying {len(replacements)} replacements to text body.")
    replacements.sort(key=lambda r: r[0])

    result_parts = []
    last = 0  # end offset of the text consumed so far; never moves backwards
    prev_span = (0, 0)
    for start, end, repl in replacements:
        if start < last:
            logger.warning(
                f"Overlapping PII detected at positions "
                f"({prev_span[0]},{prev_span[1]}) "
                f"and ({start},{end}). "
                f"Using first match."
            )
            if end <= last:
                # Entirely inside text that has already been replaced.
                continue
            start = last
        result_parts.append(text[last:start])
        result_parts.append(repl)
        last = end
        prev_span = (start, end)
    result_parts.append(text[last:])
    return "".join(result_parts)

View File

@@ -1,32 +0,0 @@
import os
import hvac
from hvac.exceptions import InvalidPath
from cryptography.fernet import Fernet
def create_get_encryption_key(func_name: str, key_name: str) -> bytes:
    """
    Fetch the Fernet key ``key_name`` from the KV v2 secret store, creating it on demand.

    Connection and location come from the environment: OPENBAO_URL,
    OPENBAO_TOKEN, ENCRYPTION_KEYS_PATH (optional folder prefix) and
    ENCRYPTION_KEYS_MOUNT_POINT. When the secret path does not exist and
    ``func_name`` is "encrypt", a new key is generated, stored and returned.

    Raises:
        ValueError: if the key is missing for any other ``func_name``, or if
            reading it fails for another reason.
    """
    client = hvac.Client(url=os.getenv("OPENBAO_URL"), token=os.getenv("OPENBAO_TOKEN"))
    secret_folder = os.getenv("ENCRYPTION_KEYS_PATH")
    if secret_folder:
        secret_path = f"{secret_folder}/{key_name}"
    else:
        secret_path = key_name
    mount_point = os.getenv("ENCRYPTION_KEYS_MOUNT_POINT")

    try:
        response = client.secrets.kv.v2.read_secret_version(
            path=secret_path, mount_point=mount_point
        )
        key_value = response["data"]["data"]["value"]
    except InvalidPath:
        # Only the encrypt side may mint a key; a missing key on any other
        # path is an error.
        if func_name != "encrypt":
            raise ValueError(f"Fernet key '{key_name}' not found in Vault for decrypt.")
        generated = Fernet.generate_key().decode()
        client.secrets.kv.v2.create_or_update_secret(
            path=secret_path, mount_point=mount_point, secret={"value": generated}
        )
        key_value = generated
    except Exception as e:
        raise ValueError(f"Error while reading Fernet key '{key_name}': {e}")
    return key_value.encode()

View File

@@ -8,7 +8,7 @@ from util_services.sensors import (
from util_services.custom_json_logger import simpl_json_logger from util_services.custom_json_logger import simpl_json_logger
# Data processing jobs # Data processing jobs
from template_code_location.data_processing.jobs import ( from data_processing.jobs import (
remove_duplicates_job_s3, remove_duplicates_job_s3,
fill_missing_values_job_s3, fill_missing_values_job_s3,
standardize_categorical_values_job_s3, standardize_categorical_values_job_s3,
@@ -21,7 +21,7 @@ from template_code_location.data_processing.jobs import (
) )
# Dataframe-level anonymisation jobs # Dataframe-level anonymisation jobs
from template_code_location.dataframe_level_anonymisation.jobs import ( from dataframe_level_anonymisation.jobs import (
k_anonymity_job_s3, k_anonymity_job_s3,
l_diversity_job_s3, l_diversity_job_s3,
t_closeness_job_s3, t_closeness_job_s3,
@@ -29,7 +29,7 @@ from template_code_location.dataframe_level_anonymisation.jobs import (
) )
# Field-level pseudo-anonymisation jobs # Field-level pseudo-anonymisation jobs
from template_code_location.field_level_pseudo_anonymisation.jobs import ( from field_level_pseudo_anonymisation.jobs import (
anonymise_pseudonymise_structured_job_s3, anonymise_pseudonymise_structured_job_s3,
depseudonymise_structured_job_s3, depseudonymise_structured_job_s3,
anonymise_pseudonymise_unstructured_job_s3, anonymise_pseudonymise_unstructured_job_s3,

View File

@@ -1 +0,0 @@

View File

@@ -1 +0,0 @@

View File

@@ -1,53 +0,0 @@
"""Pytest configuration and shared fixtures."""
import pytest
import pandas as pd
from unittest.mock import MagicMock, patch
import sys
from dagster import build_op_context
# Mock external dependencies that might not be available in test environment
sys.modules['spellchecker'] = MagicMock()
@pytest.fixture
def mock_context():
    """Provide an op context built with ``build_op_context()`` for testing operations."""
    return build_op_context()
@pytest.fixture
def sample_dataframe():
    """Provide a small DataFrame with repeated rows, mixed casing and one missing Age."""
    records = {
        'Name': ['John Doe', 'jane smith', 'John Doe', 'bob johnson', 'John Doe'],
        'Age': [25, 30, 25, None, 25],
        'City': ['New York', 'los angeles', 'New York', 'chicago', 'New York'],
        'Status': ['Active', 'INACTIVE', 'Active', 'penDing', 'Active'],
    }
    return pd.DataFrame(records)
@pytest.fixture
def sample_dataframe_with_typos():
    """Provide a DataFrame whose text columns contain misspellings, for spell checking."""
    records = {
        'Name': ['jon doe', 'jane smith', 'bob jonson'],
        'Description': ['developer', 'analst', 'enginer'],
    }
    return pd.DataFrame(records)
@pytest.fixture
def empty_dataframe():
    """Provide a DataFrame with no rows and no columns."""
    frame = pd.DataFrame()
    return frame
@pytest.fixture
def dataframe_with_missing_values():
    """Provide a DataFrame with partly missing numeric and text columns and one all-missing column."""
    records = {
        'Column1': [1, None, 3, None, 5],
        'Column2': ['a', 'b', None, 'd', None],
        'Column3': [None, None, None, None, None],
    }
    return pd.DataFrame(records)

View File

@@ -1,7 +0,0 @@
"""Configuration utilities for testing."""
import os
import sys
# Add src directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))

View File

@@ -1,202 +0,0 @@
"""Unit tests for configuration models."""
import pytest
from pydantic import ValidationError
from template_code_location.data_processing.config_models import (
FillMissingConfiguration,
ColumnsSelectConfiguration,
SpellCheckConfiguration,
AggregationConfiguration
)
class TestColumnsSelectConfiguration:
    """Tests for ColumnsSelectConfiguration."""

    def test_default_columns(self):
        """Test default columns configuration."""
        assert ColumnsSelectConfiguration().columns == ['Name']

    def test_custom_columns(self):
        """Test custom columns configuration."""
        requested = ['Col1', 'Col2', 'Col3']
        config = ColumnsSelectConfiguration(columns=requested)
        assert config.columns == ['Col1', 'Col2', 'Col3']

    def test_empty_columns_list(self):
        """Test with empty columns list."""
        config = ColumnsSelectConfiguration(columns=[])
        assert config.columns == []

    def test_single_column(self):
        """Test with a single column."""
        config = ColumnsSelectConfiguration(columns=['SingleCol'])
        assert config.columns == ['SingleCol']

    def test_columns_with_special_characters(self):
        """Test columns with special characters."""
        requested = ['Col-1', 'Col_2', 'Col.3']
        config = ColumnsSelectConfiguration(columns=requested)
        assert config.columns == ['Col-1', 'Col_2', 'Col.3']

    def test_duplicate_columns_are_removed(self):
        """Verify that duplicates are removed while preserving order (thanks to dict.fromkeys)."""
        config = ColumnsSelectConfiguration(columns=['A', 'B', 'A', 'C', 'B'])
        assert config.columns == ['A', 'B', 'C']

    def test_duplicate_default_behavior(self):
        """Verify that extreme inputs are also handled correctly."""
        config = ColumnsSelectConfiguration(columns=['Name', 'Name', 'Name'])
        assert config.columns == ['Name']
class TestFillMissingConfiguration:
    """Checks for the ``fill_map`` field of FillMissingConfiguration."""

    def test_default_fill_map(self):
        """A config built without arguments maps 'Age' to 'UNKNOWN_AGE'."""
        cfg = FillMissingConfiguration()
        assert cfg.fill_map == {'Age': 'UNKNOWN_AGE'}

    def test_custom_fill_map(self):
        """A caller-supplied mapping is exposed unchanged."""
        mapping = {'Age': '0', 'Name': 'UNKNOWN', 'City': 'N/A'}
        cfg = FillMissingConfiguration(fill_map=mapping)
        assert cfg.fill_map == mapping

    def test_empty_fill_map(self):
        """An empty mapping is accepted and kept empty."""
        cfg = FillMissingConfiguration(fill_map={})
        assert cfg.fill_map == {}

    def test_fill_map_with_numeric_values(self):
        """Replacement values that look numeric are kept as strings."""
        mapping = {'Age': '0', 'Score': '-1', 'Count': '999'}
        cfg = FillMissingConfiguration(fill_map=mapping)
        assert cfg.fill_map == mapping

    def test_fill_map_with_string_values(self):
        """Plain textual replacement values are exposed unchanged."""
        mapping = {'Name': 'Unknown', 'Email': 'no-email'}
        cfg = FillMissingConfiguration(fill_map=mapping)
        assert cfg.fill_map == mapping

    def test_fill_map_mixed_types(self):
        """Integer-like, textual and float-like strings coexist in one mapping."""
        mapping = {'IntCol': '0', 'StrCol': 'Unknown', 'FloatCol': '0.0'}
        cfg = FillMissingConfiguration(fill_map=mapping)
        assert cfg.fill_map == mapping
class TestSpellCheckConfiguration:
    """Checks for SpellCheckConfiguration's ``columns`` and ``language`` fields."""

    def test_default_spell_check_config(self):
        """Defaults are columns ``['Name']`` and language ``'en'``."""
        cfg = SpellCheckConfiguration()
        assert cfg.columns == ['Name']
        assert cfg.language == 'en'

    def test_custom_spell_check_config(self):
        """Explicit columns and language are both exposed as given."""
        cfg = SpellCheckConfiguration(columns=['Description', 'Notes'], language='es')
        assert cfg.columns == ['Description', 'Notes']
        assert cfg.language == 'es'

    def test_spell_check_all_languages(self):
        """Each of the seven language codes is accepted, checked in a loop."""
        for code in ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl']:
            cfg = SpellCheckConfiguration(language=code)
            assert cfg.language == code

    def test_spell_check_invalid_language(self):
        """An unknown language code is rejected with a ValidationError."""
        with pytest.raises(ValidationError):
            SpellCheckConfiguration(language='invalid')

    def test_spell_check_multiple_columns(self):
        """A four-column list is exposed unchanged."""
        wanted = ['Col1', 'Col2', 'Col3', 'Col4']
        cfg = SpellCheckConfiguration(columns=wanted)
        assert cfg.columns == wanted

    def test_spell_check_empty_columns(self):
        """An empty column list is accepted; the language keeps its default."""
        cfg = SpellCheckConfiguration(columns=[])
        assert cfg.columns == []
        assert cfg.language == 'en'

    def test_spell_check_inheritance(self):
        """Instances are ColumnsSelectConfiguration objects carrying both fields."""
        cfg = SpellCheckConfiguration()
        assert isinstance(cfg, ColumnsSelectConfiguration)
        for attribute in ('columns', 'language'):
            assert hasattr(cfg, attribute)

    @pytest.mark.parametrize("language", ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl'])
    def test_spell_check_languages_parametrized(self, language):
        """Each language code is accepted, one pytest case per code."""
        cfg = SpellCheckConfiguration(language=language)
        assert cfg.language == language
class TestAggregationConfiguration:
    """Checks for AggregationConfiguration's ``columns`` and ``operation`` fields."""

    def test_aggregation_default_config(self):
        """Defaults are columns ``['Name']`` and operation ``'sum'``."""
        cfg = AggregationConfiguration()
        assert cfg.columns == ['Name']
        assert cfg.operation == 'sum'

    @pytest.mark.parametrize("op", ["sum", "mean", "min", "max", "count"])
    def test_aggregation_valid_operations(self, op):
        """Each listed operation name is accepted and exposed unchanged."""
        cfg = AggregationConfiguration(operation=op)
        assert cfg.operation == op

    def test_aggregation_invalid_operation(self):
        """An unknown operation raises ValidationError with a descriptive message."""
        with pytest.raises(ValidationError) as raised:
            AggregationConfiguration(operation="invalid_op")
        # The message is inspected only after the context manager has captured
        # the exception; inside the block this line would never execute.
        assert "Invalid aggregation operation 'invalid_op'" in str(raised.value)

    def test_aggregation_custom_columns(self):
        """Explicit columns and operation are both exposed as given."""
        cfg = AggregationConfiguration(columns=['Price', 'Quantity'], operation='mean')
        assert cfg.columns == ['Price', 'Quantity']
        assert cfg.operation == 'mean'

    def test_aggregation_inheritance(self):
        """Instances are ColumnsSelectConfiguration objects carrying both fields."""
        cfg = AggregationConfiguration()
        assert isinstance(cfg, ColumnsSelectConfiguration)
        for attribute in ('columns', 'operation'):
            assert hasattr(cfg, attribute)

    def test_aggregation_model_dump(self):
        """``model_dump()`` includes the ``columns`` and ``operation`` entries."""
        cfg = AggregationConfiguration(columns=['Value'], operation='max')
        dumped = cfg.model_dump()
        assert dumped['columns'] == ['Value']
        assert dumped['operation'] == 'max'

View File

@@ -1,185 +0,0 @@
"""Integration tests for data processing jobs."""
import pytest
import pandas as pd
from unittest.mock import patch, MagicMock
from template_code_location.data_processing.ops import (
remove_duplicates,
fill_missing_values,
standardize_categorical_values,
correct_typos
)
from template_code_location.data_processing.config_models import (
FillMissingConfiguration,
ColumnsSelectConfiguration,
SpellCheckConfiguration
)
class TestPipelineIntegration:
    """Chains several data-processing ops and checks the combined outcome."""

    def test_pipeline_remove_duplicates_then_standardize(self, mock_context):
        """Deduplicate first, then standardize the Name and City columns."""
        raw = pd.DataFrame({
            'Name': [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson'],
            'City': ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago'],
        })

        # Stage 1: rows 0 and 2 are identical, so three rows are expected.
        deduped = remove_duplicates(mock_context, raw)
        assert deduped.shape[0] == 3

        # Stage 2: the first row is expected lower-cased and stripped.
        std_config = ColumnsSelectConfiguration(columns=['Name', 'City'])
        standardized = standardize_categorical_values(mock_context, std_config, deduped)
        first_row = standardized.iloc[0]
        assert first_row['Name'] == 'john doe'
        assert first_row['City'] == 'new york'

    def test_pipeline_fill_missing_then_standardize(self, mock_context):
        """Fill gaps in Value first, then standardize Category."""
        raw = pd.DataFrame({
            'Category': [' ACTIVE ', None, ' PENDING '],
            'Value': ['1', '2', None],
        })

        # Stage 1: replace the missing Value with '0'.
        filled = fill_missing_values(
            mock_context, FillMissingConfiguration(fill_map={'Value': '0'}), raw
        )

        # Stage 2: standardize only the Category column.
        standardized = standardize_categorical_values(
            mock_context, ColumnsSelectConfiguration(columns=['Category']), filled
        )

        assert standardized['Category'].iloc[0] == 'active'
        assert filled['Value'].iloc[2] == '0'

    def test_pipeline_all_operations(self, mock_context):
        """Run deduplication, gap filling and standardization back to back."""
        frame = pd.DataFrame({
            'Name': [' john doe ', 'JANE SMITH', ' john doe ', None],
            'Value': ['1', None, '1', '2'],
        })

        # Stage 1: rows 0 and 2 are identical.
        frame = remove_duplicates(mock_context, frame)
        assert frame.shape[0] == 3

        # Stage 2: no missing Value may remain afterwards.
        filler = FillMissingConfiguration(fill_map={'Value': '0'})
        frame = fill_missing_values(mock_context, filler, frame)
        assert frame['Value'].isna().sum() == 0

        # Stage 3: the first Name is expected lower-cased and stripped.
        selector = ColumnsSelectConfiguration(columns=['Name'])
        frame = standardize_categorical_values(mock_context, selector, frame)
        assert frame['Name'].iloc[0] == 'john doe'

    def test_pipeline_with_large_dataset(self, mock_context):
        """Deduplicate a 1000-row frame to which 100 repeated rows were appended."""
        size = 1000
        full_cycles, leftover = divmod(size, 3)
        base = pd.DataFrame({
            'ID': list(range(size)),
            'Name': ['User_' + str(i % 50) for i in range(size)],
            'Status': ['ACTIVE', 'INACTIVE', 'PENDING'] * full_cycles + ['ACTIVE'] * leftover,
            'Score': [i % 100 for i in range(size)],
        })

        # Append copies of the first 100 rows so there is something to remove.
        with_repeats = pd.concat([base, base.head(100)], ignore_index=True)

        cleaned = remove_duplicates(mock_context, with_repeats)
        assert cleaned.shape[0] == 1000
        assert cleaned.shape[1] == 4
class TestErrorHandling:
    """Edge cases: special floats, custom indexes, unicode and multi-key fills."""

    def test_operation_with_corrupted_data(self, mock_context):
        """NaN and +/- infinity in a column must not leave the result empty."""
        special_values = [float('nan'), float('inf'), -float('inf'), 0, 1, 2]
        frame = pd.DataFrame({'Col': special_values})

        deduped = remove_duplicates(mock_context, frame)
        assert deduped.shape[0] > 0

    def test_operation_preserves_index(self, mock_context):
        """A frame with a string index still loses exactly one duplicate row."""
        frame = pd.DataFrame({'Col': [1, 2, 1, 3]}, index=['a', 'b', 'c', 'd'])

        deduped = remove_duplicates(mock_context, frame)
        # Only the row count is checked because the index may have been reset.
        assert deduped.shape[0] == 3

    def test_standardize_with_unicode_characters(self, mock_context):
        """Accented names pass through standardization without losing rows."""
        frame = pd.DataFrame({'Name': ['José', 'François', 'Müller']})
        selector = ColumnsSelectConfiguration(columns=['Name'])

        standardized = standardize_categorical_values(mock_context, selector, frame)
        assert standardized.shape[0] == 3

    def test_fill_with_same_key_multiple_times(self, mock_context):
        """Every column named in a three-entry fill_map gets its own replacement."""
        frame = pd.DataFrame({
            'A': ['1', None, '3'],
            'B': [None, None, 'c'],
            'C': [None, '2', None],
        })
        replacements = {'A': '-1', 'B': 'EMPTY', 'C': '0'}
        filler = FillMissingConfiguration(fill_map=replacements)

        filled = fill_missing_values(mock_context, filler, frame)
        assert filled.loc[1, 'A'] == '-1'
        assert filled.loc[0, 'B'] == 'EMPTY'
        assert filled.loc[0, 'C'] == '0'
class TestDataTypePreservation:
    """Verifies that ops leave column dtypes untouched where expected."""

    def test_remove_duplicates_preserves_dtypes(self, mock_context):
        """int32 and float64 columns keep their dtype after deduplication."""
        frame = pd.DataFrame({
            'int32': pd.array([1, 2, 1], dtype='int32'),
            'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'),
            'str': ['a', 'b', 'a'],
        })

        deduped = remove_duplicates(mock_context, frame)
        assert deduped['int32'].dtype == frame['int32'].dtype
        assert deduped['float64'].dtype == frame['float64'].dtype

    def test_fill_missing_preserves_column_types_where_possible(self, mock_context):
        """Filling column A must not alter the dtype of the untouched column B."""
        frame = pd.DataFrame({
            'A': pd.array(['1', None, '3'], dtype='string'),
            'B': ['x', 'y', 'z'],
        })
        filler = FillMissingConfiguration(fill_map={'A': '0'})

        filled = fill_missing_values(mock_context, filler, frame)
        assert filled['A'].loc[1] == '0'
        assert filled['B'].dtype == frame['B'].dtype

View File

@@ -1,56 +0,0 @@
from template_code_location.data_processing.jobs import (
remove_duplicates_job_s3,
fill_missing_values_job_s3,
standardize_categorical_values_job_s3,
correct_typos_job_s3,
normalize_numeric_min_max_job_s3,
normalize_datetime_job_s3,
normalize_coordinates_job_s3,
add_global_aggregations_job_s3
)
def test_remove_duplicates_job_s3_is_callable():
    """remove_duplicates_job_s3 is callable and exposes ``execute_in_process``."""
    job = remove_duplicates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_fill_missing_values_job_s3_is_callable():
    """fill_missing_values_job_s3 is callable and exposes ``execute_in_process``."""
    job = fill_missing_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_standardize_categorical_values_job_s3_is_callable():
    """standardize_categorical_values_job_s3 is callable and exposes ``execute_in_process``."""
    job = standardize_categorical_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_correct_typos_job_s3_is_callable():
    """correct_typos_job_s3 is callable and exposes ``execute_in_process``."""
    job = correct_typos_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_normalize_numeric_min_max_job_s3_is_callable():
    """normalize_numeric_min_max_job_s3 is callable and exposes ``execute_in_process``."""
    job = normalize_numeric_min_max_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_normalize_datetime_job_s3_is_callable():
    """normalize_datetime_job_s3 is callable and exposes ``execute_in_process``."""
    job = normalize_datetime_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_normalize_coordinates_job_s3_is_callable():
    """normalize_coordinates_job_s3 is callable and exposes ``execute_in_process``."""
    job = normalize_coordinates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_add_global_aggregations_job_s3_is_callable():
    """add_global_aggregations_job_s3 is callable and exposes ``execute_in_process``."""
    job = add_global_aggregations_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')

View File

@@ -1,700 +0,0 @@
"""Unit tests for data processing operations."""
import pytest
import pandas as pd
from template_code_location.data_processing.ops import (
remove_duplicates,
fill_missing_values,
standardize_categorical_values,
correct_typos,
normalize_datetime,
normalize_numeric_min_max,
normalize_coordinates,
add_global_aggregations
)
from template_code_location.data_processing.config_models import (
FillMissingConfiguration,
ColumnsSelectConfiguration,
SpellCheckConfiguration,
AggregationConfiguration,
CoordinatesNormalizationConfiguration
)
class TestRemoveDuplicates:
    """Exercises the remove_duplicates op on small in-memory frames."""

    def test_remove_duplicates_basic(self, mock_context, sample_dataframe):
        """The shared sample frame shrinks to three rows once duplicates are dropped."""
        deduped = remove_duplicates(mock_context, sample_dataframe)
        # The expected count of three comes from the sample_dataframe fixture,
        # which is defined outside this module.
        assert deduped.shape[0] == 3
        assert len(deduped) < len(sample_dataframe)

    def test_remove_duplicates_no_duplicates(self, mock_context):
        """A frame without repeated rows comes back equal to the input."""
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})

        deduped = remove_duplicates(mock_context, frame)
        assert deduped.shape[0] == 3
        pd.testing.assert_frame_equal(deduped, frame)

    def test_remove_duplicates_all_duplicates(self, mock_context):
        """Three identical rows collapse into a single row."""
        frame = pd.DataFrame({'A': [1, 1, 1], 'B': ['x', 'x', 'x']})

        deduped = remove_duplicates(mock_context, frame)
        assert deduped.shape[0] == 1

    def test_remove_duplicates_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame stays empty in both dimensions."""
        deduped = remove_duplicates(mock_context, empty_dataframe)
        row_count, column_count = deduped.shape[0], deduped.shape[1]
        assert row_count == 0
        assert column_count == 0

    def test_remove_duplicates_preserves_data_types(self, mock_context):
        """Integer, string and float columns keep their dtypes."""
        frame = pd.DataFrame({
            'int_col': [1, 2, 1],
            'str_col': ['a', 'b', 'a'],
            'float_col': [1.5, 2.5, 1.5],
        })

        deduped = remove_duplicates(mock_context, frame)
        for column in ('int_col', 'str_col', 'float_col'):
            assert deduped[column].dtype == frame[column].dtype
class TestFillMissingValues:
    """Exercises the fill_missing_values op with various fill maps."""

    def test_fill_missing_values_basic(self, mock_context, dataframe_with_missing_values):
        """Both columns named in the fill map end up without missing values."""
        filler = FillMissingConfiguration(fill_map={'Column1': '0', 'Column2': 'N/A'})
        filled = fill_missing_values(mock_context, filler, dataframe_with_missing_values)

        # Neither filled column may still contain a NaN.
        for column in ('Column1', 'Column2'):
            assert filled[column].isna().sum() == 0

    def test_fill_missing_values_with_different_values(self, mock_context):
        """Each column receives the replacement configured for it."""
        frame = pd.DataFrame({'A': [1, None, 3], 'B': [None, 'b', 'c']})
        filler = FillMissingConfiguration(fill_map={'A': '-1', 'B': 'UNKNOWN'})

        filled = fill_missing_values(mock_context, filler, frame)
        assert filled.loc[1, 'A'] == '-1'
        assert filled.loc[0, 'B'] == 'UNKNOWN'

    def test_fill_missing_values_partial_columns(self, mock_context):
        """Columns absent from the fill map keep their missing values."""
        frame = pd.DataFrame({'A': [1, None, 3], 'B': [None, 'b', 'c']})
        filler = FillMissingConfiguration(fill_map={'A': '999'})

        filled = fill_missing_values(mock_context, filler, frame)
        assert filled.loc[1, 'A'] == '999'
        # Column B was not configured, so its gap must survive.
        assert pd.isna(filled.loc[0, 'B'])

    def test_fill_missing_values_no_missing(self, mock_context):
        """A frame without gaps comes back equal to the input."""
        frame = pd.DataFrame({'A': ['1', '2', '3'], 'B': ['a', 'b', 'c']})
        filler = FillMissingConfiguration(fill_map={'A': '0'})

        filled = fill_missing_values(mock_context, filler, frame)
        pd.testing.assert_frame_equal(filled, frame)

    def test_fill_missing_values_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame combined with an empty fill map yields zero rows."""
        filler = FillMissingConfiguration(fill_map={})

        filled = fill_missing_values(mock_context, filler, empty_dataframe)
        assert filled.shape[0] == 0
class TestStandardizeCategoricalValues:
    """Exercises the standardize_categorical_values op."""

    def test_standardize_categorical_basic(self, mock_context, sample_dataframe):
        """Name, City and Status of the sample frame come back lower-cased and stripped."""
        selector = ColumnsSelectConfiguration(columns=['Name', 'City', 'Status'])
        standardized = standardize_categorical_values(mock_context, selector, sample_dataframe)

        # Spot checks against the sample_dataframe fixture defined elsewhere.
        assert standardized['Name'].iloc[0] == 'john doe'
        assert standardized['City'].iloc[1] == 'los angeles'
        assert standardized['Status'].iloc[1] == 'inactive'

    def test_standardize_categorical_single_column(self, mock_context):
        """Padding and upper case are removed from a single City column."""
        frame = pd.DataFrame({'City': [' NEW YORK ', 'LOS ANGELES', ' chicago ']})
        selector = ColumnsSelectConfiguration(columns=['City'])

        standardized = standardize_categorical_values(mock_context, selector, frame)
        cities = standardized['City']
        assert cities.iloc[0] == 'new york'
        assert cities.iloc[1] == 'los angeles'
        assert cities.iloc[2] == 'chicago'

    def test_standardize_categorical_missing_column(self, mock_context, sample_dataframe):
        """A column name absent from the frame does not stop 'Name' from being processed."""
        selector = ColumnsSelectConfiguration(columns=['NonExistent', 'Name'])

        standardized = standardize_categorical_values(mock_context, selector, sample_dataframe)
        assert standardized['Name'].iloc[0] == 'john doe'

    def test_standardize_categorical_with_missing_values(self, mock_context):
        """A missing entry becomes an empty string; the others are cleaned."""
        frame = pd.DataFrame({'Category': [' ACTIVE ', None, ' pending ']})
        selector = ColumnsSelectConfiguration(columns=['Category'])

        standardized = standardize_categorical_values(mock_context, selector, frame)
        categories = standardized['Category']
        assert categories.iloc[0] == 'active'
        assert categories.iloc[1] == ''
        assert categories.iloc[2] == 'pending'

    def test_standardize_categorical_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields zero rows even when columns are requested."""
        selector = ColumnsSelectConfiguration(columns=['A', 'B'])

        standardized = standardize_categorical_values(mock_context, selector, empty_dataframe)
        assert standardized.shape[0] == 0

    def test_standardize_categorical_numeric_columns(self, mock_context):
        """Integer values in a selected column come back as strings."""
        frame = pd.DataFrame({'NumCol': [1, 2, 3]})
        selector = ColumnsSelectConfiguration(columns=['NumCol'])

        standardized = standardize_categorical_values(mock_context, selector, frame)
        first_value = standardized['NumCol'].iloc[0]
        assert first_value == '1'
        assert isinstance(standardized['NumCol'].iloc[0], str)
class TestCorrectTypos:
    """Exercises the correct_typos op; most checks are smoke tests on row counts."""

    def test_correct_typos_basic(self, mock_context):
        """Three input names produce three output rows."""
        frame = pd.DataFrame({'Name': ['jon', 'jayne', 'bob']})
        spell_config = SpellCheckConfiguration(columns=['Name'], language='en')

        corrected = correct_typos(mock_context, spell_config, frame)
        # Only the row count is verified, not the corrected spellings.
        assert corrected.shape[0] == 3

    def test_correct_typos_missing_column(self, mock_context):
        """Requesting a column the frame lacks leaves the frame equal to the input."""
        frame = pd.DataFrame({'Name': ['jon', 'jayne']})
        spell_config = SpellCheckConfiguration(columns=['NonExistent'], language='en')

        corrected = correct_typos(mock_context, spell_config, frame)
        pd.testing.assert_frame_equal(corrected, frame)

    def test_correct_typos_with_missing_values(self, mock_context):
        """An empty string between misspelled words stays empty."""
        frame = pd.DataFrame({'Text': ['helo', '', 'wrld']})
        spell_config = SpellCheckConfiguration(columns=['Text'], language='en')

        corrected = correct_typos(mock_context, spell_config, frame)
        assert corrected.loc[1, 'Text'] == ''

    def test_correct_typos_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields zero rows."""
        spell_config = SpellCheckConfiguration(columns=['A'], language='en')

        corrected = correct_typos(mock_context, spell_config, empty_dataframe)
        assert corrected.shape[0] == 0

    def test_correct_typos_different_languages(self, mock_context):
        """English, Spanish and Italian configs each return both rows."""
        frame = pd.DataFrame({'Text': ['ciao', 'mondo']})

        for code in ['en', 'es', 'it']:
            spell_config = SpellCheckConfiguration(columns=['Text'], language=code)
            corrected = correct_typos(mock_context, spell_config, frame)
            # Completing without an exception and keeping both rows is the whole check.
            assert corrected.shape[0] == 2

    def test_correct_typos_numeric_values(self, mock_context):
        """A column of integers is processed without losing rows."""
        frame = pd.DataFrame({'Values': [123, 456, 789]})
        spell_config = SpellCheckConfiguration(columns=['Values'], language='en')

        corrected = correct_typos(mock_context, spell_config, frame)
        assert corrected.shape[0] == 3
class TestNormalizeDatetime:
    """Exercises the normalize_datetime op, which adds ``<column>_iso`` columns."""

    def test_normalize_datetime_basic(self, mock_context):
        """Two parseable timestamps are rendered as ISO strings with a 'Z' suffix."""
        frame = pd.DataFrame({'date_col': ['2023-01-01 10:00:00', '2023-12-31T23:59:59']})
        selector = ColumnsSelectConfiguration(columns=['date_col'])

        normalized = normalize_datetime(mock_context, selector, frame.copy())
        assert 'date_col_iso' in normalized.columns
        iso_values = normalized['date_col_iso']
        assert iso_values.iloc[0] == '2023-01-01T10:00:00Z'
        assert iso_values.iloc[1] == '2023-12-31T23:59:59Z'

    def test_normalize_datetime_missing_column(self, mock_context, sample_dataframe):
        """Requesting a column the frame lacks leaves the frame equal to the input."""
        selector = ColumnsSelectConfiguration(columns=['non_existent_column'])

        normalized = normalize_datetime(mock_context, selector, sample_dataframe.copy())
        pd.testing.assert_frame_equal(normalized, sample_dataframe)

    def test_normalize_datetime_unparseable_values(self, mock_context):
        """A column of non-date text does not get an ``_iso`` companion."""
        frame = pd.DataFrame({'invalid_col': ['not-a-date', 'completely-random-text']})
        selector = ColumnsSelectConfiguration(columns=['invalid_col'])

        normalized = normalize_datetime(mock_context, selector, frame.copy())
        assert 'invalid_col_iso' not in normalized.columns

    def test_normalize_datetime_mixed_and_nulls(self, mock_context):
        """Valid dates are converted; missing and invalid entries become empty strings."""
        frame = pd.DataFrame({'mixed_col': ['2023-05-01', None, 'invalid-date']})
        selector = ColumnsSelectConfiguration(columns=['mixed_col'])

        normalized = normalize_datetime(mock_context, selector, frame.copy())
        assert 'mixed_col_iso' in normalized.columns
        iso_values = normalized['mixed_col_iso']
        assert iso_values.iloc[0] == '2023-05-01T00:00:00Z'
        assert iso_values.iloc[1] == ""
        assert iso_values.iloc[2] == ""

    def test_normalize_datetime_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame stays empty."""
        selector = ColumnsSelectConfiguration(columns=['some_col'])

        normalized = normalize_datetime(mock_context, selector, empty_dataframe)
        assert normalized.empty

    def test_normalize_datetime_epoch_only(self, mock_context, capsys):
        """Zero-like values are skipped and a warning about 1970-01-01 reaches stderr."""
        frame = pd.DataFrame({'weird_col': ['0', 0, '0000', '']})
        selector = ColumnsSelectConfiguration(columns=['weird_col'])

        normalized = normalize_datetime(mock_context, selector, frame.copy())
        assert 'weird_col_iso' not in normalized.columns

        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text

    def test_normalize_datetime_all_1970_skipped(self, mock_context, capsys):
        """Timestamps that all fall on 1970-01-01 are skipped with a stderr warning."""
        frame = pd.DataFrame({'ts_col': ['1970-01-01 05:30:00', '1970-01-01 12:00:00']})
        selector = ColumnsSelectConfiguration(columns=['ts_col'])

        normalized = normalize_datetime(mock_context, selector, frame.copy())
        assert 'ts_col_iso' not in normalized.columns

        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text

    def test_normalize_datetime_integer_age_column_skipped(self, mock_context, capsys):
        """A plain integer column such as 'age' is skipped with the same stderr warning."""
        frame = pd.DataFrame({'age': [66, 45, 40, 43, 20, 26, 69, 21, 46]})
        selector = ColumnsSelectConfiguration(columns=['age'])

        normalized = normalize_datetime(mock_context, selector, frame.copy())
        assert 'age_iso' not in normalized.columns

        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text
class TestNormalizeNumericMinMax:
    """Exercises the normalize_numeric_min_max op, which adds ``<column>_norm`` columns."""

    def test_normalize_numeric_basic(self, mock_context):
        """Five evenly spaced scores map onto the range 0.0 to 1.0."""
        frame = pd.DataFrame({'score': [10, 20, 30, 40, 50]})
        selector = ColumnsSelectConfiguration(columns=['score'])

        normalized = normalize_numeric_min_max(mock_context, selector, frame.copy())
        assert 'score_norm' in normalized.columns
        scaled = normalized['score_norm']
        assert scaled.min() == 0.0
        assert scaled.max() == 1.0
        # The middle value of the evenly spaced input lands exactly halfway.
        assert scaled.iloc[2] == 0.5

    def test_normalize_numeric_missing_column(self, mock_context):
        """A column the frame lacks produces no ``_norm`` column."""
        frame = pd.DataFrame({'existing': [1, 2, 3]})
        selector = ColumnsSelectConfiguration(columns=['missing_col'])

        normalized = normalize_numeric_min_max(mock_context, selector, frame.copy())
        assert 'missing_col_norm' not in normalized.columns

    def test_normalize_numeric_constant_values(self, mock_context):
        """A constant column (min equals max) produces no ``_norm`` column."""
        frame = pd.DataFrame({'constant': [10, 10, 10]})
        selector = ColumnsSelectConfiguration(columns=['constant'])

        normalized = normalize_numeric_min_max(mock_context, selector, frame.copy())
        assert 'constant_norm' not in normalized.columns

    def test_normalize_numeric_with_nans(self, mock_context):
        """Present values are scaled around a gap, and the gap stays missing."""
        frame = pd.DataFrame({'with_nans': [10, None, 50]})
        selector = ColumnsSelectConfiguration(columns=['with_nans'])

        normalized = normalize_numeric_min_max(mock_context, selector, frame.copy())
        assert 'with_nans_norm' in normalized.columns
        scaled = normalized['with_nans_norm']
        assert scaled.iloc[0] == 0.0
        assert scaled.iloc[2] == 1.0
        assert pd.isna(scaled.iloc[1])

    def test_normalize_numeric_multiple_columns(self, mock_context):
        """Two requested columns each receive their own ``_norm`` column."""
        frame = pd.DataFrame({'A': [1, 2], 'B': [10, 20]})
        selector = ColumnsSelectConfiguration(columns=['A', 'B'])

        normalized = normalize_numeric_min_max(mock_context, selector, frame.copy())
        for expected_column in ('A_norm', 'B_norm'):
            assert expected_column in normalized.columns
class TestNormalizeCoordinates:
"""Tests for the normalize_coordinates operation."""
def test_normalize_coordinates_basic(self, mock_context):
"""Test rounding and basic coordinate normalization."""
df = pd.DataFrame({
'lat': [45.123456, 46.0],
'lon': [9.123456, 10.0]
})
config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
result = normalize_coordinates(mock_context, config, df.copy())
assert result['lat'].iloc[0] == 45.1235
assert result['lon'].iloc[0] == 9.1235
assert len(result) == 2
def test_normalize_coordinates_filtering(self, mock_context):
"""Test filtering of out-of-range coordinates."""
df = pd.DataFrame({
'lat': [45.0, 100.0, -91.0, 0.0], # 100 e -91 sono out of range
'lon': [9.0, 0.0, 0.0, 200.0] # 200 è out of range
})
config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 1
assert result['lat'].iloc[0] == 45.0
def test_normalize_coordinates_invalid_types(self, mock_context):
"""Test conversion of strings to numeric and handling of NaNs."""
df = pd.DataFrame({
'lat': ["45.5", "invalid", None],
'lon': ["9.5", "10.0", "11.0"]
})
config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 1
assert isinstance(result['lat'].iloc[0], float)
def test_normalize_coordinates_empty_df(self, mock_context, empty_dataframe):
"""Test with an empty DataFrame."""
df = pd.DataFrame(columns=['lat', 'lon'])
config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')
result = normalize_coordinates(mock_context, config, df)
assert len(result) == 0
assert result.empty
def test_normalize_coordinates_default_config(self, mock_context):
"""Test that normalize_coordinates uses default 'lat'/'lon' columns when no config is provided."""
df = pd.DataFrame({
'lat': [45.123456, 46.0],
'lon': [9.123456, 10.0]
})
config = CoordinatesNormalizationConfiguration()
result = normalize_coordinates(mock_context, config, df.copy())
assert result['lat'].iloc[0] == 45.1235
assert result['lon'].iloc[0] == 9.1235
assert len(result) == 2
def test_normalize_coordinates_null_config_values(self, mock_context):
"""Test that null lat/lon column names fall back to defaults ('lat'/'lon')."""
df = pd.DataFrame({
'lat': [45.123456, 46.0],
'lon': [9.123456, 10.0]
})
config = CoordinatesNormalizationConfiguration(latColumn=None, lonColumn=None)
assert config.latColumn == "lat"
assert config.lonColumn == "lon"
result = normalize_coordinates(mock_context, config, df.copy())
assert result['lat'].iloc[0] == 45.1235
assert result['lon'].iloc[0] == 9.1235
assert len(result) == 2
def test_normalize_coordinates_dms_degree_symbol(self, mock_context):
"""Test DMS parsing with degree/minute/second symbols like 40°26'46\"N."""
df = pd.DataFrame({
'lat': ["40°26'46\"N", "51°30'26\"N"],
'lon': ["79°58'56\"W", "0°7'39\"W"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 2
# 40°26'46"N ≈ 40.4461
assert abs(result['lat'].iloc[0] - 40.4461) < 0.001
# 79°58'56"W ≈ -79.9822
assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001
def test_normalize_coordinates_dms_spaced_format(self, mock_context):
"""Test DMS parsing with space-separated format like '40 26 46 N'."""
df = pd.DataFrame({
'lat': ["40 26 46 N"],
'lon': ["79 58 56 W"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 1
assert abs(result['lat'].iloc[0] - 40.4461) < 0.001
assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001
def test_normalize_coordinates_dms_already_decimal(self, mock_context):
"""Test that string columns with decimal values are auto-parsed correctly."""
df = pd.DataFrame({
'lat': ["45.5", "46.0"],
'lon': ["9.5", "10.0"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
assert len(result) == 2
assert result['lat'].iloc[0] == 45.5
assert result['lon'].iloc[0] == 9.5
def test_normalize_coordinates_dms_mixed_valid_invalid(self, mock_context):
"""Test auto-detection with a mix of valid DMS, valid decimal, and unparseable values."""
df = pd.DataFrame({
'lat': ["40°26'46\"N", "not_a_coord", "51.5"],
'lon': ["79°58'56\"W", "10.0", "0.1"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
# Row with "not_a_coord" for lat should be dropped (NaN lat)
assert len(result) == 2
def test_normalize_coordinates_dms_out_of_range(self, mock_context):
"""Test that DMS-parsed coordinates outside valid range are filtered out."""
df = pd.DataFrame({
'lat': ["91°0'0\"N", "45°0'0\"N"],
'lon': ["0°0'0\"E", "9°0'0\"E"]
})
config = CoordinatesNormalizationConfiguration(
latColumn='lat', lonColumn='lon'
)
result = normalize_coordinates(mock_context, config, df.copy())
# First row has lat=91° which is out of [-90, 90]
assert len(result) == 1
assert abs(result['lat'].iloc[0] - 45.0) < 0.001
def test_normalize_coordinates_dms_south_and_east(self, mock_context):
    """Southern latitudes parse to negative values and eastern longitudes to positive ones."""
    frame = pd.DataFrame({'lat': ["33°51'54\"S"], 'lon': ["151°12'36\"E"]})
    cfg = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    normalized = normalize_coordinates(mock_context, cfg, frame.copy())

    assert len(normalized) == 1

    # 33°51'54"S ≈ -33.865
    lat_value = normalized['lat'].iloc[0]
    assert lat_value < 0
    assert abs(lat_value - (-33.865)) < 0.001

    # 151°12'36"E ≈ 151.21
    lon_value = normalized['lon'].iloc[0]
    assert lon_value > 0
    assert abs(lon_value - 151.21) < 0.01
def test_normalize_coordinates_autodetect_numeric_vs_dms(self, mock_context):
    """The same configuration handles both numeric columns and DMS string columns."""
    cfg = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    # Numeric input: the first latitude is expected back as 45.1235 and no row is dropped.
    numeric_frame = pd.DataFrame({'lat': [45.123456, 46.0], 'lon': [9.123456, 10.0]})
    numeric_out = normalize_coordinates(mock_context, cfg, numeric_frame.copy())
    assert numeric_out['lat'].iloc[0] == 45.1235
    assert len(numeric_out) == 2

    # DMS string input: the value is expected to be parsed into decimal degrees.
    dms_frame = pd.DataFrame({'lat': ["40°26'46\"N"], 'lon': ["79°58'56\"W"]})
    dms_out = normalize_coordinates(mock_context, cfg, dms_frame.copy())
    assert len(dms_out) == 1
    assert abs(dms_out['lat'].iloc[0] - 40.4461) < 0.001
class TestAddGlobalAggregations:
    """Tests for the add_global_aggregations operation."""

    def test_add_global_aggregations_success(self, mock_context):
        """Grouping by 'category' with 'sum' gives one row per group holding the summed value."""
        source = pd.DataFrame({
            'category': ['A', 'A', 'B'],
            'value': [10, 20, 100],
            'ignored_str': ['x', 'y', 'z'],
        })
        cfg = AggregationConfiguration(columns=['category'], operation='sum')

        aggregated = add_global_aggregations(mock_context, cfg, source.copy())

        assert len(aggregated) == 2
        for label, expected_total in (('A', 30), ('B', 100)):
            group_values = aggregated.loc[aggregated['category'] == label, 'value'].values
            assert group_values[0] == expected_total
        assert 'ignored_str' not in aggregated.columns
        mock_context.log.info.assert_called()

    def test_add_global_aggregations_missing_column(self, mock_context):
        """A grouping column absent from the frame is reported through a warning."""
        source = pd.DataFrame({'value': [1, 2, 3]})
        cfg = AggregationConfiguration(columns=['missing_col'], operation='count')

        aggregated = add_global_aggregations(mock_context, cfg, source.copy())

        mock_context.log.warning.assert_any_call(
            "Column 'missing_col' not found, skipping aggregation."
        )
        assert len(aggregated) == 1

    def test_add_global_aggregations_unsupported_op(self, mock_context):
        """An unsupported operation name raises and logs a warning."""
        source = pd.DataFrame({'category': ['A'], 'value': [1]})
        cfg = AggregationConfiguration(columns=['category'], operation='unsupported')

        with pytest.raises(Exception):
            add_global_aggregations(mock_context, cfg, source.copy())
        # NOTE(review): the flattened source does not show whether this check sat inside
        # the `with` block; it is placed after it so that it actually executes — confirm.
        mock_context.log.warning.assert_any_call("Unsupported aggregation 'unsupported'")

    def test_add_global_aggregations_only_numeric_kept(self, mock_context):
        """Only the grouping column and numeric columns remain after aggregation."""
        source = pd.DataFrame({
            'group': ['A', 'A'],
            'num': [1, 2],
            'text': ['hello', 'world'],
        })
        cfg = AggregationConfiguration(columns=['group'], operation='mean')

        aggregated = add_global_aggregations(mock_context, cfg, source.copy())

        remaining = aggregated.columns
        assert 'text' not in remaining
        assert 'num' in remaining
        assert 'group' in remaining

View File

@@ -1,54 +0,0 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.base_config import BaseConfiguration
def test_valid_configuration_with_overrides():
    """Explicitly supplied values are stored unchanged on BaseConfiguration."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 10.0,
        "generalisation_hierarchies": {"age": "age_hierarchy"},
    }

    cfg = BaseConfiguration(**overrides)

    assert cfg.ident == ["id"]
    assert cfg.quasi_identifiers == ["age"]
    assert cfg.supp_level == 10.0
    assert cfg.generalisation_hierarchies == {"age": "age_hierarchy"}
def test_default_values_are_loaded():
    """A BaseConfiguration built without arguments exposes the expected defaults."""
    defaults = BaseConfiguration()

    assert defaults.ident == ["Name"]
    assert defaults.quasi_identifiers == ["Age"]
    assert defaults.supp_level == 50.0
    assert defaults.generalisation_hierarchies == {"Age": "simpl_age"}
def test_missing_ident_raises_error():
    """An empty `ident` list is rejected with a ValidationError."""
    empty_ident = []
    with pytest.raises(ValidationError):
        BaseConfiguration(ident=empty_ident)
def test_missing_quasi_ident_raises_error():
    """An empty `quasi_identifiers` list is rejected with a ValidationError."""
    empty_quasi_identifiers = []
    with pytest.raises(ValidationError):
        BaseConfiguration(quasi_identifiers=empty_quasi_identifiers)
def test_overlap_between_ident_and_quasi_identifiers():
    """Listing the same column as identifier and quasi-identifier is rejected."""
    shared_column = "age"
    with pytest.raises(ValidationError):
        BaseConfiguration(ident=[shared_column], quasi_identifiers=[shared_column])
def test_supp_level_bounds():
    """A suppression level of 150.0 is rejected with a ValidationError."""
    out_of_range_level = 150.0  # out of range
    with pytest.raises(ValidationError):
        BaseConfiguration(supp_level=out_of_range_level)

View File

@@ -1,48 +0,0 @@
from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import (
simpl_age,
simpl_age2,
simpl_gender,
get_all_hierarchies,
)
def test_simpl_age_structure():
    """simpl_age is a dict whose level 0 is the list of ages 0..99."""
    assert isinstance(simpl_age, dict)
    assert 0 in simpl_age

    base_level = simpl_age[0]
    assert isinstance(base_level, list)
    # The first level holds 100 ages, starting at 0 and ending at 99.
    assert len(base_level) == 100
    assert base_level[0] == 0
    assert base_level[-1] == 99
def test_simpl_age2_structure():
    """simpl_age2 is a dict with list-valued levels 0 and 1."""
    assert isinstance(simpl_age2, dict)
    for level in (0, 1):
        assert level in simpl_age2
    for level in (0, 1):
        assert isinstance(simpl_age2[level], list)
def test_simpl_gender_structure():
    """simpl_gender maps level 0 to the raw codes and level 1 to fully masked values."""
    assert isinstance(simpl_gender, dict)
    for level in (0, 1):
        assert level in simpl_gender

    raw_codes = simpl_gender[0]
    masked_codes = simpl_gender[1]
    assert raw_codes == ["M", "F", "O"]
    assert masked_codes == ["*", "*", "*"]
def test_get_all_hierarchies():
    """get_all_hierarchies returns a dict that exposes the module-level hierarchy objects."""
    registry = get_all_hierarchies()

    assert isinstance(registry, dict)

    # Every known hierarchy must be listed by name.
    for expected_name in ("simpl_age", "simpl_age2", "simpl_gender"):
        assert expected_name in registry

    # The registry hands back the very same objects, not copies.
    assert registry["simpl_age"] is simpl_age
    assert registry["simpl_gender"] is simpl_gender

View File

@@ -1,41 +0,0 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.k_anonymity_configuration import (
KAnonymityConfiguration,
)
def test_valid_k_anonymity_config_with_overrides():
    """Explicit k, sensitive attributes and hierarchies are stored unchanged."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "sensitive_attributes": ["disease"],
    }

    cfg = KAnonymityConfiguration(**overrides)

    assert cfg.k == 3
    assert cfg.sensitive_attributes == ["disease"]
    assert cfg.generalisation_hierarchies == {"age": "age_hier"}
def test_default_values_are_loaded():
    """Omitting k and sensitive_attributes falls back to k=3 and ['Disease']."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }

    cfg = KAnonymityConfiguration(**required_only)

    assert cfg.k == 3
    assert cfg.sensitive_attributes == ["Disease"]
def test_invalid_k_value_raises_error():
    """k=1 is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 1,  # invalid, must be >= 2
        "sensitive_attributes": ["disease"],
    }
    with pytest.raises(ValidationError):
        KAnonymityConfiguration(**invalid_kwargs)

View File

@@ -1,44 +0,0 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.l_diversity_configuration import (
LDiversityConfiguration,
)
def test_valid_l_diversity_config_with_overrides():
    """Explicit k, l and sensitive_attribute are stored unchanged."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "l": 2,
        "sensitive_attribute": "disease",
    }

    cfg = LDiversityConfiguration(**overrides)

    assert cfg.k == 3
    assert cfg.l == 2
    assert cfg.sensitive_attribute == "disease"
def test_default_values_are_loaded():
    """Omitting k, l and sensitive_attribute falls back to 2, 3 and 'Disease'."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }

    cfg = LDiversityConfiguration(**required_only)

    assert cfg.k == 2
    assert cfg.l == 3
    assert cfg.sensitive_attribute == "Disease"
def test_invalid_l_value_raises_error():
    """l=0 is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "l": 0,  # invalid, must be >= 1
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        LDiversityConfiguration(**invalid_kwargs)

View File

@@ -1,56 +0,0 @@
import pytest
from pydantic import ValidationError
from template_code_location.dataframe_level_anonymisation.config_models.t_closeness_configuration import (
TClosenessConfiguration,
)
def test_valid_t_closeness_config_with_overrides():
    """Explicit k, t and sensitive_attribute are stored unchanged."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": 0.4,
        "sensitive_attribute": "disease",
    }

    cfg = TClosenessConfiguration(**overrides)

    assert cfg.k == 3
    assert cfg.t == 0.4
    assert cfg.sensitive_attribute == "disease"
def test_default_values_are_loaded():
    """Omitting k, t and sensitive_attribute falls back to 2, 0.5 and 'Disease'."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }

    cfg = TClosenessConfiguration(**required_only)

    assert cfg.k == 2
    assert cfg.t == 0.5
    assert cfg.sensitive_attribute == "Disease"
def test_invalid_t_value_low():
    """A negative t (-0.1) is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": -0.1,  # invalid
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        TClosenessConfiguration(**invalid_kwargs)
def test_invalid_t_value_high():
    """A t above 1 (2.0) is rejected with a ValidationError."""
    invalid_kwargs = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": 2.0,  # invalid > 1
        "sensitive_attribute": "disease",
    }
    with pytest.raises(ValidationError):
        TClosenessConfiguration(**invalid_kwargs)

View File

@@ -1,44 +0,0 @@
from template_code_location.dataframe_level_anonymisation.jobs import (
k_anonymity_job,
l_diversity_job,
t_closeness_job,
k_anonymity_job_s3,
l_diversity_job_s3,
t_closeness_job_s3
)
def test_k_anonymity_job_is_callable():
    """k_anonymity_job is callable and exposes `execute_in_process`."""
    job = k_anonymity_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_l_diversity_job_is_callable():
    """l_diversity_job is callable and exposes `execute_in_process`."""
    job = l_diversity_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_t_closeness_job_is_callable():
    """t_closeness_job is callable and exposes `execute_in_process`."""
    job = t_closeness_job
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_k_anonymity_job_s3_is_callable():
    """k_anonymity_job_s3 is callable and exposes `execute_in_process`."""
    job = k_anonymity_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_l_diversity_job_s3_is_callable():
    """l_diversity_job_s3 is callable and exposes `execute_in_process`."""
    job = l_diversity_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
def test_t_closeness_job_s3_is_callable():
    """t_closeness_job_s3 is callable and exposes `execute_in_process`."""
    job = t_closeness_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')

View File

@@ -1,230 +0,0 @@
import pytest
import pandas as pd
from unittest.mock import patch
from dagster import DagsterInvalidInvocationError, build_op_context
from template_code_location.dataframe_level_anonymisation.ops import (
apply_k_anonymity,
apply_l_diversity,
apply_t_closeness,
)
from template_code_location.dataframe_level_anonymisation.config_models import (
KAnonymityConfiguration,
LDiversityConfiguration,
TClosenessConfiguration,
)
# ---------------------------
# Fixtures
# ---------------------------
@pytest.fixture
def fake_df():
    """Two-row frame with an `id` and an `age` column, used as input for the ops."""
    columns = {"id": [1, 2], "age": [30, 40]}
    return pd.DataFrame(columns)
@pytest.fixture
def k_config():
    """k-anonymity config: k=2, no suppression, `age` as quasi-identifier and sensitive attribute."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attributes": ["age"],
        "k": 2,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return KAnonymityConfiguration(**settings)
@pytest.fixture
def l_config():
    """l-diversity config: k=2, l=1, no suppression, `age` as quasi-identifier and sensitive attribute."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "l": 1,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return LDiversityConfiguration(**settings)
@pytest.fixture
def t_config():
    """t-closeness config: k=2, t=0.5, no suppression, `age` as quasi-identifier and sensitive attribute."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "t": 0.5,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return TClosenessConfiguration(**settings)
@pytest.fixture
def op_context():
    """Dagster op context built with default settings."""
    context = build_op_context()
    return context
# ---------------------------
# Helper for patching external functions
# ---------------------------
@pytest.fixture(autouse=True)
def patch_external_ops():
    """Replace the hierarchy lookup and the three anonymisation routines with stubs.

    Applied automatically to every test in this module. Each anonymisation stub
    returns a fixed two-row frame; the hierarchy lookup returns a single
    `simpl_age` hierarchy.
    """
    # NOTE(review): the patch targets live under `dataframe_level_anonymisation.ops`
    # while the ops are imported from `template_code_location.dataframe_level_anonymisation.ops`
    # — confirm both paths resolve to the same module object, otherwise the stubs have no effect.

    def _stub_frame():
        # A fresh frame per stub, mirroring the separately built frames of each patch.
        return pd.DataFrame({"id": [1, 2], "age": [30, 40]})

    stub_hierarchies = {"simpl_age": {0: [30, 40]}}

    with (
        patch(
            "dataframe_level_anonymisation.ops.get_all_hierarchies",
            return_value=stub_hierarchies,
        ),
        patch("dataframe_level_anonymisation.ops.k_anonymity", return_value=_stub_frame()),
        patch("dataframe_level_anonymisation.ops.l_diversity", return_value=_stub_frame()),
        patch("dataframe_level_anonymisation.ops.t_closeness", return_value=_stub_frame()),
    ):
        yield
# ---------------------------
# Tests for apply_k_anonymity
# ---------------------------
def test_apply_k_anonymity_outputs(op_context, k_config, fake_df):
    """apply_k_anonymity yields two outputs: a DataFrame followed by a metrics dict."""
    emitted = list(apply_k_anonymity(op_context, k_config, fake_df))
    assert len(emitted) == 2

    anonymised, metrics = (item.value for item in emitted)

    # Output types first, then the three metric keys.
    assert isinstance(anonymised, pd.DataFrame)
    assert isinstance(metrics, dict)
    for metric_name in ("k_anon", "l_div", "t_clos"):
        assert metric_name in metrics
# ---------------------------
# Tests for apply_l_diversity
# ---------------------------
def test_apply_l_diversity_outputs(op_context, l_config, fake_df):
    """apply_l_diversity yields two outputs: a DataFrame followed by a metrics dict."""
    emitted = list(apply_l_diversity(op_context, l_config, fake_df))
    assert len(emitted) == 2

    anonymised, metrics = (item.value for item in emitted)

    assert isinstance(anonymised, pd.DataFrame)
    assert isinstance(metrics, dict)
    for metric_name in ("k_anon", "l_div", "t_clos"):
        assert metric_name in metrics
def test_apply_l_diversity_empty_raises(op_context, l_config):
    """An empty frame returned by l_diversity makes the op raise DagsterInvalidInvocationError."""
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    empty_result = pd.DataFrame()
    with patch("dataframe_level_anonymisation.ops.l_diversity", return_value=empty_result):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_l_diversity(op_context, l_config, single_row))
# ---------------------------
# Tests for apply_t_closeness
# ---------------------------
def test_apply_t_closeness_outputs(op_context, t_config, fake_df):
    """apply_t_closeness yields two outputs: a DataFrame followed by a metrics dict."""
    emitted = list(apply_t_closeness(op_context, t_config, fake_df))
    assert len(emitted) == 2

    anonymised, metrics = (item.value for item in emitted)

    assert isinstance(anonymised, pd.DataFrame)
    assert isinstance(metrics, dict)
    for metric_name in ("k_anon", "l_div", "t_clos"):
        assert metric_name in metrics
def test_apply_t_closeness_empty_raises(op_context, t_config):
    """An empty frame returned by t_closeness makes the op raise DagsterInvalidInvocationError."""
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    empty_result = pd.DataFrame()
    with patch("dataframe_level_anonymisation.ops.t_closeness", return_value=empty_result):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, single_row))
# ---------------------------
# Additional tests for _validate_and_get_hierarchies
# ---------------------------
def test_validate_hierarchies_dataset_too_small(k_config):
    """A one-row frame combined with the k=2 config is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    one_row_frame = pd.DataFrame({"id": [1], "age": [30]})

    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, one_row_frame)
def test_validate_hierarchies_missing_hierarchy(k_config, fake_df):
    """A config whose generalisation_hierarchies mapping is empty is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    without_hierarchies = k_config.model_copy(update={"generalisation_hierarchies": {}})

    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(without_hierarchies, fake_df)
def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df):
    """A configured hierarchy name that the hierarchy lookup does not return is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    no_known_hierarchies = {}
    with patch(
        "dataframe_level_anonymisation.ops.get_all_hierarchies",
        return_value=no_known_hierarchies,
    ):
        with pytest.raises(DagsterInvalidInvocationError):
            _validate_and_get_hierarchies(k_config, fake_df)
# ---------------------------
# Additional tests for _calc_dataframe_metrics
# ---------------------------
def test_calc_dataframe_metrics_basic():
    """_calc_dataframe_metrics reports the values produced by the (patched) metric functions."""
    from template_code_location.dataframe_level_anonymisation.ops import _calc_dataframe_metrics

    original = pd.DataFrame({"age": [30, 40], "id": [1, 2]})
    anonymised = original.copy()

    # Pin the three metric functions so the expected numbers are known in advance.
    with (
        patch("dataframe_level_anonymisation.ops.anonymity.k_anonymity", return_value=2),
        patch("dataframe_level_anonymisation.ops.anonymity.l_diversity", return_value=1),
        patch("dataframe_level_anonymisation.ops.anonymity.t_closeness", return_value=0.1),
    ):
        report, metrics = _calc_dataframe_metrics(anonymised, original, ["age"], ["age"])

    assert "k-anonymity" in report
    expected_metrics = {"k_anon": 2, "l_div": 1, "t_clos": 0.1}
    for metric_name, expected_value in expected_metrics.items():
        assert metrics[metric_name] == expected_value
# ---------------------------
# Tests for apply_t_closeness exception branches
# ---------------------------
def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df):
    """A ValueError mentioning 'Cannot be quasi-identifiers' surfaces as DagsterInvalidInvocationError."""
    failure = ValueError("Cannot be quasi-identifiers invalid")
    with patch("dataframe_level_anonymisation.ops.t_closeness", side_effect=failure):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))
def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df):
    """A ValueError with any other message also surfaces as DagsterInvalidInvocationError."""
    failure = ValueError("Some other error")
    with patch("dataframe_level_anonymisation.ops.t_closeness", side_effect=failure):
        with pytest.raises(DagsterInvalidInvocationError):
            list(apply_t_closeness(op_context, t_config, fake_df))

View File

@@ -1,70 +0,0 @@
import numpy as np
from template_code_location.dataframe_level_anonymisation.utils import (
parse_value_list,
normalize_hierarchy_levels,
)
# ------------------------------------
# Tests for parse_value_list
# ------------------------------------
def test_parse_value_list_all_strings_digits():
    """Digit-only strings are converted to integers."""
    parsed = parse_value_list(["1", "2", "3"])
    assert parsed == [1, 2, 3]
def test_parse_value_list_mixed_values():
    """Digit strings become ints while other entries pass through unchanged."""
    parsed = parse_value_list(["1", 2, "abc", "5"])
    assert parsed == [1, 2, "abc", 5]
def test_parse_value_list_no_digits():
    """A list without digit strings is returned with equal content."""
    parsed = parse_value_list(["a", "b", "c"])
    assert parsed == ["a", "b", "c"]
# ------------------------------------
# Tests for normalize_hierarchy_levels
# ------------------------------------
def test_normalize_hierarchy_levels_level_0_converted_to_numpy_array():
    """Level keys become ints; level 0 becomes a numpy array of parsed values."""
    raw = {"age": {"0": ["1", "2", "3"], "1": ["0-10", "11-20"]}}

    result = normalize_hierarchy_levels(raw)

    assert "age" in result
    age_levels = result["age"]
    assert 0 in age_levels
    assert isinstance(age_levels[0], np.ndarray)
    # Level 0 holds the digit strings converted to ints.
    assert age_levels[0].tolist() == [1, 2, 3]
    # Higher levels are left as they were.
    assert age_levels[1] == ["0-10", "11-20"]
def test_normalize_hierarchy_levels_multiple_columns():
    """Each column of the hierarchy mapping is normalised independently."""
    raw = {"age": {"0": ["10", "20"]}, "gender": {"0": ["M", "F"], "1": ["*"]}}

    result = normalize_hierarchy_levels(raw)

    # Numeric column: level 0 parsed to ints.
    age_base = result["age"][0]
    assert isinstance(age_base, np.ndarray)
    assert age_base.tolist() == [10, 20]

    # Textual column: level 0 kept as strings, level 1 left as a plain list.
    gender_base = result["gender"][0]
    assert isinstance(gender_base, np.ndarray)
    assert gender_base.tolist() == ["M", "F"]
    assert result["gender"][1] == ["*"]
def test_normalize_hierarchy_levels_mixed_digit_non_digit_at_level_0():
    """A level 0 mixing digit and non-digit strings ends up as an array of strings."""
    raw = {"test": {"0": ["1", "x", "3"]}}

    base_level = normalize_hierarchy_levels(raw)["test"][0]

    assert isinstance(base_level, np.ndarray)
    assert base_level.tolist() == ["1", "x", "3"]
def test_normalize_hierarchy_levels_empty_mapping():
    """A column with no levels stays an empty mapping."""
    raw = {"col": {}}
    assert normalize_hierarchy_levels(raw) == {"col": {}}

View File

@@ -1,444 +0,0 @@
"""
Shared pytest fixtures and helpers for field-level pseudonymisation tests.
This module provides:
- Mock Vault client for testing without real Vault connections
- Sample data fixtures
- Configuration fixtures for encryption/decryption operations
- Helper functions for running ops and managing test Vault storage
"""
import pandas as pd
import pytest
from dagster import build_op_context
from cryptography.fernet import Fernet
from hvac.exceptions import InvalidPath, Forbidden
from unittest.mock import patch, MagicMock
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
AnonymisePseudonymizeStructuredConfig,
DepseudonymizeStructuredConfig,
EncryptConfig,
DecryptConfig,
PseudoTechniqueConfig,
DepseudoTechniqueConfig,
)
from template_code_location.field_level_pseudo_anonymisation.ops import (
anonymize_pseudonymize_structured,
depseudonymize_structured,
)
# -------------------------------- Mock Vault Storage ----------------------------------------
# In-memory Vault simulation for tests: maps "<mount_point>/<path>" to the stored secret value.
_test_vault_storage = {}
# Per-path access flags for simulating access control; a False entry makes mocked reads
# raise Forbidden, paths without an entry are readable.
_test_vault_access_control = {}
@pytest.fixture(autouse=True)
def mock_vault_client():
    """
    Auto-use fixture that replaces hvac.Client with a MagicMock.

    The KV v2 read, create/update and delete calls of the mock are backed by
    the module-level in-memory dicts, which are reset before every test.
    Reads honour the simulated access-control flags (AC3).

    Yields:
        The MagicMock instance that `hvac.Client(...)` returns while patched.
    """
    global _test_vault_storage, _test_vault_access_control
    # Start every test from an empty Vault and no access restrictions.
    _test_vault_storage = {}
    _test_vault_access_control = {}

    def _fake_read(path, mount_point):
        """Return the stored secret, or raise Forbidden / InvalidPath."""
        secret_path = f"{mount_point}/{path}"
        # Access control wins over existence: only an explicit falsy flag denies.
        if not _test_vault_access_control.get(secret_path, True):
            raise Forbidden(f"Access denied to secret: {secret_path}")
        if secret_path not in _test_vault_storage:
            raise InvalidPath(f"Secret not found: {secret_path}")
        return {"data": {"data": {"value": _test_vault_storage[secret_path]}}}

    def _fake_write(path, mount_point, secret):
        """Store the secret's "value" entry under the mount/path key."""
        secret_path = f"{mount_point}/{path}"
        _test_vault_storage[secret_path] = secret["value"]

    def _fake_delete(path, mount_point):
        """Forget the secret and any access flag recorded for it."""
        secret_path = f"{mount_point}/{path}"
        _test_vault_storage.pop(secret_path, None)
        _test_vault_access_control.pop(secret_path, None)

    with patch("hvac.Client") as client_cls:
        client = MagicMock()
        kv_v2 = client.secrets.kv.v2
        kv_v2.read_secret_version.side_effect = _fake_read
        kv_v2.create_or_update_secret.side_effect = _fake_write
        kv_v2.delete_metadata_and_all_versions.side_effect = _fake_delete
        client_cls.return_value = client
        yield client
# -------------------------------- Sample Data Fixtures ----------------------------------------
@pytest.fixture
def sample_df():
    """
    Five-row structured dataset containing PII-like columns.

    Columns: id, name, email, ssn, age, salary, department.
    """
    names = ["Alice Smith", "Bob Jones", "Charlie Brown", "David Wilson", "Eva Garcia"]
    emails = [
        "alice@example.com",
        "bob@example.com",
        "charlie@example.com",
        "david@example.com",
        "eva@example.com",
    ]
    ssns = ["123-45-6789", "234-56-7890", "345-67-8901", "456-78-9012", "567-89-0123"]

    columns = {
        "id": [1, 2, 3, 4, 5],
        "name": names,
        "email": emails,
        "ssn": ssns,
        "age": [25, 30, 35, 40, 45],
        "salary": [50000.0, 60000.0, 70000.0, 80000.0, 90000.0],
        "department": ["HR", "IT", "Finance", "IT", "HR"],
    }
    return pd.DataFrame(columns)
# -------------------------------- Configuration Fixtures ----------------------------------------
@pytest.fixture
def encrypt_config_single_field():
    """
    Pseudonymisation config that encrypts only the `email` column
    with the key `test_restoration_key_single`.
    """
    encrypt_email = EncryptConfig(
        type="encrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    step = PseudoTechniqueConfig(technique=encrypt_email)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def decrypt_config_single_field():
    """
    De-pseudonymisation config that decrypts only the `email` column
    with the key `test_restoration_key_single`.
    """
    decrypt_email = DecryptConfig(
        type="decrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    step = DepseudoTechniqueConfig(technique=decrypt_email)
    return DepseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def encrypt_config_multiple_fields():
    """
    Pseudonymisation config that encrypts `name`, `email` and `ssn`
    with the key `test_restoration_key_multi`.
    """
    encrypt_fields = EncryptConfig(
        type="encrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    step = PseudoTechniqueConfig(technique=encrypt_fields)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def decrypt_config_multiple_fields():
    """
    De-pseudonymisation config that decrypts `name`, `email` and `ssn`
    with the key `test_restoration_key_multi`.
    """
    decrypt_fields = DecryptConfig(
        type="decrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    step = DepseudoTechniqueConfig(technique=decrypt_fields)
    return DepseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def encrypt_config_partial_fields():
    """
    Pseudonymisation config that encrypts only `email` and `ssn`
    with the key `test_restoration_key_partial`.
    """
    encrypt_fields = EncryptConfig(
        type="encrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    step = PseudoTechniqueConfig(technique=encrypt_fields)
    return AnonymisePseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def decrypt_config_partial_fields():
    """
    De-pseudonymisation config that decrypts only `email` and `ssn`
    with the key `test_restoration_key_partial`.
    """
    decrypt_fields = DecryptConfig(
        type="decrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    step = DepseudoTechniqueConfig(technique=decrypt_fields)
    return DepseudonymizeStructuredConfig(used_function=[step])
@pytest.fixture
def authorized_multi_key_scenario():
    """
    Prepare two Vault keys for authorization tests: one readable, one denied.

    Yields:
        dict: {"authorized": <readable key name>, "unauthorized": <denied key name>}
    """
    readable_name = "authorized_key"
    denied_name = "unauthorized_key"

    # Make sure nothing is left over under either name.
    for key_name in (readable_name, denied_name):
        clear_vault_key(key_name)

    # Key that stays readable.
    set_vault_key(readable_name, Fernet.generate_key().decode())

    # Key that exists but whose reads are denied.
    set_vault_key(denied_name, Fernet.generate_key().decode())
    deny_vault_access(denied_name)

    yield {"authorized": readable_name, "unauthorized": denied_name}

    # Cleanup
    for key_name in (readable_name, denied_name):
        clear_vault_key(key_name)
@pytest.fixture
def large_dataset():
    """
    10,000-row dataset for performance tests, with ids running from 1 to 10000.

    All other columns are derived from the row id.
    """
    row_ids = range(1, 10001)
    departments = ["HR", "IT", "Finance", "Sales"]

    columns = {
        "id": row_ids,
        "email": [f"user{i}@example.com" for i in row_ids],
        "name": [f"User {i}" for i in row_ids],
        # Widths are minimums only; ids of 100 and above produce longer middle groups.
        "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in row_ids],
        "age": [20 + (i % 50) for i in row_ids],
        "salary": [30000.0 + (i * 10) for i in row_ids],
        "department": [departments[i % 4] for i in row_ids],
    }
    return pd.DataFrame(columns)
@pytest.fixture(scope="session")
def vault_test_keys():
    """
    Session-scoped mapping of ten pre-generated Fernet keys.

    Keys are named `test_key_0` .. `test_key_9`; values are the decoded key strings.
    Generated once per session so individual tests do not repeat the work.
    """
    key_names = [f"test_key_{i}" for i in range(10)]
    return {name: Fernet.generate_key().decode() for name in key_names}
@pytest.fixture
def cleanup_test_keys(request):
    """
    Remove every mock-Vault entry whose path contains "test_" once the test has finished.

    Use with: @pytest.mark.usefixtures("cleanup_test_keys")
    """
    yield
    # Snapshot the matching paths first so the dict is not mutated while iterating.
    stale_paths = [stored_path for stored_path in _test_vault_storage if "test_" in stored_path]
    for stored_path in stale_paths:
        _test_vault_storage.pop(stored_path, None)
# -------------------------------- Helper Functions ----------------------------------------
def config_to_dagster_dict(config):
    """
    Convert a Pydantic config to a Dagster-compatible dictionary.

    For AnonymisePseudonymizeStructuredConfig (uses discriminated Union):
        Pydantic v2 outputs: {'technique': {'type': 'encrypt', 'columns': [...], 'key_name': '...'}}
        Dagster expects:     {'technique': {'encrypt': {'columns': [...], 'key_name': '...'}}}

    For DepseudonymizeStructuredConfig (direct DecryptConfig, no Union):
        Pydantic v2 outputs: {'technique': {'type': 'decrypt', 'columns': [...], 'key_name': '...'}}
        Dagster expects:     the same flat structure with the 'type' field

    Args:
        config: Pydantic config instance
            (AnonymisePseudonymizeStructuredConfig or DepseudonymizeStructuredConfig)

    Returns:
        dict: Dagster-compatible configuration dictionary
    """
    config_dict = config.model_dump()

    # Only AnonymisePseudonymizeStructuredConfig needs its techniques re-nested;
    # every other config is returned exactly as dumped.
    # (The class is imported at module level, so no local import is needed here.)
    if not isinstance(config, AnonymisePseudonymizeStructuredConfig):
        return config_dict

    for func_config in config_dict.get("used_function", ()):
        technique = func_config.get("technique")
        # Skip entries that have no technique or are not in the flat 'type' form.
        if not (isinstance(technique, dict) and "type" in technique):
            continue
        # Move the discriminator from a field to the nesting key.
        technique_type = technique["type"]
        technique_data = {k: v for k, v in technique.items() if k != "type"}
        func_config["technique"] = {technique_type: technique_data}

    return config_dict
def run_encrypt_op(config, df):
    """
    Execute the anonymize_pseudonymize_structured op with the given config.

    Args:
        config: AnonymisePseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the `.value` of the op's two outputs
    """
    op_config = config_to_dagster_dict(config)
    op_context = build_op_context(op_config=op_config)
    frame_output, metrics_output = anonymize_pseudonymize_structured(op_context, df=df)
    return frame_output.value, metrics_output.value
def run_decrypt_op(config, df):
    """
    Execute the depseudonymize_structured op with the given config.

    Args:
        config: DepseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the `.value` of the op's two outputs
    """
    op_config = config_to_dagster_dict(config)
    op_context = build_op_context(op_config=op_config)
    frame_output, metrics_output = depseudonymize_structured(op_context, df=df)
    return frame_output.value, metrics_output.value
def clear_vault_key(key_name: str):
    """
    Remove a key, and any access flag recorded for it, from the simulated Vault.

    Missing keys are ignored.

    Args:
        key_name: Name of the key to delete from Vault
    """
    secret_path = f"secret/PseudonymKeys/{key_name}"
    for registry in (_test_vault_storage, _test_vault_access_control):
        registry.pop(secret_path, None)
def set_vault_key(key_name: str, key_value: str):
    """
    Store a key in the simulated Vault, overwriting any existing value.

    Args:
        key_name: Name of the key
        key_value: Value of the key (Fernet key as string)
    """
    _test_vault_storage[f"secret/PseudonymKeys/{key_name}"] = key_value
def deny_vault_access(key_name: str):
    """
    Flag a key as inaccessible for authorization testing (AC3).

    Args:
        key_name: Name of the key to deny access to
    """
    _test_vault_access_control[f"secret/PseudonymKeys/{key_name}"] = False
def get_vault_key(key_name: str) -> bytes:
    """
    Read a key straight from the simulated Vault storage.

    Args:
        key_name: Name of the key to retrieve

    Returns:
        bytes: The stored key value, encoded to bytes

    Raises:
        InvalidPath: If no key is stored under that name
    """
    secret_path = f"secret/PseudonymKeys/{key_name}"
    if secret_path in _test_vault_storage:
        return _test_vault_storage[secret_path].encode()
    raise InvalidPath(f"Key not found: {key_name}")

View File

@@ -1,633 +0,0 @@
import pytest
from pydantic import ValidationError
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
AnonymisePseudonymizeStructuredConfig,
DepseudonymizeStructuredConfig,
PseudoTechniqueConfig,
DepseudoTechniqueConfig,
HashConfig,
EncryptConfig,
RedactConfig,
ReplaceConfig,
DecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
AnonymisePseudonymizeUnstructuredConfig,
DepseudonymizeUnstructuredConfig,
PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig,
DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig,
HashConfig as UnstructuredHashConfig,
EncryptConfig as UnstructuredEncryptConfig,
RedactConfig as UnstructuredRedactConfig,
ReplaceConfig as UnstructuredReplaceConfig,
RetainConfig,
DecryptConfig as UnstructuredDecryptConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum
from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum
# ==================== Structured Config Tests ====================
class TestStructuredConfigValidators:
    """Tests for the validators and helper methods of structured_config.py."""

    def test_ensure_unique_columns_valid_single_technique(self):
        """A single technique on a single column passes validation."""
        encrypt_email = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1")
        )
        cfg = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_email])
        assert cfg is not None
        assert len(cfg.used_function) == 1

    def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self):
        """Several techniques pass validation when their columns do not overlap."""
        techniques = [
            PseudoTechniqueConfig(
                technique=EncryptConfig(columns=["email"], key_name="key1")
            ),
            PseudoTechniqueConfig(
                technique=HashConfig(columns=["ssn"], algorithm="sha256")
            ),
        ]
        cfg = AnonymisePseudonymizeStructuredConfig(used_function=techniques)
        assert cfg is not None
        assert len(cfg.used_function) == 2

    def test_ensure_unique_columns_duplicate_columns_same_technique(self):
        """The same column listed under two techniques raises an error."""
        # Everything is built inside the raises-block, as any ValueError counts.
        with pytest.raises(ValueError) as excinfo:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(columns=["email"], key_name="key1")
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(columns=["email"], algorithm="sha256")
                    ),
                ]
            )
        message = str(excinfo.value)
        assert "Duplicate column" in message
        assert "email" in message

    def test_ensure_unique_columns_multiple_duplicates(self):
        """The error message names every duplicated column."""
        with pytest.raises(ValueError) as excinfo:
            AnonymisePseudonymizeStructuredConfig(
                used_function=[
                    PseudoTechniqueConfig(
                        technique=EncryptConfig(
                            columns=["email", "phone"], key_name="key1"
                        )
                    ),
                    PseudoTechniqueConfig(
                        technique=HashConfig(
                            columns=["email", "phone"], algorithm="sha256"
                        )
                    ),
                ]
            )
        error_msg = str(excinfo.value)
        for expected_fragment in ("Duplicate column", "email", "phone"):
            assert expected_fragment in error_msg

    def test_collect_column_to_techniques_single_technique(self):
        """_collect_column_to_techniques maps each column to its one technique."""
        encrypt_two_columns = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email", "phone"], key_name="key1")
        )
        cfg = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_two_columns])
        expected = {"email": ["encrypt"], "phone": ["encrypt"]}
        assert cfg._collect_column_to_techniques() == expected

    def test_extract_technique_and_columns_dict_with_type_field(self):
        """_extract_technique_and_columns reads a dict that carries a 'type' field."""
        cfg = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "columns": ["email", "ssn"],
                "key_name": "test_key",
            }
        }
        technique_type, columns = cfg._extract_technique_and_columns(raw_entry)
        assert technique_type == "encrypt"
        assert columns == ["email", "ssn"]

    def test_extract_technique_and_columns_dict_with_variant_mapping(self):
        """_extract_technique_and_columns reads a variant-key mapping {'encrypt': {...}}."""
        cfg = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {
            "technique": {
                "encrypt": {"columns": ["ssn"], "key_name": "test_key"}
            }
        }
        technique_type, columns = cfg._extract_technique_and_columns(raw_entry)
        assert technique_type == "encrypt"
        assert columns == ["ssn"]

    def test_extract_technique_and_columns_model_instance(self):
        """_extract_technique_and_columns reads a PseudoTechniqueConfig instance."""
        redact_address = PseudoTechniqueConfig(
            technique=RedactConfig(columns=["address"])
        )
        cfg = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = cfg._extract_technique_and_columns(redact_address)
        assert technique_type == "redact"
        assert columns == ["address"]

    def test_extract_technique_and_columns_empty_dict(self):
        """An empty technique dict yields no type and no columns."""
        cfg = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = cfg._extract_technique_and_columns({"technique": {}})
        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_none_technique(self):
        """A None technique yields no type and no columns."""
        cfg = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = cfg._extract_technique_and_columns({"technique": None})
        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_missing_columns_key(self):
        """A dict without a 'columns' key yields the type and an empty column list."""
        cfg = AnonymisePseudonymizeStructuredConfig()
        raw_entry = {"technique": {"type": "encrypt", "key_name": "test_key"}}
        technique_type, columns = cfg._extract_technique_and_columns(raw_entry)
        assert technique_type == "encrypt"
        assert columns == []

    def test_extract_technique_and_columns_model_without_columns_attr(self):
        """_extract_technique_and_columns reads a ReplaceConfig model instance."""
        replace_value = PseudoTechniqueConfig(
            technique=ReplaceConfig(columns=["old_value"], new_value="NEW")
        )
        cfg = AnonymisePseudonymizeStructuredConfig()
        technique_type, columns = cfg._extract_technique_and_columns(replace_value)
        assert technique_type == "replace"
        assert columns == ["old_value"]
class TestStructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeStructuredConfig."""

    def test_depseudonymize_config_normalize_used_function_with_dict(self):
        """A dict entry in used_function is normalised to DepseudoTechniqueConfig."""
        raw_entry = {
            "technique": {
                "type": "decrypt",
                "columns": ["email"],
                "key_name": "key1",
            }
        }
        cfg = DepseudonymizeStructuredConfig(used_function=[raw_entry])
        assert len(cfg.used_function) == 1
        normalised = cfg.used_function[0]
        assert isinstance(normalised, DepseudoTechniqueConfig)
        assert normalised.technique.type == "decrypt"

    def test_depseudonymize_config_normalize_used_function_with_model(self):
        """A model instance in used_function is kept as the very same object."""
        decrypt_email = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1")
        )
        cfg = DepseudonymizeStructuredConfig(used_function=[decrypt_email])
        assert len(cfg.used_function) == 1
        assert cfg.used_function[0] is decrypt_email

    def test_depseudonymize_config_ensure_unique_columns_no_op(self):
        """Two decrypt techniques may target the same column without an error."""
        # Depseudonymize has no per-column uniqueness constraint.
        decrypt_steps = [
            DepseudoTechniqueConfig(
                technique=DecryptConfig(columns=["email"], key_name=key_name)
            )
            for key_name in ("key1", "key2")
        ]
        cfg = DepseudonymizeStructuredConfig(used_function=decrypt_steps)
        # Reaching this line means the validator did not raise.
        assert cfg is not None
# ==================== Unstructured Config Tests ====================
class TestUnstructuredConfigValidators:
    """Tests for the validators and helper methods of unstructured_config.py."""

    def test_normalize_used_function_with_dict(self):
        """A variant-key dict entry is accepted in used_function."""
        raw_entry = {
            "technique": {
                "encrypt": {
                    "pii": [PIIEntityEnum.EMAIL.value],
                    "key_name": "key1",
                }
            }
        }
        cfg = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en, used_function=[raw_entry]
        )
        assert len(cfg.used_function) == 1

    def test_normalize_used_function_with_model(self):
        """A model instance is accepted in used_function."""
        encrypt_email = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value], key_name="key1"
            )
        )
        cfg = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en, used_function=[encrypt_email]
        )
        assert len(cfg.used_function) == 1

    def test_ensure_unique_pii_valid_different_pii_types(self):
        """Techniques that target different PII types pass validation."""
        techniques = [
            UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredEncryptConfig(
                    pii=[PIIEntityEnum.EMAIL.value], key_name="key1"
                )
            ),
            UnstructuredPseudoTechniqueConfig(
                technique=UnstructuredHashConfig(
                    pii=[PIIEntityEnum.PERSON.value], algorithm="sha256"
                )
            ),
        ]
        cfg = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en, used_function=techniques
        )
        assert cfg is not None
        assert len(cfg.used_function) == 2

    def test_ensure_unique_pii_duplicate_pii_types(self):
        """The same PII type listed under two techniques raises an error."""
        with pytest.raises(ValueError) as excinfo:
            AnonymisePseudonymizeUnstructuredConfig(
                language=LanguageEnum.en,
                used_function=[
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredEncryptConfig(
                            pii=[PIIEntityEnum.EMAIL.value], key_name="key1"
                        )
                    ),
                    UnstructuredPseudoTechniqueConfig(
                        technique=UnstructuredHashConfig(
                            pii=[PIIEntityEnum.EMAIL.value], algorithm="sha256"
                        )
                    ),
                ],
            )
        message = str(excinfo.value)
        assert "Duplicate PII" in message
        # Error message shows PIIEntityEnum.EMAIL (the enum repr) rather than the value
        assert "EMAIL" in message

    def test_collect_pii_to_techniques_single_technique(self):
        """_collect_pii_to_techniques maps each PII type to its one technique."""
        encrypt_two_types = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.EMAIL.value, PIIEntityEnum.PERSON.value],
                key_name="key1",
            )
        )
        cfg = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en, used_function=[encrypt_two_types]
        )
        expected = {
            PIIEntityEnum.EMAIL.value: ["encrypt"],
            PIIEntityEnum.PERSON.value: ["encrypt"],
        }
        assert cfg._collect_pii_to_techniques() == expected

    def test_extract_technique_and_pii_dict_with_type_field(self):
        """_extract_technique_and_pii reads a dict that carries a 'type' field."""
        cfg = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {
            "technique": {
                "type": "encrypt",
                "pii": [PIIEntityEnum.EMAIL.value],
                "key_name": "test_key",
            }
        }
        technique_type, piis = cfg._extract_technique_and_pii(raw_entry)
        assert technique_type == "encrypt"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_dict_with_variant_mapping(self):
        """_extract_technique_and_pii reads a variant-key mapping {'hash': {...}}."""
        cfg = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {
            "technique": {
                "hash": {
                    "pii": [PIIEntityEnum.PERSON.value],
                    "algorithm": "sha256",
                }
            }
        }
        technique_type, piis = cfg._extract_technique_and_pii(raw_entry)
        assert technique_type == "hash"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_dict_fallback_to_columns(self):
        """The 'columns' key is used when the dict has no 'pii' key."""
        cfg = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {"technique": {"type": "redact", "columns": ["fallback_col"]}}
        technique_type, piis = cfg._extract_technique_and_pii(raw_entry)
        assert technique_type == "redact"
        assert piis == ["fallback_col"]

    def test_extract_technique_and_pii_model_instance(self):
        """_extract_technique_and_pii reads a redact model instance."""
        redact_email = UnstructuredPseudoTechniqueConfig(
            technique=UnstructuredRedactConfig(pii=[PIIEntityEnum.EMAIL.value])
        )
        cfg = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = cfg._extract_technique_and_pii(redact_email)
        assert technique_type == "redact"
        assert piis == [PIIEntityEnum.EMAIL.value]

    def test_extract_technique_and_pii_model_with_getattr_fallback(self):
        """_extract_technique_and_pii reads a RetainConfig model instance."""
        retain_person = UnstructuredPseudoTechniqueConfig(
            technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value])
        )
        cfg = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = cfg._extract_technique_and_pii(retain_person)
        assert technique_type == "retain"
        assert piis == [PIIEntityEnum.PERSON.value]

    def test_extract_technique_and_pii_empty_dict(self):
        """An empty technique dict yields no type and no PII entries."""
        cfg = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        technique_type, piis = cfg._extract_technique_and_pii({"technique": {}})
        assert technique_type is None
        assert piis == []

    def test_extract_technique_and_pii_missing_pii_key(self):
        """A dict without a 'pii' key yields the type and an empty PII list."""
        cfg = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
        raw_entry = {"technique": {"type": "encrypt", "key_name": "test_key"}}
        technique_type, piis = cfg._extract_technique_and_pii(raw_entry)
        assert technique_type == "encrypt"
        assert piis == []
class TestUnstructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeUnstructuredConfig."""

    def test_depseudonymize_unstructured_config_default(self):
        """The default config is constructible and has at least one technique."""
        cfg = DepseudonymizeUnstructuredConfig()
        assert cfg is not None
        assert len(cfg.used_function) >= 1

    def test_depseudonymize_unstructured_config_with_custom_function(self):
        """An explicitly supplied decrypt technique is kept with its key name."""
        custom_decrypt = UnstructuredDepseudoTechniqueConfig(
            technique=UnstructuredDecryptConfig(key_name="custom_key")
        )
        cfg = DepseudonymizeUnstructuredConfig(used_function=[custom_decrypt])
        assert len(cfg.used_function) == 1
        assert cfg.used_function[0].technique.key_name == "custom_key"
class TestLanguageSupport:
    """Tests for the language setting of the unstructured config."""

    def test_all_supported_languages(self):
        """Each listed language can be set and is read back unchanged."""
        language_codes = (
            "hr", "da", "nl", "en",
            "fi", "fr", "de", "el",
            "it", "lt", "pl", "pt",
            "ro", "sl", "es", "sv",
        )
        # Resolve every code to its LanguageEnum member before building configs.
        supported_languages = [getattr(LanguageEnum, code) for code in language_codes]
        for language in supported_languages:
            cfg = AnonymisePseudonymizeUnstructuredConfig(language=language)
            assert cfg.language == language

    def test_default_language_is_english(self):
        """A config built without a language uses English."""
        cfg = AnonymisePseudonymizeUnstructuredConfig()
        assert cfg.language == LanguageEnum.en
class TestTechniqueConfigDefaults:
    """Tests for the default field values of each technique config."""

    def test_hash_config_default_algorithm(self):
        """HashConfig defaults to sha256 and type 'hash'."""
        hash_cfg = HashConfig()
        assert hash_cfg.algorithm == "sha256"
        assert hash_cfg.type == "hash"

    def test_encrypt_config_defaults(self):
        """EncryptConfig defaults to type 'encrypt' and key name 'my_key'."""
        encrypt_cfg = EncryptConfig()
        assert encrypt_cfg.type == "encrypt"
        assert encrypt_cfg.key_name == "my_key"

    def test_redact_config_defaults(self):
        """RedactConfig defaults to type 'redact'."""
        assert RedactConfig().type == "redact"

    def test_replace_config_defaults(self):
        """ReplaceConfig defaults to type 'replace' and new value 'REPLACED'."""
        replace_cfg = ReplaceConfig()
        assert replace_cfg.type == "replace"
        assert replace_cfg.new_value == "REPLACED"

    def test_decrypt_config_defaults(self):
        """DecryptConfig defaults to type 'decrypt' and key name 'my_key'."""
        decrypt_cfg = DecryptConfig()
        assert decrypt_cfg.type == "decrypt"
        assert decrypt_cfg.key_name == "my_key"

    def test_unstructured_retain_config_defaults(self):
        """RetainConfig defaults to type 'retain'."""
        assert RetainConfig().type == "retain"
class TestPseudoTechniqueConfigDefaults:
    """Tests for the default technique of the PseudoTechniqueConfig models."""

    @staticmethod
    def _assert_is_hash(technique):
        """Assert that ``technique`` denotes hashing, as a model or as a dict."""
        if not isinstance(technique, dict):
            assert technique.type == "hash"
            return
        # For Dagster Config, technique may be a dict with the discriminator
        # structure: either a variant key or an explicit "type" entry.
        assert "hash" in technique or technique.get("type") == "hash"

    def test_pseudo_technique_default_to_hash(self):
        """The structured PseudoTechniqueConfig defaults to the hash technique."""
        self._assert_is_hash(PseudoTechniqueConfig().technique)

    def test_unstructured_pseudo_technique_default_to_hash(self):
        """The unstructured PseudoTechniqueConfig defaults to the hash technique."""
        self._assert_is_hash(UnstructuredPseudoTechniqueConfig().technique)
class TestConfigModelIntegration:
    """Integration tests combining several technique configs in one model."""

    def test_structured_config_with_all_technique_types(self):
        """A structured config accepts hash, encrypt, redact and replace together."""
        technique_models = [
            HashConfig(columns=["col1"]),
            EncryptConfig(columns=["col2"], key_name="k1"),
            RedactConfig(columns=["col3"]),
            ReplaceConfig(columns=["col4"], new_value="X"),
        ]
        cfg = AnonymisePseudonymizeStructuredConfig(
            used_function=[
                PseudoTechniqueConfig(technique=model) for model in technique_models
            ]
        )
        assert len(cfg.used_function) == 4
        seen_types = {entry.technique.type for entry in cfg.used_function}
        assert seen_types == {"hash", "encrypt", "redact", "replace"}

    def test_unstructured_config_with_all_technique_types(self):
        """An unstructured config accepts all five technique types together."""
        technique_models = [
            UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value]),
            UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.PERSON.value], key_name="k1"
            ),
            UnstructuredRedactConfig(pii=[PIIEntityEnum.PHONE_NUMBERS.value]),
            UnstructuredReplaceConfig(
                pii=[PIIEntityEnum.CREDIT_CARD.value], new_value="X"
            ),
            RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value]),
        ]
        cfg = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                UnstructuredPseudoTechniqueConfig(technique=model)
                for model in technique_models
            ],
        )
        assert len(cfg.used_function) == 5
        seen_types = {entry.technique.type for entry in cfg.used_function}
        assert seen_types == {"hash", "encrypt", "redact", "replace", "retain"}

View File

@@ -1,288 +0,0 @@
"""
Test suite for data restoration (depseudonymisation) of unstructured text.
## Test Coverage Summary
### Acceptance Criteria Coverage:
- AC1 (Data Restoration with Valid Key): 2 tests
- AC2 (Restoration Denial - Missing Key): 1 test
- AC3 (Restoration Denial - Unauthorized Access): 1 test
- AC4 (Restoration Denial - Invalid Key): 1 test
- Additional Coverage: 2 tests (edge cases)
### Test Pattern:
- Each test uses build_op_context with .model_dump() for configuration
- Tests validate dual outputs (data, metrics)
- Tests verify complete restoration of original text
- Tests validate security controls and error handling
- Tests use descriptive names mapping to AC scenarios
"""
import pytest
from unittest.mock import patch
from cryptography.fernet import Fernet
from dagster import build_op_context
from src.field_level_pseudo_anonymisation.unstructured_ops import (
depseudonymize_unstructured,
)
from src.field_level_pseudo_anonymisation.config_models.unstructured_config import (
DepseudonymizeUnstructuredConfig,
DecryptConfig,
DepseudoTechniqueConfig,
)
@pytest.fixture
def fernet_key() -> bytes:
    """Provide a freshly generated, valid Fernet key for each test."""
    generated_key = Fernet.generate_key()
    return generated_key
@pytest.fixture
def encrypted_text_data(fernet_key: bytes) -> dict:
    """
    Build a matching pair of plain and encrypted text for decryption tests.

    The name and email in the sentence are encrypted with ``fernet_key`` and
    embedded as ``{encrypt:<token>}`` markers.

    Returns a dict with:
    - original_text: The unencrypted text
    - encrypted_text: Text with PII values encrypted in {encrypt:...} format
    """
    cipher = Fernet(fernet_key)
    encrypted_name = cipher.encrypt(b"John Doe").decode()
    encrypted_email = cipher.encrypt(b"john.doe@example.com").decode()

    payload = {}
    payload["original_text"] = "My name is John Doe and my email is john.doe@example.com."
    payload["encrypted_text"] = (
        f"My name is {{encrypt:{encrypted_name}}} and my email is {{encrypt:{encrypted_email}}}."
    )
    return payload
# ---------------------- AC1: Data Restoration with Valid Key --------------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_encrypted_pii_entities_with_valid_key(
    mock_create_get_key, fernet_key: bytes, encrypted_text_data: dict
):
    """AC1: Encrypted PII entities are restored when the patched key lookup returns the valid key."""
    # Arrange - the patched key lookup hands back the key used for encryption
    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_config.model_dump())

    # Act - the op yields the data output first, then the metrics output
    outputs = depseudonymize_unstructured(
        context, input_text=encrypted_text_data["encrypted_text"]
    )
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    # 1. All original values are restored exactly
    restored_text = data_output.value
    assert (
        restored_text == encrypted_text_data["original_text"]
    ), "Original text should be fully restored"
    # 2. Correct output structure
    assert data_output.output_name == "data", "Output should be named 'data'"
    # 3. Metrics show correct number of restored entities
    restored_count = metrics_output.value["total_depseudo_count"]
    assert (
        restored_count == 2
    ), "Should restore 2 encrypted entities (name and email)"
    # 4. The key lookup was invoked exactly once with technique and key name
    mock_create_get_key.assert_called_once_with("decrypt", "test_key")
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_multiple_pii_types_with_valid_key(mock_create_get_key, fernet_key: bytes):
    """AC1: Name, email and phone markers are all restored with a valid key."""
    # Arrange - encrypt three different PII values with the same key
    cipher = Fernet(fernet_key)
    encrypted_name = cipher.encrypt(b"John Doe").decode()
    encrypted_email = cipher.encrypt(b"john.doe@example.com").decode()
    encrypted_phone = cipher.encrypt(b"555-1234").decode()
    encrypted_text = (
        f"Contact {{encrypt:{encrypted_name}}} at "
        f"{{encrypt:{encrypted_email}}} or call {{encrypt:{encrypted_phone}}}."
    )
    original_text = "Contact John Doe at john.doe@example.com or call 555-1234."

    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="multi_pii_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_config.model_dump())

    # Act
    outputs = depseudonymize_unstructured(context, input_text=encrypted_text)
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    assert data_output.value == original_text, "All PII types should be restored"
    restored_count = metrics_output.value["total_depseudo_count"]
    assert (
        restored_count == 3
    ), "Should restore 3 encrypted entities (name, email, phone)"
    mock_create_get_key.assert_called_once_with("decrypt", "multi_pii_key")
# ------------------- AC2: Restoration Denial when Key is Missing ----------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac2_restoration_denial_when_key_missing(mock_create_get_key, encrypted_text_data: dict):
    """AC2: Restoration fails when the key lookup reports the key as missing."""
    # Arrange - the patched key lookup raises as if the key were absent
    missing_key_error = ValueError(
        "Fernet key 'non_existent_key' not found in Vault for decrypt."
    )
    mock_create_get_key.side_effect = missing_key_error
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="non_existent_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_config.model_dump())

    # Act & Assert - draining the op's generator must surface the error.
    # NOTE: `match` is applied as a regular expression, so the trailing "."
    # in the pattern matches any character.
    with pytest.raises(
        ValueError,
        match="Fernet key 'non_existent_key' not found in Vault for decrypt.",
    ) as excinfo:
        list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"]))

    # The error message states that the key is missing from Vault
    raised_message = str(excinfo.value)
    assert (
        "not found in Vault" in raised_message
    ), "Error message should indicate key is missing from Vault"
    # The key lookup was attempted exactly once
    mock_create_get_key.assert_called_once_with("decrypt", "non_existent_key")
# ------------- AC3: Restoration Denial when Access is Unauthorized --------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac3_restoration_denial_when_unauthorized_access(
    mock_create_get_key, encrypted_text_data: dict
):
    """AC3: Restoration fails when the key lookup reports access as denied."""
    # Arrange - the patched key lookup raises an access-denied error
    access_denied_error = ValueError("Access denied to secret: unauthorized_key")
    mock_create_get_key.side_effect = access_denied_error
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="unauthorized_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_config.model_dump())

    # Act & Assert - draining the op's generator must surface the error
    with pytest.raises(ValueError, match="Access denied to secret: unauthorized_key") as excinfo:
        list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"]))

    # The error message states that access was denied
    raised_message = str(excinfo.value)
    assert (
        "Access denied" in raised_message
    ), "Error message should clearly indicate access was denied"
    # The key lookup was attempted exactly once
    mock_create_get_key.assert_called_once_with("decrypt", "unauthorized_key")
# ------------------- AC4: Restoration Denial when Key is Invalid ----------------------------
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac4_restoration_denial_when_key_invalid(mock_create_get_key, encrypted_text_data: dict):
    """AC4: Restoration fails when the returned key did not encrypt the data."""
    # Arrange - hand back a freshly generated key, unrelated to the fixture's key
    invalid_key = Fernet.generate_key()
    mock_create_get_key.return_value = invalid_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="wrong_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_config.model_dump())

    # Act & Assert - draining the op's generator must surface the error
    with pytest.raises(ValueError, match="Invalid Fernet token") as excinfo:
        list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"]))

    # The error message reports the decryption failure
    raised_message = str(excinfo.value)
    assert (
        "Invalid Fernet token" in raised_message
    ), "Error message should indicate the key is invalid for this data"
    # The key was still looked up exactly once
    mock_create_get_key.assert_called_once_with("decrypt", "wrong_key")
# -------------------------------- Additional Edge Cases ----------------------------------------
def test_depseudonymize_unstructured_no_decrypt_config():
    """Edge case: With no techniques configured, the text comes back unchanged."""
    # Arrange - an empty technique list
    original_text = "This text has no {encrypt:values} to decrypt."
    op_config = DepseudonymizeUnstructuredConfig(used_function=[])
    context = build_op_context(op_config=op_config.model_dump())

    # Act
    outputs = depseudonymize_unstructured(context, input_text=original_text)
    result_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    returned_text = result_output.value
    assert (
        returned_text == original_text
    ), "Text should remain unchanged when no decryption is configured"
    decrypt_count = metrics_output.value["total_depseudo_count"]
    assert (
        decrypt_count == 0
    ), "Should report zero decryptions performed"
def test_depseudonymize_unstructured_empty_text():
    """Edge case: Empty input text comes back empty with zero decryptions."""
    # Arrange
    empty_text = ""
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_config = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_config.model_dump())

    # Act - the generator is consumed while the patch is still active
    key_lookup_target = (
        "src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
    )
    with patch(key_lookup_target) as mock_key:
        mock_key.return_value = Fernet.generate_key()
        outputs = depseudonymize_unstructured(context, input_text=empty_text)
        result_output = next(outputs)
        metrics_output = next(outputs)

    # Assert
    assert result_output.value == "", "Empty text should remain empty"
    decrypt_count = metrics_output.value["total_depseudo_count"]
    assert (
        decrypt_count == 0
    ), "Should report zero decryptions for empty text"

View File

@@ -1,853 +0,0 @@
"""
Test suite for field-level pseudonymisation operations on unstructured data.
This test suite validates the pseudonymisation of unstructured text with PII detection,
covering the following Acceptance Criteria:
## Test Coverage Summary
### Acceptance Criteria Coverage:
- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests
- AC2 (Invalid Execution Handling): 5 tests
- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests
- AC4 (Execution Audit & Logging - Negative Scenario): 4 tests
- Additional Coverage: 3 tests
### Test Pattern:
- Each test uses build_op_context with config_to_dagster_dict_unstructured for configuration
- Tests validate dual outputs (data, metrics)
- Vault access is mocked for isolation
- Tests validate Scrubadub automatic PII detection
- Tests ensure placeholder replacement for unconfigured PII
"""
import pytest
import re
from dagster import build_op_context
from unittest.mock import patch, MagicMock
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
AnonymisePseudonymizeUnstructuredConfig,
EncryptConfig,
RetainConfig,
PseudoTechniqueConfig,
)
from template_code_location.field_level_pseudo_anonymisation.config_models import PIIEntityEnum, LanguageEnum
from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import (
anonymize_pseudonymize_unstructured,
)
from .conftest import clear_vault_key
def config_to_dagster_dict_unstructured(config):
    """
    Convert an unstructured config model into the dict passed as op config.

    Each technique becomes ``{"technique": {<type>: {...fields without "type"}}}``
    and any ``pii`` entries are replaced by their member names.

    Args:
        config: Object exposing ``language.value`` and ``used_function``

    Returns:
        dict: ``{"language": ..., "used_function": [...]}``
    """
    language_value = config.language.value
    entries = []
    for func_config in config.used_function:
        technique = func_config.technique
        variant_key = technique.type
        fields = technique.model_dump()
        if "pii" in fields:
            # Use member names rather than the dumped values.
            fields["pii"] = [entity.name for entity in technique.pii]
        # The type is carried by the variant key, so drop it from the payload.
        fields.pop("type", None)
        entries.append({"technique": {variant_key: fields}})
    return {"language": language_value, "used_function": entries}
def run_unstructured_op(config, text):
    """
    Run the unstructured pseudonymisation op and unwrap both of its outputs.

    Args:
        config: Unstructured config model, converted to op config before the call
        text: Input text handed to the op

    Returns:
        tuple: (result_text: str, metrics_markdown: str)
    """
    op_config = config_to_dagster_dict_unstructured(config)
    context = build_op_context(op_config=op_config)
    # The op produces exactly two Output objects: text first, then metrics.
    text_output, metrics_output = anonymize_pseudonymize_unstructured(context, text=text)
    return text_output.value, metrics_output.value
def parse_metrics_markdown(metrics_md: str) -> dict:
    """
    Parse markdown metrics into structured dict for easier testing.

    Sections that are absent from the markdown keep their empty defaults.
    The last table row or technique bullet is parsed even when the markdown
    does not end with a newline.

    Args:
        metrics_md: Markdown metrics string from op output

    Returns:
        dict with keys: total_pii_detected, pii_by_type, techniques_applied, language

    Raises:
        ValueError: If a two-cell table row has a non-integer count
    """
    result = {
        "total_pii_detected": 0,
        "pii_by_type": {},
        "techniques_applied": {},
        "language": "",
    }

    # Extract total PII detected
    total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md)
    if total_match:
        result["total_pii_detected"] = int(total_match.group(1))

    # Extract language
    lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md)
    if lang_match:
        result["language"] = lang_match.group(1)

    # Extract PII by type from table: heading, header row, separator row, then
    # the data rows. The newline after a row is optional (`\n?`) so the final
    # row still matches when it is the last line of the document.
    pii_table_section = re.search(
        r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+\n?)+)",
        metrics_md,
    )
    if pii_table_section:
        for row in pii_table_section.group(1).strip().split("\n"):
            cells = [cell.strip() for cell in row.split("|") if cell.strip()]
            # Rows that do not have exactly two non-empty cells are skipped.
            if len(cells) == 2:
                entity_type, count = cells
                result["pii_by_type"][entity_type] = int(count)

    # Extract techniques applied; same optional trailing newline as above.
    techniques_section = re.search(
        r"### Techniques Applied\n((?:- \*\*[^\n]+\n?)+)", metrics_md
    )
    if techniques_section:
        for bullet in techniques_section.group(1).strip().split("\n"):
            tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", bullet)
            if tech_match:
                pii_type, technique = tech_match.groups()
                result["techniques_applied"][pii_type] = technique

    return result
# -------------------------------- Fixtures ----------------------------------------
@pytest.fixture
def sample_text_en():
    """Provide an English passage mentioning a name, employer, email, phone, address and SSN."""
    english_passage = """
John Smith works at Acme Corporation. His email is john.smith@example.com
and his phone number is +1-555-123-4567. He lives in New York City at
123 Main Street, Apartment 4B. His SSN is 123-45-6789.
"""
    return english_passage
@pytest.fixture
def sample_text_multi_person():
    """Provide a passage that names five different people."""
    meeting_notes = """
The meeting included Alice Johnson, Bob Williams, and Charlie Brown.
They discussed the project with Maria Garcia and David Wilson.
"""
    return meeting_notes
@pytest.fixture
def sample_text_mixed_pii():
    """Provide a contact card with a name, email, phone number and URL (used by the AC1 tests)."""
    contact_card = """
Contact Information:
Name: Dr. Emily Watson
Email: emily.watson@hospital.com
Phone: +44-20-7946-0958
Website: https://patient-portal.hospital.com/records
"""
    return contact_card
@pytest.fixture
def encrypt_person_config():
    """Build an English config that encrypts PERSON entities under ``test_person_key``."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_key",
    )
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
@pytest.fixture
def retain_person_config():
    """Build an English config whose only technique is ``retain`` for PERSON entities."""
    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_retention)],
    )
@pytest.fixture
def mixed_technique_config():
    """Build an English config: encrypt PERSON and EMAIL, retain PHONE_NUMBERS (AC1 tests)."""
    encrypt_identity = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_mixed_key",
    )
    retain_phone = RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS])
    # Order matters to readers of the config: encryption entry first, retention second.
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[
            PseudoTechniqueConfig(technique=encrypt_identity),
            PseudoTechniqueConfig(technique=retain_phone),
        ],
    )
# ================================================================================================
# AC1: Pseudonymisation and Retention Are Applied Correctly
# ================================================================================================
def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config):
    """AC1: A PERSON entity configured for encryption must not survive in plaintext."""
    clear_vault_key("test_person_key")

    output_text, raw_metrics = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii)
    parsed = parse_metrics_markdown(raw_metrics)

    # The configured name is gone and an encryption token is present.
    assert "Emily Watson" not in output_text, "Configured PERSON PII should be encrypted"
    assert "{encrypt:" in output_text, "Encrypted token should be present in result"

    # Metrics report at least one detection, including the PERSON type.
    assert parsed["total_pii_detected"] > 0, "System should detect PII entities"
    assert "PERSON" in parsed["pii_by_type"], "PERSON type should be in detected PII"

    # Text surrounding the PII is left intact.
    assert "Contact Information:" in output_text, "Non-PII text structure should be preserved"
def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person):
    """AC1: PERSON entities configured with ``retain`` must stay in plaintext."""
    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    retain_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_retention)],
    )

    output_text, raw_metrics = run_unstructured_op(retain_config, sample_text_multi_person)
    parsed = parse_metrics_markdown(raw_metrics)

    # Retained names are still readable in the output.
    assert "Alice Johnson" in output_text, "Retained PERSON PII should remain unchanged"
    assert "Bob Williams" in output_text, "Retained PERSON PII should remain unchanged"

    # The metrics record 'retain' as the technique used for PERSON.
    recorded_technique = parsed["techniques_applied"].get("PERSON", "").lower()
    assert "retain" in recorded_technique, "Retain technique should be recorded for PERSON type"
def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii):
    """AC1: PII types with no configured technique must be swapped for placeholders."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_only_key",
    )
    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_person_only_key")

    output_text, _ = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)

    # PERSON is the only configured type, so the name must be encrypted.
    assert "Emily Watson" not in output_text, "Configured PERSON should be encrypted"

    # Everything else detected should show up as a {{...}} placeholder.
    has_open = "{{" in output_text
    assert has_open and "}}" in output_text, "Unconfigured PII should be replaced with placeholders"

    # The raw email must no longer be present.
    assert (
        "emily.watson@hospital.com" not in output_text
    ), "Unconfigured EMAIL should be replaced with placeholder"

    # At least one placeholder names its entity type.
    typed_placeholders = ("{{EMAIL}}", "{{URL}}")
    assert any(
        marker in output_text for marker in typed_placeholders
    ), "Placeholders should indicate entity type"
def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config):
    """AC1: Encrypt and retain techniques configured together are each applied to their types."""
    clear_vault_key("test_mixed_key")

    output_text, raw_metrics = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii)
    parsed = parse_metrics_markdown(raw_metrics)

    # Encrypted types (PERSON, EMAIL) disappear from the plaintext.
    assert "Emily Watson" not in output_text, "Configured PERSON should be encrypted"
    assert "emily.watson@hospital.com" not in output_text, "Configured EMAIL should be encrypted"

    # The retained type (PHONE_NUMBERS) is still readable.
    assert "+44-20-7946-0958" in output_text, "Configured PHONE_NUMBERS should be retained"

    # Metrics name the technique used per entity type.
    applied = parsed["techniques_applied"]
    person_technique = applied.get("PERSON", "").lower()
    assert "encrypt" in person_technique, "Encrypt technique should be applied to PERSON"
    email_technique = applied.get("EMAIL", "").lower()
    assert "encrypt" in email_technique, "Encrypt technique should be applied to EMAIL"
    phone_technique = applied.get("PHONE_NUMBERS", "").lower()
    assert "retain" in phone_technique, "Retain technique should be applied to PHONE_NUMBERS"
def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config):
    """AC1: Every occurrence of a configured PII type is processed, not just the first."""
    clear_vault_key("test_person_key")

    output_text, raw_metrics = run_unstructured_op(encrypt_person_config, sample_text_multi_person)
    parsed = parse_metrics_markdown(raw_metrics)

    # None of the five names from the fixture may remain readable.
    person_names = (
        "Alice Johnson",
        "Bob Williams",
        "Charlie Brown",
        "Maria Garcia",
        "David Wilson",
    )
    for name in person_names:
        assert name not in output_text, f"All PERSON instances should be encrypted: {name}"

    # The PERSON count in the metrics covers all of them.
    expected_minimum = len(person_names)
    detected_persons = parsed["pii_by_type"].get("PERSON", 0)
    assert (
        detected_persons >= expected_minimum
    ), f"Should detect at least {len(person_names)} PERSON entities"
def test_ac1_empty_text_returns_empty(encrypt_person_config):
    """AC1: An empty input string is rejected with a ValueError mentioning 'empty'."""
    clear_vault_key("test_person_key")

    with pytest.raises(ValueError) as raised:
        run_unstructured_op(encrypt_person_config, "")

    message = str(raised.value).lower()
    assert "empty" in message, "Error should indicate empty input"
def test_ac1_text_without_pii_remains_unchanged():
    """AC1: Input with no PII comes back unchanged and reports zero detections."""
    no_pii_text = """
The weather today is sunny with a high of 25 degrees Celsius.
The conference starts at 9:00 AM in Room 301.
"""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_no_pii_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_no_pii_key")

    output_text, raw_metrics = run_unstructured_op(config, no_pii_text)
    parsed = parse_metrics_markdown(raw_metrics)

    # Compare modulo leading/trailing whitespace only.
    expected = no_pii_text.strip()
    assert output_text.strip() == expected, "Text without PII should remain unchanged"
    assert parsed["total_pii_detected"] == 0, "No PII should be detected"
def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii):
    """AC1: Placeholders for unconfigured PII follow an upper-case ``{{TYPE}}`` shape."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_placeholder_key",
    )
    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_placeholder_key")

    output_text, raw_metrics = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)
    parsed = parse_metrics_markdown(raw_metrics)

    # Placeholder shape: double braces around upper-case letters/underscores
    # (per the original author's note, this is the format scrubadub emits).
    found_placeholders = re.findall(r"\{\{[A-Z_]+\}\}", output_text)
    assert (
        found_placeholders
    ), "Result should contain entity-type placeholders for unconfigured PII"

    # Metrics list at least one detected PII type.
    assert len(parsed["pii_by_type"]) > 0, "Metrics should list detected PII types"
# ================================================================================================
# AC2: Invalid Execution Handling
# ================================================================================================
def test_ac2_graceful_abort_on_scrubadub_failure():
    """AC2: A failure inside the Scrubadub scrubber surfaces as a RuntimeError."""
    text = "Test user John Smith with email john@example.com"
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_abort_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_abort_key")

    # Swap the Scrubber class at the path used by unstructured_ops so that any
    # instance it builds raises as soon as iter_filth is called.
    scrubber_target = "field_level_pseudo_anonymisation.unstructured_ops.scrubadub.Scrubber"
    with patch(scrubber_target) as scrubber_cls:
        failing_scrubber = MagicMock()
        failing_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error")
        scrubber_cls.return_value = failing_scrubber

        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(config, text)

        message = str(raised.value).lower()
        expected_hints = ("pii", "detection", "scrubadub", "failed")
        assert any(
            hint in message for hint in expected_hints
        ), "Error message should indicate PII detection failure"
def test_ac2_graceful_abort_on_encryption_failure(sample_text_en):
    """AC2: An exception raised by the encrypt technique surfaces as a RuntimeError."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_encrypt_fail_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_encrypt_fail_key")

    # The encrypt function is patched in the techniques module it is imported from.
    encrypt_path = (
        "field_level_pseudo_anonymisation"
        ".techniques.anonymisation_pseudonymisation_techniques.encrypt"
    )
    with patch(encrypt_path, side_effect=Exception("Encryption algorithm failure")):
        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(config, sample_text_en)

        message = str(raised.value).lower()
        expected_hints = ("encrypt", "failed", "technique")
        assert any(
            hint in message for hint in expected_hints
        ), "Error message should indicate encryption failure"
def test_ac2_null_text_input_raises_error(encrypt_person_config):
    """AC2: Passing ``None`` as the text input is rejected with an error."""
    clear_vault_key("test_person_key")

    # Per the original author's note, Dagster is expected to raise
    # DagsterTypeCheckDidNotPass before the op body runs; ValueError and
    # TypeError are accepted as well.
    from dagster import DagsterTypeCheckDidNotPass

    accepted_errors = (ValueError, DagsterTypeCheckDidNotPass, TypeError)
    with pytest.raises(accepted_errors):
        run_unstructured_op(encrypt_person_config, None)
def test_ac2_invalid_language_configuration():
    """AC2: Building a config with an unsupported language raises ValueError or TypeError."""
    # The failure is expected while the config object is being constructed
    # (the original author attributes it to Pydantic validation), so every
    # constructor call stays inside the raises block.
    with pytest.raises((ValueError, TypeError)):
        AnonymisePseudonymizeUnstructuredConfig(
            language="invalid_lang",  # not a supported language value
            used_function=[
                PseudoTechniqueConfig(
                    technique=EncryptConfig(
                        key_name="test_key",
                        pii=[PIIEntityEnum.PERSON],
                        type="encrypt",
                    )
                )
            ],
        )
def test_ac2_very_large_text_processing():
    """AC2: A large input built from 1000 repetitions of a PII sentence is processed end to end."""
    # One short paragraph with a name and an email, repeated 1000 times.
    repeated_unit = """
John Smith works at company. Email: john.smith@example.com.
"""
    large_text = repeated_unit * 1000

    identity_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_large_text_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=identity_encryption)],
    )
    clear_vault_key("test_large_text_key")

    output_text, raw_metrics = run_unstructured_op(config, large_text)
    parsed = parse_metrics_markdown(raw_metrics)

    # Processing finished and produced non-empty output with detections.
    assert output_text is not None, "Large text should be processed successfully"
    assert len(output_text) > 0, "Result should not be empty"
    assert parsed["total_pii_detected"] > 0, "PII should be detected in large text"
# ================================================================================================
# AC3: Execution Audit & Logging - Positive Scenario
# ================================================================================================
def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config):
    """AC3: On success, the op context exposes a run ID and both outputs are returned."""
    clear_vault_key("test_person_key")

    context = build_op_context(
        op_config=config_to_dagster_dict_unstructured(encrypt_person_config)
    )
    # Read the run ID before the op executes.
    captured_run_id = context.run_id

    text_output, metrics_output = anonymize_pseudonymize_unstructured(
        context, text=sample_text_en
    )

    # A run identifier exists for audit logging.
    assert captured_run_id is not None, "Run ID must be available for audit logging"
    # Both outputs are present for Dagster to log.
    assert text_output is not None, "Result text should be available for logging"
    assert metrics_output is not None, "Metrics should be available for logging"
def test_ac3_successful_execution_logs_configuration_parameters(
    sample_text_en, mixed_technique_config
):
    """AC3: After a successful run, the op config and applied techniques are inspectable."""
    clear_vault_key("test_mixed_key")

    op_config_dict = config_to_dagster_dict_unstructured(mixed_technique_config)
    context = build_op_context(op_config=op_config_dict)
    _, metrics_output = anonymize_pseudonymize_unstructured(context, text=sample_text_en)

    # The converted configuration keeps both technique entries.
    assert "used_function" in op_config_dict, "Configuration must be accessible for logging"
    configured_functions = op_config_dict["used_function"]
    assert len(configured_functions) == 2, "Multiple techniques should be captured"

    # Each technique name appears somewhere in its rendered config entry.
    rendered_techniques = [str(entry["technique"]) for entry in configured_functions]
    assert any(
        "encrypt" in rendered for rendered in rendered_techniques
    ), "Encrypt technique should be in configuration"
    assert any(
        "retain" in rendered for rendered in rendered_techniques
    ), "Retain technique should be in configuration"

    # The metrics markdown carries a techniques section.
    metrics_markdown = metrics_output.value
    assert (
        "Techniques Applied" in metrics_markdown
    ), "Applied techniques should be in metrics for logging"
def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config):
    """AC3: Neither the metrics output nor the op config contains raw PII values."""
    clear_vault_key("test_person_key")

    op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config)
    context = build_op_context(op_config=op_config_dict)
    _, metrics_output = anonymize_pseudonymize_unstructured(context, text=sample_text_mixed_pii)

    sensitive_values = ("Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958")

    # The metrics markdown must not echo any raw value from the input.
    metrics_markdown = metrics_output.value
    for pii_value in sensitive_values:
        assert (
            pii_value not in metrics_markdown
        ), f"Raw PII value should not appear in metrics: {pii_value}"

    # The same holds for the stringified configuration.
    rendered_config = str(op_config_dict)
    for pii_value in sensitive_values:
        assert (
            pii_value not in rendered_config
        ), f"Raw PII value should not appear in configuration logs: {pii_value}"
# ================================================================================================
# AC4: Execution Audit & Logging - Negative Scenario
# ================================================================================================
def test_ac4_failed_execution_logs_error_details():
    """AC4: A key-retrieval failure surfaces as a RuntimeError naming the key failure."""
    text = "Test user John Smith"
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_fail_log_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_fail_log_key")
    ctx = build_op_context(op_config=config_to_dagster_dict_unstructured(config))

    # Force the key lookup referenced from unstructured_ops to fail.
    key_factory_target = (
        "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
    )
    with patch(key_factory_target) as failing_key_factory:
        failing_key_factory.side_effect = RuntimeError("Encryption key retrieval failed")
        with pytest.raises(RuntimeError) as raised:
            # Exhaust the op's result so the body actually runs and raises.
            list(anonymize_pseudonymize_unstructured(ctx, text=text))

    msg = str(raised.value).lower()
    mentions_key = "key" in msg
    assert mentions_key and "failed" in msg, "Error message should mention key failure"
def test_ac4_failed_execution_logs_configuration_used():
    """AC4: When the run fails, the attempted configuration is still available for inspection."""
    text = "Test data with person John Doe"
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_config_fail_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_config_fail_key")

    op_config_dict = config_to_dagster_dict_unstructured(config)
    context = build_op_context(op_config=op_config_dict)

    # Make scrubber initialisation fail.
    init_target = "field_level_pseudo_anonymisation.unstructured_ops._initialize_scrubber"
    with patch(init_target, side_effect=Exception("Scrubber module not available")):
        # NOTE(review): RuntimeError is already covered by Exception; the tuple
        # is kept exactly as the test was written.
        with pytest.raises((RuntimeError, Exception)) as raised:
            list(anonymize_pseudonymize_unstructured(context, text=text))

    # The configuration dict survives the failure.
    assert op_config_dict is not None, "Configuration must be accessible for failure audit"
    assert (
        "used_function" in op_config_dict
    ), "Technique configuration should be available for diagnosis"

    # The raised error carries a message pointing at detection/processing.
    message = str(raised.value).lower()
    expected_hints = ("pii", "detection", "failed", "scrubber", "module")
    assert any(
        hint in message for hint in expected_hints
    ), "Error should indicate detection/processing failed"
def test_ac4_failed_execution_logs_failure_reason():
    """AC4: The error raised on a key-retrieval failure states why the run failed."""
    text = "User: Alice Smith, Email: alice@example.com"
    identity_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_failure_reason_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=identity_encryption)],
    )
    clear_vault_key("test_failure_reason_key")

    # Make the key retrieval function fail.
    key_factory_target = (
        "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
    )
    with patch(key_factory_target, side_effect=RuntimeError("Vault connection timeout")):
        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(config, text)

    # The message should contain at least one hint about the cause.
    message = str(raised.value).lower()
    expected_hints = ("encrypt", "key", "timeout", "failed")
    assert any(
        hint in message for hint in expected_hints
    ), "Error should indicate key retrieval/encryption failure"
# ================================================================================================
# Additional Tests - Edge Cases and Integration
# ================================================================================================
def test_multi_language_support_italian():
    """Additional test: Italian input is altered by processing and reports detected PII."""
    italian_text = """
Il dottor Marco Rossi lavora presso l'ospedale.
Email: marco.rossi@ospedale.it
Telefono: +39-06-12345678
"""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_italian_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.it,
        used_function=[PseudoTechniqueConfig(technique=person_encryption)],
    )
    clear_vault_key("test_italian_key")

    output_text, raw_metrics = run_unstructured_op(config, italian_text)
    parsed = parse_metrics_markdown(raw_metrics)

    # The output differs from the input and at least one entity was detected.
    assert output_text != italian_text, "Italian text should be processed"
    assert parsed["total_pii_detected"] > 0, "PII should be detected in Italian text"
def test_special_characters_in_text():
    """Additional test: Input with accents, emoji and currency symbols is processed without error."""
    special_text = """
User: João da Silva 🇧🇷
Email: joão@empresa.com.br
Message: "Hello, World!" — Testing special chars: €, £, ¥, ©, ®
"""
    identity_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_special_chars_key",
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.pt,
        used_function=[PseudoTechniqueConfig(technique=identity_encryption)],
    )
    clear_vault_key("test_special_chars_key")

    output_text, _ = run_unstructured_op(config, special_text)

    # Reaching this point means no encoding error was raised; the output is non-empty.
    assert output_text is not None, "Special characters should not cause processing failure"
    assert len(output_text) > 0, "Result should not be empty"
def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config):
    """Additional test: Two runs on the same input yield the same detection count and token count."""
    clear_vault_key("test_person_key")

    first_text, first_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)
    second_text, second_metrics_md = run_unstructured_op(encrypt_person_config, sample_text_en)

    # Each run produces at least one encryption token.
    assert "{encrypt:" in first_text, "First run should produce encrypted tokens"
    assert "{encrypt:" in second_text, "Second run should produce encrypted tokens"

    # The number of detected entities is the same in both runs.
    first_total = parse_metrics_markdown(first_metrics_md)["total_pii_detected"]
    second_total = parse_metrics_markdown(second_metrics_md)["total_pii_detected"]
    assert first_total == second_total, "PII detection should be consistent across runs"

    # Token values are not compared, only how many match the expected shape
    # (the original author describes it as the Fernet base64 pattern).
    token_regex = re.compile(r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}")
    first_tokens = token_regex.findall(first_text)
    second_tokens = token_regex.findall(second_text)
    assert len(first_tokens) == len(
        second_tokens
    ), "Same number of encryption tokens should be generated"

View File

@@ -1,58 +0,0 @@
from template_code_location.field_level_pseudo_anonymisation.jobs import (
anonymize_pseudonymize_structured_job,
anonymize_pseudonymize_structured_job_s3,
depseudonymize_structured_job,
depseudonymize_structured_job_s3,
anonymize_pseudonymize_unstructured_job_s3,
anonymize_pseudonymize_unstructured_job,
depseudonymize_unstructured_job_s3,
depseudonymize_unstructured_job
)
def test_anonymize_pseudonymize_structured_job_is_callable():
    """Check that anonymize_pseudonymize_structured_job is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_anonymize_pseudonymize_structured_job_s3_is_callable():
    """Check that anonymize_pseudonymize_structured_job_s3 is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_depseudonymize_structured_job_is_callable():
    """Check that depseudonymize_structured_job is callable and exposes execute_in_process."""
    job = depseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_depseudonymize_structured_job_s3_is_callable():
    """Check that depseudonymize_structured_job_s3 is callable and exposes execute_in_process."""
    job = depseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_anonymize_pseudonymize_unstructured_job_is_callable():
    """Check that anonymize_pseudonymize_unstructured_job is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_anonymize_pseudonymize_unstructured_job_s3_is_callable():
    """Check that anonymize_pseudonymize_unstructured_job_s3 is callable and exposes execute_in_process."""
    job = anonymize_pseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_depseudonymize_unstructured_job_is_callable():
    """Check that depseudonymize_unstructured_job is callable and exposes execute_in_process."""
    job = depseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
def test_depseudonymize_unstructured_job_s3_is_callable():
    """Check that depseudonymize_unstructured_job_s3 is callable and exposes execute_in_process."""
    job = depseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")