change to import from modules
This commit is contained in:
@@ -41,6 +41,10 @@ dependencies = [
|
|||||||
"cryptography>=42.0.0",
|
"cryptography>=42.0.0",
|
||||||
# Util services — resolved via [tool.uv.sources] (git)
|
# Util services — resolved via [tool.uv.sources] (git)
|
||||||
"util-services",
|
"util-services",
|
||||||
|
# Code location packages — resolved via [tool.uv.sources] (git)
|
||||||
|
"data-processing",
|
||||||
|
"dataframe-level-anonymisation",
|
||||||
|
"field-level-pseudo-anonymisation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.uv]
|
[tool.uv]
|
||||||
@@ -49,6 +53,9 @@ exclude-dependencies = ["transformers", "spacy-transformers"]
|
|||||||
[tool.uv.sources]
|
[tool.uv.sources]
|
||||||
torch = { index = "pytorch-cpu" }
|
torch = { index = "pytorch-cpu" }
|
||||||
util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.5.0" }
|
util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.5.0" }
|
||||||
|
data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", branch = "feature/SIMPL-24642" }
|
||||||
|
dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", branch = "feature/SIMPL-24642" }
|
||||||
|
field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", branch = "feature/SIMPL-24642" }
|
||||||
|
|
||||||
[[tool.uv.index]]
|
[[tool.uv.index]]
|
||||||
name = "pytorch-cpu"
|
name = "pytorch-cpu"
|
||||||
|
|||||||
@@ -1,18 +0,0 @@
|
|||||||
"""Configuration models for data processing."""
|
|
||||||
|
|
||||||
from .columns_select_configuration import ColumnsSelectConfiguration
|
|
||||||
from .fill_missing_config import FillMissingConfiguration
|
|
||||||
from .spell_check_configuration import SpellCheckConfiguration
|
|
||||||
from .coordinates_normalization_configuration import CoordinatesNormalizationConfiguration
|
|
||||||
from .aggregation_configuration import AggregationConfiguration
|
|
||||||
from .filter_configuration import DatasetFilterConfiguration, FilterCondition
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"ColumnsSelectConfiguration",
|
|
||||||
"FillMissingConfiguration",
|
|
||||||
"SpellCheckConfiguration",
|
|
||||||
"CoordinatesNormalizationConfiguration",
|
|
||||||
"AggregationConfiguration",
|
|
||||||
"FilterCondition",
|
|
||||||
"DatasetFilterConfiguration"
|
|
||||||
]
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
from typing import List
|
|
||||||
|
|
||||||
from pydantic import Field, field_validator
|
|
||||||
|
|
||||||
from .columns_select_configuration import ColumnsSelectConfiguration
|
|
||||||
|
|
||||||
|
|
||||||
class AggregationConfiguration(ColumnsSelectConfiguration):
    # Column-selection config extended with the name of the aggregation to run.
    # (Documented with comments rather than a docstring so the generated
    # config schema description stays unchanged.)
    operation: str = Field(
        description="Aggregation operations: sum, mean, min, max, count",
        default="sum",
    )

    @field_validator("operation")
    @classmethod
    def validate_operations(cls, value):
        # Accept only the supported aggregation names; anything else is rejected
        # with a message listing the allowed set.
        allowed = {"sum", "mean", "min", "max", "count"}
        if value in allowed:
            return value

        raise ValueError(
            f"Invalid aggregation operation '{value}'. "
            f"Allowed values: {allowed}"
        )
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
from typing import List
|
|
||||||
from pydantic import Field,field_validator
|
|
||||||
from dagster import Config
|
|
||||||
|
|
||||||
|
|
||||||
class ColumnsSelectConfiguration(Config):
    # Config holding the list of column names an op should work on.
    columns: List[str] = Field(
        description="List of columns to process.",
        default=["Name"],
    )

    @field_validator("columns")
    @classmethod
    def ensure_unique_columns(cls, v: List[str]) -> List[str]:
        # Drop repeated names while keeping the first occurrence of each,
        # in the order they were supplied.
        seen = set()
        deduplicated = []
        for name in v:
            if name in seen:
                continue
            seen.add(name)
            deduplicated.append(name)
        return deduplicated
|
|
||||||
@@ -1,22 +0,0 @@
|
|||||||
from typing import Optional
|
|
||||||
|
|
||||||
from pydantic import Field, model_validator
|
|
||||||
from dagster import Config
|
|
||||||
|
|
||||||
|
|
||||||
class CoordinatesNormalizationConfiguration(Config):
    # Names of the latitude / longitude columns to normalise.
    # An explicit null for either field is replaced by its default name.
    latColumn: Optional[str] = Field(
        default="lat", description="Latitude column name"
    )
    lonColumn: Optional[str] = Field(
        default="lon", description="Longitude column name"
    )

    @model_validator(mode="before")
    @classmethod
    def replace_nulls_with_defaults(cls, values):
        # A "before" validator receives the raw input, which is not guaranteed
        # to be a dict. Leave anything else for pydantic's own validation
        # instead of failing on `.get`.
        if not isinstance(values, dict):
            return values

        # Work on a shallow copy so the caller's dict is never modified.
        patched = dict(values)
        for field_name, fallback in (("latColumn", "lat"), ("lonColumn", "lon")):
            if patched.get(field_name) is None:
                patched[field_name] = fallback
        return patched
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
from typing import Dict
|
|
||||||
from dagster import Config
|
|
||||||
from pydantic import Field
|
|
||||||
|
|
||||||
|
|
||||||
class FillMissingConfiguration(Config):
    # Config for the missing-value filling step: a column-name -> fill-value map.
    fill_map: Dict[str, str] = Field(
        description="Missing values filling map.",
        default={"Age": "UNKNOWN_AGE"},
    )
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
from enum import Enum
|
|
||||||
import operator
|
|
||||||
from typing import List, Literal, Callable
|
|
||||||
from pydantic import Field, model_validator
|
|
||||||
from dagster import Config
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
class FilterOperator(str, Enum):
    # Comparison operators accepted in filter conditions. Each member's value
    # is the operator's symbol.
    EQ = "=="
    NE = "!="
    LT = "<"
    LE = "<="
    GT = ">"
    GE = ">="

    @property
    def function(self) -> Callable:
        # Every member name is the upper-case form of the matching function in
        # the `operator` module (EQ -> operator.eq, ...), so resolve it by name.
        return getattr(operator, self.name.lower())
|
|
||||||
|
|
||||||
class FilterCondition(Config):
    # One comparison of a DataFrame column against a constant.
    column: str = Field(..., description="Name of the column to filter")
    type: Literal["string", "numeric"] = Field(..., description="Column type (string or numeric)")
    value: str = Field(..., description="Value to compare against")
    op: FilterOperator = Field(default=FilterOperator.EQ, description="Operator to apply (string supports only EQ and NE)")

    @model_validator(mode="after")
    def check_operator_compatibility(self) -> "FilterCondition":
        # Ordering operators are rejected for string-typed conditions.
        string_operators = [FilterOperator.EQ, FilterOperator.NE]
        if self.type != "string" or self.op in string_operators:
            return self

        raise ValueError(
            f"Invalid operator '{self.op.name}' for type 'string'. "
            "Only EQ (==) and NE (!=) are allowed."
        )

    def apply(self, df: pd.DataFrame) -> pd.Series:
        # Numeric conditions compare against float(value); string conditions
        # compare against the raw text. The result is the element-wise
        # comparison of the configured column with that operand.
        if self.type == "numeric":
            operand = float(self.value)
        else:
            operand = self.value
        compare = self.op.function
        return compare(df[self.column], operand)
|
|
||||||
|
|
||||||
class DatasetFilterConfiguration(Config):
    # Collection of filter conditions for the dataset filter op; empty by default.
    conditions: List[FilterCondition] = Field(
        description=(
            "List of filter conditions to apply on the dataset. "
            "String columns support only 'EQ' and 'NE', numeric columns also support 'LT', 'LE', 'GT' and 'GE'."
        ),
        default=[],
    )
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
from typing import Literal
|
|
||||||
from pydantic import Field
|
|
||||||
|
|
||||||
from .columns_select_configuration import ColumnsSelectConfiguration
|
|
||||||
|
|
||||||
|
|
||||||
class SpellCheckConfiguration(ColumnsSelectConfiguration):
    # Column-selection config extended with the spell-checker language code.
    language: Literal["en", "es", "it", "fr", "pt", "de", "nl"] = Field(
        description="Language to use in the SpellChecker module.",
        default="en",
    )
|
|
||||||
@@ -1,119 +0,0 @@
|
|||||||
from dagster import job
|
|
||||||
from util_services.util_ops import (
|
|
||||||
preview_dataframe,
|
|
||||||
read_structured_from_s3,
|
|
||||||
write_df_to_s3,
|
|
||||||
)
|
|
||||||
from .ops import (
|
|
||||||
remove_duplicates,
|
|
||||||
fill_missing_values,
|
|
||||||
standardize_categorical_values,
|
|
||||||
correct_typos,
|
|
||||||
normalize_numeric_min_max,
|
|
||||||
normalize_datetime,
|
|
||||||
normalize_coordinates,
|
|
||||||
add_global_aggregations,
|
|
||||||
filter_dataset
|
|
||||||
)
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def remove_duplicates_job_s3():
    # Wires read_structured_from_s3 -> remove_duplicates -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the result.
    # The ops are invoked in the original order on purpose: Dagster derives
    # the aliases of repeated ops from invocation order.
    source_df = read_structured_from_s3()
    result_df = remove_duplicates(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def fill_missing_values_job_s3():
    # Wires read_structured_from_s3 -> fill_missing_values -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = fill_missing_values(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def standardize_categorical_values_job_s3():
    # Wires read_structured_from_s3 -> standardize_categorical_values ->
    # write_df_to_s3 and invokes preview_dataframe on input and result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = standardize_categorical_values(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def correct_typos_job_s3():
    # Wires read_structured_from_s3 -> correct_typos -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = correct_typos(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def normalize_numeric_min_max_job_s3():
    # Wires read_structured_from_s3 -> normalize_numeric_min_max ->
    # write_df_to_s3 and invokes preview_dataframe on input and result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = normalize_numeric_min_max(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def normalize_datetime_job_s3():
    # Wires read_structured_from_s3 -> normalize_datetime -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = normalize_datetime(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def normalize_coordinates_job_s3():
    # Wires read_structured_from_s3 -> normalize_coordinates -> write_df_to_s3
    # and invokes preview_dataframe on both the input and the result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = normalize_coordinates(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def add_global_aggregations_job_s3():
    # Wires read_structured_from_s3 -> add_global_aggregations -> write_df_to_s3
    # and invokes preview_dataframe on both the input and the result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = add_global_aggregations(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "PROCESSING",
        "resource_type": "RD_DATA",
    }
)
def filter_dataset_job_s3():
    # Wires read_structured_from_s3 -> filter_dataset -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the result.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    result_df = filter_dataset(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(result_df)
    preview_dataframe(result_df)
|
|
||||||
@@ -1,256 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
from dagster import Out, op
|
|
||||||
from spellchecker import SpellChecker
|
|
||||||
|
|
||||||
from template_code_location.data_processing.config_models import (
|
|
||||||
AggregationConfiguration,
|
|
||||||
ColumnsSelectConfiguration,
|
|
||||||
CoordinatesNormalizationConfiguration,
|
|
||||||
FillMissingConfiguration,
|
|
||||||
SpellCheckConfiguration,
|
|
||||||
DatasetFilterConfiguration
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_dms_to_decimal(value):
    """Convert one coordinate cell to decimal degrees, or None when it cannot be read.

    The text is first handed to PyGeodesy's ``parseDMS``; if that raises
    ``ValueError``/``TypeError`` a plain ``float()`` conversion is tried
    as a fallback.

    Supported formats include (but are not limited to):
      - 40°26'46"N  /  40°26′46″N
      - 40 26 46 N
      - 40:26:46N
      - 40d26m46sN
      - -40.446  (already decimal – returned as-is)

    Missing values (per ``pd.isna``) and blank strings yield None.
    """
    from pygeodesy.dms import parseDMS

    if pd.isna(value):
        return None

    text = str(value).strip()
    if text == "":
        return None

    # Try the DMS parser first, then a bare float conversion.
    for convert in (parseDMS, float):
        try:
            return float(convert(text))
        except (ValueError, TypeError):
            continue
    return None
|
|
||||||
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def remove_duplicates(context, df: pd.DataFrame):
    """Remove duplicate rows from the input DataFrame."""
    log = context.log

    # Row counts before and after are only used for the log line.
    rows_in = len(df)
    deduplicated = df.drop_duplicates()
    removed = rows_in - len(deduplicated)

    log.info(f"Removed {removed} duplicate rows")
    return deduplicated
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def fill_missing_values(context, config: FillMissingConfiguration, df: pd.DataFrame):
    """Fill missing values in the DataFrame according to the configured column-to-value mapping."""
    log = context.log
    replacements = config.fill_map

    log.info(f"Filling missing values: {replacements}")

    # fillna with a dict fills each listed column with its own value and
    # returns a new frame.
    filled = df.fillna(replacements)
    return filled
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def standardize_categorical_values(context, config: ColumnsSelectConfiguration, df: pd.DataFrame):
    """Standardize categorical values in selected columns by trimming whitespace and converting text to lowercase."""
    log = context.log

    # NOTE(review): columns are overwritten on the frame that was passed in,
    # and missing values become empty strings.
    for name in config.columns:
        if name in df.columns:
            before = df[name]
            as_text = before.fillna("").astype(str)
            cleaned = as_text.str.strip().str.lower()

            # Counted against the untouched column, so former NaNs count as modified.
            modified = (before != cleaned).sum()
            df[name] = cleaned

            log.info(f"Standardized '{name}' column – {modified} values modified")
        else:
            log.warning(f"Column '{name}' not found in DataFrame, skipping.")

    return df
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def correct_typos(context, config: SpellCheckConfiguration, df: pd.DataFrame):
    """Correct spelling mistakes in the specified text columns."""
    # Behaviour notes:
    #   * each cell is passed to SpellChecker.correction as a single token;
    #   * cells for which the checker has no suggestion (correction() returns
    #     None) keep their original text instead of being overwritten with None;
    #   * missing values are left untouched rather than turned into the text "nan";
    #   * the checker is created once, on first use, and shared by all columns.
    # NOTE(review): columns are overwritten on the frame that was passed in.
    logger = context.log
    spell = None

    def _correct(cell):
        if pd.isna(cell):
            return cell
        text = str(cell)
        if not text:
            return text
        suggestion = spell.correction(text)
        return text if suggestion is None else suggestion

    for column in config.columns:
        if column not in df.columns:
            logger.warning(f"Column '{column}' not found in DataFrame, skipping.")
            continue

        if spell is None:
            # Loading the dictionary is loop-invariant; do it a single time.
            spell = SpellChecker(language=config.language)

        original = df[column]
        corrected = original.apply(_correct)

        # Compare textual forms so untouched missing values do not count as changes.
        changed_count = (original.astype(str) != corrected.astype(str)).sum()
        logger.info(f"Corrected typos in '{column}' – {changed_count} values modified")

        df[column] = corrected

    return df
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def normalize_datetime(context, config: ColumnsSelectConfiguration, df: pd.DataFrame):
    # For every configured column, parse its values as datetimes (UTC,
    # day-first, mixed formats, unparsable -> NaT) and store the result as text
    # in a new "<column>_iso" column formatted "%Y-%m-%dT%H:%M:%SZ".
    # Columns are skipped with a warning when they are absent, when nothing
    # parses, or when every parsed value falls on 1970-01-01.
    # NOTE(review): the new columns are added to the frame that was passed in.
    log = context.log

    for name in config.columns:
        if name not in df.columns:
            log.warning(f"Column '{name}' not found, skipping normalization.")
            continue

        parsed = pd.to_datetime(
            df[name],
            utc=True,
            format="mixed",
            dayfirst=True,
            errors="coerce",
        )

        if not parsed.notna().any():
            log.warning(
                f"Column '{name}' has no normalizable datetime values, skipping."
            )
            continue

        target = f"{name}_iso"

        # NaT rows become empty strings in the output column.
        rendered = parsed.dt.strftime("%Y-%m-%dT%H:%M:%SZ").fillna("")
        populated = rendered[rendered != ""]
        all_epoch = len(populated) > 0 and populated.str.startswith("1970-01-01").all()
        if all_epoch:
            log.warning(
                f"Column '{name}' all normalized values are '1970-01-01', likely bad input — skipping."
            )
            continue

        df[target] = rendered

        log.info(f"Normalized datetime column '{name}' into '{target}'")

    return df
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def normalize_numeric_min_max(context, config: ColumnsSelectConfiguration, df: pd.DataFrame):
    # Min-max scale each configured column into a new "<column>_norm" column:
    # (value - min) / (max - min).
    # Columns are skipped with a warning when they are absent, hold no numeric
    # values, or are constant. Values that cannot be read as numbers are treated
    # as missing (and reported) instead of aborting the op with a TypeError.
    # NOTE(review): the new columns are added to the frame that was passed in.
    logger = context.log

    for col in config.columns:
        if col not in df.columns:
            logger.warning(f"Column '{col}' not found, skipping normalization.")
            continue

        values = pd.to_numeric(df[col], errors="coerce")

        if values.notna().sum() == 0:
            # Without this guard NaN min/max never compare equal and an all-NaN
            # "_norm" column would be produced.
            logger.warning(f"Column '{col}' has no numeric values, skipping normalization.")
            continue

        not_convertible = int(values.isna().sum() - df[col].isna().sum())
        if not_convertible > 0:
            logger.warning(
                f"Column '{col}' has {not_convertible} non-numeric values, treated as missing."
            )

        min_val = values.min()
        max_val = values.max()

        if min_val == max_val:
            logger.warning(f"Column '{col}' has constant values, skipping normalization.")
            continue

        df[col + "_norm"] = (values - min_val) / (max_val - min_val)
        logger.info(f"Normalized numeric column '{col}'")

    return df
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def normalize_coordinates(context, config: CoordinatesNormalizationConfiguration, df: pd.DataFrame):
    # Bring the configured latitude / longitude columns to decimal degrees,
    # round them to 4 decimals and keep only rows with latitude in [-90, 90]
    # and longitude in [-180, 180]. Values that cannot be parsed become NaN and
    # are dropped by the range filter.
    # NOTE(review): both columns are overwritten on the frame that was passed in
    # before the filtered frame is created.
    logger = context.log

    lat = config.latColumn
    lon = config.lonColumn

    for col in [lat, lon]:
        if pd.api.types.is_numeric_dtype(df[col]):
            logger.info(f"Column '{col}' is numeric — coercing directly")
            df[col] = pd.to_numeric(df[col], errors="coerce")
        else:
            logger.info(f"Column '{col}' is non-numeric — parsing as DMS with PyGeodesy")
            parsed = df[col].apply(_parse_dms_to_decimal)
            # The parser returns None on failure; when every row fails the
            # column would stay object dtype and break the numeric range
            # check below, so force a float column.
            df[col] = pd.to_numeric(parsed, errors="coerce")

    invalid_lat = df[lat].isnull().sum()
    invalid_lon = df[lon].isnull().sum()
    logger.info(f"Found {invalid_lat} invalid latitudes and {invalid_lon} invalid longitudes")

    df[lat] = df[lat].round(4)
    df[lon] = df[lon].round(4)

    before_filter_rows = len(df)
    df = df[(df[lat].between(-90, 90)) & (df[lon].between(-180, 180))]
    after_filter_rows = len(df)
    logger.info(f"Filtered coordinates out of range: removed {before_filter_rows - after_filter_rows} rows")

    logger.info(f"Coordinate normalization completed: resulting dataframe has {after_filter_rows} rows")

    return df
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def add_global_aggregations(context, config: AggregationConfiguration, df: pd.DataFrame):
    # Group the frame by the configured columns (those that exist) and apply
    # config.operation to the numeric columns; the grouped result is returned
    # with the group keys restored as regular columns.
    logger = context.log

    group_by_cols = []

    for col in config.columns:
        if col not in df.columns:
            logger.warning(f"Column '{col}' not found, skipping aggregation.")
            continue
        group_by_cols.append(col)

    if not group_by_cols:
        # pandas would fail here with "No group keys passed!"; raise the same
        # exception type with a message that names the actual problem.
        raise ValueError(
            f"None of the configured group-by columns {config.columns} exist in the DataFrame."
        )

    if config.operation not in {"sum", "mean", "min", "max", "count"}:
        logger.warning(f"Unsupported aggregation '{config.operation}'")

    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    wanted = set(numeric_cols) | set(group_by_cols)
    # Select in the frame's own column order: iterating a set of strings gives
    # a different order from run to run, which made the output column order
    # nondeterministic.
    df = df[[c for c in df.columns if c in wanted]]
    df = df.groupby(group_by_cols).agg(config.operation).reset_index()
    return df
|
|
||||||
|
|
||||||
@op(out={"data": Out()})
def filter_dataset(context, config: DatasetFilterConfiguration, df: pd.DataFrame):
    # Keep the rows that satisfy every configured condition (logical AND).
    # A condition is skipped with a warning when its column is absent or
    # entirely NaN; a condition that raises while being applied is logged as
    # an error and ignored, so the remaining conditions still run.
    logger = context.log
    total_rows_before = len(df)

    logger.info(f"Starting dataset filtering: initial dataframe has {total_rows_before} rows")

    # Explicit bool dtype: a mask built from an empty list is not boolean, so
    # for an empty input frame `df[mask]` would not be a boolean row selection.
    combined_mask = pd.Series(True, index=df.index, dtype=bool)

    for condition in config.conditions:
        if condition.column not in df.columns:
            logger.warning(f"Column '{condition.column}' not found, skipping filtering.")
            continue
        if df[condition.column].isna().all():
            logger.warning(f"Column '{condition.column}' is empty (all NaN), skipping filtering.")
            continue
        try:
            current_mask = condition.apply(df)
            combined_mask &= current_mask

            logger.info(f"Applied filter: {condition.column} {condition.op.value} '{condition.value}'")
        except Exception as e:
            # Deliberate best-effort: report the failing condition and carry on.
            logger.error(f"Error applying filter on column '{condition.column}': {e}")

    filtered_df = df[combined_mask]
    total_rows_after = len(filtered_df)

    logger.info(
        f"Filtering completed: {total_rows_after} rows remain "
        f"(removed {total_rows_before - total_rows_after} rows in total)"
    )

    return filtered_df
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
"""Configuration models for dataframe-level anonymization."""
|
|
||||||
|
|
||||||
from .k_anonymity_configuration import KAnonymityConfiguration
|
|
||||||
from .l_diversity_configuration import LDiversityConfiguration
|
|
||||||
from .t_closeness_configuration import TClosenessConfiguration
|
|
||||||
from .base_config import BaseConfiguration
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"BaseConfiguration",
|
|
||||||
"KAnonymityConfiguration",
|
|
||||||
"LDiversityConfiguration",
|
|
||||||
"TClosenessConfiguration",
|
|
||||||
]
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
from typing import Dict, List
|
|
||||||
from dagster import Config
|
|
||||||
from pydantic import Field, field_validator, model_validator
|
|
||||||
|
|
||||||
|
|
||||||
class BaseConfiguration(Config):
    # Settings shared by the anonymisation configs: identifier columns,
    # quasi-identifier columns, the allowed suppression level and the name of
    # the generalisation hierarchy to use per quasi-identifier.
    ident: List[str] = Field(default=["Name"], description="List of identifier column names.")
    quasi_identifiers: List[str] = Field(default=["Age"], description="List of quasi-identifier column names.")
    supp_level: float = Field(default=50.0, ge=0.0, le=100.0, description="Max suppression allowed (0–100).")
    generalisation_hierarchies: Dict[str, str] = Field(
        default={"Age": "simpl_age"}, description="Hierarchies used to generalize quasi-identifiers."
    )

    # Field validators are declared as explicit classmethods, matching the
    # other config models and the form shown in the pydantic documentation.
    @field_validator("quasi_identifiers")
    @classmethod
    def validate_quasi_identifiers(cls, value):
        # An empty list is rejected.
        if not value:
            raise ValueError("At least one quasi-identifier must be provided.")
        return value

    @field_validator("ident")
    @classmethod
    def validate_ident(cls, value):
        # An empty list is rejected.
        if not value:
            raise ValueError("At least one identifier must be provided.")
        return value

    @model_validator(mode="after")
    def check_no_overlap(self):
        # A column may be an identifier or a quasi-identifier, never both.
        ident = set(self.ident)
        quasi = set(self.quasi_identifiers)
        overlap = ident & quasi
        if overlap:
            raise ValueError(f"Fields cannot be both identifiers and quasi-identifiers: {overlap}")
        return self
|
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
from anjana.anonymity.utils import utils
|
|
||||||
|
|
||||||
# Generalisation hierarchies, keyed by level number (0 = raw values).
# Every module-level dict defined here is picked up by get_all_hierarchies().
# NOTE(review): generalize levels presumably bucket ages 0-99 into intervals of
# the given width — confirm against anjana's `generate_intervals`.
simpl_age = {
    0: list(range(0, 100)),
    1: utils.generate_intervals(list(range(0, 100)), 0, 100, 5),
    2: utils.generate_intervals(list(range(0, 100)), 0, 100, 10),
    3: utils.generate_intervals(list(range(0, 100)), 0, 100, 20),
    4: utils.generate_intervals(list(range(0, 100)), 0, 100, 100),
}

# Shorter age hierarchy: raw values plus the width-5 level only.
simpl_age2 = {
    0: list(range(0, 100)),
    1: utils.generate_intervals(list(range(0, 100)), 0, 100, 5),
}

# Gender hierarchy: level 1 replaces every value with "*".
simpl_gender = {
    0: ["M", "F", "O"],
    1: ["*", "*", "*"],
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_hierarchies():
    """Return the hierarchy tables defined in this module as ``{name: dict}``.

    Every public module-level ``dict`` counts as a hierarchy. Names starting
    with an underscore are excluded: in an imported module ``globals()`` also
    holds ``__builtins__`` as a dict, which would otherwise be reported (and
    accepted by name) as a generalisation hierarchy.
    """
    return {
        name: obj
        for name, obj in globals().items()
        if isinstance(obj, dict) and not name.startswith("_")
    }
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
from typing import List
|
|
||||||
from pydantic import Field
|
|
||||||
|
|
||||||
from .base_config import BaseConfiguration
|
|
||||||
|
|
||||||
|
|
||||||
class KAnonymityConfiguration(BaseConfiguration):
    # Base anonymisation settings plus the k level and the sensitive columns.
    k: int = Field(
        description="Desired level of k-anonymity (must be >= 2).",
        default=3,
        ge=2,
    )
    sensitive_attributes: List[str] = Field(
        description="List of sensitive attribute column names.",
        default=["Disease"],
    )
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
from pydantic import Field
|
|
||||||
from .base_config import BaseConfiguration
|
|
||||||
|
|
||||||
|
|
||||||
class LDiversityConfiguration(BaseConfiguration):
    # Base anonymisation settings plus the k and l levels and the single
    # sensitive column.
    k: int = Field(
        description="Desired level of k-anonymity (must be >= 2).",
        default=2,
        ge=2,
    )
    l: int = Field(
        description="L-diversity level (must be >= 1)",
        default=3,
        ge=1,
    )
    sensitive_attribute: str = Field(
        description="Sensitive attribute name.",
        default="Disease",
    )
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
from pydantic import Field
|
|
||||||
from .base_config import BaseConfiguration
|
|
||||||
|
|
||||||
|
|
||||||
class TClosenessConfiguration(BaseConfiguration):
    # Base anonymisation settings plus the k level, the t threshold (0..1)
    # and the single sensitive column.
    k: int = Field(
        description="Desired level of k-anonymity (must be >= 2).",
        default=2,
        ge=2,
    )
    t: float = Field(
        description="Maximum t-distance threshold.",
        default=0.5,
        ge=0.0,
        le=1.0,
    )
    sensitive_attribute: str = Field(
        description="Sensitive attribute name.",
        default="Disease",
    )
|
|
||||||
@@ -1,86 +0,0 @@
|
|||||||
from dagster import job
|
|
||||||
from util_services.util_ops import (
|
|
||||||
preview_dataframe,
|
|
||||||
read_structured_to_df,
|
|
||||||
write_df_to_local,
|
|
||||||
read_structured_from_s3,
|
|
||||||
write_df_to_s3,
|
|
||||||
write_semistructured_to_s3,
|
|
||||||
read_semistructured_from_s3
|
|
||||||
)
|
|
||||||
|
|
||||||
from .ops import apply_k_anonymity, apply_l_diversity, apply_t_closeness
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "ANONYMISATION",
    }
)
def k_anonymity_job():
    # Wires read_structured_to_df -> apply_k_anonymity -> write_df_to_local and
    # invokes preview_dataframe on both the input and the anonymised frame.
    # The op's second output (metrics) is not consumed by the job.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_to_df()
    anonymised_df, _metrics = apply_k_anonymity(source_df)
    preview_dataframe(source_df)
    write_df_to_local(anonymised_df)
    preview_dataframe(anonymised_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "ANONYMISATION",
    }
)
def l_diversity_job():
    # Wires read_structured_to_df -> apply_l_diversity -> write_df_to_local and
    # invokes preview_dataframe on both the input and the anonymised frame.
    # The op's second output is not consumed by the job.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_to_df()
    anonymised_df, _metrics = apply_l_diversity(source_df)
    preview_dataframe(source_df)
    write_df_to_local(anonymised_df)
    preview_dataframe(anonymised_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "ANONYMISATION",
    }
)
def t_closeness_job():
    # Wires read_structured_to_df -> apply_t_closeness -> write_df_to_local and
    # invokes preview_dataframe on both the input and the anonymised frame.
    # The op's second output is not consumed by the job.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_to_df()
    anonymised_df, _metrics = apply_t_closeness(source_df)
    preview_dataframe(source_df)
    write_df_to_local(anonymised_df)
    preview_dataframe(anonymised_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "ANONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def k_anonymity_job_s3():
    # Wires read_structured_from_s3 -> apply_k_anonymity -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the anonymised frame.
    # The op's second output (metrics) is not consumed by the job.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    anonymised_df, _metrics = apply_k_anonymity(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(anonymised_df)
    preview_dataframe(anonymised_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "ANONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def l_diversity_job_s3():
    # Wires read_structured_from_s3 -> apply_l_diversity -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the anonymised frame.
    # The op's second output is not consumed by the job.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    anonymised_df, _metrics = apply_l_diversity(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(anonymised_df)
    preview_dataframe(anonymised_df)
|
|
||||||
|
|
||||||
|
|
||||||
@job(
    tags={
        "business_operation": "ANONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def t_closeness_job_s3():
    # Wires read_structured_from_s3 -> apply_t_closeness -> write_df_to_s3 and
    # invokes preview_dataframe on both the input and the anonymised frame.
    # The op's second output is not consumed by the job.
    # Invocation order is kept as-is: it determines the aliases of repeated ops.
    source_df = read_structured_from_s3()
    anonymised_df, _metrics = apply_t_closeness(source_df)
    preview_dataframe(source_df)
    write_df_to_s3(anonymised_df)
    preview_dataframe(anonymised_df)
|
|
||||||
|
|
||||||
@job()
def read_write_semistructured_job_s3():
    # Pass-through job: the output of read_semistructured_from_s3 is handed
    # directly to write_semistructured_to_s3, with no processing in between.
    payload = read_semistructured_from_s3()
    write_semistructured_to_s3(payload)
|
|
||||||
@@ -1,187 +0,0 @@
|
|||||||
import json
|
|
||||||
from textwrap import dedent
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
from anjana.anonymity import k_anonymity, l_diversity, t_closeness
|
|
||||||
from dagster import (
|
|
||||||
DagsterInvalidInvocationError,
|
|
||||||
MarkdownMetadataValue,
|
|
||||||
Out,
|
|
||||||
Output,
|
|
||||||
get_dagster_logger,
|
|
||||||
op,
|
|
||||||
)
|
|
||||||
from pycanon import anonymity
|
|
||||||
|
|
||||||
from template_code_location.dataframe_level_anonymisation.config_models import (
|
|
||||||
KAnonymityConfiguration,
|
|
||||||
LDiversityConfiguration,
|
|
||||||
TClosenessConfiguration,
|
|
||||||
)
|
|
||||||
from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import get_all_hierarchies
|
|
||||||
|
|
||||||
|
|
||||||
def _calc_dataframe_metrics(df_anon, df_org, quasi_identifiers, sensitive_atttributes):
    """Compute privacy and utility metrics for an anonymised frame.

    Privacy metrics (k-anonymity, l-diversity, t-closeness) come from
    ``pycanon.anonymity`` and are evaluated on ``df_anon``. Utility metrics
    are the suppression rate (share of rows of ``df_org`` no longer present in
    ``df_anon``) and the mean size of the groups formed by the
    quasi-identifiers.

    Returns:
        A ``(report, metrics)`` tuple: a Markdown table as text, and a dict
        with the keys ``k_anon``, ``l_div``, ``t_clos``, ``supp_rate`` and
        ``mean_equivalence_class`` holding the unrounded values.

    Raises:
        ZeroDivisionError: if ``df_org`` has no rows.
    """
    # --- Anonymisation metrics ---
    k_value = anonymity.k_anonymity(df_anon, quasi_identifiers)
    l_value = anonymity.l_diversity(df_anon, quasi_identifiers, sensitive_atttributes, True)
    t_value = anonymity.t_closeness(df_anon, quasi_identifiers, sensitive_atttributes, True)

    # --- Data utilisation metrics ---
    anon_rows = len(df_anon)
    suppression_rate = 1 - anon_rows / len(df_org)
    class_count = len(df_anon.groupby(quasi_identifiers))
    if class_count:
        mean_class_size = anon_rows / class_count
    else:
        mean_class_size = 0

    # Values are rounded for display only; the metrics dict keeps full precision.
    t_shown = round(t_value, 2)
    suppression_shown = round(suppression_rate, 2)
    class_size_shown = round(mean_class_size, 2)

    # NOTE(review): per the flake8 docs a standalone "flake8: noqa" line
    # disables linting for the whole file, not just the table below — confirm
    # this is intended.
    # flake8: noqa
    anon_report = dedent(
        f"""
        ### Anonymization & Data Utilization Metrics

        | Metric | Value | Description |
        |--------|-------|-------------|
        | **k-anonymity** | `k = {k_value}` | Minimum number of records sharing the same quasi-identifier values. |
        | **l-diversity** | `l = {l_value}` | Diversity of sensitive attributes within each equivalence class. |
        | **t-closeness** | `t = {t_shown}` | Distance between sensitive attribute distribution in a group and the overall dataset. |
        | **Suppression rate** | `{suppression_shown}` | Fraction of records or attributes suppressed to meet privacy requirements. |
        | **Mean equivalence class size** | `{class_size_shown}` | Average size of equivalence classes for quasi-identifiers, indicates data grouping. |
        """
    )
    # flake8: enable

    metrics = {
        "k_anon": k_value,
        "l_div": l_value,
        "t_clos": t_value,
        "supp_rate": suppression_rate,
        "mean_equivalence_class": mean_class_size,
    }
    return anon_report, metrics
|
|
||||||
|
|
||||||
|
|
||||||
def _validate_and_get_hierarchies(config, df: pd.DataFrame):
    """Validate the anonymisation config against ``df`` and resolve hierarchies.

    Args:
        config: Configuration exposing ``k``, ``quasi_identifiers`` and
            ``generalisation_hierarchies``.
        df: The dataframe that is about to be anonymised.

    Returns:
        A dict mapping every quasi-identifier to the hierarchy object looked
        up by name in ``get_all_hierarchies()``.

    Raises:
        DagsterInvalidInvocationError: If ``df`` has fewer than ``config.k``
            rows, if a quasi-identifier has no (or an empty) hierarchy entry,
            or if the referenced hierarchy name is not known.
    """
    available = get_all_hierarchies()

    # Dataset smaller than k
    row_count = len(df)
    if row_count < config.k:
        raise DagsterInvalidInvocationError(
            f"Cannot apply k-anonymity: dataset has {row_count} records, but k={config.k}"
        )

    # Missing or incomplete generalisation hierarchies
    resolved = {}
    for qi in config.quasi_identifiers:
        configured = config.generalisation_hierarchies
        if qi not in configured or not configured[qi]:
            raise DagsterInvalidInvocationError(
                f"Generalisation hierarchy for quasi-identifier '{qi}' is missing or incomplete"
            )
        hierarchy_name = configured[qi]
        if hierarchy_name not in available:
            raise DagsterInvalidInvocationError(
                f"Generalisation hierarchy '{hierarchy_name}' is missing in the code basis"
            )
        resolved[qi] = available[hierarchy_name]

    return resolved
|
|
||||||
|
|
||||||
|
|
||||||
@op(out={"data": Out(), "metrics": Out()})
def apply_k_anonymity(context, config: KAnonymityConfiguration, df: pd.DataFrame):
    """Apply k-anonymity to ``df`` and emit the result together with metrics.

    Yields:
        ``data``: the anonymised dataframe, with the metric report (Markdown)
        and the metrics (JSON string) attached as metadata.
        ``metrics``: the metrics dict on its own.

    Raises:
        DagsterInvalidInvocationError: If the configuration is invalid for
            ``df`` or the anonymisation returns an empty dataframe.
    """
    hier = _validate_and_get_hierarchies(config, df)

    data_anon = k_anonymity(
        df, config.ident, config.quasi_identifiers, config.k, config.supp_level, hier
    )
    # Fail with an explicit message, as the l-diversity and t-closeness ops
    # do, instead of letting the metric calculation trip over an empty frame.
    if data_anon.empty:
        raise DagsterInvalidInvocationError(
            "Could not transform the data to k-anonymity, empty dataset returned!"
        )
    # Drop an "index" column that the input did not have.
    if "index" in data_anon.columns and "index" not in df.columns:
        data_anon = data_anon.drop(columns="index")
    anon_report, metrics = _calc_dataframe_metrics(
        data_anon, df, config.quasi_identifiers, config.sensitive_attributes
    )
    yield Output(
        value=data_anon,
        metadata={
            "metric_report": MarkdownMetadataValue(anon_report),
            "metric_json": json.dumps(metrics),
        },
        output_name="data",
    )
    yield Output(value=metrics, output_name="metrics")
|
|
||||||
|
|
||||||
|
|
||||||
# Op: apply l-diversity to the input dataframe. Emits the anonymised frame on
# "data" (with the metric report and JSON metrics as metadata) and the metrics
# dict on "metrics"; raises DagsterInvalidInvocationError on an empty result.
@op(out={"data": Out(), "metrics": Out()})
def apply_l_diversity(context, config: LDiversityConfiguration, df: pd.DataFrame):
    hierarchies_by_qi = _validate_and_get_hierarchies(config, df)

    anonymised = l_diversity(
        df,
        config.ident,
        config.quasi_identifiers,
        config.sensitive_attribute,
        config.k,
        config.l,
        config.supp_level,
        hierarchies_by_qi,
    )
    if anonymised.empty:
        raise DagsterInvalidInvocationError(
            "Could not tranform the data to l-diversity, empty dataset returned!"
        )

    # The metrics helper takes a list of sensitive attributes; this op has one.
    report, metric_values = _calc_dataframe_metrics(
        anonymised, df, config.quasi_identifiers, [config.sensitive_attribute]
    )
    data_metadata = {
        "metric_report": MarkdownMetadataValue(report),
        "metric_json": json.dumps(metric_values),
    }
    yield Output(value=anonymised, metadata=data_metadata, output_name="data")
    yield Output(value=metric_values, output_name="metrics")
|
|
||||||
|
|
||||||
|
|
||||||
@op(out={"data": Out(), "metrics": Out()})
def apply_t_closeness(context, config: TClosenessConfiguration, df: pd.DataFrame):
    """Apply t-closeness to ``df`` and emit the result together with metrics.

    Yields:
        ``data``: the anonymised dataframe, with the metric report (Markdown)
        and the metrics (JSON string) attached as metadata.
        ``metrics``: the metrics dict on its own.

    Raises:
        DagsterInvalidInvocationError: If the configuration is invalid for
            ``df``, if ``t_closeness`` raises ``ValueError``, or if it
            returns an empty dataframe.
    """
    hier = _validate_and_get_hierarchies(config, df)

    try:
        data_anon = t_closeness(
            df,
            config.ident,
            config.quasi_identifiers,
            config.sensitive_attribute,
            config.k,
            config.t,
            config.supp_level,
            hier,
        )
    except ValueError as e:
        # Chain the original error so the underlying traceback is preserved.
        if "Cannot be quasi-identifiers" in str(e):
            raise DagsterInvalidInvocationError(
                f"T-closeness failed: k-anonymity parameter = {config.k} is too small "
                f"for existing hierarchies of {config.quasi_identifiers} in inner k-anonymity call."
            ) from e
        # Re-raise other ValueError types with context
        raise DagsterInvalidInvocationError(f"T-closeness failed with error: {str(e)}") from e

    if data_anon.empty:
        raise DagsterInvalidInvocationError(
            "Could not transform the data to t-closeness, empty dataset returned! "
            f"This may indicate that the t-closeness constraint (t={config.t}) is too strict for the given data."
        )

    anon_report, metrics = _calc_dataframe_metrics(
        data_anon, df, config.quasi_identifiers, [config.sensitive_attribute]
    )
    yield Output(
        value=data_anon,
        metadata={
            "metric_report": MarkdownMetadataValue(anon_report),
            "metric_json": json.dumps(metrics),
        },
        output_name="data",
    )
    yield Output(value=metrics, output_name="metrics")
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def parse_value_list(values):
    """Return ``values`` with decimal-digit strings converted to ``int``.

    Strings made up only of decimal digits (e.g. ``"42"``) become integers;
    every other element is returned unchanged.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because ``isdigit``
    also accepts characters such as ``"²"`` that ``int()`` cannot parse,
    which would raise ``ValueError``.
    """
    return [int(v) if isinstance(v, str) and v.isdecimal() else v for v in values]
|
|
||||||
|
|
||||||
|
|
||||||
# Hierarchy normalization for Anjana
|
|
||||||
def normalize_hierarchy_levels(hierarchy_dict):
    """Return a copy of ``hierarchy_dict`` with integer level keys.

    For every column the level keys are converted with ``int``. The mapping
    stored under level ``0`` is passed through ``parse_value_list`` and
    wrapped in a NumPy array; all other levels keep their mapping object
    unchanged.
    """

    def _convert(level, mapping_list):
        if level != 0:
            return mapping_list
        return np.array(parse_value_list(mapping_list))

    result = {}
    for column, levels in hierarchy_dict.items():
        per_level = {}
        for level_key, mapping_list in levels.items():
            level_number = int(level_key)
            per_level[level_number] = _convert(level_number, mapping_list)
        result[column] = per_level
    return result
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
from .structured_config import ( # noqa: F401
|
|
||||||
HashConfig,
|
|
||||||
EncryptConfig,
|
|
||||||
RedactConfig,
|
|
||||||
ReplaceConfig,
|
|
||||||
PseudoTechniqueConfig,
|
|
||||||
AnonymisePseudonymizeStructuredConfig,
|
|
||||||
DecryptConfig,
|
|
||||||
DepseudoTechniqueConfig,
|
|
||||||
DepseudonymizeStructuredConfig,
|
|
||||||
)
|
|
||||||
|
|
||||||
from .unstructured_config import ( # noqa: F401, F811
|
|
||||||
HashConfig,
|
|
||||||
EncryptConfig,
|
|
||||||
RedactConfig,
|
|
||||||
ReplaceConfig,
|
|
||||||
RetainConfig,
|
|
||||||
PseudoTechniqueConfig,
|
|
||||||
AnonymisePseudonymizeUnstructuredConfig,
|
|
||||||
DecryptConfig,
|
|
||||||
DepseudoTechniqueConfig,
|
|
||||||
DepseudonymizeUnstructuredConfig,
|
|
||||||
)
|
|
||||||
|
|
||||||
from .languages import SupportedLanguages, LanguageEnum # noqa: F401
|
|
||||||
|
|
||||||
from .pii_entities import PIIEntityEnum, PII_MAPPING # noqa: F401
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
from enum import Enum
|
|
||||||
from typing import ClassVar
|
|
||||||
|
|
||||||
|
|
||||||
class SupportedLanguages:
    """Lookup tables keyed by two-letter language code.

    ``LANGUAGES`` maps a code to a locale string and ``LANGUAGE_MODELS`` maps
    the same codes to a language model name.
    """

    LANGUAGES: ClassVar[dict[str, str]] = {
        "hr": "hr_HR",  # Croatian
        "da": "da_DK",  # Danish
        "nl": "nl_NL",  # Dutch
        "en": "en_US",  # English
        "fi": "fi_FI",  # Finnish
        "fr": "fr_FR",  # French
        "de": "de_DE",  # German
        "el": "el_GR",  # Greek
        "it": "it_IT",  # Italian
        "lt": "lt_LT",  # Lithuanian
        "pl": "pl_PL",  # Polish
        "pt": "pt_PT",  # Portuguese
        "ro": "ro_RO",  # Romanian
        "sl": "sl_SI",  # Slovenian
        "es": "es_ES",  # Spanish
        "sv": "sv_SE",  # Swedish
    }
    LANGUAGE_MODELS: ClassVar[dict[str, str]] = {
        "en": "en_core_web_sm",
        "it": "it_core_news_sm",
        "de": "de_core_news_sm",
        "fr": "fr_core_news_sm",
        "es": "es_core_news_sm",
        "nl": "nl_core_news_sm",
        "da": "da_core_news_sm",
        "sv": "sv_core_news_sm",
        "fi": "fi_core_news_sm",
        "pl": "pl_core_news_sm",
        "el": "el_core_news_sm",
        "hr": "hr_core_news_sm",
        "lt": "lt_core_news_sm",
        "pt": "pt_core_news_sm",
        "ro": "ro_core_news_sm",
        "sl": "sl_core_news_sm",
    }

    @classmethod
    def codes(cls) -> list[str]:
        """Return the language codes in ``LANGUAGES`` declaration order."""
        return list(cls.LANGUAGES)

    @classmethod
    def get_locale(cls, code: str) -> str:
        """Return the locale for ``code``; raises ``KeyError`` if unknown."""
        locale_name = cls.LANGUAGES[code]
        return locale_name

    @classmethod
    def get_language_model(cls, code: str) -> str:
        """Return the model name for ``code``; raises ``KeyError`` if unknown."""
        model_name = cls.LANGUAGE_MODELS[code]
        return model_name
|
|
||||||
|
|
||||||
|
|
||||||
class LanguageEnum(str, Enum):
    """String-valued enum of two-letter language codes.

    Each member's name equals its value.
    """

    hr = "hr"
    da = "da"
    nl = "nl"
    en = "en"
    fi = "fi"
    fr = "fr"
    de = "de"
    el = "el"
    it = "it"
    lt = "lt"
    pl = "pl"
    pt = "pt"
    ro = "ro"
    sl = "sl"
    es = "es"
    sv = "sv"
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
from enum import Enum
|
|
||||||
|
|
||||||
|
|
||||||
class PIIEntityEnum(str, Enum):
    """String-valued enum of PII entity kinds; values are display labels."""

    PERSON = "Person"
    EMAIL = "Email"
    CREDIT_CARD = "Credit card"
    DATE_OF_BIRTH = "Date of birth"
    URL = "URLs"
    PHONE_NUMBERS = "Phone numbers"
    CREDENTIALS = "Credentials"
    X_SOCIAL = "X (formally known as Twitter) username"
|
|
||||||
|
|
||||||
|
|
||||||
# Maps every PIIEntityEnum member to a "...Filth" class name (as a string).
PII_MAPPING: dict[PIIEntityEnum, str] = {
    PIIEntityEnum.PERSON: "NameFilth",
    PIIEntityEnum.EMAIL: "EmailFilth",
    PIIEntityEnum.CREDIT_CARD: "CreditCardFilth",
    PIIEntityEnum.DATE_OF_BIRTH: "DateOfBirthFilth",
    PIIEntityEnum.URL: "UrlFilth",
    PIIEntityEnum.PHONE_NUMBERS: "PhoneFilth",
    PIIEntityEnum.CREDENTIALS: "CredentialFilth",
    PIIEntityEnum.X_SOCIAL: "TwitterFilth",
}
|
|
||||||
@@ -1,110 +0,0 @@
|
|||||||
from typing import List, Literal, Optional, Union
|
|
||||||
|
|
||||||
from dagster import Config
|
|
||||||
from pydantic import Field as PydanticField, model_validator, field_validator
|
|
||||||
|
|
||||||
|
|
||||||
# Settings for the "hash" technique: which columns to hash and with which
# algorithm name.
class HashConfig(Config):
    type: Literal["hash"] = "hash"
    columns: List[str] = PydanticField(
        default=["example_column"],
        description="Columns to hash",
    )
    algorithm: str = PydanticField(
        default="sha256",
        description="Hashing algorithm",
    )
|
|
||||||
|
|
||||||
# Settings for the "encrypt" technique: target columns and the name of the
# key to use.
class EncryptConfig(Config):
    type: Literal["encrypt"] = "encrypt"
    columns: List[str] = PydanticField(
        default=["example_column"],
        description="Columns to encrypt",
    )
    key_name: str = PydanticField(
        default="my_key",
        description="Key identifier used for encryption",
    )
|
|
||||||
|
|
||||||
# Settings for the "redact" technique: only the target columns.
class RedactConfig(Config):
    type: Literal["redact"] = "redact"
    columns: List[str] = PydanticField(
        default=["example_column"],
        description="Columns to redact",
    )
|
|
||||||
|
|
||||||
# Settings for the "replace" technique: target columns and the replacement
# value.
class ReplaceConfig(Config):
    type: Literal["replace"] = "replace"
    columns: List[str] = PydanticField(
        default=["example_column"],
        description="Columns to replace",
    )
    new_value: str = PydanticField(
        default="REPLACED",
        description="Replacement value",
    )
|
|
||||||
|
|
||||||
# Wrapper holding exactly one technique config, discriminated by its "type"
# field. The default is expressed as a {"hash": {...}} mapping.
class PseudoTechniqueConfig(Config):
    technique: Union[HashConfig, EncryptConfig, RedactConfig, ReplaceConfig] = PydanticField(
        default={"hash": HashConfig().model_dump(exclude={"type"})},
        discriminator="type",
    )
|
|
||||||
|
|
||||||
|
|
||||||
# Op configuration: the list of techniques to apply to dataframe columns.
# Validation rejects configurations that target one column more than once.
class AnonymisePseudonymizeStructuredConfig(Config):
    used_function: List[PseudoTechniqueConfig] = PydanticField(
        default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}],
        description=("List of functions to be used on column"),
    )

    @model_validator(mode="after")
    def ensure_unique_columns(self):
        # One "<column> -> <technique>, <technique>" entry for every column
        # that was collected more than once.
        clashes = [
            f"{col} -> {', '.join(techs)}"
            for col, techs in self._collect_column_to_techniques().items()
            if len(techs) > 1
        ]
        if clashes:
            formatted = "; ".join(clashes)
            raise ValueError(f"Duplicate column(s) across techniques not allowed:\n{formatted}")
        return self

    def _collect_column_to_techniques(self):
        """Map each column name to the technique types configured for it."""
        by_column = {}
        for entry in self.used_function:
            technique_type, cols = self._extract_technique_and_columns(entry)
            for col in cols:
                if col not in by_column:
                    by_column[col] = []
                by_column[col].append(technique_type)
        return by_column

    def _extract_technique_and_columns(self, item):
        """Return ``(technique_type, columns)`` for a dict or model item."""
        if not isinstance(item, dict):
            # item is a PseudoTechniqueConfig instance
            return item.technique.type, getattr(item.technique, "columns", [])

        tech = item.get("technique") or {}
        if not isinstance(tech, dict):
            return None, []
        if "type" in tech:
            return tech.get("type"), tech.get("columns") or []
        if len(tech) == 1:
            # variant-key mapping: {'hash': {...}}
            technique_type, inner = next(iter(tech.items()))
            return technique_type, inner.get("columns") or []
        return None, []
|
|
||||||
|
|
||||||
# Settings for the "decrypt" technique: target columns and the name of the
# key to use.
class DecryptConfig(Config):
    type: Literal["decrypt"] = "decrypt"
    columns: List[str] = PydanticField(
        default=["example_column"],
        description="Columns to decrypt",
    )
    key_name: str = PydanticField(
        default="my_key",
        description="Key identifier used for decryption",
    )
|
|
||||||
|
|
||||||
# Wrapper holding a single DecryptConfig; the default is the DecryptConfig
# defaults dumped to a dict with an explicit "type" entry.
class DepseudoTechniqueConfig(Config):
    technique: DecryptConfig = PydanticField(
        default={"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})},
    )
|
|
||||||
|
|
||||||
|
|
||||||
class DepseudonymizeStructuredConfig(Config):
    # Op configuration: the list of decryption techniques to apply to
    # dataframe columns.
    used_function: List[DepseudoTechniqueConfig] = PydanticField(
        default=[{"technique": {"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}}],
        description=("Decryption functions to be used on column"),
    )

    @field_validator("used_function", mode="before")
    @classmethod
    def _normalize_depseudo_used_function(cls, v):
        # Turn dict items into DepseudoTechniqueConfig instances before field
        # validation; all other items are passed through untouched.
        try:
            items = iter(v)
        except TypeError:
            # Not iterable (e.g. None): hand the value back so pydantic
            # reports a regular validation error instead of a raw TypeError.
            return v
        return [
            DepseudoTechniqueConfig.model_validate(item) if isinstance(item, dict) else item
            for item in items
        ]

    @model_validator(mode="after")
    def ensure_unique_columns(self):
        # For depseudonymize, we don't have per-column uniqueness constraints,
        # but keep a no-op validator to preserve API parity.
        return self
|
|
||||||
@@ -1,115 +0,0 @@
|
|||||||
from typing import List, Literal, Optional, Union
|
|
||||||
|
|
||||||
from dagster import Config
|
|
||||||
from pydantic import Field as PydanticField, model_validator, field_validator
|
|
||||||
from .languages import LanguageEnum
|
|
||||||
from .pii_entities import PIIEntityEnum
|
|
||||||
|
|
||||||
|
|
||||||
# Settings for the "hash" technique on free text: which PII entities to hash
# and with which algorithm name.
class HashConfig(Config):
    type: Literal["hash"] = "hash"
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to hash",
    )
    algorithm: str = PydanticField(
        default="sha256",
        description="Hashing algorithm",
    )
|
|
||||||
|
|
||||||
# Settings for the "encrypt" technique on free text: PII entities to encrypt
# and the name of the key to use.
class EncryptConfig(Config):
    type: Literal["encrypt"] = "encrypt"
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to encrypt",
    )
    key_name: str = PydanticField(
        default="my_key",
        description="Key identifier used for encryption",
    )
|
|
||||||
|
|
||||||
|
|
||||||
# Settings for the "redact" technique on free text: only the PII entities.
class RedactConfig(Config):
    type: Literal["redact"] = "redact"
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to redact",
    )
|
|
||||||
|
|
||||||
# Settings for the "replace" technique on free text: PII entities to replace
# and the replacement value.
class ReplaceConfig(Config):
    type: Literal["replace"] = "replace"
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to replace",
    )
    new_value: str = PydanticField(
        default="REPLACED",
        description="Replacement value",
    )
|
|
||||||
|
|
||||||
# Settings for the "retain" technique on free text: the PII entities to keep.
class RetainConfig(Config):
    type: Literal["retain"] = "retain"
    pii: List[PIIEntityEnum] = PydanticField(
        default=[PIIEntityEnum.EMAIL.name],
        description="PII entities to retain",
    )
|
|
||||||
|
|
||||||
# Wrapper holding exactly one free-text technique config, discriminated by
# its "type" field. The default is expressed as a {"hash": {...}} mapping.
class PseudoTechniqueConfig(Config):
    technique: Union[
        HashConfig, EncryptConfig, RedactConfig, ReplaceConfig, RetainConfig
    ] = PydanticField(
        default={"hash": HashConfig().model_dump(exclude={"type"})},
        discriminator="type",
    )
|
|
||||||
|
|
||||||
class AnonymisePseudonymizeUnstructuredConfig(Config):
    # Op configuration for free text: the text language and the list of
    # techniques to apply to PII entities. Validation rejects configurations
    # that target one PII entity more than once.
    language: LanguageEnum = PydanticField(
        default=LanguageEnum.en,
        description="Language code (must be one of: hr, da, nl, en, fi, fr, de, el, it, lt, pl, pt, ro, sl, es, sv)",
    )
    used_function: List[PseudoTechniqueConfig] = PydanticField(
        default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}],
        description=("List of functions to be used on PIIs"),
    )

    @field_validator("used_function", mode="before")
    @classmethod
    def _normalize_used_function(cls, v):
        # Turn dict items into PseudoTechniqueConfig instances before field
        # validation; all other items are passed through untouched.
        try:
            items = iter(v)
        except TypeError:
            # Not iterable (e.g. None): hand the value back so pydantic
            # reports a regular validation error instead of a raw TypeError.
            return v
        return [
            PseudoTechniqueConfig.model_validate(item) if isinstance(item, dict) else item
            for item in items
        ]

    @model_validator(mode="after")
    def ensure_unique_pii(self):
        # Collect every PII entity that is targeted more than once and report
        # them all in a single error.
        pii_to_techniques = self._collect_pii_to_techniques()
        duplicates = {
            pii: techs for pii, techs in pii_to_techniques.items() if len(techs) > 1
        }

        if duplicates:
            formatted = "; ".join(
                f"{pii} -> {', '.join(techs)}" for pii, techs in duplicates.items()
            )
            raise ValueError(f"Duplicate PII(s) across techniques not allowed:\n{formatted}")

        return self

    def _collect_pii_to_techniques(self):
        """Extract PII-to-techniques mapping from used_function list."""
        pii_to_techniques = {}
        for f in self.used_function:
            technique_type, piis = self._extract_technique_and_pii(f)
            for pii in piis:
                pii_to_techniques.setdefault(pii, []).append(technique_type)
        return pii_to_techniques

    def _extract_technique_and_pii(self, item):
        """Extract technique type and PII list from a PseudoTechniqueConfig item (dict or model instance)."""
        if isinstance(item, dict):
            tech = item.get("technique") or {}
            if isinstance(tech, dict):
                if "type" in tech:
                    return tech.get("type"), tech.get("pii") or tech.get("columns") or []
                elif len(tech) == 1:
                    # variant-key mapping: {'hash': {...}}
                    technique_type, inner = next(iter(tech.items()))
                    return technique_type, inner.get("pii") or inner.get("columns") or []
            return None, []
        else:
            # item is a PseudoTechniqueConfig instance
            technique_type = item.technique.type
            piis = getattr(item.technique, "pii", []) or getattr(item.technique, "columns", [])
            return technique_type, piis
|
|
||||||
|
|
||||||
# Settings for the "decrypt" technique on free text: only the key name.
class DecryptConfig(Config):
    type: Literal["decrypt"] = "decrypt"
    key_name: str = PydanticField(
        default="my_key",
        description="Key identifier used for decryption",
    )
|
|
||||||
|
|
||||||
# Wrapper holding a single DecryptConfig; the default is the DecryptConfig
# defaults dumped to a dict with an explicit "type" entry.
class DepseudoTechniqueConfig(Config):
    technique: DecryptConfig = PydanticField(
        default={
            "type": "decrypt",
            **DecryptConfig().model_dump(exclude={"type"}),
        },
    )
|
|
||||||
|
|
||||||
# Op configuration for free text: the list of decryption techniques to apply.
class DepseudonymizeUnstructuredConfig(Config):
    used_function: List[DepseudoTechniqueConfig] = PydanticField(
        default=[
            {"technique": {"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}}
        ],
        description=("Decryption function"),
    )
|
|
||||||
@@ -1,126 +0,0 @@
|
|||||||
from dagster import job
|
|
||||||
from util_services.util_ops import (
|
|
||||||
preview_dataframe,
|
|
||||||
read_structured_to_df,
|
|
||||||
write_df_to_local,
|
|
||||||
write_string_to_unstructured,
|
|
||||||
read_unstructured_to_string,
|
|
||||||
preview_unstructured,
|
|
||||||
read_structured_from_s3,
|
|
||||||
write_df_to_s3,
|
|
||||||
read_unstructured_from_s3,
|
|
||||||
write_unstructured_to_s3,
|
|
||||||
)
|
|
||||||
from .ops import (
|
|
||||||
anonymize_pseudonymize_structured,
|
|
||||||
depseudonymize_structured,
|
|
||||||
)
|
|
||||||
from .unstructured_ops import (
|
|
||||||
anonymize_pseudonymize_unstructured,
|
|
||||||
depseudonymize_unstructured,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Job: read a structured dataset into a dataframe, preview it, apply the
# anonymise/pseudonymise op, preview the result and write it with
# write_df_to_local. The op's "metrics" output is not consumed.
@job(tags={"business_operation": "ANONYMISATION_PSEUDONYMISATION"})
def anonymise_pseudonymise_structured_job():
    source_df = read_structured_to_df()
    preview_dataframe(source_df)
    protected_df, _metrics = anonymize_pseudonymize_structured(source_df)
    preview_dataframe(protected_df)
    write_df_to_local(protected_df)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: same flow as the structured anonymise/pseudonymise job, but reading
# with read_structured_from_s3 and writing with write_df_to_s3.
@job(
    tags={
        "business_operation": "ANONYMISATION_PSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def anonymise_pseudonymise_structured_job_s3():
    source_df = read_structured_from_s3()
    preview_dataframe(source_df)
    protected_df, _metrics = anonymize_pseudonymize_structured(source_df)
    preview_dataframe(protected_df)
    write_df_to_s3(protected_df)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: read a structured dataset into a dataframe, preview it, apply the
# depseudonymise op, preview the result and write it with write_df_to_local.
# The op's "metrics" output is not consumed.
@job(tags={"business_operation": "DEPSEUDONYMISATION"})
def depseudonymise_structured_job():
    source_df = read_structured_to_df()
    preview_dataframe(source_df)
    restored_df, _metrics = depseudonymize_structured(source_df)
    preview_dataframe(restored_df)
    write_df_to_local(restored_df)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: same flow as the structured depseudonymise job, but reading with
# read_structured_from_s3 and writing with write_df_to_s3.
@job(
    tags={
        "business_operation": "DEPSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def depseudonymise_structured_job_s3():
    source_df = read_structured_from_s3()
    preview_dataframe(source_df)
    restored_df, _metrics = depseudonymize_structured(source_df)
    preview_dataframe(restored_df)
    write_df_to_s3(restored_df)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: round trip on a structured dataset. The dataframe is previewed after
# reading, after the anonymise/pseudonymise op and after the depseudonymise
# op; nothing is written and neither "metrics" output is consumed.
@job(tags={"business_operation": "ANONYMISATION_PSEUDONYMISATION"})
def anonymise_pseudonymise_depseudonymise_structured_job():
    source_df = read_structured_to_df()
    preview_dataframe(source_df)
    pseudonymised_df, _pseudo_metrics = anonymize_pseudonymize_structured(source_df)
    preview_dataframe(pseudonymised_df)
    restored_df, _depseudo_metrics = depseudonymize_structured(pseudonymised_df)
    preview_dataframe(restored_df)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: read unstructured text, preview it, apply the unstructured
# anonymise/pseudonymise op, preview both of its outputs and write the text
# with write_string_to_unstructured.
@job(tags={"business_operation": "ANONYMISATION_PSEUDONYMISATION"})
def anonymise_pseudonymise_unstructured_job():
    raw_text = read_unstructured_to_string()
    preview_unstructured(raw_text)
    protected_text, op_metrics = anonymize_pseudonymize_unstructured(raw_text)
    preview_unstructured(protected_text)
    preview_unstructured(op_metrics)
    write_string_to_unstructured(protected_text)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: same flow as the unstructured anonymise/pseudonymise job, but reading
# with read_unstructured_from_s3 and writing with write_unstructured_to_s3.
@job(
    tags={
        "business_operation": "ANONYMISATION_PSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def anonymise_pseudonymise_unstructured_job_s3():
    raw_text = read_unstructured_from_s3()
    preview_unstructured(raw_text)
    protected_text, op_metrics = anonymize_pseudonymize_unstructured(raw_text)
    preview_unstructured(protected_text)
    preview_unstructured(op_metrics)
    write_unstructured_to_s3(protected_text)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: read unstructured text, preview it, apply the unstructured
# depseudonymise op, preview the resulting text and write it with
# write_string_to_unstructured. The op's "metrics" output is not consumed.
@job(tags={"business_operation": "DEPSEUDONYMISATION"})
def depseudonymise_unstructured_job():
    raw_text = read_unstructured_to_string()
    preview_unstructured(raw_text)
    restored_text, _metrics = depseudonymize_unstructured(raw_text)
    preview_unstructured(restored_text)
    write_string_to_unstructured(restored_text)
|
|
||||||
|
|
||||||
|
|
||||||
# Job: same flow as the unstructured depseudonymise job, but reading with
# read_unstructured_from_s3 and writing with write_unstructured_to_s3.
@job(
    tags={
        "business_operation": "DEPSEUDONYMISATION",
        "resource_type": "RD_DATA",
    }
)
def depseudonymise_unstructured_job_s3():
    raw_text = read_unstructured_from_s3()
    preview_unstructured(raw_text)
    restored_text, _metrics = depseudonymize_unstructured(raw_text)
    preview_unstructured(restored_text)
    write_unstructured_to_s3(restored_text)
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
from dagster import Out, Output, op
|
|
||||||
from cryptography.fernet import InvalidToken
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models import (
|
|
||||||
AnonymisePseudonymizeStructuredConfig,
|
|
||||||
DepseudonymizeStructuredConfig,
|
|
||||||
)
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.techniques import (
|
|
||||||
anonymisation_pseudonymisation_techniques as anon_pseudo_funcs,
|
|
||||||
)
|
|
||||||
import template_code_location.field_level_pseudo_anonymisation.techniques.depseudonymisation_techniques as depseudo_funcs
|
|
||||||
from .utils import create_get_encryption_key
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_column_wise_function(config, df, funcs):
    """Apply every configured technique to its columns of ``df``, in place.

    Args:
        config: Object whose ``used_function`` items expose a ``technique``
            with ``type``, ``columns`` and ``model_dump()`` (plus ``key_name``
            for the ``encrypt``/``decrypt`` techniques).
        df: The dataframe to transform; its columns are overwritten.
        funcs: Namespace (e.g. a module) holding one callable per technique
            type. Each callable receives a single value plus the technique's
            remaining settings as keyword arguments.

    Returns:
        The same ``df`` object with the transformed columns.

    Raises:
        ValueError: If a configured column is missing from ``df`` or a value
            cannot be decrypted (``InvalidToken``).
    """
    for used_function in config.used_function:
        technique = used_function.technique
        func_name = technique.type
        columns = technique.columns
        func = getattr(funcs, func_name)

        # Everything except "type" and "columns" is forwarded to the function.
        params = technique.model_dump()
        del params["type"]
        del params["columns"]

        # Reset per technique so the error message below can never pick up an
        # unbound name or a key name left over from a previous technique.
        key_name = None
        if func_name in ["encrypt", "decrypt"]:
            # The function expects the key itself, not its name.
            key_name = technique.key_name
            del params["key_name"]
            params["key"] = create_get_encryption_key(func_name, key_name)

        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise ValueError(
                f"The following columns required by technique '{func_name}' "
                f"are not present in the DataFrame: {', '.join(missing)}"
            )

        # Skip processing if DataFrame is empty
        if len(df) == 0:
            continue

        # Built once per technique; it does not depend on the column.
        vectorized_func = np.vectorize(lambda x: func(x, **params))
        for column in columns:
            try:
                df[column] = vectorized_func(df[column].to_numpy())
            except InvalidToken as exc:
                raise ValueError(
                    f"Invalid Fernet token while decrypting column '{column}' "
                    f"using key '{key_name}'. The data may not be encrypted "
                    f"or the key may be incorrect. "
                ) from exc
    return df
|
|
||||||
|
|
||||||
|
|
||||||
# Op: run the configured anonymisation/pseudonymisation techniques over the
# dataframe columns. Emits the dataframe on "data" (empty metadata) and an
# empty dict on "metrics".
@op(out={"data": Out(), "metrics": Out()})
def anonymize_pseudonymize_structured(
    context, config: AnonymisePseudonymizeStructuredConfig, df: pd.DataFrame
):
    transformed = _apply_column_wise_function(config, df, anon_pseudo_funcs)
    yield Output(value=transformed, metadata={}, output_name="data")
    yield Output(value={}, output_name="metrics")
|
|
||||||
|
|
||||||
|
|
||||||
# Op: run the configured depseudonymisation techniques over the dataframe
# columns. Emits the dataframe on "data" (empty metadata) and an empty dict
# on "metrics".
@op(out={"data": Out(), "metrics": Out()})
def depseudonymize_structured(context, config: DepseudonymizeStructuredConfig, df: pd.DataFrame):
    restored = _apply_column_wise_function(config, df, depseudo_funcs)
    yield Output(value=restored, metadata={}, output_name="data")
    yield Output(value={}, output_name="metrics")
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
from .anonymisation_pseudonymisation_techniques import hash, redact, replace, encrypt # noqa: F401
|
|
||||||
|
|
||||||
from .depseudonymisation_techniques import decrypt # noqa: F401
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
import hashlib
|
|
||||||
from cryptography.fernet import Fernet
|
|
||||||
|
|
||||||
|
|
||||||
def hash(value: str, algorithm: str = "sha256") -> str:
    """Return the hex digest of ``value`` for the given ``hashlib`` algorithm.

    ``value`` is converted with ``str()`` and encoded as UTF-8 before
    hashing. The algorithm defaults to SHA-256.
    """
    payload = str(value).encode("utf-8")
    return hashlib.new(algorithm, payload).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def redact(value: str) -> str:
    """Return an empty string regardless of ``value``."""
    redacted = ""
    return redacted
|
|
||||||
|
|
||||||
|
|
||||||
def replace(value: str, new_value) -> str:
    """Return ``new_value``; the original ``value`` is ignored."""
    replacement = new_value
    return replacement
|
|
||||||
|
|
||||||
|
|
||||||
def encrypt(value: str, key: bytes) -> str:
    """Encrypt ``str(value)`` with the Fernet ``key`` and return the token as text."""
    text = str(value)
    cipher = Fernet(key)
    token = cipher.encrypt(text.encode())
    return token.decode()
|
|
||||||
|
|
||||||
|
|
||||||
def retain(value: str) -> str:
    """Return ``value`` unchanged."""
    unchanged = value
    return unchanged
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
from cryptography.fernet import Fernet
|
|
||||||
|
|
||||||
|
|
||||||
def decrypt(value: str, key: bytes) -> str:
    """Decrypt the Fernet token ``value`` with ``key`` and return it as text."""
    cipher = Fernet(key)
    plaintext = cipher.decrypt(value.encode())
    return plaintext.decode()
|
|
||||||
@@ -1,428 +0,0 @@
|
|||||||
import importlib
|
|
||||||
import importlib.abc
|
|
||||||
import importlib.machinery
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import types
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Stub out the `transformers` and `spacy_transformers` packages before any
|
|
||||||
# other import triggers spaCy's entry-point scan or scrubadub_spacy's runtime
|
|
||||||
# import of spacy_transformers.pipeline_component.
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Top-level package names whose imports (including submodules) are stubbed.
_STUB_PACKAGES = (
    "transformers",
    "spacy_transformers",
)
|
|
||||||
|
|
||||||
|
|
||||||
class _StubModule(types.ModuleType):
    """Module whose missing attributes resolve to freshly created empty classes."""

    def __getattr__(self, name: str):
        # Only reached when normal attribute lookup fails; a new class named
        # after the requested attribute is built on every call.
        placeholder = type(name, (), {})
        return placeholder
|
|
||||||
|
|
||||||
|
|
||||||
class _StubFinder(importlib.abc.MetaPathFinder):
    """Meta-path finder that answers imports of the stubbed packages.

    Any module name equal to, or nested under, an entry of ``_STUB_PACKAGES``
    gets a spec backed by ``_StubLoader``; every other name is left to the
    remaining finders.
    """

    def find_spec(self, fullname, path=None, target=None):  # noqa: ANN001
        is_stubbed = any(
            fullname == pkg or fullname.startswith(pkg + ".") for pkg in _STUB_PACKAGES
        )
        if not is_stubbed:
            return None
        return importlib.machinery.ModuleSpec(fullname, _StubLoader())
|
|
||||||
|
|
||||||
|
|
||||||
class _StubLoader(importlib.abc.Loader):
    """Loader that creates ``_StubModule`` instances and executes nothing."""

    def create_module(self, spec):  # noqa: ANN001
        stub = _StubModule(spec.name)
        stub.__spec__ = spec
        stub.__path__ = []  # mark as package
        return stub

    def exec_module(self, module):  # noqa: ANN001
        """Do nothing: the stub module has no code to run."""
|
|
||||||
|
|
||||||
|
|
||||||
# Register the stub finder at the front of sys.meta_path (at most once) so it
# is consulted before scrubadub / spacy are imported.
if all(not isinstance(finder, _StubFinder) for finder in sys.meta_path):
    sys.meta_path.insert(0, _StubFinder())
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
import scrubadub # noqa: E402
|
|
||||||
import scrubadub_spacy # noqa: E402
|
|
||||||
from cryptography.fernet import InvalidToken # noqa: E402
|
|
||||||
from dagster import Out, Output, get_dagster_logger, op # noqa: E402
|
|
||||||
from scrubadub.detectors import RegexDetector # noqa: E402
|
|
||||||
from scrubadub.filth import CredentialFilth, NameFilth # noqa: E402
|
|
||||||
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.techniques import (
|
|
||||||
anonymisation_pseudonymisation_techniques as anon_pseudo_funcs,
|
|
||||||
)
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.techniques import (
|
|
||||||
depseudonymisation_techniques as depseudo_funcs,
|
|
||||||
)
|
|
||||||
|
|
||||||
from .config_models import (
|
|
||||||
PII_MAPPING,
|
|
||||||
AnonymisePseudonymizeUnstructuredConfig,
|
|
||||||
DepseudonymizeUnstructuredConfig,
|
|
||||||
PIIEntityEnum,
|
|
||||||
PseudoTechniqueConfig,
|
|
||||||
SupportedLanguages,
|
|
||||||
)
|
|
||||||
from .utils import create_get_encryption_key
|
|
||||||
|
|
||||||
|
|
||||||
def _initialize_scrubber(language: str) -> scrubadub.Scrubber:
    """Create a ``scrubadub.Scrubber`` set up for ``language``.

    The scrubber uses the locale and spaCy model that ``SupportedLanguages``
    reports for ``language``, gets a spaCy entity detector restricted to
    person/organisation labels, optionally a date-of-birth detector (only for
    ``"en"`` and ``"de"``), and has the stock credential detector swapped for
    the custom one defined below.
    """

    class SIMPLCredentialDetector(RegexDetector):
        """
        Remove username/password combinations from dirty ``text``.
        """

        filth_cls = CredentialFilth
        name = "credential"
        autoload = True

        regex = re.compile(
            r"""
            (?:username|login|u:)\s*(?::\s*)?
            (?P<username>[\w.\-@+]+)
            [\s\S]{0,500}?
            (?:password|pw|p:)\s*(?::\s*)?
            (?P<password>[^\s]+)
            """,
            re.MULTILINE | re.VERBOSE | re.IGNORECASE,
        )

    pii_scrubber = scrubadub.Scrubber(locale=SupportedLanguages.get_locale(language))

    entity_detector = scrubadub_spacy.detectors.SpacyEntityDetector(
        model=SupportedLanguages.get_language_model(language)
    )
    # Must be assigned after construction: scrubadub_spacy upper-cases every
    # entry passed to the constructor, which would break the mixed-case labels.
    entity_detector.named_entities = {"PERSON", "PER", "ORG", "persName", "PRS"}
    # Extra label -> filth mappings: Polish models emit "persName", Swedish ones "PRS".
    for extra_label in ("persName", "PRS"):
        entity_detector.filth_cls_map[extra_label] = NameFilth
    pii_scrubber.add_detector(entity_detector)

    if language in ("en", "de"):
        # Optional date-of-birth detector.
        pii_scrubber.add_detector(scrubadub.detectors.DateOfBirthDetector)

    # Swap the stock credential detector for the custom SIMPL one.
    pii_scrubber.remove_detector(scrubadub.detectors.CredentialDetector)
    pii_scrubber.add_detector(SIMPLCredentialDetector())
    return pii_scrubber
|
|
||||||
|
|
||||||
|
|
||||||
def _map_filth_to_pii_enum(filth) -> PIIEntityEnum | None:
    """Look up the ``PII_MAPPING`` key whose value equals the class name of ``filth``.

    Returns the first matching key in mapping order, or ``None`` when the
    filth's class name is not present among the mapping values.
    """
    filth_class_name = filth.__class__.__name__
    matches = (
        pii_enum for pii_enum, filth_name in PII_MAPPING.items() if filth_name == filth_class_name
    )
    return next(matches, None)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_metrics(metrics_dict: dict, language: str) -> str:
    """Render the anonymisation metrics as a Markdown report.

    Reads ``total_pii_detected``, ``text_length_original``,
    ``text_length_anonymised``, ``pii_by_type`` and ``techniques_applied``
    from ``metrics_dict`` and returns a single string with a summary list, a
    per-type count table and the list of applied techniques.
    """
    summary = (
        "\n## PII Anonymization Report\n"
        "\n"
        "### Summary\n"
        f"- **Total PII Detected**: {metrics_dict['total_pii_detected']}\n"
        f"- **Original Length**: {metrics_dict['text_length_original']} chars\n"
        f"- **Anonymized Length**: {metrics_dict['text_length_anonymised']} chars\n"
        f"- **Language**: {language}\n"
        "\n"
        "### PII by Type\n"
        "| Entity Type | Count |\n"
        "|-------------|-------|\n"
    )
    # One table row per detected entity type.
    type_rows = "".join(
        f"| {pii_type} | {count} |\n" for pii_type, count in metrics_dict["pii_by_type"].items()
    )
    # One bullet per configured entity -> technique pair.
    technique_lines = "".join(
        f"- **{pii}**: {technique}\n"
        for pii, technique in metrics_dict["techniques_applied"].items()
    )
    return summary + type_rows + "\n### Techniques Applied\n" + technique_lines
|
|
||||||
|
|
||||||
|
|
||||||
def _build_metrics_dict(
    pii_counts: dict[str, int],
    text: str,
    anon_text: str,
    technique_map: dict[PIIEntityEnum, PseudoTechniqueConfig],
) -> dict:
    """Collect the figures consumed by ``_get_metrics`` into one dictionary.

    The result holds the total and per-type PII counts, the character length
    of the text before and after processing, and for every entity in
    ``technique_map`` the ``type`` of the technique configured for it.
    """
    return {
        "total_pii_detected": sum(pii_counts.values()),
        "pii_by_type": pii_counts,
        "text_length_original": len(text),
        "text_length_anonymised": len(anon_text),
        "techniques_applied": {
            entity.name: technique_cfg.technique.type
            for entity, technique_cfg in technique_map.items()
        },
    }
|
|
||||||
|
|
||||||
|
|
||||||
@op(out={"data": Out(), "metrics": Out()})
def anonymize_pseudonymize_unstructured(
    context, config: AnonymisePseudonymizeUnstructuredConfig, text: str
):
    """Detect PII in free text and replace it according to ``config``.

    Steps: detect filth with a language-specific scrubber, map each filth to a
    PII entity, apply the technique configured for that entity (or a
    ``{{ENTITY}}`` placeholder when none is configured), splice the
    replacements into the text and build a Markdown metrics report.

    Yields two Dagster outputs: ``data`` (the processed text) and ``metrics``
    (the report).

    Raises:
        ValueError: if ``text`` is ``None`` or contains only whitespace.
        RuntimeError: if scrubber initialisation or detection fails.
    """
    log = get_dagster_logger()

    if text is None or not text.strip():
        raise ValueError("Input text cannot be None or empty")

    log.debug(
        f"Starting unstructured PII anonymization | lang={config.language.value} "
        f"| input_chars={len(text)}"
    )

    # --- Filth detection ---
    try:
        detected = list(_initialize_scrubber(config.language.value).iter_filth(text))
        log.info(f"Detected {len(detected)} potential PII entities before filtering.")
    except Exception as e:
        log.error(f"Scrubber initialization/detection failed | lang={config.language.value}")
        raise RuntimeError(f"PII detection failed for language '{config.language.value}'") from e

    # --- Build technique routing map ---
    technique_map = _build_technique_map(config)
    routing_summary = ", ".join(
        f"{pii.name}->{cfg.technique.type}" for pii, cfg in technique_map.items()
    )
    log.debug("Technique map constructed: " + routing_summary)

    replacements = []  # (start, end, replacement) triples
    key_cache = {}  # encryption keys already fetched, by key name
    pii_counts = {}  # entity name -> number of replacements

    # --- Process filths ---
    for position, filth in enumerate(detected, start=1):
        entity = _map_filth_to_pii_enum(filth)
        if entity is None:
            log.debug(f"[{position}] Skipping unknown filth class={filth.__class__.__name__}")
            continue

        span_start, span_end = _extract_span(filth, log, position)
        if span_start is None:
            continue

        matched_text = text[span_start:span_end]
        routed_cfg = technique_map.get(entity)

        if routed_cfg is None:
            # No technique configured: the helper records a placeholder and the count.
            _handle_missing_technique(
                entity,
                span_start,
                span_end,
                text,
                pii_counts,
                replacements,
                log,
                position,
            )
            continue

        # Apply configured technique
        technique = routed_cfg.technique
        call_params = _prepare_params(technique, key_cache, position, log)
        substituted = _apply_technique(
            matched_text, technique.type, call_params, entity, position, log
        )

        replacements.append((span_start, span_end, substituted))
        pii_counts[entity.name] = pii_counts.get(entity.name, 0) + 1

    # --- Apply replacements ---
    anon_text = _apply_replacements(text, replacements, log)

    log.info(f"Anonymisation completed, total PII counts: {pii_counts}")

    metrics = _build_metrics_dict(pii_counts, text, anon_text, technique_map)
    metrics_report = _get_metrics(metrics, config.language.value)

    yield Output(value=anon_text, output_name="data")
    yield Output(value=metrics_report, output_name="metrics")
|
|
||||||
|
|
||||||
|
|
||||||
@op(out={"data": Out(), "metrics": Out()})
def depseudonymize_unstructured(context, config: DepseudonymizeUnstructuredConfig, input_text: str):
    """Reverse the pseudonymisation markers found in ``input_text``.

    Delegates to ``_apply_depseudonimisation_function`` with the
    depseudonymisation techniques module and yields two Dagster outputs:
    ``data`` (the restored text) and ``metrics`` (the counters it reported).
    """
    restored_text, depseudo_metrics = _apply_depseudonimisation_function(
        config, input_text, depseudo_funcs
    )

    data_output = Output(value=restored_text, metadata={}, output_name="data")
    yield data_output
    yield Output(value=depseudo_metrics, output_name="metrics")
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_depseudonimisation_function(config, input_text: str, funcs_module):
    """
    Searches and depseudonymizes text segments formatted as:
    {technique:pseudonymized_value}

    For every entry of ``config.used_function`` the function named by
    ``technique.type`` is looked up on ``funcs_module`` and applied to each
    matching segment; the remaining fields of ``technique.model_dump()`` are
    passed as keyword arguments. For ``decrypt`` the marker searched for is
    ``{encrypt:...}`` and the key is fetched via ``create_get_encryption_key``.

    NOTE: this is a generator that yields exactly two items, the restored
    text and then ``{"total_depseudo_count": <int>}``, so callers consume it
    by unpacking (``text, metrics = ...``). Nothing runs until it is iterated.

    Raises:
        ValueError: if a value is not a valid Fernet token for the key in use.
        RuntimeError: if the technique function fails for any other reason.
    """

    total_depseudo_count = 0
    depseudonimized_text = input_text  # Initialize with input text

    # Loop through each depseudonymisation technique defined in the config
    for used_function in config.used_function:
        technique = used_function.technique
        func_name = technique.type
        func = getattr(funcs_module, func_name)

        # Prepare parameters
        params = technique.model_dump()
        del params["type"]

        # Reset per technique so the error message below can never pick up a
        # key name left over from a previous iteration (or be unbound).
        pseudo_anon_func = ""
        key_name = None

        if func_name == "decrypt":
            key_name = technique.key_name
            del params["key_name"]
            pseudo_anon_func = "encrypt"
            params["key"] = create_get_encryption_key(func_name, key_name)

        # Regex pattern for this technique, e.g. {encrypt:...}
        pattern = rf"\{{{pseudo_anon_func}:([^}}]+)\}}"

        # Loop variables are bound as defaults so the callback always uses the
        # values of the iteration that created it.
        def replace_match(match, func=func, params=params, func_name=func_name, key_name=key_name):
            nonlocal total_depseudo_count
            pseudovalue = match.group(1)
            total_depseudo_count += 1
            try:
                return func(pseudovalue, **params)
            except InvalidToken as exc:
                raise ValueError(
                    f"Invalid Fernet token while decrypting value using key '{key_name}'. "
                    f"The data may not be encrypted or the key may be incorrect."
                ) from exc
            except Exception as e:
                raise RuntimeError(
                    f"Error during depseudonymisation with '{func_name}': {e}"
                ) from e

        # Apply replacements for this technique
        depseudonimized_text = re.sub(pattern, replace_match, depseudonimized_text)

    yield depseudonimized_text
    yield {"total_depseudo_count": total_depseudo_count}
|
|
||||||
|
|
||||||
|
|
||||||
def _build_technique_map(config):
    """Index the configured functions by the PII entities they cover.

    Every entry of ``config.used_function`` is registered under each item of
    its ``technique.pii``; when an entity is listed by several entries the
    last one wins.
    """
    return {
        entity: function_cfg
        for function_cfg in config.used_function
        for entity in function_cfg.technique.pii
    }
|
|
||||||
|
|
||||||
|
|
||||||
def _extract_span(filth, logger, idx):
    """Read the character span of ``filth``.

    The start is taken from ``filth.beg`` (falling back to ``filth.start``)
    and the end from ``filth.end``. Returns ``(start, end)``, or
    ``(None, None)`` after a debug log when either bound is missing.
    """
    span_begin = getattr(filth, "beg", getattr(filth, "start", None))
    span_finish = getattr(filth, "end", None)

    if span_begin is not None and span_finish is not None:
        return span_begin, span_finish

    logger.debug(f"[{idx}] Filth missing span attributes; skipping.")
    return None, None
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_missing_technique(
    pii_enum, start_idx, end_idx, text, pii_counts, replacements, logger, idx
):
    """Record a ``{{ENTITY}}`` placeholder for a PII span that has no configured technique.

    Appends ``(start_idx, end_idx, "{{<pii_enum.name>}}")`` to ``replacements``
    and increments the counter for ``pii_enum.name`` in ``pii_counts``; both
    containers are modified in place and nothing is returned.

    The debug log reports the entity, the span and the length of the matched
    text only. The matched text itself is personal data and is deliberately
    kept out of the logs.
    """
    matched_length = len(text[start_idx:end_idx])
    logger.debug(
        f"[{idx}] PII={pii_enum.name} span=({start_idx},{end_idx}) length={matched_length} "
        f"- No technique configured, using placeholder"
    )
    # Quadrupled braces render as a literal "{{" / "}}" around the entity name.
    placeholder = f"{{{{{pii_enum.name}}}}}"
    replacements.append((start_idx, end_idx, placeholder))
    pii_counts[pii_enum.name] = pii_counts.get(pii_enum.name, 0) + 1
|
|
||||||
|
|
||||||
|
|
||||||
def _prepare_params(t, key_cache, idx, logger):
    """Turn a technique config into keyword arguments for the technique function.

    Starts from ``t.model_dump()`` without the ``type`` and ``pii`` entries.
    For the ``encrypt`` technique the ``key_name`` entry is replaced by a
    ``key`` entry holding the key bytes, fetched through
    ``create_get_encryption_key`` on first use and memoised in ``key_cache``.

    Raises:
        RuntimeError: if anything goes wrong while preparing the encryption key.
    """
    call_kwargs = t.model_dump()
    for routing_field in ("type", "pii"):
        del call_kwargs[routing_field]

    if t.type != "encrypt":
        return call_kwargs

    try:
        if t.key_name not in key_cache:
            logger.debug(
                f"[{idx}] Retrieving/generating Vault key name={t.key_name} for encryption"
            )
            key_cache[t.key_name] = create_get_encryption_key("encrypt", t.key_name)
        call_kwargs["key"] = key_cache[t.key_name]
        del call_kwargs["key_name"]
        logger.debug(f"[{idx}] Encryption key prepared")
    except Exception as e:
        raise RuntimeError(
            f"Encryption key retrieval failed for key '{t.key_name}': {type(e).__name__}"
        ) from e

    return call_kwargs
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_technique(original_value, t_type, params, pii_enum, idx, logger):
    """Run the technique named ``t_type`` on ``original_value`` and return the replacement text.

    The function is looked up by name on the anonymisation/pseudonymisation
    techniques module and called with ``params`` as keyword arguments.
    ``encrypt`` results are wrapped as ``{encrypt:<token>}``. An
    ``AttributeError`` raised anywhere in that process is logged as an
    unrecognised technique and answered with a ``{UNIMPL_<type>_<entity>}``
    placeholder.

    Raises:
        RuntimeError: for any other failure of the technique.
    """
    try:
        technique_fn = getattr(anon_pseudo_funcs, t_type)
        outcome = technique_fn(original_value, **params)

        # Tag encrypted values so they can be located again later.
        outcome = f"{{encrypt:{outcome}}}" if t_type == "encrypt" else outcome

        logger.debug(f"[{idx}] {t_type.capitalize()} complete")
        return outcome

    except AttributeError:
        logger.warning(f"[{idx}] Technique '{t_type}' not recognized; inserting placeholder.")
        return f"{{UNIMPL_{t_type}_{pii_enum.name}}}"

    except Exception as e:
        raise RuntimeError(
            f"Technique '{t_type}' failed for PII type '{pii_enum.name}': {type(e).__name__}"
        ) from e
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_replacements(text, replacements, logger):
    """Splice ``(start, end, replacement)`` triples into ``text``.

    Replacements are applied in order of their start offset; the input list is
    not modified. Overlaps are resolved in favour of the earlier span:

    * a later span that partially overlaps is trimmed so it starts where the
      earlier one ended, and its replacement covers only the remaining tail;
    * a later span lying entirely inside text already replaced (including an
      identical span reported twice) is dropped, because nothing is left to
      replace. Inserting it anyway would move the cursor backwards,
      re-emitting original text that was meant to be removed, or duplicate the
      replacement.

    Returns ``text`` unchanged when ``replacements`` is empty.
    """
    if not replacements:
        logger.info("No PII detected; returning original text.")
        return text

    logger.debug(f"Applying {len(replacements)} replacements to text body.")
    ordered = sorted(replacements, key=lambda r: r[0])

    result_parts = []
    last = 0  # end offset of the text consumed so far
    owner = (0, 0)  # span that produced `last`, used in the overlap warning
    for start, end, repl in ordered:
        if start < last:
            logger.warning(
                f"Overlapping PII detected at positions "
                f"({owner[0]},{owner[1]}) "
                f"and ({start},{end}). "
                f"Using first match."
            )
            if end <= last:
                # Fully covered by what was already replaced.
                continue
            # Keep only the part not yet covered.
            start = last
        result_parts.append(text[last:start])
        result_parts.append(repl)
        last = end
        owner = (start, end)

    result_parts.append(text[last:])
    return "".join(result_parts)
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
import os
|
|
||||||
import hvac
|
|
||||||
from hvac.exceptions import InvalidPath
|
|
||||||
from cryptography.fernet import Fernet
|
|
||||||
|
|
||||||
|
|
||||||
def create_get_encryption_key(func_name: str, key_name: str) -> bytes:
    """Fetch the Fernet key stored under ``key_name`` from the KV v2 secrets engine.

    Connection settings come from the ``OPENBAO_URL`` / ``OPENBAO_TOKEN``
    environment variables; ``ENCRYPTION_KEYS_PATH`` (optional folder prefix)
    and ``ENCRYPTION_KEYS_MOUNT_POINT`` locate the secret. When the secret
    path does not exist and ``func_name`` is ``"encrypt"``, a new Fernet key
    is generated, stored and returned.

    Returns:
        The key encoded as bytes.

    Raises:
        ValueError: if the key is missing for any other ``func_name``, or if
            reading the secret fails for a reason other than a missing path.
    """
    vault = hvac.Client(url=os.getenv("OPENBAO_URL"), token=os.getenv("OPENBAO_TOKEN"))

    folder = os.getenv("ENCRYPTION_KEYS_PATH")
    secret_path = key_name if not folder else f"{folder}/{key_name}"
    mount_point = os.getenv("ENCRYPTION_KEYS_MOUNT_POINT")

    try:
        response = vault.secrets.kv.v2.read_secret_version(
            path=secret_path, mount_point=mount_point
        )
        key_value = response["data"]["data"]["value"]

    except InvalidPath:
        # Only encryption is allowed to create a key that is missing.
        if func_name != "encrypt":
            raise ValueError(f"Fernet key '{key_name}' not found in Vault for decrypt.")
        key_value = Fernet.generate_key().decode()
        vault.secrets.kv.v2.create_or_update_secret(
            path=secret_path, mount_point=mount_point, secret={"value": key_value}
        )
    except Exception as e:
        raise ValueError(f"Error while reading Fernet key '{key_name}': {e}")

    return key_value.encode()
|
|
||||||
@@ -8,7 +8,7 @@ from util_services.sensors import (
|
|||||||
from util_services.custom_json_logger import simpl_json_logger
|
from util_services.custom_json_logger import simpl_json_logger
|
||||||
|
|
||||||
# Data processing jobs
|
# Data processing jobs
|
||||||
from template_code_location.data_processing.jobs import (
|
from data_processing.jobs import (
|
||||||
remove_duplicates_job_s3,
|
remove_duplicates_job_s3,
|
||||||
fill_missing_values_job_s3,
|
fill_missing_values_job_s3,
|
||||||
standardize_categorical_values_job_s3,
|
standardize_categorical_values_job_s3,
|
||||||
@@ -21,7 +21,7 @@ from template_code_location.data_processing.jobs import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Dataframe-level anonymisation jobs
|
# Dataframe-level anonymisation jobs
|
||||||
from template_code_location.dataframe_level_anonymisation.jobs import (
|
from dataframe_level_anonymisation.jobs import (
|
||||||
k_anonymity_job_s3,
|
k_anonymity_job_s3,
|
||||||
l_diversity_job_s3,
|
l_diversity_job_s3,
|
||||||
t_closeness_job_s3,
|
t_closeness_job_s3,
|
||||||
@@ -29,7 +29,7 @@ from template_code_location.dataframe_level_anonymisation.jobs import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Field-level pseudo-anonymisation jobs
|
# Field-level pseudo-anonymisation jobs
|
||||||
from template_code_location.field_level_pseudo_anonymisation.jobs import (
|
from field_level_pseudo_anonymisation.jobs import (
|
||||||
anonymise_pseudonymise_structured_job_s3,
|
anonymise_pseudonymise_structured_job_s3,
|
||||||
depseudonymise_structured_job_s3,
|
depseudonymise_structured_job_s3,
|
||||||
anonymise_pseudonymise_unstructured_job_s3,
|
anonymise_pseudonymise_unstructured_job_s3,
|
||||||
|
|||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
"""Pytest configuration and shared fixtures."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import pandas as pd
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
import sys
|
|
||||||
from dagster import build_op_context
|
|
||||||
|
|
||||||
# Mock external dependencies that might not be available in test environment
|
|
||||||
sys.modules['spellchecker'] = MagicMock()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_context():
    """Provide an op context built with ``build_op_context()`` for calling ops directly."""
    return build_op_context()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def sample_dataframe():
    """Five-row frame with repeated rows, mixed casing and one missing ``Age``."""
    columns = {
        "Name": ["John Doe", "jane smith", "John Doe", "bob johnson", "John Doe"],
        "Age": [25, 30, 25, None, 25],
        "City": ["New York", "los angeles", "New York", "chicago", "New York"],
        "Status": ["Active", "INACTIVE", "Active", "penDing", "Active"],
    }
    return pd.DataFrame(columns)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def sample_dataframe_with_typos():
    """Three-row frame whose text columns contain misspellings, for spell-check tests."""
    columns = {
        "Name": ["jon doe", "jane smith", "bob jonson"],
        "Description": ["developer", "analst", "enginer"],
    }
    return pd.DataFrame(columns)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def empty_dataframe():
    """Frame with no rows and no columns."""
    frame = pd.DataFrame()
    return frame
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def dataframe_with_missing_values():
    """Frame mixing partially missing columns with one column that is entirely ``None``."""
    columns = {
        "Column1": [1, None, 3, None, 5],
        "Column2": ["a", "b", None, "d", None],
        "Column3": [None, None, None, None, None],
    }
    return pd.DataFrame(columns)
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
"""Configuration utilities for testing."""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
# Add src directory to path for imports
|
|
||||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
|
||||||
@@ -1,202 +0,0 @@
|
|||||||
"""Unit tests for configuration models."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from pydantic import ValidationError
|
|
||||||
from template_code_location.data_processing.config_models import (
|
|
||||||
FillMissingConfiguration,
|
|
||||||
ColumnsSelectConfiguration,
|
|
||||||
SpellCheckConfiguration,
|
|
||||||
AggregationConfiguration
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestColumnsSelectConfiguration:
    """Behaviour of the ``columns`` field of ColumnsSelectConfiguration."""

    def test_default_columns(self):
        """Without arguments the column list is ``['Name']``."""
        cfg = ColumnsSelectConfiguration()
        assert cfg.columns == ["Name"]

    def test_custom_columns(self):
        """An explicit column list is stored as given."""
        requested = ["Col1", "Col2", "Col3"]
        cfg = ColumnsSelectConfiguration(columns=["Col1", "Col2", "Col3"])
        assert cfg.columns == requested

    def test_empty_columns_list(self):
        """An empty list is accepted and kept empty."""
        cfg = ColumnsSelectConfiguration(columns=[])
        assert cfg.columns == []

    def test_single_column(self):
        """A one-element list is preserved."""
        cfg = ColumnsSelectConfiguration(columns=["SingleCol"])
        assert cfg.columns == ["SingleCol"]

    def test_columns_with_special_characters(self):
        """Names containing ``-``, ``_`` and ``.`` are kept verbatim."""
        requested = ["Col-1", "Col_2", "Col.3"]
        cfg = ColumnsSelectConfiguration(columns=["Col-1", "Col_2", "Col.3"])
        assert cfg.columns == requested

    def test_duplicate_columns_are_removed(self):
        """Duplicates are dropped while the first-seen order is kept."""
        cfg = ColumnsSelectConfiguration(columns=["A", "B", "A", "C", "B"])

        assert cfg.columns == ["A", "B", "C"]

    def test_duplicate_default_behavior(self):
        """A list made only of repeats of one name collapses to a single entry."""
        cfg = ColumnsSelectConfiguration(columns=["Name", "Name", "Name"])
        assert cfg.columns == ["Name"]
|
|
||||||
|
|
||||||
|
|
||||||
class TestFillMissingConfiguration:
    """Behaviour of the ``fill_map`` field of FillMissingConfiguration."""

    def test_default_fill_map(self):
        """Without arguments the fill map is ``{'Age': 'UNKNOWN_AGE'}``."""
        cfg = FillMissingConfiguration()

        assert cfg.fill_map == {"Age": "UNKNOWN_AGE"}

    def test_custom_fill_map(self):
        """A user-supplied mapping is stored as given."""
        expected = {"Age": "0", "Name": "UNKNOWN", "City": "N/A"}
        cfg = FillMissingConfiguration(fill_map=expected)

        assert cfg.fill_map == expected

    def test_empty_fill_map(self):
        """An empty mapping is accepted."""
        cfg = FillMissingConfiguration(fill_map={})

        assert cfg.fill_map == {}

    def test_fill_map_with_numeric_values(self):
        """Numeric-looking string values are kept as strings."""
        expected = {"Age": "0", "Score": "-1", "Count": "999"}
        cfg = FillMissingConfiguration(fill_map=expected)

        assert cfg.fill_map == expected

    def test_fill_map_with_string_values(self):
        """Plain string values are stored unchanged."""
        expected = {"Name": "Unknown", "Email": "no-email"}
        cfg = FillMissingConfiguration(fill_map=expected)

        assert cfg.fill_map == expected

    def test_fill_map_mixed_types(self):
        """Values standing in for different column types (all given as strings) are kept."""
        expected = {"IntCol": "0", "StrCol": "Unknown", "FloatCol": "0.0"}
        cfg = FillMissingConfiguration(fill_map=expected)

        assert cfg.fill_map == expected
|
|
||||||
|
|
||||||
|
|
||||||
class TestSpellCheckConfiguration:
    """Behaviour of the ``columns`` and ``language`` fields of SpellCheckConfiguration."""

    def test_default_spell_check_config(self):
        """Defaults are ``['Name']`` for columns and ``'en'`` for language."""
        cfg = SpellCheckConfiguration()

        assert cfg.columns == ["Name"]
        assert cfg.language == "en"

    def test_custom_spell_check_config(self):
        """Explicit columns and language are both stored."""
        cfg = SpellCheckConfiguration(columns=["Description", "Notes"], language="es")

        assert cfg.columns == ["Description", "Notes"]
        assert cfg.language == "es"

    def test_spell_check_all_languages(self):
        """Each language code in the list below is accepted."""
        language_codes = ["en", "es", "it", "fr", "pt", "de", "nl"]

        for code in language_codes:
            cfg = SpellCheckConfiguration(language=code)
            assert cfg.language == code

    def test_spell_check_invalid_language(self):
        """An unknown language code is rejected with a ValidationError."""
        with pytest.raises(ValidationError):
            SpellCheckConfiguration(language="invalid")

    def test_spell_check_multiple_columns(self):
        """Several columns can be configured at once."""
        requested = ["Col1", "Col2", "Col3", "Col4"]
        cfg = SpellCheckConfiguration(columns=requested)

        assert cfg.columns == requested

    def test_spell_check_empty_columns(self):
        """An empty column list is accepted and the language keeps its default."""
        cfg = SpellCheckConfiguration(columns=[])

        assert cfg.columns == []
        assert cfg.language == "en"

    def test_spell_check_inheritance(self):
        """The model is a ColumnsSelectConfiguration exposing both fields."""
        cfg = SpellCheckConfiguration()

        assert isinstance(cfg, ColumnsSelectConfiguration)
        assert hasattr(cfg, "columns")
        assert hasattr(cfg, "language")

    @pytest.mark.parametrize("language", ["en", "es", "it", "fr", "pt", "de", "nl"])
    def test_spell_check_languages_parametrized(self, language):
        """Parametrized variant of the accepted-language check."""
        cfg = SpellCheckConfiguration(language=language)
        assert cfg.language == language
|
|
||||||
|
|
||||||
class TestAggregationConfiguration:
    """Behaviour of the ``columns`` and ``operation`` fields of AggregationConfiguration."""

    def test_aggregation_default_config(self):
        """Defaults are ``['Name']`` for columns and ``'sum'`` for operation."""
        cfg = AggregationConfiguration()

        assert cfg.columns == ["Name"]
        assert cfg.operation == "sum"

    @pytest.mark.parametrize("op", ["sum", "mean", "min", "max", "count"])
    def test_aggregation_valid_operations(self, op):
        """Each allowed aggregation operation is accepted."""
        cfg = AggregationConfiguration(operation=op)
        assert cfg.operation == op

    def test_aggregation_invalid_operation(self):
        """An unknown operation raises a ValidationError naming the bad value."""
        with pytest.raises(ValidationError) as exc_info:
            AggregationConfiguration(operation="invalid_op")

        assert "Invalid aggregation operation 'invalid_op'" in str(exc_info.value)

    def test_aggregation_custom_columns(self):
        """Explicit columns and operation are both stored."""
        cfg = AggregationConfiguration(columns=["Price", "Quantity"], operation="mean")

        assert cfg.columns == ["Price", "Quantity"]
        assert cfg.operation == "mean"

    def test_aggregation_inheritance(self):
        """The model is a ColumnsSelectConfiguration exposing both fields."""
        cfg = AggregationConfiguration()

        assert isinstance(cfg, ColumnsSelectConfiguration)
        assert hasattr(cfg, "columns")
        assert hasattr(cfg, "operation")

    def test_aggregation_model_dump(self):
        """``model_dump()`` exposes the configured fields (useful for the Dagster op)."""
        cfg = AggregationConfiguration(columns=["Value"], operation="max")
        dumped = cfg.model_dump()

        assert dumped["columns"] == ["Value"]
        assert dumped["operation"] == "max"
|
|
||||||
@@ -1,185 +0,0 @@
|
|||||||
"""Integration tests for data processing jobs."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import pandas as pd
|
|
||||||
from unittest.mock import patch, MagicMock
|
|
||||||
from template_code_location.data_processing.ops import (
|
|
||||||
remove_duplicates,
|
|
||||||
fill_missing_values,
|
|
||||||
standardize_categorical_values,
|
|
||||||
correct_typos
|
|
||||||
)
|
|
||||||
from template_code_location.data_processing.config_models import (
|
|
||||||
FillMissingConfiguration,
|
|
||||||
ColumnsSelectConfiguration,
|
|
||||||
SpellCheckConfiguration
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestPipelineIntegration:
    """Integration tests that chain several data processing ops."""

    def test_pipeline_remove_duplicates_then_standardize(self, mock_context):
        """Deduplicate first, then standardize the categorical columns."""
        raw = pd.DataFrame(
            {
                "Name": [" JOHN DOE ", "jane smith", " JOHN DOE ", "bob johnson"],
                "City": ["NEW YORK", "los angeles", "NEW YORK", "chicago"],
            }
        )

        # Step 1: Remove duplicates
        deduplicated = remove_duplicates(mock_context, raw)
        assert deduplicated.shape[0] == 3

        # Step 2: Standardize
        std_config = ColumnsSelectConfiguration(columns=["Name", "City"])
        standardized = standardize_categorical_values(mock_context, std_config, deduplicated)

        assert standardized["Name"].iloc[0] == "john doe"
        assert standardized["City"].iloc[0] == "new york"

    def test_pipeline_fill_missing_then_standardize(self, mock_context):
        """Fill missing values first, then standardize."""
        raw = pd.DataFrame(
            {
                "Category": [" ACTIVE ", None, " PENDING "],
                "Value": ["1", "2", None],
            }
        )

        # Step 1: Fill missing values
        fill_config = FillMissingConfiguration(fill_map={"Value": "0"})
        filled = fill_missing_values(mock_context, fill_config, raw)

        # Step 2: Standardize
        std_config = ColumnsSelectConfiguration(columns=["Category"])
        standardized = standardize_categorical_values(mock_context, std_config, filled)

        assert standardized["Category"].iloc[0] == "active"
        assert filled["Value"].iloc[2] == "0"

    def test_pipeline_all_operations(self, mock_context):
        """Run deduplication, filling and standardization back to back."""
        frame = pd.DataFrame(
            {
                "Name": [" john doe ", "JANE SMITH", " john doe ", None],
                "Value": ["1", None, "1", "2"],
            }
        )

        # Step 1: Remove duplicates
        frame = remove_duplicates(mock_context, frame)
        assert frame.shape[0] == 3

        # Step 2: Fill missing
        fill_config = FillMissingConfiguration(fill_map={"Value": "0"})
        frame = fill_missing_values(mock_context, fill_config, frame)
        assert frame["Value"].isna().sum() == 0

        # Step 3: Standardize
        std_config = ColumnsSelectConfiguration(columns=["Name"])
        frame = standardize_categorical_values(mock_context, std_config, frame)

        assert frame["Name"].iloc[0] == "john doe"

    def test_pipeline_with_large_dataset(self, mock_context):
        """Deduplicate a 1000-row frame to which 100 repeated rows were appended."""
        size = 1000
        base = pd.DataFrame(
            {
                "ID": list(range(size)),
                "Name": ["User_" + str(i % 50) for i in range(size)],
                "Status": ["ACTIVE", "INACTIVE", "PENDING"] * (size // 3)
                + ["ACTIVE"] * (size % 3),
                "Score": [i % 100 for i in range(size)],
            }
        )

        # Add some duplicates
        with_duplicates = pd.concat([base, base.head(100)], ignore_index=True)

        # Process
        cleaned = remove_duplicates(mock_context, with_duplicates)

        assert cleaned.shape[0] == 1000
        assert cleaned.shape[1] == 4
|
|
||||||
|
|
||||||
|
|
||||||
class TestErrorHandling:
    """Edge-case checks: special floats, custom indexes, unicode and multi-column fills."""

    def test_operation_with_corrupted_data(self, mock_context):
        """remove_duplicates returns a non-empty frame for a column holding NaN and +/-inf."""
        special_values = [float('nan'), float('inf'), -float('inf'), 0, 1, 2]
        frame = pd.DataFrame({'Col': special_values})

        result = remove_duplicates(mock_context, frame)

        # Only a non-empty result is required; the exact row count is not pinned.
        assert result.shape[0] > 0

    def test_operation_preserves_index(self, mock_context):
        """Deduplicate a frame with a string index and check the resulting row count."""
        frame = pd.DataFrame({'Col': [1, 2, 1, 3]}, index=['a', 'b', 'c', 'd'])

        result = remove_duplicates(mock_context, frame)

        # The index itself is not asserted because the op may reset it.
        assert result.shape[0] == 3

    def test_standardize_with_unicode_characters(self, mock_context):
        """Standardization keeps every row when names contain accented characters."""
        names = ['José', 'François', 'Müller']
        frame = pd.DataFrame({'Name': names})

        result = standardize_categorical_values(
            mock_context, ColumnsSelectConfiguration(columns=['Name']), frame
        )

        assert result.shape[0] == 3

    def test_fill_with_same_key_multiple_times(self, mock_context):
        """Each column listed in ``fill_map`` receives its own replacement value."""
        frame = pd.DataFrame(
            {
                'A': ['1', None, '3'],
                'B': [None, None, 'c'],
                'C': [None, '2', None],
            }
        )
        replacements = {'A': '-1', 'B': 'EMPTY', 'C': '0'}

        result = fill_missing_values(
            mock_context, FillMissingConfiguration(fill_map=replacements), frame
        )

        assert result.loc[1, 'A'] == '-1'
        assert result.loc[0, 'B'] == 'EMPTY'
        assert result.loc[0, 'C'] == '0'
|
|
||||||
|
|
||||||
|
|
||||||
class TestDataTypePreservation:
    """Checks that ops leave column dtypes untouched where that is expected."""

    def test_remove_duplicates_preserves_dtypes(self, mock_context):
        """The int32 and float64 dtypes are the same before and after deduplication."""
        frame = pd.DataFrame(
            {
                'int32': pd.array([1, 2, 1], dtype='int32'),
                'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'),
                'str': ['a', 'b', 'a'],
            }
        )

        result = remove_duplicates(mock_context, frame)

        # The 'str' column is part of the input but its dtype is not asserted.
        for column in ('int32', 'float64'):
            assert result[column].dtype == frame[column].dtype

    def test_fill_missing_preserves_column_types_where_possible(self, mock_context):
        """Filling column A must not change the dtype of the untouched column B."""
        frame = pd.DataFrame(
            {
                'A': pd.array(['1', None, '3'], dtype='string'),
                'B': ['x', 'y', 'z'],
            }
        )

        result = fill_missing_values(
            mock_context, FillMissingConfiguration(fill_map={'A': '0'}), frame
        )

        assert result['A'].loc[1] == '0'
        assert result['B'].dtype == frame['B'].dtype
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
from template_code_location.data_processing.jobs import (
|
|
||||||
remove_duplicates_job_s3,
|
|
||||||
fill_missing_values_job_s3,
|
|
||||||
standardize_categorical_values_job_s3,
|
|
||||||
correct_typos_job_s3,
|
|
||||||
normalize_numeric_min_max_job_s3,
|
|
||||||
normalize_datetime_job_s3,
|
|
||||||
normalize_coordinates_job_s3,
|
|
||||||
add_global_aggregations_job_s3
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_remove_duplicates_job_s3_is_callable():
    """remove_duplicates_job_s3 must be callable and expose ``execute_in_process``."""
    job = remove_duplicates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
|
|
||||||
|
|
||||||
def test_fill_missing_values_job_s3_is_callable():
    """fill_missing_values_job_s3 must be callable and expose ``execute_in_process``."""
    job = fill_missing_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
|
|
||||||
|
|
||||||
def test_standardize_categorical_values_job_s3_is_callable():
    """standardize_categorical_values_job_s3 must be callable and expose ``execute_in_process``."""
    job = standardize_categorical_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
|
|
||||||
|
|
||||||
def test_correct_typos_job_s3_is_callable():
    """correct_typos_job_s3 must be callable and expose ``execute_in_process``."""
    job = correct_typos_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_numeric_min_max_job_s3_is_callable():
    """normalize_numeric_min_max_job_s3 must be callable and expose ``execute_in_process``."""
    job = normalize_numeric_min_max_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_datetime_job_s3_is_callable():
    """normalize_datetime_job_s3 must be callable and expose ``execute_in_process``."""
    job = normalize_datetime_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
|
|
||||||
def test_normalize_coordinates_job_s3_is_callable():
    """normalize_coordinates_job_s3 must be callable and expose ``execute_in_process``."""
    job = normalize_coordinates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
|
|
||||||
def test_add_global_aggregations_job_s3_is_callable():
    """add_global_aggregations_job_s3 must be callable and expose ``execute_in_process``."""
    job = add_global_aggregations_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
|
||||||
@@ -1,700 +0,0 @@
|
|||||||
"""Unit tests for data processing operations."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import pandas as pd
|
|
||||||
from template_code_location.data_processing.ops import (
|
|
||||||
remove_duplicates,
|
|
||||||
fill_missing_values,
|
|
||||||
standardize_categorical_values,
|
|
||||||
correct_typos,
|
|
||||||
normalize_datetime,
|
|
||||||
normalize_numeric_min_max,
|
|
||||||
normalize_coordinates,
|
|
||||||
add_global_aggregations
|
|
||||||
)
|
|
||||||
from template_code_location.data_processing.config_models import (
|
|
||||||
FillMissingConfiguration,
|
|
||||||
ColumnsSelectConfiguration,
|
|
||||||
SpellCheckConfiguration,
|
|
||||||
AggregationConfiguration,
|
|
||||||
CoordinatesNormalizationConfiguration
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestRemoveDuplicates:
    """Unit tests covering the remove_duplicates op."""

    def test_remove_duplicates_basic(self, mock_context, sample_dataframe):
        """The shared sample frame collapses to three distinct rows."""
        result = remove_duplicates(mock_context, sample_dataframe)

        # Expected: 3 unique rows (john doe appears 3x, jane smith 1x, bob johnson 1x).
        assert result.shape[0] == 3
        assert len(result) < len(sample_dataframe)

    def test_remove_duplicates_no_duplicates(self, mock_context):
        """A frame without repeated rows comes back equal to the input."""
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})

        result = remove_duplicates(mock_context, frame)

        assert result.shape[0] == 3
        pd.testing.assert_frame_equal(result, frame)

    def test_remove_duplicates_all_duplicates(self, mock_context):
        """Three identical rows reduce to a single row."""
        frame = pd.DataFrame({'A': [1, 1, 1], 'B': ['x', 'x', 'x']})

        result = remove_duplicates(mock_context, frame)

        assert result.shape[0] == 1

    def test_remove_duplicates_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty input yields a result with zero rows and zero columns."""
        result = remove_duplicates(mock_context, empty_dataframe)

        row_count, column_count = result.shape
        assert row_count == 0
        assert column_count == 0

    def test_remove_duplicates_preserves_data_types(self, mock_context):
        """Column dtypes are identical before and after deduplication."""
        frame = pd.DataFrame(
            {
                'int_col': [1, 2, 1],
                'str_col': ['a', 'b', 'a'],
                'float_col': [1.5, 2.5, 1.5],
            }
        )

        result = remove_duplicates(mock_context, frame)

        for column in ('int_col', 'str_col', 'float_col'):
            assert result[column].dtype == frame[column].dtype
|
|
||||||
|
|
||||||
|
|
||||||
class TestFillMissingValues:
    """Unit tests covering the fill_missing_values op."""

    def test_fill_missing_values_basic(self, mock_context, dataframe_with_missing_values):
        """Both configured columns end up without any NaN entries."""
        fill_config = FillMissingConfiguration(fill_map={'Column1': '0', 'Column2': 'N/A'})

        result = fill_missing_values(mock_context, fill_config, dataframe_with_missing_values)

        for column in ('Column1', 'Column2'):
            assert result[column].isna().sum() == 0

    def test_fill_missing_values_with_different_values(self, mock_context):
        """Each column receives the replacement configured for it."""
        frame = pd.DataFrame({'A': [1, None, 3], 'B': [None, 'b', 'c']})
        fill_config = FillMissingConfiguration(fill_map={'A': '-1', 'B': 'UNKNOWN'})

        result = fill_missing_values(mock_context, fill_config, frame)

        assert result.loc[1, 'A'] == '-1'
        assert result.loc[0, 'B'] == 'UNKNOWN'

    def test_fill_missing_values_partial_columns(self, mock_context):
        """Columns absent from ``fill_map`` keep their missing values."""
        frame = pd.DataFrame({'A': [1, None, 3], 'B': [None, 'b', 'c']})
        fill_config = FillMissingConfiguration(fill_map={'A': '999'})

        result = fill_missing_values(mock_context, fill_config, frame)

        assert result.loc[1, 'A'] == '999'
        # 'B' was not configured, so its gap must still be there.
        assert pd.isna(result.loc[0, 'B'])

    def test_fill_missing_values_no_missing(self, mock_context):
        """A frame with no gaps is returned equal to the input."""
        frame = pd.DataFrame({'A': ['1', '2', '3'], 'B': ['a', 'b', 'c']})
        fill_config = FillMissingConfiguration(fill_map={'A': '0'})

        result = fill_missing_values(mock_context, fill_config, frame)

        pd.testing.assert_frame_equal(result, frame)

    def test_fill_missing_values_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame combined with an empty ``fill_map`` yields zero rows."""
        result = fill_missing_values(
            mock_context, FillMissingConfiguration(fill_map={}), empty_dataframe
        )

        assert result.shape[0] == 0
|
|
||||||
|
|
||||||
|
|
||||||
class TestStandardizeCategoricalValues:
    """Unit tests covering the standardize_categorical_values op."""

    def test_standardize_categorical_basic(self, mock_context, sample_dataframe):
        """Selected text columns come back lowercased and stripped."""
        selection = ColumnsSelectConfiguration(columns=['Name', 'City', 'Status'])

        result = standardize_categorical_values(mock_context, selection, sample_dataframe)

        assert result['Name'].iloc[0] == 'john doe'
        assert result['City'].iloc[1] == 'los angeles'
        assert result['Status'].iloc[1] == 'inactive'

    def test_standardize_categorical_single_column(self, mock_context):
        """A single selected column is normalized value by value."""
        frame = pd.DataFrame({'City': [' NEW YORK ', 'LOS ANGELES', ' chicago ']})
        selection = ColumnsSelectConfiguration(columns=['City'])

        result = standardize_categorical_values(mock_context, selection, frame)

        expected_cities = ['new york', 'los angeles', 'chicago']
        for position, city in enumerate(expected_cities):
            assert result['City'].iloc[position] == city

    def test_standardize_categorical_missing_column(self, mock_context, sample_dataframe):
        """An unknown column name is tolerated and the known one is still processed."""
        selection = ColumnsSelectConfiguration(columns=['NonExistent', 'Name'])

        result = standardize_categorical_values(mock_context, selection, sample_dataframe)

        # 'Name' must be handled even though 'NonExistent' is not in the frame.
        assert result['Name'].iloc[0] == 'john doe'

    def test_standardize_categorical_with_missing_values(self, mock_context):
        """A missing entry is expected to come back as an empty string."""
        frame = pd.DataFrame({'Category': [' ACTIVE ', None, ' pending ']})
        selection = ColumnsSelectConfiguration(columns=['Category'])

        result = standardize_categorical_values(mock_context, selection, frame)

        assert result['Category'].iloc[0] == 'active'
        assert result['Category'].iloc[1] == ''
        assert result['Category'].iloc[2] == 'pending'

    def test_standardize_categorical_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame stays empty even when columns are requested."""
        selection = ColumnsSelectConfiguration(columns=['A', 'B'])

        result = standardize_categorical_values(mock_context, selection, empty_dataframe)

        assert result.shape[0] == 0

    def test_standardize_categorical_numeric_columns(self, mock_context):
        """Values of a numeric column come back as strings."""
        frame = pd.DataFrame({'NumCol': [1, 2, 3]})
        selection = ColumnsSelectConfiguration(columns=['NumCol'])

        result = standardize_categorical_values(mock_context, selection, frame)

        first_value = result['NumCol'].iloc[0]
        assert first_value == '1'
        assert isinstance(first_value, str)
|
|
||||||
|
|
||||||
|
|
||||||
class TestCorrectTypos:
    """Unit tests covering the correct_typos op."""

    def test_correct_typos_basic(self, mock_context):
        """Spell checking a three-row column returns three rows."""
        frame = pd.DataFrame({'Name': ['jon', 'jayne', 'bob']})
        spell_config = SpellCheckConfiguration(columns=['Name'], language='en')

        result = correct_typos(mock_context, spell_config, frame)

        # Only the row count is pinned; the corrected text is not asserted.
        assert result.shape[0] == 3

    def test_correct_typos_missing_column(self, mock_context):
        """A column that is not in the frame is skipped and the frame is unchanged."""
        frame = pd.DataFrame({'Name': ['jon', 'jayne']})
        spell_config = SpellCheckConfiguration(columns=['NonExistent'], language='en')

        result = correct_typos(mock_context, spell_config, frame)

        # No error is expected; the output must equal the input.
        pd.testing.assert_frame_equal(result, frame)

    def test_correct_typos_with_missing_values(self, mock_context):
        """An empty string in the checked column is left as an empty string."""
        frame = pd.DataFrame({'Text': ['helo', '', 'wrld']})
        spell_config = SpellCheckConfiguration(columns=['Text'], language='en')

        result = correct_typos(mock_context, spell_config, frame)

        assert result.loc[1, 'Text'] == ''

    def test_correct_typos_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields zero rows."""
        spell_config = SpellCheckConfiguration(columns=['A'], language='en')

        result = correct_typos(mock_context, spell_config, empty_dataframe)

        assert result.shape[0] == 0

    def test_correct_typos_different_languages(self, mock_context):
        """The op runs for each of the 'en', 'es' and 'it' language codes."""
        frame = pd.DataFrame({'Text': ['ciao', 'mondo']})

        for language_code in ('en', 'es', 'it'):
            spell_config = SpellCheckConfiguration(columns=['Text'], language=language_code)
            result = correct_typos(mock_context, spell_config, frame)

            # Completing without an error and keeping both rows is all that is required.
            assert result.shape[0] == 2

    def test_correct_typos_numeric_values(self, mock_context):
        """A numeric column can be passed through the op without losing rows."""
        frame = pd.DataFrame({'Values': [123, 456, 789]})
        spell_config = SpellCheckConfiguration(columns=['Values'], language='en')

        result = correct_typos(mock_context, spell_config, frame)

        assert result.shape[0] == 3
|
|
||||||
|
|
||||||
class TestNormalizeDatetime:
    """Unit tests covering the normalize_datetime op."""

    def test_normalize_datetime_basic(self, mock_context):
        """Parseable timestamps produce a ``<column>_iso`` column in ISO form with a 'Z' suffix."""
        frame = pd.DataFrame(
            {'date_col': ['2023-01-01 10:00:00', '2023-12-31T23:59:59']}
        )
        selection = ColumnsSelectConfiguration(columns=['date_col'])

        result = normalize_datetime(mock_context, selection, frame.copy())

        assert 'date_col_iso' in result.columns
        iso_values = result['date_col_iso']
        assert iso_values.iloc[0] == '2023-01-01T10:00:00Z'
        assert iso_values.iloc[1] == '2023-12-31T23:59:59Z'

    def test_normalize_datetime_missing_column(self, mock_context, sample_dataframe):
        """Selecting a column that does not exist leaves the frame unchanged."""
        selection = ColumnsSelectConfiguration(columns=['non_existent_column'])

        result = normalize_datetime(mock_context, selection, sample_dataframe.copy())

        pd.testing.assert_frame_equal(result, sample_dataframe)

    def test_normalize_datetime_unparseable_values(self, mock_context):
        """A column with no parseable dates gets no ``_iso`` companion column."""
        frame = pd.DataFrame(
            {'invalid_col': ['not-a-date', 'completely-random-text']}
        )
        selection = ColumnsSelectConfiguration(columns=['invalid_col'])

        result = normalize_datetime(mock_context, selection, frame.copy())

        assert 'invalid_col_iso' not in result.columns

    def test_normalize_datetime_mixed_and_nulls(self, mock_context):
        """Valid dates are converted; missing and invalid entries become empty strings."""
        frame = pd.DataFrame({'mixed_col': ['2023-05-01', None, 'invalid-date']})
        selection = ColumnsSelectConfiguration(columns=['mixed_col'])

        result = normalize_datetime(mock_context, selection, frame.copy())

        assert 'mixed_col_iso' in result.columns
        iso_values = result['mixed_col_iso']
        assert iso_values.iloc[0] == '2023-05-01T00:00:00Z'
        assert iso_values.iloc[1] == ""
        assert iso_values.iloc[2] == ""

    def test_normalize_datetime_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame is returned empty."""
        selection = ColumnsSelectConfiguration(columns=['some_col'])

        result = normalize_datetime(mock_context, selection, empty_dataframe)

        assert result.empty

    def test_normalize_datetime_epoch_only(self, mock_context, capsys):
        """Zero-like values must not produce an ``_iso`` column and must emit a warning on stderr."""
        frame = pd.DataFrame({'weird_col': ['0', 0, '0000', '']})
        selection = ColumnsSelectConfiguration(columns=['weird_col'])

        result = normalize_datetime(mock_context, selection, frame.copy())

        assert 'weird_col_iso' not in result.columns

        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text

    def test_normalize_datetime_all_1970_skipped(self, mock_context, capsys):
        """Timestamps that all fall on 1970-01-01 are skipped with a warning on stderr."""
        frame = pd.DataFrame(
            {'ts_col': ['1970-01-01 05:30:00', '1970-01-01 12:00:00']}
        )
        selection = ColumnsSelectConfiguration(columns=['ts_col'])

        result = normalize_datetime(mock_context, selection, frame.copy())

        assert 'ts_col_iso' not in result.columns

        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text

    def test_normalize_datetime_integer_age_column_skipped(self, mock_context, capsys):
        """A small-integer column such as 'age' is skipped with the same stderr warning."""
        frame = pd.DataFrame({'age': [66, 45, 40, 43, 20, 26, 69, 21, 46]})
        selection = ColumnsSelectConfiguration(columns=['age'])

        result = normalize_datetime(mock_context, selection, frame.copy())

        assert 'age_iso' not in result.columns

        stderr_text = capsys.readouterr().err
        assert "all normalized values are '1970-01-01'" in stderr_text
|
|
||||||
|
|
||||||
class TestNormalizeNumericMinMax:
    """Unit tests covering the normalize_numeric_min_max op."""

    def test_normalize_numeric_basic(self, mock_context):
        """Evenly spaced scores are rescaled so that min is 0, max is 1 and the middle is 0.5."""
        frame = pd.DataFrame({'score': [10, 20, 30, 40, 50]})
        selection = ColumnsSelectConfiguration(columns=['score'])

        result = normalize_numeric_min_max(mock_context, selection, frame.copy())

        assert 'score_norm' in result.columns
        normalized = result['score_norm']
        assert normalized.min() == 0.0
        assert normalized.max() == 1.0
        assert normalized.iloc[2] == 0.5

    def test_normalize_numeric_missing_column(self, mock_context):
        """A requested column that is absent produces no ``_norm`` column."""
        frame = pd.DataFrame({'existing': [1, 2, 3]})
        selection = ColumnsSelectConfiguration(columns=['missing_col'])

        result = normalize_numeric_min_max(mock_context, selection, frame.copy())

        assert 'missing_col_norm' not in result.columns

    def test_normalize_numeric_constant_values(self, mock_context):
        """A constant column (min equals max) is skipped: no ``_norm`` column is added."""
        frame = pd.DataFrame({'constant': [10, 10, 10]})
        selection = ColumnsSelectConfiguration(columns=['constant'])

        result = normalize_numeric_min_max(mock_context, selection, frame.copy())

        assert 'constant_norm' not in result.columns

    def test_normalize_numeric_with_nans(self, mock_context):
        """Missing entries stay missing while the remaining values span 0 to 1."""
        frame = pd.DataFrame({'with_nans': [10, None, 50]})
        selection = ColumnsSelectConfiguration(columns=['with_nans'])

        result = normalize_numeric_min_max(mock_context, selection, frame.copy())

        assert 'with_nans_norm' in result.columns
        normalized = result['with_nans_norm']
        assert normalized.iloc[0] == 0.0
        assert normalized.iloc[2] == 1.0
        assert pd.isna(normalized.iloc[1])

    def test_normalize_numeric_multiple_columns(self, mock_context):
        """Every selected column gets its own ``_norm`` companion in a single call."""
        frame = pd.DataFrame({'A': [1, 2], 'B': [10, 20]})
        selection = ColumnsSelectConfiguration(columns=['A', 'B'])

        result = normalize_numeric_min_max(mock_context, selection, frame.copy())

        for expected_column in ('A_norm', 'B_norm'):
            assert expected_column in result.columns
|
|
||||||
|
|
||||||
class TestNormalizeCoordinates:
|
|
||||||
"""Tests for the normalize_coordinates operation."""
|
|
||||||
|
|
||||||
def test_normalize_coordinates_basic(self, mock_context):
    """In-range decimal coordinates are kept and rounded to four decimals."""
    frame = pd.DataFrame({'lat': [45.123456, 46.0], 'lon': [9.123456, 10.0]})
    coords_config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    result = normalize_coordinates(mock_context, coords_config, frame.copy())

    assert result['lat'].iloc[0] == 45.1235
    assert result['lon'].iloc[0] == 9.1235
    assert len(result) == 2
|
|
||||||
|
|
||||||
def test_normalize_coordinates_filtering(self, mock_context):
    """Rows with an out-of-range latitude or longitude are dropped."""
    frame = pd.DataFrame(
        {
            'lat': [45.0, 100.0, -91.0, 0.0],  # 100 and -91 are out of range
            'lon': [9.0, 0.0, 0.0, 200.0],  # 200 is out of range
        }
    )
    coords_config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    result = normalize_coordinates(mock_context, coords_config, frame.copy())

    # Only the first row has both values inside the valid ranges.
    assert len(result) == 1
    assert result['lat'].iloc[0] == 45.0
|
|
||||||
|
|
||||||
def test_normalize_coordinates_invalid_types(self, mock_context):
    """Numeric strings are converted to floats; unparseable or missing latitudes drop the row."""
    frame = pd.DataFrame(
        {
            'lat': ["45.5", "invalid", None],
            'lon': ["9.5", "10.0", "11.0"],
        }
    )
    coords_config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    result = normalize_coordinates(mock_context, coords_config, frame.copy())

    assert len(result) == 1
    assert isinstance(result['lat'].iloc[0], float)
|
|
||||||
|
|
||||||
def test_normalize_coordinates_empty_df(self, mock_context, empty_dataframe):
    """A frame with 'lat'/'lon' columns but no rows comes back empty."""
    # The frame is built locally; the empty_dataframe fixture is requested but not used.
    frame = pd.DataFrame(columns=['lat', 'lon'])
    coords_config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    result = normalize_coordinates(mock_context, coords_config, frame)

    assert len(result) == 0
    assert result.empty
|
|
||||||
|
|
||||||
def test_normalize_coordinates_default_config(self, mock_context):
    """A configuration built without arguments works on frames with 'lat' and 'lon' columns."""
    frame = pd.DataFrame({'lat': [45.123456, 46.0], 'lon': [9.123456, 10.0]})
    coords_config = CoordinatesNormalizationConfiguration()

    result = normalize_coordinates(mock_context, coords_config, frame.copy())

    assert result['lat'].iloc[0] == 45.1235
    assert result['lon'].iloc[0] == 9.1235
    assert len(result) == 2
|
|
||||||
|
|
||||||
def test_normalize_coordinates_null_config_values(self, mock_context):
    """Passing ``None`` for both column names resolves to 'lat' and 'lon'."""
    frame = pd.DataFrame({'lat': [45.123456, 46.0], 'lon': [9.123456, 10.0]})
    coords_config = CoordinatesNormalizationConfiguration(latColumn=None, lonColumn=None)

    # The configuration itself must already expose the fallback names.
    assert coords_config.latColumn == "lat"
    assert coords_config.lonColumn == "lon"

    result = normalize_coordinates(mock_context, coords_config, frame.copy())

    assert result['lat'].iloc[0] == 45.1235
    assert result['lon'].iloc[0] == 9.1235
    assert len(result) == 2
|
|
||||||
|
|
||||||
def test_normalize_coordinates_dms_degree_symbol(self, mock_context):
    """Degree/minute/second strings written with symbols are converted to decimal degrees."""
    frame = pd.DataFrame(
        {
            'lat': ["40°26'46\"N", "51°30'26\"N"],
            'lon': ["79°58'56\"W", "0°7'39\"W"],
        }
    )
    coords_config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    result = normalize_coordinates(mock_context, coords_config, frame.copy())

    assert len(result) == 2
    # 40 deg 26 min 46 sec north is roughly 40.4461.
    assert abs(result['lat'].iloc[0] - 40.4461) < 0.001
    # 79 deg 58 min 56 sec west is roughly -79.9822.
    assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001
|
|
||||||
|
|
||||||
def test_normalize_coordinates_dms_spaced_format(self, mock_context):
    """Space-separated DMS strings such as '40 26 46 N' are converted to decimal degrees."""
    frame = pd.DataFrame({'lat': ["40 26 46 N"], 'lon': ["79 58 56 W"]})
    coords_config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    result = normalize_coordinates(mock_context, coords_config, frame.copy())

    assert len(result) == 1
    assert abs(result['lat'].iloc[0] - 40.4461) < 0.001
    assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001
|
|
||||||
|
|
||||||
def test_normalize_coordinates_dms_already_decimal(self, mock_context):
    """Decimal values supplied as strings come back as the matching numbers."""
    text_frame = pd.DataFrame({"lat": ["45.5", "46.0"], "lon": ["9.5", "10.0"]})
    cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")

    normalized = normalize_coordinates(mock_context, cfg, text_frame.copy())

    assert len(normalized) == 2
    first_row = normalized.iloc[0]
    assert normalized["lat"].iloc[0] == 45.5
    assert normalized["lon"].iloc[0] == 9.5
|
|
||||||
|
|
||||||
def test_normalize_coordinates_dms_mixed_valid_invalid(self, mock_context):
    """A column mixing DMS, decimal and unparseable strings keeps only the parseable rows."""
    latitudes = ["40°26'46\"N", "not_a_coord", "51.5"]
    longitudes = ["79°58'56\"W", "10.0", "0.1"]
    mixed_frame = pd.DataFrame({"lat": latitudes, "lon": longitudes})
    cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")

    normalized = normalize_coordinates(mock_context, cfg, mixed_frame.copy())

    # The row whose latitude is "not_a_coord" is expected to be dropped,
    # leaving two of the three input rows.
    assert len(normalized) == 2
|
|
||||||
|
|
||||||
def test_normalize_coordinates_dms_out_of_range(self, mock_context):
    """Rows whose DMS latitude parses to a value outside [-90, 90] are removed."""
    range_frame = pd.DataFrame({
        "lat": ["91°0'0\"N", "45°0'0\"N"],
        "lon": ["0°0'0\"E", "9°0'0\"E"],
    })
    cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")

    normalized = normalize_coordinates(mock_context, cfg, range_frame.copy())

    # Only the 45° row should remain; the 91° latitude is out of range.
    assert len(normalized) == 1
    remaining_lat = normalized["lat"].iloc[0]
    assert abs(remaining_lat - 45.0) < 0.001
|
|
||||||
|
|
||||||
def test_normalize_coordinates_dms_south_and_east(self, mock_context):
    """Southern latitudes come out negative and eastern longitudes positive."""
    hemisphere_frame = pd.DataFrame({"lat": ["33°51'54\"S"], "lon": ["151°12'36\"E"]})
    cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")

    normalized = normalize_coordinates(mock_context, cfg, hemisphere_frame.copy())

    assert len(normalized) == 1

    # 33°51'54"S ≈ -33.865
    assert normalized["lat"].iloc[0] < 0
    assert abs(normalized["lat"].iloc[0] - (-33.865)) < 0.001

    # 151°12'36"E ≈ 151.21
    assert normalized["lon"].iloc[0] > 0
    assert abs(normalized["lon"].iloc[0] - 151.21) < 0.01
|
|
||||||
|
|
||||||
def test_normalize_coordinates_autodetect_numeric_vs_dms(self, mock_context):
    """Run the same configuration against a numeric frame and a DMS-string frame.

    The numeric frame must come back rounded (45.123456 -> 45.1235) with both
    rows kept; the DMS frame must come back as decimal degrees.
    """
    config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon')

    # Numeric columns — presumably handled by direct numeric coercion.
    df_numeric = pd.DataFrame({
        'lat': [45.123456, 46.0],
        'lon': [9.123456, 10.0],
    })
    result_numeric = normalize_coordinates(mock_context, config, df_numeric.copy())

    # Compare with a tight tolerance instead of `==`: the value is the result
    # of a floating-point rounding step, and 1e-9 is still far smaller than the
    # gap to the unrounded input, so the rounding itself remains verified.
    assert abs(result_numeric['lat'].iloc[0] - 45.1235) < 1e-9
    assert len(result_numeric) == 2

    # String DMS columns — presumably handled by the DMS parser.
    df_dms = pd.DataFrame({
        'lat': ["40°26'46\"N"],
        'lon': ["79°58'56\"W"],
    })
    result_dms = normalize_coordinates(mock_context, config, df_dms.copy())

    assert len(result_dms) == 1
    assert abs(result_dms['lat'].iloc[0] - 40.4461) < 0.001
|
|
||||||
|
|
||||||
class TestAddGlobalAggregations:
    """Behavioural tests for the add_global_aggregations operation."""

    def test_add_global_aggregations_success(self, mock_context):
        """Grouping on one column sums the numeric column and drops the text column."""
        source = pd.DataFrame({
            "category": ["A", "A", "B"],
            "value": [10, 20, 100],
            "ignored_str": ["x", "y", "z"],
        })
        cfg = AggregationConfiguration(columns=["category"], operation="sum")

        aggregated = add_global_aggregations(mock_context, cfg, source.copy())

        assert len(aggregated) == 2
        for label, expected_total in (("A", 30), ("B", 100)):
            group_mask = aggregated["category"] == label
            assert aggregated.loc[group_mask, "value"].values[0] == expected_total
        assert "ignored_str" not in aggregated.columns
        mock_context.log.info.assert_called()

    def test_add_global_aggregations_missing_column(self, mock_context):
        """A grouping column absent from the frame is skipped with a warning."""
        source = pd.DataFrame({"value": [1, 2, 3]})
        cfg = AggregationConfiguration(columns=["missing_col"], operation="count")

        aggregated = add_global_aggregations(mock_context, cfg, source.copy())

        mock_context.log.warning.assert_any_call("Column 'missing_col' not found, skipping aggregation.")
        assert len(aggregated) == 1

    def test_add_global_aggregations_unsupported_op(self, mock_context):
        """An unknown operation name raises and is reported through a warning."""
        source = pd.DataFrame({"category": ["A"], "value": [1]})
        cfg = AggregationConfiguration(columns=["category"], operation="unsupported")

        with pytest.raises(Exception):
            add_global_aggregations(mock_context, cfg, source.copy())

        # Checked after the raising call so that the assertion is actually executed.
        mock_context.log.warning.assert_any_call("Unsupported aggregation 'unsupported'")

    def test_add_global_aggregations_only_numeric_kept(self, mock_context):
        """Only the grouping column and numeric columns remain in the output."""
        source = pd.DataFrame({
            "group": ["A", "A"],
            "num": [1, 2],
            "text": ["hello", "world"],
        })
        cfg = AggregationConfiguration(columns=["group"], operation="mean")

        aggregated = add_global_aggregations(mock_context, cfg, source.copy())

        assert "text" not in aggregated.columns
        for kept in ("num", "group"):
            assert kept in aggregated.columns
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from pydantic import ValidationError
|
|
||||||
|
|
||||||
from template_code_location.dataframe_level_anonymisation.config_models.base_config import BaseConfiguration
|
|
||||||
|
|
||||||
|
|
||||||
def test_valid_configuration_with_overrides():
    """Explicitly supplied field values are stored unchanged on the model."""
    overrides = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 10.0,
        "generalisation_hierarchies": {"age": "age_hierarchy"},
    }

    cfg = BaseConfiguration(**overrides)

    assert cfg.ident == ["id"]
    assert cfg.quasi_identifiers == ["age"]
    assert cfg.supp_level == 10.0
    assert cfg.generalisation_hierarchies == {"age": "age_hierarchy"}
|
|
||||||
|
|
||||||
|
|
||||||
def test_default_values_are_loaded():
    """A BaseConfiguration built without arguments exposes the expected defaults."""
    defaults = BaseConfiguration()

    expected = (
        ("ident", ["Name"]),
        ("quasi_identifiers", ["Age"]),
        ("supp_level", 50.0),
        ("generalisation_hierarchies", {"Age": "simpl_age"}),
    )
    for field_name, value in expected:
        assert getattr(defaults, field_name) == value
|
|
||||||
|
|
||||||
|
|
||||||
def test_missing_ident_raises_error():
    """An empty ident list is rejected with a ValidationError."""
    empty_identifiers = []
    with pytest.raises(ValidationError):
        BaseConfiguration(ident=empty_identifiers)
|
|
||||||
|
|
||||||
|
|
||||||
def test_missing_quasi_ident_raises_error():
    """An empty quasi_identifiers list is rejected with a ValidationError."""
    empty_quasi_identifiers = []
    with pytest.raises(ValidationError):
        BaseConfiguration(quasi_identifiers=empty_quasi_identifiers)
|
|
||||||
|
|
||||||
|
|
||||||
def test_overlap_between_ident_and_quasi_identifiers():
    """Listing the same column as identifier and quasi-identifier is rejected."""
    shared_column = "age"
    with pytest.raises(ValidationError):
        BaseConfiguration(ident=[shared_column], quasi_identifiers=[shared_column])
|
|
||||||
|
|
||||||
|
|
||||||
def test_supp_level_bounds():
    """A supp_level of 150.0 is rejected with a ValidationError."""
    too_high = 150.0  # out of range
    with pytest.raises(ValidationError):
        BaseConfiguration(supp_level=too_high)
|
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import (
|
|
||||||
simpl_age,
|
|
||||||
simpl_age2,
|
|
||||||
simpl_gender,
|
|
||||||
get_all_hierarchies,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_simpl_age_structure():
    """simpl_age is a dict whose level 0 lists the ages 0 through 99."""
    assert isinstance(simpl_age, dict)
    assert 0 in simpl_age

    base_level = simpl_age[0]
    assert isinstance(base_level, list)
    # The first level holds exactly 100 ages, starting at 0 and ending at 99.
    assert len(base_level) == 100
    assert base_level[0] == 0
    assert base_level[-1] == 99
|
|
||||||
|
|
||||||
|
|
||||||
def test_simpl_age2_structure():
    """simpl_age2 is a dict providing list-valued levels 0 and 1."""
    assert isinstance(simpl_age2, dict)
    for level in (0, 1):
        assert level in simpl_age2
    for level in (0, 1):
        assert isinstance(simpl_age2[level], list)
|
|
||||||
|
|
||||||
|
|
||||||
def test_simpl_gender_structure():
    """simpl_gender maps level 0 to the raw codes and level 1 to fully masked values."""
    assert isinstance(simpl_gender, dict)
    assert 0 in simpl_gender
    assert 1 in simpl_gender

    raw_codes = simpl_gender[0]
    masked_codes = simpl_gender[1]
    assert raw_codes == ["M", "F", "O"]
    assert masked_codes == ["*", "*", "*"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_all_hierarchies():
    """get_all_hierarchies returns a name -> hierarchy dict holding the module-level objects."""
    registry = get_all_hierarchies()

    # The registry itself must be a dict.
    assert isinstance(registry, dict)

    # Every known hierarchy must be registered under its own name.
    for hierarchy_name in ("simpl_age", "simpl_age2", "simpl_gender"):
        assert hierarchy_name in registry

    # Entries are the very same objects as the imported hierarchies, not copies.
    assert registry["simpl_age"] is simpl_age
    assert registry["simpl_gender"] is simpl_gender
|
|
||||||
@@ -1,41 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from pydantic import ValidationError
|
|
||||||
|
|
||||||
from template_code_location.dataframe_level_anonymisation.config_models.k_anonymity_configuration import (
|
|
||||||
KAnonymityConfiguration,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_valid_k_anonymity_config_with_overrides():
    """Explicit k, sensitive attributes and hierarchies are kept as given."""
    hierarchies = {"age": "age_hier"}
    sensitive = ["disease"]

    cfg = KAnonymityConfiguration(
        ident=["id"],
        quasi_identifiers=["age"],
        supp_level=5.0,
        generalisation_hierarchies=hierarchies,
        k=3,
        sensitive_attributes=sensitive,
    )

    assert cfg.k == 3
    assert cfg.sensitive_attributes == ["disease"]
    assert cfg.generalisation_hierarchies == {"age": "age_hier"}
|
|
||||||
|
|
||||||
|
|
||||||
def test_default_values_are_loaded():
    """Omitting k and sensitive_attributes yields the defaults 3 and ["Disease"]."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }

    cfg = KAnonymityConfiguration(**required_only)

    assert cfg.k == 3
    assert cfg.sensitive_attributes == ["Disease"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_invalid_k_value_raises_error():
    """k=1 is rejected with a ValidationError."""
    invalid_k = 1  # invalid, must be >= 2
    with pytest.raises(ValidationError):
        KAnonymityConfiguration(
            ident=["id"],
            quasi_identifiers=["age"],
            generalisation_hierarchies={"age": "age_hier"},
            k=invalid_k,
            sensitive_attributes=["disease"],
        )
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from pydantic import ValidationError
|
|
||||||
|
|
||||||
from template_code_location.dataframe_level_anonymisation.config_models.l_diversity_configuration import (
|
|
||||||
LDiversityConfiguration,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_valid_l_diversity_config_with_overrides():
    """Explicit k, l and sensitive_attribute values are kept as given."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "l": 2,
        "sensitive_attribute": "disease",
    }

    cfg = LDiversityConfiguration(**settings)

    assert cfg.k == 3
    assert cfg.l == 2
    assert cfg.sensitive_attribute == "disease"
|
|
||||||
|
|
||||||
|
|
||||||
def test_default_values_are_loaded():
    """Omitting k, l and sensitive_attribute yields the defaults 2, 3 and "Disease"."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }

    cfg = LDiversityConfiguration(**required_only)

    assert cfg.k == 2
    assert cfg.l == 3
    assert cfg.sensitive_attribute == "Disease"
|
|
||||||
|
|
||||||
|
|
||||||
def test_invalid_l_value_raises_error():
    """l=0 is rejected with a ValidationError."""
    invalid_l = 0  # invalid, must be >= 1
    with pytest.raises(ValidationError):
        LDiversityConfiguration(
            ident=["id"],
            quasi_identifiers=["age"],
            generalisation_hierarchies={"age": "age_hier"},
            k=3,
            l=invalid_l,
            sensitive_attribute="disease",
        )
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from pydantic import ValidationError
|
|
||||||
|
|
||||||
from template_code_location.dataframe_level_anonymisation.config_models.t_closeness_configuration import (
|
|
||||||
TClosenessConfiguration,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_valid_t_closeness_config_with_overrides():
    """Explicit k, t and sensitive_attribute values are kept as given."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "supp_level": 5.0,
        "generalisation_hierarchies": {"age": "age_hier"},
        "k": 3,
        "t": 0.4,
        "sensitive_attribute": "disease",
    }

    cfg = TClosenessConfiguration(**settings)

    assert cfg.k == 3
    assert cfg.t == 0.4
    assert cfg.sensitive_attribute == "disease"
|
|
||||||
|
|
||||||
|
|
||||||
def test_default_values_are_loaded():
    """Omitting k, t and sensitive_attribute yields the defaults 2, 0.5 and "Disease"."""
    required_only = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "generalisation_hierarchies": {"age": "age_hier"},
    }

    cfg = TClosenessConfiguration(**required_only)

    assert cfg.k == 2
    assert cfg.t == 0.5
    assert cfg.sensitive_attribute == "Disease"
|
|
||||||
|
|
||||||
|
|
||||||
def test_invalid_t_value_low():
    """A negative t is rejected with a ValidationError."""
    negative_t = -0.1  # invalid
    with pytest.raises(ValidationError):
        TClosenessConfiguration(
            ident=["id"],
            quasi_identifiers=["age"],
            generalisation_hierarchies={"age": "age_hier"},
            k=3,
            t=negative_t,
            sensitive_attribute="disease",
        )
|
|
||||||
|
|
||||||
|
|
||||||
def test_invalid_t_value_high():
    """A t greater than 1 is rejected with a ValidationError."""
    oversized_t = 2.0  # invalid > 1
    with pytest.raises(ValidationError):
        TClosenessConfiguration(
            ident=["id"],
            quasi_identifiers=["age"],
            generalisation_hierarchies={"age": "age_hier"},
            k=3,
            t=oversized_t,
            sensitive_attribute="disease",
        )
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
from template_code_location.dataframe_level_anonymisation.jobs import (
|
|
||||||
k_anonymity_job,
|
|
||||||
l_diversity_job,
|
|
||||||
t_closeness_job,
|
|
||||||
k_anonymity_job_s3,
|
|
||||||
l_diversity_job_s3,
|
|
||||||
t_closeness_job_s3
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_k_anonymity_job_is_callable():
    """k_anonymity_job is callable and exposes execute_in_process."""
    job = k_anonymity_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_l_diversity_job_is_callable():
    """l_diversity_job is callable and exposes execute_in_process."""
    job = l_diversity_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_t_closeness_job_is_callable():
    """t_closeness_job is callable and exposes execute_in_process."""
    job = t_closeness_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_k_anonymity_job_s3_is_callable():
    """k_anonymity_job_s3 is callable and exposes execute_in_process."""
    job = k_anonymity_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_l_diversity_job_s3_is_callable():
    """l_diversity_job_s3 is callable and exposes execute_in_process."""
    job = l_diversity_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_t_closeness_job_s3_is_callable():
    """t_closeness_job_s3 is callable and exposes execute_in_process."""
    job = t_closeness_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
@@ -1,230 +0,0 @@
|
|||||||
import pytest
|
|
||||||
import pandas as pd
|
|
||||||
from unittest.mock import patch
|
|
||||||
from dagster import DagsterInvalidInvocationError, build_op_context
|
|
||||||
|
|
||||||
from template_code_location.dataframe_level_anonymisation.ops import (
|
|
||||||
apply_k_anonymity,
|
|
||||||
apply_l_diversity,
|
|
||||||
apply_t_closeness,
|
|
||||||
)
|
|
||||||
from template_code_location.dataframe_level_anonymisation.config_models import (
|
|
||||||
KAnonymityConfiguration,
|
|
||||||
LDiversityConfiguration,
|
|
||||||
TClosenessConfiguration,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Fixtures
|
|
||||||
# ---------------------------
|
|
||||||
@pytest.fixture
def fake_df():
    """Two-row frame with an ``id`` and an ``age`` column, used as op input."""
    columns = {"id": [1, 2], "age": [30, 40]}
    return pd.DataFrame(columns)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def k_config():
    """k-anonymity configuration (k=2, no suppression) over the ``age`` column."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attributes": ["age"],
        "k": 2,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return KAnonymityConfiguration(**settings)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def l_config():
    """l-diversity configuration (k=2, l=1, no suppression) over the ``age`` column."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "l": 1,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return LDiversityConfiguration(**settings)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def t_config():
    """t-closeness configuration (k=2, t=0.5, no suppression) over the ``age`` column."""
    settings = {
        "ident": ["id"],
        "quasi_identifiers": ["age"],
        "sensitive_attribute": "age",
        "k": 2,
        "t": 0.5,
        "supp_level": 0.0,
        "generalisation_hierarchies": {"age": "simpl_age"},
    }
    return TClosenessConfiguration(**settings)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def op_context():
    """Default Dagster op context created with ``build_op_context()``."""
    context = build_op_context()
    return context
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Helper for patching external functions
|
|
||||||
# ---------------------------
|
|
||||||
@pytest.fixture(autouse=True)
def patch_external_ops():
    """Replace the hierarchy lookup and the three anonymisation routines for every test.

    The hierarchy lookup returns a single ``simpl_age`` hierarchy and each
    anonymisation routine returns its own fixed two-row frame.

    NOTE(review): the patch targets use the ``dataframe_level_anonymisation.ops``
    path while the ops under test are imported from
    ``template_code_location.dataframe_level_anonymisation.ops`` — confirm both
    names resolve to the same module object.
    """

    def _stub_frame():
        # A fresh frame per patch so the stubs never share one object.
        return pd.DataFrame({"id": [1, 2], "age": [30, 40]})

    hierarchies_stub = {"simpl_age": {0: [30, 40]}}

    with (
        patch("dataframe_level_anonymisation.ops.get_all_hierarchies", return_value=hierarchies_stub),
        patch("dataframe_level_anonymisation.ops.k_anonymity", return_value=_stub_frame()),
        patch("dataframe_level_anonymisation.ops.l_diversity", return_value=_stub_frame()),
        patch("dataframe_level_anonymisation.ops.t_closeness", return_value=_stub_frame()),
    ):
        yield
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Tests for apply_k_anonymity
|
|
||||||
# ---------------------------
|
|
||||||
def test_apply_k_anonymity_outputs(op_context, k_config, fake_df):
    """apply_k_anonymity yields two outputs: a DataFrame, then a metrics dict."""
    outputs = list(apply_k_anonymity(op_context, k_config, fake_df))
    assert len(outputs) == 2

    frame_value = outputs[0].value
    metrics_value = outputs[1].value

    # Type checks first, then the metric keys the report must contain.
    assert isinstance(frame_value, pd.DataFrame)
    assert isinstance(metrics_value, dict)
    for metric_key in ("k_anon", "l_div", "t_clos"):
        assert metric_key in metrics_value
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Tests for apply_l_diversity
|
|
||||||
# ---------------------------
|
|
||||||
def test_apply_l_diversity_outputs(op_context, l_config, fake_df):
    """apply_l_diversity yields two outputs: a DataFrame, then a metrics dict."""
    outputs = list(apply_l_diversity(op_context, l_config, fake_df))
    assert len(outputs) == 2

    frame_value = outputs[0].value
    metrics_value = outputs[1].value

    assert isinstance(frame_value, pd.DataFrame)
    assert isinstance(metrics_value, dict)
    for metric_key in ("k_anon", "l_div", "t_clos"):
        assert metric_key in metrics_value
|
|
||||||
|
|
||||||
|
|
||||||
def test_apply_l_diversity_empty_raises(op_context, l_config):
    """An empty frame returned by l_diversity makes the op raise DagsterInvalidInvocationError."""
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    empty_result = pd.DataFrame()

    with patch("dataframe_level_anonymisation.ops.l_diversity", return_value=empty_result), \
            pytest.raises(DagsterInvalidInvocationError):
        list(apply_l_diversity(op_context, l_config, single_row))
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Tests for apply_t_closeness
|
|
||||||
# ---------------------------
|
|
||||||
def test_apply_t_closeness_outputs(op_context, t_config, fake_df):
    """apply_t_closeness yields two outputs: a DataFrame, then a metrics dict."""
    outputs = list(apply_t_closeness(op_context, t_config, fake_df))
    assert len(outputs) == 2

    frame_value = outputs[0].value
    metrics_value = outputs[1].value

    assert isinstance(frame_value, pd.DataFrame)
    assert isinstance(metrics_value, dict)
    for metric_key in ("k_anon", "l_div", "t_clos"):
        assert metric_key in metrics_value
|
|
||||||
|
|
||||||
|
|
||||||
def test_apply_t_closeness_empty_raises(op_context, t_config):
    """An empty frame returned by t_closeness makes the op raise DagsterInvalidInvocationError."""
    single_row = pd.DataFrame({"id": [1], "age": [30]})
    empty_result = pd.DataFrame()

    with patch("dataframe_level_anonymisation.ops.t_closeness", return_value=empty_result), \
            pytest.raises(DagsterInvalidInvocationError):
        list(apply_t_closeness(op_context, t_config, single_row))
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Additional tests for _validate_and_get_hierarchies
|
|
||||||
# ---------------------------
|
|
||||||
def test_validate_hierarchies_dataset_too_small(k_config):
    """A one-row frame validated against the k=2 configuration is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    one_row = pd.DataFrame({"id": [1], "age": [30]})

    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, one_row)
|
|
||||||
|
|
||||||
|
|
||||||
def test_validate_hierarchies_missing_hierarchy(k_config, fake_df):
    """A configuration with an empty generalisation_hierarchies mapping is rejected."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    # Copy the fixture config, replacing its hierarchies with an empty mapping.
    without_hierarchies = k_config.model_copy(update={"generalisation_hierarchies": {}})

    with pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(without_hierarchies, fake_df)
|
|
||||||
|
|
||||||
|
|
||||||
def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df):
    """Validation fails when get_all_hierarchies is patched to return no hierarchies."""
    from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies

    no_hierarchies = {}
    with patch("dataframe_level_anonymisation.ops.get_all_hierarchies", return_value=no_hierarchies), \
            pytest.raises(DagsterInvalidInvocationError):
        _validate_and_get_hierarchies(k_config, fake_df)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Additional tests for _calc_dataframe_metrics
|
|
||||||
# ---------------------------
|
|
||||||
def test_calc_dataframe_metrics_basic():
    """_calc_dataframe_metrics returns a report mentioning k-anonymity plus the patched metric values."""
    from template_code_location.dataframe_level_anonymisation.ops import _calc_dataframe_metrics

    original = pd.DataFrame({"age": [30, 40], "id": [1, 2]})
    anonymised = original.copy()

    # Each metric function is stubbed with a fixed value so the assertions below are exact.
    with (
        patch("dataframe_level_anonymisation.ops.anonymity.k_anonymity", return_value=2),
        patch("dataframe_level_anonymisation.ops.anonymity.l_diversity", return_value=1),
        patch("dataframe_level_anonymisation.ops.anonymity.t_closeness", return_value=0.1),
    ):
        report, metrics = _calc_dataframe_metrics(anonymised, original, ["age"], ["age"])

    assert "k-anonymity" in report
    for metric_key, stubbed_value in (("k_anon", 2), ("l_div", 1), ("t_clos", 0.1)):
        assert metrics[metric_key] == stubbed_value
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------
|
|
||||||
# Tests for apply_t_closeness exception branches
|
|
||||||
# ---------------------------
|
|
||||||
def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df):
    """A ValueError mentioning 'Cannot be quasi-identifiers' surfaces as DagsterInvalidInvocationError."""
    failure = ValueError("Cannot be quasi-identifiers invalid")

    with patch("dataframe_level_anonymisation.ops.t_closeness", side_effect=failure), \
            pytest.raises(DagsterInvalidInvocationError):
        list(apply_t_closeness(op_context, t_config, fake_df))
|
|
||||||
|
|
||||||
|
|
||||||
def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df):
    """A ValueError with any other message also surfaces as DagsterInvalidInvocationError."""
    failure = ValueError("Some other error")

    with patch("dataframe_level_anonymisation.ops.t_closeness", side_effect=failure), \
            pytest.raises(DagsterInvalidInvocationError):
        list(apply_t_closeness(op_context, t_config, fake_df))
|
|
||||||
@@ -1,70 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
|
|
||||||
from template_code_location.dataframe_level_anonymisation.utils import (
|
|
||||||
parse_value_list,
|
|
||||||
normalize_hierarchy_levels,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------
|
|
||||||
# Tests for parse_value_list
|
|
||||||
# ------------------------------------
|
|
||||||
def test_parse_value_list_all_strings_digits():
    """Digit-only strings are all converted to integers."""
    parsed = parse_value_list(["1", "2", "3"])
    assert parsed == [1, 2, 3]
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_value_list_mixed_values():
    """Digit strings become ints while existing ints and non-digit strings pass through."""
    parsed = parse_value_list(["1", 2, "abc", "5"])
    assert parsed == [1, 2, "abc", 5]
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_value_list_no_digits():
    """Non-digit strings are returned unchanged."""
    parsed = parse_value_list(["a", "b", "c"])
    assert parsed == ["a", "b", "c"]
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------
|
|
||||||
# Tests for normalize_hierarchy_levels
|
|
||||||
# ------------------------------------
|
|
||||||
def test_normalize_hierarchy_levels_level_0_converted_to_numpy_array():
    """Level keys become ints; level 0 becomes a NumPy array of parsed values, level 1 stays a list."""
    raw = {"age": {"0": ["1", "2", "3"], "1": ["0-10", "11-20"]}}

    normalized = normalize_hierarchy_levels(raw)

    assert "age" in normalized
    age_levels = normalized["age"]
    assert 0 in age_levels
    assert isinstance(age_levels[0], np.ndarray)
    assert age_levels[0].tolist() == [1, 2, 3]  # digit strings converted to ints
    assert age_levels[1] == ["0-10", "11-20"]  # higher level left untouched
|
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_hierarchy_levels_multiple_columns():
    """Each column of the hierarchy mapping is normalised independently."""
    raw = {"age": {"0": ["10", "20"]}, "gender": {"0": ["M", "F"], "1": ["*"]}}

    normalized = normalize_hierarchy_levels(raw)

    # Numeric column: level 0 is an array of ints.
    age_base = normalized["age"][0]
    assert isinstance(age_base, np.ndarray)
    assert age_base.tolist() == [10, 20]

    # Text column: level 0 is an array of the original strings, level 1 stays a list.
    gender_base = normalized["gender"][0]
    assert isinstance(gender_base, np.ndarray)
    assert gender_base.tolist() == ["M", "F"]
    assert normalized["gender"][1] == ["*"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_hierarchy_levels_mixed_digit_non_digit_at_level_0():
    """A level 0 mixing digit and non-digit strings ends up as an array of strings."""
    raw = {"test": {"0": ["1", "x", "3"]}}

    normalized = normalize_hierarchy_levels(raw)

    base_level = normalized["test"][0]
    assert isinstance(base_level, np.ndarray)
    # Presumably the ints produced by parsing are coerced back to strings when
    # NumPy builds one array from mixed int/str values — hence "1" and "3" here.
    assert base_level.tolist() == ["1", "x", "3"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_hierarchy_levels_empty_mapping():
    """A column with no levels is returned with an empty level mapping."""
    raw = {"col": {}}

    assert normalize_hierarchy_levels(raw) == {"col": {}}
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
|
|
||||||
@@ -1,444 +0,0 @@
|
|||||||
"""
|
|
||||||
Shared pytest fixtures and helpers for field-level pseudonymisation tests.
|
|
||||||
|
|
||||||
This module provides:
|
|
||||||
- Mock Vault client for testing without real Vault connections
|
|
||||||
- Sample data fixtures
|
|
||||||
- Configuration fixtures for encryption/decryption operations
|
|
||||||
- Helper functions for running ops and managing test Vault storage
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import pytest
|
|
||||||
from dagster import build_op_context
|
|
||||||
from cryptography.fernet import Fernet
|
|
||||||
from hvac.exceptions import InvalidPath, Forbidden
|
|
||||||
from unittest.mock import patch, MagicMock
|
|
||||||
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
|
|
||||||
AnonymisePseudonymizeStructuredConfig,
|
|
||||||
DepseudonymizeStructuredConfig,
|
|
||||||
EncryptConfig,
|
|
||||||
DecryptConfig,
|
|
||||||
PseudoTechniqueConfig,
|
|
||||||
DepseudoTechniqueConfig,
|
|
||||||
)
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.ops import (
|
|
||||||
anonymize_pseudonymize_structured,
|
|
||||||
depseudonymize_structured,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------- Mock Vault Storage ----------------------------------------
|
|
||||||
|
|
||||||
# In-memory Vault simulation for tests
|
|
||||||
_test_vault_storage = {}
|
|
||||||
_test_vault_access_control = {} # For simulating access control
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
def mock_vault_client():
    """Patch ``hvac.Client`` with an in-memory fake for the duration of each test.

    Secrets live in the module-level ``_test_vault_storage`` dict and per-secret
    access flags in ``_test_vault_access_control``; both are rebound to fresh
    dicts before every test. The KV v2 read, create/update and delete calls of
    the mocked client are routed to the local functions below. Yields the
    ``MagicMock`` instance that ``hvac.Client(...)`` returns.
    """
    global _test_vault_storage, _test_vault_access_control
    # Start every test from an empty vault and an empty access-control table.
    _test_vault_storage, _test_vault_access_control = {}, {}

    def mock_read_secret(path, mount_point):
        """Return the stored secret in KV v2 response shape, honouring access control."""
        secret_key = f"{mount_point}/{path}"

        # Access control is evaluated before existence: an entry with a falsy
        # flag denies access; a missing entry means access is allowed.
        if not _test_vault_access_control.get(secret_key, True):
            raise Forbidden(f"Access denied to secret: {secret_key}")

        if secret_key not in _test_vault_storage:
            raise InvalidPath(f"Secret not found: {secret_key}")

        payload = {"value": _test_vault_storage[secret_key]}
        return {"data": {"data": payload}}

    def mock_create_or_update_secret(path, mount_point, secret):
        """Store ``secret["value"]`` under ``mount_point/path``."""
        secret_key = f"{mount_point}/{path}"
        _test_vault_storage[secret_key] = secret["value"]

    def mock_delete_metadata(path, mount_point):
        """Remove the secret and its access-control entry, if present."""
        secret_key = f"{mount_point}/{path}"
        _test_vault_storage.pop(secret_key, None)
        _test_vault_access_control.pop(secret_key, None)

    with patch("hvac.Client") as client_factory:
        fake_client = MagicMock()
        kv_v2 = fake_client.secrets.kv.v2
        kv_v2.read_secret_version.side_effect = mock_read_secret
        kv_v2.create_or_update_secret.side_effect = mock_create_or_update_secret
        kv_v2.delete_metadata_and_all_versions.side_effect = mock_delete_metadata
        client_factory.return_value = fake_client
        yield fake_client
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------- Sample Data Fixtures ----------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def sample_df():
    """
    Return a small structured dataset containing PII columns.

    Five rows with ``id``, ``name``, ``email``, ``ssn``, ``age``, ``salary``
    and ``department`` columns, used as input for the pseudonymisation and
    restoration tests.
    """
    names = [
        "Alice Smith",
        "Bob Jones",
        "Charlie Brown",
        "David Wilson",
        "Eva Garcia",
    ]
    emails = [
        "alice@example.com",
        "bob@example.com",
        "charlie@example.com",
        "david@example.com",
        "eva@example.com",
    ]
    ssns = [
        "123-45-6789",
        "234-56-7890",
        "345-67-8901",
        "456-78-9012",
        "567-89-0123",
    ]
    columns = {
        "id": [1, 2, 3, 4, 5],
        "name": names,
        "email": emails,
        "ssn": ssns,
        "age": [25, 30, 35, 40, 45],
        "salary": [50000.0, 60000.0, 70000.0, 80000.0, 90000.0],
        "department": ["HR", "IT", "Finance", "IT", "HR"],
    }
    return pd.DataFrame(columns)
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------- Configuration Fixtures ----------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def encrypt_config_single_field():
    """
    Build a pseudonymisation config that encrypts only the ``email`` column.

    Uses the key name ``test_restoration_key_single``; the result is the input
    for the single-field restoration tests.
    """
    encrypt_email = EncryptConfig(
        type="encrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    return AnonymisePseudonymizeStructuredConfig(
        used_function=[PseudoTechniqueConfig(technique=encrypt_email)]
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def decrypt_config_single_field():
    """
    Build a de-pseudonymisation config that decrypts only the ``email`` column.

    Uses the key name ``test_restoration_key_single`` to restore the original
    values.
    """
    decrypt_email = DecryptConfig(
        type="decrypt",
        columns=["email"],
        key_name="test_restoration_key_single",
    )
    return DepseudonymizeStructuredConfig(
        used_function=[DepseudoTechniqueConfig(technique=decrypt_email)]
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def encrypt_config_multiple_fields():
    """
    Build a pseudonymisation config that encrypts ``name``, ``email`` and ``ssn``.

    All three columns share the key name ``test_restoration_key_multi``; used
    to test restoration of several sensitive fields at once.
    """
    encrypt_many = EncryptConfig(
        type="encrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    return AnonymisePseudonymizeStructuredConfig(
        used_function=[PseudoTechniqueConfig(technique=encrypt_many)]
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def decrypt_config_multiple_fields():
    """
    Build a de-pseudonymisation config that decrypts ``name``, ``email`` and ``ssn``.

    Uses the key name ``test_restoration_key_multi``.
    """
    decrypt_many = DecryptConfig(
        type="decrypt",
        columns=["name", "email", "ssn"],
        key_name="test_restoration_key_multi",
    )
    return DepseudonymizeStructuredConfig(
        used_function=[DepseudoTechniqueConfig(technique=decrypt_many)]
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def encrypt_config_partial_fields():
    """
    Build a pseudonymisation config that encrypts only ``email`` and ``ssn``.

    Uses the key name ``test_restoration_key_partial``; supports the partial
    restoration scenarios.
    """
    encrypt_some = EncryptConfig(
        type="encrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    return AnonymisePseudonymizeStructuredConfig(
        used_function=[PseudoTechniqueConfig(technique=encrypt_some)]
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def decrypt_config_partial_fields():
    """
    Build a de-pseudonymisation config that decrypts only ``email`` and ``ssn``.

    Uses the key name ``test_restoration_key_partial``.
    """
    decrypt_some = DecryptConfig(
        type="decrypt",
        columns=["email", "ssn"],
        key_name="test_restoration_key_partial",
    )
    return DepseudonymizeStructuredConfig(
        used_function=[DepseudoTechniqueConfig(technique=decrypt_some)]
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def authorized_multi_key_scenario():
    """
    Prepare two Vault keys for multi-key authorization tests.

    ``authorized_key`` is stored and readable; ``unauthorized_key`` is stored
    but has read access denied. Both keys are removed again after the test.

    Yields:
        dict: ``{"authorized": "authorized_key", "unauthorized": "unauthorized_key"}``
    """
    allowed_name, denied_name = "authorized_key", "unauthorized_key"

    # Make sure no earlier state for these names survives.
    clear_vault_key(allowed_name)
    clear_vault_key(denied_name)

    # Key that the test is permitted to read.
    set_vault_key(allowed_name, Fernet.generate_key().decode())

    # Key that exists in storage but whose reads are refused.
    set_vault_key(denied_name, Fernet.generate_key().decode())
    deny_vault_access(denied_name)

    yield {"authorized": "authorized_key", "unauthorized": "unauthorized_key"}

    # Teardown: drop both keys (and the deny entry) again.
    clear_vault_key(allowed_name)
    clear_vault_key(denied_name)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def large_dataset():
    """
    Return a 10,000-row DataFrame for performance tests.

    Every column is derived from the row number ``i`` (1..10000), so the
    content is deterministic and can be shared by several performance tests.
    """
    row_ids = range(1, 10001)
    departments = ["HR", "IT", "Finance", "Sales"]
    columns = {
        "id": row_ids,
        "email": [f"user{i}@example.com" for i in row_ids],
        "name": [f"User {i}" for i in row_ids],
        "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in row_ids],
        "age": [20 + (i % 50) for i in row_ids],
        "salary": [30000.0 + (i * 10) for i in row_ids],
        "department": [departments[i % 4] for i in row_ids],
    }
    return pd.DataFrame(columns)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
def vault_test_keys():
    """
    Generate ten Fernet keys once per test session.

    Returns:
        dict: ``test_key_0`` .. ``test_key_9`` mapped to freshly generated
        Fernet keys decoded to ``str``, so tests need not generate their own.
    """
    keys = {}
    for index in range(10):
        keys[f"test_key_{index}"] = Fernet.generate_key().decode()
    return keys
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def cleanup_test_keys(request):
    """
    Remove test keys from the mock Vault once the test has finished.

    Opt in with ``@pytest.mark.usefixtures("cleanup_test_keys")``. After the
    test, every entry of ``_test_vault_storage`` whose path contains
    ``"test_"`` is dropped.
    """
    yield

    # Iterate over a snapshot of the paths so entries can be removed safely.
    for stored_path in list(_test_vault_storage):
        if "test_" in stored_path:
            _test_vault_storage.pop(stored_path, None)
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------- Helper Functions ----------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def config_to_dagster_dict(config):
    """
    Dump a Pydantic config into the dictionary layout passed to Dagster.

    AnonymisePseudonymizeStructuredConfig (technique is a discriminated union):
        ``model_dump()`` yields a flat technique dict such as
        ``{'technique': {'type': 'encrypt', 'columns': [...], 'key_name': '...'}}``.
        Each such technique is re-nested under its ``type`` value, giving
        ``{'technique': {'encrypt': {'columns': [...], 'key_name': '...'}}}``.

    Any other config (e.g. DepseudonymizeStructuredConfig):
        The ``model_dump()`` result is returned untouched, i.e. the technique
        keeps its flat form including the ``type`` field.

    Args:
        config: Pydantic config instance
            (AnonymisePseudonymizeStructuredConfig or
            DepseudonymizeStructuredConfig)

    Returns:
        dict: Dagster-compatible configuration dictionary
    """
    from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
        AnonymisePseudonymizeStructuredConfig,
    )

    dumped = config.model_dump()

    # Only the anonymise/pseudonymise config needs its techniques re-nested.
    if not isinstance(config, AnonymisePseudonymizeStructuredConfig):
        return dumped

    for entry in dumped.get("used_function", []):
        if "technique" not in entry:
            continue
        technique = entry["technique"]
        # Only flat dicts carrying the 'type' discriminator are rewritten.
        if not (isinstance(technique, dict) and "type" in technique):
            continue
        # Split the discriminator from the remaining fields and nest the
        # remainder under it.
        remaining = dict(technique)
        discriminator = remaining.pop("type")
        entry["technique"] = {discriminator: remaining}

    return dumped
|
|
||||||
|
|
||||||
|
|
||||||
def run_encrypt_op(config, df):
    """
    Run the ``anonymize_pseudonymize_structured`` op against a DataFrame.

    The config is converted with ``config_to_dagster_dict`` and supplied as the
    op config of a context created by ``build_op_context``.

    Args:
        config: AnonymisePseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the ``.value`` of the op's two outputs
    """
    op_config = config_to_dagster_dict(config)
    op_context = build_op_context(op_config=op_config)
    frame_output, metrics_output = anonymize_pseudonymize_structured(op_context, df=df)
    return frame_output.value, metrics_output.value
|
|
||||||
|
|
||||||
|
|
||||||
def run_decrypt_op(config, df):
    """
    Run the ``depseudonymize_structured`` op against a DataFrame.

    The config is converted with ``config_to_dagster_dict`` and supplied as the
    op config of a context created by ``build_op_context``.

    Args:
        config: DepseudonymizeStructuredConfig instance
        df: Input pandas DataFrame

    Returns:
        tuple: (result_df, metrics) - the ``.value`` of the op's two outputs
    """
    op_config = config_to_dagster_dict(config)
    op_context = build_op_context(op_config=op_config)
    frame_output, metrics_output = depseudonymize_structured(op_context, df=df)
    return frame_output.value, metrics_output.value
|
|
||||||
|
|
||||||
|
|
||||||
def clear_vault_key(key_name: str):
    """
    Remove a key from the simulated Vault so tests stay isolated.

    Both the stored value and any access-control entry under
    ``secret/PseudonymKeys/<key_name>`` are dropped; a missing key is ignored.

    Args:
        key_name: Name of the key to delete from Vault
    """
    vault_path = f"secret/PseudonymKeys/{key_name}"
    _test_vault_storage.pop(vault_path, None)
    _test_vault_access_control.pop(vault_path, None)
|
|
||||||
|
|
||||||
|
|
||||||
def set_vault_key(key_name: str, key_value: str):
    """
    Store a key in the simulated Vault, replacing any existing value.

    The value is written under ``secret/PseudonymKeys/<key_name>``.

    Args:
        key_name: Name of the key
        key_value: Value of the key (Fernet key as string)
    """
    _test_vault_storage[f"secret/PseudonymKeys/{key_name}"] = key_value
|
|
||||||
|
|
||||||
|
|
||||||
def deny_vault_access(key_name: str):
    """
    Mark a key as inaccessible for authorization testing (AC3).

    Records ``False`` for ``secret/PseudonymKeys/<key_name>`` in the
    access-control table of the simulated Vault.

    Args:
        key_name: Name of the key to deny access to
    """
    _test_vault_access_control[f"secret/PseudonymKeys/{key_name}"] = False
|
|
||||||
|
|
||||||
|
|
||||||
def get_vault_key(key_name: str) -> bytes:
    """
    Fetch a key from the simulated Vault storage.

    Args:
        key_name: Name of the key to retrieve

    Returns:
        bytes: The value stored under ``secret/PseudonymKeys/<key_name>``,
        encoded to bytes

    Raises:
        InvalidPath: If no value is stored for ``key_name``
    """
    vault_path = f"secret/PseudonymKeys/{key_name}"
    if vault_path in _test_vault_storage:
        return _test_vault_storage[vault_path].encode()
    raise InvalidPath(f"Key not found: {key_name}")
|
|
||||||
@@ -1,633 +0,0 @@
|
|||||||
import pytest
|
|
||||||
from pydantic import ValidationError
|
|
||||||
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import (
|
|
||||||
AnonymisePseudonymizeStructuredConfig,
|
|
||||||
DepseudonymizeStructuredConfig,
|
|
||||||
PseudoTechniqueConfig,
|
|
||||||
DepseudoTechniqueConfig,
|
|
||||||
HashConfig,
|
|
||||||
EncryptConfig,
|
|
||||||
RedactConfig,
|
|
||||||
ReplaceConfig,
|
|
||||||
DecryptConfig,
|
|
||||||
)
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
|
|
||||||
AnonymisePseudonymizeUnstructuredConfig,
|
|
||||||
DepseudonymizeUnstructuredConfig,
|
|
||||||
PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig,
|
|
||||||
DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig,
|
|
||||||
HashConfig as UnstructuredHashConfig,
|
|
||||||
EncryptConfig as UnstructuredEncryptConfig,
|
|
||||||
RedactConfig as UnstructuredRedactConfig,
|
|
||||||
ReplaceConfig as UnstructuredReplaceConfig,
|
|
||||||
RetainConfig,
|
|
||||||
DecryptConfig as UnstructuredDecryptConfig,
|
|
||||||
)
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum
|
|
||||||
|
|
||||||
|
|
||||||
# ==================== Structured Config Tests ====================
|
|
||||||
|
|
||||||
class TestStructuredConfigValidators:
    """Tests for the validators and helper methods in structured_config.py."""

    def test_ensure_unique_columns_valid_single_technique(self):
        """A single technique covering a single column passes validation."""
        encrypt_email = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1")
        )

        config = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_email])

        assert config is not None
        assert len(config.used_function) == 1

    def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self):
        """Two techniques that touch different columns pass validation."""
        encrypt_email = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email"], key_name="key1")
        )
        hash_ssn = PseudoTechniqueConfig(
            technique=HashConfig(columns=["ssn"], algorithm="sha256")
        )

        config = AnonymisePseudonymizeStructuredConfig(
            used_function=[encrypt_email, hash_ssn]
        )

        assert config is not None
        assert len(config.used_function) == 2

    def test_ensure_unique_columns_duplicate_columns_same_technique(self):
        """The same column listed under two different techniques raises ValueError."""
        # NOTE: despite the test name, the duplicate column sits in two
        # different techniques (encrypt and hash).
        with pytest.raises(ValueError) as exc_info:
            # Everything is built inside the context manager so a ValueError
            # from any of the constructors is captured.
            encrypt_email = PseudoTechniqueConfig(
                technique=EncryptConfig(columns=["email"], key_name="key1")
            )
            hash_email = PseudoTechniqueConfig(
                technique=HashConfig(columns=["email"], algorithm="sha256")
            )
            AnonymisePseudonymizeStructuredConfig(
                used_function=[encrypt_email, hash_email]
            )

        message = str(exc_info.value)
        assert "Duplicate column" in message
        assert "email" in message

    def test_ensure_unique_columns_multiple_duplicates(self):
        """The error message names every duplicated column."""
        with pytest.raises(ValueError) as exc_info:
            encrypt_both = PseudoTechniqueConfig(
                technique=EncryptConfig(columns=["email", "phone"], key_name="key1")
            )
            hash_both = PseudoTechniqueConfig(
                technique=HashConfig(columns=["email", "phone"], algorithm="sha256")
            )
            AnonymisePseudonymizeStructuredConfig(
                used_function=[encrypt_both, hash_both]
            )

        error_msg = str(exc_info.value)
        assert "Duplicate column" in error_msg
        assert "email" in error_msg
        assert "phone" in error_msg

    def test_collect_column_to_techniques_single_technique(self):
        """_collect_column_to_techniques maps each column to its technique name."""
        encrypt_both = PseudoTechniqueConfig(
            technique=EncryptConfig(columns=["email", "phone"], key_name="key1")
        )
        config = AnonymisePseudonymizeStructuredConfig(used_function=[encrypt_both])

        mapping = config._collect_column_to_techniques()

        expected = {"email": ["encrypt"], "phone": ["encrypt"]}
        assert mapping == expected

    def test_extract_technique_and_columns_dict_with_type_field(self):
        """_extract_technique_and_columns reads a flat dict carrying a 'type' field."""
        config = AnonymisePseudonymizeStructuredConfig()
        entry = {
            "technique": {
                "type": "encrypt",
                "columns": ["email", "ssn"],
                "key_name": "test_key",
            }
        }

        technique_type, columns = config._extract_technique_and_columns(entry)

        assert technique_type == "encrypt"
        assert columns == ["email", "ssn"]

    def test_extract_technique_and_columns_dict_with_variant_mapping(self):
        """_extract_technique_and_columns reads a variant-key mapping {'encrypt': {...}}."""
        config = AnonymisePseudonymizeStructuredConfig()
        entry = {
            "technique": {
                "encrypt": {
                    "columns": ["ssn"],
                    "key_name": "test_key",
                }
            }
        }

        technique_type, columns = config._extract_technique_and_columns(entry)

        assert technique_type == "encrypt"
        assert columns == ["ssn"]

    def test_extract_technique_and_columns_model_instance(self):
        """_extract_technique_and_columns accepts a PseudoTechniqueConfig instance."""
        pseudo_config = PseudoTechniqueConfig(
            technique=RedactConfig(columns=["address"])
        )
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns(pseudo_config)

        assert technique_type == "redact"
        assert columns == ["address"]

    def test_extract_technique_and_columns_empty_dict(self):
        """An empty technique dict yields (None, [])."""
        config = AnonymisePseudonymizeStructuredConfig()
        entry = {"technique": {}}

        technique_type, columns = config._extract_technique_and_columns(entry)

        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_none_technique(self):
        """A ``None`` technique yields (None, [])."""
        config = AnonymisePseudonymizeStructuredConfig()
        entry = {"technique": None}

        technique_type, columns = config._extract_technique_and_columns(entry)

        assert technique_type is None
        assert columns == []

    def test_extract_technique_and_columns_missing_columns_key(self):
        """A technique dict without 'columns' yields its type and an empty list."""
        config = AnonymisePseudonymizeStructuredConfig()
        entry = {
            "technique": {
                "type": "encrypt",
                "key_name": "test_key",
            }
        }

        technique_type, columns = config._extract_technique_and_columns(entry)

        assert technique_type == "encrypt"
        assert columns == []

    def test_extract_technique_and_columns_model_without_columns_attr(self):
        """_extract_technique_and_columns handles a ReplaceConfig model instance."""
        # NOTE: despite the test name, the ReplaceConfig used here does define
        # ``columns``; the test checks that they are returned.
        pseudo_config = PseudoTechniqueConfig(
            technique=ReplaceConfig(columns=["old_value"], new_value="NEW")
        )
        config = AnonymisePseudonymizeStructuredConfig()

        technique_type, columns = config._extract_technique_and_columns(pseudo_config)

        assert technique_type == "replace"
        assert columns == ["old_value"]
|
|
||||||
|
|
||||||
|
|
||||||
class TestStructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeStructuredConfig."""

    def test_depseudonymize_config_normalize_used_function_with_dict(self):
        """A plain-dict ``used_function`` entry becomes a DepseudoTechniqueConfig."""
        raw_entry = {
            "technique": {
                "type": "decrypt",
                "columns": ["email"],
                "key_name": "key1",
            }
        }

        config = DepseudonymizeStructuredConfig(used_function=[raw_entry])

        assert len(config.used_function) == 1
        first = config.used_function[0]
        assert isinstance(first, DepseudoTechniqueConfig)
        assert first.technique.type == "decrypt"

    def test_depseudonymize_config_normalize_used_function_with_model(self):
        """A model-instance ``used_function`` entry is kept as the same object."""
        depseudo_tech = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1")
        )

        config = DepseudonymizeStructuredConfig(used_function=[depseudo_tech])

        assert len(config.used_function) == 1
        assert config.used_function[0] is depseudo_tech

    def test_depseudonymize_config_ensure_unique_columns_no_op(self):
        """Two decrypt techniques on the same column are accepted."""
        # For depseudonymize there is no per-column uniqueness constraint, so
        # repeating "email" with a different key must not raise.
        decrypt_with_key1 = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key1")
        )
        decrypt_with_key2 = DepseudoTechniqueConfig(
            technique=DecryptConfig(columns=["email"], key_name="key2")
        )

        config = DepseudonymizeStructuredConfig(
            used_function=[decrypt_with_key1, decrypt_with_key2]
        )

        assert config is not None
|
|
||||||
|
|
||||||
|
|
||||||
# ==================== Unstructured Config Tests ====================
|
|
||||||
|
|
||||||
class TestUnstructuredConfigValidators:
|
|
||||||
"""Tests for unstructured_config.py validators."""
|
|
||||||
|
|
||||||
def test_normalize_used_function_with_dict(self):
|
|
||||||
"""Test _normalize_used_function with dict input."""
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(
|
|
||||||
language=LanguageEnum.en,
|
|
||||||
used_function=[
|
|
||||||
{
|
|
||||||
"technique": {
|
|
||||||
"encrypt": {
|
|
||||||
"pii": [PIIEntityEnum.EMAIL.value],
|
|
||||||
"key_name": "key1"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
assert len(config.used_function) == 1
|
|
||||||
|
|
||||||
def test_normalize_used_function_with_model(self):
|
|
||||||
"""Test _normalize_used_function with model instance."""
|
|
||||||
pseudo_tech = UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=UnstructuredEncryptConfig(
|
|
||||||
pii=[PIIEntityEnum.EMAIL.value],
|
|
||||||
key_name="key1"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(
|
|
||||||
language=LanguageEnum.en,
|
|
||||||
used_function=[pseudo_tech]
|
|
||||||
)
|
|
||||||
assert len(config.used_function) == 1
|
|
||||||
|
|
||||||
def test_ensure_unique_pii_valid_different_pii_types(self):
|
|
||||||
"""Test that different PII types pass validation."""
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(
|
|
||||||
language=LanguageEnum.en,
|
|
||||||
used_function=[
|
|
||||||
UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=UnstructuredEncryptConfig(
|
|
||||||
pii=[PIIEntityEnum.EMAIL.value],
|
|
||||||
key_name="key1"
|
|
||||||
)
|
|
||||||
),
|
|
||||||
UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=UnstructuredHashConfig(
|
|
||||||
pii=[PIIEntityEnum.PERSON.value],
|
|
||||||
algorithm="sha256"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
assert config is not None
|
|
||||||
assert len(config.used_function) == 2
|
|
||||||
|
|
||||||
def test_ensure_unique_pii_duplicate_pii_types(self):
|
|
||||||
"""Test that duplicate PII types raise error."""
|
|
||||||
with pytest.raises(ValueError) as exc_info:
|
|
||||||
AnonymisePseudonymizeUnstructuredConfig(
|
|
||||||
language=LanguageEnum.en,
|
|
||||||
used_function=[
|
|
||||||
UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=UnstructuredEncryptConfig(
|
|
||||||
pii=[PIIEntityEnum.EMAIL.value],
|
|
||||||
key_name="key1"
|
|
||||||
)
|
|
||||||
),
|
|
||||||
UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=UnstructuredHashConfig(
|
|
||||||
pii=[PIIEntityEnum.EMAIL.value],
|
|
||||||
algorithm="sha256"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
assert "Duplicate PII" in str(exc_info.value)
|
|
||||||
# Error message shows PIIEntityEnum.EMAIL (the enum repr) rather than the value
|
|
||||||
assert "EMAIL" in str(exc_info.value)
|
|
||||||
|
|
||||||
def test_collect_pii_to_techniques_single_technique(self):
|
|
||||||
"""Test _collect_pii_to_techniques with single technique."""
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(
|
|
||||||
language=LanguageEnum.en,
|
|
||||||
used_function=[
|
|
||||||
UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=UnstructuredEncryptConfig(
|
|
||||||
pii=[PIIEntityEnum.EMAIL.value, PIIEntityEnum.PERSON.value],
|
|
||||||
key_name="key1"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
mapping = config._collect_pii_to_techniques()
|
|
||||||
assert mapping == {
|
|
||||||
PIIEntityEnum.EMAIL.value: ["encrypt"],
|
|
||||||
PIIEntityEnum.PERSON.value: ["encrypt"]
|
|
||||||
}
|
|
||||||
|
|
||||||
def test_extract_technique_and_pii_dict_with_type_field(self):
|
|
||||||
"""Test _extract_technique_and_pii with dict containing 'type' field."""
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
|
|
||||||
technique_type, piis = config._extract_technique_and_pii(
|
|
||||||
{
|
|
||||||
"technique": {
|
|
||||||
"type": "encrypt",
|
|
||||||
"pii": [PIIEntityEnum.EMAIL.value],
|
|
||||||
"key_name": "test_key"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
assert technique_type == "encrypt"
|
|
||||||
assert piis == [PIIEntityEnum.EMAIL.value]
|
|
||||||
|
|
||||||
def test_extract_technique_and_pii_dict_with_variant_mapping(self):
|
|
||||||
"""Test _extract_technique_and_pii with variant-key mapping."""
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
|
|
||||||
technique_type, piis = config._extract_technique_and_pii(
|
|
||||||
{
|
|
||||||
"technique": {
|
|
||||||
"hash": {
|
|
||||||
"pii": [PIIEntityEnum.PERSON.value],
|
|
||||||
"algorithm": "sha256"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
assert technique_type == "hash"
|
|
||||||
assert piis == [PIIEntityEnum.PERSON.value]
|
|
||||||
|
|
||||||
def test_extract_technique_and_pii_dict_fallback_to_columns(self):
|
|
||||||
"""Test _extract_technique_and_pii fallback to 'columns' key when 'pii' is missing."""
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
|
|
||||||
technique_type, piis = config._extract_technique_and_pii(
|
|
||||||
{
|
|
||||||
"technique": {
|
|
||||||
"type": "redact",
|
|
||||||
"columns": ["fallback_col"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
assert technique_type == "redact"
|
|
||||||
assert piis == ["fallback_col"]
|
|
||||||
|
|
||||||
def test_extract_technique_and_pii_model_instance(self):
|
|
||||||
"""Test _extract_technique_and_pii with model instance."""
|
|
||||||
pseudo_tech = UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=UnstructuredRedactConfig(
|
|
||||||
pii=[PIIEntityEnum.EMAIL.value]
|
|
||||||
)
|
|
||||||
)
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
|
|
||||||
technique_type, piis = config._extract_technique_and_pii(pseudo_tech)
|
|
||||||
assert technique_type == "redact"
|
|
||||||
assert piis == [PIIEntityEnum.EMAIL.value]
|
|
||||||
|
|
||||||
def test_extract_technique_and_pii_model_with_getattr_fallback(self):
|
|
||||||
"""Test _extract_technique_and_pii model with getattr fallback to columns."""
|
|
||||||
# Create a mock-like scenario where pii attribute doesn't exist
|
|
||||||
pseudo_tech = UnstructuredPseudoTechniqueConfig(
|
|
||||||
technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value])
|
|
||||||
)
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
|
|
||||||
technique_type, piis = config._extract_technique_and_pii(pseudo_tech)
|
|
||||||
assert technique_type == "retain"
|
|
||||||
assert piis == [PIIEntityEnum.PERSON.value]
|
|
||||||
|
|
||||||
def test_extract_technique_and_pii_empty_dict(self):
|
|
||||||
"""Test _extract_technique_and_pii with empty dict."""
|
|
||||||
config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en)
|
|
||||||
technique_type, piis = config._extract_technique_and_pii(
|
|
||||||
{"technique": {}}
|
|
||||||
)
|
|
||||||
assert technique_type is None
|
|
||||||
assert piis == []
|
|
||||||
|
|
||||||
def test_extract_technique_and_pii_missing_pii_key(self):
    """Check extraction from a dict technique that has no ``pii`` entry.

    The payload only carries ``type`` and ``key_name``; the helper is
    expected to return the type "encrypt" and an empty PII list.
    """
    unstructured_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en
    )
    technique_fields = {"type": "encrypt", "key_name": "test_key"}
    payload = {"technique": technique_fields}

    technique_type, piis = unstructured_config._extract_technique_and_pii(payload)

    assert technique_type == "encrypt"
    assert piis == []
|
|
||||||
|
|
||||||
|
|
||||||
class TestUnstructuredDepseudonymizeConfig:
    """Tests for DepseudonymizeUnstructuredConfig."""

    def test_depseudonymize_unstructured_config_default(self):
        """A config built without arguments has at least one used function."""
        default_config = DepseudonymizeUnstructuredConfig()

        assert default_config is not None
        assert len(default_config.used_function) >= 1

    def test_depseudonymize_unstructured_config_with_custom_function(self):
        """A single custom decrypt technique is stored with its key name."""
        decrypt_technique = UnstructuredDecryptConfig(key_name="custom_key")
        custom_function = UnstructuredDepseudoTechniqueConfig(
            technique=decrypt_technique
        )

        custom_config = DepseudonymizeUnstructuredConfig(
            used_function=[custom_function]
        )

        assert len(custom_config.used_function) == 1
        assert custom_config.used_function[0].technique.key_name == "custom_key"
|
|
||||||
|
|
||||||
|
|
||||||
class TestLanguageSupport:
    """Tests for language configuration support."""

    def test_all_supported_languages(self):
        """Each listed LanguageEnum member is accepted and stored as-is."""
        languages_under_test = (
            LanguageEnum.hr,
            LanguageEnum.da,
            LanguageEnum.nl,
            LanguageEnum.en,
            LanguageEnum.fi,
            LanguageEnum.fr,
            LanguageEnum.de,
            LanguageEnum.el,
            LanguageEnum.it,
            LanguageEnum.lt,
            LanguageEnum.pl,
            LanguageEnum.pt,
            LanguageEnum.ro,
            LanguageEnum.sl,
            LanguageEnum.es,
            LanguageEnum.sv,
        )

        for language in languages_under_test:
            language_config = AnonymisePseudonymizeUnstructuredConfig(
                language=language
            )
            assert language_config.language == language

    def test_default_language_is_english(self):
        """Omitting ``language`` leaves the config set to LanguageEnum.en."""
        default_config = AnonymisePseudonymizeUnstructuredConfig()

        assert default_config.language == LanguageEnum.en
|
|
||||||
|
|
||||||
|
|
||||||
class TestTechniqueConfigDefaults:
    """Tests for technique config defaults."""

    def test_hash_config_default_algorithm(self):
        """HashConfig() reports algorithm "sha256" and type "hash"."""
        hash_defaults = HashConfig()

        assert hash_defaults.algorithm == "sha256"
        assert hash_defaults.type == "hash"

    def test_encrypt_config_defaults(self):
        """EncryptConfig() reports type "encrypt" and key name "my_key"."""
        encrypt_defaults = EncryptConfig()

        assert encrypt_defaults.type == "encrypt"
        assert encrypt_defaults.key_name == "my_key"

    def test_redact_config_defaults(self):
        """RedactConfig() reports type "redact"."""
        redact_defaults = RedactConfig()

        assert redact_defaults.type == "redact"

    def test_replace_config_defaults(self):
        """ReplaceConfig() reports type "replace" and new value "REPLACED"."""
        replace_defaults = ReplaceConfig()

        assert replace_defaults.type == "replace"
        assert replace_defaults.new_value == "REPLACED"

    def test_decrypt_config_defaults(self):
        """DecryptConfig() reports type "decrypt" and key name "my_key"."""
        decrypt_defaults = DecryptConfig()

        assert decrypt_defaults.type == "decrypt"
        assert decrypt_defaults.key_name == "my_key"

    def test_unstructured_retain_config_defaults(self):
        """RetainConfig() reports type "retain"."""
        retain_defaults = RetainConfig()

        assert retain_defaults.type == "retain"
|
|
||||||
|
|
||||||
|
|
||||||
class TestPseudoTechniqueConfigDefaults:
    """Tests for PseudoTechniqueConfig defaults."""

    def test_pseudo_technique_default_to_hash(self):
        """PseudoTechniqueConfig() carries a hash technique by default.

        The default may surface either as a model exposing ``type`` or as a
        dict (keyed by "hash" or holding ``type == "hash"``); both shapes
        are accepted.
        """
        default_config = PseudoTechniqueConfig()

        # Model form: the technique object exposes its type directly.
        if not isinstance(default_config.technique, dict):
            assert default_config.technique.type == "hash"
            return

        # Dict form: accept either the discriminator key or a "type" entry.
        assert (
            "hash" in default_config.technique
            or default_config.technique.get("type") == "hash"
        )

    def test_unstructured_pseudo_technique_default_to_hash(self):
        """UnstructuredPseudoTechniqueConfig() carries a hash technique by default.

        The default may surface either as a model exposing ``type`` or as a
        dict (keyed by "hash" or holding ``type == "hash"``); both shapes
        are accepted.
        """
        default_config = UnstructuredPseudoTechniqueConfig()

        # Model form: the technique object exposes its type directly.
        if not isinstance(default_config.technique, dict):
            assert default_config.technique.type == "hash"
            return

        # Dict form: accept either the discriminator key or a "type" entry.
        assert (
            "hash" in default_config.technique
            or default_config.technique.get("type") == "hash"
        )
|
|
||||||
|
|
||||||
|
|
||||||
class TestConfigModelIntegration:
    """Integration tests for config models."""

    def test_structured_config_with_all_technique_types(self):
        """A structured config accepts hash, encrypt, redact and replace together."""
        structured_techniques = [
            HashConfig(columns=["col1"]),
            EncryptConfig(columns=["col2"], key_name="k1"),
            RedactConfig(columns=["col3"]),
            ReplaceConfig(columns=["col4"], new_value="X"),
        ]

        structured_config = AnonymisePseudonymizeStructuredConfig(
            used_function=[
                PseudoTechniqueConfig(technique=technique)
                for technique in structured_techniques
            ]
        )

        assert len(structured_config.used_function) == 4
        seen_types = {
            entry.technique.type for entry in structured_config.used_function
        }
        assert seen_types == {"hash", "encrypt", "redact", "replace"}

    def test_unstructured_config_with_all_technique_types(self):
        """An unstructured config accepts all five technique types together."""
        unstructured_techniques = [
            UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value]),
            UnstructuredEncryptConfig(
                pii=[PIIEntityEnum.PERSON.value],
                key_name="k1",
            ),
            UnstructuredRedactConfig(pii=[PIIEntityEnum.PHONE_NUMBERS.value]),
            UnstructuredReplaceConfig(
                pii=[PIIEntityEnum.CREDIT_CARD.value],
                new_value="X",
            ),
            RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value]),
        ]

        unstructured_config = AnonymisePseudonymizeUnstructuredConfig(
            language=LanguageEnum.en,
            used_function=[
                UnstructuredPseudoTechniqueConfig(technique=technique)
                for technique in unstructured_techniques
            ],
        )

        assert len(unstructured_config.used_function) == 5
        seen_types = {
            entry.technique.type for entry in unstructured_config.used_function
        }
        assert seen_types == {"hash", "encrypt", "redact", "replace", "retain"}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,288 +0,0 @@
|
|||||||
"""
|
|
||||||
Test suite for data restoration (depseudonymisation) of unstructured text.
|
|
||||||
|
|
||||||
## Test Coverage Summary
|
|
||||||
|
|
||||||
### Acceptance Criteria Coverage:
|
|
||||||
- AC1 (Data Restoration with Valid Key): 2 tests
|
|
||||||
- AC2 (Restoration Denial - Missing Key): 1 test
|
|
||||||
- AC3 (Restoration Denial - Unauthorized Access): 1 test
|
|
||||||
- AC4 (Restoration Denial - Invalid Key): 1 test
|
|
||||||
- Additional Coverage: 2 tests (edge cases)
|
|
||||||
|
|
||||||
### Test Pattern:
|
|
||||||
- Each test uses build_op_context with .model_dump() for configuration
|
|
||||||
- Tests validate dual outputs (data, metrics)
|
|
||||||
- Tests verify complete restoration of original text
|
|
||||||
- Tests validate security controls and error handling
|
|
||||||
- Tests use descriptive names mapping to AC scenarios
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import patch
|
|
||||||
from cryptography.fernet import Fernet
|
|
||||||
from dagster import build_op_context
|
|
||||||
|
|
||||||
from src.field_level_pseudo_anonymisation.unstructured_ops import (
|
|
||||||
depseudonymize_unstructured,
|
|
||||||
)
|
|
||||||
from src.field_level_pseudo_anonymisation.config_models.unstructured_config import (
|
|
||||||
DepseudonymizeUnstructuredConfig,
|
|
||||||
DecryptConfig,
|
|
||||||
DepseudoTechniqueConfig,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def fernet_key() -> bytes:
    """Provide a freshly generated Fernet key for the encryption tests."""
    generated_key = Fernet.generate_key()
    return generated_key
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def encrypted_text_data(fernet_key: bytes) -> dict:
    """
    Build a plain/encrypted text pair for the decryption tests.

    The name and the email from the plain sentence are each encrypted with
    ``fernet_key`` and embedded as ``{encrypt:<token>}`` markers.

    Returns a dict with:
    - original_text: the sentence before encryption
    - encrypted_text: the same sentence with both values replaced by markers
    """
    original_text = "My name is John Doe and my email is john.doe@example.com."
    cipher = Fernet(fernet_key)

    # Encrypt in sentence order: name first, then email.
    encrypted_name, encrypted_email = (
        cipher.encrypt(plain_value).decode()
        for plain_value in (b"John Doe", b"john.doe@example.com")
    )

    encrypted_text = (
        f"My name is {{encrypt:{encrypted_name}}} and my email is {{encrypt:{encrypted_email}}}."
    )
    return dict(original_text=original_text, encrypted_text=encrypted_text)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------- AC1: Data Restoration with Valid Key --------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_encrypted_pii_entities_with_valid_key(
    mock_create_get_key, fernet_key: bytes, encrypted_text_data: dict
):
    """AC1: encrypted name and email are restored when the patched key lookup returns the right key."""
    # Arrange: the patched key lookup hands back the key used for encryption.
    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_settings = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_settings.model_dump())

    # Act: the op yields the data output first, then the metrics output.
    outputs = depseudonymize_unstructured(
        context, input_text=encrypted_text_data["encrypted_text"]
    )
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert: restored text equals the original sentence.
    restored_text = data_output.value
    assert (
        restored_text == encrypted_text_data["original_text"]
    ), "Original text should be fully restored"

    # Assert: the first output carries the expected name.
    assert data_output.output_name == "data", "Output should be named 'data'"

    # Assert: both markers were counted.
    restored_count = metrics_output.value["total_depseudo_count"]
    assert restored_count == 2, "Should restore 2 encrypted entities (name and email)"

    # Assert: the key lookup was invoked exactly once with the configured name.
    mock_create_get_key.assert_called_once_with("decrypt", "test_key")
|
|
||||||
|
|
||||||
|
|
||||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac1_restore_multiple_pii_types_with_valid_key(mock_create_get_key, fernet_key: bytes):
    """AC1: three encrypted values (name, email, phone) are all restored with a valid key."""
    # Arrange: encrypt the three values in the order they appear in the text.
    original_text = "Contact John Doe at john.doe@example.com or call 555-1234."
    cipher = Fernet(fernet_key)
    encrypted_name, encrypted_email, encrypted_phone = (
        cipher.encrypt(plain_value).decode()
        for plain_value in (b"John Doe", b"john.doe@example.com", b"555-1234")
    )
    encrypted_text = (
        f"Contact {{encrypt:{encrypted_name}}} at "
        f"{{encrypt:{encrypted_email}}} or call {{encrypt:{encrypted_phone}}}."
    )

    mock_create_get_key.return_value = fernet_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="multi_pii_key")
    )
    op_settings = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_settings.model_dump())

    # Act: consume the data output followed by the metrics output.
    outputs = depseudonymize_unstructured(context, input_text=encrypted_text)
    data_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    assert data_output.value == original_text, "All PII types should be restored"
    restored_count = metrics_output.value["total_depseudo_count"]
    assert restored_count == 3, "Should restore 3 encrypted entities (name, email, phone)"
    mock_create_get_key.assert_called_once_with("decrypt", "multi_pii_key")
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------- AC2: Restoration Denial when Key is Missing ----------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac2_restoration_denial_when_key_missing(mock_create_get_key, encrypted_text_data: dict):
    """AC2: the op raises ValueError when the patched key lookup reports a missing key."""
    # Arrange: the patched lookup raises as if the key were absent.
    missing_key_error = ValueError(
        "Fernet key 'non_existent_key' not found in Vault for decrypt."
    )
    mock_create_get_key.side_effect = missing_key_error
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="non_existent_key")
    )
    op_settings = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_settings.model_dump())

    # Act & Assert: draining the generator must surface the lookup error.
    with pytest.raises(
        ValueError,
        match="Fernet key 'non_existent_key' not found in Vault for decrypt.",
    ) as exc_info:
        list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"]))

    # The message must mention the missing Vault entry.
    error_text = str(exc_info.value)
    assert (
        "not found in Vault" in error_text
    ), "Error message should indicate key is missing from Vault"

    # The lookup must have been attempted exactly once for the configured key.
    mock_create_get_key.assert_called_once_with("decrypt", "non_existent_key")
|
|
||||||
|
|
||||||
|
|
||||||
# ------------- AC3: Restoration Denial when Access is Unauthorized --------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac3_restoration_denial_when_unauthorized_access(
    mock_create_get_key, encrypted_text_data: dict
):
    """AC3: the op raises ValueError when the patched key lookup denies access."""
    # Arrange: the patched lookup raises an access-denied error.
    access_denied_error = ValueError("Access denied to secret: unauthorized_key")
    mock_create_get_key.side_effect = access_denied_error
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="unauthorized_key")
    )
    op_settings = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_settings.model_dump())

    # Act & Assert: draining the generator must surface the denial.
    with pytest.raises(ValueError, match="Access denied to secret: unauthorized_key") as exc_info:
        list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"]))

    # The message must state that access was denied.
    error_text = str(exc_info.value)
    assert (
        "Access denied" in error_text
    ), "Error message should clearly indicate access was denied"

    # The lookup must have been attempted exactly once for the configured key.
    mock_create_get_key.assert_called_once_with("decrypt", "unauthorized_key")
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------- AC4: Restoration Denial when Key is Invalid ----------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key")
def test_ac4_restoration_denial_when_key_invalid(mock_create_get_key, encrypted_text_data: dict):
    """AC4: the op raises ValueError when the returned key does not match the encrypted data."""
    # Arrange: hand back a newly generated key instead of the one used to encrypt.
    invalid_key = Fernet.generate_key()
    mock_create_get_key.return_value = invalid_key
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="wrong_key")
    )
    op_settings = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_settings.model_dump())

    # Act & Assert: decryption with the mismatched key must fail.
    with pytest.raises(ValueError, match="Invalid Fernet token") as exc_info:
        list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"]))

    # The message must identify the token as invalid.
    error_text = str(exc_info.value)
    assert (
        "Invalid Fernet token" in error_text
    ), "Error message should indicate the key is invalid for this data"

    # The key lookup must still have happened exactly once.
    mock_create_get_key.assert_called_once_with("decrypt", "wrong_key")
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------- Additional Edge Cases ----------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def test_depseudonymize_unstructured_no_decrypt_config():
    """Edge case: with an empty technique list the input text comes back untouched."""
    # Arrange: a config with no techniques at all.
    original_text = "This text has no {encrypt:values} to decrypt."
    op_settings = DepseudonymizeUnstructuredConfig(used_function=[])
    context = build_op_context(op_config=op_settings.model_dump())

    # Act: consume the data output followed by the metrics output.
    outputs = depseudonymize_unstructured(context, input_text=original_text)
    result_output = next(outputs)
    metrics_output = next(outputs)

    # Assert
    returned_text = result_output.value
    assert (
        returned_text == original_text
    ), "Text should remain unchanged when no decryption is configured"
    decryption_count = metrics_output.value["total_depseudo_count"]
    assert decryption_count == 0, "Should report zero decryptions performed"
|
|
||||||
|
|
||||||
|
|
||||||
def test_depseudonymize_unstructured_empty_text():
    """Edge case: an empty input string yields an empty output and a zero count."""
    # Arrange
    empty_text = ""
    decrypt_step = DepseudoTechniqueConfig(
        technique=DecryptConfig(type="decrypt", key_name="test_key")
    )
    op_settings = DepseudonymizeUnstructuredConfig(used_function=[decrypt_step])
    context = build_op_context(op_config=op_settings.model_dump())

    # Act: the key lookup is patched only for the duration of the op call.
    key_lookup_target = (
        "src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
    )
    with patch(key_lookup_target) as mock_key:
        mock_key.return_value = Fernet.generate_key()
        outputs = depseudonymize_unstructured(context, input_text=empty_text)
        result_output = next(outputs)
        metrics_output = next(outputs)

    # Assert
    assert result_output.value == "", "Empty text should remain empty"
    decryption_count = metrics_output.value["total_depseudo_count"]
    assert decryption_count == 0, "Should report zero decryptions for empty text"
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,853 +0,0 @@
|
|||||||
"""
|
|
||||||
Test suite for field-level pseudonymisation operations on unstructured data.
|
|
||||||
|
|
||||||
This test suite validates the pseudonymisation of unstructured text with PII detection,
|
|
||||||
covering the following Acceptance Criteria:
|
|
||||||
|
|
||||||
## Test Coverage Summary
|
|
||||||
|
|
||||||
### Acceptance Criteria Coverage:
|
|
||||||
- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests
|
|
||||||
- AC2 (Invalid Execution Handling): 5 tests
|
|
||||||
- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests
|
|
||||||
- AC4 (Execution Audit & Logging - Negative Scenario): 4 tests
|
|
||||||
- Additional Coverage: 3 tests
|
|
||||||
|
|
||||||
### Test Pattern:
|
|
||||||
- Each test uses build_op_context with config_to_dagster_dict for configuration
|
|
||||||
- Tests validate dual outputs (data, metrics)
|
|
||||||
- Vault access is mocked for isolation
|
|
||||||
- Tests validate Scrubadub automatic PII detection
|
|
||||||
- Tests ensure placeholder replacement for unconfigured PII
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import re
|
|
||||||
from dagster import build_op_context
|
|
||||||
from unittest.mock import patch, MagicMock
|
|
||||||
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import (
|
|
||||||
AnonymisePseudonymizeUnstructuredConfig,
|
|
||||||
EncryptConfig,
|
|
||||||
RetainConfig,
|
|
||||||
PseudoTechniqueConfig,
|
|
||||||
)
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.config_models import PIIEntityEnum, LanguageEnum
|
|
||||||
from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import (
|
|
||||||
anonymize_pseudonymize_unstructured,
|
|
||||||
)
|
|
||||||
|
|
||||||
from .conftest import clear_vault_key
|
|
||||||
|
|
||||||
|
|
||||||
def config_to_dagster_dict_unstructured(config):
    """Build the op-config dict for an unstructured pseudonymisation config.

    The result has the language's ``value`` under "language" and, under
    "used_function", one ``{"technique": {<type>: <fields>}}`` entry per
    configured function. ``<fields>`` is the technique's ``model_dump()``
    without its "type" key, with any "pii" entries replaced by the enum
    member names.
    """
    language_code = config.language.value
    function_entries = []

    for function in config.used_function:
        technique_model = function.technique
        selector = technique_model.type
        fields = technique_model.model_dump()

        # Swap dumped PII values for the enum member names.
        if "pii" in fields:
            fields["pii"] = [member.name for member in technique_model.pii]

        # The type becomes the selector key, so it must not stay in the payload.
        fields.pop("type", None)

        function_entries.append({"technique": {selector: fields}})

    return {"language": language_code, "used_function": function_entries}
|
|
||||||
|
|
||||||
|
|
||||||
def run_unstructured_op(config, text):
    """
    Run the unstructured pseudonymisation op and unwrap its two outputs.

    Args:
        config: Unstructured config; converted with
            config_to_dagster_dict_unstructured before building the context.
        text: Input text handed to the op.

    Returns:
        tuple: (result_text: str, metrics_markdown: str)
    """
    op_config = config_to_dagster_dict_unstructured(config)
    context = build_op_context(op_config=op_config)

    text_output, metrics_output = anonymize_pseudonymize_unstructured(context, text=text)

    # Callers work with the payloads, not the Output wrappers.
    return text_output.value, metrics_output.value
|
|
||||||
|
|
||||||
|
|
||||||
def parse_metrics_markdown(metrics_md: str) -> dict:
    """
    Parse markdown metrics into structured dict for easier testing.

    Missing sections are not an error: the corresponding entry keeps its
    default (0, empty dict or empty string).

    Args:
        metrics_md: Markdown metrics string from op output

    Returns:
        dict with keys: total_pii_detected, pii_by_type, techniques_applied, language

    Raises:
        ValueError: If a two-column row of the "PII by Type" table has a
            non-integer count.
    """
    result = {
        "total_pii_detected": 0,
        "pii_by_type": {},
        "techniques_applied": {},
        "language": "",
    }

    # Extract total PII detected
    total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md)
    if total_match:
        result["total_pii_detected"] = int(total_match.group(1))

    # Extract language
    lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md)
    if lang_match:
        result["language"] = lang_match.group(1)

    # Extract PII by type from table. The second table line (separator row)
    # is skipped. Each data row may end with a newline OR the end of the
    # string, so the last row is kept when the markdown has no trailing
    # newline.
    pii_table_section = re.search(
        r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+(?:\n|\Z))+)",
        metrics_md,
    )
    if pii_table_section:
        for line in pii_table_section.group(1).strip().split("\n"):
            parts = [p.strip() for p in line.split("|") if p.strip()]
            if len(parts) == 2:
                entity_type, count = parts
                result["pii_by_type"][entity_type] = int(count)

    # Extract techniques applied. As above, the final bullet may end at the
    # end of the string instead of a newline.
    techniques_section = re.search(
        r"### Techniques Applied\n((?:- \*\*[^\n]+(?:\n|\Z))+)", metrics_md
    )
    if techniques_section:
        for line in techniques_section.group(1).strip().split("\n"):
            tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", line)
            if tech_match:
                pii_type, technique = tech_match.groups()
                result["techniques_applied"][pii_type] = technique

    return result
|
|
||||||
|
|
||||||
|
|
||||||
# -------------------------------- Fixtures ----------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def sample_text_en():
    """Return an English paragraph with a name, email, phone number, address and SSN."""
    paragraph = """
John Smith works at Acme Corporation. His email is john.smith@example.com
and his phone number is +1-555-123-4567. He lives in New York City at
123 Main Street, Apartment 4B. His SSN is 123-45-6789.
"""
    return paragraph
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def sample_text_multi_person():
    """Return a short paragraph that mentions five different person names."""
    paragraph = """
The meeting included Alice Johnson, Bob Williams, and Charlie Brown.
They discussed the project with Maria Garcia and David Wilson.
"""
    return paragraph
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def sample_text_mixed_pii():
    """Return a contact card with a name, email, phone number and URL (used by the AC1 tests)."""
    contact_card = """
Contact Information:
Name: Dr. Emily Watson
Email: emily.watson@hospital.com
Phone: +44-20-7946-0958
Website: https://patient-portal.hospital.com/records
"""
    return contact_card
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def encrypt_person_config():
    """Return an English config whose only technique encrypts PERSON with key "test_person_key"."""
    person_encryption = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON],
        key_name="test_person_key",
    )
    functions = [PseudoTechniqueConfig(technique=person_encryption)]
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=functions,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def retain_person_config():
    """Return an English config whose only technique retains PERSON entities."""
    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    functions = [PseudoTechniqueConfig(technique=person_retention)]
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=functions,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mixed_technique_config():
    """Return an English config that encrypts PERSON and EMAIL and retains PHONE_NUMBERS."""
    encrypt_identity = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name="test_mixed_key",
    )
    retain_phone = RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS])

    functions = [
        PseudoTechniqueConfig(technique=encrypt_identity),
        PseudoTechniqueConfig(technique=retain_phone),
    ]
    return AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=functions,
    )
|
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
|
||||||
# AC1: Pseudonymisation and Retention Are Applied Correctly
|
|
||||||
# ================================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config):
    """AC1: a PERSON entity configured for encryption no longer appears in plaintext."""
    clear_vault_key("test_person_key")

    output_text, report_md = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii)
    report = parse_metrics_markdown(report_md)

    # The configured name must be gone from the output.
    assert "Emily Watson" not in output_text, "Configured PERSON PII should be encrypted"

    # An encryption marker must be present instead.
    assert "{encrypt:" in output_text, "Encrypted token should be present in result"

    # The metrics must show that PII was found, including PERSON.
    assert report["total_pii_detected"] > 0, "System should detect PII entities"
    assert "PERSON" in report["pii_by_type"], "PERSON type should be in detected PII"

    # Text that is not PII must survive unchanged.
    assert "Contact Information:" in output_text, "Non-PII text structure should be preserved"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person):
    """AC1: PERSON entities configured for retention stay in the output verbatim."""
    person_retention = RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])
    retain_config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=person_retention)],
    )

    output_text, report_md = run_unstructured_op(retain_config, sample_text_multi_person)
    report = parse_metrics_markdown(report_md)

    # Retained names must still be readable.
    assert "Alice Johnson" in output_text, "Retained PERSON PII should remain unchanged"
    assert "Bob Williams" in output_text, "Retained PERSON PII should remain unchanged"

    # The metrics must record "retain" for PERSON (case-insensitive).
    recorded_technique = report["techniques_applied"].get("PERSON", "").lower()
    assert "retain" in recorded_technique, "Retain technique should be recorded for PERSON type"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii):
    """AC1: PII types without a configured technique are swapped for placeholders.

    Only PERSON is configured (encrypt). The test checks that the raw name and
    the raw e-mail address are gone from the output and that the output carries
    ``{{...}}`` placeholders, including ``{{EMAIL}}`` or ``{{URL}}``.
    """
    key_name = "test_person_only_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)

    # The metrics markdown is not inspected in this test.
    output_text, _report_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)

    # Configured type: the plaintext name must be gone.
    assert "Emily Watson" not in output_text, "Configured PERSON should be encrypted"

    # Unconfigured types: double-brace placeholders must be present.
    has_opening = "{{" in output_text
    has_closing = "}}" in output_text
    assert has_opening and has_closing, "Unconfigured PII should be replaced with placeholders"

    # The original unconfigured value must not leak through.
    assert (
        "emily.watson@hospital.com" not in output_text
    ), "Unconfigured EMAIL should be replaced with placeholder"

    # At least one placeholder must name the entity type it replaced.
    assert any(
        tag in output_text for tag in ("{{EMAIL}}", "{{URL}}")
    ), "Placeholders should indicate entity type"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config):
    """AC1: A configuration mixing encrypt and retain treats each PII type as configured.

    The raw name and e-mail address must be absent from the output, the phone
    number must still be present, and the parsed metrics must name the technique
    recorded for PERSON, EMAIL and PHONE_NUMBERS.
    """
    clear_vault_key("test_mixed_key")

    output_text, report_md = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii)
    report = parse_metrics_markdown(report_md)

    # Values that must no longer be readable in the output.
    removed_values = (
        ("Emily Watson", "Configured PERSON should be encrypted"),
        ("emily.watson@hospital.com", "Configured EMAIL should be encrypted"),
    )
    for raw_value, failure_note in removed_values:
        assert raw_value not in output_text, failure_note

    # The value that must survive untouched.
    assert "+44-20-7946-0958" in output_text, "Configured PHONE_NUMBERS should be retained"

    # Each entity type must be reported with the technique it was configured for.
    expected_techniques = (
        ("PERSON", "encrypt", "Encrypt technique should be applied to PERSON"),
        ("EMAIL", "encrypt", "Encrypt technique should be applied to EMAIL"),
        ("PHONE_NUMBERS", "retain", "Retain technique should be applied to PHONE_NUMBERS"),
    )
    for entity, technique, failure_note in expected_techniques:
        recorded = report["techniques_applied"].get(entity, "")
        assert technique in recorded.lower(), failure_note
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config):
    """AC1: Every occurrence of a configured PII type is processed, not just the first.

    None of the listed names may remain in the output, and the PERSON count in
    the parsed metrics must be at least the number of listed names.
    """
    clear_vault_key("test_person_key")

    output_text, report_md = run_unstructured_op(encrypt_person_config, sample_text_multi_person)
    report = parse_metrics_markdown(report_md)

    person_names = (
        "Alice Johnson",
        "Bob Williams",
        "Charlie Brown",
        "Maria Garcia",
        "David Wilson",
    )
    expected_count = len(person_names)

    # No listed name may survive in plaintext.
    for name in person_names:
        assert name not in output_text, f"All PERSON instances should be encrypted: {name}"

    # The metrics must account for at least as many PERSON hits as names listed.
    detected_persons = report["pii_by_type"].get("PERSON", 0)
    assert (
        detected_persons >= expected_count
    ), f"Should detect at least {expected_count} PERSON entities"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_empty_text_returns_empty(encrypt_person_config):
    """AC1: An empty input string is rejected with a ValueError mentioning "empty"."""
    clear_vault_key("test_person_key")

    with pytest.raises(ValueError) as raised:
        run_unstructured_op(encrypt_person_config, "")

    # The comparison is case-insensitive.
    message = str(raised.value).lower()
    assert "empty" in message, "Error should indicate empty input"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_text_without_pii_remains_unchanged():
    """AC1: Text with no PII passes through unchanged and reports zero detections.

    Input and output are compared after stripping surrounding whitespace.
    """
    no_pii_text = (
        "\n"
        "The weather today is sunny with a high of 25 degrees Celsius.\n"
        "The conference starts at 9:00 AM in Room 301.\n"
    )

    key_name = "test_no_pii_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)

    output_text, report_md = run_unstructured_op(config, no_pii_text)
    report = parse_metrics_markdown(report_md)

    expected = no_pii_text.strip()
    assert output_text.strip() == expected, "Text without PII should remain unchanged"
    assert report["total_pii_detected"] == 0, "No PII should be detected"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii):
    """AC1: Placeholders for unconfigured PII follow the ``{{TYPE}}`` format.

    Only PERSON is configured (encrypt). The output must contain at least one
    upper-case ``{{TYPE}}`` placeholder and the metrics must list at least one
    detected PII type.
    """
    key_name = "test_placeholder_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)

    output_text, report_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii)
    report = parse_metrics_markdown(report_md)

    # Placeholder format: double braces around an upper-case/underscore type name.
    placeholder_re = re.compile(r"\{\{[A-Z_]+\}\}")
    found = placeholder_re.findall(output_text)
    assert found, "Result should contain entity-type placeholders for unconfigured PII"

    # The metrics must track which PII types were detected.
    detected_types = report["pii_by_type"]
    assert len(detected_types) > 0, "Metrics should list detected PII types"
|
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
|
||||||
# AC2: Invalid Execution Handling
|
|
||||||
# ================================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac2_graceful_abort_on_scrubadub_failure():
    """AC2: A failure inside the Scrubadub scrubber surfaces as a RuntimeError.

    ``Scrubber`` is patched so that ``iter_filth`` raises. The op is expected to
    raise a RuntimeError whose message mentions the detection failure.
    """
    text = "Test user John Smith with email john@example.com"

    key_name = "test_abort_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)

    # Stand-in scrubber instance whose filth iteration always blows up.
    failing_scrubber = MagicMock()
    failing_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error")

    with patch(
        "field_level_pseudo_anonymisation.unstructured_ops.scrubadub.Scrubber",
        return_value=failing_scrubber,
    ):
        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(config, text)

    # Any one of these keywords is accepted as evidence of a detection failure.
    error_msg = str(raised.value).lower()
    accepted_keywords = ("pii", "detection", "scrubadub", "failed")
    assert any(
        keyword in error_msg for keyword in accepted_keywords
    ), "Error message should indicate PII detection failure"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac2_graceful_abort_on_encryption_failure(sample_text_en):
    """AC2: A failure inside the encrypt technique surfaces as a RuntimeError.

    The ``encrypt`` function in the techniques module is patched to raise a plain
    Exception. The op is expected to raise a RuntimeError whose message mentions
    the encryption/technique failure.
    """
    key_name = "test_encrypt_fail_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)

    # Patch target: the encrypt function as defined in the techniques module.
    encrypt_path = ".".join(
        (
            "field_level_pseudo_anonymisation",
            "techniques",
            "anonymisation_pseudonymisation_techniques",
            "encrypt",
        )
    )
    with patch(encrypt_path, side_effect=Exception("Encryption algorithm failure")):
        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(config, sample_text_en)

    error_msg = str(raised.value).lower()
    assert any(
        keyword in error_msg for keyword in ("encrypt", "failed", "technique")
    ), "Error message should indicate encryption failure"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac2_null_text_input_raises_error(encrypt_person_config):
    """AC2: Passing ``None`` as the text input is rejected with an error.

    Accepts ValueError, TypeError or Dagster's DagsterTypeCheckDidNotPass.
    """
    clear_vault_key("test_person_key")

    # Imported locally; this is the exception Dagster raises when an input
    # fails its type check.
    from dagster import DagsterTypeCheckDidNotPass

    rejection_errors = (ValueError, DagsterTypeCheckDidNotPass, TypeError)
    with pytest.raises(rejection_errors):
        run_unstructured_op(encrypt_person_config, None)
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac2_invalid_language_configuration():
    """AC2: Building a config with an unsupported language value raises an error.

    The whole construction happens inside the ``pytest.raises`` block, so a
    ValueError or TypeError from any of the constructors satisfies the test.
    """
    with pytest.raises((ValueError, TypeError)):
        encrypt_person = EncryptConfig(
            type="encrypt", pii=[PIIEntityEnum.PERSON], key_name="test_key"
        )
        AnonymisePseudonymizeUnstructuredConfig(
            language="invalid_lang",  # not a LanguageEnum member; expected to be rejected
            used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
        )
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac2_very_large_text_processing():
    """AC2: A large input made of repeated PII-bearing sentences is processed.

    The op must return non-empty output and report at least one detection.
    """
    # One short sentence repeated 1000 times: roughly 60 KB of text.
    sentence = "\nJohn Smith works at company. Email: john.smith@example.com.\n"
    large_text = sentence * 1000

    key_name = "test_large_text_key"
    encrypt_person_and_email = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name=key_name,
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person_and_email)],
    )
    clear_vault_key(key_name)

    output_text, report_md = run_unstructured_op(config, large_text)
    report = parse_metrics_markdown(report_md)

    # Processing must complete and produce something.
    assert output_text is not None, "Large text should be processed successfully"
    assert len(output_text) > 0, "Result should not be empty"
    assert report["total_pii_detected"] > 0, "PII should be detected in large text"
|
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
|
||||||
# AC3: Execution Audit & Logging - Positive Scenario
|
|
||||||
# ================================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config):
    """AC3: A successful run exposes a run ID and returns both outputs.

    The op is invoked directly with a context from ``build_op_context``. The test
    checks that the context's ``run_id`` is set and that neither returned item
    is ``None``.
    """
    clear_vault_key("test_person_key")

    dagster_config = config_to_dagster_dict_unstructured(encrypt_person_config)
    op_context = build_op_context(op_config=dagster_config)

    # Read the run identifier before the op executes.
    captured_run_id = op_context.run_id

    text_output, metrics_output = anonymize_pseudonymize_unstructured(
        op_context, text=sample_text_en
    )

    assert captured_run_id is not None, "Run ID must be available for audit logging"

    # Both outputs must come back so they can be logged.
    assert text_output is not None, "Result text should be available for logging"
    assert metrics_output is not None, "Metrics should be available for logging"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac3_successful_execution_logs_configuration_parameters(
    sample_text_en, mixed_technique_config
):
    """AC3: After a successful run the configuration used is still inspectable.

    Checks that the Dagster config dict holds two ``used_function`` entries
    covering encrypt and retain, and that the metrics output mentions
    "Techniques Applied".
    """
    clear_vault_key("test_mixed_key")

    dagster_config = config_to_dagster_dict_unstructured(mixed_technique_config)
    op_context = build_op_context(op_config=dagster_config)

    _, metrics_output = anonymize_pseudonymize_unstructured(op_context, text=sample_text_en)

    # The configuration must be captured and accessible.
    assert "used_function" in dagster_config, "Configuration must be accessible for logging"
    configured_functions = dagster_config["used_function"]
    assert len(configured_functions) == 2, "Multiple techniques should be captured"

    # Both technique names must show up in the stringified technique entries.
    technique_reprs = [str(entry["technique"]) for entry in configured_functions]
    assert any(
        "encrypt" in text for text in technique_reprs
    ), "Encrypt technique should be in configuration"
    assert any(
        "retain" in text for text in technique_reprs
    ), "Retain technique should be in configuration"

    # The metrics markdown is carried in the output's ``value`` attribute.
    metrics_markdown = metrics_output.value
    assert (
        "Techniques Applied" in metrics_markdown
    ), "Applied techniques should be in metrics for logging"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config):
    """AC3: Neither the metrics output nor the config dict contains raw PII values.

    The name, e-mail address and phone number are searched for in the metrics
    markdown and in the string form of the Dagster config dict.
    """
    clear_vault_key("test_person_key")

    dagster_config = config_to_dagster_dict_unstructured(encrypt_person_config)
    op_context = build_op_context(op_config=dagster_config)

    _, metrics_output = anonymize_pseudonymize_unstructured(
        op_context, text=sample_text_mixed_pii
    )

    sensitive_values = ("Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958")

    # First pass: the metrics markdown must be free of raw values.
    metrics_markdown = metrics_output.value
    for raw_value in sensitive_values:
        leaked = raw_value in metrics_markdown
        assert not leaked, f"Raw PII value should not appear in metrics: {raw_value}"

    # Second pass: the stringified configuration must be free of raw values.
    config_dump = str(dagster_config)
    for raw_value in sensitive_values:
        leaked = raw_value in config_dump
        assert not leaked, f"Raw PII value should not appear in configuration logs: {raw_value}"
|
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
|
||||||
# AC4: Execution Audit & Logging - Negative Scenario
|
|
||||||
# ================================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac4_failed_execution_logs_error_details():
    """AC4: A key-retrieval failure produces a RuntimeError naming the key failure.

    ``create_get_encryption_key`` in ``unstructured_ops`` is patched to raise.
    The resulting error message must contain both "key" and "failed".
    """
    text = "Test user John Smith"

    key_name = "test_fail_log_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)
    op_context = build_op_context(op_config=config_to_dagster_dict_unstructured(config))

    # Force the key lookup used inside unstructured_ops to fail.
    key_factory_path = "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key"
    with patch(key_factory_path) as failing_key_factory:
        failing_key_factory.side_effect = RuntimeError("Encryption key retrieval failed")

        with pytest.raises(RuntimeError) as raised:
            # Exhaust the op's results so that it actually executes and raises.
            list(anonymize_pseudonymize_unstructured(op_context, text=text))

    message = str(raised.value).lower()
    assert all(
        word in message for word in ("key", "failed")
    ), "Error message should mention key failure"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac4_failed_execution_logs_configuration_used():
    """AC4: When a run fails, the attempted configuration is still inspectable.

    ``_initialize_scrubber`` is patched to raise. After the failure the config
    dict must still hold ``used_function`` and the error message must hint at
    the detection/processing failure.
    """
    text = "Test data with person John Doe"

    key_name = "test_config_fail_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)

    dagster_config = config_to_dagster_dict_unstructured(config)
    op_context = build_op_context(op_config=dagster_config)

    # Make scrubber initialisation fail.
    with patch(
        "field_level_pseudo_anonymisation.unstructured_ops._initialize_scrubber",
        side_effect=Exception("Scrubber module not available"),
    ):
        with pytest.raises((RuntimeError, Exception)) as raised:
            list(anonymize_pseudonymize_unstructured(op_context, text=text))

    # The configuration must remain accessible despite the failure.
    assert dagster_config is not None, "Configuration must be accessible for failure audit"
    assert (
        "used_function" in dagster_config
    ), "Technique configuration should be available for diagnosis"

    # Any one of these keywords is accepted as a sign of the failure reason.
    error_msg = str(raised.value).lower()
    accepted_keywords = ("pii", "detection", "failed", "scrubber", "module")
    assert any(
        keyword in error_msg for keyword in accepted_keywords
    ), "Error should indicate detection/processing failed"
|
|
||||||
|
|
||||||
|
|
||||||
def test_ac4_failed_execution_logs_failure_reason():
    """AC4: The error raised on failure states why the run failed.

    ``create_get_encryption_key`` is patched to raise a timeout-style
    RuntimeError. The resulting message must mention encryption, the key, the
    timeout or a failure.
    """
    text = "User: Alice Smith, Email: alice@example.com"

    key_name = "test_failure_reason_key"
    encrypt_person_and_email = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name=key_name,
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.en,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person_and_email)],
    )
    clear_vault_key(key_name)

    # Make the key retrieval function fail.
    with patch(
        "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key",
        side_effect=RuntimeError("Vault connection timeout"),
    ):
        with pytest.raises(RuntimeError) as raised:
            run_unstructured_op(config, text)

    # The failure reason must be recognisable in the message.
    error_msg = str(raised.value).lower()
    accepted_keywords = ("encrypt", "key", "timeout", "failed")
    assert any(
        keyword in error_msg for keyword in accepted_keywords
    ), "Error should indicate key retrieval/encryption failure"
|
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
|
||||||
# Additional Tests - Edge Cases and Integration
|
|
||||||
# ================================================================================================
|
|
||||||
|
|
||||||
|
|
||||||
def test_multi_language_support_italian():
    """Additional test: Italian text is altered by processing and PII is detected."""
    italian_text = (
        "\n"
        "Il dottor Marco Rossi lavora presso l'ospedale.\n"
        "Email: marco.rossi@ospedale.it\n"
        "Telefono: +39-06-12345678\n"
    )

    key_name = "test_italian_key"
    encrypt_person = EncryptConfig(type="encrypt", pii=[PIIEntityEnum.PERSON], key_name=key_name)
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.it,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person)],
    )
    clear_vault_key(key_name)

    output_text, report_md = run_unstructured_op(config, italian_text)
    report = parse_metrics_markdown(report_md)

    # The output must differ from the input and at least one entity must be counted.
    assert output_text != italian_text, "Italian text should be processed"
    assert report["total_pii_detected"] > 0, "PII should be detected in Italian text"
|
|
||||||
|
|
||||||
|
|
||||||
def test_special_characters_in_text():
    """Additional test: Text with accents, emoji and currency symbols is processed.

    Only checks that processing completes and returns non-empty output.
    """
    special_text = (
        "\n"
        "User: João da Silva 🇧🇷\n"
        "Email: joão@empresa.com.br\n"
        'Message: "Hello, World!" — Testing special chars: €, £, ¥, ©, ®\n'
    )

    key_name = "test_special_chars_key"
    encrypt_person_and_email = EncryptConfig(
        type="encrypt",
        pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL],
        key_name=key_name,
    )
    config = AnonymisePseudonymizeUnstructuredConfig(
        language=LanguageEnum.pt,
        used_function=[PseudoTechniqueConfig(technique=encrypt_person_and_email)],
    )
    clear_vault_key(key_name)

    # The metrics markdown is not inspected in this test.
    output_text, _report_md = run_unstructured_op(config, special_text)

    # Processing must finish without an encoding error and yield some text.
    assert output_text is not None, "Special characters should not cause processing failure"
    assert len(output_text) > 0, "Result should not be empty"
|
|
||||||
|
|
||||||
|
|
||||||
def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config):
    """Additional test: Two runs over the same input give consistent results.

    The ciphertexts themselves are not compared. The test checks that both runs
    emit ``{encrypt:...}`` tokens, report the same total PII count, and contain
    the same number of tokens matching the expected token pattern.
    """
    clear_vault_key("test_person_key")

    first_text, first_md = run_unstructured_op(encrypt_person_config, sample_text_en)
    second_text, second_md = run_unstructured_op(encrypt_person_config, sample_text_en)

    # Both runs must produce encryption tokens.
    assert "{encrypt:" in first_text, "First run should produce encrypted tokens"
    assert "{encrypt:" in second_text, "Second run should produce encrypted tokens"

    # Detection totals must match between the runs.
    first_report = parse_metrics_markdown(first_md)
    second_report = parse_metrics_markdown(second_md)
    first_total = first_report["total_pii_detected"]
    second_total = second_report["total_pii_detected"]
    assert first_total == second_total, "PII detection should be consistent across runs"

    # Token shape: "{encrypt:" + a base64-style payload starting with "gAAAAAB" + "}".
    token_re = re.compile(r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}")
    first_count = len(token_re.findall(first_text))
    second_count = len(token_re.findall(second_text))
    assert first_count == second_count, "Same number of encryption tokens should be generated"
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
from template_code_location.field_level_pseudo_anonymisation.jobs import (
|
|
||||||
anonymize_pseudonymize_structured_job,
|
|
||||||
anonymize_pseudonymize_structured_job_s3,
|
|
||||||
depseudonymize_structured_job,
|
|
||||||
depseudonymize_structured_job_s3,
|
|
||||||
anonymize_pseudonymize_unstructured_job_s3,
|
|
||||||
anonymize_pseudonymize_unstructured_job,
|
|
||||||
depseudonymize_unstructured_job_s3,
|
|
||||||
depseudonymize_unstructured_job
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_anonymize_pseudonymize_structured_job_is_callable():
    """Check that anonymize_pseudonymize_structured_job is callable and has ``execute_in_process``."""
    job = anonymize_pseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_anonymize_pseudonymize_structured_job_s3_is_callable():
    """Check that anonymize_pseudonymize_structured_job_s3 is callable and has ``execute_in_process``."""
    job = anonymize_pseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_depseudonymize_structured_job_is_callable():
    """Check that depseudonymize_structured_job is callable and has ``execute_in_process``."""
    job = depseudonymize_structured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_depseudonymize_structured_job_s3_is_callable():
    """Check that depseudonymize_structured_job_s3 is callable and has ``execute_in_process``."""
    job = depseudonymize_structured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_anonymize_pseudonymize_unstructured_job_is_callable():
    """Check that anonymize_pseudonymize_unstructured_job is callable and has ``execute_in_process``."""
    job = anonymize_pseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_anonymize_pseudonymize_unstructured_job_s3_is_callable():
    """Check that anonymize_pseudonymize_unstructured_job_s3 is callable and has ``execute_in_process``."""
    job = anonymize_pseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_depseudonymize_unstructured_job_is_callable():
    """Check that depseudonymize_unstructured_job is callable and has ``execute_in_process``."""
    job = depseudonymize_unstructured_job
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
|
|
||||||
|
|
||||||
def test_depseudonymize_unstructured_job_s3_is_callable():
    """Check that depseudonymize_unstructured_job_s3 is callable and has ``execute_in_process``."""
    job = depseudonymize_unstructured_job_s3
    assert callable(job)
    assert hasattr(job, "execute_in_process")
|
|
||||||
Reference in New Issue
Block a user