From 4e0b216410fc0d0879b583047fc7ba1f0390c612 Mon Sep 17 00:00:00 2001 From: ILay Date: Fri, 24 Apr 2026 18:38:12 +0200 Subject: [PATCH 01/15] feat(SIMPL-24642): consolidate all code locations into template-code-location - Rename src/template-code-location to src/template_code_location - Copy data-processing jobs/ops/config_models - Copy dataframe-level-anonymisation jobs/ops/utils/config_models - Copy field-level-pseudo-anonymisation jobs/ops/techniques/config_models - Update all imports to template_code_location namespace - Merge all jobs into unified repository.py with sensors/resources/loggers - Update pyproject.toml with all dependencies - Update Dockerfile for consolidated image --- Dockerfile | 11 +- pyproject.toml | 28 +- src/template-code-location/repository.py | 6 - .../__init__.py | 0 .../data_processing}/__init__.py | 0 .../data_processing/config_models/__init__.py | 18 + .../aggregation_configuration.py | 25 + .../columns_select_configuration.py | 17 + ...coordinates_normalization_configuration.py | 22 + .../config_models/fill_missing_config.py | 9 + .../config_models/filter_configuration.py | 52 +++ .../spell_check_configuration.py | 8 + .../data_processing/jobs.py | 119 +++++ .../data_processing/ops.py | 256 +++++++++++ .../__init__.py | 0 .../config_models/__init__.py | 13 + .../config_models/base_config.py | 33 ++ .../config_models/hierarchies.py | 18 + .../k_anonymity_configuration.py | 11 + .../l_diversity_configuration.py | 8 + .../t_closeness_configuration.py | 8 + .../dataframe_level_anonymisation/jobs.py | 86 ++++ .../dataframe_level_anonymisation/ops.py | 187 ++++++++ .../dataframe_level_anonymisation/utils.py | 19 + .../__init__.py | 0 .../config_models/__init__.py | 28 ++ .../config_models/languages.py | 72 +++ .../config_models/pii_entities.py | 24 + .../config_models/structured_config.py | 110 +++++ .../config_models/unstructured_config.py | 115 +++++ .../field_level_pseudo_anonymisation/jobs.py | 126 ++++++ 
.../field_level_pseudo_anonymisation/ops.py | 77 ++++ .../techniques/__init__.py | 3 + ...onymisation_pseudonymisation_techniques.py | 42 ++ .../depseudonymisation_techniques.py | 9 + .../unstructured_ops.py | 428 ++++++++++++++++++ .../field_level_pseudo_anonymisation/utils.py | 32 ++ src/template_code_location/jobs/__init__.py | 0 .../jobs/jobs.py | 0 src/template_code_location/ops/__init__.py | 0 .../ops/ops.py | 0 src/template_code_location/repository.py | 65 +++ 42 files changed, 2071 insertions(+), 14 deletions(-) delete mode 100644 src/template-code-location/repository.py rename src/{template-code-location => template_code_location}/__init__.py (100%) rename src/{template-code-location/jobs => template_code_location/data_processing}/__init__.py (100%) create mode 100644 src/template_code_location/data_processing/config_models/__init__.py create mode 100644 src/template_code_location/data_processing/config_models/aggregation_configuration.py create mode 100644 src/template_code_location/data_processing/config_models/columns_select_configuration.py create mode 100644 src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py create mode 100644 src/template_code_location/data_processing/config_models/fill_missing_config.py create mode 100644 src/template_code_location/data_processing/config_models/filter_configuration.py create mode 100644 src/template_code_location/data_processing/config_models/spell_check_configuration.py create mode 100644 src/template_code_location/data_processing/jobs.py create mode 100644 src/template_code_location/data_processing/ops.py rename src/{template-code-location/ops => template_code_location/dataframe_level_anonymisation}/__init__.py (100%) create mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py create mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py create mode 100644 
src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py create mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py create mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py create mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py create mode 100644 src/template_code_location/dataframe_level_anonymisation/jobs.py create mode 100644 src/template_code_location/dataframe_level_anonymisation/ops.py create mode 100644 src/template_code_location/dataframe_level_anonymisation/utils.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/__init__.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/__init__.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/languages.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/pii_entities.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/structured_config.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/jobs.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/ops.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py create mode 100644 src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py create mode 100644 
src/template_code_location/field_level_pseudo_anonymisation/utils.py create mode 100644 src/template_code_location/jobs/__init__.py rename src/{template-code-location => template_code_location}/jobs/jobs.py (100%) create mode 100644 src/template_code_location/ops/__init__.py rename src/{template-code-location => template_code_location}/ops/ops.py (100%) create mode 100644 src/template_code_location/repository.py diff --git a/Dockerfile b/Dockerfile index b61745b..fd4e780 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,16 @@ FROM python:3.12-slim-bookworm +# Install git for git-based dependencies +RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/* + WORKDIR /app COPY pyproject.toml . -RUN pip install --no-cache-dir dagster dagster-webserver - COPY src/ src/ + +# Install the package and all dependencies RUN pip install --no-cache-dir . -EXPOSE 3000 +EXPOSE 4000 -CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "3000", "-m", "template-code-location.repository"] +CMD ["dagster", "code-server", "start", "-h", "0.0.0.0", "-p", "4000", "-f", "src/template_code_location/repository.py"] diff --git a/pyproject.toml b/pyproject.toml index ca2cdc0..3b2741f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,23 +4,43 @@ build-backend = "setuptools.build_meta" [project] name = "template-code-location" -version = "0.0.1" -description = "Template code location for data processings services" +version = "0.1.0" +description = "Consolidated code location for all data services workflows" requires-python = ">=3.12" dependencies = [ + # Dagster core "dagster>=1.8.13", "dagster-webserver>=1.8.13", "dagster-postgres>=0.24.13", - "pandas>=3.0", + # Data processing + "pandas>=2.1.4", "pyarrow>=23.0", + "numpy>=2.4", "lxml>=6.0", "xmltodict>=1.0", "rdflib>=7.6", - "numpy>=2.4", + "openpyxl", + "xlrd>=2.0.1", + "tabulate==0.8.10", + "pyspellchecker>=0.8.4", + "PyGeodesy>=24.6.11", + # Validation "great_expectations>=1.16", 
"pandera>=0.31", + "pydantic>=2.6.0,<3.0.0", + # Scraping "scrapy>=2.15", "BeautifulSoup4>=4.14", + # Anonymisation libraries + "pycanon==1.0.1.post2", + "anjana>=1.0.0", + # Field-level pseudo-anonymisation + "scrubadub", + "scrubadub_spacy", + "hvac", + "cryptography", + # Util services (git dependency) + "util-services @ git+https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git@v0.4.1", ] [project.optional-dependencies] diff --git a/src/template-code-location/repository.py b/src/template-code-location/repository.py deleted file mode 100644 index 10c73e6..0000000 --- a/src/template-code-location/repository.py +++ /dev/null @@ -1,6 +0,0 @@ -from dagster import Definitions -from .jobs.jobs import data_processing_job - -defs = Definitions( - jobs=[data_processing_job], -) diff --git a/src/template-code-location/__init__.py b/src/template_code_location/__init__.py similarity index 100% rename from src/template-code-location/__init__.py rename to src/template_code_location/__init__.py diff --git a/src/template-code-location/jobs/__init__.py b/src/template_code_location/data_processing/__init__.py similarity index 100% rename from src/template-code-location/jobs/__init__.py rename to src/template_code_location/data_processing/__init__.py diff --git a/src/template_code_location/data_processing/config_models/__init__.py b/src/template_code_location/data_processing/config_models/__init__.py new file mode 100644 index 0000000..5833cab --- /dev/null +++ b/src/template_code_location/data_processing/config_models/__init__.py @@ -0,0 +1,18 @@ +"""Configuration models for data processing.""" + +from .columns_select_configuration import ColumnsSelectConfiguration +from .fill_missing_config import FillMissingConfiguration +from .spell_check_configuration import SpellCheckConfiguration +from .coordinates_normalization_configuration import CoordinatesNormalizationConfiguration +from .aggregation_configuration import AggregationConfiguration +from 
.filter_configuration import DatasetFilterConfiguration, FilterCondition + +__all__ = [ + "ColumnsSelectConfiguration", + "FillMissingConfiguration", + "SpellCheckConfiguration", + "CoordinatesNormalizationConfiguration", + "AggregationConfiguration", + "FilterCondition", + "DatasetFilterConfiguration" +] diff --git a/src/template_code_location/data_processing/config_models/aggregation_configuration.py b/src/template_code_location/data_processing/config_models/aggregation_configuration.py new file mode 100644 index 0000000..553740f --- /dev/null +++ b/src/template_code_location/data_processing/config_models/aggregation_configuration.py @@ -0,0 +1,25 @@ +from typing import List + +from pydantic import Field, field_validator + +from .columns_select_configuration import ColumnsSelectConfiguration + + +class AggregationConfiguration(ColumnsSelectConfiguration): + + operation: str = Field( + default="sum", + description="Aggregation operations: sum, mean, min, max, count" + ) + + @field_validator("operation") + @classmethod + def validate_operations(cls, value): + allowed = {"sum", "mean", "min", "max", "count"} + if value not in allowed: + raise ValueError( + f"Invalid aggregation operation '{value}'. " + f"Allowed values: {allowed}" + ) + + return value diff --git a/src/template_code_location/data_processing/config_models/columns_select_configuration.py b/src/template_code_location/data_processing/config_models/columns_select_configuration.py new file mode 100644 index 0000000..658450d --- /dev/null +++ b/src/template_code_location/data_processing/config_models/columns_select_configuration.py @@ -0,0 +1,17 @@ +from typing import List +from pydantic import Field,field_validator +from dagster import Config + + +class ColumnsSelectConfiguration(Config): + columns: List[str] = Field( + default=["Name"], description="List of columns to process." 
+ ) + + @field_validator("columns") + @classmethod + def ensure_unique_columns(cls, v: List[str]) -> List[str]: + + unique_values = list(dict.fromkeys(v)) + + return unique_values diff --git a/src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py b/src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py new file mode 100644 index 0000000..64342e4 --- /dev/null +++ b/src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py @@ -0,0 +1,22 @@ +from typing import Optional + +from pydantic import Field, model_validator +from dagster import Config + + +class CoordinatesNormalizationConfiguration(Config): + latColumn: Optional[str] = Field( + default="lat", description="Latitude column name" + ) + lonColumn: Optional[str] = Field( + default="lon", description="Longitude column name" + ) + + @model_validator(mode="before") + @classmethod + def replace_nulls_with_defaults(cls, values): + if values.get("latColumn") is None: + values["latColumn"] = "lat" + if values.get("lonColumn") is None: + values["lonColumn"] = "lon" + return values diff --git a/src/template_code_location/data_processing/config_models/fill_missing_config.py b/src/template_code_location/data_processing/config_models/fill_missing_config.py new file mode 100644 index 0000000..4c9e5b2 --- /dev/null +++ b/src/template_code_location/data_processing/config_models/fill_missing_config.py @@ -0,0 +1,9 @@ +from typing import Dict +from dagster import Config +from pydantic import Field + + +class FillMissingConfiguration(Config): + fill_map: Dict[str, str] = Field( + default={"Age": "UNKNOWN_AGE"}, description="Missing values filling map." 
+ ) diff --git a/src/template_code_location/data_processing/config_models/filter_configuration.py b/src/template_code_location/data_processing/config_models/filter_configuration.py new file mode 100644 index 0000000..86bde37 --- /dev/null +++ b/src/template_code_location/data_processing/config_models/filter_configuration.py @@ -0,0 +1,52 @@ +from enum import Enum +import operator +from typing import List, Literal, Callable +from pydantic import Field, model_validator +from dagster import Config +import pandas as pd + +class FilterOperator(str, Enum): + EQ = "==" + NE = "!=" + LT = "<" + LE = "<=" + GT = ">" + GE = ">=" + + @property + def function(self) -> Callable: + mapping = { + FilterOperator.EQ: operator.eq, + FilterOperator.NE: operator.ne, + FilterOperator.LT: operator.lt, + FilterOperator.LE: operator.le, + FilterOperator.GT: operator.gt, + FilterOperator.GE: operator.ge, + } + return mapping[self] + +class FilterCondition(Config): + column: str = Field(..., description="Name of the column to filter") + type: Literal["string", "numeric"] = Field(..., description="Column type (string or numeric)") + value: str = Field(..., description="Value to compare against") + op: FilterOperator = Field(default=FilterOperator.EQ, description="Operator to apply (string supports only EQ and NE)") + + @model_validator(mode="after") + def check_operator_compatibility(self) -> "FilterCondition": + if self.type == "string" and self.op not in [FilterOperator.EQ, FilterOperator.NE]: + raise ValueError( + f"Invalid operator '{self.op.name}' for type 'string'. " + "Only EQ (==) and NE (!=) are allowed." + ) + return self + + def apply(self, df: pd.DataFrame) -> pd.Series: + val = float(self.value) if self.type == "numeric" else self.value + return self.op.function(df[self.column], val) + +class DatasetFilterConfiguration(Config): + conditions: List[FilterCondition] = Field( + default=[], + description="List of filter conditions to apply on the dataset. 
" + "String columns support only 'EQ' and 'NE', numeric columns also support 'LT', 'LE', 'GT' and 'GE'." + ) diff --git a/src/template_code_location/data_processing/config_models/spell_check_configuration.py b/src/template_code_location/data_processing/config_models/spell_check_configuration.py new file mode 100644 index 0000000..7a12f87 --- /dev/null +++ b/src/template_code_location/data_processing/config_models/spell_check_configuration.py @@ -0,0 +1,8 @@ +from typing import Literal +from pydantic import Field + +from .columns_select_configuration import ColumnsSelectConfiguration + + +class SpellCheckConfiguration(ColumnsSelectConfiguration): + language: Literal["en", "es", "it", "fr", "pt", "de", "nl"] = Field(default="en", description="Language to use in the SpellChecker module.") diff --git a/src/template_code_location/data_processing/jobs.py b/src/template_code_location/data_processing/jobs.py new file mode 100644 index 0000000..54fb939 --- /dev/null +++ b/src/template_code_location/data_processing/jobs.py @@ -0,0 +1,119 @@ +from dagster import job +from util_services.util_ops import ( + preview_dataframe, + read_csv_from_s3, + write_csv_to_s3, +) +from .ops import ( + remove_duplicates, + fill_missing_values, + standardize_categorical_values, + correct_typos, + normalize_numeric_min_max, + normalize_datetime, + normalize_coordinates, + add_global_aggregations, + filter_dataset +) + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def remove_duplicates_job_s3(): + org_df = read_csv_from_s3() + anon_df = remove_duplicates(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def fill_missing_values_job_s3(): + org_df = read_csv_from_s3() + anon_df = fill_missing_values(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + + +@job(tags={ + "business_operation": 
"PROCESSING", + "resource_type": "RD_DATA" +}) +def standardize_categorical_values_job_s3(): + org_df = read_csv_from_s3() + anon_df = standardize_categorical_values(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def correct_typos_job_s3(): + org_df = read_csv_from_s3() + anon_df = correct_typos(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def normalize_numeric_min_max_job_s3(): + org_df = read_csv_from_s3() + anon_df = normalize_numeric_min_max(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def normalize_datetime_job_s3(): + org_df = read_csv_from_s3() + anon_df = normalize_datetime(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def normalize_coordinates_job_s3(): + org_df = read_csv_from_s3() + anon_df = normalize_coordinates(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def add_global_aggregations_job_s3(): + org_df = read_csv_from_s3() + anon_df = add_global_aggregations(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) + +@job(tags={ + "business_operation": "PROCESSING", + "resource_type": "RD_DATA" +}) +def filter_dataset_job_s3(): + org_df = read_csv_from_s3() + anon_df = filter_dataset(org_df) + preview_dataframe(org_df) + write_csv_to_s3(anon_df) + preview_dataframe(anon_df) diff --git a/src/template_code_location/data_processing/ops.py 
b/src/template_code_location/data_processing/ops.py new file mode 100644 index 0000000..e380cb8 --- /dev/null +++ b/src/template_code_location/data_processing/ops.py @@ -0,0 +1,256 @@ +import pandas as pd +from dagster import Out, op +from spellchecker import SpellChecker + +from template_code_location.data_processing.config_models import ( + AggregationConfiguration, + ColumnsSelectConfiguration, + CoordinatesNormalizationConfiguration, + FillMissingConfiguration, + SpellCheckConfiguration, + DatasetFilterConfiguration +) + + +def _parse_dms_to_decimal(value): + """Parse a DMS (degrees-minutes-seconds) string to decimal degrees using PyGeodesy. + + Supported formats include (but are not limited to): + - 40°26'46"N / 40°26′46″N + - 40 26 46 N + - 40:26:46N + - 40d26m46sN + - -40.446 (already decimal – returned as-is) + + Returns None if parsing fails. + """ + from pygeodesy.dms import parseDMS + + if pd.isna(value): + return None + + text = str(value).strip() + if not text: + return None + + try: + return float(parseDMS(text)) + except (ValueError, TypeError): + try: + return float(text) + except (ValueError, TypeError): + return None + + +@op(out={"data": Out()}) +def remove_duplicates(context, df: pd.DataFrame): + """Remove duplicate rows from the input DataFrame.""" + logger = context.log + + before = df.shape[0] + + df = df.drop_duplicates() + + after = df.shape[0] + + logger.info(f"Removed {before - after} duplicate rows") + + return df + +@op(out={"data": Out()}) +def fill_missing_values(context, config: FillMissingConfiguration, df: pd.DataFrame): + """Fill missing values in the DataFrame according to the configured column-to-value mapping.""" + logger = context.log + + logger.info(f"Filling missing values: {config.fill_map}") + + return df.fillna(config.fill_map) + +@op(out={"data": Out()}) +def standardize_categorical_values(context, config: ColumnsSelectConfiguration, df: pd.DataFrame): + """Standardize categorical values in selected columns by trimming 
whitespace and converting text to lowercase.""" + logger = context.log + + for col in config.columns: + if col not in df.columns: + logger.warning(f"Column '{col}' not found in DataFrame, skipping.") + continue + + original = df[col] + + standardized = ( + df[col] + .fillna("") + .astype(str) + .str.strip() + .str.lower() + ) + + changed_count = (original != standardized).sum() + df[col] = standardized + + logger.info(f"Standardized '{col}' column – {changed_count} values modified") + + return df + +@op(out={"data": Out()}) +def correct_typos(context, config: SpellCheckConfiguration, df: pd.DataFrame): + """Correct spelling mistakes in the specified text columns.""" + logger = context.log + + for column in config.columns: + if column not in df.columns: + logger.warning(f"Column '{column}' not found in DataFrame, skipping.") + continue + + spell = SpellChecker(language=config.language) + + original = df[column].astype(str) + corrected = original.apply(lambda x, spell_checker=spell: spell_checker.correction(x) if x else x) + + changed_count = (original != corrected).sum() + logger.info(f"Corrected typos in '{column}' – {changed_count} values modified") + + df[column] = corrected + + return df + +@op(out={"data": Out()}) +def normalize_datetime(context, config: ColumnsSelectConfiguration, df: pd.DataFrame): + logger = context.log + + for col in config.columns: + if col not in df.columns: + logger.warning(f"Column '{col}' not found, skipping normalization.") + continue + + normalized = pd.to_datetime(df[col], utc=True, format="mixed", dayfirst=True, errors="coerce") + + if normalized.notna().sum() == 0: + logger.warning( + f"Column '{col}' has no normalizable datetime values, skipping." 
+ ) + continue + + iso_col = f"{col}_iso" + + formatted = normalized.dt.strftime("%Y-%m-%dT%H:%M:%SZ").fillna("") + non_empty = formatted[formatted != ""] + if len(non_empty) > 0 and non_empty.str.startswith("1970-01-01").all(): + logger.warning( + f"Column '{col}' all normalized values are '1970-01-01', likely bad input — skipping." + ) + continue + + df[iso_col] = formatted + + logger.info(f"Normalized datetime column '{col}' into '{iso_col}'") + + return df + +@op(out={"data": Out()}) +def normalize_numeric_min_max(context, config: ColumnsSelectConfiguration, df: pd.DataFrame): + logger = context.log + + for col in config.columns: + if col not in df.columns: + logger.warning(f"Column '{col}' not found, skipping normalization.") + continue + + min_val = df[col].min() + max_val = df[col].max() + + if min_val == max_val: + logger.warning(f"Column '{col}' has constant values, skipping normalization.") + continue + + df[col + "_norm"] = (df[col] - min_val) / (max_val - min_val) + logger.info(f"Normalized numeric column '{col}'") + + return df + +@op(out={"data": Out()}) +def normalize_coordinates(context, config: CoordinatesNormalizationConfiguration, df: pd.DataFrame): + logger = context.log + + lat = config.latColumn + lon = config.lonColumn + + for col in [lat, lon]: + if pd.api.types.is_numeric_dtype(df[col]): + logger.info(f"Column '{col}' is numeric — coercing directly") + df[col] = pd.to_numeric(df[col], errors="coerce") + else: + logger.info(f"Column '{col}' is non-numeric — parsing as DMS with PyGeodesy") + df[col] = df[col].apply(_parse_dms_to_decimal) + + invalid_lat = df[lat].isnull().sum() + invalid_lon = df[lon].isnull().sum() + logger.info(f"Found {invalid_lat} invalid latitudes and {invalid_lon} invalid longitudes") + + df[lat] = df[lat].round(4) + df[lon] = df[lon].round(4) + + before_filter_rows = len(df) + df = df[(df[lat].between(-90, 90)) & (df[lon].between(-180, 180))] + after_filter_rows = len(df) + logger.info(f"Filtered coordinates out of 
range: removed {before_filter_rows - after_filter_rows} rows") + + logger.info(f"Coordinate normalization completed: resulting dataframe has {after_filter_rows} rows") + + return df + +@op(out={"data": Out()}) +def add_global_aggregations(context, config: AggregationConfiguration, df: pd.DataFrame): + logger = context.log + + group_by_cols = [] + + for col in config.columns: + if col not in df.columns: + logger.warning(f"Column '{col}' not found, skipping aggregation.") + continue + group_by_cols.append(col) + + if config.operation not in {"sum", "mean", "min", "max", "count"}: + logger.warning(f"Unsupported aggregation '{config.operation}'") + + numeric_cols = df.select_dtypes(include=['number']).columns.tolist() + cols_to_keep = list(set(numeric_cols + group_by_cols)) + df = df[[c for c in cols_to_keep if c in df.columns]] + df = df.groupby(group_by_cols).agg(config.operation).reset_index() + return df + +@op(out={"data": Out()}) +def filter_dataset(context, config: DatasetFilterConfiguration, df: pd.DataFrame): + logger = context.log + total_rows_before = len(df) + + logger.info(f"Starting dataset filtering: initial dataframe has {total_rows_before} rows") + + combined_mask = pd.Series([True] * total_rows_before, index=df.index) + + for condition in config.conditions: + if condition.column not in df.columns: + logger.warning(f"Column '{condition.column}' not found, skipping filtering.") + continue + if df[condition.column].isna().all(): + logger.warning(f"Column '{condition.column}' is empty (all NaN), skipping filtering.") + continue + try: + current_mask = condition.apply(df) + combined_mask &= current_mask + + logger.info(f"Applied filter: {condition.column} {condition.op.value} '{condition.value}'") + except Exception as e: + logger.error(f"Error applying filter on column '{condition.column}': {e}") + + filtered_df = df[combined_mask] + total_rows_after = len(filtered_df) + + logger.info( + f"Filtering completed: {total_rows_after} rows remain " + f"(removed 
{total_rows_before - total_rows_after} rows in total)" + ) + + return filtered_df diff --git a/src/template-code-location/ops/__init__.py b/src/template_code_location/dataframe_level_anonymisation/__init__.py similarity index 100% rename from src/template-code-location/ops/__init__.py rename to src/template_code_location/dataframe_level_anonymisation/__init__.py diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py b/src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py new file mode 100644 index 0000000..0f490b5 --- /dev/null +++ b/src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py @@ -0,0 +1,13 @@ +"""Configuration models for dataframe-level anonymization.""" + +from .k_anonymity_configuration import KAnonymityConfiguration +from .l_diversity_configuration import LDiversityConfiguration +from .t_closeness_configuration import TClosenessConfiguration +from .base_config import BaseConfiguration + +__all__ = [ + "BaseConfiguration", + "KAnonymityConfiguration", + "LDiversityConfiguration", + "TClosenessConfiguration", +] diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py b/src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py new file mode 100644 index 0000000..4abf451 --- /dev/null +++ b/src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py @@ -0,0 +1,33 @@ +from typing import Dict, List +from dagster import Config +from pydantic import Field, field_validator, model_validator + + +class BaseConfiguration(Config): + ident: List[str] = Field(default=["Name"], description="List of identifier column names.") + quasi_identifiers: List[str] = Field(default=["Age"], description="List of quasi-identifier column names.") + supp_level: float = Field(default=50.0, ge=0.0, le=100.0, description="Max suppression allowed (0–100).") + generalisation_hierarchies: 
Dict[str, str] = Field( + default={"Age": "simpl_age"}, description="Hierarchies used to generalize quasi-identifiers." + ) + + @field_validator("quasi_identifiers") + def validate_quasi_identifiers(cls, value): + if not value: + raise ValueError("At least one quasi-identifier must be provided.") + return value + + @field_validator("ident") + def validate_ident(cls, value): + if not value: + raise ValueError("At least one identifier must be provided.") + return value + + @model_validator(mode="after") + def check_no_overlap(self): + ident = set(self.ident) + quasi = set(self.quasi_identifiers) + overlap = ident & quasi + if overlap: + raise ValueError(f"Fields cannot be both identifiers and quasi-identifiers: {overlap}") + return self diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py b/src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py new file mode 100644 index 0000000..65105a0 --- /dev/null +++ b/src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py @@ -0,0 +1,18 @@ +from anjana.anonymity.utils import utils + +simpl_age = { + 0: [age for age in range(0, 100)], + 1: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 5), + 2: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 10), + 3: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 20), + 4: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 100), +} +simpl_age2 = { + 0: [age for age in range(0, 100)], + 1: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 5), +} +simpl_gender = {0: ["M", "F", "O"], 1: ["*", "*", "*"]} + + +def get_all_hierarchies(): + return {name: obj for name, obj in globals().items() if isinstance(obj, dict)} diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py 
b/src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py new file mode 100644 index 0000000..0ddd88f --- /dev/null +++ b/src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py @@ -0,0 +1,11 @@ +from typing import List +from pydantic import Field + +from .base_config import BaseConfiguration + + +class KAnonymityConfiguration(BaseConfiguration): + k: int = Field(default=3, ge=2, description="Desired level of k-anonymity (must be >= 2).") + sensitive_attributes: List[str] = Field( + default=["Disease"], description="List of sensitive attribute column names." + ) diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py b/src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py new file mode 100644 index 0000000..c764f1d --- /dev/null +++ b/src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py @@ -0,0 +1,8 @@ +from pydantic import Field +from .base_config import BaseConfiguration + + +class LDiversityConfiguration(BaseConfiguration): + k: int = Field(default=2, ge=2, description="Desired level of k-anonymity (must be >= 2).") + l: int = Field(default=3, ge=1, description="L-diversity level (must be >= 1)") + sensitive_attribute: str = Field(default="Disease", description="Sensitive attribute name.") diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py b/src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py new file mode 100644 index 0000000..4461539 --- /dev/null +++ b/src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py @@ -0,0 +1,8 @@ +from pydantic import Field +from .base_config import BaseConfiguration + + +class TClosenessConfiguration(BaseConfiguration): + k: int = Field(default=2, 
ge=2, description="Desired level of k-anonymity (must be >= 2).") + t: float = Field(default=0.5, ge=0.0, le=1.0, description="Maximum t-distance threshold.") + sensitive_attribute: str = Field(default="Disease", description="Sensitive attribute name.") diff --git a/src/template_code_location/dataframe_level_anonymisation/jobs.py b/src/template_code_location/dataframe_level_anonymisation/jobs.py new file mode 100644 index 0000000..35c76f7 --- /dev/null +++ b/src/template_code_location/dataframe_level_anonymisation/jobs.py @@ -0,0 +1,86 @@ +from dagster import job +from util_services.util_ops import ( + preview_dataframe, + read_structured_to_df, + write_df_to_local, + read_structured_from_s3, + write_df_to_s3, + write_semistructured_to_s3, + read_semistructured_from_s3 +) + +from .ops import apply_k_anonymity, apply_l_diversity, apply_t_closeness + + +@job(tags={ + "business_operation": "ANONYMISATION" +}) +def k_anonymity_job(): + org_df = read_structured_to_df() + anon_df, _ = apply_k_anonymity(org_df) + preview_dataframe(org_df) + write_df_to_local(anon_df) + preview_dataframe(anon_df) + + +@job(tags={ + "business_operation": "ANONYMISATION" +}) +def l_diversity_job(): + org_df = read_structured_to_df() + anon_df, _ = apply_l_diversity(org_df) + preview_dataframe(org_df) + write_df_to_local(anon_df) + preview_dataframe(anon_df) + + +@job(tags={ + "business_operation": "ANONYMISATION" +}) +def t_closeness_job(): + org_df = read_structured_to_df() + anon_df, _ = apply_t_closeness(org_df) + preview_dataframe(org_df) + write_df_to_local(anon_df) + preview_dataframe(anon_df) + + +@job(tags={ + "business_operation": "ANONYMISATION", + "resource_type": "RD_DATA" +}) +def k_anonymity_job_s3(): + org_df = read_structured_from_s3() + anon_df, _ = apply_k_anonymity(org_df) + preview_dataframe(org_df) + write_df_to_s3(anon_df) + preview_dataframe(anon_df) + + +@job(tags={ + "business_operation": "ANONYMISATION", + "resource_type": "RD_DATA" +}) +def l_diversity_job_s3(): + 
def _calc_dataframe_metrics(df_anon, df_org, quasi_identifiers, sensitive_atttributes):
    """Build the anonymisation/utility report for an anonymised dataframe.

    Args:
        df_anon: Dataframe produced by the anonymisation routine.
        df_org: Dataframe the anonymised one was derived from; only its row
            count is used (for the suppression rate).
        quasi_identifiers: Column names used as quasi-identifiers.
        sensitive_atttributes: Sensitive attribute column names, forwarded to
            the pycanon l-diversity / t-closeness checks.

    Returns:
        A ``(anon_report, metrics)`` tuple: a markdown table and a dict holding
        the same values under the keys ``k_anon``, ``l_div``, ``t_clos``,
        ``supp_rate`` and ``mean_equivalence_class``.
    """
    # Anonymisation metrics, as reported by pycanon.
    # NOTE(review): the trailing ``True`` is passed positionally to pycanon;
    # presumably its generalisation flag -- confirm against the pycanon API.
    k_anon = anonymity.k_anonymity(df_anon, quasi_identifiers)
    l_div = anonymity.l_diversity(df_anon, quasi_identifiers, sensitive_atttributes, True)
    t_clos = anonymity.t_closeness(df_anon, quasi_identifiers, sensitive_atttributes, True)

    # Data utilisation metrics. An empty original frame has nothing to
    # suppress, so report 0.0 instead of dividing by zero.
    original_rows = len(df_org)
    suppression_rate = (1 - len(df_anon) / original_rows) if original_rows else 0.0
    class_count = len(df_anon.groupby(quasi_identifiers))
    mean_equivalence_class_size = len(df_anon) / class_count if class_count else 0

    # The table rows are intentionally long. The ``noqa`` sits on the closing
    # line of the string so it only covers this literal; a bare
    # ``# flake8: noqa`` comment would disable linting for the entire file.
    anon_report = dedent(
        f"""
        ### Anonymization & Data Utilization Metrics

        | Metric | Value | Description |
        |--------|-------|-------------|
        | **k-anonymity** | `k = {k_anon}` | Minimum number of records sharing the same quasi-identifier values. |
        | **l-diversity** | `l = {l_div}` | Diversity of sensitive attributes within each equivalence class. |
        | **t-closeness** | `t = {round(t_clos, 2)}` | Distance between sensitive attribute distribution in a group and the overall dataset. |
        | **Suppression rate** | `{round(suppression_rate, 2)}` | Fraction of records or attributes suppressed to meet privacy requirements. |
        | **Mean equivalence class size** | `{round(mean_equivalence_class_size, 2)}` | Average size of equivalence classes for quasi-identifiers, indicates data grouping. |
        """  # noqa: E501
    )
    metrics = {
        "k_anon": k_anon,
        "l_div": l_div,
        "t_clos": t_clos,
        "supp_rate": suppression_rate,
        "mean_equivalence_class": mean_equivalence_class_size,
    }
    return anon_report, metrics
@op(out={"data": Out(), "metrics": Out()})
def apply_k_anonymity(context, config: KAnonymityConfiguration, df: pd.DataFrame):
    """Dagster op: k-anonymise ``df`` and emit the result plus its metrics.

    Args:
        context: Dagster op context (unused by the body).
        config: Supplies ``ident``, ``quasi_identifiers``, ``k``,
            ``supp_level``, ``sensitive_attributes`` and the hierarchy names
            checked by ``_validate_and_get_hierarchies``.
        df: Input dataframe.

    Yields:
        ``data``: the anonymised dataframe, with the markdown report and the
        JSON-encoded metrics attached as metadata.
        ``metrics``: the raw metrics dict.

    Raises:
        DagsterInvalidInvocationError: if validation fails or the
            anonymisation returns an empty dataframe.
    """
    hier = _validate_and_get_hierarchies(config, df)

    data_anon = k_anonymity(
        df, config.ident, config.quasi_identifiers, config.k, config.supp_level, hier
    )
    # Same guard as apply_l_diversity / apply_t_closeness: fail with a clear
    # message instead of letting the metric computation choke on an empty frame.
    if data_anon.empty:
        raise DagsterInvalidInvocationError(
            "Could not transform the data to k-anonymity, empty dataset returned!"
        )
    # Drop an "index" column only when the anonymisation step introduced it.
    if "index" in data_anon.columns and "index" not in df.columns:
        data_anon = data_anon.drop(columns="index")

    anon_report, metrics = _calc_dataframe_metrics(
        data_anon, df, config.quasi_identifiers, config.sensitive_attributes
    )
    yield Output(
        value=data_anon,
        metadata={
            "metric_report": MarkdownMetadataValue(anon_report),
            "metric_json": json.dumps(metrics),
        },
        output_name="data",
    )
    yield Output(value=metrics, output_name="metrics")
+ ) + anon_report, metrics = _calc_dataframe_metrics( + data_anon, df, config.quasi_identifiers, [config.sensitive_attribute] + ) + yield Output( + value=data_anon, + metadata={ + "metric_report": MarkdownMetadataValue(anon_report), + "metric_json": json.dumps(metrics), + }, + output_name="data", + ) + yield Output(value=metrics, output_name="metrics") + + +@op(out={"data": Out(), "metrics": Out()}) +def apply_t_closeness(context, config: TClosenessConfiguration, df: pd.DataFrame): + + hier = _validate_and_get_hierarchies(config, df) + + try: + data_anon = t_closeness( + df, + config.ident, + config.quasi_identifiers, + config.sensitive_attribute, + config.k, + config.t, + config.supp_level, + hier, + ) + except ValueError as e: + if "Cannot be quasi-identifiers" in str(e): + raise DagsterInvalidInvocationError( + f"T-closeness failed: k-anonymity parameter = {config.k} is too small " + f"for existing hierarchies of {config.quasi_identifiers} in inner k-anonymity call." + ) + else: + # Re-raise other ValueError types with context + raise DagsterInvalidInvocationError(f"T-closeness failed with error: {str(e)}") + + if data_anon.empty: + raise DagsterInvalidInvocationError( + f"Could not transform the data to t-closeness, empty dataset returned! " + f"This may indicate that the t-closeness constraint (t={config.t}) is too strict for the given data." 
def parse_value_list(values):
    """Return ``values`` with purely decimal strings converted to ``int``.

    Anything else (non-strings, signed numbers, empty strings, text) is passed
    through unchanged.

    ``str.isdecimal`` is used rather than ``str.isdigit``: the latter is also
    true for characters such as superscript digits, which ``int()`` rejects
    with ``ValueError``.
    """
    return [int(v) if isinstance(v, str) and v.isdecimal() else v for v in values]


# Hierarchy normalization for Anjana
def normalize_hierarchy_levels(hierarchy_dict):
    """Convert a ``{column: {level: values}}`` mapping into normalised form.

    Level keys are converted with ``int()``. Level 0 is parsed with
    ``parse_value_list`` and wrapped in a NumPy array; every other level keeps
    its value list untouched.

    Args:
        hierarchy_dict: Mapping of column name to a mapping of level (string or
            int) to a list of values.

    Returns:
        A new dict with the same columns and integer level keys.
    """
    normalized = {}
    for column, levels in hierarchy_dict.items():
        column_levels = {}
        for level_key, mapping_list in levels.items():
            level = int(level_key)
            if level == 0:
                column_levels[level] = np.array(parse_value_list(mapping_list))
            else:
                column_levels[level] = mapping_list
        normalized[column] = column_levels
    return normalized
class SupportedLanguages:
    """Lookup tables for the language codes supported by the pipeline.

    ``LANGUAGES`` maps a two-letter code to a locale string and
    ``LANGUAGE_MODELS`` maps the same codes to a model name.
    """

    LANGUAGES: ClassVar[dict[str, str]] = {
        "hr": "hr_HR",  # Croatian
        "da": "da_DK",  # Danish
        "nl": "nl_NL",  # Dutch
        "en": "en_US",  # English
        "fi": "fi_FI",  # Finnish
        "fr": "fr_FR",  # French
        "de": "de_DE",  # German
        "el": "el_GR",  # Greek
        "it": "it_IT",  # Italian
        "lt": "lt_LT",  # Lithuanian
        "pl": "pl_PL",  # Polish
        "pt": "pt_PT",  # Portuguese
        "ro": "ro_RO",  # Romanian
        "sl": "sl_SI",  # Slovenian
        "es": "es_ES",  # Spanish
        "sv": "sv_SE",  # Swedish
    }
    # Annotated like LANGUAGES so both tables are treated as class-level data.
    LANGUAGE_MODELS: ClassVar[dict[str, str]] = {
        "en": "en_core_web_sm",
        "it": "it_core_news_sm",
        "de": "de_core_news_sm",
        "fr": "fr_core_news_sm",
        "es": "es_core_news_sm",
        "nl": "nl_core_news_sm",
        "da": "da_core_news_sm",
        "sv": "sv_core_news_sm",
        "fi": "fi_core_news_sm",
        "pl": "pl_core_news_sm",
        "el": "el_core_news_sm",
        "hr": "hr_core_news_sm",
        "lt": "lt_core_news_sm",
        "pt": "pt_core_news_sm",
        "ro": "ro_core_news_sm",
        "sl": "sl_core_news_sm",
    }

    @classmethod
    def codes(cls) -> list[str]:
        """Return the supported language codes in declaration order."""
        return list(cls.LANGUAGES)

    @classmethod
    def _lookup(cls, table: dict[str, str], code: str) -> str:
        """Fetch ``code`` from ``table``, raising a descriptive ``KeyError``."""
        try:
            return table[code]
        except KeyError:
            raise KeyError(
                f"Unsupported language code {code!r}; "
                f"expected one of: {', '.join(cls.LANGUAGES)}"
            ) from None

    @classmethod
    def get_locale(cls, code: str) -> str:
        """Return the locale string for ``code``.

        Raises:
            KeyError: if ``code`` is not a supported language code.
        """
        return cls._lookup(cls.LANGUAGES, code)

    @classmethod
    def get_language_model(cls, code: str) -> str:
        """Return the language model name for ``code``.

        Raises:
            KeyError: if ``code`` is not a supported language code.
        """
        return cls._lookup(cls.LANGUAGE_MODELS, code)
"hash" + columns: List[str] = PydanticField(default=["example_column"], description="Columns to hash") + algorithm: str = PydanticField(default="sha256", description="Hashing algorithm") + +class EncryptConfig(Config): + type: Literal["encrypt"] = "encrypt" + columns: List[str] = PydanticField(default=["example_column"], description="Columns to encrypt") + key_name: str = PydanticField(default="my_key", description="Key identifier used for encryption") + +class RedactConfig(Config): + type: Literal["redact"] = "redact" + columns: List[str] = PydanticField(default=["example_column"], description="Columns to redact") + +class ReplaceConfig(Config): + type: Literal["replace"] = "replace" + columns: List[str] = PydanticField(default=["example_column"], description="Columns to replace") + new_value: str = PydanticField(default="REPLACED", description="Replacement value") + +class PseudoTechniqueConfig(Config): + technique: Union[HashConfig, EncryptConfig, RedactConfig, ReplaceConfig] = PydanticField( + default={"hash": HashConfig().model_dump(exclude={"type"})}, + discriminator="type" + ) + + +class AnonymisePseudonymizeStructuredConfig(Config): + used_function: List[PseudoTechniqueConfig] = PydanticField( + default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}], + description=("List of functions to be used on column"), + ) + + @model_validator(mode="after") + def ensure_unique_columns(self): + column_to_techniques = self._collect_column_to_techniques() + duplicates = { + col: techs for col, techs in column_to_techniques.items() if len(techs) > 1 + } + + if duplicates: + formatted = "; ".join( + f"{col} -> {', '.join(techs)}" for col, techs in duplicates.items() + ) + raise ValueError(f"Duplicate column(s) across techniques not allowed:\n{formatted}") + + return self + + def _collect_column_to_techniques(self): + """Extract column-to-techniques mapping from used_function list.""" + column_to_techniques = {} + for f in self.used_function: + 
technique_type, cols = self._extract_technique_and_columns(f) + for col in cols: + column_to_techniques.setdefault(col, []).append(technique_type) + return column_to_techniques + + def _extract_technique_and_columns(self, item): + """Extract technique type and columns list from a PseudoTechniqueConfig item (dict or model instance).""" + if isinstance(item, dict): + tech = item.get("technique") or {} + if isinstance(tech, dict): + if "type" in tech: + return tech.get("type"), tech.get("columns") or [] + elif len(tech) == 1: + # variant-key mapping: {'hash': {...}} + technique_type, inner = next(iter(tech.items())) + return technique_type, inner.get("columns") or [] + return None, [] + else: + # item is a PseudoTechniqueConfig instance + technique_type = item.technique.type + cols = getattr(item.technique, "columns", []) + return technique_type, cols + +class DecryptConfig(Config): + type: Literal["decrypt"] = "decrypt" + columns: List[str] = PydanticField(default=["example_column"], description="Columns to decrypt") + key_name: str = PydanticField(default="my_key", description="Key identifier used for decryption") + +class DepseudoTechniqueConfig(Config): + technique: DecryptConfig = PydanticField(default={"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}) + + +class DepseudonymizeStructuredConfig(Config): + used_function: List[DepseudoTechniqueConfig] = PydanticField( + default=[{"technique": {"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}}], + description=("Decryption functions to be used on column"), + ) + + @field_validator("used_function", mode="before") + def _normalize_depseudo_used_function(cls, v): + normalized = [] + for item in v: + if isinstance(item, dict): + normalized.append(DepseudoTechniqueConfig.model_validate(item)) + else: + normalized.append(item) + return normalized + + @model_validator(mode="after") + def ensure_unique_columns(self): + # For depseudonymize, we don't have per-column uniqueness constraints, + 
# but keep a no-op validator to preserve API parity. + return self diff --git a/src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py b/src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py new file mode 100644 index 0000000..abea0b0 --- /dev/null +++ b/src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py @@ -0,0 +1,115 @@ +from typing import List, Literal, Optional, Union + +from dagster import Config +from pydantic import Field as PydanticField, model_validator, field_validator +from .languages import LanguageEnum +from .pii_entities import PIIEntityEnum + + +class HashConfig(Config): + type: Literal["hash"] = "hash" + pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to hash") + algorithm: str = PydanticField(default="sha256", description="Hashing algorithm") + +class EncryptConfig(Config): + type: Literal["encrypt"] = "encrypt" + pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to encrypt") + key_name: str = PydanticField(default="my_key", description="Key identifier used for encryption") + + +class RedactConfig(Config): + type: Literal["redact"] = "redact" + pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to redact") + +class ReplaceConfig(Config): + type: Literal["replace"] = "replace" + pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to replace") + new_value: str = PydanticField(default="REPLACED", description="Replacement value") + +class RetainConfig(Config): + type: Literal["retain"] = "retain" + pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to retain") + +class PseudoTechniqueConfig(Config): + technique: Union[HashConfig, EncryptConfig, RedactConfig, 
class AnonymisePseudonymizeUnstructuredConfig(Config):
    """Config for anonymising/pseudonymising PII found in unstructured text.

    Holds the text language and the list of techniques to apply. Validation
    rejects configurations in which one PII entity is targeted more than once.
    """

    language: LanguageEnum = PydanticField(
        default=LanguageEnum.en,
        description="Language code (must be one of: hr, da, nl, en, fi, fr, de, el, it, lt, pl, pt, ro, sl, es, sv)"
    )
    used_function: List[PseudoTechniqueConfig] = PydanticField(
        default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}],
        description=("List of functions to be used on PIIs"),
    )

    @field_validator("used_function", mode="before")
    def _normalize_used_function(cls, v):
        # Plain dict entries are validated into PseudoTechniqueConfig models;
        # everything else is passed through as given.
        return [
            PseudoTechniqueConfig.model_validate(entry) if isinstance(entry, dict) else entry
            for entry in v
        ]

    @model_validator(mode="after")
    def ensure_unique_pii(self):
        """Raise ``ValueError`` when a PII entity is listed more than once."""
        clashes = {
            pii: techs
            for pii, techs in self._collect_pii_to_techniques().items()
            if len(techs) > 1
        }
        if not clashes:
            return self

        formatted = "; ".join(
            f"{pii} -> {', '.join(techs)}" for pii, techs in clashes.items()
        )
        raise ValueError(f"Duplicate PII(s) across techniques not allowed:\n{formatted}")

    def _collect_pii_to_techniques(self):
        """Map every configured PII entity to the technique types targeting it."""
        mapping = {}
        for entry in self.used_function:
            technique_type, piis = self._extract_technique_and_pii(entry)
            for pii in piis:
                if pii not in mapping:
                    mapping[pii] = []
                mapping[pii].append(technique_type)
        return mapping

    def _extract_technique_and_pii(self, item):
        """Return ``(technique type, PII list)`` for a dict or model entry."""
        if not isinstance(item, dict):
            # Validated PseudoTechniqueConfig instance.
            technique_type = item.technique.type
            piis = getattr(item.technique, "pii", []) or getattr(item.technique, "columns", [])
            return technique_type, piis

        tech = item.get("technique") or {}
        if not isinstance(tech, dict):
            return None, []
        if "type" in tech:
            return tech.get("type"), tech.get("pii") or tech.get("columns") or []
        if len(tech) == 1:
            # Variant-key mapping: {'hash': {...}}
            technique_type, inner = next(iter(tech.items()))
            return technique_type, inner.get("pii") or inner.get("columns") or []
        return None, []
preview_dataframe(df_anon) + write_df_to_local(df_anon) + + +@job(tags={ + "business_operation": "ANONYMISATION_PSEUDONYMISATION", + "resource_type": "RD_DATA" +}) +def anonymize_pseudonymize_structured_job_s3(): + df = read_structured_from_s3() + preview_dataframe(df) + df_anon, metrics = anonymize_pseudonymize_structured(df) + preview_dataframe(df_anon) + write_df_to_s3(df_anon) + + +@job(tags={ + "business_operation": "DEPSEUDONYMISATION" +}) +def depseudonymize_structured_job(): + df = read_structured_to_df() + preview_dataframe(df) + df_anon, metrics = depseudonymize_structured(df) + preview_dataframe(df_anon) + write_df_to_local(df_anon) + + +@job(tags={ + "business_operation": "DEPSEUDONYMISATION", + "resource_type": "RD_DATA" +}) +def depseudonymize_structured_job_s3(): + df = read_structured_from_s3() + preview_dataframe(df) + df_anon, metrics = depseudonymize_structured(df) + preview_dataframe(df_anon) + write_df_to_s3(df_anon) + + +@job(tags={ + "business_operation": "ANONYMISATION_PSEUDONYMISATION" +}) +def anonymize_pseudonymize_depseudonymize_structured_job(): + df = read_structured_to_df() + preview_dataframe(df) + df_pseduo, metrics = anonymize_pseudonymize_structured(df) + preview_dataframe(df_pseduo) + df_depseduo, metrics = depseudonymize_structured(df_pseduo) + preview_dataframe(df_depseduo) + + +@job(tags={ + "business_operation": "ANONYMISATION_PSEUDONYMISATION" +}) +def anonymize_pseudonymize_unstructured_job(): + text = read_txt_to_string() + preview_txt(text) + text_anon, metrics = anonymize_pseudonymize_unstructured(text) + preview_txt(text_anon) + preview_txt(metrics) + write_string_to_txt(text_anon) + + +@job(tags={ + "business_operation": "ANONYMISATION_PSEUDONYMISATION", + "resource_type": "RD_DATA" +}) +def anonymize_pseudonymize_unstructured_job_s3(): + text = read_txt_from_s3() + preview_txt(text) + text_anon, metrics = anonymize_pseudonymize_unstructured(text) + preview_txt(text_anon) + preview_txt(metrics) + 
write_text_to_s3(text_anon) + + +@job(tags={ + "business_operation": "DEPSEUDONYMISATION" +}) +def depseudonymize_unstructured_job(): + text = read_txt_to_string() + preview_txt(text) + text_anon, metrics = depseudonymize_unstructured(text) + preview_txt(text_anon) + write_string_to_txt(text_anon) + + +@job(tags={ + "business_operation": "DEPSEUDONYMISATION", + "resource_type": "RD_DATA" +}) +def depseudonymize_unstructured_job_s3(): + text = read_txt_from_s3() + preview_txt(text) + text_anon, metrics = depseudonymize_unstructured(text) + preview_txt(text_anon) + write_text_to_s3(text_anon) diff --git a/src/template_code_location/field_level_pseudo_anonymisation/ops.py b/src/template_code_location/field_level_pseudo_anonymisation/ops.py new file mode 100644 index 0000000..a485ff9 --- /dev/null +++ b/src/template_code_location/field_level_pseudo_anonymisation/ops.py @@ -0,0 +1,77 @@ +import pandas as pd +import numpy as np +from dagster import Out, Output, op +from cryptography.fernet import InvalidToken +from template_code_location.field_level_pseudo_anonymisation.config_models import ( + AnonymisePseudonymizeStructuredConfig, + DepseudonymizeStructuredConfig, +) +from template_code_location.field_level_pseudo_anonymisation.techniques import ( + anonymisation_pseudonymisation_techniques as anon_pseudo_funcs, +) +import template_code_location.field_level_pseudo_anonymisation.techniques.depseudonymisation_techniques as depseudo_funcs +from .utils import create_get_encryption_key + + +def _apply_column_wise_function(config, df, funcs): + for used_function in config.used_function: + func_name = used_function.technique.type + columns = used_function.technique.columns + func = getattr(funcs, func_name) + params = used_function.technique.model_dump() + del params["type"] + del params["columns"] + + if func_name in ["encrypt", "decrypt"]: + key_name = used_function.technique.key_name + del params["key_name"] + params["key"] = create_get_encryption_key(func_name, key_name) 
+ + missing = [col for col in columns if col not in df.columns] + if missing: + raise ValueError( + f"The following columns required by technique '{func_name}' " + f"are not present in the DataFrame: {', '.join(missing)}" + ) + + # Skip processing if DataFrame is empty + if len(df) == 0: + continue + + for column in columns: + try: + vectorized_func = np.vectorize(lambda x: func(x, **params)) + df[column] = vectorized_func(df[column].to_numpy()) + except InvalidToken: + raise ValueError( + f"Invalid Fernet token while decrypting column '{column}' " + f"using key '{key_name}'. The data may not be encrypted " + f"or the key may be incorrect. " + ) + return df + + +@op(out={"data": Out(), "metrics": Out()}) +def anonymize_pseudonymize_structured( + context, config: AnonymisePseudonymizeStructuredConfig, df: pd.DataFrame +): + + df = _apply_column_wise_function(config, df, anon_pseudo_funcs) + yield Output( + value=df, + metadata={}, + output_name="data", + ) + yield Output(value={}, output_name="metrics") + + +@op(out={"data": Out(), "metrics": Out()}) +def depseudonymize_structured(context, config: DepseudonymizeStructuredConfig, df: pd.DataFrame): + + df = _apply_column_wise_function(config, df, depseudo_funcs) + yield Output( + value=df, + metadata={}, + output_name="data", + ) + yield Output(value={}, output_name="metrics") diff --git a/src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py b/src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py new file mode 100644 index 0000000..128c371 --- /dev/null +++ b/src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py @@ -0,0 +1,3 @@ +from .anonymisation_pseudonymisation_techniques import hash, redact, replace, encrypt # noqa: F401 + +from .depseudonymisation_techniques import decrypt # noqa: F401 diff --git a/src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py 
b/src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py new file mode 100644 index 0000000..ce15613 --- /dev/null +++ b/src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py @@ -0,0 +1,42 @@ +import hashlib +from cryptography.fernet import Fernet + + +def hash(value: str, algorithm: str = "sha256") -> str: + """ + Hash the value using the specified algorithm (default: SHA-256). + """ + value = str(value) + hash_func = hashlib.new(algorithm) + hash_func.update(value.encode("utf-8")) + return hash_func.hexdigest() + + +def redact(value: str) -> str: + """ + Redact the column and return an empty string + """ + return "" + + +def replace(value: str, new_value) -> str: + """ + Replace the value column with the provided value + """ + return new_value + + +def encrypt(value: str, key: bytes) -> str: + """ + Encrypt the value using the provided Fernet key. + """ + value = str(value) + f = Fernet(key) + return f.encrypt(value.encode()).decode() + + +def retain(value: str) -> str: + """ + Retain the original value without any changes. + """ + return value diff --git a/src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py b/src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py new file mode 100644 index 0000000..4e0937c --- /dev/null +++ b/src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py @@ -0,0 +1,9 @@ +from cryptography.fernet import Fernet + + +def decrypt(value: str, key: bytes) -> str: + """ + Decrypt a string using the provided Fernet key. 
+ """ + f = Fernet(key) + return f.decrypt(value.encode()).decode() diff --git a/src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py b/src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py new file mode 100644 index 0000000..f8f0ffe --- /dev/null +++ b/src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py @@ -0,0 +1,428 @@ +import importlib +import importlib.abc +import importlib.machinery +import re +import sys +import types + + +# --------------------------------------------------------------------------- +# Stub out the `transformers` and `spacy_transformers` packages before any +# other import triggers spaCy's entry-point scan or scrubadub_spacy's runtime +# import of spacy_transformers.pipeline_component. +# --------------------------------------------------------------------------- +_STUB_PACKAGES = ("transformers", "spacy_transformers") + + +class _StubModule(types.ModuleType): + """Module that returns a dummy class for any attribute access.""" + + def __getattr__(self, name: str): + return type(name, (), {}) + + +class _StubFinder(importlib.abc.MetaPathFinder): + """Intercept any import under the stubbed packages and return a stub module.""" + + def find_spec(self, fullname, path=None, target=None): # noqa: ANN001 + for pkg in _STUB_PACKAGES: + if fullname == pkg or fullname.startswith(pkg + "."): + return importlib.machinery.ModuleSpec(fullname, _StubLoader()) + return None + + +class _StubLoader(importlib.abc.Loader): + def create_module(self, spec): # noqa: ANN001 + mod = _StubModule(spec.name) + mod.__path__ = [] # mark as package + mod.__spec__ = spec + return mod + + def exec_module(self, module): # noqa: ANN001 + pass + + +# Install the finder once, before scrubadub / spacy are imported. 
def _initialize_scrubber(language: str) -> scrubadub.Scrubber:
    """Build a scrubadub ``Scrubber`` configured for ``language``.

    The scrubber is created with the locale returned by
    ``SupportedLanguages.get_locale`` and extended with:

    * a spaCy entity detector using the model returned by
      ``SupportedLanguages.get_language_model``;
    * the date-of-birth detector, for ``en`` and ``de`` only;
    * ``SIMPLCredentialDetector``, which replaces scrubadub's own
      ``CredentialDetector``.

    Args:
        language: Language code understood by ``SupportedLanguages``.

    Returns:
        The configured ``scrubadub.Scrubber``.
    """

    class SIMPLCredentialDetector(RegexDetector):
        """
        Remove username/password combinations from dirty ``text``.
        """

        filth_cls = CredentialFilth
        name = "credential"
        autoload = True

        # The two captures must be *named* groups: ``(?P[...])`` without a
        # ``<name>`` is not a valid pattern and fails at ``re.compile`` time.
        # The names mirror scrubadub's built-in CredentialDetector.
        regex = re.compile(
            r"""
            (?:username|login|u:)\s*(?::\s*)?
            (?P<username>[\w.\-@+]+)
            [\s\S]{0,500}?
            (?:password|pw|p:)\s*(?::\s*)?
            (?P<password>[^\s]+)
            """,
            re.MULTILINE | re.VERBOSE | re.IGNORECASE,
        )

    locale = SupportedLanguages.get_locale(language)
    scrubber = scrubadub.Scrubber(locale=locale)

    model_name = SupportedLanguages.get_language_model(language)
    spacy_detector = scrubadub_spacy.detectors.SpacyEntityDetector(model=model_name)
    spacy_detector.named_entities = {
        "PERSON",
        "PER",
        "ORG",
        "persName",
        "PRS",
    }  # Need to set it after the constructor because scrubadub_spacy uses upper on all entries
    spacy_detector.filth_cls_map["persName"] = NameFilth  # Required because PL uses persName
    spacy_detector.filth_cls_map["PRS"] = NameFilth  # Required for swedish that uses PRS
    scrubber.add_detector(spacy_detector)
    if language in ["en", "de"]:
        scrubber.add_detector(
            scrubadub.detectors.DateOfBirthDetector
        )  # add optional date of birth detector
    scrubber.remove_detector(
        scrubadub.detectors.CredentialDetector
    )  # remove the not so great credentials detector and replace with custom SIMPL one
    scrubber.add_detector(SIMPLCredentialDetector())
    return scrubber
def _build_metrics_dict(
    pii_counts: dict[str, int],
    text: str,
    anon_text: str,
    technique_map: dict[PIIEntityEnum, PseudoTechniqueConfig],
) -> dict:
    """Collect the raw figures used by the anonymisation report.

    Args:
        pii_counts: Number of replacements made per PII entity name.
        text: The original input text.
        anon_text: The text after all replacements were applied.
        technique_map: Routing map from PII entity to its technique config.

    Returns:
        A dict with the total number of detections, the per-type counts
        (the same ``pii_counts`` object, not a copy), both text lengths and
        the technique type configured for each entity name.
    """
    detected_total = sum(pii_counts.values())
    original_length = len(text)
    anonymised_length = len(anon_text)

    applied_techniques = {}
    for entity, routed_cfg in technique_map.items():
        applied_techniques[entity.name] = routed_cfg.technique.type

    return {
        "total_pii_detected": detected_total,
        "pii_by_type": pii_counts,
        "text_length_original": original_length,
        "text_length_anonymised": anonymised_length,
        "techniques_applied": applied_techniques,
    }
def _apply_depseudonimisation_function(config, input_text: str, funcs_module):
    """
    Searches and depseudonymizes text segments formatted as:
    {technique:pseudonymized_value}

    For every entry in ``config.used_function`` the callable named by
    ``technique.type`` is looked up on ``funcs_module`` and applied to the
    captured value of each matching segment. For the ``decrypt`` technique the
    segment marker is ``encrypt`` and the Fernet key is fetched through
    ``create_get_encryption_key``; for any other technique the marker is
    empty. Techniques run one after another, each on the text produced by the
    previous one.

    This is a generator: it yields the restored text first and then the
    metrics dict ``{"total_depseudo_count": <int>}``. No work is done until it
    is iterated or unpacked.

    Raises:
        ValueError: if the technique raises ``InvalidToken``.
        RuntimeError: if the technique raises any other exception.
    """

    total_depseudo_count = 0
    depseudonimized_text = input_text  # Initialize with input text

    # Loop through each depseudonymisation technique defined in the config
    for used_function in config.used_function:
        technique = used_function.technique
        func_name = technique.type
        func = getattr(funcs_module, func_name)

        # Prepare parameters
        params = technique.model_dump()
        del params["type"]

        # Bound for every technique so the error handler below can always
        # reference it (previously it only existed on the decrypt path).
        key_name = None
        pseudo_anon_func = ""

        if func_name == "decrypt":
            key_name = technique.key_name
            del params["key_name"]
            pseudo_anon_func = "encrypt"
            params["key"] = create_get_encryption_key(func_name, key_name)

        # Regex pattern for this technique, e.g. {encrypt:...}; the marker is
        # escaped so it is always matched literally.
        pattern = rf"\{{{re.escape(pseudo_anon_func)}:([^}}]+)\}}"

        def replace_match(match):
            nonlocal total_depseudo_count
            pseudovalue = match.group(1)
            try:
                restored = func(pseudovalue, **params)
            except InvalidToken as exc:
                raise ValueError(
                    f"Invalid Fernet token while decrypting value using key '{key_name}'. "
                    f"The data may not be encrypted or the key may be incorrect."
                ) from exc
            except Exception as e:
                raise RuntimeError(
                    f"Error during depseudonymisation with '{func_name}': {e}"
                ) from e
            total_depseudo_count += 1
            return restored

        # Apply replacements for this technique. A callable replacement is
        # used, so backslashes in the restored value are inserted verbatim.
        depseudonimized_text = re.sub(pattern, replace_match, depseudonimized_text)

    yield depseudonimized_text
    yield {"total_depseudo_count": total_depseudo_count}
def _apply_replacements(text, replacements, logger):
    """Return ``text`` with every ``(start, end, replacement)`` span substituted.

    Spans are processed in ascending ``start`` order. When a span overlaps
    text that an earlier span already replaced, a warning is logged and:

    * if it only partially overlaps, its start is moved to the end of the
      covered region and its replacement is still inserted, so the remaining
      characters are never emitted in clear;
    * if it lies entirely inside the covered region, it is dropped. The
      previous implementation moved the cursor backwards in this case and
      re-emitted original characters after the replacement.

    Args:
        text: The original text.
        replacements: Iterable of ``(start, end, replacement)`` tuples. It is
            not modified.
        logger: Object exposing ``info``, ``debug`` and ``warning``.

    Returns:
        The text with all replacements applied, or ``text`` unchanged when
        there are no replacements.
    """
    if not replacements:
        logger.info("No PII detected; returning original text.")
        return text

    logger.debug(f"Applying {len(replacements)} replacements to text body.")
    # sorted() is stable like list.sort() but leaves the caller's list intact.
    ordered = sorted(replacements, key=lambda r: r[0])

    result_parts = []
    last = 0  # index just past the text already emitted or replaced
    prev_start, prev_end = None, None  # last span actually applied

    for start, end, repl in ordered:
        if start < last:
            logger.warning(
                f"Overlapping PII detected at positions "
                f"({prev_start},{prev_end}) "
                f"and ({start},{end}). "
                f"Using first match."
            )
            if end <= last:
                # Fully covered by an earlier replacement: nothing left to do.
                continue
            # Keep only the part that is not covered yet.
            start = last

        result_parts.append(text[last:start])
        result_parts.append(repl)
        last = end
        prev_start, prev_end = start, end

    result_parts.append(text[last:])
    return "".join(result_parts)
similarity index 100% rename from src/template-code-location/jobs/jobs.py rename to src/template_code_location/jobs/jobs.py diff --git a/src/template_code_location/ops/__init__.py b/src/template_code_location/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/template-code-location/ops/ops.py b/src/template_code_location/ops/ops.py similarity index 100% rename from src/template-code-location/ops/ops.py rename to src/template_code_location/ops/ops.py diff --git a/src/template_code_location/repository.py b/src/template_code_location/repository.py new file mode 100644 index 0000000..cf97606 --- /dev/null +++ b/src/template_code_location/repository.py @@ -0,0 +1,65 @@ +from dagster import Definitions +from util_services.resources import s3_resource +from util_services.sensors import ( + notify_success, + notify_failure, + notify_canceled +) +from util_services.custom_json_logger import simpl_json_logger + +# Data processing jobs +from template_code_location.data_processing.jobs import ( + remove_duplicates_job_s3, + fill_missing_values_job_s3, + standardize_categorical_values_job_s3, + correct_typos_job_s3, + normalize_numeric_min_max_job_s3, + normalize_datetime_job_s3, + normalize_coordinates_job_s3, + add_global_aggregations_job_s3, + filter_dataset_job_s3, +) + +# Dataframe-level anonymisation jobs +from template_code_location.dataframe_level_anonymisation.jobs import ( + k_anonymity_job_s3, + l_diversity_job_s3, + t_closeness_job_s3, + read_write_semistructured_job_s3, +) + +# Field-level pseudo-anonymisation jobs +from template_code_location.field_level_pseudo_anonymisation.jobs import ( + anonymize_pseudonymize_structured_job_s3, + depseudonymize_structured_job_s3, + anonymize_pseudonymize_unstructured_job_s3, + depseudonymize_unstructured_job_s3, +) + +defs = Definitions( + jobs=[ + # Data processing + remove_duplicates_job_s3, + fill_missing_values_job_s3, + standardize_categorical_values_job_s3, + correct_typos_job_s3, + 
normalize_numeric_min_max_job_s3, + normalize_datetime_job_s3, + normalize_coordinates_job_s3, + add_global_aggregations_job_s3, + filter_dataset_job_s3, + # Dataframe-level anonymisation + k_anonymity_job_s3, + l_diversity_job_s3, + t_closeness_job_s3, + read_write_semistructured_job_s3, + # Field-level pseudo-anonymisation + anonymize_pseudonymize_structured_job_s3, + depseudonymize_structured_job_s3, + anonymize_pseudonymize_unstructured_job_s3, + depseudonymize_unstructured_job_s3, + ], + sensors=[notify_success, notify_failure, notify_canceled], + resources={"s3": s3_resource.configured({"resource_name": "selfS3"})}, + loggers={"simpl": simpl_json_logger}, +) From d14b2dfac46fd193705231eb38f618e5933b7add Mon Sep 17 00:00:00 2001 From: ILay Date: Fri, 24 Apr 2026 18:42:07 +0200 Subject: [PATCH 02/15] feat(SIMPL-24642): migrate tests from 3 source repos with updated imports --- tests/__init__.py | 1 + tests/data_processing/__init__.py | 1 + tests/data_processing/conftest.py | 53 + tests/data_processing/conftest_utils.py | 7 + tests/data_processing/test_config_models.py | 202 +++ tests/data_processing/test_integration.py | 185 +++ tests/data_processing/test_jobs.py | 56 + tests/data_processing/test_ops.py | 700 +++++++++++ .../dataframe_level_anonymisation/__init__.py | 1 + .../config_models/__init__.py | 1 + .../config_models/test_base_config.py | 54 + .../config_models/test_hierarchies.py | 48 + .../config_models/test_k_anonymity_config.py | 41 + .../config_models/test_l_diversity_config.py | 44 + .../config_models/test_t_closeness_config.py | 56 + .../test_jobs.py | 44 + .../dataframe_level_anonymisation/test_ops.py | 230 ++++ .../test_utils.py | 70 ++ .../__init__.py | 1 + .../conftest.py | 444 +++++++ .../test_config_models_coverage.py | 633 ++++++++++ .../test_decrypt_structured.py | 1090 ++++++++++++++++ .../test_decrypt_unstructured.py | 288 +++++ .../test_encrypt_structured.py | 1119 +++++++++++++++++ .../test_encrypt_unstructured.py | 853 +++++++++++++ 
.../test_jobs.py | 58 + 26 files changed, 6280 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/data_processing/__init__.py create mode 100644 tests/data_processing/conftest.py create mode 100644 tests/data_processing/conftest_utils.py create mode 100644 tests/data_processing/test_config_models.py create mode 100644 tests/data_processing/test_integration.py create mode 100644 tests/data_processing/test_jobs.py create mode 100644 tests/data_processing/test_ops.py create mode 100644 tests/dataframe_level_anonymisation/__init__.py create mode 100644 tests/dataframe_level_anonymisation/config_models/__init__.py create mode 100644 tests/dataframe_level_anonymisation/config_models/test_base_config.py create mode 100644 tests/dataframe_level_anonymisation/config_models/test_hierarchies.py create mode 100644 tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py create mode 100644 tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py create mode 100644 tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py create mode 100644 tests/dataframe_level_anonymisation/test_jobs.py create mode 100644 tests/dataframe_level_anonymisation/test_ops.py create mode 100644 tests/dataframe_level_anonymisation/test_utils.py create mode 100644 tests/field_level_pseudo_anonymisation/__init__.py create mode 100644 tests/field_level_pseudo_anonymisation/conftest.py create mode 100644 tests/field_level_pseudo_anonymisation/test_config_models_coverage.py create mode 100644 tests/field_level_pseudo_anonymisation/test_decrypt_structured.py create mode 100644 tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py create mode 100644 tests/field_level_pseudo_anonymisation/test_encrypt_structured.py create mode 100644 tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py create mode 100644 tests/field_level_pseudo_anonymisation/test_jobs.py diff --git a/tests/__init__.py 
b/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/data_processing/__init__.py b/tests/data_processing/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/data_processing/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/data_processing/conftest.py b/tests/data_processing/conftest.py new file mode 100644 index 0000000..9eda2af --- /dev/null +++ b/tests/data_processing/conftest.py @@ -0,0 +1,53 @@ +"""Pytest configuration and shared fixtures.""" + +import pytest +import pandas as pd +from unittest.mock import MagicMock, patch +import sys +from dagster import build_op_context + +# Mock external dependencies that might not be available in test environment +sys.modules['spellchecker'] = MagicMock() + + +@pytest.fixture +def mock_context(): + """Create a mock Dagster context for testing operations.""" + context = build_op_context() + return context + + +@pytest.fixture +def sample_dataframe(): + """Create a sample DataFrame for testing.""" + return pd.DataFrame({ + 'Name': ['John Doe', 'jane smith', 'John Doe', 'bob johnson', 'John Doe'], + 'Age': [25, 30, 25, None, 25], + 'City': ['New York', 'los angeles', 'New York', 'chicago', 'New York'], + 'Status': ['Active', 'INACTIVE', 'Active', 'penDing', 'Active'] + }) + + +@pytest.fixture +def sample_dataframe_with_typos(): + """Create a sample DataFrame with typos for spell checking.""" + return pd.DataFrame({ + 'Name': ['jon doe', 'jane smith', 'bob jonson'], + 'Description': ['developer', 'analst', 'enginer'] + }) + + +@pytest.fixture +def empty_dataframe(): + """Create an empty DataFrame.""" + return pd.DataFrame() + + +@pytest.fixture +def dataframe_with_missing_values(): + """Create a DataFrame with various missing values.""" + return pd.DataFrame({ + 'Column1': [1, None, 3, None, 5], + 'Column2': ['a', 'b', None, 'd', None], + 'Column3': [None, None, None, None, None] + }) diff --git 
class TestColumnsSelectConfiguration:
    """Tests for ColumnsSelectConfiguration."""

    def test_default_columns(self):
        """Test default columns configuration."""
        config = ColumnsSelectConfiguration()
        assert config.columns == ['Name']

    def test_custom_columns(self):
        """Test custom columns configuration."""
        config = ColumnsSelectConfiguration(columns=['Col1', 'Col2', 'Col3'])
        assert config.columns == ['Col1', 'Col2', 'Col3']

    def test_empty_columns_list(self):
        """Test with empty columns list."""
        config = ColumnsSelectConfiguration(columns=[])
        assert config.columns == []

    def test_single_column(self):
        """Test with a single column."""
        config = ColumnsSelectConfiguration(columns=['SingleCol'])
        assert config.columns == ['SingleCol']

    def test_columns_with_special_characters(self):
        """Test columns with special characters."""
        config = ColumnsSelectConfiguration(columns=['Col-1', 'Col_2', 'Col.3'])
        assert config.columns == ['Col-1', 'Col_2', 'Col.3']

    def test_duplicate_columns_are_removed(self):
        """Verify that duplicates are removed while preserving order (reportedly via dict.fromkeys in the model)."""
        config = ColumnsSelectConfiguration(columns=['A', 'B', 'A', 'C', 'B'])

        assert config.columns == ['A', 'B', 'C']

    def test_duplicate_default_behavior(self):
        """Verify that extreme inputs (the same column repeated) are handled correctly as well."""
        config = ColumnsSelectConfiguration(columns=['Name', 'Name', 'Name'])
        assert config.columns == ['Name']
SpellCheckConfiguration() + + assert config.columns == ['Name'] + assert config.language == 'en' + + def test_custom_spell_check_config(self): + """Test custom spell check configuration.""" + config = SpellCheckConfiguration( + columns=['Description', 'Notes'], + language='es' + ) + + assert config.columns == ['Description', 'Notes'] + assert config.language == 'es' + + def test_spell_check_all_languages(self): + """Test spell check with all supported languages.""" + supported_languages = ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl'] + + for lang in supported_languages: + config = SpellCheckConfiguration(language=lang) + assert config.language == lang + + def test_spell_check_invalid_language(self): + """Test spell check with invalid language.""" + with pytest.raises(ValidationError): + SpellCheckConfiguration(language='invalid') + + def test_spell_check_multiple_columns(self): + """Test spell check with multiple columns.""" + columns = ['Col1', 'Col2', 'Col3', 'Col4'] + config = SpellCheckConfiguration(columns=columns) + + assert config.columns == columns + + def test_spell_check_empty_columns(self): + """Test spell check with empty columns list.""" + config = SpellCheckConfiguration(columns=[]) + + assert config.columns == [] + assert config.language == 'en' + + def test_spell_check_inheritance(self): + """Test that SpellCheckConfiguration inherits from ColumnsSelectConfiguration.""" + config = SpellCheckConfiguration() + + assert isinstance(config, ColumnsSelectConfiguration) + assert hasattr(config, 'columns') + assert hasattr(config, 'language') + + @pytest.mark.parametrize("language", ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl']) + def test_spell_check_languages_parametrized(self, language): + """Test spell check with parametrized languages.""" + config = SpellCheckConfiguration(language=language) + assert config.language == language + +class TestAggregationConfiguration: + """Tests for AggregationConfiguration.""" + + def test_aggregation_default_config(self): + 
"""Test default aggregation configuration.""" + config = AggregationConfiguration() + + assert config.columns == ['Name'] + assert config.operation == 'sum' + + @pytest.mark.parametrize("op", ["sum", "mean", "min", "max", "count"]) + def test_aggregation_valid_operations(self, op): + """Test all allowed aggregation operations.""" + config = AggregationConfiguration(operation=op) + assert config.operation == op + + def test_aggregation_invalid_operation(self): + """Test that an invalid operation raises a ValidationError.""" + with pytest.raises(ValidationError) as excinfo: + AggregationConfiguration(operation="invalid_op") + + assert "Invalid aggregation operation 'invalid_op'" in str(excinfo.value) + + def test_aggregation_custom_columns(self): + """Test aggregation with custom columns.""" + config = AggregationConfiguration(columns=['Price', 'Quantity'], operation='mean') + + assert config.columns == ['Price', 'Quantity'] + assert config.operation == 'mean' + + def test_aggregation_inheritance(self): + """Test that AggregationConfiguration inherits from ColumnsSelectConfiguration.""" + config = AggregationConfiguration() + + assert isinstance(config, ColumnsSelectConfiguration) + assert hasattr(config, 'columns') + assert hasattr(config, 'operation') + + def test_aggregation_model_dump(self): + """Test that model_dump contains all expected fields (useful for the Dagster op).""" + config = AggregationConfiguration(columns=['Value'], operation='max') + dump = config.model_dump() + + assert dump['columns'] == ['Value'] + assert dump['operation'] == 'max' diff --git a/tests/data_processing/test_integration.py b/tests/data_processing/test_integration.py new file mode 100644 index 0000000..c9d01eb --- /dev/null +++ b/tests/data_processing/test_integration.py @@ -0,0 +1,185 @@ +"""Integration tests for data processing jobs.""" + +import pytest +import pandas as pd +from unittest.mock import patch, MagicMock +from template_code_location.data_processing.ops import ( + 
remove_duplicates, + fill_missing_values, + standardize_categorical_values, + correct_typos +) +from template_code_location.data_processing.config_models import ( + FillMissingConfiguration, + ColumnsSelectConfiguration, + SpellCheckConfiguration +) + + +class TestPipelineIntegration: + """Integration tests for data processing pipeline.""" + + def test_pipeline_remove_duplicates_then_standardize(self, mock_context): + """Test pipeline: remove duplicates then standardize.""" + df = pd.DataFrame({ + 'Name': [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson'], + 'City': ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago'] + }) + + # Step 1: Remove duplicates + df_no_dupes = remove_duplicates(mock_context, df) + assert df_no_dupes.shape[0] == 3 + + # Step 2: Standardize + config = ColumnsSelectConfiguration(columns=['Name', 'City']) + df_standardized = standardize_categorical_values(mock_context, config, df_no_dupes) + + assert df_standardized['Name'].iloc[0] == 'john doe' + assert df_standardized['City'].iloc[0] == 'new york' + + def test_pipeline_fill_missing_then_standardize(self, mock_context): + """Test pipeline: fill missing values then standardize.""" + df = pd.DataFrame({ + 'Category': [' ACTIVE ', None, ' PENDING '], + 'Value': ['1', '2', None] + }) + + # Step 1: Fill missing values + fill_config = FillMissingConfiguration(fill_map={'Value': '0'}) + df_filled = fill_missing_values(mock_context, fill_config, df) + + # Step 2: Standardize + std_config = ColumnsSelectConfiguration(columns=['Category']) + df_standardized = standardize_categorical_values(mock_context, std_config, df_filled) + + assert df_standardized['Category'].iloc[0] == 'active' + assert df_filled['Value'].iloc[2] == '0' + + def test_pipeline_all_operations(self, mock_context): + """Test complete pipeline with all operations.""" + df = pd.DataFrame({ + 'Name': [' john doe ', 'JANE SMITH', ' john doe ', None], + 'Value': ['1', None, '1', '2'] + }) + + # Step 1: Remove duplicates + df = 
remove_duplicates(mock_context, df) + assert df.shape[0] == 3 + + # Step 2: Fill missing + fill_config = FillMissingConfiguration(fill_map={'Value': '0'}) + df = fill_missing_values(mock_context, fill_config, df) + assert df['Value'].isna().sum() == 0 + + # Step 3: Standardize + std_config = ColumnsSelectConfiguration(columns=['Name']) + df = standardize_categorical_values(mock_context, std_config, df) + + assert df['Name'].iloc[0] == 'john doe' + + def test_pipeline_with_large_dataset(self, mock_context): + """Test pipeline performance with larger dataset.""" + # Create larger dataset + size = 1000 + df = pd.DataFrame({ + 'ID': list(range(size)), + 'Name': ['User_' + str(i % 50) for i in range(size)], + 'Status': ['ACTIVE', 'INACTIVE', 'PENDING'] * (size // 3) + ['ACTIVE'] * (size % 3), + 'Score': [i % 100 for i in range(size)] + }) + + # Add some duplicates + df = pd.concat([df, df.head(100)], ignore_index=True) + + # Process + df_cleaned = remove_duplicates(mock_context, df) + + assert df_cleaned.shape[0] == 1000 + assert df_cleaned.shape[1] == 4 + + +class TestErrorHandling: + """Tests for error handling and edge cases.""" + + def test_operation_with_corrupted_data(self, mock_context): + """Test operations with corrupted/unusual data.""" + df = pd.DataFrame({ + 'Col': [float('nan'), float('inf'), -float('inf'), 0, 1, 2] + }) + + # Should handle special float values + result = remove_duplicates(mock_context, df) + assert result.shape[0] > 0 + + def test_operation_preserves_index(self, mock_context): + """Test that index is handled correctly.""" + df = pd.DataFrame( + {'Col': [1, 2, 1, 3]}, + index=['a', 'b', 'c', 'd'] + ) + + result = remove_duplicates(mock_context, df) + # Index may be reset, so just check shape + assert result.shape[0] == 3 + + def test_standardize_with_unicode_characters(self, mock_context): + """Test standardization with unicode characters.""" + df = pd.DataFrame({ + 'Name': ['José', 'François', 'Müller'] + }) + + config = 
ColumnsSelectConfiguration(columns=['Name']) + result = standardize_categorical_values(mock_context, config, df) + + # Should handle unicode correctly + assert result.shape[0] == 3 + + def test_fill_with_same_key_multiple_times(self, mock_context): + """Test filling when fill_map has multiple entries.""" + df = pd.DataFrame({ + 'A': ['1', None, '3'], + 'B': [None, None, 'c'], + 'C': [None, '2', None] + }) + + config = FillMissingConfiguration(fill_map={ + 'A': '-1', + 'B': 'EMPTY', + 'C': '0' + }) + + result = fill_missing_values(mock_context, config, df) + + assert result.loc[1, 'A'] == '-1' + assert result.loc[0, 'B'] == 'EMPTY' + assert result.loc[0, 'C'] == '0' + + +class TestDataTypePreservation: + """Tests to ensure data types are preserved appropriately.""" + + def test_remove_duplicates_preserves_dtypes(self, mock_context): + """Test that remove_duplicates preserves column data types.""" + df = pd.DataFrame({ + 'int32': pd.array([1, 2, 1], dtype='int32'), + 'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'), + 'str': ['a', 'b', 'a'] + }) + + result = remove_duplicates(mock_context, df) + + assert result['int32'].dtype == df['int32'].dtype + assert result['float64'].dtype == df['float64'].dtype + + def test_fill_missing_preserves_column_types_where_possible(self, mock_context): + """Test that fill_missing handles type preservation.""" + df = pd.DataFrame({ + 'A': pd.array(['1', None, '3'], dtype='string'), + 'B': ['x', 'y', 'z'] + }) + + config = FillMissingConfiguration(fill_map={'A': '0'}) + result = fill_missing_values(mock_context, config, df) + + assert result['A'].loc[1] == '0' + assert result['B'].dtype == df['B'].dtype diff --git a/tests/data_processing/test_jobs.py b/tests/data_processing/test_jobs.py new file mode 100644 index 0000000..5373f7c --- /dev/null +++ b/tests/data_processing/test_jobs.py @@ -0,0 +1,56 @@ +from template_code_location.data_processing.jobs import ( + remove_duplicates_job_s3, + fill_missing_values_job_s3, + 
standardize_categorical_values_job_s3, + correct_typos_job_s3, + normalize_numeric_min_max_job_s3, + normalize_datetime_job_s3, + normalize_coordinates_job_s3, + add_global_aggregations_job_s3 +) + + +def test_remove_duplicates_job_s3_is_callable(): + """Test remove_duplicates_job_s3 is a valid Dagster job""" + assert callable(remove_duplicates_job_s3) + assert hasattr(remove_duplicates_job_s3, 'execute_in_process') + + +def test_fill_missing_values_job_s3_is_callable(): + """Test fill_missing_values_job_s3 is a valid Dagster job""" + assert callable(fill_missing_values_job_s3) + assert hasattr(fill_missing_values_job_s3, 'execute_in_process') + + +def test_standardize_categorical_values_job_s3_is_callable(): + """Test standardize_categorical_values_job_s3 is a valid Dagster job""" + assert callable(standardize_categorical_values_job_s3) + assert hasattr(standardize_categorical_values_job_s3, 'execute_in_process') + + +def test_correct_typos_job_s3_is_callable(): + """Test correct_typos_job_s3 is a valid Dagster job""" + assert callable(correct_typos_job_s3) + assert hasattr(correct_typos_job_s3, 'execute_in_process') + + +def test_normalize_numeric_min_max_job_s3_is_callable(): + """Test normalize_numeric_min_max_job_s3 is a valid Dagster job""" + assert callable(normalize_numeric_min_max_job_s3) + assert hasattr(normalize_numeric_min_max_job_s3, 'execute_in_process') + + +def test_normalize_datetime_job_s3_is_callable(): + """Test normalize_datetime_job_s3 is a valid Dagster job""" + assert callable(normalize_datetime_job_s3) + assert hasattr(normalize_datetime_job_s3, 'execute_in_process') + +def test_normalize_coordinates_job_s3_is_callable(): + """Test normalize_coordinates_job_s3 is a valid Dagster job""" + assert callable(normalize_coordinates_job_s3) + assert hasattr(normalize_coordinates_job_s3, 'execute_in_process') + +def test_add_global_aggregations_job_s3_is_callable(): + """Test add_global_aggregations_job_s3 is a valid Dagster job""" + assert 
callable(add_global_aggregations_job_s3) + assert hasattr(add_global_aggregations_job_s3, 'execute_in_process') diff --git a/tests/data_processing/test_ops.py b/tests/data_processing/test_ops.py new file mode 100644 index 0000000..def913b --- /dev/null +++ b/tests/data_processing/test_ops.py @@ -0,0 +1,700 @@ +"""Unit tests for data processing operations.""" + +import pytest +import pandas as pd +from template_code_location.data_processing.ops import ( + remove_duplicates, + fill_missing_values, + standardize_categorical_values, + correct_typos, + normalize_datetime, + normalize_numeric_min_max, + normalize_coordinates, + add_global_aggregations +) +from template_code_location.data_processing.config_models import ( + FillMissingConfiguration, + ColumnsSelectConfiguration, + SpellCheckConfiguration, + AggregationConfiguration, + CoordinatesNormalizationConfiguration +) + + +class TestRemoveDuplicates: + """Tests for the remove_duplicates operation.""" + + def test_remove_duplicates_basic(self, mock_context, sample_dataframe): + """Test basic duplicate removal.""" + result = remove_duplicates(mock_context, sample_dataframe) + + # Should have 3 unique rows (john doe appears 3x, jane smith 1x, bob johnson 1x) + assert result.shape[0] == 3 + assert len(result) < len(sample_dataframe) + + def test_remove_duplicates_no_duplicates(self, mock_context): + """Test remove_duplicates when there are no duplicates.""" + df = pd.DataFrame({ + 'A': [1, 2, 3], + 'B': ['x', 'y', 'z'] + }) + result = remove_duplicates(mock_context, df) + + assert result.shape[0] == 3 + pd.testing.assert_frame_equal(result, df) + + def test_remove_duplicates_all_duplicates(self, mock_context): + """Test remove_duplicates when all rows are identical.""" + df = pd.DataFrame({ + 'A': [1, 1, 1], + 'B': ['x', 'x', 'x'] + }) + result = remove_duplicates(mock_context, df) + + assert result.shape[0] == 1 + + def test_remove_duplicates_empty_dataframe(self, mock_context, empty_dataframe): + """Test 
remove_duplicates with empty DataFrame.""" + result = remove_duplicates(mock_context, empty_dataframe) + + assert result.shape[0] == 0 + assert result.shape[1] == 0 + + def test_remove_duplicates_preserves_data_types(self, mock_context): + """Test that remove_duplicates preserves data types.""" + df = pd.DataFrame({ + 'int_col': [1, 2, 1], + 'str_col': ['a', 'b', 'a'], + 'float_col': [1.5, 2.5, 1.5] + }) + result = remove_duplicates(mock_context, df) + + assert result['int_col'].dtype == df['int_col'].dtype + assert result['str_col'].dtype == df['str_col'].dtype + assert result['float_col'].dtype == df['float_col'].dtype + + +class TestFillMissingValues: + """Tests for the fill_missing_values operation.""" + + def test_fill_missing_values_basic(self, mock_context, dataframe_with_missing_values): + """Test basic missing value filling.""" + config = FillMissingConfiguration(fill_map={'Column1': '0', 'Column2': 'N/A'}) + result = fill_missing_values(mock_context, config, dataframe_with_missing_values) + + # Check that no NaN values remain + assert result['Column1'].isna().sum() == 0 + assert result['Column2'].isna().sum() == 0 + + def test_fill_missing_values_with_different_values(self, mock_context): + """Test filling with different replacement values.""" + df = pd.DataFrame({ + 'A': [1, None, 3], + 'B': [None, 'b', 'c'] + }) + config = FillMissingConfiguration(fill_map={'A': '-1', 'B': 'UNKNOWN'}) + result = fill_missing_values(mock_context, config, df) + + assert result.loc[1, 'A'] == '-1' + assert result.loc[0, 'B'] == 'UNKNOWN' + + def test_fill_missing_values_partial_columns(self, mock_context): + """Test filling only specified columns.""" + df = pd.DataFrame({ + 'A': [1, None, 3], + 'B': [None, 'b', 'c'] + }) + config = FillMissingConfiguration(fill_map={'A': '999'}) + result = fill_missing_values(mock_context, config, df) + + assert result.loc[1, 'A'] == '999' + assert pd.isna(result.loc[0, 'B']) # B should still have NaN + + def 
test_fill_missing_values_no_missing(self, mock_context): + """Test when there are no missing values.""" + df = pd.DataFrame({ + 'A': ['1', '2', '3'], + 'B': ['a', 'b', 'c'] + }) + config = FillMissingConfiguration(fill_map={'A': '0'}) + result = fill_missing_values(mock_context, config, df) + + pd.testing.assert_frame_equal(result, df) + + def test_fill_missing_values_empty_dataframe(self, mock_context, empty_dataframe): + """Test with empty DataFrame.""" + config = FillMissingConfiguration(fill_map={}) + result = fill_missing_values(mock_context, config, empty_dataframe) + + assert result.shape[0] == 0 + + +class TestStandardizeCategoricalValues: + """Tests for the standardize_categorical_values operation.""" + + def test_standardize_categorical_basic(self, mock_context, sample_dataframe): + """Test basic categorical standardization.""" + config = ColumnsSelectConfiguration(columns=['Name', 'City', 'Status']) + result = standardize_categorical_values(mock_context, config, sample_dataframe) + + # Check that values are lowercase and stripped + assert result['Name'].iloc[0] == 'john doe' + assert result['City'].iloc[1] == 'los angeles' + assert result['Status'].iloc[1] == 'inactive' + + def test_standardize_categorical_single_column(self, mock_context): + """Test standardization on a single column.""" + df = pd.DataFrame({ + 'City': [' NEW YORK ', 'LOS ANGELES', ' chicago '] + }) + config = ColumnsSelectConfiguration(columns=['City']) + result = standardize_categorical_values(mock_context, config, df) + + assert result['City'].iloc[0] == 'new york' + assert result['City'].iloc[1] == 'los angeles' + assert result['City'].iloc[2] == 'chicago' + + def test_standardize_categorical_missing_column(self, mock_context, sample_dataframe): + """Test with non-existent column (should skip).""" + config = ColumnsSelectConfiguration(columns=['NonExistent', 'Name']) + result = standardize_categorical_values(mock_context, config, sample_dataframe) + + # Should process 'Name' column 
without error + assert result['Name'].iloc[0] == 'john doe' + + def test_standardize_categorical_with_missing_values(self, mock_context): + """Test standardization with missing values.""" + df = pd.DataFrame({ + 'Category': [' ACTIVE ', None, ' pending '] + }) + config = ColumnsSelectConfiguration(columns=['Category']) + result = standardize_categorical_values(mock_context, config, df) + + assert result['Category'].iloc[0] == 'active' + assert result['Category'].iloc[1] == '' + assert result['Category'].iloc[2] == 'pending' + + def test_standardize_categorical_empty_dataframe(self, mock_context, empty_dataframe): + """Test with empty DataFrame.""" + config = ColumnsSelectConfiguration(columns=['A', 'B']) + result = standardize_categorical_values(mock_context, config, empty_dataframe) + + assert result.shape[0] == 0 + + def test_standardize_categorical_numeric_columns(self, mock_context): + """Test that numeric columns are converted to strings.""" + df = pd.DataFrame({ + 'NumCol': [1, 2, 3] + }) + config = ColumnsSelectConfiguration(columns=['NumCol']) + result = standardize_categorical_values(mock_context, config, df) + + assert result['NumCol'].iloc[0] == '1' + assert isinstance(result['NumCol'].iloc[0], str) + + +class TestCorrectTypos: + """Tests for the correct_typos operation.""" + + def test_correct_typos_basic(self, mock_context): + """Test basic typo correction.""" + df = pd.DataFrame({ + 'Name': ['jon', 'jayne', 'bob'] + }) + config = SpellCheckConfiguration(columns=['Name'], language='en') + result = correct_typos(mock_context, config, df) + + # Result should have corrections applied + assert result.shape[0] == 3 + + def test_correct_typos_missing_column(self, mock_context): + """Test with non-existent column (should skip).""" + df = pd.DataFrame({ + 'Name': ['jon', 'jayne'] + }) + config = SpellCheckConfiguration(columns=['NonExistent'], language='en') + result = correct_typos(mock_context, config, df) + + # Should not raise error, just skip + 
pd.testing.assert_frame_equal(result, df) + + def test_correct_typos_with_missing_values(self, mock_context): + """Test typo correction with missing values.""" + df = pd.DataFrame({ + 'Text': ['helo', '', 'wrld'] + }) + config = SpellCheckConfiguration(columns=['Text'], language='en') + result = correct_typos(mock_context, config, df) + + # Empty strings should be preserved + assert result.loc[1, 'Text'] == '' + + def test_correct_typos_empty_dataframe(self, mock_context, empty_dataframe): + """Test with empty DataFrame.""" + config = SpellCheckConfiguration(columns=['A'], language='en') + result = correct_typos(mock_context, config, empty_dataframe) + + assert result.shape[0] == 0 + + def test_correct_typos_different_languages(self, mock_context): + """Test typo correction with different languages.""" + df = pd.DataFrame({ + 'Text': ['ciao', 'mondo'] + }) + + for lang in ['en', 'es', 'it']: + config = SpellCheckConfiguration(columns=['Text'], language=lang) + result = correct_typos(mock_context, config, df) + + # Should process without error + assert result.shape[0] == 2 + + def test_correct_typos_numeric_values(self, mock_context): + """Test typo correction on numeric values converted to strings.""" + df = pd.DataFrame({ + 'Values': [123, 456, 789] + }) + config = SpellCheckConfiguration(columns=['Values'], language='en') + result = correct_typos(mock_context, config, df) + + # Numeric values should be converted to string and processed + assert result.shape[0] == 3 + +class TestNormalizeDatetime: + """Tests for the normalize_datetime operation.""" + + def test_normalize_datetime_basic(self, mock_context): + """Test basic datetime normalization to ISO format.""" + df = pd.DataFrame({ + 'date_col': ['2023-01-01 10:00:00', '2023-12-31T23:59:59'] + }) + + config = ColumnsSelectConfiguration(columns=['date_col']) + + result = normalize_datetime(mock_context, config, df.copy()) + + assert 'date_col_iso' in result.columns + assert result['date_col_iso'].iloc[0] == 
'2023-01-01T10:00:00Z' + assert result['date_col_iso'].iloc[1] == '2023-12-31T23:59:59Z' + + def test_normalize_datetime_missing_column(self, mock_context, sample_dataframe): + """Test behavior when a configured column is missing in the DataFrame.""" + config = ColumnsSelectConfiguration(columns=['non_existent_column']) + + result = normalize_datetime(mock_context, config, sample_dataframe.copy()) + + pd.testing.assert_frame_equal(result, sample_dataframe) + + def test_normalize_datetime_unparseable_values(self, mock_context): + """Test column with values that cannot be parsed as dates.""" + df = pd.DataFrame({ + 'invalid_col': ['not-a-date', 'completely-random-text'] + }) + config = ColumnsSelectConfiguration(columns=['invalid_col']) + + result = normalize_datetime(mock_context, config, df.copy()) + + assert 'invalid_col_iso' not in result.columns + + def test_normalize_datetime_mixed_and_nulls(self, mock_context): + """Test column with mixed valid dates, invalid dates, and NaNs.""" + df = pd.DataFrame({ + 'mixed_col': ['2023-05-01', None, 'invalid-date'] + }) + config = ColumnsSelectConfiguration(columns=['mixed_col']) + + result = normalize_datetime(mock_context, config, df.copy()) + + assert 'mixed_col_iso' in result.columns + assert result['mixed_col_iso'].iloc[0] == '2023-05-01T00:00:00Z' + + assert result['mixed_col_iso'].iloc[1] == "" + assert result['mixed_col_iso'].iloc[2] == "" + + def test_normalize_datetime_empty_dataframe(self, mock_context, empty_dataframe): + """Test with an empty DataFrame.""" + config = ColumnsSelectConfiguration(columns=['some_col']) + + result = normalize_datetime(mock_context, config, empty_dataframe) + + assert result.empty + + def test_normalize_datetime_epoch_only(self, mock_context, capsys): + """If parsing a column yields only the Unix epoch date, it should be skipped.""" + df = pd.DataFrame({ + 'weird_col': ['0', 0, '0000', ''] + }) + + config = ColumnsSelectConfiguration(columns=['weird_col']) + + result = 
normalize_datetime(mock_context, config, df.copy()) + + assert 'weird_col_iso' not in result.columns + + captured = capsys.readouterr() + assert "all normalized values are '1970-01-01'" in captured.err + + def test_normalize_datetime_all_1970_skipped(self, mock_context, capsys): + """If all formatted values are '1970-01-01', the column should be skipped with a warning.""" + df = pd.DataFrame({ + 'ts_col': ['1970-01-01 05:30:00', '1970-01-01 12:00:00'] + }) + + config = ColumnsSelectConfiguration(columns=['ts_col']) + + result = normalize_datetime(mock_context, config, df.copy()) + + assert 'ts_col_iso' not in result.columns + + captured = capsys.readouterr() + assert "all normalized values are '1970-01-01'" in captured.err + + def test_normalize_datetime_integer_age_column_skipped(self, mock_context, capsys): + """If an integer column like 'age' is passed, all values become 1970-01-01 and should be skipped.""" + df = pd.DataFrame({ + 'age': [66, 45, 40, 43, 20, 26, 69, 21, 46] + }) + + config = ColumnsSelectConfiguration(columns=['age']) + + result = normalize_datetime(mock_context, config, df.copy()) + + assert 'age_iso' not in result.columns + + captured = capsys.readouterr() + assert "all normalized values are '1970-01-01'" in captured.err + +class TestNormalizeNumericMinMax: + """Tests for the normalize_numeric_min_max operation.""" + + def test_normalize_numeric_basic(self, mock_context): + """Test standard min-max normalization between 0 and 1.""" + df = pd.DataFrame({ + 'score': [10, 20, 30, 40, 50] + }) + config = ColumnsSelectConfiguration(columns=['score']) + + result = normalize_numeric_min_max(mock_context, config, df.copy()) + + assert 'score_norm' in result.columns + assert result['score_norm'].min() == 0.0 + assert result['score_norm'].max() == 1.0 + + assert result['score_norm'].iloc[2] == 0.5 + + def test_normalize_numeric_missing_column(self, mock_context): + """Test skipping of non-existent columns.""" + df = pd.DataFrame({'existing': [1, 2, 3]}) 
+ config = ColumnsSelectConfiguration(columns=['missing_col']) + + result = normalize_numeric_min_max(mock_context, config, df.copy()) + + assert 'missing_col_norm' not in result.columns + + def test_normalize_numeric_constant_values(self, mock_context): + """Test skipping when min == max to avoid division by zero.""" + df = pd.DataFrame({ + 'constant': [10, 10, 10] + }) + config = ColumnsSelectConfiguration(columns=['constant']) + + result = normalize_numeric_min_max(mock_context, config, df.copy()) + + assert 'constant_norm' not in result.columns + + def test_normalize_numeric_with_nans(self, mock_context): + """Test normalization with NaN values (pandas min/max ignore NaNs by default).""" + df = pd.DataFrame({ + 'with_nans': [10, None, 50] + }) + config = ColumnsSelectConfiguration(columns=['with_nans']) + + result = normalize_numeric_min_max(mock_context, config, df.copy()) + + assert 'with_nans_norm' in result.columns + assert result['with_nans_norm'].iloc[0] == 0.0 + assert result['with_nans_norm'].iloc[2] == 1.0 + assert pd.isna(result['with_nans_norm'].iloc[1]) + + def test_normalize_numeric_multiple_columns(self, mock_context): + """Test processing multiple columns in one call.""" + df = pd.DataFrame({ + 'A': [1, 2], + 'B': [10, 20] + }) + config = ColumnsSelectConfiguration(columns=['A', 'B']) + + result = normalize_numeric_min_max(mock_context, config, df.copy()) + + assert 'A_norm' in result.columns + assert 'B_norm' in result.columns + +class TestNormalizeCoordinates: + """Tests for the normalize_coordinates operation.""" + + def test_normalize_coordinates_basic(self, mock_context): + """Test rounding and basic coordinate normalization.""" + df = pd.DataFrame({ + 'lat': [45.123456, 46.0], + 'lon': [9.123456, 10.0] + }) + config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') + + result = normalize_coordinates(mock_context, config, df.copy()) + + assert result['lat'].iloc[0] == 45.1235 + assert result['lon'].iloc[0] == 9.1235 + 
+ assert len(result) == 2 + + def test_normalize_coordinates_filtering(self, mock_context): + """Test filtering of out-of-range coordinates.""" + df = pd.DataFrame({ + 'lat': [45.0, 100.0, -91.0, 0.0], # 100 and -91 are out of range + 'lon': [9.0, 0.0, 0.0, 200.0] # 200 is out of range + }) + config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') + + result = normalize_coordinates(mock_context, config, df.copy()) + + assert len(result) == 1 + assert result['lat'].iloc[0] == 45.0 + + def test_normalize_coordinates_invalid_types(self, mock_context): + """Test conversion of strings to numeric and handling of NaNs.""" + df = pd.DataFrame({ + 'lat': ["45.5", "invalid", None], + 'lon': ["9.5", "10.0", "11.0"] + }) + config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') + + result = normalize_coordinates(mock_context, config, df.copy()) + + assert len(result) == 1 + assert isinstance(result['lat'].iloc[0], float) + + def test_normalize_coordinates_empty_df(self, mock_context, empty_dataframe): + """Test with an empty DataFrame.""" + + df = pd.DataFrame(columns=['lat', 'lon']) + config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') + + result = normalize_coordinates(mock_context, config, df) + + assert len(result) == 0 + assert result.empty + + def test_normalize_coordinates_default_config(self, mock_context): + """Test that normalize_coordinates uses default 'lat'/'lon' columns when no config is provided.""" + df = pd.DataFrame({ + 'lat': [45.123456, 46.0], + 'lon': [9.123456, 10.0] + }) + config = CoordinatesNormalizationConfiguration() + + result = normalize_coordinates(mock_context, config, df.copy()) + + assert result['lat'].iloc[0] == 45.1235 + assert result['lon'].iloc[0] == 9.1235 + assert len(result) == 2 + + def test_normalize_coordinates_null_config_values(self, mock_context): + """Test that null lat/lon column names fall back to defaults ('lat'/'lon').""" + df = pd.DataFrame({ +
'lat': [45.123456, 46.0], + 'lon': [9.123456, 10.0] + }) + config = CoordinatesNormalizationConfiguration(latColumn=None, lonColumn=None) + + assert config.latColumn == "lat" + assert config.lonColumn == "lon" + + result = normalize_coordinates(mock_context, config, df.copy()) + + assert result['lat'].iloc[0] == 45.1235 + assert result['lon'].iloc[0] == 9.1235 + assert len(result) == 2 + + def test_normalize_coordinates_dms_degree_symbol(self, mock_context): + """Test DMS parsing with degree/minute/second symbols like 40°26'46\"N.""" + df = pd.DataFrame({ + 'lat': ["40°26'46\"N", "51°30'26\"N"], + 'lon': ["79°58'56\"W", "0°7'39\"W"] + }) + config = CoordinatesNormalizationConfiguration( + latColumn='lat', lonColumn='lon' + ) + result = normalize_coordinates(mock_context, config, df.copy()) + + assert len(result) == 2 + # 40°26'46"N ≈ 40.4461 + assert abs(result['lat'].iloc[0] - 40.4461) < 0.001 + # 79°58'56"W ≈ -79.9822 + assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001 + + def test_normalize_coordinates_dms_spaced_format(self, mock_context): + """Test DMS parsing with space-separated format like '40 26 46 N'.""" + df = pd.DataFrame({ + 'lat': ["40 26 46 N"], + 'lon': ["79 58 56 W"] + }) + config = CoordinatesNormalizationConfiguration( + latColumn='lat', lonColumn='lon' + ) + result = normalize_coordinates(mock_context, config, df.copy()) + + assert len(result) == 1 + assert abs(result['lat'].iloc[0] - 40.4461) < 0.001 + assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001 + + def test_normalize_coordinates_dms_already_decimal(self, mock_context): + """Test that string columns with decimal values are auto-parsed correctly.""" + df = pd.DataFrame({ + 'lat': ["45.5", "46.0"], + 'lon': ["9.5", "10.0"] + }) + config = CoordinatesNormalizationConfiguration( + latColumn='lat', lonColumn='lon' + ) + result = normalize_coordinates(mock_context, config, df.copy()) + + assert len(result) == 2 + assert result['lat'].iloc[0] == 45.5 + assert result['lon'].iloc[0] == 
9.5 + + def test_normalize_coordinates_dms_mixed_valid_invalid(self, mock_context): + """Test auto-detection with a mix of valid DMS, valid decimal, and unparseable values.""" + df = pd.DataFrame({ + 'lat': ["40°26'46\"N", "not_a_coord", "51.5"], + 'lon': ["79°58'56\"W", "10.0", "0.1"] + }) + config = CoordinatesNormalizationConfiguration( + latColumn='lat', lonColumn='lon' + ) + result = normalize_coordinates(mock_context, config, df.copy()) + + # Row with "not_a_coord" for lat should be dropped (NaN lat) + assert len(result) == 2 + + def test_normalize_coordinates_dms_out_of_range(self, mock_context): + """Test that DMS-parsed coordinates outside valid range are filtered out.""" + df = pd.DataFrame({ + 'lat': ["91°0'0\"N", "45°0'0\"N"], + 'lon': ["0°0'0\"E", "9°0'0\"E"] + }) + config = CoordinatesNormalizationConfiguration( + latColumn='lat', lonColumn='lon' + ) + result = normalize_coordinates(mock_context, config, df.copy()) + + # First row has lat=91° which is out of [-90, 90] + assert len(result) == 1 + assert abs(result['lat'].iloc[0] - 45.0) < 0.001 + + def test_normalize_coordinates_dms_south_and_east(self, mock_context): + """Test DMS parsing with south latitude and east longitude.""" + df = pd.DataFrame({ + 'lat': ["33°51'54\"S"], + 'lon': ["151°12'36\"E"] + }) + config = CoordinatesNormalizationConfiguration( + latColumn='lat', lonColumn='lon' + ) + result = normalize_coordinates(mock_context, config, df.copy()) + + assert len(result) == 1 + # 33°51'54"S ≈ -33.865 + assert result['lat'].iloc[0] < 0 + assert abs(result['lat'].iloc[0] - (-33.865)) < 0.001 + # 151°12'36"E ≈ 151.21 + assert result['lon'].iloc[0] > 0 + assert abs(result['lon'].iloc[0] - 151.21) < 0.01 + + def test_normalize_coordinates_autodetect_numeric_vs_dms(self, mock_context): + """Test that numeric columns are coerced directly while string columns are parsed as DMS.""" + # Numeric columns — should go through pd.to_numeric path + df_numeric = pd.DataFrame({ + 'lat': [45.123456, 46.0], + 
'lon': [9.123456, 10.0] + }) + config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') + result_numeric = normalize_coordinates(mock_context, config, df_numeric.copy()) + + assert result_numeric['lat'].iloc[0] == 45.1235 + assert len(result_numeric) == 2 + + # String DMS columns — should go through _parse_dms_to_decimal path + df_dms = pd.DataFrame({ + 'lat': ["40°26'46\"N"], + 'lon': ["79°58'56\"W"] + }) + result_dms = normalize_coordinates(mock_context, config, df_dms.copy()) + + assert len(result_dms) == 1 + assert abs(result_dms['lat'].iloc[0] - 40.4461) < 0.001 + +class TestAddGlobalAggregations: + """Tests for the add_global_aggregations operation.""" + + def test_add_global_aggregations_success(self, mock_context): + """Test a successful group by and aggregation.""" + df = pd.DataFrame({ + 'category': ['A', 'A', 'B'], + 'value': [10, 20, 100], + 'ignored_str': ['x', 'y', 'z'] + }) + + config = AggregationConfiguration( + columns=['category'], + operation='sum' + ) + + result = add_global_aggregations(mock_context, config, df.copy()) + + assert len(result) == 2 + assert result.loc[result['category'] == 'A', 'value'].values[0] == 30 + assert result.loc[result['category'] == 'B', 'value'].values[0] == 100 + assert 'ignored_str' not in result.columns + mock_context.log.info.assert_called() + + def test_add_global_aggregations_missing_column(self, mock_context): + """Test skipping a column that does not exist in the dataframe.""" + df = pd.DataFrame({'value': [1, 2, 3]}) + config = AggregationConfiguration( + columns=['missing_col'], + operation='count' + ) + + result = add_global_aggregations(mock_context, config, df.copy()) + + mock_context.log.warning.assert_any_call("Column 'missing_col' not found, skipping aggregation.") + assert len(result) == 1 + + def test_add_global_aggregations_unsupported_op(self, mock_context): + """Test the warning when an unsupported operation is provided.""" + df = pd.DataFrame({'category': ['A'], 'value': 
[1]}) + + config = AggregationConfiguration.model_construct( # skip validation, which rejects 'unsupported' at construction, so the op's own check is what this test exercises + columns=['category'], + operation='unsupported' + ) + + with pytest.raises(Exception): + add_global_aggregations(mock_context, config, df.copy()) + + mock_context.log.warning.assert_any_call("Unsupported aggregation 'unsupported'") + + def test_add_global_aggregations_only_numeric_kept(self, mock_context): + """Verify that non-numeric and non-grouping columns are dropped.""" + df = pd.DataFrame({ + 'group': ['A', 'A'], + 'num': [1, 2], + 'text': ['hello', 'world'] + }) + config = AggregationConfiguration(columns=['group'], operation='mean') + + result = add_global_aggregations(mock_context, config, df.copy()) + + assert 'text' not in result.columns + assert 'num' in result.columns + assert 'group' in result.columns diff --git a/tests/dataframe_level_anonymisation/__init__.py b/tests/dataframe_level_anonymisation/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/dataframe_level_anonymisation/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/dataframe_level_anonymisation/config_models/__init__.py b/tests/dataframe_level_anonymisation/config_models/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/dataframe_level_anonymisation/config_models/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/dataframe_level_anonymisation/config_models/test_base_config.py b/tests/dataframe_level_anonymisation/config_models/test_base_config.py new file mode 100644 index 0000000..92e599b --- /dev/null +++ b/tests/dataframe_level_anonymisation/config_models/test_base_config.py @@ -0,0 +1,54 @@ +import pytest +from pydantic import ValidationError + +from template_code_location.dataframe_level_anonymisation.config_models.base_config import BaseConfiguration + + +def test_valid_configuration_with_overrides(): + cfg = BaseConfiguration( + ident=["id"], + quasi_identifiers=["age"], + supp_level=10.0, + generalisation_hierarchies={"age": "age_hierarchy"}, + ) + assert cfg.ident == ["id"] +
assert cfg.quasi_identifiers == ["age"] + assert cfg.supp_level == 10.0 + assert cfg.generalisation_hierarchies == {"age": "age_hierarchy"} + + +def test_default_values_are_loaded(): + cfg = BaseConfiguration() + assert cfg.ident == ["Name"] + assert cfg.quasi_identifiers == ["Age"] + assert cfg.supp_level == 50.0 + assert cfg.generalisation_hierarchies == {"Age": "simpl_age"} + + +def test_missing_ident_raises_error(): + with pytest.raises(ValidationError): + BaseConfiguration( + ident=[] + ) + + +def test_missing_quasi_ident_raises_error(): + with pytest.raises(ValidationError): + BaseConfiguration( + quasi_identifiers=[] + ) + + +def test_overlap_between_ident_and_quasi_identifiers(): + with pytest.raises(ValidationError): + BaseConfiguration( + ident=["age"], + quasi_identifiers=["age"] + ) + + +def test_supp_level_bounds(): + with pytest.raises(ValidationError): + BaseConfiguration( + supp_level=150.0 # out of range + ) diff --git a/tests/dataframe_level_anonymisation/config_models/test_hierarchies.py b/tests/dataframe_level_anonymisation/config_models/test_hierarchies.py new file mode 100644 index 0000000..c6994a9 --- /dev/null +++ b/tests/dataframe_level_anonymisation/config_models/test_hierarchies.py @@ -0,0 +1,48 @@ +from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import ( + simpl_age, + simpl_age2, + simpl_gender, + get_all_hierarchies, +) + + +def test_simpl_age_structure(): + assert isinstance(simpl_age, dict) + assert 0 in simpl_age + assert isinstance(simpl_age[0], list) + # verify first level contains 100 ages + assert len(simpl_age[0]) == 100 + assert simpl_age[0][0] == 0 + assert simpl_age[0][-1] == 99 + + +def test_simpl_age2_structure(): + assert isinstance(simpl_age2, dict) + assert 0 in simpl_age2 + assert 1 in simpl_age2 + assert isinstance(simpl_age2[0], list) + assert isinstance(simpl_age2[1], list) + + +def test_simpl_gender_structure(): + assert isinstance(simpl_gender, dict) + assert 0 in simpl_gender +
assert 1 in simpl_gender + assert simpl_gender[0] == ["M", "F", "O"] + assert simpl_gender[1] == ["*", "*", "*"] + + +def test_get_all_hierarchies(): + hier = get_all_hierarchies() + + # the function should return dicts only + assert isinstance(hier, dict) + + # ensure expected dicts are included + assert "simpl_age" in hier + assert "simpl_age2" in hier + assert "simpl_gender" in hier + + # ensure the values returned are references to the actual dicts + assert hier["simpl_age"] is simpl_age + assert hier["simpl_gender"] is simpl_gender diff --git a/tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py b/tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py new file mode 100644 index 0000000..ef6e2c8 --- /dev/null +++ b/tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py @@ -0,0 +1,41 @@ +import pytest +from pydantic import ValidationError + +from template_code_location.dataframe_level_anonymisation.config_models.k_anonymity_configuration import ( + KAnonymityConfiguration, +) + + +def test_valid_k_anonymity_config_with_overrides(): + cfg = KAnonymityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + supp_level=5.0, + generalisation_hierarchies={"age": "age_hier"}, + k=3, + sensitive_attributes=["disease"], + ) + assert cfg.k == 3 + assert cfg.sensitive_attributes == ["disease"] + assert cfg.generalisation_hierarchies == {"age": "age_hier"} + + +def test_default_values_are_loaded(): + cfg = KAnonymityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + generalisation_hierarchies={"age": "age_hier"} + ) + assert cfg.k == 3 + assert cfg.sensitive_attributes == ["Disease"] + + +def test_invalid_k_value_raises_error(): + with pytest.raises(ValidationError): + KAnonymityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + generalisation_hierarchies={"age": "age_hier"}, + k=1, # invalid, must be >= 2 + sensitive_attributes=["disease"], + ) diff --git 
a/tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py b/tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py new file mode 100644 index 0000000..c94db3e --- /dev/null +++ b/tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py @@ -0,0 +1,44 @@ +import pytest +from pydantic import ValidationError + +from template_code_location.dataframe_level_anonymisation.config_models.l_diversity_configuration import ( + LDiversityConfiguration, +) + + +def test_valid_l_diversity_config_with_overrides(): + cfg = LDiversityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + supp_level=5.0, + generalisation_hierarchies={"age": "age_hier"}, + k=3, + l=2, + sensitive_attribute="disease", + ) + assert cfg.k == 3 + assert cfg.l == 2 + assert cfg.sensitive_attribute == "disease" + + +def test_default_values_are_loaded(): + cfg = LDiversityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + generalisation_hierarchies={"age": "age_hier"} + ) + assert cfg.k == 2 + assert cfg.l == 3 + assert cfg.sensitive_attribute == "Disease" + + +def test_invalid_l_value_raises_error(): + with pytest.raises(ValidationError): + LDiversityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + generalisation_hierarchies={"age": "age_hier"}, + k=3, + l=0, # invalid, must be >= 1 + sensitive_attribute="disease", + ) diff --git a/tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py b/tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py new file mode 100644 index 0000000..615bd27 --- /dev/null +++ b/tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py @@ -0,0 +1,56 @@ +import pytest +from pydantic import ValidationError + +from template_code_location.dataframe_level_anonymisation.config_models.t_closeness_configuration import ( + TClosenessConfiguration, +) + + +def test_valid_t_closeness_config_with_overrides(): + cfg = 
TClosenessConfiguration( + ident=["id"], + quasi_identifiers=["age"], + supp_level=5.0, + generalisation_hierarchies={"age": "age_hier"}, + k=3, + t=0.4, + sensitive_attribute="disease", + ) + assert cfg.k == 3 + assert cfg.t == 0.4 + assert cfg.sensitive_attribute == "disease" + + +def test_default_values_are_loaded(): + cfg = TClosenessConfiguration( + ident=["id"], + quasi_identifiers=["age"], + generalisation_hierarchies={"age": "age_hier"} + ) + assert cfg.k == 2 + assert cfg.t == 0.5 + assert cfg.sensitive_attribute == "Disease" + + +def test_invalid_t_value_low(): + with pytest.raises(ValidationError): + TClosenessConfiguration( + ident=["id"], + quasi_identifiers=["age"], + generalisation_hierarchies={"age": "age_hier"}, + k=3, + t=-0.1, # invalid + sensitive_attribute="disease", + ) + + +def test_invalid_t_value_high(): + with pytest.raises(ValidationError): + TClosenessConfiguration( + ident=["id"], + quasi_identifiers=["age"], + generalisation_hierarchies={"age": "age_hier"}, + k=3, + t=2.0, # invalid > 1 + sensitive_attribute="disease", + ) diff --git a/tests/dataframe_level_anonymisation/test_jobs.py b/tests/dataframe_level_anonymisation/test_jobs.py new file mode 100644 index 0000000..f890e2d --- /dev/null +++ b/tests/dataframe_level_anonymisation/test_jobs.py @@ -0,0 +1,44 @@ +from template_code_location.dataframe_level_anonymisation.jobs import ( + k_anonymity_job, + l_diversity_job, + t_closeness_job, + k_anonymity_job_s3, + l_diversity_job_s3, + t_closeness_job_s3 +) + + +def test_k_anonymity_job_is_callable(): + """Test k_anonymity_job is a valid Dagster job""" + assert callable(k_anonymity_job) + assert hasattr(k_anonymity_job, 'execute_in_process') + + +def test_l_diversity_job_is_callable(): + """Test l_diversity_job is a valid Dagster job""" + assert callable(l_diversity_job) + assert hasattr(l_diversity_job, 'execute_in_process') + + +def test_t_closeness_job_is_callable(): + """Test t_closeness_job is a valid Dagster job""" + assert 
callable(t_closeness_job) + assert hasattr(t_closeness_job, 'execute_in_process') + + +def test_k_anonymity_job_s3_is_callable(): + """Test k_anonymity_job_s3 is a valid Dagster job""" + assert callable(k_anonymity_job_s3) + assert hasattr(k_anonymity_job_s3, 'execute_in_process') + + +def test_l_diversity_job_s3_is_callable(): + """Test l_diversity_job_s3 is a valid Dagster job""" + assert callable(l_diversity_job_s3) + assert hasattr(l_diversity_job_s3, 'execute_in_process') + + +def test_t_closeness_job_s3_is_callable(): + """Test t_closeness_job_s3 is a valid Dagster job""" + assert callable(t_closeness_job_s3) + assert hasattr(t_closeness_job_s3, 'execute_in_process') diff --git a/tests/dataframe_level_anonymisation/test_ops.py b/tests/dataframe_level_anonymisation/test_ops.py new file mode 100644 index 0000000..90c01aa --- /dev/null +++ b/tests/dataframe_level_anonymisation/test_ops.py @@ -0,0 +1,230 @@ +import pytest +import pandas as pd +from unittest.mock import patch +from dagster import DagsterInvalidInvocationError, build_op_context + +from template_code_location.dataframe_level_anonymisation.ops import ( + apply_k_anonymity, + apply_l_diversity, + apply_t_closeness, +) +from template_code_location.dataframe_level_anonymisation.config_models import ( + KAnonymityConfiguration, + LDiversityConfiguration, + TClosenessConfiguration, +) + + +# --------------------------- +# Fixtures +# --------------------------- +@pytest.fixture +def fake_df(): + return pd.DataFrame({"id": [1, 2], "age": [30, 40]}) + + +@pytest.fixture +def k_config(): + return KAnonymityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + sensitive_attributes=["age"], + k=2, + supp_level=0.0, + generalisation_hierarchies={"age": "simpl_age"}, + ) + + +@pytest.fixture +def l_config(): + return LDiversityConfiguration( + ident=["id"], + quasi_identifiers=["age"], + sensitive_attribute="age", + k=2, + l=1, + supp_level=0.0, + generalisation_hierarchies={"age": "simpl_age"}, + ) + + 
+@pytest.fixture +def t_config(): + return TClosenessConfiguration( + ident=["id"], + quasi_identifiers=["age"], + sensitive_attribute="age", + k=2, + t=0.5, + supp_level=0.0, + generalisation_hierarchies={"age": "simpl_age"}, + ) + + +@pytest.fixture +def op_context(): + return build_op_context() + + +# --------------------------- +# Helper for patching external functions (targets must name the module the ops are imported from) +# --------------------------- +@pytest.fixture(autouse=True) +def patch_external_ops(): + with ( + patch( + "template_code_location.dataframe_level_anonymisation.ops.get_all_hierarchies", + return_value={"simpl_age": {0: [30, 40]}}, + ), + patch( + "template_code_location.dataframe_level_anonymisation.ops.k_anonymity", + return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}), + ), + patch( + "template_code_location.dataframe_level_anonymisation.ops.l_diversity", + return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}), + ), + patch( + "template_code_location.dataframe_level_anonymisation.ops.t_closeness", + return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}), + ), + ): + yield + + +# --------------------------- +# Tests for apply_k_anonymity +# --------------------------- +def test_apply_k_anonymity_outputs(op_context, k_config, fake_df): + results = list(apply_k_anonymity(op_context, k_config, fake_df)) + assert len(results) == 2 + + data_output = results[0].value + metrics_output = results[1].value + + # Check types + assert isinstance(data_output, pd.DataFrame) + assert isinstance(metrics_output, dict) + assert "k_anon" in metrics_output + assert "l_div" in metrics_output + assert "t_clos" in metrics_output + + +# --------------------------- +# Tests for apply_l_diversity +# --------------------------- +def test_apply_l_diversity_outputs(op_context, l_config, fake_df): + results = list(apply_l_diversity(op_context, l_config, fake_df)) + assert len(results) == 2 + + data_output = results[0].value + metrics_output = results[1].value + + assert isinstance(data_output, pd.DataFrame) + assert isinstance(metrics_output, dict) + assert "k_anon" in metrics_output + assert 
"l_div" in metrics_output + assert "t_clos" in metrics_output + + +def test_apply_l_diversity_empty_raises(op_context, l_config): + with patch("template_code_location.dataframe_level_anonymisation.ops.l_diversity", return_value=pd.DataFrame()): + + with pytest.raises(DagsterInvalidInvocationError): + list(apply_l_diversity(op_context, l_config, pd.DataFrame({"id": [1], "age": [30]}))) + + +# --------------------------- +# Tests for apply_t_closeness +# --------------------------- +def test_apply_t_closeness_outputs(op_context, t_config, fake_df): + results = list(apply_t_closeness(op_context, t_config, fake_df)) + assert len(results) == 2 + + data_output = results[0].value + metrics_output = results[1].value + + assert isinstance(data_output, pd.DataFrame) + assert isinstance(metrics_output, dict) + assert "k_anon" in metrics_output + assert "l_div" in metrics_output + assert "t_clos" in metrics_output + + +def test_apply_t_closeness_empty_raises(op_context, t_config): + with patch("template_code_location.dataframe_level_anonymisation.ops.t_closeness", return_value=pd.DataFrame()): + with pytest.raises(DagsterInvalidInvocationError): + list(apply_t_closeness(op_context, t_config, pd.DataFrame({"id": [1], "age": [30]}))) + + +# --------------------------- +# Additional tests for _validate_and_get_hierarchies +# --------------------------- +def test_validate_hierarchies_dataset_too_small(k_config): + small_df = pd.DataFrame({"id": [1], "age": [30]}) + from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies + + with pytest.raises(DagsterInvalidInvocationError): + _validate_and_get_hierarchies(k_config, small_df) + + +def test_validate_hierarchies_missing_hierarchy(k_config, fake_df): + from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies + + bad_config = k_config.model_copy(update={"generalisation_hierarchies": {}}) + + with pytest.raises(DagsterInvalidInvocationError): + _validate_and_get_hierarchies(bad_config, fake_df) + + 
+def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df): + from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies + + with patch("template_code_location.dataframe_level_anonymisation.ops.get_all_hierarchies", return_value={}): + with pytest.raises(DagsterInvalidInvocationError): + _validate_and_get_hierarchies(k_config, fake_df) + + +# --------------------------- +# Additional tests for _calc_dataframe_metrics +# --------------------------- +def test_calc_dataframe_metrics_basic(): + from template_code_location.dataframe_level_anonymisation.ops import _calc_dataframe_metrics + + df_org = pd.DataFrame({"age": [30, 40], "id": [1, 2]}) + df_anon = df_org.copy() + + with ( + patch("template_code_location.dataframe_level_anonymisation.ops.anonymity.k_anonymity", return_value=2), + patch("template_code_location.dataframe_level_anonymisation.ops.anonymity.l_diversity", return_value=1), + patch("template_code_location.dataframe_level_anonymisation.ops.anonymity.t_closeness", return_value=0.1), + ): + + report, metrics = _calc_dataframe_metrics(df_anon, df_org, ["age"], ["age"]) + + assert "k-anonymity" in report + assert metrics["k_anon"] == 2 + assert metrics["l_div"] == 1 + assert metrics["t_clos"] == 0.1 + + +# --------------------------- +# Tests for apply_t_closeness exception branches +# --------------------------- +def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df): + """Covers the branch where ValueError contains 'Cannot be quasi-identifiers'.""" + with patch( + "template_code_location.dataframe_level_anonymisation.ops.t_closeness", + side_effect=ValueError("Cannot be quasi-identifiers invalid"), + ): + with pytest.raises(DagsterInvalidInvocationError): + list(apply_t_closeness(op_context, t_config, fake_df)) + + +def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df): + """Covers the branch where ValueError is raised but message does NOT contain that substring.""" + with patch( + "template_code_location.dataframe_level_anonymisation.ops.t_closeness", 
side_effect=ValueError("Some other error") + ): + with pytest.raises(DagsterInvalidInvocationError): + list(apply_t_closeness(op_context, t_config, fake_df)) diff --git a/tests/dataframe_level_anonymisation/test_utils.py b/tests/dataframe_level_anonymisation/test_utils.py new file mode 100644 index 0000000..3fa1841 --- /dev/null +++ b/tests/dataframe_level_anonymisation/test_utils.py @@ -0,0 +1,70 @@ +import numpy as np + +from template_code_location.dataframe_level_anonymisation.utils import ( + parse_value_list, + normalize_hierarchy_levels, +) + + +# ------------------------------------ +# Tests for parse_value_list +# ------------------------------------ +def test_parse_value_list_all_strings_digits(): + values = ["1", "2", "3"] + assert parse_value_list(values) == [1, 2, 3] + + +def test_parse_value_list_mixed_values(): + values = ["1", 2, "abc", "5"] + assert parse_value_list(values) == [1, 2, "abc", 5] + + +def test_parse_value_list_no_digits(): + values = ["a", "b", "c"] + assert parse_value_list(values) == ["a", "b", "c"] + + +# ------------------------------------ +# Tests for normalize_hierarchy_levels +# ------------------------------------ +def test_normalize_hierarchy_levels_level_0_converted_to_numpy_array(): + hierarchy = {"age": {"0": ["1", "2", "3"], "1": ["0-10", "11-20"]}} + + normalized = normalize_hierarchy_levels(hierarchy) + + assert "age" in normalized + assert 0 in normalized["age"] + assert isinstance(normalized["age"][0], np.ndarray) + assert normalized["age"][0].tolist() == [1, 2, 3] # converted via parse_value_list + assert normalized["age"][1] == ["0-10", "11-20"] # untouched + + +def test_normalize_hierarchy_levels_multiple_columns(): + hierarchy = {"age": {"0": ["10", "20"]}, "gender": {"0": ["M", "F"], "1": ["*"]}} + + normalized = normalize_hierarchy_levels(hierarchy) + + # First column + assert isinstance(normalized["age"][0], np.ndarray) + assert normalized["age"][0].tolist() == [10, 20] + + # Second column + assert 
isinstance(normalized["gender"][0], np.ndarray) + assert normalized["gender"][0].tolist() == ["M", "F"] + assert normalized["gender"][1] == ["*"] + + +def test_normalize_hierarchy_levels_mixed_digit_non_digit_at_level_0(): + hierarchy = {"test": {"0": ["1", "x", "3"]}} + + normalized = normalize_hierarchy_levels(hierarchy) + + assert isinstance(normalized["test"][0], np.ndarray) + assert normalized["test"][0].tolist() == ["1", "x", "3"] + + +def test_normalize_hierarchy_levels_empty_mapping(): + hierarchy = {"col": {}} + normalized = normalize_hierarchy_levels(hierarchy) + + assert normalized == {"col": {}} diff --git a/tests/field_level_pseudo_anonymisation/__init__.py b/tests/field_level_pseudo_anonymisation/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/field_level_pseudo_anonymisation/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/field_level_pseudo_anonymisation/conftest.py b/tests/field_level_pseudo_anonymisation/conftest.py new file mode 100644 index 0000000..ee54069 --- /dev/null +++ b/tests/field_level_pseudo_anonymisation/conftest.py @@ -0,0 +1,444 @@ +""" +Shared pytest fixtures and helpers for field-level pseudonymisation tests. 
+ +This module provides: +- Mock Vault client for testing without real Vault connections +- Sample data fixtures +- Configuration fixtures for encryption/decryption operations +- Helper functions for running ops and managing test Vault storage +""" + +import pandas as pd +import pytest +from dagster import build_op_context +from cryptography.fernet import Fernet +from hvac.exceptions import InvalidPath, Forbidden +from unittest.mock import patch, MagicMock + +from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( + AnonymisePseudonymizeStructuredConfig, + DepseudonymizeStructuredConfig, + EncryptConfig, + DecryptConfig, + PseudoTechniqueConfig, + DepseudoTechniqueConfig, +) +from template_code_location.field_level_pseudo_anonymisation.ops import ( + anonymize_pseudonymize_structured, + depseudonymize_structured, +) + + +# -------------------------------- Mock Vault Storage ---------------------------------------- + +# In-memory Vault simulation for tests +_test_vault_storage = {} +_test_vault_access_control = {} # For simulating access control + + +@pytest.fixture(autouse=True) +def mock_vault_client(): + """ + Auto-use fixture that mocks the hvac.Client to avoid real Vault connections. + Uses an in-memory dict to simulate Vault storage for tests. + Includes access control simulation for AC3. 
+ """ + global _test_vault_storage, _test_vault_access_control + _test_vault_storage = {} # Reset storage before each test + _test_vault_access_control = {} # Reset access control + + def mock_read_secret(path, mount_point): + """Mock reading secret from Vault with access control""" + full_path = f"{mount_point}/{path}" + + # Check access control first + if full_path in _test_vault_access_control: + if not _test_vault_access_control[full_path]: + raise Forbidden(f"Access denied to secret: {full_path}") + + if full_path not in _test_vault_storage: + raise InvalidPath(f"Secret not found: {full_path}") + return {"data": {"data": {"value": _test_vault_storage[full_path]}}} + + def mock_create_or_update_secret(path, mount_point, secret): + """Mock creating/updating secret in Vault""" + full_path = f"{mount_point}/{path}" + _test_vault_storage[full_path] = secret["value"] + + def mock_delete_metadata(path, mount_point): + """Mock deleting secret from Vault""" + full_path = f"{mount_point}/{path}" + if full_path in _test_vault_storage: + del _test_vault_storage[full_path] + if full_path in _test_vault_access_control: + del _test_vault_access_control[full_path] + + with patch("hvac.Client") as mock_client_class: + mock_instance = MagicMock() + mock_instance.secrets.kv.v2.read_secret_version.side_effect = mock_read_secret + mock_instance.secrets.kv.v2.create_or_update_secret.side_effect = ( + mock_create_or_update_secret + ) + mock_instance.secrets.kv.v2.delete_metadata_and_all_versions.side_effect = ( + mock_delete_metadata + ) + mock_client_class.return_value = mock_instance + yield mock_instance + + +# -------------------------------- Sample Data Fixtures ---------------------------------------- + + +@pytest.fixture +def sample_df(): + """ + Fixture providing a sample structured dataset with PII data. + Represents typical data that requires pseudonymisation and restoration. 
+ """ + return pd.DataFrame( + { + "id": [1, 2, 3, 4, 5], + "name": [ + "Alice Smith", + "Bob Jones", + "Charlie Brown", + "David Wilson", + "Eva Garcia", + ], + "email": [ + "alice@example.com", + "bob@example.com", + "charlie@example.com", + "david@example.com", + "eva@example.com", + ], + "ssn": [ + "123-45-6789", + "234-56-7890", + "345-67-8901", + "456-78-9012", + "567-89-0123", + ], + "age": [25, 30, 35, 40, 45], + "salary": [50000.0, 60000.0, 70000.0, 80000.0, 90000.0], + "department": ["HR", "IT", "Finance", "IT", "HR"], + } + ) + + +# -------------------------------- Configuration Fixtures ---------------------------------------- + + +@pytest.fixture +def encrypt_config_single_field(): + """ + Configuration for encrypting a single field (email). + Used to create pseudonymised data for restoration tests. + """ + return AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + +@pytest.fixture +def decrypt_config_single_field(): + """ + Configuration for decrypting a single field (email). + Used to restore original values. + """ + return DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + +@pytest.fixture +def encrypt_config_multiple_fields(): + """ + Configuration for encrypting multiple fields (name, email, ssn). + Tests restoration of multiple sensitive fields. + """ + return AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + columns=["name", "email", "ssn"], + key_name="test_restoration_key_multi", + ) + ) + ] + ) + + +@pytest.fixture +def decrypt_config_multiple_fields(): + """ + Configuration for decrypting multiple fields (name, email, ssn). 
+ """ + return DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["name", "email", "ssn"], + key_name="test_restoration_key_multi", + ) + ) + ] + ) + + +@pytest.fixture +def encrypt_config_partial_fields(): + """ + Configuration for encrypting only some fields (email, ssn). + Tests partial restoration scenarios. + """ + return AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + columns=["email", "ssn"], + key_name="test_restoration_key_partial", + ) + ) + ] + ) + + +@pytest.fixture +def decrypt_config_partial_fields(): + """ + Configuration for decrypting only some fields (email, ssn). + """ + return DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email", "ssn"], + key_name="test_restoration_key_partial", + ) + ) + ] + ) + + +@pytest.fixture +def authorized_multi_key_scenario(): + """ + Fixture for testing multi-key authorization scenarios. + Sets up two keys: one authorized, one denied. + """ + clear_vault_key("authorized_key") + clear_vault_key("unauthorized_key") + + # Create authorized key by generating it + authorized_key = Fernet.generate_key().decode() + set_vault_key("authorized_key", authorized_key) + + # Create unauthorized key and deny access + unauthorized_key = Fernet.generate_key().decode() + set_vault_key("unauthorized_key", unauthorized_key) + deny_vault_access("unauthorized_key") + + yield {"authorized": "authorized_key", "unauthorized": "unauthorized_key"} + + # Cleanup + clear_vault_key("authorized_key") + clear_vault_key("unauthorized_key") + + +@pytest.fixture +def large_dataset(): + """ + Fixture providing a large dataset (10,000 rows) for performance testing. + Reusable across multiple performance tests. 
+ """ + return pd.DataFrame( + { + "id": range(1, 10001), + "email": [f"user{i}@example.com" for i in range(1, 10001)], + "name": [f"User {i}" for i in range(1, 10001)], + "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in range(1, 10001)], + "age": [20 + (i % 50) for i in range(1, 10001)], + "salary": [30000.0 + (i * 10) for i in range(1, 10001)], + "department": [["HR", "IT", "Finance", "Sales"][i % 4] for i in range(1, 10001)], + } + ) + + +@pytest.fixture(scope="session") +def vault_test_keys(): + """ + Session-scoped fixture to pre-generate test keys for faster test execution. + Avoids repeated key generation in each test. + """ + keys = {f"test_key_{i}": Fernet.generate_key().decode() for i in range(10)} + + return keys + + +@pytest.fixture +def cleanup_test_keys(request): + """ + Fixture to automatically cleanup test keys after each test. + Use with: @pytest.mark.usefixtures("cleanup_test_keys") + """ + yield + + # Cleanup all test keys from mock Vault + test_keys = [k for k in _test_vault_storage.keys() if "test_" in k] + for key in test_keys: + _test_vault_storage.pop(key, None) + + +# -------------------------------- Helper Functions ---------------------------------------- + + +def config_to_dagster_dict(config): + """ + Convert Pydantic config to Dagster-compatible dictionary. 
+ + For AnonymisePseudonymizeStructuredConfig (uses discriminated Union): + Pydantic v2 outputs: {'technique': {'type': 'encrypt', 'columns': [...], 'key_name': '...'}} + Dagster expects: {'technique': {'encrypt': {'columns': [...], 'key_name': '...'}}} + + For DepseudonymizeStructuredConfig (direct DecryptConfig, no Union): + Pydantic v2 outputs: + {'technique': {'type': 'decrypt', 'columns': [...], 'key_name': '...'}} + Dagster expects: Same flat structure with 'type' field + + Args: + config: Pydantic config instance + (AnonymisePseudonymizeStructuredConfig or + DepseudonymizeStructuredConfig) + + Returns: + dict: Dagster-compatible configuration dictionary + """ + from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( + AnonymisePseudonymizeStructuredConfig, + ) + + config_dict = config.model_dump() + + # Only convert discriminated unions for AnonymisePseudonymizeStructuredConfig + # DepseudonymizeStructuredConfig uses direct DecryptConfig (no discriminated union) + if isinstance(config, AnonymisePseudonymizeStructuredConfig): + if "used_function" in config_dict: + for func_config in config_dict["used_function"]: + if "technique" in func_config: + technique = func_config["technique"] + # Pydantic outputs flat dict with 'type' field for discriminated unions + if isinstance(technique, dict) and "type" in technique: + # Extract the type discriminator + technique_type = technique["type"] + # Create nested structure without the 'type' field + technique_data = {k: v for k, v in technique.items() if k != "type"} + # Nest under the discriminator key for Dagster + func_config["technique"] = {technique_type: technique_data} + + return config_dict + + +def run_encrypt_op(config, df): + """ + Helper function to execute the anonymize_pseudonymize_structured op. 
+ + Args: + config: AnonymisePseudonymizeStructuredConfig instance + df: Input pandas DataFrame + + Returns: + tuple: (result_df, metrics) - Output DataFrame and metrics dict + """ + context = build_op_context(op_config=config_to_dagster_dict(config)) + result_df, metrics = anonymize_pseudonymize_structured(context, df=df) + return result_df.value, metrics.value + + +def run_decrypt_op(config, df): + """ + Helper function to execute the depseudonymize_structured op. + + Args: + config: DepseudonymizeStructuredConfig instance + df: Input pandas DataFrame + + Returns: + tuple: (result_df, metrics) - Output DataFrame and metrics dict + """ + context = build_op_context(op_config=config_to_dagster_dict(config)) + result_df, metrics = depseudonymize_structured(context, df=df) + return result_df.value, metrics.value + + +def clear_vault_key(key_name: str): + """ + Helper function to clear a key from the simulated Vault storage for test isolation. + + Args: + key_name: Name of the key to delete from Vault + """ + full_path = f"secret/PseudonymKeys/{key_name}" + if full_path in _test_vault_storage: + del _test_vault_storage[full_path] + if full_path in _test_vault_access_control: + del _test_vault_access_control[full_path] + + +def set_vault_key(key_name: str, key_value: str): + """ + Helper function to set a key in the simulated Vault storage. + + Args: + key_name: Name of the key + key_value: Value of the key (Fernet key as string) + """ + full_path = f"secret/PseudonymKeys/{key_name}" + _test_vault_storage[full_path] = key_value + + +def deny_vault_access(key_name: str): + """ + Helper function to deny access to a key for authorization testing (AC3). + + Args: + key_name: Name of the key to deny access to + """ + full_path = f"secret/PseudonymKeys/{key_name}" + _test_vault_access_control[full_path] = False + + +def get_vault_key(key_name: str) -> bytes: + """ + Helper function to retrieve a key from the simulated Vault storage. 
+ + Args: + key_name: Name of the key to retrieve + + Returns: + bytes: The encryption key + """ + full_path = f"secret/PseudonymKeys/{key_name}" + if full_path not in _test_vault_storage: + raise InvalidPath(f"Key not found: {key_name}") + return _test_vault_storage[full_path].encode() diff --git a/tests/field_level_pseudo_anonymisation/test_config_models_coverage.py b/tests/field_level_pseudo_anonymisation/test_config_models_coverage.py new file mode 100644 index 0000000..010b9a6 --- /dev/null +++ b/tests/field_level_pseudo_anonymisation/test_config_models_coverage.py @@ -0,0 +1,633 @@ +import pytest +from pydantic import ValidationError + +from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( + AnonymisePseudonymizeStructuredConfig, + DepseudonymizeStructuredConfig, + PseudoTechniqueConfig, + DepseudoTechniqueConfig, + HashConfig, + EncryptConfig, + RedactConfig, + ReplaceConfig, + DecryptConfig, +) +from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import ( + AnonymisePseudonymizeUnstructuredConfig, + DepseudonymizeUnstructuredConfig, + PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig, + DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig, + HashConfig as UnstructuredHashConfig, + EncryptConfig as UnstructuredEncryptConfig, + RedactConfig as UnstructuredRedactConfig, + ReplaceConfig as UnstructuredReplaceConfig, + RetainConfig, + DecryptConfig as UnstructuredDecryptConfig, +) +from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum +from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum + + +# ==================== Structured Config Tests ==================== + +class TestStructuredConfigValidators: + """Tests for structured_config.py validators and validators.""" + + def test_ensure_unique_columns_valid_single_technique(self): + """Test that single 
technique with single column passes validation.""" + config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + columns=["email"], + key_name="key1" + ) + ) + ] + ) + assert config is not None + assert len(config.used_function) == 1 + + def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self): + """Test that multiple techniques with different columns passes validation.""" + config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + columns=["email"], + key_name="key1" + ) + ), + PseudoTechniqueConfig( + technique=HashConfig( + columns=["ssn"], + algorithm="sha256" + ) + ) + ] + ) + assert config is not None + assert len(config.used_function) == 2 + + def test_ensure_unique_columns_duplicate_columns_same_technique(self): + """Test that duplicate columns in different techniques raises error.""" + with pytest.raises(ValueError) as exc_info: + AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + columns=["email"], + key_name="key1" + ) + ), + PseudoTechniqueConfig( + technique=HashConfig( + columns=["email"], + algorithm="sha256" + ) + ) + ] + ) + assert "Duplicate column" in str(exc_info.value) + assert "email" in str(exc_info.value) + + def test_ensure_unique_columns_multiple_duplicates(self): + """Test error message with multiple duplicate columns.""" + with pytest.raises(ValueError) as exc_info: + AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + columns=["email", "phone"], + key_name="key1" + ) + ), + PseudoTechniqueConfig( + technique=HashConfig( + columns=["email", "phone"], + algorithm="sha256" + ) + ) + ] + ) + error_msg = str(exc_info.value) + assert "Duplicate column" in error_msg + assert "email" in error_msg + assert "phone" in error_msg + + def 
test_collect_column_to_techniques_single_technique(self): + """Test _collect_column_to_techniques with single technique.""" + config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + columns=["email", "phone"], + key_name="key1" + ) + ) + ] + ) + mapping = config._collect_column_to_techniques() + assert mapping == { + "email": ["encrypt"], + "phone": ["encrypt"] + } + + def test_extract_technique_and_columns_dict_with_type_field(self): + """Test _extract_technique_and_columns with dict containing 'type' field.""" + config = AnonymisePseudonymizeStructuredConfig() + technique_type, columns = config._extract_technique_and_columns( + { + "technique": { + "type": "encrypt", + "columns": ["email", "ssn"], + "key_name": "test_key" + } + } + ) + assert technique_type == "encrypt" + assert columns == ["email", "ssn"] + + def test_extract_technique_and_columns_dict_with_variant_mapping(self): + """Test _extract_technique_and_columns with variant-key mapping {'hash': {...}}.""" + config = AnonymisePseudonymizeStructuredConfig() + technique_type, columns = config._extract_technique_and_columns( + { + "technique": { + "encrypt": { + "columns": ["ssn"], + "key_name": "test_key" + } + } + } + ) + assert technique_type == "encrypt" + assert columns == ["ssn"] + + def test_extract_technique_and_columns_model_instance(self): + """Test _extract_technique_and_columns with PseudoTechniqueConfig model instance.""" + pseudo_config = PseudoTechniqueConfig( + technique=RedactConfig(columns=["address"]) + ) + config = AnonymisePseudonymizeStructuredConfig() + technique_type, columns = config._extract_technique_and_columns(pseudo_config) + assert technique_type == "redact" + assert columns == ["address"] + + def test_extract_technique_and_columns_empty_dict(self): + """Test _extract_technique_and_columns with empty dict.""" + config = AnonymisePseudonymizeStructuredConfig() + technique_type, columns = 
config._extract_technique_and_columns( + {"technique": {}} + ) + assert technique_type is None + assert columns == [] + + def test_extract_technique_and_columns_none_technique(self): + """Test _extract_technique_and_columns with None technique.""" + config = AnonymisePseudonymizeStructuredConfig() + technique_type, columns = config._extract_technique_and_columns( + {"technique": None} + ) + assert technique_type is None + assert columns == [] + + def test_extract_technique_and_columns_missing_columns_key(self): + """Test _extract_technique_and_columns when 'columns' key is missing.""" + config = AnonymisePseudonymizeStructuredConfig() + technique_type, columns = config._extract_technique_and_columns( + { + "technique": { + "type": "encrypt", + "key_name": "test_key" + } + } + ) + assert technique_type == "encrypt" + assert columns == [] + + def test_extract_technique_and_columns_model_without_columns_attr(self): + """Test _extract_technique_and_columns with a ReplaceConfig model instance whose columns are set.""" + pseudo_config = PseudoTechniqueConfig( + technique=ReplaceConfig(columns=["old_value"], new_value="NEW") + ) + config = AnonymisePseudonymizeStructuredConfig() + technique_type, columns = config._extract_technique_and_columns(pseudo_config) + assert technique_type == "replace" + assert columns == ["old_value"] + + +class TestStructuredDepseudonymizeConfig: + """Tests for DepseudonymizeStructuredConfig.""" + + def test_depseudonymize_config_normalize_used_function_with_dict(self): + """Test _normalize_depseudo_used_function with dict input.""" + config = DepseudonymizeStructuredConfig( + used_function=[ + { + "technique": { + "type": "decrypt", + "columns": ["email"], + "key_name": "key1" + } + } + ] + ) + assert len(config.used_function) == 1 + assert isinstance(config.used_function[0], DepseudoTechniqueConfig) + assert config.used_function[0].technique.type == "decrypt" + + def test_depseudonymize_config_normalize_used_function_with_model(self): + """Test
_normalize_depseudo_used_function with model instance.""" + depseudo_tech = DepseudoTechniqueConfig( + technique=DecryptConfig( + columns=["email"], + key_name="key1" + ) + ) + config = DepseudonymizeStructuredConfig( + used_function=[depseudo_tech] + ) + assert len(config.used_function) == 1 + assert config.used_function[0] is depseudo_tech + + def test_depseudonymize_config_ensure_unique_columns_no_op(self): + """Test that ensure_unique_columns is a no-op for depseudonymize.""" + # For depseudonymize, there's no per-column uniqueness constraint + config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + columns=["email"], + key_name="key1" + ) + ), + DepseudoTechniqueConfig( + technique=DecryptConfig( + columns=["email"], + key_name="key2" + ) + ) + ] + ) + # Should not raise - no-op validator + assert config is not None + + +# ==================== Unstructured Config Tests ==================== + +class TestUnstructuredConfigValidators: + """Tests for unstructured_config.py validators.""" + + def test_normalize_used_function_with_dict(self): + """Test _normalize_used_function with dict input.""" + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + { + "technique": { + "encrypt": { + "pii": [PIIEntityEnum.EMAIL.value], + "key_name": "key1" + } + } + } + ] + ) + assert len(config.used_function) == 1 + + def test_normalize_used_function_with_model(self): + """Test _normalize_used_function with model instance.""" + pseudo_tech = UnstructuredPseudoTechniqueConfig( + technique=UnstructuredEncryptConfig( + pii=[PIIEntityEnum.EMAIL.value], + key_name="key1" + ) + ) + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[pseudo_tech] + ) + assert len(config.used_function) == 1 + + def test_ensure_unique_pii_valid_different_pii_types(self): + """Test that different PII types pass validation.""" + config = 
AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredEncryptConfig( + pii=[PIIEntityEnum.EMAIL.value], + key_name="key1" + ) + ), + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredHashConfig( + pii=[PIIEntityEnum.PERSON.value], + algorithm="sha256" + ) + ) + ] + ) + assert config is not None + assert len(config.used_function) == 2 + + def test_ensure_unique_pii_duplicate_pii_types(self): + """Test that duplicate PII types raise error.""" + with pytest.raises(ValueError) as exc_info: + AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredEncryptConfig( + pii=[PIIEntityEnum.EMAIL.value], + key_name="key1" + ) + ), + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredHashConfig( + pii=[PIIEntityEnum.EMAIL.value], + algorithm="sha256" + ) + ) + ] + ) + assert "Duplicate PII" in str(exc_info.value) + # Error message shows PIIEntityEnum.EMAIL (the enum repr) rather than the value + assert "EMAIL" in str(exc_info.value) + + def test_collect_pii_to_techniques_single_technique(self): + """Test _collect_pii_to_techniques with single technique.""" + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredEncryptConfig( + pii=[PIIEntityEnum.EMAIL.value, PIIEntityEnum.PERSON.value], + key_name="key1" + ) + ) + ] + ) + mapping = config._collect_pii_to_techniques() + assert mapping == { + PIIEntityEnum.EMAIL.value: ["encrypt"], + PIIEntityEnum.PERSON.value: ["encrypt"] + } + + def test_extract_technique_and_pii_dict_with_type_field(self): + """Test _extract_technique_and_pii with dict containing 'type' field.""" + config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) + technique_type, piis = config._extract_technique_and_pii( + { + "technique": { + 
"type": "encrypt", + "pii": [PIIEntityEnum.EMAIL.value], + "key_name": "test_key" + } + } + ) + assert technique_type == "encrypt" + assert piis == [PIIEntityEnum.EMAIL.value] + + def test_extract_technique_and_pii_dict_with_variant_mapping(self): + """Test _extract_technique_and_pii with variant-key mapping.""" + config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) + technique_type, piis = config._extract_technique_and_pii( + { + "technique": { + "hash": { + "pii": [PIIEntityEnum.PERSON.value], + "algorithm": "sha256" + } + } + } + ) + assert technique_type == "hash" + assert piis == [PIIEntityEnum.PERSON.value] + + def test_extract_technique_and_pii_dict_fallback_to_columns(self): + """Test _extract_technique_and_pii fallback to 'columns' key when 'pii' is missing.""" + config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) + technique_type, piis = config._extract_technique_and_pii( + { + "technique": { + "type": "redact", + "columns": ["fallback_col"] + } + } + ) + assert technique_type == "redact" + assert piis == ["fallback_col"] + + def test_extract_technique_and_pii_model_instance(self): + """Test _extract_technique_and_pii with model instance.""" + pseudo_tech = UnstructuredPseudoTechniqueConfig( + technique=UnstructuredRedactConfig( + pii=[PIIEntityEnum.EMAIL.value] + ) + ) + config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) + technique_type, piis = config._extract_technique_and_pii(pseudo_tech) + assert technique_type == "redact" + assert piis == [PIIEntityEnum.EMAIL.value] + + def test_extract_technique_and_pii_model_with_getattr_fallback(self): + """Test _extract_technique_and_pii model with getattr fallback to columns.""" + # Create a mock-like scenario where pii attribute doesn't exist + pseudo_tech = UnstructuredPseudoTechniqueConfig( + technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value]) + ) + config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) + 
technique_type, piis = config._extract_technique_and_pii(pseudo_tech) + assert technique_type == "retain" + assert piis == [PIIEntityEnum.PERSON.value] + + def test_extract_technique_and_pii_empty_dict(self): + """Test _extract_technique_and_pii with empty dict.""" + config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) + technique_type, piis = config._extract_technique_and_pii( + {"technique": {}} + ) + assert technique_type is None + assert piis == [] + + def test_extract_technique_and_pii_missing_pii_key(self): + """Test _extract_technique_and_pii when 'pii' key is missing.""" + config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) + technique_type, piis = config._extract_technique_and_pii( + { + "technique": { + "type": "encrypt", + "key_name": "test_key" + } + } + ) + assert technique_type == "encrypt" + assert piis == [] + + +class TestUnstructuredDepseudonymizeConfig: + """Tests for DepseudonymizeUnstructuredConfig.""" + + def test_depseudonymize_unstructured_config_default(self): + """Test default DepseudonymizeUnstructuredConfig.""" + config = DepseudonymizeUnstructuredConfig() + assert config is not None + assert len(config.used_function) >= 1 + + def test_depseudonymize_unstructured_config_with_custom_function(self): + """Test DepseudonymizeUnstructuredConfig with custom function.""" + config = DepseudonymizeUnstructuredConfig( + used_function=[ + UnstructuredDepseudoTechniqueConfig( + technique=UnstructuredDecryptConfig( + key_name="custom_key" + ) + ) + ] + ) + assert len(config.used_function) == 1 + assert config.used_function[0].technique.key_name == "custom_key" + + +class TestLanguageSupport: + """Tests for language configuration support.""" + + def test_all_supported_languages(self): + """Test that all supported languages can be set.""" + supported_languages = [ + LanguageEnum.hr, LanguageEnum.da, LanguageEnum.nl, LanguageEnum.en, + LanguageEnum.fi, LanguageEnum.fr, LanguageEnum.de, LanguageEnum.el, + 
LanguageEnum.it, LanguageEnum.lt, LanguageEnum.pl, LanguageEnum.pt, + LanguageEnum.ro, LanguageEnum.sl, LanguageEnum.es, LanguageEnum.sv + ] + + for lang in supported_languages: + config = AnonymisePseudonymizeUnstructuredConfig(language=lang) + assert config.language == lang + + def test_default_language_is_english(self): + """Test that default language is English.""" + config = AnonymisePseudonymizeUnstructuredConfig() + assert config.language == LanguageEnum.en + + +class TestTechniqueConfigDefaults: + """Tests for technique config defaults.""" + + def test_hash_config_default_algorithm(self): + """Test HashConfig default algorithm.""" + config = HashConfig() + assert config.algorithm == "sha256" + assert config.type == "hash" + + def test_encrypt_config_defaults(self): + """Test EncryptConfig defaults.""" + config = EncryptConfig() + assert config.type == "encrypt" + assert config.key_name == "my_key" + + def test_redact_config_defaults(self): + """Test RedactConfig defaults.""" + config = RedactConfig() + assert config.type == "redact" + + def test_replace_config_defaults(self): + """Test ReplaceConfig defaults.""" + config = ReplaceConfig() + assert config.type == "replace" + assert config.new_value == "REPLACED" + + def test_decrypt_config_defaults(self): + """Test DecryptConfig defaults.""" + config = DecryptConfig() + assert config.type == "decrypt" + assert config.key_name == "my_key" + + def test_unstructured_retain_config_defaults(self): + """Test RetainConfig defaults.""" + config = RetainConfig() + assert config.type == "retain" + + +class TestPseudoTechniqueConfigDefaults: + """Tests for PseudoTechniqueConfig defaults.""" + + def test_pseudo_technique_default_to_hash(self): + """Test PseudoTechniqueConfig defaults to hash technique.""" + config = PseudoTechniqueConfig() + # For Dagster Config, technique may be a dict with the discriminator structure + if isinstance(config.technique, dict): + # Check if it has hash configuration + assert "hash" in 
config.technique or config.technique.get("type") == "hash" + else: + assert config.technique.type == "hash" + + def test_unstructured_pseudo_technique_default_to_hash(self): + """Test UnstructuredPseudoTechniqueConfig defaults to hash technique.""" + config = UnstructuredPseudoTechniqueConfig() + # For Dagster Config, technique may be a dict with the discriminator structure + if isinstance(config.technique, dict): + # Check if it has hash configuration + assert "hash" in config.technique or config.technique.get("type") == "hash" + else: + assert config.technique.type == "hash" + + +class TestConfigModelIntegration: + """Integration tests for config models.""" + + def test_structured_config_with_all_technique_types(self): + """Test structured config with all technique types.""" + config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=HashConfig(columns=["col1"]) + ), + PseudoTechniqueConfig( + technique=EncryptConfig(columns=["col2"], key_name="k1") + ), + PseudoTechniqueConfig( + technique=RedactConfig(columns=["col3"]) + ), + PseudoTechniqueConfig( + technique=ReplaceConfig(columns=["col4"], new_value="X") + ) + ] + ) + assert len(config.used_function) == 4 + techniques = {f.technique.type for f in config.used_function} + assert techniques == {"hash", "encrypt", "redact", "replace"} + + def test_unstructured_config_with_all_technique_types(self): + """Test unstructured config with all technique types.""" + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value]) + ), + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredEncryptConfig( + pii=[PIIEntityEnum.PERSON.value], + key_name="k1" + ) + ), + UnstructuredPseudoTechniqueConfig( + technique=UnstructuredRedactConfig(pii=[PIIEntityEnum.PHONE_NUMBERS.value]) + ), + UnstructuredPseudoTechniqueConfig( + 
technique=UnstructuredReplaceConfig( + pii=[PIIEntityEnum.CREDIT_CARD.value], + new_value="X" + ) + ), + UnstructuredPseudoTechniqueConfig( + technique=RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value]) + ) + ] + ) + assert len(config.used_function) == 5 + techniques = {f.technique.type for f in config.used_function} + assert techniques == {"hash", "encrypt", "redact", "replace", "retain"} diff --git a/tests/field_level_pseudo_anonymisation/test_decrypt_structured.py b/tests/field_level_pseudo_anonymisation/test_decrypt_structured.py new file mode 100644 index 0000000..9ed013a --- /dev/null +++ b/tests/field_level_pseudo_anonymisation/test_decrypt_structured.py @@ -0,0 +1,1090 @@ +""" +Test suite for data restoration (depseudonymization) operations. + +This test suite validates the data restoration feature against the following Acceptance Criteria: + +## Test Coverage Summary + +### Acceptance Criteria Coverage: +- AC1 (Data Restoration with Valid Key): 7 tests +- AC2 (Restoration Denial - Missing Key): 3 tests +- AC3 (Restoration Denial - Unauthorized Access): 2 tests +- AC4 (Restoration Denial - Invalid Key): 3 tests +- Additional Coverage: 3 tests + +### Test Pattern: +- Each test uses build_op_context with .model_dump() for configuration +- Tests validate dual outputs (data, metrics) +- Tests verify complete restoration of original values +- Tests validate security controls and error handling + +""" + +import pandas as pd +import pytest +from cryptography.fernet import Fernet + +from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( + AnonymisePseudonymizeStructuredConfig, + DepseudonymizeStructuredConfig, + EncryptConfig, + DecryptConfig, + PseudoTechniqueConfig, + DepseudoTechniqueConfig, +) + +# Import helper functions (fixtures are auto-discovered by pytest) +from .conftest import ( + run_encrypt_op, + run_decrypt_op, + clear_vault_key, + set_vault_key, + deny_vault_access, + get_vault_key, +) + + +# 
-------------------------------- Test Markers Configuration -------------------------------- + +# NOTE(review): these self-assignments do not register markers with pytest; register custom markers via the `markers` option in the pytest configuration +pytest.mark.slow = pytest.mark.slow +pytest.mark.security = pytest.mark.security +pytest.mark.edge_case = pytest.mark.edge_case +pytest.mark.integration = pytest.mark.integration + + +# ---------------------- AC1: Data Restoration with Valid Key -------------------------------- + + +def test_ac1_restore_single_encrypted_field_with_valid_key( + sample_df, encrypt_config_single_field, decrypt_config_single_field +): + """ + AC1: Data Restoration using Secret Management Tool-Stored Decryption Key + + Scenario: Restore encrypted field with a valid key + Given: A pseudonymised dataset with encrypted email field + And: A valid decryption key stored in secret management tool + And: The participant provided the field that needs to be restored (email) + And: The participant is authorized + When: The participant requests data restoration + And: Provides the correct key name + Then: The system retrieves the key from secret management tool + And: Decrypts the dataset accurately + And: All original values are restored + And: A success message is presented to the user (via successful return) + And: The result is presented to the user + """ + # Clear any existing test key + clear_vault_key("test_restoration_key_single") + + # Step 1: Encrypt the data (pseudonymisation phase) + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Verify encryption occurred + assert not encrypted_df["email"].equals(sample_df["email"]), "Email field should be encrypted" + + # Verify key was created in Vault + key = get_vault_key("test_restoration_key_single") + assert key is not None, "Encryption key should exist in Vault" + + # Step 2: Restore the data (depseudonymisation phase) + restored_df, metrics = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) + + # Verify restoration succeeded + assert restored_df is not None, "Restored
DataFrame should not be None" + assert metrics is not None, "Metrics should not be None" + + # Verify all original values are restored exactly + assert restored_df["email"].equals( + sample_df["email"] + ), "Email field should be restored to original values" + + # Verify each individual value + for idx, (original, restored) in enumerate(zip(sample_df["email"], restored_df["email"])): + assert ( + original == restored + ), f"Row {idx}: Original '{original}' should match restored '{restored}'" + + # Verify row count preserved + assert len(restored_df) == len(sample_df), "Row count should be preserved during restoration" + + # Verify non-encrypted columns remain unchanged + assert restored_df["name"].equals( + sample_df["name"] + ), "Non-encrypted fields should remain unchanged" + assert restored_df["age"].equals( + sample_df["age"] + ), "Non-encrypted fields should remain unchanged" + assert restored_df["department"].equals( + sample_df["department"] + ), "Non-encrypted fields should remain unchanged" + + +def test_ac1_restore_multiple_encrypted_fields_with_valid_key( + sample_df, encrypt_config_multiple_fields, decrypt_config_multiple_fields +): + """ + AC1: Data Restoration of multiple encrypted fields with a valid key + + Scenario: Restore multiple encrypted fields (name, email, ssn) with a valid key + Given: A pseudonymised dataset with multiple encrypted fields + And: A valid decryption key stored in secret management tool + And: The participant provided the fields that need to be restored + When: The participant requests data restoration + Then: All specified fields are decrypted accurately + And: All original values are restored + """ + clear_vault_key("test_restoration_key_multi") + + # Encrypt multiple fields + encrypted_df, _ = run_encrypt_op(encrypt_config_multiple_fields, sample_df.copy()) + + # Verify all specified fields were encrypted + assert not encrypted_df["name"].equals(sample_df["name"]), "Name should be encrypted" + assert not 
encrypted_df["email"].equals(sample_df["email"]), "Email should be encrypted" + assert not encrypted_df["ssn"].equals(sample_df["ssn"]), "SSN should be encrypted" + + # Restore all encrypted fields + restored_df, _ = run_decrypt_op(decrypt_config_multiple_fields, encrypted_df.copy()) + + # Verify all fields restored to original values + assert restored_df["name"].equals( + sample_df["name"] + ), "Name field should be restored to original values" + assert restored_df["email"].equals( + sample_df["email"] + ), "Email field should be restored to original values" + assert restored_df["ssn"].equals( + sample_df["ssn"] + ), "SSN field should be restored to original values" + + # Verify non-encrypted columns remain unchanged + assert restored_df["age"].equals( + sample_df["age"] + ), "Non-encrypted fields should remain unchanged" + assert restored_df["salary"].equals( + sample_df["salary"] + ), "Non-encrypted fields should remain unchanged" + + +def test_ac1_restore_partial_fields_leaves_others_encrypted( + sample_df, encrypt_config_multiple_fields +): + """ + AC1: Partial restoration - participant specifies only some fields to restore + + Scenario: Restore only selected fields while leaving others encrypted + Given: A pseudonymised dataset with multiple encrypted fields (name, email, ssn) + And: The participant specifies only some fields to restore (e.g., only email) + When: The participant requests partial restoration + Then: Only the specified fields are decrypted + And: Other encrypted fields remain encrypted + """ + clear_vault_key("test_restoration_key_multi") + + # Encrypt multiple fields + encrypted_df, _ = run_encrypt_op(encrypt_config_multiple_fields, sample_df.copy()) + + # Create config to restore only email field + partial_decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], # Only restore email + key_name="test_restoration_key_multi", + ) + ) + ] + ) + 
+ # Restore only email field + restored_df, _ = run_decrypt_op(partial_decrypt_config, encrypted_df.copy()) + + # Verify email is restored + assert restored_df["email"].equals( + sample_df["email"] + ), "Email field should be restored to original values" + + # Verify other fields remain encrypted (different from original) + assert not restored_df["name"].equals(sample_df["name"]), "Name field should remain encrypted" + assert not restored_df["ssn"].equals(sample_df["ssn"]), "SSN field should remain encrypted" + + +def test_ac1_restore_preserves_data_types(sample_df): + """ + AC1: Data restoration returns the original values for fields of mixed data types + + Scenario: Restore encrypted numeric and string fields + Given: A dataset with mixed data types (strings, integers, floats) + When: Fields are encrypted and then restored + Then: Original values are restored, with non-string values returned as their string representation + """ + # Create config to encrypt mixed types + encrypt_config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + columns=["name", "age", "salary"], + key_name="test_restoration_types", + ) + ) + ] + ) + + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["name", "age", "salary"], + key_name="test_restoration_types", + ) + ) + ] + ) + + clear_vault_key("test_restoration_types") + + # Encrypt and restore + encrypted_df, _ = run_encrypt_op(encrypt_config, sample_df.copy()) + restored_df, _ = run_decrypt_op(decrypt_config, encrypted_df.copy()) + + # Verify values are restored (as strings due to encryption/decryption) + # Note: Fernet encryption/decryption converts everything to strings + # This is expected behavior - original types are preserved via string representation + assert ( + restored_df["name"].tolist() == sample_df["name"].tolist() + ), "String values should be restored" + assert ( + restored_df["age"].tolist() ==
sample_df["age"].astype(str).tolist() + ), "Integer values should be restored as strings" + assert ( + restored_df["salary"].tolist() == sample_df["salary"].astype(str).tolist() + ), "Float values should be restored as strings" + + +def test_ac1_restore_empty_dataframe(encrypt_config_single_field, decrypt_config_single_field): + """ + AC1: Edge case - restore an empty dataset + + Scenario: Attempt to restore an empty pseudonymised dataset + Given: An empty DataFrame with correct schema + When: Restoration is attempted + Then: Operation completes successfully without errors + And: Returns an empty DataFrame + """ + clear_vault_key("test_restoration_key_single") + + # Create empty DataFrame with same schema + empty_df = pd.DataFrame(columns=["id", "name", "email", "ssn", "age", "salary", "department"]) + + # Encrypt (should handle empty DataFrame) + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, empty_df.copy()) + + # Restore (should also handle empty DataFrame) + restored_df, metrics = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) + + # Verify empty DataFrame returned + assert len(restored_df) == 0, "Restored DataFrame should be empty" + assert list(restored_df.columns) == list(empty_df.columns), "Column schema should be preserved" + + +def test_ac1_restore_with_special_characters( + encrypt_config_single_field, decrypt_config_single_field +): + """ + AC1: Data restoration with special characters and edge case values + + Scenario: Restore data containing special characters, unicode, etc. 
+ Given: A dataset with special characters in string fields + When: Data is encrypted and then restored + Then: All special characters are preserved accurately + """ + clear_vault_key("test_restoration_key_single") + + # Create DataFrame with special characters + special_df = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "name": ["José García", "François Müller", "李明", "O'Brien"], + "email": [ + "josé@example.com", + "françois@example.com", + "li@example.cn", + "o'brien@example.ie", + ], + "ssn": ["123-45-6789", "234-56-7890", "345-67-8901", "456-78-9012"], + "age": [25, 30, 35, 40], + "salary": [50000.0, 60000.0, 70000.0, 80000.0], + "department": ["HR", "IT", "Finance", "IT"], + } + ) + + # Encrypt and restore + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, special_df.copy()) + restored_df, _ = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) + + # Verify special characters preserved + assert restored_df["email"].equals( + special_df["email"] + ), "Special characters should be preserved during restoration" + + for idx, (original, restored) in enumerate(zip(special_df["email"], restored_df["email"])): + assert ( + original == restored + ), f"Row {idx}: Special characters in '{original}' should be preserved" + + +# ------------------- AC2: Restoration Denial when Key is Missing ---------------------------- + + +def test_ac2_restore_fails_when_key_missing(sample_df, encrypt_config_single_field): + """ + AC2: Restoration Denial when Decryption Key is missing + + Scenario: Attempt to restore encrypted fields when decryption key is missing + Given: A pseudonymised dataset + And: The decryption key is missing from Vault + And: The participant provides the correct key name + When: The participant attempts to restore the data + Then: The system fails the restoration request + And: Logs the failed key retrieval for auditing (via exception) + And: An error message is presented to the user + """ + clear_vault_key("test_restoration_key_single") + + 
# Encrypt data first + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Delete the key from Vault to simulate missing key + clear_vault_key("test_restoration_key_single") + + # Create decrypt config with missing key + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + # Attempt restoration - should fail with clear error + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config, encrypted_df.copy()) + + # Verify error message is informative + error_message = str(exc_info.value) + assert ( + "not found" in error_message.lower() or "decrypt" in error_message.lower() + ), "Error message should indicate key not found for decrypt operation" + assert ( + "test_restoration_key_single" in error_message + ), "Error message should include the key name for auditing" + + +def test_ac2_restore_fails_with_nonexistent_key_name(sample_df, encrypt_config_single_field): + """ + AC2: Restoration fails when using a key name that never existed + + Scenario: Attempt to restore with a key name that was never created + Given: A pseudonymised dataset + And: A key name that does not exist in Vault + When: The participant attempts to restore the data + Then: The system fails the restoration request with appropriate error + """ + clear_vault_key("test_restoration_key_single") + + # Encrypt data with one key + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Try to decrypt with a different, non-existent key + decrypt_config_wrong_key = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", columns=["email"], key_name="nonexistent_key_name" + ) + ) + ] + ) + + # Attempt restoration - should fail + with pytest.raises(ValueError) as exc_info: + 
run_decrypt_op(decrypt_config_wrong_key, encrypted_df.copy()) + + error_message = str(exc_info.value) + assert "not found" in error_message.lower(), "Error message should indicate key not found" + + +def test_ac2_restore_fails_when_key_corrupted(sample_df, encrypt_config_single_field): + """ + AC2: Restoration Denial when Decryption Key is corrupted + + Scenario: Attempt to restore when key is corrupted in Vault + Given: A pseudonymised dataset + And: The decryption key is corrupted (invalid format) + When: The participant attempts to restore the data + Then: The system fails the restoration request + And: An appropriate error message is presented + """ + clear_vault_key("test_restoration_key_single") + + # Encrypt data first + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Corrupt the key by replacing it with invalid data + set_vault_key("test_restoration_key_single", "corrupted_invalid_key_data") + + # Create decrypt config + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + # Attempt restoration - should fail due to corrupted key + with pytest.raises(Exception) as exc_info: + run_decrypt_op(decrypt_config, encrypted_df.copy()) + + # Should raise either ValueError or Fernet-related exception + assert "Fernet" in str(type(exc_info.value)) or "ValueError" in str( + type(exc_info.value) + ), "Should raise Fernet or ValueError for corrupted key" + + +# ------------- AC3: Restoration Denial when Access is Unauthorized -------------------------- + + +def test_ac3_restore_fails_when_access_unauthorized(sample_df, encrypt_config_single_field): + """ + AC3: Restoration Denial when Decryption Key access is unauthorized + + Scenario: Attempt to restore encrypted fields without authorization + Given: A pseudonymised dataset + And: A decryption key in secret management 
tool + And: The participant is not authorized to access the key + When: The participant attempts to restore the data + Then: The system denies the participant access to the key + And: The system denies the initiation of the restoration process + And: The system logs the unauthorized access attempt (via exception) + And: An appropriate error message is presented to the user + """ + clear_vault_key("test_restoration_key_single") + + # Encrypt data first + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Set access control to deny access + deny_vault_access("test_restoration_key_single") + + # Create decrypt config + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + # Attempt restoration - should fail with ValueError (wrapping Forbidden) + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config, encrypted_df.copy()) + + # Verify error indicates access denial + error_message = str(exc_info.value) + assert ( + "access denied" in error_message.lower() or "error while reading" in error_message.lower() + ), "Error message should indicate access denial or error reading key" + assert ( + "test_restoration_key_single" in error_message + ), "Error message should include the key name for auditing" + + +def test_ac3_restore_multiple_keys_with_mixed_authorization(sample_df): + """ + AC3: Restoration with mixed authorization - some keys authorized, others not + + Scenario: Attempt to restore multiple fields where user has access to some keys but not others + Given: A pseudonymised dataset with multiple encrypted fields using different keys + And: The participant is authorized for some keys but not others + When: The participant attempts to restore all fields + Then: The system denies access when unauthorized key is encountered + """ + # Encrypt email with 
one key, ssn with another + encrypt_config_multi_keys = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["email"], key_name="authorized_key" + ) + ) + ] + ) + + clear_vault_key("authorized_key") + clear_vault_key("unauthorized_key") + + # Encrypt data + encrypted_df, _ = run_encrypt_op(encrypt_config_multi_keys, sample_df.copy()) + + # Manually encrypt another field with different key (simulating separate encryption) + encrypt_config_ssn = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["ssn"], key_name="unauthorized_key" + ) + ) + ] + ) + encrypted_df, _ = run_encrypt_op(encrypt_config_ssn, encrypted_df.copy()) + + # Deny access to unauthorized_key + deny_vault_access("unauthorized_key") + + # Try to decrypt both fields + decrypt_config_both = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", columns=["email"], key_name="authorized_key" + ) + ), + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", columns=["ssn"], key_name="unauthorized_key" + ) + ), + ] + ) + + # Should fail when trying to access unauthorized_key with ValueError (wrapping Forbidden) + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config_both, encrypted_df.copy()) + + # Verify error indicates access issue with unauthorized key + error_message = str(exc_info.value) + assert ( + "access denied" in error_message.lower() or "error while reading" in error_message.lower() + ), "Error message should indicate access denial" + assert "unauthorized_key" in error_message, "Error message should mention the unauthorized key" + + +# ------------------- AC4: Restoration Denial when Key is Invalid ---------------------------- + + +def test_ac4_restore_fails_with_wrong_key(sample_df): + """ + AC4: Restoration Denial when 
Decryption Key is invalid + + Scenario: Attempt to restore encrypted fields with a key that doesn't match the encryption key + Given: A pseudonymised dataset encrypted with key A + And: A different valid decryption key B is stored in secret management tool + And: The participant provides key B (which is not the correct key) + And: Key B does not correspond to the fields to be restored + When: The participant attempts to restore the data + Then: The system fails the restoration request + And: Logs the failed decryption attempt for auditing (via exception) + And: An error message is presented to the user + """ + # Encrypt with one key + encrypt_config_key_a = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["email"], key_name="encryption_key_a" + ) + ) + ] + ) + + clear_vault_key("encryption_key_a") + clear_vault_key("encryption_key_b") + + # Encrypt data with key A + encrypted_df, _ = run_encrypt_op(encrypt_config_key_a, sample_df.copy()) + + # Generate a different valid key B in Vault + different_key = Fernet.generate_key().decode() + set_vault_key("encryption_key_b", different_key) + + # Try to decrypt with key B (wrong key) + decrypt_config_key_b = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", columns=["email"], key_name="encryption_key_b" + ) + ) + ] + ) + + # Attempt restoration - should fail with InvalidToken or ValueError + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config_key_b, encrypted_df.copy()) + + # Verify error message indicates decryption failure + error_message = str(exc_info.value) + assert ( + "invalid" in error_message.lower() or "token" in error_message.lower() + ), "Error message should indicate invalid token or decryption failure" + assert ( + "encryption_key_b" in error_message + ), "Error message should include the key name for auditing" + + +def 
test_ac4_restore_fails_with_key_from_different_field(sample_df): + """ + AC4: Restoration fails when using a key intended for a different field + + Scenario: Attempt to restore field A using the key for field B + Given: A dataset with multiple fields encrypted with different keys + And: The participant provides the key for field B to decrypt field A + When: The participant attempts to restore field A + Then: The system fails the restoration request + """ + # Encrypt email and ssn with different keys + encrypt_config_email = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig(type="encrypt", columns=["email"], key_name="email_key") + ) + ] + ) + + encrypt_config_ssn = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig(type="encrypt", columns=["ssn"], key_name="ssn_key") + ) + ] + ) + + clear_vault_key("email_key") + clear_vault_key("ssn_key") + + # Encrypt both fields + encrypted_df, _ = run_encrypt_op(encrypt_config_email, sample_df.copy()) + encrypted_df, _ = run_encrypt_op(encrypt_config_ssn, encrypted_df.copy()) + + # Try to decrypt email field using ssn_key + decrypt_config_wrong_field = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], # Trying to decrypt email + key_name="ssn_key", # But using ssn's key + ) + ) + ] + ) + + # Should fail with InvalidToken + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config_wrong_field, encrypted_df.copy()) + + error_message = str(exc_info.value) + assert ( + "invalid" in error_message.lower() or "token" in error_message.lower() + ), "Error message should indicate invalid token" + + +def test_ac4_restore_fails_with_tampered_encrypted_data(sample_df, encrypt_config_single_field): + """ + AC4: Restoration fails when encrypted data has been tampered with + + Scenario: Attempt to restore encrypted data 
that has been modified + Given: A pseudonymised dataset + And: Some encrypted values have been tampered with + And: The correct decryption key is provided + When: The participant attempts to restore the data + Then: The system fails the restoration for tampered values + And: An appropriate error message is presented + """ + clear_vault_key("test_restoration_key_single") + + # Encrypt data + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Tamper with encrypted data (modify one encrypted value) + encrypted_df.loc[0, "email"] = "tampered_invalid_encrypted_data" + + # Create decrypt config + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + # Attempt restoration - should fail on tampered data + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config, encrypted_df.copy()) + + error_message = str(exc_info.value) + assert ( + "invalid" in error_message.lower() or "token" in error_message.lower() + ), "Error message should indicate invalid token due to tampering" + + +# ---------------- Additional Edge Cases and Integration Tests ------------------------------- + + +def test_integration_full_cycle_encrypt_decrypt_multiple_operations(sample_df): + """ + Integration test: Full cycle of multiple encrypt/decrypt operations + + Scenario: Complex workflow with multiple encryption and restoration operations + Given: A dataset + When: Multiple fields are encrypted at different times + And: Fields are restored in different orders + Then: All operations complete successfully + And: Final restored data matches original + """ + # Phase 1: Encrypt email + encrypt_config_1 = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key_1") + ) + ] + ) + 
clear_vault_key("key_1") + encrypted_df_1, _ = run_encrypt_op(encrypt_config_1, sample_df.copy()) + + # Phase 2: Encrypt name and ssn + encrypt_config_2 = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig(type="encrypt", columns=["name", "ssn"], key_name="key_2") + ) + ] + ) + clear_vault_key("key_2") + encrypted_df_2, _ = run_encrypt_op(encrypt_config_2, encrypted_df_1.copy()) + + # Phase 3: Restore email first + decrypt_config_1 = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig(type="decrypt", columns=["email"], key_name="key_1") + ) + ] + ) + restored_df_1, _ = run_decrypt_op(decrypt_config_1, encrypted_df_2.copy()) + assert restored_df_1["email"].equals(sample_df["email"]), "Email should be restored" + + # Phase 4: Restore name and ssn + decrypt_config_2 = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig(type="decrypt", columns=["name", "ssn"], key_name="key_2") + ) + ] + ) + restored_df_2, _ = run_decrypt_op(decrypt_config_2, restored_df_1.copy()) + + # Verify all fields restored + assert restored_df_2["email"].equals(sample_df["email"]), "Email should remain restored" + assert restored_df_2["name"].equals(sample_df["name"]), "Name should be restored" + assert restored_df_2["ssn"].equals(sample_df["ssn"]), "SSN should be restored" + + +def test_restore_with_null_values(encrypt_config_single_field, decrypt_config_single_field): + """ + Edge case: Restoration of dataset with null/NaN values + + Scenario: Dataset contains null values in encrypted fields + Given: A dataset with null values in fields to be encrypted + When: Data is encrypted and then restored + Then: Null values are handled appropriately + """ + clear_vault_key("test_restoration_key_single") + + # Create DataFrame with null values + df_with_nulls = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "name": ["Alice", "Bob", None, "David"], + 
"email": [ + "alice@example.com", + None, + "charlie@example.com", + "david@example.com", + ], + "ssn": ["123-45-6789", "234-56-7890", "345-67-8901", None], + "age": [25, 30, 35, 40], + "salary": [50000.0, 60000.0, 70000.0, 80000.0], + "department": ["HR", "IT", "Finance", "IT"], + } + ) + + # Note: Encryption of NaN/None values will convert them to string "nan" or "None" + # This is expected behavior - Fernet encryption requires string input + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, df_with_nulls.copy()) + restored_df, _ = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) + + # Verify non-null values are restored correctly + assert restored_df.loc[0, "email"] == "alice@example.com" + assert restored_df.loc[2, "email"] == "charlie@example.com" + assert restored_df.loc[3, "email"] == "david@example.com" + + +def test_restore_large_dataset_performance(): + """ + Performance test: Restoration of large dataset + + Scenario: Restore a large dataset with many rows + Given: A large dataset with 10,000 rows + When: Data is encrypted and then restored + Then: Operation completes without errors or timeout + And: All values are restored correctly + """ + # Create large dataset + large_df = pd.DataFrame( + { + "id": range(1, 10001), + "email": [f"user{i}@example.com" for i in range(1, 10001)], + "name": [f"User {i}" for i in range(1, 10001)], + "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in range(1, 10001)], + "age": [20 + (i % 50) for i in range(1, 10001)], + "salary": [30000 + (i * 10) for i in range(1, 10001)], + "department": [["HR", "IT", "Finance", "Sales"][i % 4] for i in range(1, 10001)], + } + ) + + encrypt_config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["email"], key_name="test_large_dataset" + ) + ) + ] + ) + + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + 
type="decrypt", columns=["email"], key_name="test_large_dataset" + ) + ) + ] + ) + + clear_vault_key("test_large_dataset") + + # Encrypt and restore + encrypted_df, _ = run_encrypt_op(encrypt_config, large_df.copy()) + restored_df, _ = run_decrypt_op(decrypt_config, encrypted_df.copy()) + + # Verify sample of values + assert len(restored_df) == 10000, "Should restore all 10,000 rows" + assert restored_df["email"].equals(large_df["email"]), "All emails should be restored" + + # Spot check specific values + assert restored_df.loc[0, "email"] == "user1@example.com" + assert restored_df.loc[5000, "email"] == "user5001@example.com" + assert restored_df.loc[9999, "email"] == "user10000@example.com" + + +@pytest.mark.edge_case +@pytest.mark.security +def test_restore_after_key_rotation(sample_df, encrypt_config_single_field): + """ + AC4: Restoration fails after key rotation (key changed in Vault) + + Scenario: Key is rotated in Vault after encryption + Given: Data encrypted with key version 1 + And: Key is rotated to version 2 in Vault + When: Participant attempts to restore using new key version + Then: Restoration fails with clear error message + """ + clear_vault_key("test_restoration_key_single") + + # Encrypt with original key + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Rotate key (replace with new key) + new_key = Fernet.generate_key().decode() + set_vault_key("test_restoration_key_single", new_key) + + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + # Should fail - key mismatch + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config, encrypted_df.copy()) + + assert ( + "invalid" in str(exc_info.value).lower() or "decrypt" in str(exc_info.value).lower() + ), "Should indicate invalid token due to key rotation" + + 
+@pytest.mark.edge_case +def test_restore_partially_encrypted_column(sample_df, encrypt_config_single_field): + """ + Edge case: Attempt to restore column where only some rows are encrypted + + Scenario: Column has mixed encrypted/plaintext values (data corruption scenario) + """ + clear_vault_key("test_restoration_key_single") + + # Encrypt data + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Corrupt by replacing some encrypted values with plaintext + encrypted_df.loc[0, "email"] = "plaintext@example.com" + encrypted_df.loc[2, "email"] = "another_plaintext@example.com" + + decrypt_config = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig( + type="decrypt", + columns=["email"], + key_name="test_restoration_key_single", + ) + ) + ] + ) + + # Should fail on plaintext values + with pytest.raises(ValueError) as exc_info: + run_decrypt_op(decrypt_config, encrypted_df.copy()) + + assert ( + "invalid" in str(exc_info.value).lower() or "decrypt" in str(exc_info.value).lower() + ), "Should indicate invalid token for plaintext values" + + +@pytest.mark.edge_case +def test_restore_with_missing_column_in_encrypted_data( + sample_df, encrypt_config_single_field, decrypt_config_single_field +): + """ + AC2: Restoration fails when specified column doesn't exist in encrypted dataset + """ + clear_vault_key("test_restoration_key_single") + + # First encrypt the sample data to create the key + encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) + + # Create encrypted DataFrame missing the 'email' column + incomplete_df = pd.DataFrame( + { + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + # Missing 'email' column that decrypt config expects + "age": [25, 30, 35], + "salary": [50000.0, 60000.0, 70000.0], + "department": ["HR", "IT", "Finance"], + } + ) + + with pytest.raises((ValueError, KeyError)) as exc_info: + 
run_decrypt_op(decrypt_config_single_field, incomplete_df) + + error_msg = str(exc_info.value) + assert ( + "email" in error_msg or "not present" in error_msg or "not found" in error_msg + ), f"Error should indicate missing column, got: {error_msg}" + + +@pytest.mark.integration +def test_restore_with_multiple_encryption_keys(sample_df): + """ + Integration test: Restore data encrypted with multiple different keys + + Scenario: Different fields encrypted with different keys + Given: name encrypted with key_a, email encrypted with key_b + When: Participant provides both keys for restoration + Then: Both fields are restored correctly + """ + clear_vault_key("key_a") + clear_vault_key("key_b") + + # Encrypt name with key_a + encrypt_config_name = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig(type="encrypt", columns=["name"], key_name="key_a") + ) + ] + ) + + # Encrypt email with key_b + encrypt_config_email = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key_b") + ) + ] + ) + + # Encrypt both fields + df_encrypted = sample_df.copy() + df_encrypted, _ = run_encrypt_op(encrypt_config_name, df_encrypted) + df_encrypted, _ = run_encrypt_op(encrypt_config_email, df_encrypted) + + # Decrypt name with key_a + decrypt_config_name = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig(type="decrypt", columns=["name"], key_name="key_a") + ) + ] + ) + + # Decrypt email with key_b + decrypt_config_email = DepseudonymizeStructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig(type="decrypt", columns=["email"], key_name="key_b") + ) + ] + ) + + # Restore both fields + df_restored = df_encrypted.copy() + df_restored, _ = run_decrypt_op(decrypt_config_name, df_restored) + df_restored, _ = run_decrypt_op(decrypt_config_email, df_restored) 
+ + # Verify both fields restored + assert df_restored["name"].equals(sample_df["name"]), "Name field should be restored with key_a" + assert df_restored["email"].equals( + sample_df["email"] + ), "Email field should be restored with key_b" diff --git a/tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py b/tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py new file mode 100644 index 0000000..1ce8585 --- /dev/null +++ b/tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py @@ -0,0 +1,288 @@ +""" +Test suite for data restoration (depseudonymisation) of unstructured text. + +## Test Coverage Summary + +### Acceptance Criteria Coverage: +- AC1 (Data Restoration with Valid Key): 2 tests +- AC2 (Restoration Denial - Missing Key): 1 test +- AC3 (Restoration Denial - Unauthorized Access): 1 test +- AC4 (Restoration Denial - Invalid Key): 1 test +- Additional Coverage: 2 tests (edge cases) + +### Test Pattern: +- Each test uses build_op_context with .model_dump() for configuration +- Tests validate dual outputs (data, metrics) +- Tests verify complete restoration of original text +- Tests validate security controls and error handling +- Tests use descriptive names mapping to AC scenarios + +""" + +import pytest +from unittest.mock import patch +from cryptography.fernet import Fernet +from dagster import build_op_context + +from src.field_level_pseudo_anonymisation.unstructured_ops import ( + depseudonymize_unstructured, +) +from src.field_level_pseudo_anonymisation.config_models.unstructured_config import ( + DepseudonymizeUnstructuredConfig, + DecryptConfig, + DepseudoTechniqueConfig, +) + + +@pytest.fixture +def fernet_key() -> bytes: + """Generate a valid Fernet key for encryption in tests.""" + return Fernet.generate_key() + + +@pytest.fixture +def encrypted_text_data(fernet_key: bytes) -> dict: + """ + Create encrypted data for testing decryption. 
+ + Returns a dict with: + - original_text: The unencrypted text + - encrypted_text: Text with PII values encrypted in {encrypt:...} format + """ + original_text = "My name is John Doe and my email is john.doe@example.com." + fernet = Fernet(fernet_key) + encrypted_name = fernet.encrypt(b"John Doe").decode() + encrypted_email = fernet.encrypt(b"john.doe@example.com").decode() + encrypted_text = ( + f"My name is {{encrypt:{encrypted_name}}} and my email is {{encrypt:{encrypted_email}}}." + ) + return { + "original_text": original_text, + "encrypted_text": encrypted_text, + } + + +# ---------------------- AC1: Data Restoration with Valid Key -------------------------------- + + +@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") +def test_ac1_restore_encrypted_pii_entities_with_valid_key( + mock_create_get_key, fernet_key: bytes, encrypted_text_data: dict +): + """AC1: Restore encrypted PII entities with a valid key from secret management tool.""" + # Arrange - Mock the Vault key retrieval to return the valid key + mock_create_get_key.return_value = fernet_key + config = DepseudonymizeUnstructuredConfig( + used_function=[ + DepseudoTechniqueConfig(technique=DecryptConfig(type="decrypt", key_name="test_key")) + ] + ) + context = build_op_context(op_config=config.model_dump()) + + # Act - Request data restoration + result_gen = depseudonymize_unstructured( + context, input_text=encrypted_text_data["encrypted_text"] + ) + data_output = next(result_gen) + metrics_output = next(result_gen) + + # Assert - Verify successful restoration + # 1. All original values are restored exactly + assert ( + data_output.value == encrypted_text_data["original_text"] + ), "Original text should be fully restored" + + # 2. Correct output structure + assert data_output.output_name == "data", "Output should be named 'data'" + + # 3. 
Metrics show correct number of restored entities + assert ( + metrics_output.value["total_depseudo_count"] == 2 + ), "Should restore 2 encrypted entities (name and email)" + + # 4. System retrieved key from secret management tool + mock_create_get_key.assert_called_once_with("decrypt", "test_key") + + +@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") +def test_ac1_restore_multiple_pii_types_with_valid_key(mock_create_get_key, fernet_key: bytes): + """AC1: Restore multiple encrypted PII entity types (name, email, phone) with a valid key.""" + # Arrange - Create text with multiple PII types encrypted + original_text = "Contact John Doe at john.doe@example.com or call 555-1234." + fernet = Fernet(fernet_key) + encrypted_name = fernet.encrypt(b"John Doe").decode() + encrypted_email = fernet.encrypt(b"john.doe@example.com").decode() + encrypted_phone = fernet.encrypt(b"555-1234").decode() + encrypted_text = ( + f"Contact {{encrypt:{encrypted_name}}} at " + f"{{encrypt:{encrypted_email}}} or call {{encrypt:{encrypted_phone}}}." 
+ ) + + mock_create_get_key.return_value = fernet_key + config = DepseudonymizeUnstructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig(type="decrypt", key_name="multi_pii_key") + ) + ] + ) + context = build_op_context(op_config=config.model_dump()) + + # Act + result_gen = depseudonymize_unstructured(context, input_text=encrypted_text) + data_output = next(result_gen) + metrics_output = next(result_gen) + + # Assert + assert data_output.value == original_text, "All PII types should be restored" + assert ( + metrics_output.value["total_depseudo_count"] == 3 + ), "Should restore 3 encrypted entities (name, email, phone)" + mock_create_get_key.assert_called_once_with("decrypt", "multi_pii_key") + + +# ------------------- AC2: Restoration Denial when Key is Missing ---------------------------- + + +@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") +def test_ac2_restoration_denial_when_key_missing(mock_create_get_key, encrypted_text_data: dict): + """AC2: Deny restoration when decryption key is missing from secret management tool.""" + # Arrange - Mock Vault to indicate key is missing + mock_create_get_key.side_effect = ValueError( + "Fernet key 'non_existent_key' not found in Vault for decrypt." 
+ ) + config = DepseudonymizeUnstructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig(type="decrypt", key_name="non_existent_key") + ) + ] + ) + context = build_op_context(op_config=config.model_dump()) + + # Act & Assert - Verify system fails the restoration request + with pytest.raises( + ValueError, + match="Fernet key 'non_existent_key' not found in Vault for decrypt.", + ) as exc_info: + list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"])) + + # Verify error message is clear and actionable + assert "not found in Vault" in str( + exc_info.value + ), "Error message should indicate key is missing from Vault" + + # Verify system attempted to retrieve the key (logged attempt) + mock_create_get_key.assert_called_once_with("decrypt", "non_existent_key") + + +# ------------- AC3: Restoration Denial when Access is Unauthorized -------------------------- + + +@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") +def test_ac3_restoration_denial_when_unauthorized_access( + mock_create_get_key, encrypted_text_data: dict +): + """AC3: Deny restoration when participant is not authorized to access the decryption key.""" + # Arrange - Mock Vault to deny access + mock_create_get_key.side_effect = ValueError("Access denied to secret: unauthorized_key") + config = DepseudonymizeUnstructuredConfig( + used_function=[ + DepseudoTechniqueConfig( + technique=DecryptConfig(type="decrypt", key_name="unauthorized_key") + ) + ] + ) + context = build_op_context(op_config=config.model_dump()) + + # Act & Assert - Verify system denies access + with pytest.raises(ValueError, match="Access denied to secret: unauthorized_key") as exc_info: + list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"])) + + # Verify error message clearly indicates access denial + assert "Access denied" in str( + exc_info.value + ), "Error message should clearly indicate 
access was denied" + + # Verify the unauthorized access attempt was logged (function was called) + mock_create_get_key.assert_called_once_with("decrypt", "unauthorized_key") + + +# ------------------- AC4: Restoration Denial when Key is Invalid ---------------------------- + + +@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") +def test_ac4_restoration_denial_when_key_invalid(mock_create_get_key, encrypted_text_data: dict): + """AC4: Deny restoration when decryption key does not correspond to the encrypted fields.""" + # Arrange - Mock Vault to return a different (wrong) key + invalid_key = Fernet.generate_key() # A different, incorrect key + mock_create_get_key.return_value = invalid_key + config = DepseudonymizeUnstructuredConfig( + used_function=[ + DepseudoTechniqueConfig(technique=DecryptConfig(type="decrypt", key_name="wrong_key")) + ] + ) + context = build_op_context(op_config=config.model_dump()) + + # Act & Assert - Verify system fails the restoration + with pytest.raises(ValueError, match="Invalid Fernet token") as exc_info: + list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"])) + + # Verify error message indicates decryption failure + assert "Invalid Fernet token" in str( + exc_info.value + ), "Error message should indicate the key is invalid for this data" + + # Verify key was retrieved (system attempted decryption) + mock_create_get_key.assert_called_once_with("decrypt", "wrong_key") + + +# -------------------------------- Additional Edge Cases ---------------------------------------- + + +def test_depseudonymize_unstructured_no_decrypt_config(): + """Edge case: Text is returned unchanged when no decryption techniques are configured.""" + # Arrange + original_text = "This text has no {encrypt:values} to decrypt." 
+ config = DepseudonymizeUnstructuredConfig(used_function=[]) # No techniques + context = build_op_context(op_config=config.model_dump()) + + # Act + result_gen = depseudonymize_unstructured(context, input_text=original_text) + result_output = next(result_gen) + metrics_output = next(result_gen) + + # Assert + assert ( + result_output.value == original_text + ), "Text should remain unchanged when no decryption is configured" + assert ( + metrics_output.value["total_depseudo_count"] == 0 + ), "Should report zero decryptions performed" + + +def test_depseudonymize_unstructured_empty_text(): + """Edge case: Empty input text is returned unchanged with zero decryptions performed.""" + # Arrange + empty_text = "" + config = DepseudonymizeUnstructuredConfig( + used_function=[ + DepseudoTechniqueConfig(technique=DecryptConfig(type="decrypt", key_name="test_key")) + ] + ) + context = build_op_context(op_config=config.model_dump()) + + # Act + with patch( + "src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key" + ) as mock_key: + mock_key.return_value = Fernet.generate_key() + result_gen = depseudonymize_unstructured(context, input_text=empty_text) + result_output = next(result_gen) + metrics_output = next(result_gen) + + # Assert + assert result_output.value == "", "Empty text should remain empty" + assert ( + metrics_output.value["total_depseudo_count"] == 0 + ), "Should report zero decryptions for empty text" diff --git a/tests/field_level_pseudo_anonymisation/test_encrypt_structured.py b/tests/field_level_pseudo_anonymisation/test_encrypt_structured.py new file mode 100644 index 0000000..b89fad3 --- /dev/null +++ b/tests/field_level_pseudo_anonymisation/test_encrypt_structured.py @@ -0,0 +1,1119 @@ +""" +Test suite for field-level pseudonymisation operations (encrypt technique). 
+ +This test suite covers the encryption pseudonymisation technique for structured dataframes, +validating the following Acceptance Criteria: + +## Test Coverage Summary + +### Acceptance Criteria Coverage: +- AC1 (Supported Technique Applied Correctly): 7 tests +- AC2 (Invalid Execution Handling): 7 tests +- AC3 (DataFrame Compliance): 6 tests +- AC4 (Audit Logging - Success): 2 tests +- AC5 (Audit Logging - Failure): 3 tests +- Additional Coverage: 7 tests + +### Test Pattern: +- Each test uses build_op_context with config_to_dagster_dict for configuration +- Tests validate dual outputs (data, metrics) +- Vault access is mocked for isolation + +""" + +import pandas as pd +import pytest +from dagster import build_op_context +from cryptography.fernet import Fernet +from hvac.exceptions import InvalidPath +from unittest.mock import patch, MagicMock + +from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( + AnonymisePseudonymizeStructuredConfig, + EncryptConfig, + HashConfig, + PseudoTechniqueConfig, +) +from template_code_location.field_level_pseudo_anonymisation.ops import anonymize_pseudonymize_structured + +# Import helper functions (fixtures are auto-discovered by pytest) +from .conftest import ( + run_encrypt_op, + clear_vault_key, + get_vault_key, + config_to_dagster_dict, +) + + +# -------------------------------- Test Markers Configuration -------------------------------- + +# NOTE(review): these self-assignments do not register the markers with pytest; declare them under the `markers` ini option (pyproject.toml / pytest.ini) instead - TODO confirm +pytest.mark.slow = pytest.mark.slow +pytest.mark.security = pytest.mark.security +pytest.mark.edge_case = pytest.mark.edge_case + + +# -------------------------------- Test-Specific Fixtures ---------------------------------------- + + +@pytest.fixture +def encrypt_single_column_config(): + """ + Configuration for encrypting a single column (email). + Tests basic encryption functionality. 
+ """ + return AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["email"], key_name="test_email_key" + ) + ) + ] + ) + + +@pytest.fixture +def encrypt_multiple_columns_config(): + """ + Configuration for encrypting multiple columns (name, email). + Tests encryption across multiple fields. + """ + return AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["name", "email"], key_name="test_multi_key" + ) + ) + ] + ) + + +@pytest.fixture +def encrypt_mixed_types_config(): + """ + Configuration for encrypting columns with different data types. + Tests that encryption handles type conversion (int, float -> string). + """ + return AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + columns=["id", "age", "salary"], + key_name="test_numeric_key", + ) + ) + ] + ) + + +@pytest.fixture +def encrypt_with_unchanged_columns_config(): + """ + Configuration that encrypts some columns while leaving others unchanged. + Tests AC3 requirement for unchanged column preservation. + """ + return AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["email"], key_name="test_partial_key" + ) + ) + ] + ) + + +# ------------------ AC1: Supported Technique Applied Correctly ------------------------------ + + +def test_encrypt_single_column_applied_correctly(sample_df, encrypt_single_column_config): + """ + AC1: Tests that encryption is applied correctly to a single column.
+ + Scenario: The system applies encryption to the 'email' field + Given: A structured dataset with an email column + And: A valid encryption configuration for the email field + When: The participant triggers the execution + Then: The email field must be transformed with Fernet encryption + And: The encrypted values must be different from the original values + And: The encrypted values must be valid Fernet tokens (decodable) + """ + # Clear any existing test key + clear_vault_key("test_email_key") + + result_df, metrics = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) + + # Verify output structure + assert result_df is not None, "Result DataFrame should not be None" + assert metrics is not None, "Metrics should not be None" + + # Verify email column is encrypted (values changed) + assert not result_df["email"].equals( + sample_df["email"] + ), "Email column should be encrypted (values should change)" + + # Verify all encrypted values are different from originals + for orig, enc in zip(sample_df["email"], result_df["email"]): + assert orig != enc, f"Original value '{orig}' should be encrypted" + + # Verify encrypted values are valid Fernet tokens (can be decrypted) + key = get_vault_key("test_email_key") + f = Fernet(key) + for enc_value in result_df["email"]: + decrypted = f.decrypt(enc_value.encode()).decode() + assert ( + decrypted in sample_df["email"].values + ), f"Decrypted value '{decrypted}' should match an original email" + + # Verify row count is preserved + assert len(result_df) == len(sample_df), "Row count should be preserved" + + +def test_encrypt_multiple_columns_applied_correctly(sample_df, encrypt_multiple_columns_config): + """ + AC1: Tests that encryption is applied correctly to multiple columns. 
+ + Scenario: The system applies encryption to multiple fields (name, email) + Given: A structured dataset with name and email columns + And: A valid encryption configuration for both fields + When: The participant triggers the execution + Then: Both fields must be transformed with Fernet encryption + And: Each field uses the same encryption key (as specified) + """ + clear_vault_key("test_multi_key") + + result_df, metrics = run_encrypt_op(encrypt_multiple_columns_config, sample_df.copy()) + + # Verify both columns are encrypted + assert not result_df["name"].equals(sample_df["name"]), "Name column should be encrypted" + assert not result_df["email"].equals(sample_df["email"]), "Email column should be encrypted" + + # Verify all values are encrypted + key = get_vault_key("test_multi_key") + f = Fernet(key) + + for enc_name in result_df["name"]: + decrypted = f.decrypt(enc_name.encode()).decode() + assert decrypted in sample_df["name"].values + + for enc_email in result_df["email"]: + decrypted = f.decrypt(enc_email.encode()).decode() + assert decrypted in sample_df["email"].values + + +def test_encrypt_numeric_columns_applied_correctly(sample_df, encrypt_mixed_types_config): + """ + AC1: Tests that encryption handles numeric data types correctly. 
+ + Scenario: The system applies encryption to numeric fields (id, age, salary) + Given: A structured dataset with integer and float columns + And: A valid encryption configuration for numeric fields + When: The participant triggers the execution + Then: Numeric values must be converted to strings and encrypted + And: Original numeric values should be recoverable via decryption + """ + clear_vault_key("test_numeric_key") + + result_df, metrics = run_encrypt_op(encrypt_mixed_types_config, sample_df.copy()) + + # Verify all numeric columns are now string type (encrypted) + assert result_df["id"].dtype == object, "Encrypted id should be object/string type" + assert result_df["age"].dtype == object, "Encrypted age should be object/string type" + assert result_df["salary"].dtype == object, "Encrypted salary should be object/string type" + + # Verify original numeric values can be recovered + key = get_vault_key("test_numeric_key") + f = Fernet(key) + + for enc_id in result_df["id"]: + decrypted = int(f.decrypt(enc_id.encode()).decode()) + assert decrypted in sample_df["id"].values + + +def test_encrypt_key_generation_on_first_use(sample_df, encrypt_single_column_config): + """ + AC1: Tests that encryption key is automatically generated and stored in Vault. 
+ + Scenario: First-time encryption generates a key automatically + Given: A structured dataset with valid configuration + And: No encryption key exists in Vault for the specified key_name + When: The participant triggers the execution + Then: The system must generate a new Fernet key + And: Store it in Vault at the specified path + And: Use it for encryption + """ + clear_vault_key("test_email_key") + + # Verify key doesn't exist before encryption + with pytest.raises(InvalidPath): + get_vault_key("test_email_key") + + result_df, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) + + # Verify key was created + key = get_vault_key("test_email_key") + assert key is not None, "Encryption key should be created in Vault" + assert len(key) == 44, "Fernet key should be 44 bytes (base64 encoded 32 bytes)" + + # Verify the key works for decryption + f = Fernet(key) + for enc_email in result_df["email"]: + decrypted = f.decrypt(enc_email.encode()).decode() + assert decrypted in sample_df["email"].values + + +def test_encrypt_uses_existing_vault_key(sample_df, encrypt_single_column_config): + """ + AC1: Tests that encryption uses an existing key from Vault if present. 
+ + Scenario: Encryption reuses existing key for consistent pseudonymisation + Given: A structured dataset + And: An encryption key already exists in Vault + When: The participant triggers the execution + Then: The system must use the existing key (not generate a new one) + And: The key stored in Vault must be unchanged (Fernet ciphertexts still differ per run) + """ + clear_vault_key("test_email_key") + + # First encryption - generates key + result_df_1, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) + key_1 = get_vault_key("test_email_key") + + # Second encryption - should use same key + result_df_2, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) + key_2 = get_vault_key("test_email_key") + + # Verify same key is used + assert key_1 == key_2, "Encryption should reuse existing Vault key" + + +# ----------------------- AC2: Invalid Execution Handling ------------------------------------ + + +def test_encrypt_missing_column_error(encrypt_single_column_config): + """ + AC2: Tests graceful error handling when a specified column doesn't exist.
+ + Scenario: The system aborts gracefully when column is missing + Given: A structured dataset + And: A configuration specifying a non-existent column + When: The participant triggers the execution + Then: The system must raise a clear ValueError + And: The error message must indicate which columns are missing + """ + df_missing_column = pd.DataFrame( + { + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + "age": [25, 30, 35], + # Missing 'email' column + } + ) + + with pytest.raises(ValueError) as exc_info: + run_encrypt_op(encrypt_single_column_config, df_missing_column) + + assert "not present in the DataFrame" in str( + exc_info.value + ), "Error message should indicate missing columns" + assert "email" in str(exc_info.value), "Error message should mention the missing 'email' column" + + +def test_encrypt_empty_dataframe_handled(encrypt_single_column_config): + """ + AC2: Tests graceful handling of empty DataFrame input. + + Scenario: The system processes empty DataFrame without errors + Given: An empty structured dataset (no rows) + And: A valid encryption configuration + When: The participant triggers the execution + Then: The system must return an empty DataFrame with correct schema + And: No errors should be raised + """ + clear_vault_key("test_email_key") + + empty_df = pd.DataFrame(columns=["id", "name", "email", "age", "salary", "department"]) + + result_df, metrics = run_encrypt_op(encrypt_single_column_config, empty_df) + + assert len(result_df) == 0, "Result should be empty" + assert "email" in result_df.columns, "Email column should exist in schema" + + +def test_encrypt_vault_connection_error(): + """ + AC2: Tests error handling when Vault is unreachable. 
+ + Scenario: The system fails gracefully when Vault is unavailable + Given: A structured dataset with valid configuration + When: Vault service is unreachable or misconfigured + Then: The system must raise a clear error + And: The error message must indicate the Vault connection issue + + Note: This test requires Vault to be down or uses a bad URL. + For testing purposes, we simulate by using invalid credentials. + """ + # Create a mock client that raises an exception when accessing Vault + mock_client_instance = MagicMock() + mock_client_instance.secrets.kv.v2.read_secret_version.side_effect = Exception( + "Simulated Vault connection error" + ) + + with patch("hvac.Client", return_value=mock_client_instance): + df = pd.DataFrame( + { + "id": [1], + "name": ["Test"], + "email": ["test@example.com"], + "age": [30], + "salary": [50000.0], + "department": ["IT"], + } + ) + config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["email"], key_name="test_email_key" + ) + ) + ] + ) + with pytest.raises(ValueError) as exc_info: + run_encrypt_op(config, df) + + error_message = str(exc_info.value) + assert ( + "Simulated Vault connection error" in error_message + ), "Error should indicate Vault connection issue" + + +def test_encrypt_null_values_handled(encrypt_single_column_config): + """ + AC2: Tests handling of NULL/NaN values in encrypted columns. 
+ + Scenario: The system handles null values appropriately + Given: A structured dataset with NULL values in the column to encrypt + And: A valid encryption configuration + When: The participant triggers the execution + Then: The system must process null values (encrypt "nan" string or handle appropriately) + And: Not raise an exception + """ + clear_vault_key("test_email_key") + + df_with_nulls = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "name": ["Alice", "Bob", "Charlie", "David"], + "email": ["alice@example.com", None, "charlie@example.com", pd.NA], + "age": [25, 30, 35, 40], + "salary": [50000.0, 60000.0, 70000.0, 80000.0], + "department": ["HR", "IT", "Finance", "IT"], + } + ) + + result_df, metrics = run_encrypt_op(encrypt_single_column_config, df_with_nulls) + + # Verify execution completed without errors + assert result_df is not None + assert len(result_df) == 4 + + # Verify null values were processed (encrypted as string "None" or "nan") + key = get_vault_key("test_email_key") + f = Fernet(key) + + # The null values get converted to string "None" or "nan" before encryption + for enc_email in result_df["email"]: + decrypted = f.decrypt(enc_email.encode()).decode() + # Decrypted value should be original or string representation of null + assert decrypted in [ + "alice@example.com", + "charlie@example.com", + "None", + "nan", + "", + ] + + +def test_encrypt_duplicate_column_configuration_error(): + """ + AC2: Tests that duplicate columns across techniques are rejected. 
+ + Scenario: Configuration validation prevents duplicate column assignments + Given: A configuration that assigns the same column to multiple techniques + When: The configuration is validated + Then: The system must raise a ValueError during configuration creation + And: The error message must indicate duplicate column assignment + """ + with pytest.raises(ValueError) as exc_info: + AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key1") + ), + PseudoTechniqueConfig( + technique=HashConfig( + type="hash", + columns=["email"], # Duplicate column + algorithm="sha256", + ) + ), + ] + ) + + assert "Duplicate column" in str( + exc_info.value + ), "Error should indicate duplicate column configuration" + + +# ------------------ AC3: DataFrame Input and Output Compliance ------------------------------ + + +def test_encrypt_dataframe_input_output_format(sample_df, encrypt_single_column_config): + """ + AC3: Tests that input and output are both pandas DataFrames. 
+ + Scenario: The system accepts DataFrame input and returns DataFrame output + Given: A structured dataset as pandas DataFrame + And: A valid encryption configuration + When: The participant triggers the execution + Then: The system must return a pandas DataFrame + And: The DataFrame structure must be preserved + """ + clear_vault_key("test_email_key") + + result_df, metrics = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) + + # Verify output is a DataFrame + assert isinstance(result_df, pd.DataFrame), "Output must be a pandas DataFrame" + + # Verify DataFrame structure preserved + assert list(result_df.columns) == list(sample_df.columns), "Column names should be preserved" + assert len(result_df) == len(sample_df), "Row count should be preserved" + + +def test_encrypt_data_types_transformed_correctly(sample_df, encrypt_mixed_types_config): + """ + AC3: Tests that data types are transformed appropriately after encryption. + + Scenario: Encrypted columns change to string type + Given: A structured dataset with various data types (int, float, str) + And: An encryption configuration for multiple columns + When: The participant triggers the execution + Then: All encrypted columns must be of type object/string + And: This transformation is valid and consistent with encryption technique + """ + clear_vault_key("test_numeric_key") + + # Store original types + original_types = sample_df.dtypes.to_dict() + + result_df, _ = run_encrypt_op(encrypt_mixed_types_config, sample_df.copy()) + + # Verify encrypted columns are now object/string type + assert result_df["id"].dtype == object, "Encrypted integer column should become object type" + assert result_df["age"].dtype == object, "Encrypted integer column should become object type" + assert result_df["salary"].dtype == object, "Encrypted float column should become object type" + + # Verify data types changed (not same as original) + assert result_df["id"].dtype != original_types["id"], "Data type should change 
after encryption" + + +def test_encrypt_unchanged_columns_preserved(sample_df, encrypt_with_unchanged_columns_config): + """ + AC3: Tests that columns not specified for encryption remain unchanged. + + Scenario: Non-encrypted columns remain identical + Given: A structured dataset with multiple columns + And: An encryption configuration for only one column (email) + When: The participant triggers the execution + Then: Columns not specified (id, name, age, salary, department) must remain unchanged + And: Their values and data types must be identical to the input + """ + clear_vault_key("test_partial_key") + + result_df, _ = run_encrypt_op(encrypt_with_unchanged_columns_config, sample_df.copy()) + + # Verify unchanged columns are identical + assert result_df["id"].equals(sample_df["id"]), "ID column should remain unchanged" + assert result_df["name"].equals(sample_df["name"]), "Name column should remain unchanged" + assert result_df["age"].equals(sample_df["age"]), "Age column should remain unchanged" + assert result_df["salary"].equals(sample_df["salary"]), "Salary column should remain unchanged" + assert result_df["department"].equals( + sample_df["department"] + ), "Department column should remain unchanged" + + # Verify encrypted column is changed + assert not result_df["email"].equals( + sample_df["email"] + ), "Email column should be encrypted (changed)" + + +def test_encrypt_schema_consistency(sample_df, encrypt_multiple_columns_config): + """ + AC3: Tests that DataFrame schema is consistent and coherent. 
+ + Scenario: Output DataFrame has consistent schema + Given: A structured dataset + And: A multi-column encryption configuration + When: The participant triggers the execution + Then: Output DataFrame must have same column names as input + And: Column order must be preserved + And: No columns should be added or removed + """ + clear_vault_key("test_multi_key") + + result_df, _ = run_encrypt_op(encrypt_multiple_columns_config, sample_df.copy()) + + # Verify column names are identical + assert list(result_df.columns) == list(sample_df.columns), "Column names must be identical" + + # Verify column order is preserved + for i, col in enumerate(sample_df.columns): + assert result_df.columns[i] == col, f"Column order should be preserved at position {i}" + + # Verify no extra columns added + assert len(result_df.columns) == len( + sample_df.columns + ), "Number of columns should remain the same" + + +def test_encrypt_index_preservation(sample_df, encrypt_single_column_config): + """ + AC3: Tests that DataFrame index is preserved after encryption. 
+ + Scenario: DataFrame index remains unchanged + Given: A structured dataset with default index + And: A valid encryption configuration + When: The participant triggers the execution + Then: The output DataFrame must preserve the original index + And: No extraneous index column should be added + """ + clear_vault_key("test_email_key") + + # Set custom index to verify preservation + sample_df_with_index = sample_df.copy() + sample_df_with_index.index = [10, 20, 30, 40, 50] + + result_df, _ = run_encrypt_op(encrypt_single_column_config, sample_df_with_index) + + # Verify index is preserved + assert list(result_df.index) == list( + sample_df_with_index.index + ), "DataFrame index should be preserved" + + +# ------------- AC4: Execution Audit & Logging - Positive Scenario --------------------------- + + +def test_encrypt_successful_execution_logging(sample_df, encrypt_single_column_config): + """ + AC4: Tests that successful execution produces appropriate logs/metadata. + + Scenario: Successful pseudonymisation execution is logged + Given: A structured dataset with valid configuration + When: The participant triggers the execution + And: The execution completes successfully + Then: The system must return metrics output + And: Metrics should confirm successful operation + + Note: Dagster automatically logs: + - Timestamp of execution (run start/end times) + - Workflow run identifier (run_id) + - Configuration parameters (captured in op_config) + - Success status (run status in Dagster UI) + + This test validates the op returns proper outputs for Dagster to log. 
+ """ + clear_vault_key("test_email_key") + + op_config_dict = config_to_dagster_dict(encrypt_single_column_config) + context = build_op_context(op_config=op_config_dict) + + # Capture run context information + run_id = context.run_id + + # Execute the operation + result_df, metrics = anonymize_pseudonymize_structured(context, df=sample_df.copy()) + + # Verify outputs for logging + assert result_df is not None, "Data output should be present for logging" + assert metrics is not None, "Metrics output should be present for logging" + assert isinstance(metrics.value, dict), "Metrics should be a dict" + + # Verify run context is available (Dagster provides this automatically) + assert run_id is not None, "Run ID should be available for audit logging" + + # Verify configuration is captured (can be logged) + assert "used_function" in op_config_dict, "Configuration should be captured for audit" + # In Dagster format, technique is nested under the discriminator key + technique_config = op_config_dict["used_function"][0]["technique"] + assert "encrypt" in technique_config, "Encrypt technique should be present" + assert ( + technique_config["encrypt"]["key_name"] == "test_email_key" + ), "Key name should be logged (but not key value)" + + # Verify no PII is in metrics (compliance requirement) + metrics_str = str(metrics.value) + for email in sample_df["email"]: + assert email not in metrics_str, "PII values should not appear in metrics/logs" + + +def test_encrypt_configuration_parameters_logged(sample_df, encrypt_multiple_columns_config): + """ + AC4: Tests that configuration parameters are properly captured for audit. 
+ + Scenario: Configuration details are available for compliance logging + Given: A multi-column encryption configuration + When: The participant triggers the execution + Then: The system must capture configuration parameters including: + - Selected technique (encrypt) + - Columns to encrypt + - Key name (but not key value) + And: These parameters should be accessible for audit logging + """ + clear_vault_key("test_multi_key") + + op_config_dict = config_to_dagster_dict(encrypt_multiple_columns_config) + context = build_op_context(op_config=op_config_dict) + + result_df, metrics = anonymize_pseudonymize_structured(context, df=sample_df.copy()) + + # Verify configuration details are captured + technique_config = op_config_dict["used_function"][0]["technique"] + assert "encrypt" in technique_config, "Encrypt technique should be present" + assert set(technique_config["encrypt"]["columns"]) == {"name", "email"} + assert technique_config["encrypt"]["key_name"] == "test_multi_key" + + # Verify encryption key itself is NOT in config (security) + config_str = str(op_config_dict) + try: + key = get_vault_key("test_multi_key") + assert ( + key.decode() not in config_str + ), "Encryption key value should never be in logged configuration" + except Exception: + pass # Key might not exist yet + + +# ------------- AC5: Execution Audit & Logging - Negative Scenario --------------------------- + + +def test_encrypt_failed_execution_logging(encrypt_single_column_config): + """ + AC5: Tests that failed execution provides error details for audit. 
+ + Scenario: Failed pseudonymisation execution is logged with error details + Given: A structured dataset with valid configuration + When: The participant triggers the execution + And: The execution fails (e.g., missing column) + Then: The system must raise an exception with clear error message + And: The error message should indicate the failure reason + And: Configuration parameters should still be accessible for audit + And: No PII should be exposed in error messages + """ + df_missing_column = pd.DataFrame( + { + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + # Missing 'email' column - will cause failure + } + ) + + op_config_dict = config_to_dagster_dict(encrypt_single_column_config) + context = build_op_context(op_config=op_config_dict) + run_id = context.run_id + + # Execute and capture failure + with pytest.raises(ValueError) as exc_info: + # Need to consume the generator to trigger execution + list(anonymize_pseudonymize_structured(context, df=df_missing_column)) + + # Verify error details are available for logging + error_message = str(exc_info.value) + assert ( + "not present in the DataFrame" in error_message + ), "Error message should explain failure reason" + assert "email" in error_message, "Error message should mention the problematic column" + + # Verify run context is available for failure logging + assert run_id is not None, "Run ID should be available for failure audit" + + # Verify configuration is still accessible for audit + assert op_config_dict is not None, "Configuration should be accessible for failure audit" + + # Verify no actual data values in error message (PII protection) + for name in ["Alice", "Bob", "Charlie"]: + assert name not in error_message, "PII values should not appear in error messages" + + +def test_encrypt_stack_trace_available_on_failure(encrypt_single_column_config): + """ + AC5: Tests that stack trace is available for debugging failed executions. 
+ + Scenario: Failed execution provides stack trace for troubleshooting + Given: A configuration that will cause failure + When: The execution fails + Then: Python exception with stack trace should be raised + And: Stack trace should be available for logging (Dagster captures this) + And: Stack trace should not contain PII values + """ + df_missing_column = pd.DataFrame({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}) + + try: + run_encrypt_op(encrypt_single_column_config, df_missing_column) + pytest.fail("Should have raised ValueError") + except ValueError: + # Verify exception information is available + import traceback + + stack_trace = traceback.format_exc() + + assert "ValueError" in stack_trace, "Exception type should be in stack trace" + assert ( + "not present in the DataFrame" in stack_trace + ), "Error message should be in stack trace" + + # Verify stack trace contains code location + assert ( + "ops.py" in stack_trace or "anonymize_pseudonymize_structured" in stack_trace + ), "Stack trace should indicate error location" + + +def test_encrypt_vault_error_logged_appropriately(sample_df): + """ + AC5: Tests that Vault-related errors are logged with appropriate detail. + + Scenario: Vault connection/authentication errors are captured + Given: A configuration with invalid Vault setup + When: The execution attempts to access Vault + And: Vault access fails + Then: The system must raise an error with Vault-specific details + And: The error should indicate the Vault-related nature of the failure + + Note: This test validates error handling structure; actual Vault errors + depend on Vault availability. 
+ """ + # Create a mock client that raises an exception when accessing Vault + mock_client_instance = MagicMock() + mock_client_instance.secrets.kv.v2.read_secret_version.side_effect = Exception( + "Simulated Vault authentication error" + ) + + with patch("hvac.Client", return_value=mock_client_instance): + config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["email"], key_name="test_email_key" + ) + ) + ] + ) + with pytest.raises(ValueError) as exc_info: + run_encrypt_op(config, sample_df) + + error_message = str(exc_info.value) + assert ( + "Simulated Vault authentication error" in error_message + ), "Error should indicate Vault-related failure" + + +# --------------- Additional Edge Cases & Integration Tests ---------------------------------- + + +def test_encrypt_large_dataset_performance(encrypt_single_column_config): + """ + Additional test: Validates encryption works with larger datasets. + + Tests that encryption scales to realistic dataset sizes without errors. + """ + clear_vault_key("test_email_key") + + # Create a larger dataset (1000 rows) + large_df = pd.DataFrame( + { + "id": range(1000), + "name": [f"Person{i}" for i in range(1000)], + "email": [f"person{i}@example.com" for i in range(1000)], + "age": [25 + (i % 50) for i in range(1000)], + "salary": [50000.0 + (i * 100) for i in range(1000)], + "department": ["HR", "IT", "Finance"] * 333 + ["HR"], + } + ) + + # Save original values for comparison + original_emails = large_df["email"].copy() + + result_df, metrics = run_encrypt_op(encrypt_single_column_config, large_df) + + assert len(result_df) == 1000, "All rows should be processed" + assert not result_df["email"].equals(original_emails), "All email values should be encrypted" + + +def test_encrypt_special_characters_in_data(encrypt_single_column_config): + """ + Additional test: Validates encryption handles special characters correctly. 
+ + Tests that encryption works with unicode, special chars, emojis, etc. + """ + clear_vault_key("test_email_key") + + df_special = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "name": ["Müller", "José", "李明", "🙂 John"], + "email": [ + "test@müller.de", + "josé@example.com", + "李明@example.cn", + "emoji@😀.com", + ], + "age": [25, 30, 35, 40], + "salary": [50000.0, 60000.0, 70000.0, 80000.0], + "department": ["HR", "IT", "Finance", "IT"], + } + ) + + # Save original values for comparison + original_emails = df_special["email"].copy().tolist() + + result_df, metrics = run_encrypt_op(encrypt_single_column_config, df_special) + + # Verify special characters are encrypted and recoverable + key = get_vault_key("test_email_key") + f = Fernet(key) + + decrypted_emails = [f.decrypt(enc.encode()).decode() for enc in result_df["email"]] + assert set(decrypted_emails) == set( + original_emails + ), "Special characters should be preserved through encryption/decryption" + + +def test_encrypt_deterministic_within_session(sample_df, encrypt_single_column_config): + """ + Additional test: Validates encryption produces consistent results with same key. + + Note: Fernet encryption includes a timestamp, so it's NOT deterministic. + This test validates that decryption recovers the original value consistently. 
+ """ + clear_vault_key("test_email_key") + + # First encryption + result_df_1, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) + + # Get the key used + key = get_vault_key("test_email_key") + f = Fernet(key) + + # Verify first encryption decrypts correctly + decrypted_1 = [f.decrypt(enc.encode()).decode() for enc in result_df_1["email"]] + assert decrypted_1 == sample_df["email"].tolist(), "Decryption should recover original values" + + # Second encryption with same key (ciphertexts differ: Fernet uses a random IV per token) + result_df_2, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) + + # Verify second encryption also decrypts correctly + decrypted_2 = [f.decrypt(enc.encode()).decode() for enc in result_df_2["email"]] + assert ( + decrypted_2 == sample_df["email"].tolist() + ), "Decryption should consistently recover original values" + + # Note: Encrypted values differ because every Fernet token embeds a random IV (plus a timestamp) + assert not result_df_1["email"].equals( + result_df_2["email"] + ), "Fernet encryption includes timestamp, so outputs differ" + + +def test_encrypt_empty_string_values(encrypt_single_column_config): + """ + Additional test: Validates encryption handles empty strings correctly.
+ """ + clear_vault_key("test_email_key") + + df_empty_strings = pd.DataFrame( + { + "id": [1, 2, 3], + "name": ["Alice", "", "Charlie"], + "email": ["alice@example.com", "", "charlie@example.com"], + "age": [25, 30, 35], + "salary": [50000.0, 60000.0, 70000.0], + "department": ["HR", "IT", "Finance"], + } + ) + + result_df, _ = run_encrypt_op(encrypt_single_column_config, df_empty_strings) + + # Verify empty strings are encrypted + key = get_vault_key("test_email_key") + f = Fernet(key) + + decrypted_emails = [f.decrypt(enc.encode()).decode() for enc in result_df["email"]] + assert "" in decrypted_emails, "Empty strings should be encrypted and recoverable" + + +@pytest.mark.edge_case +def test_encrypt_very_long_strings(encrypt_single_column_config): + """ + Edge case: Encryption of very long string values (e.g., 10KB+) + + Validates that Fernet encryption handles large strings without truncation. + """ + clear_vault_key("test_email_key") + + # Create DataFrame with very long strings + long_string = "x" * 10000 # 10KB string + df_long_strings = pd.DataFrame( + { + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + "email": [ + f"{long_string}@example.com", + "bob@example.com", + "charlie@example.com", + ], + "age": [25, 30, 35], + "salary": [50000.0, 60000.0, 70000.0], + "department": ["HR", "IT", "Finance"], + } + ) + + result_df, _ = run_encrypt_op(encrypt_single_column_config, df_long_strings) + + # Verify long string is encrypted and recoverable + key = get_vault_key("test_email_key") + f = Fernet(key) + decrypted = f.decrypt(result_df.loc[0, "email"].encode()).decode() + assert ( + decrypted == f"{long_string}@example.com" + ), "Very long strings should be encrypted and recoverable" + + +@pytest.mark.edge_case +def test_encrypt_column_with_all_identical_values(encrypt_single_column_config): + """ + Edge case: Encryption when all values in a column are identical + + Validates that encryption produces different outputs for identical inputs + (due to 
Fernet's timestamp-based nonce). + """ + clear_vault_key("test_email_key") + + df_identical = pd.DataFrame( + { + "id": [1, 2, 3, 4, 5], + "name": ["Alice"] * 5, + "email": ["same@example.com"] * 5, # All identical + "age": [30] * 5, + "salary": [60000.0] * 5, + "department": ["IT"] * 5, + } + ) + + result_df, _ = run_encrypt_op(encrypt_single_column_config, df_identical) + + # Verify all encrypted values are unique (due to Fernet timestamp) + encrypted_values = result_df["email"].tolist() + assert ( + len(set(encrypted_values)) == 5 + ), "Fernet should produce unique ciphertexts even for identical plaintexts" + + # Verify all decrypt to same original value + key = get_vault_key("test_email_key") + f = Fernet(key) + decrypted_values = [f.decrypt(enc.encode()).decode() for enc in encrypted_values] + assert all( + val == "same@example.com" for val in decrypted_values + ), "All encrypted values should decrypt to same original" + + +@pytest.mark.edge_case +def test_encrypt_whitespace_only_values(encrypt_single_column_config): + """ + Edge case: Encryption of whitespace-only values + """ + clear_vault_key("test_email_key") + + df_whitespace = pd.DataFrame( + { + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + "email": [" ", "\t\t", "\n\n"], # Various whitespace + "age": [25, 30, 35], + "salary": [50000.0, 60000.0, 70000.0], + "department": ["HR", "IT", "Finance"], + } + ) + + # Store original values before encryption + original_emails = df_whitespace["email"].tolist() + + result_df, _ = run_encrypt_op(encrypt_single_column_config, df_whitespace) + + # Verify whitespace values are encrypted and recoverable + key = get_vault_key("test_email_key") + f = Fernet(key) + encrypted_emails = result_df["email"].tolist() + + for orig_ws, enc_val in zip(original_emails, encrypted_emails): + decrypted = f.decrypt(enc_val.encode()).decode() + assert ( + decrypted == orig_ws + ), f"Whitespace value {repr(orig_ws)} should be preserved, but got {repr(decrypted)}" + + 
+@pytest.mark.edge_case +@pytest.mark.parametrize( + "column_type,test_values", + [ + ("integer", [1, 2, 3, 4, 5]), + ("float", [1.1, 2.2, 3.3, 4.4, 5.5]), + ("string", ["a", "b", "c", "d", "e"]), + ], +) +def test_encrypt_various_data_types(column_type, test_values): + """ + Parameterized test: Encryption across different pandas data types + """ + clear_vault_key("test_type_key") + + df = pd.DataFrame( + { + "id": range(len(test_values)), + "test_column": test_values, + "name": ["Person"] * len(test_values), + "email": ["test@example.com"] * len(test_values), + "age": [30] * len(test_values), + "salary": [60000.0] * len(test_values), + "department": ["IT"] * len(test_values), + } + ) + + config = AnonymisePseudonymizeStructuredConfig( + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", columns=["test_column"], key_name="test_type_key" + ) + ) + ] + ) + + result_df, _ = run_encrypt_op(config, df) + + # Verify encryption occurred (values changed to strings) + assert ( + result_df["test_column"].dtype == object + ), f"Encrypted {column_type} should become object type" + + # Verify decryption recovers original values + key = get_vault_key("test_type_key") + f = Fernet(key) + for idx, orig_val in enumerate(test_values): + decrypted = f.decrypt(result_df.loc[idx, "test_column"].encode()).decode() + assert decrypted == str( + orig_val + ), f"Decrypted value should match original {column_type} value" diff --git a/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py b/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py new file mode 100644 index 0000000..8d6a3cc --- /dev/null +++ b/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py @@ -0,0 +1,853 @@ +""" +Test suite for field-level pseudonymisation operations on unstructured data. 
+ +This test suite validates the pseudonymisation of unstructured text with PII detection, +covering the following Acceptance Criteria: + +## Test Coverage Summary + +### Acceptance Criteria Coverage: +- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests +- AC2 (Invalid Execution Handling): 5 tests +- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests +- AC4 (Execution Audit & Logging - Negative Scenario): 4 tests +- Additional Coverage: 3 tests + +### Test Pattern: +- Each test uses build_op_context with config_to_dagster_dict for configuration +- Tests validate dual outputs (data, metrics) +- Vault access is mocked for isolation +- Tests validate Scrubadub automatic PII detection +- Tests ensure placeholder replacement for unconfigured PII +""" + +import pytest +import re +from dagster import build_op_context +from unittest.mock import patch, MagicMock + +from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import ( + AnonymisePseudonymizeUnstructuredConfig, + EncryptConfig, + RetainConfig, + PseudoTechniqueConfig, +) +from template_code_location.field_level_pseudo_anonymisation.config_models import PIIEntityEnum, LanguageEnum +from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import ( + anonymize_pseudonymize_unstructured, +) + +from .conftest import clear_vault_key + + +def config_to_dagster_dict_unstructured(config): + """Convert unstructured config to Dagster format.""" + config_dict = {"language": config.language.value, "used_function": []} + + for func_config in config.used_function: + technique = func_config.technique + technique_type = technique.type + technique_dict = technique.model_dump() + + if "pii" in technique_dict: + technique_dict["pii"] = [pii_enum.name for pii_enum in technique.pii] + + technique_dict_without_type = {k: v for k, v in technique_dict.items() if k != "type"} + + config_dict["used_function"].append( + {"technique": {technique_type: 
technique_dict_without_type}} + ) + + return config_dict + + +def run_unstructured_op(config, text): + """ + Helper to run unstructured pseudonymisation op. + + Returns: + tuple: (result_text: str, metrics_markdown: str) + """ + context = build_op_context(op_config=config_to_dagster_dict_unstructured(config)) + result_text, metrics = anonymize_pseudonymize_unstructured(context, text=text) + + # Extract actual values from Output objects + return result_text.value, metrics.value + + +def parse_metrics_markdown(metrics_md: str) -> dict: + """ + Parse markdown metrics into structured dict for easier testing. + + Args: + metrics_md: Markdown metrics string from op output + + Returns: + dict with keys: total_pii_detected, pii_by_type, techniques_applied, language + """ + result = { + "total_pii_detected": 0, + "pii_by_type": {}, + "techniques_applied": {}, + "language": "", + } + + # Extract total PII detected + total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md) + if total_match: + result["total_pii_detected"] = int(total_match.group(1)) + + # Extract language + lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md) + if lang_match: + result["language"] = lang_match.group(1) + + # Extract PII by type from table + pii_table_section = re.search( + r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+\n)+)", + metrics_md, + ) + if pii_table_section: + for line in pii_table_section.group(1).strip().split("\n"): + parts = [p.strip() for p in line.split("|") if p.strip()] + if len(parts) == 2: + entity_type, count = parts + result["pii_by_type"][entity_type] = int(count) + + # Extract techniques applied + techniques_section = re.search(r"### Techniques Applied\n((?:- \*\*[^\n]+\n)+)", metrics_md) + if techniques_section: + for line in techniques_section.group(1).strip().split("\n"): + tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", line) + if tech_match: + pii_type, technique = tech_match.groups() + 
result["techniques_applied"][pii_type] = technique + + return result + + +# -------------------------------- Fixtures ---------------------------------------- + + +@pytest.fixture +def sample_text_en(): + """English text with various PII types.""" + return """ + John Smith works at Acme Corporation. His email is john.smith@example.com + and his phone number is +1-555-123-4567. He lives in New York City at + 123 Main Street, Apartment 4B. His SSN is 123-45-6789. + """ + + +@pytest.fixture +def sample_text_multi_person(): + """Text with multiple person names.""" + return """ + The meeting included Alice Johnson, Bob Williams, and Charlie Brown. + They discussed the project with Maria Garcia and David Wilson. + """ + + +@pytest.fixture +def sample_text_mixed_pii(): + """Text with multiple PII types for AC1 comprehensive testing.""" + return """ + Contact Information: + Name: Dr. Emily Watson + Email: emily.watson@hospital.com + Phone: +44-20-7946-0958 + Website: https://patient-portal.hospital.com/records + """ + + +@pytest.fixture +def encrypt_person_config(): + """Configuration to encrypt PERSON entities.""" + return AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_person_key", + ) + ) + ], + ) + + +@pytest.fixture +def retain_person_config(): + """Configuration to retain PERSON entities unchanged.""" + return AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig(technique=RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])) + ], + ) + + +@pytest.fixture +def mixed_technique_config(): + """Configuration with encryption and retention for AC1 testing.""" + return AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON, 
PIIEntityEnum.EMAIL], + key_name="test_mixed_key", + ) + ), + PseudoTechniqueConfig( + technique=RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS]) + ), + ], + ) + + +# ================================================================================================ +# AC1: Pseudonymisation and Retention Are Applied Correctly +# ================================================================================================ + + +def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config): + """AC1: Test that configured PII types are encrypted correctly.""" + clear_vault_key("test_person_key") + + result_text, metrics_md = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii) + metrics = parse_metrics_markdown(metrics_md) + + # Verify person name is encrypted (not in plaintext) + assert "Emily Watson" not in result_text, "Configured PERSON PII should be encrypted" + + # Verify encryption token is present + assert "{encrypt:" in result_text, "Encrypted token should be present in result" + + # Verify PII was detected and processed + assert metrics["total_pii_detected"] > 0, "System should detect PII entities" + assert "PERSON" in metrics["pii_by_type"], "PERSON type should be in detected PII" + + # Verify text structure is preserved (surrounding text intact) + assert "Contact Information:" in result_text, "Non-PII text structure should be preserved" + + +def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person): + """AC1: Test that PII types marked for retention remain unchanged.""" + retain_config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig(technique=RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])) + ], + ) + + result_text, metrics_md = run_unstructured_op(retain_config, sample_text_multi_person) + metrics = parse_metrics_markdown(metrics_md) + + # Verify retained PII types remain in plaintext + assert "Alice Johnson" in 
result_text, "Retained PERSON PII should remain unchanged" + assert "Bob Williams" in result_text, "Retained PERSON PII should remain unchanged" + + # Verify technique applied is 'retain' + assert ( + "retain" in metrics["techniques_applied"].get("PERSON", "").lower() + ), "Retain technique should be recorded for PERSON type" + + +def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii): + """AC1: Test that unconfigured PII types are replaced with placeholders.""" + encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_person_only_key", + ) + ) + ], + ) + + clear_vault_key("test_person_only_key") + + result_text, metrics_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii) + + # Verify person is encrypted (configured) + assert "Emily Watson" not in result_text, "Configured PERSON should be encrypted" + + # Verify unconfigured PII types have placeholders + assert ( + "{{" in result_text and "}}" in result_text + ), "Unconfigured PII should be replaced with placeholders" + + # Verify original unconfigured PII values are not in result + assert ( + "emily.watson@hospital.com" not in result_text + ), "Unconfigured EMAIL should be replaced with placeholder" + + # Verify placeholder format + assert ( + "{{EMAIL}}" in result_text or "{{URL}}" in result_text + ), "Placeholders should indicate entity type" + + +def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config): + """AC1: Test that multiple techniques (encrypt, retain) are applied correctly.""" + clear_vault_key("test_mixed_key") + + result_text, metrics_md = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii) + metrics = parse_metrics_markdown(metrics_md) + + # Verify encrypted PII types (PERSON, EMAIL) + assert "Emily Watson" not in result_text, "Configured 
PERSON should be encrypted" + assert "emily.watson@hospital.com" not in result_text, "Configured EMAIL should be encrypted" + + # Verify retained PII type (PHONE_NUMBERS) + assert "+44-20-7946-0958" in result_text, "Configured PHONE_NUMBERS should be retained" + + # Verify metrics reflect different techniques + assert ( + "encrypt" in metrics["techniques_applied"].get("PERSON", "").lower() + ), "Encrypt technique should be applied to PERSON" + assert ( + "encrypt" in metrics["techniques_applied"].get("EMAIL", "").lower() + ), "Encrypt technique should be applied to EMAIL" + assert ( + "retain" in metrics["techniques_applied"].get("PHONE_NUMBERS", "").lower() + ), "Retain technique should be applied to PHONE_NUMBERS" + + +def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config): + """AC1: Test that all instances of a configured PII type are processed.""" + clear_vault_key("test_person_key") + + result_text, metrics_md = run_unstructured_op(encrypt_person_config, sample_text_multi_person) + metrics = parse_metrics_markdown(metrics_md) + + # Verify all person names are encrypted + person_names = [ + "Alice Johnson", + "Bob Williams", + "Charlie Brown", + "Maria Garcia", + "David Wilson", + ] + for name in person_names: + assert name not in result_text, f"All PERSON instances should be encrypted: {name}" + + # Verify metrics count multiple instances + assert metrics["pii_by_type"].get("PERSON", 0) >= len( + person_names + ), f"Should detect at least {len(person_names)} PERSON entities" + + +def test_ac1_empty_text_returns_empty(encrypt_person_config): + """AC1: Test that empty or null text input raises a ValueError.""" + clear_vault_key("test_person_key") + + with pytest.raises(ValueError) as exc_info: + run_unstructured_op(encrypt_person_config, "") + + assert "empty" in str(exc_info.value).lower(), "Error should indicate empty input" + + +def test_ac1_text_without_pii_remains_unchanged(): + """AC1: Test that text without any PII 
remains unchanged after processing.""" + no_pii_text = """ + The weather today is sunny with a high of 25 degrees Celsius. + The conference starts at 9:00 AM in Room 301. + """ + + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_no_pii_key", + ) + ) + ], + ) + + clear_vault_key("test_no_pii_key") + + result_text, metrics_md = run_unstructured_op(config, no_pii_text) + metrics = parse_metrics_markdown(metrics_md) + + assert result_text.strip() == no_pii_text.strip(), "Text without PII should remain unchanged" + assert metrics["total_pii_detected"] == 0, "No PII should be detected" + + +def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii): + """AC1: Test that placeholders for unconfigured PII indicate the entity type.""" + encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_placeholder_key", + ) + ) + ], + ) + + clear_vault_key("test_placeholder_key") + + result_text, metrics_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii) + metrics = parse_metrics_markdown(metrics_md) + + # Verify placeholder format (scrubadub uses {{TYPE}} format) + placeholder_pattern = r"\{\{[A-Z_]+\}\}" + placeholders = re.findall(placeholder_pattern, result_text) + + assert ( + len(placeholders) > 0 + ), "Result should contain entity-type placeholders for unconfigured PII" + + # Verify metrics track which PII types were detected + assert len(metrics["pii_by_type"]) > 0, "Metrics should list detected PII types" + + +# ================================================================================================ +# AC2: Invalid Execution Handling +# 
================================================================================================ + + +def test_ac2_graceful_abort_on_scrubadub_failure(): + """AC2: Test graceful abort when the PII detection engine (Scrubadub) fails.""" + text = "Test user John Smith with email john@example.com" + + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_abort_key", + ) + ) + ], + ) + + clear_vault_key("test_abort_key") + + # Mock Scrubadub to fail at the right import path + with patch( + "field_level_pseudo_anonymisation.unstructured_ops.scrubadub.Scrubber" + ) as mock_scrubber_class: + mock_scrubber = MagicMock() + mock_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error") + mock_scrubber_class.return_value = mock_scrubber + + with pytest.raises(RuntimeError) as exc_info: + run_unstructured_op(config, text) + + error_msg = str(exc_info.value).lower() + assert ( + "pii" in error_msg + or "detection" in error_msg + or "scrubadub" in error_msg + or "failed" in error_msg + ), "Error message should indicate PII detection failure" + + +def test_ac2_graceful_abort_on_encryption_failure(sample_text_en): + """AC2: Test graceful abort when an encryption technique fails during execution.""" + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_encrypt_fail_key", + ) + ) + ], + ) + + clear_vault_key("test_encrypt_fail_key") + + # Mock encrypt function at correct path - it's imported from techniques module + encrypt_path = ( + "field_level_pseudo_anonymisation" + ".techniques.anonymisation_pseudonymisation_techniques.encrypt" + ) + with patch(encrypt_path) as mock_encrypt: + mock_encrypt.side_effect = Exception("Encryption algorithm failure") + 
+ with pytest.raises(RuntimeError) as exc_info: + run_unstructured_op(config, sample_text_en) + + error_msg = str(exc_info.value).lower() + assert ( + "encrypt" in error_msg or "failed" in error_msg or "technique" in error_msg + ), "Error message should indicate encryption failure" + + +def test_ac2_null_text_input_raises_error(encrypt_person_config): + """AC2: Test that a null (None) text input is rejected with an error.""" + clear_vault_key("test_person_key") + + # Dagster will raise DagsterTypeCheckDidNotPass before op executes + from dagster import DagsterTypeCheckDidNotPass + + with pytest.raises((ValueError, DagsterTypeCheckDidNotPass, TypeError)): + run_unstructured_op(encrypt_person_config, None) + + +def test_ac2_invalid_language_configuration(): + """AC2: Test that an unsupported language in the config raises a validation error.""" + # This should fail at config creation due to Pydantic validation + with pytest.raises((ValueError, TypeError)): + AnonymisePseudonymizeUnstructuredConfig( + language="invalid_lang", # Should fail Pydantic validation + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", pii=[PIIEntityEnum.PERSON], key_name="test_key" + ) + ) + ], + ) + + +def test_ac2_very_large_text_processing(): + """AC2: Test that very large text inputs are processed successfully without memory errors.""" + # Create large text with repeated PII patterns + large_text = ( + """ + John Smith works at company. Email: john.smith@example.com. 
+ """ + * 1000 + ) # ~60KB of text with repeated PII + + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL], + key_name="test_large_text_key", + ) + ) + ], + ) + + clear_vault_key("test_large_text_key") + + result_text, metrics_md = run_unstructured_op(config, large_text) + metrics = parse_metrics_markdown(metrics_md) + + # Verify processing completed + assert result_text is not None, "Large text should be processed successfully" + assert len(result_text) > 0, "Result should not be empty" + assert metrics["total_pii_detected"] > 0, "PII should be detected in large text" + + +# ================================================================================================ +# AC3: Execution Audit & Logging - Positive Scenario +# ================================================================================================ + + +def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config): + """AC3: Test that successful execution context contains a run ID for logging.""" + clear_vault_key("test_person_key") + + op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config) + context = build_op_context(op_config=op_config_dict) + + # Capture run context + run_id = context.run_id + + # Execute operation + result_text, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_en) + + # Verify run identifier is available for logging + assert run_id is not None, "Run ID must be available for audit logging" + + # Verify outputs are returned (for Dagster to log) + assert result_text is not None, "Result text should be available for logging" + assert metrics is not None, "Metrics should be available for logging" + + +def test_ac3_successful_execution_logs_configuration_parameters( + sample_text_en, mixed_technique_config +): + """AC3: Test that the 
used configuration is accessible for logging on success.""" + clear_vault_key("test_mixed_key") + + op_config_dict = config_to_dagster_dict_unstructured(mixed_technique_config) + context = build_op_context(op_config=op_config_dict) + + result_text, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_en) + + # Verify configuration is captured and accessible + assert "used_function" in op_config_dict, "Configuration must be accessible for logging" + assert len(op_config_dict["used_function"]) == 2, "Multiple techniques should be captured" + + # Verify techniques are logged + techniques = [func["technique"] for func in op_config_dict["used_function"]] + assert any( + "encrypt" in str(tech) for tech in techniques + ), "Encrypt technique should be in configuration" + assert any( + "retain" in str(tech) for tech in techniques + ), "Retain technique should be in configuration" + + # Verify metrics contain technique information (in markdown string) + metrics_str = metrics.value + assert ( + "Techniques Applied" in metrics_str + ), "Applied techniques should be in metrics for logging" + + +def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config): + """AC3: Test that logs and metrics from a successful run do not contain raw PII.""" + clear_vault_key("test_person_key") + + op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config) + context = build_op_context(op_config=op_config_dict) + + result_text, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_mixed_pii) + + # Verify raw PII values are not in metrics + metrics_str = metrics.value + + sensitive_values = ["Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958"] + + for pii_value in sensitive_values: + assert ( + pii_value not in metrics_str + ), f"Raw PII value should not appear in metrics: {pii_value}" + + # Verify configuration logs do not contain raw PII + config_str = str(op_config_dict) + for pii_value in 
sensitive_values: + assert ( + pii_value not in config_str + ), f"Raw PII value should not appear in configuration logs: {pii_value}" + + +# ================================================================================================ +# AC4: Execution Audit & Logging - Negative Scenario +# ================================================================================================ + + +def test_ac4_failed_execution_logs_error_details(): + """AC4: Negative execution should surface clear error details (encryption key failure).""" + text = "Test user John Smith" + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_fail_log_key", + ) + ) + ], + ) + clear_vault_key("test_fail_log_key") + ctx = build_op_context(op_config=config_to_dagster_dict_unstructured(config)) + + # Patch the key retrieval used inside unstructured_ops to force failure + with patch( + "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key", + side_effect=RuntimeError("Encryption key retrieval failed"), + ): + with pytest.raises(RuntimeError) as exc_info: + # Consume the generator to trigger execution and raise the exception + list(anonymize_pseudonymize_unstructured(ctx, text=text)) + + msg = str(exc_info.value).lower() + assert "key" in msg and "failed" in msg, "Error message should mention key failure" + + +def test_ac4_failed_execution_logs_configuration_used(): + """AC4: Test that the attempted configuration is available for logging on failure.""" + text = "Test data with person John Doe" + + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_config_fail_key", + ) + ) + ], + ) + + clear_vault_key("test_config_fail_key") + + op_config_dict = 
config_to_dagster_dict_unstructured(config) + context = build_op_context(op_config=op_config_dict) + + # Mock _initialize_scrubber to fail + with patch( + "field_level_pseudo_anonymisation.unstructured_ops._initialize_scrubber" + ) as mock_init_scrubber: + mock_init_scrubber.side_effect = Exception("Scrubber module not available") + + with pytest.raises((RuntimeError, Exception)) as exc_info: + list(anonymize_pseudonymize_unstructured(context, text=text)) + + # Verify configuration is still accessible despite failure + assert op_config_dict is not None, "Configuration must be accessible for failure audit" + assert ( + "used_function" in op_config_dict + ), "Technique configuration should be available for diagnosis" + + # Verify error was raised with proper message + error_msg = str(exc_info.value).lower() + assert ( + "pii" in error_msg + or "detection" in error_msg + or "failed" in error_msg + or "scrubber" in error_msg + or "module" in error_msg + ), "Error should indicate detection/processing failed" + + +def test_ac4_failed_execution_logs_failure_reason(): + """AC4: Test that the reason for a failure is clearly indicated in the error message.""" + text = "User: Alice Smith, Email: alice@example.com" + + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.en, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL], + key_name="test_failure_reason_key", + ) + ) + ], + ) + + clear_vault_key("test_failure_reason_key") + + # Mock key retrieval function to fail + with patch( + "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key" + ) as mock_get_key: + mock_get_key.side_effect = RuntimeError("Vault connection timeout") + + with pytest.raises(RuntimeError) as exc_info: + run_unstructured_op(config, text) + + # Verify failure reason is in error message + error_msg = str(exc_info.value).lower() + assert ( + "encrypt" in error_msg + or "key" in 
error_msg + or "timeout" in error_msg + or "failed" in error_msg + ), "Error should indicate key retrieval/encryption failure" + + +# ================================================================================================ +# Additional Tests - Edge Cases and Integration +# ================================================================================================ + + +def test_multi_language_support_italian(): + """Additional test: Verify that Italian text is processed correctly.""" + italian_text = """ + Il dottor Marco Rossi lavora presso l'ospedale. + Email: marco.rossi@ospedale.it + Telefono: +39-06-12345678 + """ + + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.it, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON], + key_name="test_italian_key", + ) + ) + ], + ) + + clear_vault_key("test_italian_key") + + result_text, metrics_md = run_unstructured_op(config, italian_text) + metrics = parse_metrics_markdown(metrics_md) + + # Verify processing occurred + assert result_text != italian_text, "Italian text should be processed" + assert metrics["total_pii_detected"] > 0, "PII should be detected in Italian text" + + +def test_special_characters_in_text(): + """Additional test: Verify handling of text with special Unicode characters.""" + special_text = """ + User: João da Silva 🇧🇷 + Email: joão@empresa.com.br + Message: "Hello, World!" 
— Testing special chars: €, £, ¥, ©, ® + """ + + config = AnonymisePseudonymizeUnstructuredConfig( + language=LanguageEnum.pt, + used_function=[ + PseudoTechniqueConfig( + technique=EncryptConfig( + type="encrypt", + pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL], + key_name="test_special_chars_key", + ) + ) + ], + ) + + clear_vault_key("test_special_chars_key") + + result_text, metrics_md = run_unstructured_op(config, special_text) + + # Verify processing completed without encoding errors + assert result_text is not None, "Special characters should not cause processing failure" + assert len(result_text) > 0, "Result should not be empty" + + +def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config): + """Additional test: Verify encryption format consistency across runs.""" + clear_vault_key("test_person_key") + + result1, metrics_md1 = run_unstructured_op(encrypt_person_config, sample_text_en) + result2, metrics_md2 = run_unstructured_op(encrypt_person_config, sample_text_en) + + # Both should have encryption tokens + assert "{encrypt:" in result1, "First run should produce encrypted tokens" + assert "{encrypt:" in result2, "Second run should produce encrypted tokens" + + # Verify consistent PII detection + metrics1 = parse_metrics_markdown(metrics_md1) + metrics2 = parse_metrics_markdown(metrics_md2) + + assert ( + metrics1["total_pii_detected"] == metrics2["total_pii_detected"] + ), "PII detection should be consistent across runs" + + # Verify token format is consistent (Fernet base64 pattern) + token_pattern = r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}" + tokens1 = re.findall(token_pattern, result1) + tokens2 = re.findall(token_pattern, result2) + + assert len(tokens1) == len(tokens2), "Same number of encryption tokens should be generated" diff --git a/tests/field_level_pseudo_anonymisation/test_jobs.py b/tests/field_level_pseudo_anonymisation/test_jobs.py new file mode 100644 index 0000000..616c3d5 --- /dev/null +++ 
b/tests/field_level_pseudo_anonymisation/test_jobs.py @@ -0,0 +1,58 @@ +from template_code_location.field_level_pseudo_anonymisation.jobs import ( + anonymize_pseudonymize_structured_job, + anonymize_pseudonymize_structured_job_s3, + depseudonymize_structured_job, + depseudonymize_structured_job_s3, + anonymize_pseudonymize_unstructured_job_s3, + anonymize_pseudonymize_unstructured_job, + depseudonymize_unstructured_job_s3, + depseudonymize_unstructured_job +) + + +def test_anonymize_pseudonymize_structured_job_is_callable(): + """Test anonymize_pseudonymize_structured_job is a valid Dagster job""" + assert callable(anonymize_pseudonymize_structured_job) + assert hasattr(anonymize_pseudonymize_structured_job, 'execute_in_process') + + +def test_anonymize_pseudonymize_structured_job_s3_is_callable(): + """Test anonymize_pseudonymize_structured_job_s3 is a valid Dagster job""" + assert callable(anonymize_pseudonymize_structured_job_s3) + assert hasattr(anonymize_pseudonymize_structured_job_s3, 'execute_in_process') + + +def test_depseudonymize_structured_job_is_callable(): + """Test depseudonymize_structured_job is a valid Dagster job""" + assert callable(depseudonymize_structured_job) + assert hasattr(depseudonymize_structured_job, 'execute_in_process') + + +def test_depseudonymize_structured_job_s3_is_callable(): + """Test depseudonymize_structured_job_s3 is a valid Dagster job""" + assert callable(depseudonymize_structured_job_s3) + assert hasattr(depseudonymize_structured_job_s3, 'execute_in_process') + + +def test_anonymize_pseudonymize_unstructured_job_is_callable(): + """Test anonymize_pseudonymize_unstructured_job is a valid Dagster job""" + assert callable(anonymize_pseudonymize_unstructured_job) + assert hasattr(anonymize_pseudonymize_unstructured_job, 'execute_in_process') + + +def test_anonymize_pseudonymize_unstructured_job_s3_is_callable(): + """Test anonymize_pseudonymize_unstructured_job_s3 is a valid Dagster job""" + assert 
callable(anonymize_pseudonymize_unstructured_job_s3) + assert hasattr(anonymize_pseudonymize_unstructured_job_s3, 'execute_in_process') + + +def test_depseudonymize_unstructured_job_is_callable(): + """Test depseudonymize_unstructured_job is a valid Dagster job""" + assert callable(depseudonymize_unstructured_job) + assert hasattr(depseudonymize_unstructured_job, 'execute_in_process') + + +def test_depseudonymize_unstructured_job_s3_is_callable(): + """Test depseudonymize_unstructured_job_s3 is a valid Dagster job""" + assert callable(depseudonymize_unstructured_job_s3) + assert hasattr(depseudonymize_unstructured_job_s3, 'execute_in_process') From 49f3afd6abbd7d60ac23654402d2d01f574af42e Mon Sep 17 00:00:00 2001 From: ILay Date: Fri, 24 Apr 2026 18:42:44 +0200 Subject: [PATCH 03/15] docs(SIMPL-24642): update Development Guide to reflect consolidated structure --- documents/Development Guide.md | 39 ++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/documents/Development Guide.md b/documents/Development Guide.md index 0f140ad..23c60d7 100644 --- a/documents/Development Guide.md +++ b/documents/Development Guide.md @@ -9,18 +9,35 @@ By following a *code-first approach*, developers ensure consistency, traceabilit Development must always begin in a local environment. This allows developers to rapidly iterate, test business logic, and validate DAG (Directed Acyclic Graph) structures without impacting production data. ### 2.1 Project Layout -To ensure compatibility with the Simpl-Open platform, every Dagster code location must adhere to the following directory structure: +This repository (`template-code-location`) serves as the **single consolidated code location** for all data services workflows. It contains the jobs, ops, and configurations previously spread across `data-processing`, `dataframe-level-anonymisation`, and `field-level-pseudo-anonymisation`. 
+ ```text -project-root/ -├── dagster_code_location/ -│ ├── jobs/ # Executable workflows -│ ├── ops/ # Individual functional units (business logic) -│ ├── resources/ # External connections (Object storage, APIs, etc...) -│ └── repository.py # Central entry point for the code location -├── tests/ # Unit and integration tests -├── Dockerfile # Containerization instructions -├── pyproject.toml # Dependency management (Poetry/Pip/UV) -└── README.md # Documentation +template-code-location/ +├── src/ +│ └── template_code_location/ +│ ├── repository.py # Unified entry point (all jobs/sensors/resources) +│ ├── data_processing/ # Data cleaning & transformation ops/jobs +│ │ ├── config_models/ +│ │ ├── jobs.py +│ │ └── ops.py +│ ├── dataframe_level_anonymisation/ # k-anonymity, l-diversity, t-closeness +│ │ ├── config_models/ +│ │ ├── jobs.py +│ │ ├── ops.py +│ │ └── utils.py +│ ├── field_level_pseudo_anonymisation/ # Field-level encryption/hashing/redaction +│ │ ├── config_models/ +│ │ ├── techniques/ +│ │ ├── jobs.py +│ │ ├── ops.py +│ │ ├── unstructured_ops.py +│ │ └── utils.py +│ ├── jobs/ # Template example jobs +│ └── ops/ # Template example ops +├── tests/ # All tests (migrated from source repos) +├── Dockerfile +├── pyproject.toml +└── README.md ``` ### 2.2 Code Examples (Ops, Jobs, and Definitions) From 0847026b3243a4d4d8179b68b5b20339c0d6c765 Mon Sep 17 00:00:00 2001 From: ILay Date: Fri, 24 Apr 2026 19:14:36 +0200 Subject: [PATCH 04/15] fix: loosen numpy>=2.0.1 to resolve anjana dependency conflict --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3b2741f..4c6f2dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ # Data processing "pandas>=2.1.4", "pyarrow>=23.0", - "numpy>=2.4", + "numpy>=2.0.1", "lxml>=6.0", "xmltodict>=1.0", "rdflib>=7.6", From bdfbe3d3102227e0859f655372dfadc2be976d31 Mon Sep 17 00:00:00 2001 From: ILay Date: Mon, 27 Apr 2026 18:18:38 
+0200 Subject: [PATCH 05/15] change pip to uv and update dependencies --- Dockerfile | 64 +++++++++++++++++++++++++++++++++++++++---- pipeline.variables.sh | 2 +- pyproject.toml | 26 +++++++++++++----- 3 files changed, 78 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index fd4e780..0c997fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,15 +1,67 @@ FROM python:3.12-slim-bookworm -# Install git for git-based dependencies -RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/* +# --- Install uv (pinned for reproducibility) --- +COPY --from=ghcr.io/astral-sh/uv:0.10.8 /uv /uvx /bin/ WORKDIR /app -COPY pyproject.toml . -COPY src/ src/ +# Create non-root user with explicit UID/GID 1000 +RUN addgroup --gid 1000 appgroup && \ + adduser --uid 1000 --gid 1000 --disabled-password --gecos "" appuser -# Install the package and all dependencies -RUN pip install --no-cache-dir . +# Install system dependencies: +# - git: required to fetch util-services from GitLab (tool.uv.sources) +# - build-essential / gcc / g++ / python3-dev / cmake: native extensions +# (scrubadub-spacy → spaCy, pycanon, etc.) +# - curl: optional healthcheck / runtime tooling +RUN apt-get update && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + build-essential=12.9 \ + cmake=3.25.1-1 \ + gcc=4:12.2.0-3 \ + g++=4:12.2.0-3 \ + python3-dev=3.11.2-1+b1 \ + git=1:2.39.5-0+deb12u3 \ + curl=7.88.1-10+deb12u14 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/* \ + && rm -rf /var/tmp/* + +# Pre-own /app so appuser can write to it +RUN chown -R appuser:appgroup /app + +# Copy project metadata and source +COPY pyproject.toml . 
+COPY src/ ./src/ + +# uv environment knobs: +# UV_COMPILE_BYTECODE → compile .pyc files at install time for faster cold start +# UV_LINK_MODE=copy → copy files instead of symlinks (required in Docker layers) +# UV_SYSTEM_PYTHON=1 → install into the system Python (no extra venv needed) +ENV UV_COMPILE_BYTECODE=1 +ENV UV_LINK_MODE=copy +ENV UV_SYSTEM_PYTHON=1 + +# Install the project and all dependencies, respecting [tool.uv.sources] +# (git source for util-services and pytorch-cpu index for torch) +# BuildKit cache mount keeps the uv package cache across builds +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install . + +ENV PYTHONPATH="/app/src" + +# Make /app writable for the non-root user (e.g. spaCy model downloads) +RUN chown -R 1000:1000 /app && chmod -R u+w /app + +# Provide a real home directory for appuser +RUN mkdir -p /home/appuser && chown -R 1000:1000 /home/appuser +ENV HOME=/home/appuser + +USER appuser + +# Sanity-check: fail the build early if the dagster CLI is missing +RUN dagster --version EXPOSE 4000 diff --git a/pipeline.variables.sh b/pipeline.variables.sh index 3292612..4a3f9c4 100644 --- a/pipeline.variables.sh +++ b/pipeline.variables.sh @@ -1 +1 @@ -PROJECT_VERSION_NUMBER="0.0.1" \ No newline at end of file +PROJECT_VERSION_NUMBER="0.1.0" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4c6f2dc..7897316 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "lxml>=6.0", "xmltodict>=1.0", "rdflib>=7.6", - "openpyxl", + "openpyxl>=3.1.0", "xlrd>=2.0.1", "tabulate==0.8.10", "pyspellchecker>=0.8.4", @@ -35,14 +35,26 @@ dependencies = [ "pycanon==1.0.1.post2", "anjana>=1.0.0", # Field-level pseudo-anonymisation - "scrubadub", - "scrubadub_spacy", - "hvac", - "cryptography", - # Util services (git dependency) - "util-services @ git+https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git@v0.4.1", + "scrubadub>=2.0.0", + "scrubadub_spacy>=1.0.0", + 
"hvac>=2.0.0", + "cryptography>=42.0.0", + # Util services — resolved via [tool.uv.sources] (git) + "util-services", ] +[tool.uv] +exclude-dependencies = ["transformers", "spacy-transformers"] + +[tool.uv.sources] +torch = { index = "pytorch-cpu" } +util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "feature/SIMPL-24631" } + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + [project.optional-dependencies] dev = [ "pytest>=8.0.0", From b58e399130691ef93869b9d2003dbb45d4de4c5b Mon Sep 17 00:00:00 2001 From: ILay Date: Mon, 27 Apr 2026 18:52:34 +0200 Subject: [PATCH 06/15] update data processing jobs to use structured data functions --- .../data_processing/jobs.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/template_code_location/data_processing/jobs.py b/src/template_code_location/data_processing/jobs.py index 54fb939..674e3a1 100644 --- a/src/template_code_location/data_processing/jobs.py +++ b/src/template_code_location/data_processing/jobs.py @@ -1,8 +1,8 @@ from dagster import job from util_services.util_ops import ( preview_dataframe, - read_csv_from_s3, - write_csv_to_s3, + read_structured_from_s3, + write_df_to_s3, ) from .ops import ( remove_duplicates, @@ -21,10 +21,10 @@ from .ops import ( "resource_type": "RD_DATA" }) def remove_duplicates_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = remove_duplicates(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @@ -33,10 +33,10 @@ def remove_duplicates_job_s3(): "resource_type": "RD_DATA" }) def fill_missing_values_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = fill_missing_values(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @@ -45,10 
+45,10 @@ def fill_missing_values_job_s3(): "resource_type": "RD_DATA" }) def standardize_categorical_values_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = standardize_categorical_values(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @@ -57,10 +57,10 @@ def standardize_categorical_values_job_s3(): "resource_type": "RD_DATA" }) def correct_typos_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = correct_typos(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @job(tags={ @@ -68,10 +68,10 @@ def correct_typos_job_s3(): "resource_type": "RD_DATA" }) def normalize_numeric_min_max_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = normalize_numeric_min_max(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @job(tags={ @@ -79,10 +79,10 @@ def normalize_numeric_min_max_job_s3(): "resource_type": "RD_DATA" }) def normalize_datetime_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = normalize_datetime(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @job(tags={ @@ -90,10 +90,10 @@ def normalize_datetime_job_s3(): "resource_type": "RD_DATA" }) def normalize_coordinates_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = normalize_coordinates(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @job(tags={ @@ -101,10 +101,10 @@ def normalize_coordinates_job_s3(): "resource_type": "RD_DATA" }) def add_global_aggregations_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = add_global_aggregations(org_df) preview_dataframe(org_df) - 
write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) @job(tags={ @@ -112,8 +112,8 @@ def add_global_aggregations_job_s3(): "resource_type": "RD_DATA" }) def filter_dataset_job_s3(): - org_df = read_csv_from_s3() + org_df = read_structured_from_s3() anon_df = filter_dataset(org_df) preview_dataframe(org_df) - write_csv_to_s3(anon_df) + write_df_to_s3(anon_df) preview_dataframe(anon_df) From 1fc7c7864a5a4f896fae45a2a2386496a9f154dd Mon Sep 17 00:00:00 2001 From: ILay Date: Wed, 29 Apr 2026 15:33:18 +0200 Subject: [PATCH 07/15] fix: update tabulate --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7897316..ba3c8c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "rdflib>=7.6", "openpyxl>=3.1.0", "xlrd>=2.0.1", - "tabulate==0.8.10", + "tabulate>=0.9", "pyspellchecker>=0.8.4", "PyGeodesy>=24.6.11", # Validation From bba5b99420a5c79fc3a6e9ac3e51ce5e29c395cf Mon Sep 17 00:00:00 2001 From: ILay Date: Tue, 5 May 2026 16:37:38 +0200 Subject: [PATCH 08/15] expose data_processing_job for test --- src/template_code_location/repository.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/template_code_location/repository.py b/src/template_code_location/repository.py index cf97606..f825e85 100644 --- a/src/template_code_location/repository.py +++ b/src/template_code_location/repository.py @@ -36,8 +36,11 @@ from template_code_location.field_level_pseudo_anonymisation.jobs import ( depseudonymize_unstructured_job_s3, ) +from template_code_location.jobs import data_processing_job + defs = Definitions( jobs=[ + data_processing_job, # Data processing remove_duplicates_job_s3, fill_missing_values_job_s3, From f0cac061b8e4af5b13bff3f559c4d15e7ed8135e Mon Sep 17 00:00:00 2001 From: ILay Date: Tue, 5 May 2026 16:48:47 +0200 Subject: [PATCH 09/15] update util-services --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyproject.toml b/pyproject.toml index ba3c8c7..de7ac13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ exclude-dependencies = ["transformers", "spacy-transformers"] [tool.uv.sources] torch = { index = "pytorch-cpu" } -util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "feature/SIMPL-24631" } +util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.5.0" } [[tool.uv.index]] name = "pytorch-cpu" From 2e6e78855290354e69c5645b53dbbb29ff9ddbde Mon Sep 17 00:00:00 2001 From: ILay Date: Tue, 5 May 2026 17:07:07 +0200 Subject: [PATCH 10/15] rename field-level ops and jobs --- .../field_level_pseudo_anonymisation/jobs.py | 64 +++++++++---------- src/template_code_location/repository.py | 16 ++--- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/template_code_location/field_level_pseudo_anonymisation/jobs.py b/src/template_code_location/field_level_pseudo_anonymisation/jobs.py index 56baf11..0f39cfb 100644 --- a/src/template_code_location/field_level_pseudo_anonymisation/jobs.py +++ b/src/template_code_location/field_level_pseudo_anonymisation/jobs.py @@ -3,13 +3,13 @@ from util_services.util_ops import ( preview_dataframe, read_structured_to_df, write_df_to_local, - write_string_to_txt, - read_txt_to_string, - preview_txt, + write_string_to_unstructured, + read_unstructured_to_string, + preview_unstructured, read_structured_from_s3, write_df_to_s3, - read_txt_from_s3, - write_text_to_s3, + read_unstructured_from_s3, + write_unstructured_to_s3, ) from .ops import ( anonymize_pseudonymize_structured, @@ -23,7 +23,7 @@ from .unstructured_ops import ( @job(tags={ "business_operation": "ANONYMISATION_PSEUDONYMISATION" }) -def anonymize_pseudonymize_structured_job(): +def anonymise_pseudonymise_structured_job(): df = read_structured_to_df() preview_dataframe(df) df_anon, metrics = 
anonymize_pseudonymize_structured(df) @@ -35,7 +35,7 @@ def anonymize_pseudonymize_structured_job(): "business_operation": "ANONYMISATION_PSEUDONYMISATION", "resource_type": "RD_DATA" }) -def anonymize_pseudonymize_structured_job_s3(): +def anonymise_pseudonymise_structured_job_s3(): df = read_structured_from_s3() preview_dataframe(df) df_anon, metrics = anonymize_pseudonymize_structured(df) @@ -46,7 +46,7 @@ def anonymize_pseudonymize_structured_job_s3(): @job(tags={ "business_operation": "DEPSEUDONYMISATION" }) -def depseudonymize_structured_job(): +def depseudonymise_structured_job(): df = read_structured_to_df() preview_dataframe(df) df_anon, metrics = depseudonymize_structured(df) @@ -58,7 +58,7 @@ def depseudonymize_structured_job(): "business_operation": "DEPSEUDONYMISATION", "resource_type": "RD_DATA" }) -def depseudonymize_structured_job_s3(): +def depseudonymise_structured_job_s3(): df = read_structured_from_s3() preview_dataframe(df) df_anon, metrics = depseudonymize_structured(df) @@ -69,7 +69,7 @@ def depseudonymize_structured_job_s3(): @job(tags={ "business_operation": "ANONYMISATION_PSEUDONYMISATION" }) -def anonymize_pseudonymize_depseudonymize_structured_job(): +def anonymise_pseudonymise_depseudonymise_structured_job(): df = read_structured_to_df() preview_dataframe(df) df_pseduo, metrics = anonymize_pseudonymize_structured(df) @@ -81,46 +81,46 @@ def anonymize_pseudonymize_depseudonymize_structured_job(): @job(tags={ "business_operation": "ANONYMISATION_PSEUDONYMISATION" }) -def anonymize_pseudonymize_unstructured_job(): - text = read_txt_to_string() - preview_txt(text) +def anonymise_pseudonymise_unstructured_job(): + text = read_unstructured_to_string() + preview_unstructured(text) text_anon, metrics = anonymize_pseudonymize_unstructured(text) - preview_txt(text_anon) - preview_txt(metrics) - write_string_to_txt(text_anon) + preview_unstructured(text_anon) + preview_unstructured(metrics) + write_string_to_unstructured(text_anon) @job(tags={ 
"business_operation": "ANONYMISATION_PSEUDONYMISATION", "resource_type": "RD_DATA" }) -def anonymize_pseudonymize_unstructured_job_s3(): - text = read_txt_from_s3() - preview_txt(text) +def anonymise_pseudonymise_unstructured_job_s3(): + text = read_unstructured_from_s3() + preview_unstructured(text) text_anon, metrics = anonymize_pseudonymize_unstructured(text) - preview_txt(text_anon) - preview_txt(metrics) - write_text_to_s3(text_anon) + preview_unstructured(text_anon) + preview_unstructured(metrics) + write_unstructured_to_s3(text_anon) @job(tags={ "business_operation": "DEPSEUDONYMISATION" }) -def depseudonymize_unstructured_job(): - text = read_txt_to_string() - preview_txt(text) +def depseudonymise_unstructured_job(): + text = read_unstructured_to_string() + preview_unstructured(text) text_anon, metrics = depseudonymize_unstructured(text) - preview_txt(text_anon) - write_string_to_txt(text_anon) + preview_unstructured(text_anon) + write_string_to_unstructured(text_anon) @job(tags={ "business_operation": "DEPSEUDONYMISATION", "resource_type": "RD_DATA" }) -def depseudonymize_unstructured_job_s3(): - text = read_txt_from_s3() - preview_txt(text) +def depseudonymise_unstructured_job_s3(): + text = read_unstructured_from_s3() + preview_unstructured(text) text_anon, metrics = depseudonymize_unstructured(text) - preview_txt(text_anon) - write_text_to_s3(text_anon) + preview_unstructured(text_anon) + write_unstructured_to_s3(text_anon) diff --git a/src/template_code_location/repository.py b/src/template_code_location/repository.py index f825e85..d19d6fd 100644 --- a/src/template_code_location/repository.py +++ b/src/template_code_location/repository.py @@ -30,10 +30,10 @@ from template_code_location.dataframe_level_anonymisation.jobs import ( # Field-level pseudo-anonymisation jobs from template_code_location.field_level_pseudo_anonymisation.jobs import ( - anonymize_pseudonymize_structured_job_s3, - depseudonymize_structured_job_s3, - 
anonymize_pseudonymize_unstructured_job_s3, - depseudonymize_unstructured_job_s3, + anonymise_pseudonymise_structured_job_s3, + depseudonymise_structured_job_s3, + anonymise_pseudonymise_unstructured_job_s3, + depseudonymise_unstructured_job_s3, ) from template_code_location.jobs import data_processing_job @@ -57,10 +57,10 @@ defs = Definitions( t_closeness_job_s3, read_write_semistructured_job_s3, # Field-level pseudo-anonymisation - anonymize_pseudonymize_structured_job_s3, - depseudonymize_structured_job_s3, - anonymize_pseudonymize_unstructured_job_s3, - depseudonymize_unstructured_job_s3, + anonymise_pseudonymise_structured_job_s3, + depseudonymise_structured_job_s3, + anonymise_pseudonymise_unstructured_job_s3, + depseudonymise_unstructured_job_s3, ], sensors=[notify_success, notify_failure, notify_canceled], resources={"s3": s3_resource.configured({"resource_name": "selfS3"})}, From 733a38e128ec350dac562099474b6c40a6c85f0a Mon Sep 17 00:00:00 2001 From: ILay Date: Tue, 5 May 2026 17:26:47 +0200 Subject: [PATCH 11/15] fix: correct import path for data_processing_job --- src/template_code_location/repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/template_code_location/repository.py b/src/template_code_location/repository.py index d19d6fd..1d0be85 100644 --- a/src/template_code_location/repository.py +++ b/src/template_code_location/repository.py @@ -36,7 +36,7 @@ from template_code_location.field_level_pseudo_anonymisation.jobs import ( depseudonymise_unstructured_job_s3, ) -from template_code_location.jobs import data_processing_job +from template_code_location.jobs.jobs import data_processing_job defs = Definitions( jobs=[ From 004bcd5c01007b79469a56ff7659d12e6042856d Mon Sep 17 00:00:00 2001 From: ILay Date: Wed, 6 May 2026 10:58:17 +0200 Subject: [PATCH 12/15] change to import from modules --- pyproject.toml | 7 + .../data_processing/__init__.py | 0 .../data_processing/config_models/__init__.py | 18 - 
.../aggregation_configuration.py | 25 - .../columns_select_configuration.py | 17 - ...coordinates_normalization_configuration.py | 22 - .../config_models/fill_missing_config.py | 9 - .../config_models/filter_configuration.py | 52 - .../spell_check_configuration.py | 8 - .../data_processing/jobs.py | 119 -- .../data_processing/ops.py | 256 ---- .../dataframe_level_anonymisation/__init__.py | 0 .../config_models/__init__.py | 13 - .../config_models/base_config.py | 33 - .../config_models/hierarchies.py | 18 - .../k_anonymity_configuration.py | 11 - .../l_diversity_configuration.py | 8 - .../t_closeness_configuration.py | 8 - .../dataframe_level_anonymisation/jobs.py | 86 -- .../dataframe_level_anonymisation/ops.py | 187 --- .../dataframe_level_anonymisation/utils.py | 19 - .../__init__.py | 0 .../config_models/__init__.py | 28 - .../config_models/languages.py | 72 -- .../config_models/pii_entities.py | 24 - .../config_models/structured_config.py | 110 -- .../config_models/unstructured_config.py | 115 -- .../field_level_pseudo_anonymisation/jobs.py | 126 -- .../field_level_pseudo_anonymisation/ops.py | 77 -- .../techniques/__init__.py | 3 - ...onymisation_pseudonymisation_techniques.py | 42 - .../depseudonymisation_techniques.py | 9 - .../unstructured_ops.py | 428 ------- .../field_level_pseudo_anonymisation/utils.py | 32 - src/template_code_location/repository.py | 6 +- tests/__init__.py | 1 - tests/data_processing/__init__.py | 1 - tests/data_processing/conftest.py | 53 - tests/data_processing/conftest_utils.py | 7 - tests/data_processing/test_config_models.py | 202 --- tests/data_processing/test_integration.py | 185 --- tests/data_processing/test_jobs.py | 56 - tests/data_processing/test_ops.py | 700 ----------- .../dataframe_level_anonymisation/__init__.py | 1 - .../config_models/__init__.py | 1 - .../config_models/test_base_config.py | 54 - .../config_models/test_hierarchies.py | 48 - .../config_models/test_k_anonymity_config.py | 41 - 
.../config_models/test_l_diversity_config.py | 44 - .../config_models/test_t_closeness_config.py | 56 - .../test_jobs.py | 44 - .../dataframe_level_anonymisation/test_ops.py | 230 ---- .../test_utils.py | 70 -- .../__init__.py | 1 - .../conftest.py | 444 ------- .../test_config_models_coverage.py | 633 ---------- .../test_decrypt_structured.py | 1090 ---------------- .../test_decrypt_unstructured.py | 288 ----- .../test_encrypt_structured.py | 1119 ----------------- .../test_encrypt_unstructured.py | 853 ------------- .../test_jobs.py | 58 - 61 files changed, 10 insertions(+), 8258 deletions(-) delete mode 100644 src/template_code_location/data_processing/__init__.py delete mode 100644 src/template_code_location/data_processing/config_models/__init__.py delete mode 100644 src/template_code_location/data_processing/config_models/aggregation_configuration.py delete mode 100644 src/template_code_location/data_processing/config_models/columns_select_configuration.py delete mode 100644 src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py delete mode 100644 src/template_code_location/data_processing/config_models/fill_missing_config.py delete mode 100644 src/template_code_location/data_processing/config_models/filter_configuration.py delete mode 100644 src/template_code_location/data_processing/config_models/spell_check_configuration.py delete mode 100644 src/template_code_location/data_processing/jobs.py delete mode 100644 src/template_code_location/data_processing/ops.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/__init__.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py delete mode 100644 
src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/jobs.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/ops.py delete mode 100644 src/template_code_location/dataframe_level_anonymisation/utils.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/__init__.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/__init__.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/languages.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/pii_entities.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/structured_config.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/jobs.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/ops.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py delete mode 100644 src/template_code_location/field_level_pseudo_anonymisation/utils.py delete mode 100644 tests/__init__.py delete mode 
100644 tests/data_processing/__init__.py delete mode 100644 tests/data_processing/conftest.py delete mode 100644 tests/data_processing/conftest_utils.py delete mode 100644 tests/data_processing/test_config_models.py delete mode 100644 tests/data_processing/test_integration.py delete mode 100644 tests/data_processing/test_jobs.py delete mode 100644 tests/data_processing/test_ops.py delete mode 100644 tests/dataframe_level_anonymisation/__init__.py delete mode 100644 tests/dataframe_level_anonymisation/config_models/__init__.py delete mode 100644 tests/dataframe_level_anonymisation/config_models/test_base_config.py delete mode 100644 tests/dataframe_level_anonymisation/config_models/test_hierarchies.py delete mode 100644 tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py delete mode 100644 tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py delete mode 100644 tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py delete mode 100644 tests/dataframe_level_anonymisation/test_jobs.py delete mode 100644 tests/dataframe_level_anonymisation/test_ops.py delete mode 100644 tests/dataframe_level_anonymisation/test_utils.py delete mode 100644 tests/field_level_pseudo_anonymisation/__init__.py delete mode 100644 tests/field_level_pseudo_anonymisation/conftest.py delete mode 100644 tests/field_level_pseudo_anonymisation/test_config_models_coverage.py delete mode 100644 tests/field_level_pseudo_anonymisation/test_decrypt_structured.py delete mode 100644 tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py delete mode 100644 tests/field_level_pseudo_anonymisation/test_encrypt_structured.py delete mode 100644 tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py delete mode 100644 tests/field_level_pseudo_anonymisation/test_jobs.py diff --git a/pyproject.toml b/pyproject.toml index de7ac13..5eb1ab4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,10 @@ dependencies = [ 
"cryptography>=42.0.0", # Util services — resolved via [tool.uv.sources] (git) "util-services", + # Code location packages — resolved via [tool.uv.sources] (git) + "data-processing", + "dataframe-level-anonymisation", + "field-level-pseudo-anonymisation", ] [tool.uv] @@ -49,6 +53,9 @@ exclude-dependencies = ["transformers", "spacy-transformers"] [tool.uv.sources] torch = { index = "pytorch-cpu" } util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.5.0" } +data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", branch = "feature/SIMPL-24642" } +dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", branch = "feature/SIMPL-24642" } +field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", branch = "feature/SIMPL-24642" } [[tool.uv.index]] name = "pytorch-cpu" diff --git a/src/template_code_location/data_processing/__init__.py b/src/template_code_location/data_processing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/template_code_location/data_processing/config_models/__init__.py b/src/template_code_location/data_processing/config_models/__init__.py deleted file mode 100644 index 5833cab..0000000 --- a/src/template_code_location/data_processing/config_models/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Configuration models for data processing.""" - -from .columns_select_configuration import ColumnsSelectConfiguration -from .fill_missing_config import FillMissingConfiguration -from .spell_check_configuration import SpellCheckConfiguration -from .coordinates_normalization_configuration import CoordinatesNormalizationConfiguration -from .aggregation_configuration import AggregationConfiguration -from .filter_configuration import 
DatasetFilterConfiguration, FilterCondition - -__all__ = [ - "ColumnsSelectConfiguration", - "FillMissingConfiguration", - "SpellCheckConfiguration", - "CoordinatesNormalizationConfiguration", - "AggregationConfiguration", - "FilterCondition", - "DatasetFilterConfiguration" -] diff --git a/src/template_code_location/data_processing/config_models/aggregation_configuration.py b/src/template_code_location/data_processing/config_models/aggregation_configuration.py deleted file mode 100644 index 553740f..0000000 --- a/src/template_code_location/data_processing/config_models/aggregation_configuration.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import List - -from pydantic import Field, field_validator - -from .columns_select_configuration import ColumnsSelectConfiguration - - -class AggregationConfiguration(ColumnsSelectConfiguration): - - operation: str = Field( - default="sum", - description="Aggregation operations: sum, mean, min, max, count" - ) - - @field_validator("operation") - @classmethod - def validate_operations(cls, value): - allowed = {"sum", "mean", "min", "max", "count"} - if value not in allowed: - raise ValueError( - f"Invalid aggregation operation '{value}'. " - f"Allowed values: {allowed}" - ) - - return value diff --git a/src/template_code_location/data_processing/config_models/columns_select_configuration.py b/src/template_code_location/data_processing/config_models/columns_select_configuration.py deleted file mode 100644 index 658450d..0000000 --- a/src/template_code_location/data_processing/config_models/columns_select_configuration.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import List -from pydantic import Field,field_validator -from dagster import Config - - -class ColumnsSelectConfiguration(Config): - columns: List[str] = Field( - default=["Name"], description="List of columns to process." 
- ) - - @field_validator("columns") - @classmethod - def ensure_unique_columns(cls, v: List[str]) -> List[str]: - - unique_values = list(dict.fromkeys(v)) - - return unique_values diff --git a/src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py b/src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py deleted file mode 100644 index 64342e4..0000000 --- a/src/template_code_location/data_processing/config_models/coordinates_normalization_configuration.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Optional - -from pydantic import Field, model_validator -from dagster import Config - - -class CoordinatesNormalizationConfiguration(Config): - latColumn: Optional[str] = Field( - default="lat", description="Latitude column name" - ) - lonColumn: Optional[str] = Field( - default="lon", description="Longitude column name" - ) - - @model_validator(mode="before") - @classmethod - def replace_nulls_with_defaults(cls, values): - if values.get("latColumn") is None: - values["latColumn"] = "lat" - if values.get("lonColumn") is None: - values["lonColumn"] = "lon" - return values diff --git a/src/template_code_location/data_processing/config_models/fill_missing_config.py b/src/template_code_location/data_processing/config_models/fill_missing_config.py deleted file mode 100644 index 4c9e5b2..0000000 --- a/src/template_code_location/data_processing/config_models/fill_missing_config.py +++ /dev/null @@ -1,9 +0,0 @@ -from typing import Dict -from dagster import Config -from pydantic import Field - - -class FillMissingConfiguration(Config): - fill_map: Dict[str, str] = Field( - default={"Age": "UNKNOWN_AGE"}, description="Missing values filling map." 
- ) diff --git a/src/template_code_location/data_processing/config_models/filter_configuration.py b/src/template_code_location/data_processing/config_models/filter_configuration.py deleted file mode 100644 index 86bde37..0000000 --- a/src/template_code_location/data_processing/config_models/filter_configuration.py +++ /dev/null @@ -1,52 +0,0 @@ -from enum import Enum -import operator -from typing import List, Literal, Callable -from pydantic import Field, model_validator -from dagster import Config -import pandas as pd - -class FilterOperator(str, Enum): - EQ = "==" - NE = "!=" - LT = "<" - LE = "<=" - GT = ">" - GE = ">=" - - @property - def function(self) -> Callable: - mapping = { - FilterOperator.EQ: operator.eq, - FilterOperator.NE: operator.ne, - FilterOperator.LT: operator.lt, - FilterOperator.LE: operator.le, - FilterOperator.GT: operator.gt, - FilterOperator.GE: operator.ge, - } - return mapping[self] - -class FilterCondition(Config): - column: str = Field(..., description="Name of the column to filter") - type: Literal["string", "numeric"] = Field(..., description="Column type (string or numeric)") - value: str = Field(..., description="Value to compare against") - op: FilterOperator = Field(default=FilterOperator.EQ, description="Operator to apply (string supports only EQ and NE)") - - @model_validator(mode="after") - def check_operator_compatibility(self) -> "FilterCondition": - if self.type == "string" and self.op not in [FilterOperator.EQ, FilterOperator.NE]: - raise ValueError( - f"Invalid operator '{self.op.name}' for type 'string'. " - "Only EQ (==) and NE (!=) are allowed." - ) - return self - - def apply(self, df: pd.DataFrame) -> pd.Series: - val = float(self.value) if self.type == "numeric" else self.value - return self.op.function(df[self.column], val) - -class DatasetFilterConfiguration(Config): - conditions: List[FilterCondition] = Field( - default=[], - description="List of filter conditions to apply on the dataset. 
" - "String columns support only 'EQ' and 'NE', numeric columns also support 'LT', 'LE', 'GT' and 'GE'." - ) diff --git a/src/template_code_location/data_processing/config_models/spell_check_configuration.py b/src/template_code_location/data_processing/config_models/spell_check_configuration.py deleted file mode 100644 index 7a12f87..0000000 --- a/src/template_code_location/data_processing/config_models/spell_check_configuration.py +++ /dev/null @@ -1,8 +0,0 @@ -from typing import Literal -from pydantic import Field - -from .columns_select_configuration import ColumnsSelectConfiguration - - -class SpellCheckConfiguration(ColumnsSelectConfiguration): - language: Literal["en", "es", "it", "fr", "pt", "de", "nl"] = Field(default="en", description="Language to use in the SpellChecker module.") diff --git a/src/template_code_location/data_processing/jobs.py b/src/template_code_location/data_processing/jobs.py deleted file mode 100644 index 674e3a1..0000000 --- a/src/template_code_location/data_processing/jobs.py +++ /dev/null @@ -1,119 +0,0 @@ -from dagster import job -from util_services.util_ops import ( - preview_dataframe, - read_structured_from_s3, - write_df_to_s3, -) -from .ops import ( - remove_duplicates, - fill_missing_values, - standardize_categorical_values, - correct_typos, - normalize_numeric_min_max, - normalize_datetime, - normalize_coordinates, - add_global_aggregations, - filter_dataset -) - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def remove_duplicates_job_s3(): - org_df = read_structured_from_s3() - anon_df = remove_duplicates(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def fill_missing_values_job_s3(): - org_df = read_structured_from_s3() - anon_df = fill_missing_values(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - - 
-@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def standardize_categorical_values_job_s3(): - org_df = read_structured_from_s3() - anon_df = standardize_categorical_values(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def correct_typos_job_s3(): - org_df = read_structured_from_s3() - anon_df = correct_typos(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def normalize_numeric_min_max_job_s3(): - org_df = read_structured_from_s3() - anon_df = normalize_numeric_min_max(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def normalize_datetime_job_s3(): - org_df = read_structured_from_s3() - anon_df = normalize_datetime(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def normalize_coordinates_job_s3(): - org_df = read_structured_from_s3() - anon_df = normalize_coordinates(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def add_global_aggregations_job_s3(): - org_df = read_structured_from_s3() - anon_df = add_global_aggregations(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - -@job(tags={ - "business_operation": "PROCESSING", - "resource_type": "RD_DATA" -}) -def filter_dataset_job_s3(): - org_df = read_structured_from_s3() - anon_df = filter_dataset(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) diff --git 
a/src/template_code_location/data_processing/ops.py b/src/template_code_location/data_processing/ops.py deleted file mode 100644 index e380cb8..0000000 --- a/src/template_code_location/data_processing/ops.py +++ /dev/null @@ -1,256 +0,0 @@ -import pandas as pd -from dagster import Out, op -from spellchecker import SpellChecker - -from template_code_location.data_processing.config_models import ( - AggregationConfiguration, - ColumnsSelectConfiguration, - CoordinatesNormalizationConfiguration, - FillMissingConfiguration, - SpellCheckConfiguration, - DatasetFilterConfiguration -) - - -def _parse_dms_to_decimal(value): - """Parse a DMS (degrees-minutes-seconds) string to decimal degrees using PyGeodesy. - - Supported formats include (but are not limited to): - - 40°26'46"N / 40°26′46″N - - 40 26 46 N - - 40:26:46N - - 40d26m46sN - - -40.446 (already decimal – returned as-is) - - Returns None if parsing fails. - """ - from pygeodesy.dms import parseDMS - - if pd.isna(value): - return None - - text = str(value).strip() - if not text: - return None - - try: - return float(parseDMS(text)) - except (ValueError, TypeError): - try: - return float(text) - except (ValueError, TypeError): - return None - - -@op(out={"data": Out()}) -def remove_duplicates(context, df: pd.DataFrame): - """Remove duplicate rows from the input DataFrame.""" - logger = context.log - - before = df.shape[0] - - df = df.drop_duplicates() - - after = df.shape[0] - - logger.info(f"Removed {before - after} duplicate rows") - - return df - -@op(out={"data": Out()}) -def fill_missing_values(context, config: FillMissingConfiguration, df: pd.DataFrame): - """Fill missing values in the DataFrame according to the configured column-to-value mapping.""" - logger = context.log - - logger.info(f"Filling missing values: {config.fill_map}") - - return df.fillna(config.fill_map) - -@op(out={"data": Out()}) -def standardize_categorical_values(context, config: ColumnsSelectConfiguration, df: pd.DataFrame): - 
"""Standardize categorical values in selected columns by trimming whitespace and converting text to lowercase.""" - logger = context.log - - for col in config.columns: - if col not in df.columns: - logger.warning(f"Column '{col}' not found in DataFrame, skipping.") - continue - - original = df[col] - - standardized = ( - df[col] - .fillna("") - .astype(str) - .str.strip() - .str.lower() - ) - - changed_count = (original != standardized).sum() - df[col] = standardized - - logger.info(f"Standardized '{col}' column – {changed_count} values modified") - - return df - -@op(out={"data": Out()}) -def correct_typos(context, config: SpellCheckConfiguration, df: pd.DataFrame): - """Correct spelling mistakes in the specified text columns.""" - logger = context.log - - for column in config.columns: - if column not in df.columns: - logger.warning(f"Column '{column}' not found in DataFrame, skipping.") - continue - - spell = SpellChecker(language=config.language) - - original = df[column].astype(str) - corrected = original.apply(lambda x, spell_checker=spell: spell_checker.correction(x) if x else x) - - changed_count = (original != corrected).sum() - logger.info(f"Corrected typos in '{column}' – {changed_count} values modified") - - df[column] = corrected - - return df - -@op(out={"data": Out()}) -def normalize_datetime(context, config: ColumnsSelectConfiguration, df: pd.DataFrame): - logger = context.log - - for col in config.columns: - if col not in df.columns: - logger.warning(f"Column '{col}' not found, skipping normalization.") - continue - - normalized = pd.to_datetime(df[col], utc=True, format="mixed", dayfirst=True, errors="coerce") - - if normalized.notna().sum() == 0: - logger.warning( - f"Column '{col}' has no normalizable datetime values, skipping." 
- ) - continue - - iso_col = f"{col}_iso" - - formatted = normalized.dt.strftime("%Y-%m-%dT%H:%M:%SZ").fillna("") - non_empty = formatted[formatted != ""] - if len(non_empty) > 0 and non_empty.str.startswith("1970-01-01").all(): - logger.warning( - f"Column '{col}' all normalized values are '1970-01-01', likely bad input — skipping." - ) - continue - - df[iso_col] = formatted - - logger.info(f"Normalized datetime column '{col}' into '{iso_col}'") - - return df - -@op(out={"data": Out()}) -def normalize_numeric_min_max(context, config: ColumnsSelectConfiguration, df: pd.DataFrame): - logger = context.log - - for col in config.columns: - if col not in df.columns: - logger.warning(f"Column '{col}' not found, skipping normalization.") - continue - - min_val = df[col].min() - max_val = df[col].max() - - if min_val == max_val: - logger.warning(f"Column '{col}' has constant values, skipping normalization.") - continue - - df[col + "_norm"] = (df[col] - min_val) / (max_val - min_val) - logger.info(f"Normalized numeric column '{col}'") - - return df - -@op(out={"data": Out()}) -def normalize_coordinates(context, config: CoordinatesNormalizationConfiguration, df: pd.DataFrame): - logger = context.log - - lat = config.latColumn - lon = config.lonColumn - - for col in [lat, lon]: - if pd.api.types.is_numeric_dtype(df[col]): - logger.info(f"Column '{col}' is numeric — coercing directly") - df[col] = pd.to_numeric(df[col], errors="coerce") - else: - logger.info(f"Column '{col}' is non-numeric — parsing as DMS with PyGeodesy") - df[col] = df[col].apply(_parse_dms_to_decimal) - - invalid_lat = df[lat].isnull().sum() - invalid_lon = df[lon].isnull().sum() - logger.info(f"Found {invalid_lat} invalid latitudes and {invalid_lon} invalid longitudes") - - df[lat] = df[lat].round(4) - df[lon] = df[lon].round(4) - - before_filter_rows = len(df) - df = df[(df[lat].between(-90, 90)) & (df[lon].between(-180, 180))] - after_filter_rows = len(df) - logger.info(f"Filtered coordinates out of 
range: removed {before_filter_rows - after_filter_rows} rows") - - logger.info(f"Coordinate normalization completed: resulting dataframe has {after_filter_rows} rows") - - return df - -@op(out={"data": Out()}) -def add_global_aggregations(context, config: AggregationConfiguration, df: pd.DataFrame): - logger = context.log - - group_by_cols = [] - - for col in config.columns: - if col not in df.columns: - logger.warning(f"Column '{col}' not found, skipping aggregation.") - continue - group_by_cols.append(col) - - if config.operation not in {"sum", "mean", "min", "max", "count"}: - logger.warning(f"Unsupported aggregation '{config.operation}'") - - numeric_cols = df.select_dtypes(include=['number']).columns.tolist() - cols_to_keep = list(set(numeric_cols + group_by_cols)) - df = df[[c for c in cols_to_keep if c in df.columns]] - df = df.groupby(group_by_cols).agg(config.operation).reset_index() - return df - -@op(out={"data": Out()}) -def filter_dataset(context, config: DatasetFilterConfiguration, df: pd.DataFrame): - logger = context.log - total_rows_before = len(df) - - logger.info(f"Starting dataset filtering: initial dataframe has {total_rows_before} rows") - - combined_mask = pd.Series([True] * total_rows_before, index=df.index) - - for condition in config.conditions: - if condition.column not in df.columns: - logger.warning(f"Column '{condition.column}' not found, skipping filtering.") - continue - if df[condition.column].isna().all(): - logger.warning(f"Column '{condition.column}' is empty (all NaN), skipping filtering.") - continue - try: - current_mask = condition.apply(df) - combined_mask &= current_mask - - logger.info(f"Applied filter: {condition.column} {condition.op.value} '{condition.value}'") - except Exception as e: - logger.error(f"Error applying filter on column '{condition.column}': {e}") - - filtered_df = df[combined_mask] - total_rows_after = len(filtered_df) - - logger.info( - f"Filtering completed: {total_rows_after} rows remain " - f"(removed 
{total_rows_before - total_rows_after} rows in total)" - ) - - return filtered_df diff --git a/src/template_code_location/dataframe_level_anonymisation/__init__.py b/src/template_code_location/dataframe_level_anonymisation/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py b/src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py deleted file mode 100644 index 0f490b5..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/config_models/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Configuration models for dataframe-level anonymization.""" - -from .k_anonymity_configuration import KAnonymityConfiguration -from .l_diversity_configuration import LDiversityConfiguration -from .t_closeness_configuration import TClosenessConfiguration -from .base_config import BaseConfiguration - -__all__ = [ - "BaseConfiguration", - "KAnonymityConfiguration", - "LDiversityConfiguration", - "TClosenessConfiguration", -] diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py b/src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py deleted file mode 100644 index 4abf451..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/config_models/base_config.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Dict, List -from dagster import Config -from pydantic import Field, field_validator, model_validator - - -class BaseConfiguration(Config): - ident: List[str] = Field(default=["Name"], description="List of identifier column names.") - quasi_identifiers: List[str] = Field(default=["Age"], description="List of quasi-identifier column names.") - supp_level: float = Field(default=50.0, ge=0.0, le=100.0, description="Max suppression allowed (0–100).") - generalisation_hierarchies: Dict[str, str] = Field( - default={"Age": "simpl_age"}, description="Hierarchies 
used to generalize quasi-identifiers." - ) - - @field_validator("quasi_identifiers") - def validate_quasi_identifiers(cls, value): - if not value: - raise ValueError("At least one quasi-identifier must be provided.") - return value - - @field_validator("ident") - def validate_ident(cls, value): - if not value: - raise ValueError("At least one identifier must be provided.") - return value - - @model_validator(mode="after") - def check_no_overlap(self): - ident = set(self.ident) - quasi = set(self.quasi_identifiers) - overlap = ident & quasi - if overlap: - raise ValueError(f"Fields cannot be both identifiers and quasi-identifiers: {overlap}") - return self diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py b/src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py deleted file mode 100644 index 65105a0..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/config_models/hierarchies.py +++ /dev/null @@ -1,18 +0,0 @@ -from anjana.anonymity.utils import utils - -simpl_age = { - 0: [age for age in range(0, 100)], - 1: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 5), - 2: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 10), - 3: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 20), - 4: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 100), -} -simpl_age2 = { - 0: [age for age in range(0, 100)], - 1: utils.generate_intervals([age for age in range(0, 100)], 0, 100, 5), -} -simpl_gender = {0: ["M", "F", "O"], 1: ["*", "*", "*"]} - - -def get_all_hierarchies(): - return {name: obj for name, obj in globals().items() if isinstance(obj, dict)} diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py b/src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py deleted file mode 100644 index 0ddd88f..0000000 --- 
a/src/template_code_location/dataframe_level_anonymisation/config_models/k_anonymity_configuration.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import List -from pydantic import Field - -from .base_config import BaseConfiguration - - -class KAnonymityConfiguration(BaseConfiguration): - k: int = Field(default=3, ge=2, description="Desired level of k-anonymity (must be >= 2).") - sensitive_attributes: List[str] = Field( - default=["Disease"], description="List of sensitive attribute column names." - ) diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py b/src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py deleted file mode 100644 index c764f1d..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/config_models/l_diversity_configuration.py +++ /dev/null @@ -1,8 +0,0 @@ -from pydantic import Field -from .base_config import BaseConfiguration - - -class LDiversityConfiguration(BaseConfiguration): - k: int = Field(default=2, ge=2, description="Desired level of k-anonymity (must be >= 2).") - l: int = Field(default=3, ge=1, description="L-diversity level (must be >= 1)") - sensitive_attribute: str = Field(default="Disease", description="Sensitive attribute name.") diff --git a/src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py b/src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py deleted file mode 100644 index 4461539..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/config_models/t_closeness_configuration.py +++ /dev/null @@ -1,8 +0,0 @@ -from pydantic import Field -from .base_config import BaseConfiguration - - -class TClosenessConfiguration(BaseConfiguration): - k: int = Field(default=2, ge=2, description="Desired level of k-anonymity (must be >= 2).") - t: float = Field(default=0.5, ge=0.0, le=1.0, description="Maximum 
t-distance threshold.") - sensitive_attribute: str = Field(default="Disease", description="Sensitive attribute name.") diff --git a/src/template_code_location/dataframe_level_anonymisation/jobs.py b/src/template_code_location/dataframe_level_anonymisation/jobs.py deleted file mode 100644 index 35c76f7..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/jobs.py +++ /dev/null @@ -1,86 +0,0 @@ -from dagster import job -from util_services.util_ops import ( - preview_dataframe, - read_structured_to_df, - write_df_to_local, - read_structured_from_s3, - write_df_to_s3, - write_semistructured_to_s3, - read_semistructured_from_s3 -) - -from .ops import apply_k_anonymity, apply_l_diversity, apply_t_closeness - - -@job(tags={ - "business_operation": "ANONYMISATION" -}) -def k_anonymity_job(): - org_df = read_structured_to_df() - anon_df, _ = apply_k_anonymity(org_df) - preview_dataframe(org_df) - write_df_to_local(anon_df) - preview_dataframe(anon_df) - - -@job(tags={ - "business_operation": "ANONYMISATION" -}) -def l_diversity_job(): - org_df = read_structured_to_df() - anon_df, _ = apply_l_diversity(org_df) - preview_dataframe(org_df) - write_df_to_local(anon_df) - preview_dataframe(anon_df) - - -@job(tags={ - "business_operation": "ANONYMISATION" -}) -def t_closeness_job(): - org_df = read_structured_to_df() - anon_df, _ = apply_t_closeness(org_df) - preview_dataframe(org_df) - write_df_to_local(anon_df) - preview_dataframe(anon_df) - - -@job(tags={ - "business_operation": "ANONYMISATION", - "resource_type": "RD_DATA" -}) -def k_anonymity_job_s3(): - org_df = read_structured_from_s3() - anon_df, _ = apply_k_anonymity(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - - -@job(tags={ - "business_operation": "ANONYMISATION", - "resource_type": "RD_DATA" -}) -def l_diversity_job_s3(): - org_df = read_structured_from_s3() - anon_df, _ = apply_l_diversity(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - 
preview_dataframe(anon_df) - - -@job(tags={ - "business_operation": "ANONYMISATION", - "resource_type": "RD_DATA" -}) -def t_closeness_job_s3(): - org_df = read_structured_from_s3() - anon_df, _ = apply_t_closeness(org_df) - preview_dataframe(org_df) - write_df_to_s3(anon_df) - preview_dataframe(anon_df) - -@job() -def read_write_semistructured_job_s3(): - semistruct_data = read_semistructured_from_s3() - write_semistructured_to_s3(semistruct_data) diff --git a/src/template_code_location/dataframe_level_anonymisation/ops.py b/src/template_code_location/dataframe_level_anonymisation/ops.py deleted file mode 100644 index 93682bf..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/ops.py +++ /dev/null @@ -1,187 +0,0 @@ -import json -from textwrap import dedent - -import pandas as pd -from anjana.anonymity import k_anonymity, l_diversity, t_closeness -from dagster import ( - DagsterInvalidInvocationError, - MarkdownMetadataValue, - Out, - Output, - get_dagster_logger, - op, -) -from pycanon import anonymity - -from template_code_location.dataframe_level_anonymisation.config_models import ( - KAnonymityConfiguration, - LDiversityConfiguration, - TClosenessConfiguration, -) -from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import get_all_hierarchies - - -def _calc_dataframe_metrics(df_anon, df_org, quasi_identifiers, sensitive_atttributes): - # --- Metrics --- - # Anonymization metrics - k_anon = anonymity.k_anonymity(df_anon, quasi_identifiers) - l_div = anonymity.l_diversity(df_anon, quasi_identifiers, sensitive_atttributes, True) - t_clos = anonymity.t_closeness(df_anon, quasi_identifiers, sensitive_atttributes, True) - - # Data Utilization metrics - supression_rate = 1 - len(df_anon) / len(df_org) - grouped = df_anon.groupby(quasi_identifiers) - mean_equivalence_class_size = len(df_anon) / len(grouped) if len(grouped) else 0 - - # flake8: noqa - anon_report = dedent( - f""" - ### Anonymization & Data Utilization 
Metrics - - | Metric | Value | Description | - |--------|-------|-------------| - | **k-anonymity** | `k = {k_anon}` | Minimum number of records sharing the same quasi-identifier values. | - | **l-diversity** | `l = {l_div}` | Diversity of sensitive attributes within each equivalence class. | - | **t-closeness** | `t = {round(t_clos, 2)}` | Distance between sensitive attribute distribution in a group and the overall dataset. | - | **Suppression rate** | `{round(supression_rate, 2)}` | Fraction of records or attributes suppressed to meet privacy requirements. | - | **Mean equivalence class size** | `{round(mean_equivalence_class_size, 2)}` | Average size of equivalence classes for quasi-identifiers, indicates data grouping. | - """ - ) - # flake8: enable - metrics = { - "k_anon": k_anon, - "l_div": l_div, - "t_clos": t_clos, - "supp_rate": supression_rate, - "mean_equivalence_class": mean_equivalence_class_size, - } - return anon_report, metrics - - -def _validate_and_get_hierarchies(config, df: pd.DataFrame): - hierarchies = get_all_hierarchies() - - # Dataset smaller than k - if len(df) < config.k: - raise DagsterInvalidInvocationError( - f"Cannot apply k-anonymity: dataset has {len(df)} records, but k={config.k}" - ) - - # Missing or incomplete generalisation hierarchies - for qi in config.quasi_identifiers: - if qi not in config.generalisation_hierarchies or not config.generalisation_hierarchies[qi]: - raise DagsterInvalidInvocationError( - f"Generalisation hierarchy for quasi-identifier '{qi}' is missing or incomplete" - ) - if config.generalisation_hierarchies[qi] not in hierarchies: - raise DagsterInvalidInvocationError( - f"Generalisation hierarchy '{config.generalisation_hierarchies[qi]}' is missing in the code basis" - ) - - hier = { - qi: hierarchies[config.generalisation_hierarchies[qi]] for qi in config.quasi_identifiers - } - return hier - - -@op(out={"data": Out(), "metrics": Out()}) -def apply_k_anonymity(context, config: KAnonymityConfiguration, df: 
pd.DataFrame): - - hier = _validate_and_get_hierarchies(config, df) - - data_anon = k_anonymity( - df, config.ident, config.quasi_identifiers, config.k, config.supp_level, hier - ) - if "index" in data_anon.columns and "index" not in df.columns: - data_anon.drop(columns="index", inplace=True) - anon_report, metrics = _calc_dataframe_metrics( - data_anon, df, config.quasi_identifiers, config.sensitive_attributes - ) - yield Output( - value=data_anon, - metadata={ - "metric_report": MarkdownMetadataValue(anon_report), - "metric_json": json.dumps(metrics), - }, - output_name="data", - ) - yield Output(value=metrics, output_name="metrics") - - -@op(out={"data": Out(), "metrics": Out()}) -def apply_l_diversity(context, config: LDiversityConfiguration, df: pd.DataFrame): - - hier = _validate_and_get_hierarchies(config, df) - - data_anon = l_diversity( - df, - config.ident, - config.quasi_identifiers, - config.sensitive_attribute, - config.k, - config.l, - config.supp_level, - hier, - ) - if data_anon.empty: - raise DagsterInvalidInvocationError( - "Could not tranform the data to l-diversity, empty dataset returned!" 
- ) - anon_report, metrics = _calc_dataframe_metrics( - data_anon, df, config.quasi_identifiers, [config.sensitive_attribute] - ) - yield Output( - value=data_anon, - metadata={ - "metric_report": MarkdownMetadataValue(anon_report), - "metric_json": json.dumps(metrics), - }, - output_name="data", - ) - yield Output(value=metrics, output_name="metrics") - - -@op(out={"data": Out(), "metrics": Out()}) -def apply_t_closeness(context, config: TClosenessConfiguration, df: pd.DataFrame): - - hier = _validate_and_get_hierarchies(config, df) - - try: - data_anon = t_closeness( - df, - config.ident, - config.quasi_identifiers, - config.sensitive_attribute, - config.k, - config.t, - config.supp_level, - hier, - ) - except ValueError as e: - if "Cannot be quasi-identifiers" in str(e): - raise DagsterInvalidInvocationError( - f"T-closeness failed: k-anonymity parameter = {config.k} is too small " - f"for existing hierarchies of {config.quasi_identifiers} in inner k-anonymity call." - ) - else: - # Re-raise other ValueError types with context - raise DagsterInvalidInvocationError(f"T-closeness failed with error: {str(e)}") - - if data_anon.empty: - raise DagsterInvalidInvocationError( - f"Could not transform the data to t-closeness, empty dataset returned! " - f"This may indicate that the t-closeness constraint (t={config.t}) is too strict for the given data." 
- ) - - anon_report, metrics = _calc_dataframe_metrics( - data_anon, df, config.quasi_identifiers, [config.sensitive_attribute] - ) - yield Output( - value=data_anon, - metadata={ - "metric_report": MarkdownMetadataValue(anon_report), - "metric_json": json.dumps(metrics), - }, - output_name="data", - ) - yield Output(value=metrics, output_name="metrics") diff --git a/src/template_code_location/dataframe_level_anonymisation/utils.py b/src/template_code_location/dataframe_level_anonymisation/utils.py deleted file mode 100644 index c233c4e..0000000 --- a/src/template_code_location/dataframe_level_anonymisation/utils.py +++ /dev/null @@ -1,19 +0,0 @@ -import numpy as np - - -def parse_value_list(values): - return [int(v) if isinstance(v, str) and v.isdigit() else v for v in values] - - -# Hierarchy normalization for Anjana -def normalize_hierarchy_levels(hierarchy_dict): - normalized = {} - for column, levels in hierarchy_dict.items(): - normalized[column] = {} - for level_str, mapping_list in levels.items(): - level = int(level_str) - if level == 0: - normalized[column][level] = np.array(parse_value_list(mapping_list)) - else: - normalized[column][level] = mapping_list - return normalized diff --git a/src/template_code_location/field_level_pseudo_anonymisation/__init__.py b/src/template_code_location/field_level_pseudo_anonymisation/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/template_code_location/field_level_pseudo_anonymisation/config_models/__init__.py b/src/template_code_location/field_level_pseudo_anonymisation/config_models/__init__.py deleted file mode 100644 index 60944be..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/config_models/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -from .structured_config import ( # noqa: F401 - HashConfig, - EncryptConfig, - RedactConfig, - ReplaceConfig, - PseudoTechniqueConfig, - AnonymisePseudonymizeStructuredConfig, - DecryptConfig, - DepseudoTechniqueConfig, - 
DepseudonymizeStructuredConfig, -) - -from .unstructured_config import ( # noqa: F401, F811 - HashConfig, - EncryptConfig, - RedactConfig, - ReplaceConfig, - RetainConfig, - PseudoTechniqueConfig, - AnonymisePseudonymizeUnstructuredConfig, - DecryptConfig, - DepseudoTechniqueConfig, - DepseudonymizeUnstructuredConfig, -) - -from .languages import SupportedLanguages, LanguageEnum # noqa: F401 - -from .pii_entities import PIIEntityEnum, PII_MAPPING # noqa: F401 diff --git a/src/template_code_location/field_level_pseudo_anonymisation/config_models/languages.py b/src/template_code_location/field_level_pseudo_anonymisation/config_models/languages.py deleted file mode 100644 index e3ba89e..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/config_models/languages.py +++ /dev/null @@ -1,72 +0,0 @@ -from enum import Enum -from typing import ClassVar - - -class SupportedLanguages: - LANGUAGES: ClassVar[dict[str, str]] = { - "hr": "hr_HR", # Croatian - "da": "da_DK", # Danish - "nl": "nl_NL", # Dutch - "en": "en_US", # English - "fi": "fi_FI", # Finnish - "fr": "fr_FR", # French - "de": "de_DE", # German - "el": "el_GR", # Greek - "it": "it_IT", # Italian - "lt": "lt_LT", # Lithuanian - "pl": "pl_PL", # Polish - "pt": "pt_PT", # Portuguese - "ro": "ro_RO", # Romanian - "sl": "sl_SI", # Slovenian - "es": "es_ES", # Spanish - "sv": "sv_SE", # Swedish - } - LANGUAGE_MODELS = { - "en": "en_core_web_sm", - "it": "it_core_news_sm", - "de": "de_core_news_sm", - "fr": "fr_core_news_sm", - "es": "es_core_news_sm", - "nl": "nl_core_news_sm", - "da": "da_core_news_sm", - "sv": "sv_core_news_sm", - "fi": "fi_core_news_sm", - "pl": "pl_core_news_sm", - "el": "el_core_news_sm", - "hr": "hr_core_news_sm", - "lt": "lt_core_news_sm", - "pt": "pt_core_news_sm", - "ro": "ro_core_news_sm", - "sl": "sl_core_news_sm", - } - - @classmethod - def codes(cls) -> list[str]: - return list(cls.LANGUAGES.keys()) - - @classmethod - def get_locale(cls, code: str) -> str: - return 
cls.LANGUAGES[code] - - @classmethod - def get_language_model(cls, code: str) -> str: - return cls.LANGUAGE_MODELS[code] - - -class LanguageEnum(str, Enum): - hr = "hr" - da = "da" - nl = "nl" - en = "en" - fi = "fi" - fr = "fr" - de = "de" - el = "el" - it = "it" - lt = "lt" - pl = "pl" - pt = "pt" - ro = "ro" - sl = "sl" - es = "es" - sv = "sv" diff --git a/src/template_code_location/field_level_pseudo_anonymisation/config_models/pii_entities.py b/src/template_code_location/field_level_pseudo_anonymisation/config_models/pii_entities.py deleted file mode 100644 index e730b6d..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/config_models/pii_entities.py +++ /dev/null @@ -1,24 +0,0 @@ -from enum import Enum - - -class PIIEntityEnum(str, Enum): - PERSON = "Person" - EMAIL = "Email" - CREDIT_CARD = "Credit card" - DATE_OF_BIRTH = "Date of birth" - URL = "URLs" - PHONE_NUMBERS = "Phone numbers" - CREDENTIALS = "Credentials" - X_SOCIAL = "X (formally known as Twitter) username" - - -PII_MAPPING: dict[PIIEntityEnum, str] = { - PIIEntityEnum.PERSON: "NameFilth", - PIIEntityEnum.EMAIL: "EmailFilth", - PIIEntityEnum.CREDIT_CARD: "CreditCardFilth", - PIIEntityEnum.DATE_OF_BIRTH: "DateOfBirthFilth", - PIIEntityEnum.URL: "UrlFilth", - PIIEntityEnum.PHONE_NUMBERS: "PhoneFilth", - PIIEntityEnum.CREDENTIALS: "CredentialFilth", - PIIEntityEnum.X_SOCIAL: "TwitterFilth", -} diff --git a/src/template_code_location/field_level_pseudo_anonymisation/config_models/structured_config.py b/src/template_code_location/field_level_pseudo_anonymisation/config_models/structured_config.py deleted file mode 100644 index af8abf6..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/config_models/structured_config.py +++ /dev/null @@ -1,110 +0,0 @@ -from typing import List, Literal, Optional, Union - -from dagster import Config -from pydantic import Field as PydanticField, model_validator, field_validator - - -class HashConfig(Config): - type: 
Literal["hash"] = "hash" - columns: List[str] = PydanticField(default=["example_column"], description="Columns to hash") - algorithm: str = PydanticField(default="sha256", description="Hashing algorithm") - -class EncryptConfig(Config): - type: Literal["encrypt"] = "encrypt" - columns: List[str] = PydanticField(default=["example_column"], description="Columns to encrypt") - key_name: str = PydanticField(default="my_key", description="Key identifier used for encryption") - -class RedactConfig(Config): - type: Literal["redact"] = "redact" - columns: List[str] = PydanticField(default=["example_column"], description="Columns to redact") - -class ReplaceConfig(Config): - type: Literal["replace"] = "replace" - columns: List[str] = PydanticField(default=["example_column"], description="Columns to replace") - new_value: str = PydanticField(default="REPLACED", description="Replacement value") - -class PseudoTechniqueConfig(Config): - technique: Union[HashConfig, EncryptConfig, RedactConfig, ReplaceConfig] = PydanticField( - default={"hash": HashConfig().model_dump(exclude={"type"})}, - discriminator="type" - ) - - -class AnonymisePseudonymizeStructuredConfig(Config): - used_function: List[PseudoTechniqueConfig] = PydanticField( - default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}], - description=("List of functions to be used on column"), - ) - - @model_validator(mode="after") - def ensure_unique_columns(self): - column_to_techniques = self._collect_column_to_techniques() - duplicates = { - col: techs for col, techs in column_to_techniques.items() if len(techs) > 1 - } - - if duplicates: - formatted = "; ".join( - f"{col} -> {', '.join(techs)}" for col, techs in duplicates.items() - ) - raise ValueError(f"Duplicate column(s) across techniques not allowed:\n{formatted}") - - return self - - def _collect_column_to_techniques(self): - """Extract column-to-techniques mapping from used_function list.""" - column_to_techniques = {} - for f in 
self.used_function: - technique_type, cols = self._extract_technique_and_columns(f) - for col in cols: - column_to_techniques.setdefault(col, []).append(technique_type) - return column_to_techniques - - def _extract_technique_and_columns(self, item): - """Extract technique type and columns list from a PseudoTechniqueConfig item (dict or model instance).""" - if isinstance(item, dict): - tech = item.get("technique") or {} - if isinstance(tech, dict): - if "type" in tech: - return tech.get("type"), tech.get("columns") or [] - elif len(tech) == 1: - # variant-key mapping: {'hash': {...}} - technique_type, inner = next(iter(tech.items())) - return technique_type, inner.get("columns") or [] - return None, [] - else: - # item is a PseudoTechniqueConfig instance - technique_type = item.technique.type - cols = getattr(item.technique, "columns", []) - return technique_type, cols - -class DecryptConfig(Config): - type: Literal["decrypt"] = "decrypt" - columns: List[str] = PydanticField(default=["example_column"], description="Columns to decrypt") - key_name: str = PydanticField(default="my_key", description="Key identifier used for decryption") - -class DepseudoTechniqueConfig(Config): - technique: DecryptConfig = PydanticField(default={"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}) - - -class DepseudonymizeStructuredConfig(Config): - used_function: List[DepseudoTechniqueConfig] = PydanticField( - default=[{"technique": {"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}}], - description=("Decryption functions to be used on column"), - ) - - @field_validator("used_function", mode="before") - def _normalize_depseudo_used_function(cls, v): - normalized = [] - for item in v: - if isinstance(item, dict): - normalized.append(DepseudoTechniqueConfig.model_validate(item)) - else: - normalized.append(item) - return normalized - - @model_validator(mode="after") - def ensure_unique_columns(self): - # For depseudonymize, we don't have per-column 
uniqueness constraints, - # but keep a no-op validator to preserve API parity. - return self diff --git a/src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py b/src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py deleted file mode 100644 index abea0b0..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/config_models/unstructured_config.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import List, Literal, Optional, Union - -from dagster import Config -from pydantic import Field as PydanticField, model_validator, field_validator -from .languages import LanguageEnum -from .pii_entities import PIIEntityEnum - - -class HashConfig(Config): - type: Literal["hash"] = "hash" - pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to hash") - algorithm: str = PydanticField(default="sha256", description="Hashing algorithm") - -class EncryptConfig(Config): - type: Literal["encrypt"] = "encrypt" - pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to encrypt") - key_name: str = PydanticField(default="my_key", description="Key identifier used for encryption") - - -class RedactConfig(Config): - type: Literal["redact"] = "redact" - pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to redact") - -class ReplaceConfig(Config): - type: Literal["replace"] = "replace" - pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to replace") - new_value: str = PydanticField(default="REPLACED", description="Replacement value") - -class RetainConfig(Config): - type: Literal["retain"] = "retain" - pii: List[PIIEntityEnum] = PydanticField(default=[PIIEntityEnum.EMAIL.name], description="PII entities to retain") - -class PseudoTechniqueConfig(Config): - technique: Union[HashConfig, 
EncryptConfig, RedactConfig, ReplaceConfig, RetainConfig] = PydanticField( - default={"hash": HashConfig().model_dump(exclude={"type"})}, - discriminator="type" - ) - -class AnonymisePseudonymizeUnstructuredConfig(Config): - language: LanguageEnum = PydanticField( - default=LanguageEnum.en, - description="Language code (must be one of: hr, da, nl, en, fi, fr, de, el, it, lt, pl, pt, ro, sl, es, sv)" - - ) - used_function: List[PseudoTechniqueConfig] = PydanticField( - default=[{"technique": {"hash": HashConfig().model_dump(exclude={"type"})}}], - description=("List of functions to be used on PIIs"), - ) - - @field_validator("used_function", mode="before") - def _normalize_used_function(cls, v): - normalized = [] - for item in v: - if isinstance(item, dict): - normalized.append(PseudoTechniqueConfig.model_validate(item)) - else: - normalized.append(item) - return normalized - - @model_validator(mode="after") - def ensure_unique_pii(self): - pii_to_techniques = self._collect_pii_to_techniques() - duplicates = { - pii: techs for pii, techs in pii_to_techniques.items() if len(techs) > 1 - } - - if duplicates: - formatted = "; ".join( - f"{pii} -> {', '.join(techs)}" for pii, techs in duplicates.items() - ) - raise ValueError(f"Duplicate PII(s) across techniques not allowed:\n{formatted}") - - return self - - def _collect_pii_to_techniques(self): - """Extract PII-to-techniques mapping from used_function list.""" - pii_to_techniques = {} - for f in self.used_function: - technique_type, piis = self._extract_technique_and_pii(f) - for pii in piis: - pii_to_techniques.setdefault(pii, []).append(technique_type) - return pii_to_techniques - - def _extract_technique_and_pii(self, item): - """Extract technique type and PII list from a PseudoTechniqueConfig item (dict or model instance).""" - if isinstance(item, dict): - tech = item.get("technique") or {} - if isinstance(tech, dict): - if "type" in tech: - return tech.get("type"), tech.get("pii") or tech.get("columns") or [] - 
elif len(tech) == 1: - # variant-key mapping: {'hash': {...}} - technique_type, inner = next(iter(tech.items())) - return technique_type, inner.get("pii") or inner.get("columns") or [] - return None, [] - else: - # item is a PseudoTechniqueConfig instance - technique_type = item.technique.type - piis = getattr(item.technique, "pii", []) or getattr(item.technique, "columns", []) - return technique_type, piis - -class DecryptConfig(Config): - type: Literal["decrypt"] = "decrypt" - key_name: str = PydanticField(default="my_key", description="Key identifier used for decryption") - -class DepseudoTechniqueConfig(Config): - technique: DecryptConfig = PydanticField( - default={"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}, - ) - -class DepseudonymizeUnstructuredConfig(Config): - used_function: List[DepseudoTechniqueConfig] = PydanticField( - default=[{"technique": {"type": "decrypt", **DecryptConfig().model_dump(exclude={"type"})}}], - description=("Decryption function"), - ) diff --git a/src/template_code_location/field_level_pseudo_anonymisation/jobs.py b/src/template_code_location/field_level_pseudo_anonymisation/jobs.py deleted file mode 100644 index 0f39cfb..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/jobs.py +++ /dev/null @@ -1,126 +0,0 @@ -from dagster import job -from util_services.util_ops import ( - preview_dataframe, - read_structured_to_df, - write_df_to_local, - write_string_to_unstructured, - read_unstructured_to_string, - preview_unstructured, - read_structured_from_s3, - write_df_to_s3, - read_unstructured_from_s3, - write_unstructured_to_s3, -) -from .ops import ( - anonymize_pseudonymize_structured, - depseudonymize_structured, -) -from .unstructured_ops import ( - anonymize_pseudonymize_unstructured, - depseudonymize_unstructured, -) - -@job(tags={ - "business_operation": "ANONYMISATION_PSEUDONYMISATION" -}) -def anonymise_pseudonymise_structured_job(): - df = read_structured_to_df() - 
preview_dataframe(df) - df_anon, metrics = anonymize_pseudonymize_structured(df) - preview_dataframe(df_anon) - write_df_to_local(df_anon) - - -@job(tags={ - "business_operation": "ANONYMISATION_PSEUDONYMISATION", - "resource_type": "RD_DATA" -}) -def anonymise_pseudonymise_structured_job_s3(): - df = read_structured_from_s3() - preview_dataframe(df) - df_anon, metrics = anonymize_pseudonymize_structured(df) - preview_dataframe(df_anon) - write_df_to_s3(df_anon) - - -@job(tags={ - "business_operation": "DEPSEUDONYMISATION" -}) -def depseudonymise_structured_job(): - df = read_structured_to_df() - preview_dataframe(df) - df_anon, metrics = depseudonymize_structured(df) - preview_dataframe(df_anon) - write_df_to_local(df_anon) - - -@job(tags={ - "business_operation": "DEPSEUDONYMISATION", - "resource_type": "RD_DATA" -}) -def depseudonymise_structured_job_s3(): - df = read_structured_from_s3() - preview_dataframe(df) - df_anon, metrics = depseudonymize_structured(df) - preview_dataframe(df_anon) - write_df_to_s3(df_anon) - - -@job(tags={ - "business_operation": "ANONYMISATION_PSEUDONYMISATION" -}) -def anonymise_pseudonymise_depseudonymise_structured_job(): - df = read_structured_to_df() - preview_dataframe(df) - df_pseduo, metrics = anonymize_pseudonymize_structured(df) - preview_dataframe(df_pseduo) - df_depseduo, metrics = depseudonymize_structured(df_pseduo) - preview_dataframe(df_depseduo) - - -@job(tags={ - "business_operation": "ANONYMISATION_PSEUDONYMISATION" -}) -def anonymise_pseudonymise_unstructured_job(): - text = read_unstructured_to_string() - preview_unstructured(text) - text_anon, metrics = anonymize_pseudonymize_unstructured(text) - preview_unstructured(text_anon) - preview_unstructured(metrics) - write_string_to_unstructured(text_anon) - - -@job(tags={ - "business_operation": "ANONYMISATION_PSEUDONYMISATION", - "resource_type": "RD_DATA" -}) -def anonymise_pseudonymise_unstructured_job_s3(): - text = read_unstructured_from_s3() - 
preview_unstructured(text) - text_anon, metrics = anonymize_pseudonymize_unstructured(text) - preview_unstructured(text_anon) - preview_unstructured(metrics) - write_unstructured_to_s3(text_anon) - - -@job(tags={ - "business_operation": "DEPSEUDONYMISATION" -}) -def depseudonymise_unstructured_job(): - text = read_unstructured_to_string() - preview_unstructured(text) - text_anon, metrics = depseudonymize_unstructured(text) - preview_unstructured(text_anon) - write_string_to_unstructured(text_anon) - - -@job(tags={ - "business_operation": "DEPSEUDONYMISATION", - "resource_type": "RD_DATA" -}) -def depseudonymise_unstructured_job_s3(): - text = read_unstructured_from_s3() - preview_unstructured(text) - text_anon, metrics = depseudonymize_unstructured(text) - preview_unstructured(text_anon) - write_unstructured_to_s3(text_anon) diff --git a/src/template_code_location/field_level_pseudo_anonymisation/ops.py b/src/template_code_location/field_level_pseudo_anonymisation/ops.py deleted file mode 100644 index a485ff9..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/ops.py +++ /dev/null @@ -1,77 +0,0 @@ -import pandas as pd -import numpy as np -from dagster import Out, Output, op -from cryptography.fernet import InvalidToken -from template_code_location.field_level_pseudo_anonymisation.config_models import ( - AnonymisePseudonymizeStructuredConfig, - DepseudonymizeStructuredConfig, -) -from template_code_location.field_level_pseudo_anonymisation.techniques import ( - anonymisation_pseudonymisation_techniques as anon_pseudo_funcs, -) -import template_code_location.field_level_pseudo_anonymisation.techniques.depseudonymisation_techniques as depseudo_funcs -from .utils import create_get_encryption_key - - -def _apply_column_wise_function(config, df, funcs): - for used_function in config.used_function: - func_name = used_function.technique.type - columns = used_function.technique.columns - func = getattr(funcs, func_name) - params = 
used_function.technique.model_dump() - del params["type"] - del params["columns"] - - if func_name in ["encrypt", "decrypt"]: - key_name = used_function.technique.key_name - del params["key_name"] - params["key"] = create_get_encryption_key(func_name, key_name) - - missing = [col for col in columns if col not in df.columns] - if missing: - raise ValueError( - f"The following columns required by technique '{func_name}' " - f"are not present in the DataFrame: {', '.join(missing)}" - ) - - # Skip processing if DataFrame is empty - if len(df) == 0: - continue - - for column in columns: - try: - vectorized_func = np.vectorize(lambda x: func(x, **params)) - df[column] = vectorized_func(df[column].to_numpy()) - except InvalidToken: - raise ValueError( - f"Invalid Fernet token while decrypting column '{column}' " - f"using key '{key_name}'. The data may not be encrypted " - f"or the key may be incorrect. " - ) - return df - - -@op(out={"data": Out(), "metrics": Out()}) -def anonymize_pseudonymize_structured( - context, config: AnonymisePseudonymizeStructuredConfig, df: pd.DataFrame -): - - df = _apply_column_wise_function(config, df, anon_pseudo_funcs) - yield Output( - value=df, - metadata={}, - output_name="data", - ) - yield Output(value={}, output_name="metrics") - - -@op(out={"data": Out(), "metrics": Out()}) -def depseudonymize_structured(context, config: DepseudonymizeStructuredConfig, df: pd.DataFrame): - - df = _apply_column_wise_function(config, df, depseudo_funcs) - yield Output( - value=df, - metadata={}, - output_name="data", - ) - yield Output(value={}, output_name="metrics") diff --git a/src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py b/src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py deleted file mode 100644 index 128c371..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/techniques/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from 
.anonymisation_pseudonymisation_techniques import hash, redact, replace, encrypt # noqa: F401 - -from .depseudonymisation_techniques import decrypt # noqa: F401 diff --git a/src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py b/src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py deleted file mode 100644 index ce15613..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/techniques/anonymisation_pseudonymisation_techniques.py +++ /dev/null @@ -1,42 +0,0 @@ -import hashlib -from cryptography.fernet import Fernet - - -def hash(value: str, algorithm: str = "sha256") -> str: - """ - Hash the value using the specified algorithm (default: SHA-256). - """ - value = str(value) - hash_func = hashlib.new(algorithm) - hash_func.update(value.encode("utf-8")) - return hash_func.hexdigest() - - -def redact(value: str) -> str: - """ - Redact the column and return an empty string - """ - return "" - - -def replace(value: str, new_value) -> str: - """ - Replace the value column with the provided value - """ - return new_value - - -def encrypt(value: str, key: bytes) -> str: - """ - Encrypt the value using the provided Fernet key. - """ - value = str(value) - f = Fernet(key) - return f.encrypt(value.encode()).decode() - - -def retain(value: str) -> str: - """ - Retain the original value without any changes. 
- """ - return value diff --git a/src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py b/src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py deleted file mode 100644 index 4e0937c..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/techniques/depseudonymisation_techniques.py +++ /dev/null @@ -1,9 +0,0 @@ -from cryptography.fernet import Fernet - - -def decrypt(value: str, key: bytes) -> str: - """ - Decrypt a string using the provided Fernet key. - """ - f = Fernet(key) - return f.decrypt(value.encode()).decode() diff --git a/src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py b/src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py deleted file mode 100644 index f8f0ffe..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/unstructured_ops.py +++ /dev/null @@ -1,428 +0,0 @@ -import importlib -import importlib.abc -import importlib.machinery -import re -import sys -import types - - -# --------------------------------------------------------------------------- -# Stub out the `transformers` and `spacy_transformers` packages before any -# other import triggers spaCy's entry-point scan or scrubadub_spacy's runtime -# import of spacy_transformers.pipeline_component. 
-# --------------------------------------------------------------------------- -_STUB_PACKAGES = ("transformers", "spacy_transformers") - - -class _StubModule(types.ModuleType): - """Module that returns a dummy class for any attribute access.""" - - def __getattr__(self, name: str): - return type(name, (), {}) - - -class _StubFinder(importlib.abc.MetaPathFinder): - """Intercept any import under the stubbed packages and return a stub module.""" - - def find_spec(self, fullname, path=None, target=None): # noqa: ANN001 - for pkg in _STUB_PACKAGES: - if fullname == pkg or fullname.startswith(pkg + "."): - return importlib.machinery.ModuleSpec(fullname, _StubLoader()) - return None - - -class _StubLoader(importlib.abc.Loader): - def create_module(self, spec): # noqa: ANN001 - mod = _StubModule(spec.name) - mod.__path__ = [] # mark as package - mod.__spec__ = spec - return mod - - def exec_module(self, module): # noqa: ANN001 - pass - - -# Install the finder once, before scrubadub / spacy are imported. 
-if not any(isinstance(f, _StubFinder) for f in sys.meta_path): - sys.meta_path.insert(0, _StubFinder()) -# --------------------------------------------------------------------------- - - -import scrubadub # noqa: E402 -import scrubadub_spacy # noqa: E402 -from cryptography.fernet import InvalidToken # noqa: E402 -from dagster import Out, Output, get_dagster_logger, op # noqa: E402 -from scrubadub.detectors import RegexDetector # noqa: E402 -from scrubadub.filth import CredentialFilth, NameFilth # noqa: E402 - -from template_code_location.field_level_pseudo_anonymisation.techniques import ( - anonymisation_pseudonymisation_techniques as anon_pseudo_funcs, -) -from template_code_location.field_level_pseudo_anonymisation.techniques import ( - depseudonymisation_techniques as depseudo_funcs, -) - -from .config_models import ( - PII_MAPPING, - AnonymisePseudonymizeUnstructuredConfig, - DepseudonymizeUnstructuredConfig, - PIIEntityEnum, - PseudoTechniqueConfig, - SupportedLanguages, -) -from .utils import create_get_encryption_key - - -def _initialize_scrubber(language: str) -> scrubadub.Scrubber: - class SIMPLCredentialDetector(RegexDetector): - """ - Remove username/password combinations from dirty ``text``. - """ - - filth_cls = CredentialFilth - name = "credential" - autoload = True - - regex = re.compile( - r""" - (?:username|login|u:)\s*(?::\s*)? - (?P[\w.\-@+]+) - [\s\S]{0,500}? - (?:password|pw|p:)\s*(?::\s*)? 
- (?P[^\s]+) - """, - re.MULTILINE | re.VERBOSE | re.IGNORECASE, - ) - - locale = SupportedLanguages.get_locale(language) - scrubber = scrubadub.Scrubber(locale=locale) - - model_name = SupportedLanguages.get_language_model(language) - spacy_detector = scrubadub_spacy.detectors.SpacyEntityDetector(model=model_name) - spacy_detector.named_entities = { - "PERSON", - "PER", - "ORG", - "persName", - "PRS", - } # Need to set it after the constructor because scrubadub_spacy uses upper on all entries - spacy_detector.filth_cls_map["persName"] = NameFilth # Required because PL uses persName - spacy_detector.filth_cls_map["PRS"] = NameFilth # Required for swedish that uses PRS - scrubber.add_detector(spacy_detector) - if language in ["en", "de"]: - scrubber.add_detector( - scrubadub.detectors.DateOfBirthDetector - ) # add optional data of birth detector - scrubber.remove_detector( - scrubadub.detectors.CredentialDetector - ) # remove the not so great credentials detector and replace with custom SIMPL one - scrubber.add_detector(SIMPLCredentialDetector()) - return scrubber - - -def _map_filth_to_pii_enum(filth) -> PIIEntityEnum | None: - cls_name = filth.__class__.__name__ - for pii_enum, filth_name in PII_MAPPING.items(): - if filth_name == cls_name: - return pii_enum - return None - - -def _get_metrics(metrics_dict: dict, language: str) -> str: - # Format metrics as Markdown table - metrics_report = f""" -## PII Anonymization Report - -### Summary -- **Total PII Detected**: {metrics_dict['total_pii_detected']} -- **Original Length**: {metrics_dict['text_length_original']} chars -- **Anonymized Length**: {metrics_dict['text_length_anonymised']} chars -- **Language**: {language} - -### PII by Type -| Entity Type | Count | -|-------------|-------| -""" - for pii_type, count in metrics_dict["pii_by_type"].items(): - metrics_report += f"| {pii_type} | {count} |\n" - - metrics_report += "\n### Techniques Applied\n" - for pii, technique in 
metrics_dict["techniques_applied"].items(): - metrics_report += f"- **{pii}**: {technique}\n" - - return metrics_report - - -def _build_metrics_dict( - pii_counts: dict[str, int], - text: str, - anon_text: str, - technique_map: dict[PIIEntityEnum, PseudoTechniqueConfig], -) -> dict: - metrics_dict = { - "total_pii_detected": sum(pii_counts.values()), - "pii_by_type": pii_counts, - "text_length_original": len(text), - "text_length_anonymised": len(anon_text), - "techniques_applied": { - pii.name: technique_map[pii].technique.type for pii in technique_map.keys() - }, - } - - return metrics_dict - - -@op(out={"data": Out(), "metrics": Out()}) -def anonymize_pseudonymize_unstructured( - context, config: AnonymisePseudonymizeUnstructuredConfig, text: str -): - logger = get_dagster_logger() - - if text is None or not text.strip(): - raise ValueError("Input text cannot be None or empty") - - logger.debug( - f"Starting unstructured PII anonymization | lang={config.language.value} " - f"| input_chars={len(text)}" - ) - - # --- Filth detection --- - try: - scrubber = _initialize_scrubber(config.language.value) - filths = list(scrubber.iter_filth(text)) - logger.info(f"Detected {len(filths)} potential PII entities before filtering.") - except Exception as e: - logger.error(f"Scrubber initialization/detection failed | lang={config.language.value}") - raise RuntimeError(f"PII detection failed for language '{config.language.value}'") from e - - # --- Build technique routing map --- - technique_map = _build_technique_map(config) - logger.debug( - "Technique map constructed: " - + ", ".join(f"{pii.name}->{cfg.technique.type}" for pii, cfg in technique_map.items()) - ) - - replacements = [] - key_cache = {} - pii_counts = {} - - # --- Process filths --- - for idx, filth in enumerate(filths, start=1): - pii_enum = _map_filth_to_pii_enum(filth) - - if pii_enum is None: - logger.debug(f"[{idx}] Skipping unknown filth class={filth.__class__.__name__}") - continue - - start_idx, end_idx 
= _extract_span(filth, logger, idx) - if start_idx is None: - continue - - original_value = text[start_idx:end_idx] - technique_cfg = technique_map.get(pii_enum) - - # No technique configured - if technique_cfg is None: - _handle_missing_technique( - pii_enum, - start_idx, - end_idx, - text, - pii_counts, - replacements, - logger, - idx, - ) - continue - - # Apply configured technique - t = technique_cfg.technique - params = _prepare_params(t, key_cache, idx, logger) - replacement = _apply_technique(original_value, t.type, params, pii_enum, idx, logger) - - replacements.append((start_idx, end_idx, replacement)) - pii_counts[pii_enum.name] = pii_counts.get(pii_enum.name, 0) + 1 - - # --- Apply replacements --- - anon_text = _apply_replacements(text, replacements, logger) - - logger.info(f"Anonymisation completed, total PII counts: {pii_counts}") - - metrics_report = _get_metrics( - _build_metrics_dict(pii_counts, text, anon_text, technique_map), - config.language.value, - ) - - yield Output(value=anon_text, output_name="data") - yield Output(value=metrics_report, output_name="metrics") - - -@op(out={"data": Out(), "metrics": Out()}) -def depseudonymize_unstructured(context, config: DepseudonymizeUnstructuredConfig, input_text: str): - - input_restored, metrics = _apply_depseudonimisation_function(config, input_text, depseudo_funcs) - yield Output( - value=input_restored, - metadata={}, - output_name="data", - ) - yield Output(value=metrics, output_name="metrics") - - -def _apply_depseudonimisation_function(config, input_text: str, funcs_module): - """ - Searches and depseudonymizes text segments formatted as: - {technique:pseudonymized_value} - """ - - total_depseudo_count = 0 - depseudonimized_text = input_text # Initialize with input text - - # Loop through each depseudonymisation technique defined in the config - for used_function in config.used_function: - func_name = used_function.technique.type - func = getattr(funcs_module, func_name) - pseudo_anon_func = "" 
- - # Prepare parameters - params = used_function.technique.model_dump() - del params["type"] - - if func_name == "decrypt": - key_name = used_function.technique.key_name - del params["key_name"] - pseudo_anon_func = "encrypt" - params["key"] = create_get_encryption_key(func_name, key_name) - - # Regex pattern for this technique, e.g. {encrypt:...} - pattern = rf"\{{{pseudo_anon_func}:([^}}]+)\}}" - - def replace_match(match): - nonlocal total_depseudo_count - pseudovalue = match.group(1) - total_depseudo_count += 1 - try: - return func(pseudovalue, **params) - except InvalidToken: - raise ValueError( - f"Invalid Fernet token while decrypting value using key '{key_name}'. " - f"The data may not be encrypted or the key may be incorrect." - ) - except Exception as e: - raise RuntimeError(f"Error during depseudonymisation with '{func_name}': {e}") - - # Apply replacements for this technique - depseudonimized_text = re.sub(pattern, replace_match, depseudonimized_text) - - yield depseudonimized_text - yield {"total_depseudo_count": total_depseudo_count} - - -def _build_technique_map(config): - technique_map = {} - for func_cfg in config.used_function: - for pii in func_cfg.technique.pii: - technique_map[pii] = func_cfg - return technique_map - - -def _extract_span(filth, logger, idx): - start_idx = getattr(filth, "beg", getattr(filth, "start", None)) - end_idx = getattr(filth, "end", None) - if start_idx is None or end_idx is None: - logger.debug(f"[{idx}] Filth missing span attributes; skipping.") - return None, None - return start_idx, end_idx - - -def _handle_missing_technique( - pii_enum, start_idx, end_idx, text, pii_counts, replacements, logger, idx -): - original_value = text[start_idx:end_idx] - logger.debug( - f"[{idx}] PII={pii_enum.name} span=({start_idx},{end_idx}) value={original_value} " - f"- No technique configured, using placeholder" - ) - placeholder = f"{{{{{pii_enum.name}}}}}" - replacements.append((start_idx, end_idx, placeholder)) - 
pii_counts[pii_enum.name] = pii_counts.get(pii_enum.name, 0) + 1 - - -def _prepare_params(t, key_cache, idx, logger): - params = t.model_dump() - del params["type"] - del params["pii"] - - if t.type == "encrypt": - try: - if t.key_name not in key_cache: - logger.debug( - f"[{idx}] Retrieving/generating Vault key name={t.key_name} for encryption" - ) - key_cache[t.key_name] = create_get_encryption_key("encrypt", t.key_name) - params["key"] = key_cache[t.key_name] - del params["key_name"] - logger.debug(f"[{idx}] Encryption key prepared") - except Exception as e: - raise RuntimeError( - f"Encryption key retrieval failed for key '{t.key_name}': {type(e).__name__}" - ) from e - - return params - - -def _apply_technique(original_value, t_type, params, pii_enum, idx, logger): - try: - func = getattr(anon_pseudo_funcs, t_type) - replacement = func(original_value, **params) - - if t_type == "encrypt": - replacement = f"{{encrypt:{replacement}}}" - - logger.debug(f"[{idx}] {t_type.capitalize()} complete") - return replacement - - except AttributeError: - logger.warning(f"[{idx}] Technique '{t_type}' not recognized; inserting placeholder.") - return f"{{UNIMPL_{t_type}_{pii_enum.name}}}" - - except Exception as e: - raise RuntimeError( - f"Technique '{t_type}' failed for PII type '{pii_enum.name}': {type(e).__name__}" - ) from e - - -def _apply_replacements(text, replacements, logger): - if not replacements: - logger.info("No PII detected; returning original text.") - return text - - logger.debug(f"Applying {len(replacements)} replacements to text body.") - replacements.sort(key=lambda r: r[0]) - - # Detect overlaps - for i in range(len(replacements) - 1): - if replacements[i][1] > replacements[i + 1][0]: - logger.warning( - f"Overlapping PII detected at positions " - f"({replacements[i][0]},{replacements[i][1]}) " - f"and ({replacements[i+1][0]},{replacements[i+1][1]}). " - f"Using first match." 
- ) - replacements[i + 1] = ( - replacements[i][1], - replacements[i + 1][1], - replacements[i + 1][2], - ) - - result_parts = [] - last = 0 - for start, end, repl in replacements: - if start < last: - continue - result_parts.append(text[last:start]) - result_parts.append(repl) - last = end - - result_parts.append(text[last:]) - return "".join(result_parts) diff --git a/src/template_code_location/field_level_pseudo_anonymisation/utils.py b/src/template_code_location/field_level_pseudo_anonymisation/utils.py deleted file mode 100644 index 25ebd75..0000000 --- a/src/template_code_location/field_level_pseudo_anonymisation/utils.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -import hvac -from hvac.exceptions import InvalidPath -from cryptography.fernet import Fernet - - -def create_get_encryption_key(func_name: str, key_name: str) -> bytes: - client = hvac.Client(url=os.getenv("OPENBAO_URL"), token=os.getenv("OPENBAO_TOKEN")) - - secret_folder = os.getenv("ENCRYPTION_KEYS_PATH") - secret_path = f"{secret_folder}/{key_name}" if secret_folder else key_name - mount_point = os.getenv("ENCRYPTION_KEYS_MOUNT_POINT") - - try: - secret_response = client.secrets.kv.v2.read_secret_version( - path=secret_path, mount_point=mount_point - ) - key_value = secret_response["data"]["data"]["value"] - - except InvalidPath: - if func_name == "encrypt": - new_key = Fernet.generate_key().decode() - client.secrets.kv.v2.create_or_update_secret( - path=secret_path, mount_point=mount_point, secret={"value": new_key} - ) - key_value = new_key - else: - raise ValueError(f"Fernet key '{key_name}' not found in Vault for decrypt.") - except Exception as e: - raise ValueError(f"Error while reading Fernet key '{key_name}': {e}") - - return key_value.encode() diff --git a/src/template_code_location/repository.py b/src/template_code_location/repository.py index 1d0be85..94f3746 100644 --- a/src/template_code_location/repository.py +++ b/src/template_code_location/repository.py @@ -8,7 +8,7 @@ from 
util_services.sensors import ( from util_services.custom_json_logger import simpl_json_logger # Data processing jobs -from template_code_location.data_processing.jobs import ( +from data_processing.jobs import ( remove_duplicates_job_s3, fill_missing_values_job_s3, standardize_categorical_values_job_s3, @@ -21,7 +21,7 @@ from template_code_location.data_processing.jobs import ( ) # Dataframe-level anonymisation jobs -from template_code_location.dataframe_level_anonymisation.jobs import ( +from dataframe_level_anonymisation.jobs import ( k_anonymity_job_s3, l_diversity_job_s3, t_closeness_job_s3, @@ -29,7 +29,7 @@ from template_code_location.dataframe_level_anonymisation.jobs import ( ) # Field-level pseudo-anonymisation jobs -from template_code_location.field_level_pseudo_anonymisation.jobs import ( +from field_level_pseudo_anonymisation.jobs import ( anonymise_pseudonymise_structured_job_s3, depseudonymise_structured_job_s3, anonymise_pseudonymise_unstructured_job_s3, diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data_processing/__init__.py b/tests/data_processing/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/tests/data_processing/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/data_processing/conftest.py b/tests/data_processing/conftest.py deleted file mode 100644 index 9eda2af..0000000 --- a/tests/data_processing/conftest.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Pytest configuration and shared fixtures.""" - -import pytest -import pandas as pd -from unittest.mock import MagicMock, patch -import sys -from dagster import build_op_context - -# Mock external dependencies that might not be available in test environment -sys.modules['spellchecker'] = MagicMock() - - -@pytest.fixture -def mock_context(): - """Create a mock Dagster context for testing operations.""" - context = build_op_context() - return context - - 
-@pytest.fixture -def sample_dataframe(): - """Create a sample DataFrame for testing.""" - return pd.DataFrame({ - 'Name': ['John Doe', 'jane smith', 'John Doe', 'bob johnson', 'John Doe'], - 'Age': [25, 30, 25, None, 25], - 'City': ['New York', 'los angeles', 'New York', 'chicago', 'New York'], - 'Status': ['Active', 'INACTIVE', 'Active', 'penDing', 'Active'] - }) - - -@pytest.fixture -def sample_dataframe_with_typos(): - """Create a sample DataFrame with typos for spell checking.""" - return pd.DataFrame({ - 'Name': ['jon doe', 'jane smith', 'bob jonson'], - 'Description': ['developer', 'analst', 'enginer'] - }) - - -@pytest.fixture -def empty_dataframe(): - """Create an empty DataFrame.""" - return pd.DataFrame() - - -@pytest.fixture -def dataframe_with_missing_values(): - """Create a DataFrame with various missing values.""" - return pd.DataFrame({ - 'Column1': [1, None, 3, None, 5], - 'Column2': ['a', 'b', None, 'd', None], - 'Column3': [None, None, None, None, None] - }) diff --git a/tests/data_processing/conftest_utils.py b/tests/data_processing/conftest_utils.py deleted file mode 100644 index 19d2f59..0000000 --- a/tests/data_processing/conftest_utils.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Configuration utilities for testing.""" - -import os -import sys - -# Add src directory to path for imports -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) diff --git a/tests/data_processing/test_config_models.py b/tests/data_processing/test_config_models.py deleted file mode 100644 index 989054f..0000000 --- a/tests/data_processing/test_config_models.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Unit tests for configuration models.""" - -import pytest -from pydantic import ValidationError -from template_code_location.data_processing.config_models import ( - FillMissingConfiguration, - ColumnsSelectConfiguration, - SpellCheckConfiguration, - AggregationConfiguration -) - - -class TestColumnsSelectConfiguration: - """Tests for 
ColumnsSelectConfiguration.""" - - def test_default_columns(self): - """Test default columns configuration.""" - config = ColumnsSelectConfiguration() - assert config.columns == ['Name'] - - def test_custom_columns(self): - """Test custom columns configuration.""" - config = ColumnsSelectConfiguration(columns=['Col1', 'Col2', 'Col3']) - assert config.columns == ['Col1', 'Col2', 'Col3'] - - def test_empty_columns_list(self): - """Test with empty columns list.""" - config = ColumnsSelectConfiguration(columns=[]) - assert config.columns == [] - - def test_single_column(self): - """Test with a single column.""" - config = ColumnsSelectConfiguration(columns=['SingleCol']) - assert config.columns == ['SingleCol'] - - def test_columns_with_special_characters(self): - """Test columns with special characters.""" - config = ColumnsSelectConfiguration(columns=['Col-1', 'Col_2', 'Col.3']) - assert config.columns == ['Col-1', 'Col_2', 'Col.3'] - - def test_duplicate_columns_are_removed(self): - """Verifica che i duplicati vengano rimossi mantenendo l'ordine (grazie a dict.fromkeys).""" - config = ColumnsSelectConfiguration(columns=['A', 'B', 'A', 'C', 'B']) - - assert config.columns == ['A', 'B', 'C'] - - def test_duplicate_default_behavior(self): - """Verifica che anche input estremi vengano gestiti correttamente.""" - config = ColumnsSelectConfiguration(columns=['Name', 'Name', 'Name']) - assert config.columns == ['Name'] - - -class TestFillMissingConfiguration: - """Tests for FillMissingConfiguration.""" - - def test_default_fill_map(self): - """Test default fill map configuration.""" - config = FillMissingConfiguration() - - assert config.fill_map == {'Age': 'UNKNOWN_AGE'} - - def test_custom_fill_map(self): - """Test custom fill map configuration.""" - fill_map = {'Age': '0', 'Name': 'UNKNOWN', 'City': 'N/A'} - config = FillMissingConfiguration(fill_map=fill_map) - - assert config.fill_map == fill_map - - def test_empty_fill_map(self): - """Test with empty fill map.""" - 
config = FillMissingConfiguration(fill_map={}) - - assert config.fill_map == {} - - def test_fill_map_with_numeric_values(self): - """Test fill map with numeric string values.""" - fill_map = {'Age': '0', 'Score': '-1', 'Count': '999'} - config = FillMissingConfiguration(fill_map=fill_map) - - assert config.fill_map == fill_map - - def test_fill_map_with_string_values(self): - """Test fill map with string values.""" - fill_map = {'Name': 'Unknown', 'Email': 'no-email'} - config = FillMissingConfiguration(fill_map=fill_map) - - assert config.fill_map == fill_map - - def test_fill_map_mixed_types(self): - """Test fill map with mixed value types (all strings).""" - fill_map = {'IntCol': '0', 'StrCol': 'Unknown', 'FloatCol': '0.0'} - config = FillMissingConfiguration(fill_map=fill_map) - - assert config.fill_map == fill_map - - -class TestSpellCheckConfiguration: - """Tests for SpellCheckConfiguration.""" - - def test_default_spell_check_config(self): - """Test default spell check configuration.""" - config = SpellCheckConfiguration() - - assert config.columns == ['Name'] - assert config.language == 'en' - - def test_custom_spell_check_config(self): - """Test custom spell check configuration.""" - config = SpellCheckConfiguration( - columns=['Description', 'Notes'], - language='es' - ) - - assert config.columns == ['Description', 'Notes'] - assert config.language == 'es' - - def test_spell_check_all_languages(self): - """Test spell check with all supported languages.""" - supported_languages = ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl'] - - for lang in supported_languages: - config = SpellCheckConfiguration(language=lang) - assert config.language == lang - - def test_spell_check_invalid_language(self): - """Test spell check with invalid language.""" - with pytest.raises(ValidationError): - SpellCheckConfiguration(language='invalid') - - def test_spell_check_multiple_columns(self): - """Test spell check with multiple columns.""" - columns = ['Col1', 'Col2', 'Col3', 
'Col4'] - config = SpellCheckConfiguration(columns=columns) - - assert config.columns == columns - - def test_spell_check_empty_columns(self): - """Test spell check with empty columns list.""" - config = SpellCheckConfiguration(columns=[]) - - assert config.columns == [] - assert config.language == 'en' - - def test_spell_check_inheritance(self): - """Test that SpellCheckConfiguration inherits from ColumnsSelectConfiguration.""" - config = SpellCheckConfiguration() - - assert isinstance(config, ColumnsSelectConfiguration) - assert hasattr(config, 'columns') - assert hasattr(config, 'language') - - @pytest.mark.parametrize("language", ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl']) - def test_spell_check_languages_parametrized(self, language): - """Test spell check with parametrized languages.""" - config = SpellCheckConfiguration(language=language) - assert config.language == language - -class TestAggregationConfiguration: - """Tests for AggregationConfiguration.""" - - def test_aggregation_default_config(self): - """Test default aggregation configuration.""" - config = AggregationConfiguration() - - assert config.columns == ['Name'] - assert config.operation == 'sum' - - @pytest.mark.parametrize("op", ["sum", "mean", "min", "max", "count"]) - def test_aggregation_valid_operations(self, op): - """Test all allowed aggregation operations.""" - config = AggregationConfiguration(operation=op) - assert config.operation == op - - def test_aggregation_invalid_operation(self): - """Test that an invalid operation raises a ValidationError.""" - with pytest.raises(ValidationError) as excinfo: - AggregationConfiguration(operation="invalid_op") - - assert "Invalid aggregation operation 'invalid_op'" in str(excinfo.value) - - def test_aggregation_custom_columns(self): - """Test aggregation with custom columns.""" - config = AggregationConfiguration(columns=['Price', 'Quantity'], operation='mean') - - assert config.columns == ['Price', 'Quantity'] - assert config.operation == 'mean' 
- - def test_aggregation_inheritance(self): - """Test that AggregationConfiguration inherits from ColumnsSelectConfiguration.""" - config = AggregationConfiguration() - - assert isinstance(config, ColumnsSelectConfiguration) - assert hasattr(config, 'columns') - assert hasattr(config, 'operation') - - def test_aggregation_model_dump(self): - """Test that model_dump contains all expected fields (useful for the Dagster op).""" - config = AggregationConfiguration(columns=['Value'], operation='max') - dump = config.model_dump() - - assert dump['columns'] == ['Value'] - assert dump['operation'] == 'max' diff --git a/tests/data_processing/test_integration.py b/tests/data_processing/test_integration.py deleted file mode 100644 index c9d01eb..0000000 --- a/tests/data_processing/test_integration.py +++ /dev/null @@ -1,185 +0,0 @@ -"""Integration tests for data processing jobs.""" - -import pytest -import pandas as pd -from unittest.mock import patch, MagicMock -from template_code_location.data_processing.ops import ( - remove_duplicates, - fill_missing_values, - standardize_categorical_values, - correct_typos -) -from template_code_location.data_processing.config_models import ( - FillMissingConfiguration, - ColumnsSelectConfiguration, - SpellCheckConfiguration -) - - -class TestPipelineIntegration: - """Integration tests for data processing pipeline.""" - - def test_pipeline_remove_duplicates_then_standardize(self, mock_context): - """Test pipeline: remove duplicates then standardize.""" - df = pd.DataFrame({ - 'Name': [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson'], - 'City': ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago'] - }) - - # Step 1: Remove duplicates - df_no_dupes = remove_duplicates(mock_context, df) - assert df_no_dupes.shape[0] == 3 - - # Step 2: Standardize - config = ColumnsSelectConfiguration(columns=['Name', 'City']) - df_standardized = standardize_categorical_values(mock_context, config, df_no_dupes) - - assert df_standardized['Name'].iloc[0] 
== 'john doe' - assert df_standardized['City'].iloc[0] == 'new york' - - def test_pipeline_fill_missing_then_standardize(self, mock_context): - """Test pipeline: fill missing values then standardize.""" - df = pd.DataFrame({ - 'Category': [' ACTIVE ', None, ' PENDING '], - 'Value': ['1', '2', None] - }) - - # Step 1: Fill missing values - fill_config = FillMissingConfiguration(fill_map={'Value': '0'}) - df_filled = fill_missing_values(mock_context, fill_config, df) - - # Step 2: Standardize - std_config = ColumnsSelectConfiguration(columns=['Category']) - df_standardized = standardize_categorical_values(mock_context, std_config, df_filled) - - assert df_standardized['Category'].iloc[0] == 'active' - assert df_filled['Value'].iloc[2] == '0' - - def test_pipeline_all_operations(self, mock_context): - """Test complete pipeline with all operations.""" - df = pd.DataFrame({ - 'Name': [' john doe ', 'JANE SMITH', ' john doe ', None], - 'Value': ['1', None, '1', '2'] - }) - - # Step 1: Remove duplicates - df = remove_duplicates(mock_context, df) - assert df.shape[0] == 3 - - # Step 2: Fill missing - fill_config = FillMissingConfiguration(fill_map={'Value': '0'}) - df = fill_missing_values(mock_context, fill_config, df) - assert df['Value'].isna().sum() == 0 - - # Step 3: Standardize - std_config = ColumnsSelectConfiguration(columns=['Name']) - df = standardize_categorical_values(mock_context, std_config, df) - - assert df['Name'].iloc[0] == 'john doe' - - def test_pipeline_with_large_dataset(self, mock_context): - """Test pipeline performance with larger dataset.""" - # Create larger dataset - size = 1000 - df = pd.DataFrame({ - 'ID': list(range(size)), - 'Name': ['User_' + str(i % 50) for i in range(size)], - 'Status': ['ACTIVE', 'INACTIVE', 'PENDING'] * (size // 3) + ['ACTIVE'] * (size % 3), - 'Score': [i % 100 for i in range(size)] - }) - - # Add some duplicates - df = pd.concat([df, df.head(100)], ignore_index=True) - - # Process - df_cleaned = 
remove_duplicates(mock_context, df) - - assert df_cleaned.shape[0] == 1000 - assert df_cleaned.shape[1] == 4 - - -class TestErrorHandling: - """Tests for error handling and edge cases.""" - - def test_operation_with_corrupted_data(self, mock_context): - """Test operations with corrupted/unusual data.""" - df = pd.DataFrame({ - 'Col': [float('nan'), float('inf'), -float('inf'), 0, 1, 2] - }) - - # Should handle special float values - result = remove_duplicates(mock_context, df) - assert result.shape[0] > 0 - - def test_operation_preserves_index(self, mock_context): - """Test that index is handled correctly.""" - df = pd.DataFrame( - {'Col': [1, 2, 1, 3]}, - index=['a', 'b', 'c', 'd'] - ) - - result = remove_duplicates(mock_context, df) - # Index may be reset, so just check shape - assert result.shape[0] == 3 - - def test_standardize_with_unicode_characters(self, mock_context): - """Test standardization with unicode characters.""" - df = pd.DataFrame({ - 'Name': ['José', 'François', 'Müller'] - }) - - config = ColumnsSelectConfiguration(columns=['Name']) - result = standardize_categorical_values(mock_context, config, df) - - # Should handle unicode correctly - assert result.shape[0] == 3 - - def test_fill_with_same_key_multiple_times(self, mock_context): - """Test filling when fill_map has multiple entries.""" - df = pd.DataFrame({ - 'A': ['1', None, '3'], - 'B': [None, None, 'c'], - 'C': [None, '2', None] - }) - - config = FillMissingConfiguration(fill_map={ - 'A': '-1', - 'B': 'EMPTY', - 'C': '0' - }) - - result = fill_missing_values(mock_context, config, df) - - assert result.loc[1, 'A'] == '-1' - assert result.loc[0, 'B'] == 'EMPTY' - assert result.loc[0, 'C'] == '0' - - -class TestDataTypePreservation: - """Tests to ensure data types are preserved appropriately.""" - - def test_remove_duplicates_preserves_dtypes(self, mock_context): - """Test that remove_duplicates preserves column data types.""" - df = pd.DataFrame({ - 'int32': pd.array([1, 2, 1], 
dtype='int32'), - 'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'), - 'str': ['a', 'b', 'a'] - }) - - result = remove_duplicates(mock_context, df) - - assert result['int32'].dtype == df['int32'].dtype - assert result['float64'].dtype == df['float64'].dtype - - def test_fill_missing_preserves_column_types_where_possible(self, mock_context): - """Test that fill_missing handles type preservation.""" - df = pd.DataFrame({ - 'A': pd.array(['1', None, '3'], dtype='string'), - 'B': ['x', 'y', 'z'] - }) - - config = FillMissingConfiguration(fill_map={'A': '0'}) - result = fill_missing_values(mock_context, config, df) - - assert result['A'].loc[1] == '0' - assert result['B'].dtype == df['B'].dtype diff --git a/tests/data_processing/test_jobs.py b/tests/data_processing/test_jobs.py deleted file mode 100644 index 5373f7c..0000000 --- a/tests/data_processing/test_jobs.py +++ /dev/null @@ -1,56 +0,0 @@ -from template_code_location.data_processing.jobs import ( - remove_duplicates_job_s3, - fill_missing_values_job_s3, - standardize_categorical_values_job_s3, - correct_typos_job_s3, - normalize_numeric_min_max_job_s3, - normalize_datetime_job_s3, - normalize_coordinates_job_s3, - add_global_aggregations_job_s3 -) - - -def test_remove_duplicates_job_s3_is_callable(): - """Test remove_duplicates_job_s3 is a valid Dagster job""" - assert callable(remove_duplicates_job_s3) - assert hasattr(remove_duplicates_job_s3, 'execute_in_process') - - -def test_fill_missing_values_job_s3_is_callable(): - """Test fill_missing_values_job_s3 is a valid Dagster job""" - assert callable(fill_missing_values_job_s3) - assert hasattr(fill_missing_values_job_s3, 'execute_in_process') - - -def test_standardize_categorical_values_job_s3_is_callable(): - """Test standardize_categorical_values_job_s3 is a valid Dagster job""" - assert callable(standardize_categorical_values_job_s3) - assert hasattr(standardize_categorical_values_job_s3, 'execute_in_process') - - -def 
test_correct_typos_job_s3_is_callable(): - """Test correct_typos_job_s3 is a valid Dagster job""" - assert callable(correct_typos_job_s3) - assert hasattr(correct_typos_job_s3, 'execute_in_process') - - -def test_normalize_numeric_min_max_job_s3_is_callable(): - """Test normalize_numeric_min_max_job_s3 is a valid Dagster job""" - assert callable(normalize_numeric_min_max_job_s3) - assert hasattr(normalize_numeric_min_max_job_s3, 'execute_in_process') - - -def test_normalize_datetime_job_s3_is_callable(): - """Test normalize_datetime_job_s3 is a valid Dagster job""" - assert callable(normalize_datetime_job_s3) - assert hasattr(normalize_datetime_job_s3, 'execute_in_process') - -def test_normalize_coordinates_job_s3_is_callable(): - """Test normalize_coordinates_job_s3 is a valid Dagster job""" - assert callable(normalize_coordinates_job_s3) - assert hasattr(normalize_coordinates_job_s3, 'execute_in_process') - -def test_add_global_aggregations_job_s3_is_callable(): - """Test add_global_aggregations_job_s3 is a valid Dagster job""" - assert callable(add_global_aggregations_job_s3) - assert hasattr(add_global_aggregations_job_s3, 'execute_in_process') diff --git a/tests/data_processing/test_ops.py b/tests/data_processing/test_ops.py deleted file mode 100644 index def913b..0000000 --- a/tests/data_processing/test_ops.py +++ /dev/null @@ -1,700 +0,0 @@ -"""Unit tests for data processing operations.""" - -import pytest -import pandas as pd -from template_code_location.data_processing.ops import ( - remove_duplicates, - fill_missing_values, - standardize_categorical_values, - correct_typos, - normalize_datetime, - normalize_numeric_min_max, - normalize_coordinates, - add_global_aggregations -) -from template_code_location.data_processing.config_models import ( - FillMissingConfiguration, - ColumnsSelectConfiguration, - SpellCheckConfiguration, - AggregationConfiguration, - CoordinatesNormalizationConfiguration -) - - -class TestRemoveDuplicates: - """Tests for the 
remove_duplicates operation.""" - - def test_remove_duplicates_basic(self, mock_context, sample_dataframe): - """Test basic duplicate removal.""" - result = remove_duplicates(mock_context, sample_dataframe) - - # Should have 3 unique rows (john doe appears 3x, jane smith 1x, bob johnson 1x) - assert result.shape[0] == 3 - assert len(result) < len(sample_dataframe) - - def test_remove_duplicates_no_duplicates(self, mock_context): - """Test remove_duplicates when there are no duplicates.""" - df = pd.DataFrame({ - 'A': [1, 2, 3], - 'B': ['x', 'y', 'z'] - }) - result = remove_duplicates(mock_context, df) - - assert result.shape[0] == 3 - pd.testing.assert_frame_equal(result, df) - - def test_remove_duplicates_all_duplicates(self, mock_context): - """Test remove_duplicates when all rows are identical.""" - df = pd.DataFrame({ - 'A': [1, 1, 1], - 'B': ['x', 'x', 'x'] - }) - result = remove_duplicates(mock_context, df) - - assert result.shape[0] == 1 - - def test_remove_duplicates_empty_dataframe(self, mock_context, empty_dataframe): - """Test remove_duplicates with empty DataFrame.""" - result = remove_duplicates(mock_context, empty_dataframe) - - assert result.shape[0] == 0 - assert result.shape[1] == 0 - - def test_remove_duplicates_preserves_data_types(self, mock_context): - """Test that remove_duplicates preserves data types.""" - df = pd.DataFrame({ - 'int_col': [1, 2, 1], - 'str_col': ['a', 'b', 'a'], - 'float_col': [1.5, 2.5, 1.5] - }) - result = remove_duplicates(mock_context, df) - - assert result['int_col'].dtype == df['int_col'].dtype - assert result['str_col'].dtype == df['str_col'].dtype - assert result['float_col'].dtype == df['float_col'].dtype - - -class TestFillMissingValues: - """Tests for the fill_missing_values operation.""" - - def test_fill_missing_values_basic(self, mock_context, dataframe_with_missing_values): - """Test basic missing value filling.""" - config = FillMissingConfiguration(fill_map={'Column1': '0', 'Column2': 'N/A'}) - result = 
fill_missing_values(mock_context, config, dataframe_with_missing_values) - - # Check that no NaN values remain - assert result['Column1'].isna().sum() == 0 - assert result['Column2'].isna().sum() == 0 - - def test_fill_missing_values_with_different_values(self, mock_context): - """Test filling with different replacement values.""" - df = pd.DataFrame({ - 'A': [1, None, 3], - 'B': [None, 'b', 'c'] - }) - config = FillMissingConfiguration(fill_map={'A': '-1', 'B': 'UNKNOWN'}) - result = fill_missing_values(mock_context, config, df) - - assert result.loc[1, 'A'] == '-1' - assert result.loc[0, 'B'] == 'UNKNOWN' - - def test_fill_missing_values_partial_columns(self, mock_context): - """Test filling only specified columns.""" - df = pd.DataFrame({ - 'A': [1, None, 3], - 'B': [None, 'b', 'c'] - }) - config = FillMissingConfiguration(fill_map={'A': '999'}) - result = fill_missing_values(mock_context, config, df) - - assert result.loc[1, 'A'] == '999' - assert pd.isna(result.loc[0, 'B']) # B should still have NaN - - def test_fill_missing_values_no_missing(self, mock_context): - """Test when there are no missing values.""" - df = pd.DataFrame({ - 'A': ['1', '2', '3'], - 'B': ['a', 'b', 'c'] - }) - config = FillMissingConfiguration(fill_map={'A': '0'}) - result = fill_missing_values(mock_context, config, df) - - pd.testing.assert_frame_equal(result, df) - - def test_fill_missing_values_empty_dataframe(self, mock_context, empty_dataframe): - """Test with empty DataFrame.""" - config = FillMissingConfiguration(fill_map={}) - result = fill_missing_values(mock_context, config, empty_dataframe) - - assert result.shape[0] == 0 - - -class TestStandardizeCategoricalValues: - """Tests for the standardize_categorical_values operation.""" - - def test_standardize_categorical_basic(self, mock_context, sample_dataframe): - """Test basic categorical standardization.""" - config = ColumnsSelectConfiguration(columns=['Name', 'City', 'Status']) - result = 
standardize_categorical_values(mock_context, config, sample_dataframe) - - # Check that values are lowercase and stripped - assert result['Name'].iloc[0] == 'john doe' - assert result['City'].iloc[1] == 'los angeles' - assert result['Status'].iloc[1] == 'inactive' - - def test_standardize_categorical_single_column(self, mock_context): - """Test standardization on a single column.""" - df = pd.DataFrame({ - 'City': [' NEW YORK ', 'LOS ANGELES', ' chicago '] - }) - config = ColumnsSelectConfiguration(columns=['City']) - result = standardize_categorical_values(mock_context, config, df) - - assert result['City'].iloc[0] == 'new york' - assert result['City'].iloc[1] == 'los angeles' - assert result['City'].iloc[2] == 'chicago' - - def test_standardize_categorical_missing_column(self, mock_context, sample_dataframe): - """Test with non-existent column (should skip).""" - config = ColumnsSelectConfiguration(columns=['NonExistent', 'Name']) - result = standardize_categorical_values(mock_context, config, sample_dataframe) - - # Should process 'Name' column without error - assert result['Name'].iloc[0] == 'john doe' - - def test_standardize_categorical_with_missing_values(self, mock_context): - """Test standardization with missing values.""" - df = pd.DataFrame({ - 'Category': [' ACTIVE ', None, ' pending '] - }) - config = ColumnsSelectConfiguration(columns=['Category']) - result = standardize_categorical_values(mock_context, config, df) - - assert result['Category'].iloc[0] == 'active' - assert result['Category'].iloc[1] == '' - assert result['Category'].iloc[2] == 'pending' - - def test_standardize_categorical_empty_dataframe(self, mock_context, empty_dataframe): - """Test with empty DataFrame.""" - config = ColumnsSelectConfiguration(columns=['A', 'B']) - result = standardize_categorical_values(mock_context, config, empty_dataframe) - - assert result.shape[0] == 0 - - def test_standardize_categorical_numeric_columns(self, mock_context): - """Test that numeric columns are 
converted to strings.""" - df = pd.DataFrame({ - 'NumCol': [1, 2, 3] - }) - config = ColumnsSelectConfiguration(columns=['NumCol']) - result = standardize_categorical_values(mock_context, config, df) - - assert result['NumCol'].iloc[0] == '1' - assert isinstance(result['NumCol'].iloc[0], str) - - -class TestCorrectTypos: - """Tests for the correct_typos operation.""" - - def test_correct_typos_basic(self, mock_context): - """Test basic typo correction.""" - df = pd.DataFrame({ - 'Name': ['jon', 'jayne', 'bob'] - }) - config = SpellCheckConfiguration(columns=['Name'], language='en') - result = correct_typos(mock_context, config, df) - - # Result should have corrections applied - assert result.shape[0] == 3 - - def test_correct_typos_missing_column(self, mock_context): - """Test with non-existent column (should skip).""" - df = pd.DataFrame({ - 'Name': ['jon', 'jayne'] - }) - config = SpellCheckConfiguration(columns=['NonExistent'], language='en') - result = correct_typos(mock_context, config, df) - - # Should not raise error, just skip - pd.testing.assert_frame_equal(result, df) - - def test_correct_typos_with_missing_values(self, mock_context): - """Test typo correction with missing values.""" - df = pd.DataFrame({ - 'Text': ['helo', '', 'wrld'] - }) - config = SpellCheckConfiguration(columns=['Text'], language='en') - result = correct_typos(mock_context, config, df) - - # Empty strings should be preserved - assert result.loc[1, 'Text'] == '' - - def test_correct_typos_empty_dataframe(self, mock_context, empty_dataframe): - """Test with empty DataFrame.""" - config = SpellCheckConfiguration(columns=['A'], language='en') - result = correct_typos(mock_context, config, empty_dataframe) - - assert result.shape[0] == 0 - - def test_correct_typos_different_languages(self, mock_context): - """Test typo correction with different languages.""" - df = pd.DataFrame({ - 'Text': ['ciao', 'mondo'] - }) - - for lang in ['en', 'es', 'it']: - config = 
SpellCheckConfiguration(columns=['Text'], language=lang) - result = correct_typos(mock_context, config, df) - - # Should process without error - assert result.shape[0] == 2 - - def test_correct_typos_numeric_values(self, mock_context): - """Test typo correction on numeric values converted to strings.""" - df = pd.DataFrame({ - 'Values': [123, 456, 789] - }) - config = SpellCheckConfiguration(columns=['Values'], language='en') - result = correct_typos(mock_context, config, df) - - # Numeric values should be converted to string and processed - assert result.shape[0] == 3 - -class TestNormalizeDatetime: - """Tests for the normalize_datetime operation.""" - - def test_normalize_datetime_basic(self, mock_context): - """Test basic datetime normalization to ISO format.""" - df = pd.DataFrame({ - 'date_col': ['2023-01-01 10:00:00', '2023-12-31T23:59:59'] - }) - - config = ColumnsSelectConfiguration(columns=['date_col']) - - result = normalize_datetime(mock_context, config, df.copy()) - - assert 'date_col_iso' in result.columns - assert result['date_col_iso'].iloc[0] == '2023-01-01T10:00:00Z' - assert result['date_col_iso'].iloc[1] == '2023-12-31T23:59:59Z' - - def test_normalize_datetime_missing_column(self, mock_context, sample_dataframe): - """Test behavior when a configured column is missing in the DataFrame.""" - config = ColumnsSelectConfiguration(columns=['non_existent_column']) - - result = normalize_datetime(mock_context, config, sample_dataframe.copy()) - - pd.testing.assert_frame_equal(result, sample_dataframe) - - def test_normalize_datetime_unparseable_values(self, mock_context): - """Test column with values that cannot be parsed as dates.""" - df = pd.DataFrame({ - 'invalid_col': ['not-a-date', 'completely-random-text'] - }) - config = ColumnsSelectConfiguration(columns=['invalid_col']) - - result = normalize_datetime(mock_context, config, df.copy()) - - assert 'invalid_col_iso' not in result.columns - - def test_normalize_datetime_mixed_and_nulls(self, 
mock_context): - """Test column with mixed valid dates, invalid dates, and NaNs.""" - df = pd.DataFrame({ - 'mixed_col': ['2023-05-01', None, 'invalid-date'] - }) - config = ColumnsSelectConfiguration(columns=['mixed_col']) - - result = normalize_datetime(mock_context, config, df.copy()) - - assert 'mixed_col_iso' in result.columns - assert result['mixed_col_iso'].iloc[0] == '2023-05-01T00:00:00Z' - - assert result['mixed_col_iso'].iloc[1] == "" - assert result['mixed_col_iso'].iloc[2] == "" - - def test_normalize_datetime_empty_dataframe(self, mock_context, empty_dataframe): - """Test with an empty DataFrame.""" - config = ColumnsSelectConfiguration(columns=['some_col']) - - result = normalize_datetime(mock_context, config, empty_dataframe) - - assert result.empty - - def test_normalize_datetime_epoch_only(self, mock_context, capsys): - """If parsing a column yields only the Unix epoch date, it should be skipped.""" - df = pd.DataFrame({ - 'weird_col': ['0', 0, '0000', ''] - }) - - config = ColumnsSelectConfiguration(columns=['weird_col']) - - result = normalize_datetime(mock_context, config, df.copy()) - - assert 'weird_col_iso' not in result.columns - - captured = capsys.readouterr() - assert "all normalized values are '1970-01-01'" in captured.err - - def test_normalize_datetime_all_1970_skipped(self, mock_context, capsys): - """If all formatted values are '1970-01-01', the column should be skipped with a warning.""" - df = pd.DataFrame({ - 'ts_col': ['1970-01-01 05:30:00', '1970-01-01 12:00:00'] - }) - - config = ColumnsSelectConfiguration(columns=['ts_col']) - - result = normalize_datetime(mock_context, config, df.copy()) - - assert 'ts_col_iso' not in result.columns - - captured = capsys.readouterr() - assert "all normalized values are '1970-01-01'" in captured.err - - def test_normalize_datetime_integer_age_column_skipped(self, mock_context, capsys): - """If an integer column like 'age' is passed, all values become 1970-01-01 and should be skipped.""" - df 
= pd.DataFrame({ - 'age': [66, 45, 40, 43, 20, 26, 69, 21, 46] - }) - - config = ColumnsSelectConfiguration(columns=['age']) - - result = normalize_datetime(mock_context, config, df.copy()) - - assert 'age_iso' not in result.columns - - captured = capsys.readouterr() - assert "all normalized values are '1970-01-01'" in captured.err - -class TestNormalizeNumericMinMax: - """Tests for the normalize_numeric_min_max operation.""" - - def test_normalize_numeric_basic(self, mock_context): - """Test standard min-max normalization between 0 and 1.""" - df = pd.DataFrame({ - 'score': [10, 20, 30, 40, 50] - }) - config = ColumnsSelectConfiguration(columns=['score']) - - result = normalize_numeric_min_max(mock_context, config, df.copy()) - - assert 'score_norm' in result.columns - assert result['score_norm'].min() == 0.0 - assert result['score_norm'].max() == 1.0 - - assert result['score_norm'].iloc[2] == 0.5 - - def test_normalize_numeric_missing_column(self, mock_context): - """Test skipping of non-existent columns.""" - df = pd.DataFrame({'existing': [1, 2, 3]}) - config = ColumnsSelectConfiguration(columns=['missing_col']) - - result = normalize_numeric_min_max(mock_context, config, df.copy()) - - assert 'missing_col_norm' not in result.columns - - def test_normalize_numeric_constant_values(self, mock_context): - """Test skipping when min == max to avoid division by zero.""" - df = pd.DataFrame({ - 'constant': [10, 10, 10] - }) - config = ColumnsSelectConfiguration(columns=['constant']) - - result = normalize_numeric_min_max(mock_context, config, df.copy()) - - assert 'constant_norm' not in result.columns - - def test_normalize_numeric_with_nans(self, mock_context): - """Test normalization with NaN values (pandas min/max ignore NaNs by default).""" - df = pd.DataFrame({ - 'with_nans': [10, None, 50] - }) - config = ColumnsSelectConfiguration(columns=['with_nans']) - - result = normalize_numeric_min_max(mock_context, config, df.copy()) - - assert 'with_nans_norm' in 
result.columns - assert result['with_nans_norm'].iloc[0] == 0.0 - assert result['with_nans_norm'].iloc[2] == 1.0 - assert pd.isna(result['with_nans_norm'].iloc[1]) - - def test_normalize_numeric_multiple_columns(self, mock_context): - """Test processing multiple columns in one call.""" - df = pd.DataFrame({ - 'A': [1, 2], - 'B': [10, 20] - }) - config = ColumnsSelectConfiguration(columns=['A', 'B']) - - result = normalize_numeric_min_max(mock_context, config, df.copy()) - - assert 'A_norm' in result.columns - assert 'B_norm' in result.columns - -class TestNormalizeCoordinates: - """Tests for the normalize_coordinates operation.""" - - def test_normalize_coordinates_basic(self, mock_context): - """Test rounding and basic coordinate normalization.""" - df = pd.DataFrame({ - 'lat': [45.123456, 46.0], - 'lon': [9.123456, 10.0] - }) - config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') - - result = normalize_coordinates(mock_context, config, df.copy()) - - assert result['lat'].iloc[0] == 45.1235 - assert result['lon'].iloc[0] == 9.1235 - - assert len(result) == 2 - - def test_normalize_coordinates_filtering(self, mock_context): - """Test filtering of out-of-range coordinates.""" - df = pd.DataFrame({ - 'lat': [45.0, 100.0, -91.0, 0.0], # 100 e -91 sono out of range - 'lon': [9.0, 0.0, 0.0, 200.0] # 200 è out of range - }) - config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') - - result = normalize_coordinates(mock_context, config, df.copy()) - - assert len(result) == 1 - assert result['lat'].iloc[0] == 45.0 - - def test_normalize_coordinates_invalid_types(self, mock_context): - """Test conversion of strings to numeric and handling of NaNs.""" - df = pd.DataFrame({ - 'lat': ["45.5", "invalid", None], - 'lon': ["9.5", "10.0", "11.0"] - }) - config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') - - result = normalize_coordinates(mock_context, config, df.copy()) - - assert len(result) == 1 
- assert isinstance(result['lat'].iloc[0], float) - - def test_normalize_coordinates_empty_df(self, mock_context, empty_dataframe): - """Test with an empty DataFrame.""" - - df = pd.DataFrame(columns=['lat', 'lon']) - config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') - - result = normalize_coordinates(mock_context, config, df) - - assert len(result) == 0 - assert result.empty - - def test_normalize_coordinates_default_config(self, mock_context): - """Test that normalize_coordinates uses default 'lat'/'lon' columns when no config is provided.""" - df = pd.DataFrame({ - 'lat': [45.123456, 46.0], - 'lon': [9.123456, 10.0] - }) - config = CoordinatesNormalizationConfiguration() - - result = normalize_coordinates(mock_context, config, df.copy()) - - assert result['lat'].iloc[0] == 45.1235 - assert result['lon'].iloc[0] == 9.1235 - assert len(result) == 2 - - def test_normalize_coordinates_null_config_values(self, mock_context): - """Test that null lat/lon column names fall back to defaults ('lat'/'lon').""" - df = pd.DataFrame({ - 'lat': [45.123456, 46.0], - 'lon': [9.123456, 10.0] - }) - config = CoordinatesNormalizationConfiguration(latColumn=None, lonColumn=None) - - assert config.latColumn == "lat" - assert config.lonColumn == "lon" - - result = normalize_coordinates(mock_context, config, df.copy()) - - assert result['lat'].iloc[0] == 45.1235 - assert result['lon'].iloc[0] == 9.1235 - assert len(result) == 2 - - def test_normalize_coordinates_dms_degree_symbol(self, mock_context): - """Test DMS parsing with degree/minute/second symbols like 40°26'46\"N.""" - df = pd.DataFrame({ - 'lat': ["40°26'46\"N", "51°30'26\"N"], - 'lon': ["79°58'56\"W", "0°7'39\"W"] - }) - config = CoordinatesNormalizationConfiguration( - latColumn='lat', lonColumn='lon' - ) - result = normalize_coordinates(mock_context, config, df.copy()) - - assert len(result) == 2 - # 40°26'46"N ≈ 40.4461 - assert abs(result['lat'].iloc[0] - 40.4461) < 0.001 - # 79°58'56"W ≈ 
-79.9822 - assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001 - - def test_normalize_coordinates_dms_spaced_format(self, mock_context): - """Test DMS parsing with space-separated format like '40 26 46 N'.""" - df = pd.DataFrame({ - 'lat': ["40 26 46 N"], - 'lon': ["79 58 56 W"] - }) - config = CoordinatesNormalizationConfiguration( - latColumn='lat', lonColumn='lon' - ) - result = normalize_coordinates(mock_context, config, df.copy()) - - assert len(result) == 1 - assert abs(result['lat'].iloc[0] - 40.4461) < 0.001 - assert abs(result['lon'].iloc[0] - (-79.9822)) < 0.001 - - def test_normalize_coordinates_dms_already_decimal(self, mock_context): - """Test that string columns with decimal values are auto-parsed correctly.""" - df = pd.DataFrame({ - 'lat': ["45.5", "46.0"], - 'lon': ["9.5", "10.0"] - }) - config = CoordinatesNormalizationConfiguration( - latColumn='lat', lonColumn='lon' - ) - result = normalize_coordinates(mock_context, config, df.copy()) - - assert len(result) == 2 - assert result['lat'].iloc[0] == 45.5 - assert result['lon'].iloc[0] == 9.5 - - def test_normalize_coordinates_dms_mixed_valid_invalid(self, mock_context): - """Test auto-detection with a mix of valid DMS, valid decimal, and unparseable values.""" - df = pd.DataFrame({ - 'lat': ["40°26'46\"N", "not_a_coord", "51.5"], - 'lon': ["79°58'56\"W", "10.0", "0.1"] - }) - config = CoordinatesNormalizationConfiguration( - latColumn='lat', lonColumn='lon' - ) - result = normalize_coordinates(mock_context, config, df.copy()) - - # Row with "not_a_coord" for lat should be dropped (NaN lat) - assert len(result) == 2 - - def test_normalize_coordinates_dms_out_of_range(self, mock_context): - """Test that DMS-parsed coordinates outside valid range are filtered out.""" - df = pd.DataFrame({ - 'lat': ["91°0'0\"N", "45°0'0\"N"], - 'lon': ["0°0'0\"E", "9°0'0\"E"] - }) - config = CoordinatesNormalizationConfiguration( - latColumn='lat', lonColumn='lon' - ) - result = normalize_coordinates(mock_context, 
config, df.copy()) - - # First row has lat=91° which is out of [-90, 90] - assert len(result) == 1 - assert abs(result['lat'].iloc[0] - 45.0) < 0.001 - - def test_normalize_coordinates_dms_south_and_east(self, mock_context): - """Test DMS parsing with south latitude and east longitude.""" - df = pd.DataFrame({ - 'lat': ["33°51'54\"S"], - 'lon': ["151°12'36\"E"] - }) - config = CoordinatesNormalizationConfiguration( - latColumn='lat', lonColumn='lon' - ) - result = normalize_coordinates(mock_context, config, df.copy()) - - assert len(result) == 1 - # 33°51'54"S ≈ -33.865 - assert result['lat'].iloc[0] < 0 - assert abs(result['lat'].iloc[0] - (-33.865)) < 0.001 - # 151°12'36"E ≈ 151.21 - assert result['lon'].iloc[0] > 0 - assert abs(result['lon'].iloc[0] - 151.21) < 0.01 - - def test_normalize_coordinates_autodetect_numeric_vs_dms(self, mock_context): - """Test that numeric columns are coerced directly while string columns are parsed as DMS.""" - # Numeric columns — should go through pd.to_numeric path - df_numeric = pd.DataFrame({ - 'lat': [45.123456, 46.0], - 'lon': [9.123456, 10.0] - }) - config = CoordinatesNormalizationConfiguration(latColumn='lat', lonColumn='lon') - result_numeric = normalize_coordinates(mock_context, config, df_numeric.copy()) - - assert result_numeric['lat'].iloc[0] == 45.1235 - assert len(result_numeric) == 2 - - # String DMS columns — should go through _parse_dms_to_decimal path - df_dms = pd.DataFrame({ - 'lat': ["40°26'46\"N"], - 'lon': ["79°58'56\"W"] - }) - result_dms = normalize_coordinates(mock_context, config, df_dms.copy()) - - assert len(result_dms) == 1 - assert abs(result_dms['lat'].iloc[0] - 40.4461) < 0.001 - -class TestAddGlobalAggregations: - """Tests for the add_global_aggregations operation.""" - - def test_add_global_aggregations_success(self, mock_context): - """Test a successful group by and aggregation.""" - df = pd.DataFrame({ - 'category': ['A', 'A', 'B'], - 'value': [10, 20, 100], - 'ignored_str': ['x', 'y', 'z'] - 
}) - - config = AggregationConfiguration( - columns=['category'], - operation='sum' - ) - - result = add_global_aggregations(mock_context, config, df.copy()) - - assert len(result) == 2 - assert result.loc[result['category'] == 'A', 'value'].values[0] == 30 - assert result.loc[result['category'] == 'B', 'value'].values[0] == 100 - assert 'ignored_str' not in result.columns - mock_context.log.info.assert_called() - - def test_add_global_aggregations_missing_column(self, mock_context): - """Test skipping a column that does not exist in the dataframe.""" - df = pd.DataFrame({'value': [1, 2, 3]}) - config = AggregationConfiguration( - columns=['missing_col'], - operation='count' - ) - - result = add_global_aggregations(mock_context, config, df.copy()) - - mock_context.log.warning.assert_any_call("Column 'missing_col' not found, skipping aggregation.") - assert len(result) == 1 - - def test_add_global_aggregations_unsupported_op(self, mock_context): - """Test the warning when an unsupported operation is provided.""" - df = pd.DataFrame({'category': ['A'], 'value': [1]}) - - config = AggregationConfiguration( - columns=['category'], - operation='unsupported' - ) - - with pytest.raises(Exception): - add_global_aggregations(mock_context, config, df.copy()) - - mock_context.log.warning.assert_any_call("Unsupported aggregation 'unsupported'") - - def test_add_global_aggregations_only_numeric_kept(self, mock_context): - """Verify that non-numeric and non-grouping columns are dropped.""" - df = pd.DataFrame({ - 'group': ['A', 'A'], - 'num': [1, 2], - 'text': ['hello', 'world'] - }) - config = AggregationConfiguration(columns=['group'], operation='mean') - - result = add_global_aggregations(mock_context, config, df.copy()) - - assert 'text' not in result.columns - assert 'num' in result.columns - assert 'group' in result.columns diff --git a/tests/dataframe_level_anonymisation/__init__.py b/tests/dataframe_level_anonymisation/__init__.py deleted file mode 100644 index 
8b13789..0000000 --- a/tests/dataframe_level_anonymisation/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/dataframe_level_anonymisation/config_models/__init__.py b/tests/dataframe_level_anonymisation/config_models/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/tests/dataframe_level_anonymisation/config_models/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tests/dataframe_level_anonymisation/config_models/test_base_config.py b/tests/dataframe_level_anonymisation/config_models/test_base_config.py deleted file mode 100644 index 92e599b..0000000 --- a/tests/dataframe_level_anonymisation/config_models/test_base_config.py +++ /dev/null @@ -1,54 +0,0 @@ -import pytest -from pydantic import ValidationError - -from template_code_location.dataframe_level_anonymisation.config_models.base_config import BaseConfiguration - - -def test_valid_configuration_with_overrides(): - cfg = BaseConfiguration( - ident=["id"], - quasi_identifiers=["age"], - supp_level=10.0, - generalisation_hierarchies={"age": "age_hierarchy"}, - ) - assert cfg.ident == ["id"] - assert cfg.quasi_identifiers == ["age"] - assert cfg.supp_level == 10.0 - assert cfg.generalisation_hierarchies == {"age": "age_hierarchy"} - - -def test_default_values_are_loaded(): - cfg = BaseConfiguration() - assert cfg.ident == ["Name"] - assert cfg.quasi_identifiers == ["Age"] - assert cfg.supp_level == 50.0 - assert cfg.generalisation_hierarchies == {"Age": "simpl_age"} - - -def test_missing_ident_raises_error(): - with pytest.raises(ValidationError): - BaseConfiguration( - ident=[] - ) - - -def test_missing_quasi_ident_raises_error(): - with pytest.raises(ValidationError): - BaseConfiguration( - quasi_identifiers=[] - ) - - -def test_overlap_between_ident_and_quasi_identifiers(): - with pytest.raises(ValidationError): - BaseConfiguration( - ident=["age"], - quasi_identifiers=["age"] - ) - - -def test_supp_level_bounds(): - with pytest.raises(ValidationError): - BaseConfiguration( - 
supp_level=150.0 # fuori range - ) diff --git a/tests/dataframe_level_anonymisation/config_models/test_hierarchies.py b/tests/dataframe_level_anonymisation/config_models/test_hierarchies.py deleted file mode 100644 index c6994a9..0000000 --- a/tests/dataframe_level_anonymisation/config_models/test_hierarchies.py +++ /dev/null @@ -1,48 +0,0 @@ -from template_code_location.dataframe_level_anonymisation.config_models.hierarchies import ( - simpl_age, - simpl_age2, - simpl_gender, - get_all_hierarchies, -) - - -def test_simpl_age_structure(): - assert isinstance(simpl_age, dict) - assert 0 in simpl_age - assert isinstance(simpl_age[0], list) - # verify first level contains 100 ages - assert len(simpl_age[0]) == 100 - assert simpl_age[0][0] == 0 - assert simpl_age[0][-1] == 99 - - -def test_simpl_age2_structure(): - assert isinstance(simpl_age2, dict) - assert 0 in simpl_age2 - assert 1 in simpl_age2 - assert isinstance(simpl_age2[0], list) - assert isinstance(simpl_age2[1], list) - - -def test_simpl_gender_structure(): - assert isinstance(simpl_gender, dict) - assert 0 in simpl_gender - assert 1 in simpl_gender - assert simpl_gender[0] == ["M", "F", "O"] - assert simpl_gender[1] == ["*", "*", "*"] - - -def test_get_all_hierarchies(): - hier = get_all_hierarchies() - - # the function should return dicts only - assert isinstance(hier, dict) - - # ensure expected dicts are included - assert "simpl_age" in hier - assert "simpl_age2" in hier - assert "simpl_gender" in hier - - # ensure the values returned are references to the actual dicts - assert hier["simpl_age"] is simpl_age - assert hier["simpl_gender"] is simpl_gender diff --git a/tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py b/tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py deleted file mode 100644 index ef6e2c8..0000000 --- a/tests/dataframe_level_anonymisation/config_models/test_k_anonymity_config.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest -from 
pydantic import ValidationError - -from template_code_location.dataframe_level_anonymisation.config_models.k_anonymity_configuration import ( - KAnonymityConfiguration, -) - - -def test_valid_k_anonymity_config_with_overrides(): - cfg = KAnonymityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - supp_level=5.0, - generalisation_hierarchies={"age": "age_hier"}, - k=3, - sensitive_attributes=["disease"], - ) - assert cfg.k == 3 - assert cfg.sensitive_attributes == ["disease"] - assert cfg.generalisation_hierarchies == {"age": "age_hier"} - - -def test_default_values_are_loaded(): - cfg = KAnonymityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - generalisation_hierarchies={"age": "age_hier"} - ) - assert cfg.k == 3 - assert cfg.sensitive_attributes == ["Disease"] - - -def test_invalid_k_value_raises_error(): - with pytest.raises(ValidationError): - KAnonymityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - generalisation_hierarchies={"age": "age_hier"}, - k=1, # invalid, must be >= 2 - sensitive_attributes=["disease"], - ) diff --git a/tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py b/tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py deleted file mode 100644 index c94db3e..0000000 --- a/tests/dataframe_level_anonymisation/config_models/test_l_diversity_config.py +++ /dev/null @@ -1,44 +0,0 @@ -import pytest -from pydantic import ValidationError - -from template_code_location.dataframe_level_anonymisation.config_models.l_diversity_configuration import ( - LDiversityConfiguration, -) - - -def test_valid_l_diversity_config_with_overrides(): - cfg = LDiversityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - supp_level=5.0, - generalisation_hierarchies={"age": "age_hier"}, - k=3, - l=2, - sensitive_attribute="disease", - ) - assert cfg.k == 3 - assert cfg.l == 2 - assert cfg.sensitive_attribute == "disease" - - -def test_default_values_are_loaded(): - cfg = 
LDiversityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - generalisation_hierarchies={"age": "age_hier"} - ) - assert cfg.k == 2 - assert cfg.l == 3 - assert cfg.sensitive_attribute == "Disease" - - -def test_invalid_l_value_raises_error(): - with pytest.raises(ValidationError): - LDiversityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - generalisation_hierarchies={"age": "age_hier"}, - k=3, - l=0, # invalid, must be >= 1 - sensitive_attribute="disease", - ) diff --git a/tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py b/tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py deleted file mode 100644 index 615bd27..0000000 --- a/tests/dataframe_level_anonymisation/config_models/test_t_closeness_config.py +++ /dev/null @@ -1,56 +0,0 @@ -import pytest -from pydantic import ValidationError - -from template_code_location.dataframe_level_anonymisation.config_models.t_closeness_configuration import ( - TClosenessConfiguration, -) - - -def test_valid_t_closeness_config_with_overrides(): - cfg = TClosenessConfiguration( - ident=["id"], - quasi_identifiers=["age"], - supp_level=5.0, - generalisation_hierarchies={"age": "age_hier"}, - k=3, - t=0.4, - sensitive_attribute="disease", - ) - assert cfg.k == 3 - assert cfg.t == 0.4 - assert cfg.sensitive_attribute == "disease" - - -def test_default_values_are_loaded(): - cfg = TClosenessConfiguration( - ident=["id"], - quasi_identifiers=["age"], - generalisation_hierarchies={"age": "age_hier"} - ) - assert cfg.k == 2 - assert cfg.t == 0.5 - assert cfg.sensitive_attribute == "Disease" - - -def test_invalid_t_value_low(): - with pytest.raises(ValidationError): - TClosenessConfiguration( - ident=["id"], - quasi_identifiers=["age"], - generalisation_hierarchies={"age": "age_hier"}, - k=3, - t=-0.1, # invalid - sensitive_attribute="disease", - ) - - -def test_invalid_t_value_high(): - with pytest.raises(ValidationError): - TClosenessConfiguration( - 
ident=["id"], - quasi_identifiers=["age"], - generalisation_hierarchies={"age": "age_hier"}, - k=3, - t=2.0, # invalid > 1 - sensitive_attribute="disease", - ) diff --git a/tests/dataframe_level_anonymisation/test_jobs.py b/tests/dataframe_level_anonymisation/test_jobs.py deleted file mode 100644 index f890e2d..0000000 --- a/tests/dataframe_level_anonymisation/test_jobs.py +++ /dev/null @@ -1,44 +0,0 @@ -from template_code_location.dataframe_level_anonymisation.jobs import ( - k_anonymity_job, - l_diversity_job, - t_closeness_job, - k_anonymity_job_s3, - l_diversity_job_s3, - t_closeness_job_s3 -) - - -def test_k_anonymity_job_is_callable(): - """Test k_anonymity_job is a valid Dagster job""" - assert callable(k_anonymity_job) - assert hasattr(k_anonymity_job, 'execute_in_process') - - -def test_l_diversity_job_is_callable(): - """Test l_diversity_job is a valid Dagster job""" - assert callable(l_diversity_job) - assert hasattr(l_diversity_job, 'execute_in_process') - - -def test_t_closeness_job_is_callable(): - """Test t_closeness_job is a valid Dagster job""" - assert callable(t_closeness_job) - assert hasattr(t_closeness_job, 'execute_in_process') - - -def test_k_anonymity_job_s3_is_callable(): - """Test k_anonymity_job_s3 is a valid Dagster job""" - assert callable(k_anonymity_job_s3) - assert hasattr(k_anonymity_job_s3, 'execute_in_process') - - -def test_l_diversity_job_s3_is_callable(): - """Test l_diversity_job_s3 is a valid Dagster job""" - assert callable(l_diversity_job_s3) - assert hasattr(l_diversity_job_s3, 'execute_in_process') - - -def test_t_closeness_job_s3_is_callable(): - """Test t_closeness_job_s3 is a valid Dagster job""" - assert callable(t_closeness_job_s3) - assert hasattr(t_closeness_job_s3, 'execute_in_process') diff --git a/tests/dataframe_level_anonymisation/test_ops.py b/tests/dataframe_level_anonymisation/test_ops.py deleted file mode 100644 index 90c01aa..0000000 --- a/tests/dataframe_level_anonymisation/test_ops.py +++ /dev/null @@ 
-1,230 +0,0 @@ -import pytest -import pandas as pd -from unittest.mock import patch -from dagster import DagsterInvalidInvocationError, build_op_context - -from template_code_location.dataframe_level_anonymisation.ops import ( - apply_k_anonymity, - apply_l_diversity, - apply_t_closeness, -) -from template_code_location.dataframe_level_anonymisation.config_models import ( - KAnonymityConfiguration, - LDiversityConfiguration, - TClosenessConfiguration, -) - - -# --------------------------- -# Fixtures -# --------------------------- -@pytest.fixture -def fake_df(): - return pd.DataFrame({"id": [1, 2], "age": [30, 40]}) - - -@pytest.fixture -def k_config(): - return KAnonymityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - sensitive_attributes=["age"], - k=2, - supp_level=0.0, - generalisation_hierarchies={"age": "simpl_age"}, - ) - - -@pytest.fixture -def l_config(): - return LDiversityConfiguration( - ident=["id"], - quasi_identifiers=["age"], - sensitive_attribute="age", - k=2, - l=1, - supp_level=0.0, - generalisation_hierarchies={"age": "simpl_age"}, - ) - - -@pytest.fixture -def t_config(): - return TClosenessConfiguration( - ident=["id"], - quasi_identifiers=["age"], - sensitive_attribute="age", - k=2, - t=0.5, - supp_level=0.0, - generalisation_hierarchies={"age": "simpl_age"}, - ) - - -@pytest.fixture -def op_context(): - return build_op_context() - - -# --------------------------- -# Helper for patching external functions -# --------------------------- -@pytest.fixture(autouse=True) -def patch_external_ops(): - with ( - patch( - "dataframe_level_anonymisation.ops.get_all_hierarchies", - return_value={"simpl_age": {0: [30, 40]}}, - ), - patch( - "dataframe_level_anonymisation.ops.k_anonymity", - return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}), - ), - patch( - "dataframe_level_anonymisation.ops.l_diversity", - return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}), - ), - patch( - "dataframe_level_anonymisation.ops.t_closeness", - 
return_value=pd.DataFrame({"id": [1, 2], "age": [30, 40]}), - ), - ): - yield - - -# --------------------------- -# Tests for apply_k_anonymity -# --------------------------- -def test_apply_k_anonymity_outputs(op_context, k_config, fake_df): - results = list(apply_k_anonymity(op_context, k_config, fake_df)) - assert len(results) == 2 - - data_output = results[0].value - metrics_output = results[1].value - - # Check types - assert isinstance(data_output, pd.DataFrame) - assert isinstance(metrics_output, dict) - assert "k_anon" in metrics_output - assert "l_div" in metrics_output - assert "t_clos" in metrics_output - - -# --------------------------- -# Tests for apply_l_diversity -# --------------------------- -def test_apply_l_diversity_outputs(op_context, l_config, fake_df): - results = list(apply_l_diversity(op_context, l_config, fake_df)) - assert len(results) == 2 - - data_output = results[0].value - metrics_output = results[1].value - - assert isinstance(data_output, pd.DataFrame) - assert isinstance(metrics_output, dict) - assert "k_anon" in metrics_output - assert "l_div" in metrics_output - assert "t_clos" in metrics_output - - -def test_apply_l_diversity_empty_raises(op_context, l_config): - with patch("dataframe_level_anonymisation.ops.l_diversity", return_value=pd.DataFrame()): - - with pytest.raises(DagsterInvalidInvocationError): - list(apply_l_diversity(op_context, l_config, pd.DataFrame({"id": [1], "age": [30]}))) - - -# --------------------------- -# Tests for apply_t_closeness -# --------------------------- -def test_apply_t_closeness_outputs(op_context, t_config, fake_df): - results = list(apply_t_closeness(op_context, t_config, fake_df)) - assert len(results) == 2 - - data_output = results[0].value - metrics_output = results[1].value - - assert isinstance(data_output, pd.DataFrame) - assert isinstance(metrics_output, dict) - assert "k_anon" in metrics_output - assert "l_div" in metrics_output - assert "t_clos" in metrics_output - - -def 
test_apply_t_closeness_empty_raises(op_context, t_config): - with patch("dataframe_level_anonymisation.ops.t_closeness", return_value=pd.DataFrame()): - with pytest.raises(DagsterInvalidInvocationError): - list(apply_t_closeness(op_context, t_config, pd.DataFrame({"id": [1], "age": [30]}))) - - -# --------------------------- -# Additional tests for _validate_and_get_hierarchies -# --------------------------- -def test_validate_hierarchies_dataset_too_small(k_config): - small_df = pd.DataFrame({"id": [1], "age": [30]}) - from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies - - with pytest.raises(DagsterInvalidInvocationError): - _validate_and_get_hierarchies(k_config, small_df) - - -def test_validate_hierarchies_missing_hierarchy(k_config, fake_df): - from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies - - bad_config = k_config.model_copy(update={"generalisation_hierarchies": {}}) - - with pytest.raises(DagsterInvalidInvocationError): - _validate_and_get_hierarchies(bad_config, fake_df) - - -def test_validate_hierarchies_hierarchy_not_in_code(k_config, fake_df): - from template_code_location.dataframe_level_anonymisation.ops import _validate_and_get_hierarchies - - with patch("dataframe_level_anonymisation.ops.get_all_hierarchies", return_value={}): - with pytest.raises(DagsterInvalidInvocationError): - _validate_and_get_hierarchies(k_config, fake_df) - - -# --------------------------- -# Additional tests for _calc_dataframe_metrics -# --------------------------- -def test_calc_dataframe_metrics_basic(): - from template_code_location.dataframe_level_anonymisation.ops import _calc_dataframe_metrics - - df_org = pd.DataFrame({"age": [30, 40], "id": [1, 2]}) - df_anon = df_org.copy() - - with ( - patch("dataframe_level_anonymisation.ops.anonymity.k_anonymity", return_value=2), - patch("dataframe_level_anonymisation.ops.anonymity.l_diversity", return_value=1), - 
patch("dataframe_level_anonymisation.ops.anonymity.t_closeness", return_value=0.1), - ): - - report, metrics = _calc_dataframe_metrics(df_anon, df_org, ["age"], ["age"]) - - assert "k-anonymity" in report - assert metrics["k_anon"] == 2 - assert metrics["l_div"] == 1 - assert metrics["t_clos"] == 0.1 - - -# --------------------------- -# Tests for apply_t_closeness exception branches -# --------------------------- -def test_apply_t_closeness_value_error_quasi_identifiers(op_context, t_config, fake_df): - """Covers the branch where ValueError contains 'Cannot be quasi-identifiers'.""" - with patch( - "dataframe_level_anonymisation.ops.t_closeness", - side_effect=ValueError("Cannot be quasi-identifiers invalid"), - ): - with pytest.raises(DagsterInvalidInvocationError): - list(apply_t_closeness(op_context, t_config, fake_df)) - - -def test_apply_t_closeness_value_error_other_message(op_context, t_config, fake_df): - """Covers the branch where ValueError is raised but message does NOT contain that substring.""" - with patch( - "dataframe_level_anonymisation.ops.t_closeness", side_effect=ValueError("Some other error") - ): - with pytest.raises(DagsterInvalidInvocationError): - list(apply_t_closeness(op_context, t_config, fake_df)) diff --git a/tests/dataframe_level_anonymisation/test_utils.py b/tests/dataframe_level_anonymisation/test_utils.py deleted file mode 100644 index 3fa1841..0000000 --- a/tests/dataframe_level_anonymisation/test_utils.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np - -from template_code_location.dataframe_level_anonymisation.utils import ( - parse_value_list, - normalize_hierarchy_levels, -) - - -# ------------------------------------ -# Tests for parse_value_list -# ------------------------------------ -def test_parse_value_list_all_strings_digits(): - values = ["1", "2", "3"] - assert parse_value_list(values) == [1, 2, 3] - - -def test_parse_value_list_mixed_values(): - values = ["1", 2, "abc", "5"] - assert parse_value_list(values) == 
[1, 2, "abc", 5] - - -def test_parse_value_list_no_digits(): - values = ["a", "b", "c"] - assert parse_value_list(values) == ["a", "b", "c"] - - -# ------------------------------------ -# Tests for normalize_hierarchy_levels -# ------------------------------------ -def test_normalize_hierarchy_levels_level_0_converted_to_numpy_array(): - hierarchy = {"age": {"0": ["1", "2", "3"], "1": ["0-10", "11-20"]}} - - normalized = normalize_hierarchy_levels(hierarchy) - - assert "age" in normalized - assert 0 in normalized["age"] - assert isinstance(normalized["age"][0], np.ndarray) - assert normalized["age"][0].tolist() == [1, 2, 3] # converted via parse_value_list - assert normalized["age"][1] == ["0-10", "11-20"] # untouched - - -def test_normalize_hierarchy_levels_multiple_columns(): - hierarchy = {"age": {"0": ["10", "20"]}, "gender": {"0": ["M", "F"], "1": ["*"]}} - - normalized = normalize_hierarchy_levels(hierarchy) - - # First column - assert isinstance(normalized["age"][0], np.ndarray) - assert normalized["age"][0].tolist() == [10, 20] - - # Second column - assert isinstance(normalized["gender"][0], np.ndarray) - assert normalized["gender"][0].tolist() == ["M", "F"] - assert normalized["gender"][1] == ["*"] - - -def test_normalize_hierarchy_levels_mixed_digit_non_digit_at_level_0(): - hierarchy = {"test": {"0": ["1", "x", "3"]}} - - normalized = normalize_hierarchy_levels(hierarchy) - - assert isinstance(normalized["test"][0], np.ndarray) - assert normalized["test"][0].tolist() == ["1", "x", "3"] - - -def test_normalize_hierarchy_levels_empty_mapping(): - hierarchy = {"col": {}} - normalized = normalize_hierarchy_levels(hierarchy) - - assert normalized == {"col": {}} diff --git a/tests/field_level_pseudo_anonymisation/__init__.py b/tests/field_level_pseudo_anonymisation/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/tests/field_level_pseudo_anonymisation/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git 
a/tests/field_level_pseudo_anonymisation/conftest.py b/tests/field_level_pseudo_anonymisation/conftest.py deleted file mode 100644 index ee54069..0000000 --- a/tests/field_level_pseudo_anonymisation/conftest.py +++ /dev/null @@ -1,444 +0,0 @@ -""" -Shared pytest fixtures and helpers for field-level pseudonymisation tests. - -This module provides: -- Mock Vault client for testing without real Vault connections -- Sample data fixtures -- Configuration fixtures for encryption/decryption operations -- Helper functions for running ops and managing test Vault storage -""" - -import pandas as pd -import pytest -from dagster import build_op_context -from cryptography.fernet import Fernet -from hvac.exceptions import InvalidPath, Forbidden -from unittest.mock import patch, MagicMock - -from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( - AnonymisePseudonymizeStructuredConfig, - DepseudonymizeStructuredConfig, - EncryptConfig, - DecryptConfig, - PseudoTechniqueConfig, - DepseudoTechniqueConfig, -) -from template_code_location.field_level_pseudo_anonymisation.ops import ( - anonymize_pseudonymize_structured, - depseudonymize_structured, -) - - -# -------------------------------- Mock Vault Storage ---------------------------------------- - -# In-memory Vault simulation for tests -_test_vault_storage = {} -_test_vault_access_control = {} # For simulating access control - - -@pytest.fixture(autouse=True) -def mock_vault_client(): - """ - Auto-use fixture that mocks the hvac.Client to avoid real Vault connections. - Uses an in-memory dict to simulate Vault storage for tests. - Includes access control simulation for AC3. 
- """ - global _test_vault_storage, _test_vault_access_control - _test_vault_storage = {} # Reset storage before each test - _test_vault_access_control = {} # Reset access control - - def mock_read_secret(path, mount_point): - """Mock reading secret from Vault with access control""" - full_path = f"{mount_point}/{path}" - - # Check access control first - if full_path in _test_vault_access_control: - if not _test_vault_access_control[full_path]: - raise Forbidden(f"Access denied to secret: {full_path}") - - if full_path not in _test_vault_storage: - raise InvalidPath(f"Secret not found: {full_path}") - return {"data": {"data": {"value": _test_vault_storage[full_path]}}} - - def mock_create_or_update_secret(path, mount_point, secret): - """Mock creating/updating secret in Vault""" - full_path = f"{mount_point}/{path}" - _test_vault_storage[full_path] = secret["value"] - - def mock_delete_metadata(path, mount_point): - """Mock deleting secret from Vault""" - full_path = f"{mount_point}/{path}" - if full_path in _test_vault_storage: - del _test_vault_storage[full_path] - if full_path in _test_vault_access_control: - del _test_vault_access_control[full_path] - - with patch("hvac.Client") as mock_client_class: - mock_instance = MagicMock() - mock_instance.secrets.kv.v2.read_secret_version.side_effect = mock_read_secret - mock_instance.secrets.kv.v2.create_or_update_secret.side_effect = ( - mock_create_or_update_secret - ) - mock_instance.secrets.kv.v2.delete_metadata_and_all_versions.side_effect = ( - mock_delete_metadata - ) - mock_client_class.return_value = mock_instance - yield mock_instance - - -# -------------------------------- Sample Data Fixtures ---------------------------------------- - - -@pytest.fixture -def sample_df(): - """ - Fixture providing a sample structured dataset with PII data. - Represents typical data that requires pseudonymisation and restoration. 
- """ - return pd.DataFrame( - { - "id": [1, 2, 3, 4, 5], - "name": [ - "Alice Smith", - "Bob Jones", - "Charlie Brown", - "David Wilson", - "Eva Garcia", - ], - "email": [ - "alice@example.com", - "bob@example.com", - "charlie@example.com", - "david@example.com", - "eva@example.com", - ], - "ssn": [ - "123-45-6789", - "234-56-7890", - "345-67-8901", - "456-78-9012", - "567-89-0123", - ], - "age": [25, 30, 35, 40, 45], - "salary": [50000.0, 60000.0, 70000.0, 80000.0, 90000.0], - "department": ["HR", "IT", "Finance", "IT", "HR"], - } - ) - - -# -------------------------------- Configuration Fixtures ---------------------------------------- - - -@pytest.fixture -def encrypt_config_single_field(): - """ - Configuration for encrypting a single field (email). - Used to create pseudonymised data for restoration tests. - """ - return AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - -@pytest.fixture -def decrypt_config_single_field(): - """ - Configuration for decrypting a single field (email). - Used to restore original values. - """ - return DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - -@pytest.fixture -def encrypt_config_multiple_fields(): - """ - Configuration for encrypting multiple fields (name, email, ssn). - Tests restoration of multiple sensitive fields. - """ - return AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - columns=["name", "email", "ssn"], - key_name="test_restoration_key_multi", - ) - ) - ] - ) - - -@pytest.fixture -def decrypt_config_multiple_fields(): - """ - Configuration for decrypting multiple fields (name, email, ssn). 
- """ - return DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["name", "email", "ssn"], - key_name="test_restoration_key_multi", - ) - ) - ] - ) - - -@pytest.fixture -def encrypt_config_partial_fields(): - """ - Configuration for encrypting only some fields (email, ssn). - Tests partial restoration scenarios. - """ - return AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - columns=["email", "ssn"], - key_name="test_restoration_key_partial", - ) - ) - ] - ) - - -@pytest.fixture -def decrypt_config_partial_fields(): - """ - Configuration for decrypting only some fields (email, ssn). - """ - return DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email", "ssn"], - key_name="test_restoration_key_partial", - ) - ) - ] - ) - - -@pytest.fixture -def authorized_multi_key_scenario(): - """ - Fixture for testing multi-key authorization scenarios. - Sets up two keys: one authorized, one denied. - """ - clear_vault_key("authorized_key") - clear_vault_key("unauthorized_key") - - # Create authorized key by generating it - authorized_key = Fernet.generate_key().decode() - set_vault_key("authorized_key", authorized_key) - - # Create unauthorized key and deny access - unauthorized_key = Fernet.generate_key().decode() - set_vault_key("unauthorized_key", unauthorized_key) - deny_vault_access("unauthorized_key") - - yield {"authorized": "authorized_key", "unauthorized": "unauthorized_key"} - - # Cleanup - clear_vault_key("authorized_key") - clear_vault_key("unauthorized_key") - - -@pytest.fixture -def large_dataset(): - """ - Fixture providing a large dataset (10,000 rows) for performance testing. - Reusable across multiple performance tests. 
- """ - return pd.DataFrame( - { - "id": range(1, 10001), - "email": [f"user{i}@example.com" for i in range(1, 10001)], - "name": [f"User {i}" for i in range(1, 10001)], - "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in range(1, 10001)], - "age": [20 + (i % 50) for i in range(1, 10001)], - "salary": [30000.0 + (i * 10) for i in range(1, 10001)], - "department": [["HR", "IT", "Finance", "Sales"][i % 4] for i in range(1, 10001)], - } - ) - - -@pytest.fixture(scope="session") -def vault_test_keys(): - """ - Session-scoped fixture to pre-generate test keys for faster test execution. - Avoids repeated key generation in each test. - """ - keys = {f"test_key_{i}": Fernet.generate_key().decode() for i in range(10)} - - return keys - - -@pytest.fixture -def cleanup_test_keys(request): - """ - Fixture to automatically cleanup test keys after each test. - Use with: @pytest.mark.usefixtures("cleanup_test_keys") - """ - yield - - # Cleanup all test keys from mock Vault - test_keys = [k for k in _test_vault_storage.keys() if "test_" in k] - for key in test_keys: - _test_vault_storage.pop(key, None) - - -# -------------------------------- Helper Functions ---------------------------------------- - - -def config_to_dagster_dict(config): - """ - Convert Pydantic config to Dagster-compatible dictionary. 
- - For AnonymisePseudonymizeStructuredConfig (uses discriminated Union): - Pydantic v2 outputs: {'technique': {'type': 'encrypt', 'columns': [...], 'key_name': '...'}} - Dagster expects: {'technique': {'encrypt': {'columns': [...], 'key_name': '...'}}} - - For DepseudonymizeStructuredConfig (direct DecryptConfig, no Union): - Pydantic v2 outputs: - {'technique': {'type': 'decrypt', 'columns': [...], 'key_name': '...'}} - Dagster expects: Same flat structure with 'type' field - - Args: - config: Pydantic config instance - (AnonymisePseudonymizeStructuredConfig or - DepseudonymizeStructuredConfig) - - Returns: - dict: Dagster-compatible configuration dictionary - """ - from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( - AnonymisePseudonymizeStructuredConfig, - ) - - config_dict = config.model_dump() - - # Only convert discriminated unions for AnonymisePseudonymizeStructuredConfig - # DepseudonymizeStructuredConfig uses direct DecryptConfig (no discriminated union) - if isinstance(config, AnonymisePseudonymizeStructuredConfig): - if "used_function" in config_dict: - for func_config in config_dict["used_function"]: - if "technique" in func_config: - technique = func_config["technique"] - # Pydantic outputs flat dict with 'type' field for discriminated unions - if isinstance(technique, dict) and "type" in technique: - # Extract the type discriminator - technique_type = technique["type"] - # Create nested structure without the 'type' field - technique_data = {k: v for k, v in technique.items() if k != "type"} - # Nest under the discriminator key for Dagster - func_config["technique"] = {technique_type: technique_data} - - return config_dict - - -def run_encrypt_op(config, df): - """ - Helper function to execute the anonymize_pseudonymize_structured op. 
- - Args: - config: AnonymisePseudonymizeStructuredConfig instance - df: Input pandas DataFrame - - Returns: - tuple: (result_df, metrics) - Output DataFrame and metrics dict - """ - context = build_op_context(op_config=config_to_dagster_dict(config)) - result_df, metrics = anonymize_pseudonymize_structured(context, df=df) - return result_df.value, metrics.value - - -def run_decrypt_op(config, df): - """ - Helper function to execute the depseudonymize_structured op. - - Args: - config: DepseudonymizeStructuredConfig instance - df: Input pandas DataFrame - - Returns: - tuple: (result_df, metrics) - Output DataFrame and metrics dict - """ - context = build_op_context(op_config=config_to_dagster_dict(config)) - result_df, metrics = depseudonymize_structured(context, df=df) - return result_df.value, metrics.value - - -def clear_vault_key(key_name: str): - """ - Helper function to clear a key from the simulated Vault storage for test isolation. - - Args: - key_name: Name of the key to delete from Vault - """ - full_path = f"secret/PseudonymKeys/{key_name}" - if full_path in _test_vault_storage: - del _test_vault_storage[full_path] - if full_path in _test_vault_access_control: - del _test_vault_access_control[full_path] - - -def set_vault_key(key_name: str, key_value: str): - """ - Helper function to set a key in the simulated Vault storage. - - Args: - key_name: Name of the key - key_value: Value of the key (Fernet key as string) - """ - full_path = f"secret/PseudonymKeys/{key_name}" - _test_vault_storage[full_path] = key_value - - -def deny_vault_access(key_name: str): - """ - Helper function to deny access to a key for authorization testing (AC3). - - Args: - key_name: Name of the key to deny access to - """ - full_path = f"secret/PseudonymKeys/{key_name}" - _test_vault_access_control[full_path] = False - - -def get_vault_key(key_name: str) -> bytes: - """ - Helper function to retrieve a key from the simulated Vault storage. 
- - Args: - key_name: Name of the key to retrieve - - Returns: - bytes: The encryption key - """ - full_path = f"secret/PseudonymKeys/{key_name}" - if full_path not in _test_vault_storage: - raise InvalidPath(f"Key not found: {key_name}") - return _test_vault_storage[full_path].encode() diff --git a/tests/field_level_pseudo_anonymisation/test_config_models_coverage.py b/tests/field_level_pseudo_anonymisation/test_config_models_coverage.py deleted file mode 100644 index 010b9a6..0000000 --- a/tests/field_level_pseudo_anonymisation/test_config_models_coverage.py +++ /dev/null @@ -1,633 +0,0 @@ -import pytest -from pydantic import ValidationError - -from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( - AnonymisePseudonymizeStructuredConfig, - DepseudonymizeStructuredConfig, - PseudoTechniqueConfig, - DepseudoTechniqueConfig, - HashConfig, - EncryptConfig, - RedactConfig, - ReplaceConfig, - DecryptConfig, -) -from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import ( - AnonymisePseudonymizeUnstructuredConfig, - DepseudonymizeUnstructuredConfig, - PseudoTechniqueConfig as UnstructuredPseudoTechniqueConfig, - DepseudoTechniqueConfig as UnstructuredDepseudoTechniqueConfig, - HashConfig as UnstructuredHashConfig, - EncryptConfig as UnstructuredEncryptConfig, - RedactConfig as UnstructuredRedactConfig, - ReplaceConfig as UnstructuredReplaceConfig, - RetainConfig, - DecryptConfig as UnstructuredDecryptConfig, -) -from template_code_location.field_level_pseudo_anonymisation.config_models.languages import LanguageEnum -from template_code_location.field_level_pseudo_anonymisation.config_models.pii_entities import PIIEntityEnum - - -# ==================== Structured Config Tests ==================== - -class TestStructuredConfigValidators: - """Tests for structured_config.py validators and validators.""" - - def test_ensure_unique_columns_valid_single_technique(self): - """Test that 
single technique with single column passes validation.""" - config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - columns=["email"], - key_name="key1" - ) - ) - ] - ) - assert config is not None - assert len(config.used_function) == 1 - - def test_ensure_unique_columns_valid_multiple_techniques_different_columns(self): - """Test that multiple techniques with different columns passes validation.""" - config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - columns=["email"], - key_name="key1" - ) - ), - PseudoTechniqueConfig( - technique=HashConfig( - columns=["ssn"], - algorithm="sha256" - ) - ) - ] - ) - assert config is not None - assert len(config.used_function) == 2 - - def test_ensure_unique_columns_duplicate_columns_same_technique(self): - """Test that duplicate columns in different techniques raises error.""" - with pytest.raises(ValueError) as exc_info: - AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - columns=["email"], - key_name="key1" - ) - ), - PseudoTechniqueConfig( - technique=HashConfig( - columns=["email"], - algorithm="sha256" - ) - ) - ] - ) - assert "Duplicate column" in str(exc_info.value) - assert "email" in str(exc_info.value) - - def test_ensure_unique_columns_multiple_duplicates(self): - """Test error message with multiple duplicate columns.""" - with pytest.raises(ValueError) as exc_info: - AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - columns=["email", "phone"], - key_name="key1" - ) - ), - PseudoTechniqueConfig( - technique=HashConfig( - columns=["email", "phone"], - algorithm="sha256" - ) - ) - ] - ) - error_msg = str(exc_info.value) - assert "Duplicate column" in error_msg - assert "email" in error_msg - assert "phone" in error_msg - - def 
test_collect_column_to_techniques_single_technique(self): - """Test _collect_column_to_techniques with single technique.""" - config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - columns=["email", "phone"], - key_name="key1" - ) - ) - ] - ) - mapping = config._collect_column_to_techniques() - assert mapping == { - "email": ["encrypt"], - "phone": ["encrypt"] - } - - def test_extract_technique_and_columns_dict_with_type_field(self): - """Test _extract_technique_and_columns with dict containing 'type' field.""" - config = AnonymisePseudonymizeStructuredConfig() - technique_type, columns = config._extract_technique_and_columns( - { - "technique": { - "type": "encrypt", - "columns": ["email", "ssn"], - "key_name": "test_key" - } - } - ) - assert technique_type == "encrypt" - assert columns == ["email", "ssn"] - - def test_extract_technique_and_columns_dict_with_variant_mapping(self): - """Test _extract_technique_and_columns with variant-key mapping {'hash': {...}}.""" - config = AnonymisePseudonymizeStructuredConfig() - technique_type, columns = config._extract_technique_and_columns( - { - "technique": { - "encrypt": { - "columns": ["ssn"], - "key_name": "test_key" - } - } - } - ) - assert technique_type == "encrypt" - assert columns == ["ssn"] - - def test_extract_technique_and_columns_model_instance(self): - """Test _extract_technique_and_columns with PseudoTechniqueConfig model instance.""" - pseudo_config = PseudoTechniqueConfig( - technique=RedactConfig(columns=["address"]) - ) - config = AnonymisePseudonymizeStructuredConfig() - technique_type, columns = config._extract_technique_and_columns(pseudo_config) - assert technique_type == "redact" - assert columns == ["address"] - - def test_extract_technique_and_columns_empty_dict(self): - """Test _extract_technique_and_columns with empty dict.""" - config = AnonymisePseudonymizeStructuredConfig() - technique_type, columns = 
config._extract_technique_and_columns( - {"technique": {}} - ) - assert technique_type is None - assert columns == [] - - def test_extract_technique_and_columns_none_technique(self): - """Test _extract_technique_and_columns with None technique.""" - config = AnonymisePseudonymizeStructuredConfig() - technique_type, columns = config._extract_technique_and_columns( - {"technique": None} - ) - assert technique_type is None - assert columns == [] - - def test_extract_technique_and_columns_missing_columns_key(self): - """Test _extract_technique_and_columns when 'columns' key is missing.""" - config = AnonymisePseudonymizeStructuredConfig() - technique_type, columns = config._extract_technique_and_columns( - { - "technique": { - "type": "encrypt", - "key_name": "test_key" - } - } - ) - assert technique_type == "encrypt" - assert columns == [] - - def test_extract_technique_and_columns_model_without_columns_attr(self): - """Test _extract_technique_and_columns with model instance missing columns attribute.""" - pseudo_config = PseudoTechniqueConfig( - technique=ReplaceConfig(columns=["old_value"], new_value="NEW") - ) - config = AnonymisePseudonymizeStructuredConfig() - technique_type, columns = config._extract_technique_and_columns(pseudo_config) - assert technique_type == "replace" - assert columns == ["old_value"] - - -class TestStructuredDepseudonymizeConfig: - """Tests for DepseudonymizeStructuredConfig.""" - - def test_depseudonymize_config_normalize_used_function_with_dict(self): - """Test _normalize_depseudo_used_function with dict input.""" - config = DepseudonymizeStructuredConfig( - used_function=[ - { - "technique": { - "type": "decrypt", - "columns": ["email"], - "key_name": "key1" - } - } - ] - ) - assert len(config.used_function) == 1 - assert isinstance(config.used_function[0], DepseudoTechniqueConfig) - assert config.used_function[0].technique.type == "decrypt" - - def test_depseudonymize_config_normalize_used_function_with_model(self): - """Test 
_normalize_depseudo_used_function with model instance.""" - depseudo_tech = DepseudoTechniqueConfig( - technique=DecryptConfig( - columns=["email"], - key_name="key1" - ) - ) - config = DepseudonymizeStructuredConfig( - used_function=[depseudo_tech] - ) - assert len(config.used_function) == 1 - assert config.used_function[0] is depseudo_tech - - def test_depseudonymize_config_ensure_unique_columns_no_op(self): - """Test that ensure_unique_columns is a no-op for depseudonymize.""" - # For depseudonymize, there's no per-column uniqueness constraint - config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - columns=["email"], - key_name="key1" - ) - ), - DepseudoTechniqueConfig( - technique=DecryptConfig( - columns=["email"], - key_name="key2" - ) - ) - ] - ) - # Should not raise - no-op validator - assert config is not None - - -# ==================== Unstructured Config Tests ==================== - -class TestUnstructuredConfigValidators: - """Tests for unstructured_config.py validators.""" - - def test_normalize_used_function_with_dict(self): - """Test _normalize_used_function with dict input.""" - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - { - "technique": { - "encrypt": { - "pii": [PIIEntityEnum.EMAIL.value], - "key_name": "key1" - } - } - } - ] - ) - assert len(config.used_function) == 1 - - def test_normalize_used_function_with_model(self): - """Test _normalize_used_function with model instance.""" - pseudo_tech = UnstructuredPseudoTechniqueConfig( - technique=UnstructuredEncryptConfig( - pii=[PIIEntityEnum.EMAIL.value], - key_name="key1" - ) - ) - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[pseudo_tech] - ) - assert len(config.used_function) == 1 - - def test_ensure_unique_pii_valid_different_pii_types(self): - """Test that different PII types pass validation.""" - config = 
AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredEncryptConfig( - pii=[PIIEntityEnum.EMAIL.value], - key_name="key1" - ) - ), - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredHashConfig( - pii=[PIIEntityEnum.PERSON.value], - algorithm="sha256" - ) - ) - ] - ) - assert config is not None - assert len(config.used_function) == 2 - - def test_ensure_unique_pii_duplicate_pii_types(self): - """Test that duplicate PII types raise error.""" - with pytest.raises(ValueError) as exc_info: - AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredEncryptConfig( - pii=[PIIEntityEnum.EMAIL.value], - key_name="key1" - ) - ), - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredHashConfig( - pii=[PIIEntityEnum.EMAIL.value], - algorithm="sha256" - ) - ) - ] - ) - assert "Duplicate PII" in str(exc_info.value) - # Error message shows PIIEntityEnum.EMAIL (the enum repr) rather than the value - assert "EMAIL" in str(exc_info.value) - - def test_collect_pii_to_techniques_single_technique(self): - """Test _collect_pii_to_techniques with single technique.""" - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredEncryptConfig( - pii=[PIIEntityEnum.EMAIL.value, PIIEntityEnum.PERSON.value], - key_name="key1" - ) - ) - ] - ) - mapping = config._collect_pii_to_techniques() - assert mapping == { - PIIEntityEnum.EMAIL.value: ["encrypt"], - PIIEntityEnum.PERSON.value: ["encrypt"] - } - - def test_extract_technique_and_pii_dict_with_type_field(self): - """Test _extract_technique_and_pii with dict containing 'type' field.""" - config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) - technique_type, piis = config._extract_technique_and_pii( - { - "technique": { - 
"type": "encrypt", - "pii": [PIIEntityEnum.EMAIL.value], - "key_name": "test_key" - } - } - ) - assert technique_type == "encrypt" - assert piis == [PIIEntityEnum.EMAIL.value] - - def test_extract_technique_and_pii_dict_with_variant_mapping(self): - """Test _extract_technique_and_pii with variant-key mapping.""" - config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) - technique_type, piis = config._extract_technique_and_pii( - { - "technique": { - "hash": { - "pii": [PIIEntityEnum.PERSON.value], - "algorithm": "sha256" - } - } - } - ) - assert technique_type == "hash" - assert piis == [PIIEntityEnum.PERSON.value] - - def test_extract_technique_and_pii_dict_fallback_to_columns(self): - """Test _extract_technique_and_pii fallback to 'columns' key when 'pii' is missing.""" - config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) - technique_type, piis = config._extract_technique_and_pii( - { - "technique": { - "type": "redact", - "columns": ["fallback_col"] - } - } - ) - assert technique_type == "redact" - assert piis == ["fallback_col"] - - def test_extract_technique_and_pii_model_instance(self): - """Test _extract_technique_and_pii with model instance.""" - pseudo_tech = UnstructuredPseudoTechniqueConfig( - technique=UnstructuredRedactConfig( - pii=[PIIEntityEnum.EMAIL.value] - ) - ) - config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) - technique_type, piis = config._extract_technique_and_pii(pseudo_tech) - assert technique_type == "redact" - assert piis == [PIIEntityEnum.EMAIL.value] - - def test_extract_technique_and_pii_model_with_getattr_fallback(self): - """Test _extract_technique_and_pii model with getattr fallback to columns.""" - # Create a mock-like scenario where pii attribute doesn't exist - pseudo_tech = UnstructuredPseudoTechniqueConfig( - technique=RetainConfig(pii=[PIIEntityEnum.PERSON.value]) - ) - config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) - 
technique_type, piis = config._extract_technique_and_pii(pseudo_tech) - assert technique_type == "retain" - assert piis == [PIIEntityEnum.PERSON.value] - - def test_extract_technique_and_pii_empty_dict(self): - """Test _extract_technique_and_pii with empty dict.""" - config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) - technique_type, piis = config._extract_technique_and_pii( - {"technique": {}} - ) - assert technique_type is None - assert piis == [] - - def test_extract_technique_and_pii_missing_pii_key(self): - """Test _extract_technique_and_pii when 'pii' key is missing.""" - config = AnonymisePseudonymizeUnstructuredConfig(language=LanguageEnum.en) - technique_type, piis = config._extract_technique_and_pii( - { - "technique": { - "type": "encrypt", - "key_name": "test_key" - } - } - ) - assert technique_type == "encrypt" - assert piis == [] - - -class TestUnstructuredDepseudonymizeConfig: - """Tests for DepseudonymizeUnstructuredConfig.""" - - def test_depseudonymize_unstructured_config_default(self): - """Test default DepseudonymizeUnstructuredConfig.""" - config = DepseudonymizeUnstructuredConfig() - assert config is not None - assert len(config.used_function) >= 1 - - def test_depseudonymize_unstructured_config_with_custom_function(self): - """Test DepseudonymizeUnstructuredConfig with custom function.""" - config = DepseudonymizeUnstructuredConfig( - used_function=[ - UnstructuredDepseudoTechniqueConfig( - technique=UnstructuredDecryptConfig( - key_name="custom_key" - ) - ) - ] - ) - assert len(config.used_function) == 1 - assert config.used_function[0].technique.key_name == "custom_key" - - -class TestLanguageSupport: - """Tests for language configuration support.""" - - def test_all_supported_languages(self): - """Test that all supported languages can be set.""" - supported_languages = [ - LanguageEnum.hr, LanguageEnum.da, LanguageEnum.nl, LanguageEnum.en, - LanguageEnum.fi, LanguageEnum.fr, LanguageEnum.de, LanguageEnum.el, - 
LanguageEnum.it, LanguageEnum.lt, LanguageEnum.pl, LanguageEnum.pt, - LanguageEnum.ro, LanguageEnum.sl, LanguageEnum.es, LanguageEnum.sv - ] - - for lang in supported_languages: - config = AnonymisePseudonymizeUnstructuredConfig(language=lang) - assert config.language == lang - - def test_default_language_is_english(self): - """Test that default language is English.""" - config = AnonymisePseudonymizeUnstructuredConfig() - assert config.language == LanguageEnum.en - - -class TestTechniqueConfigDefaults: - """Tests for technique config defaults.""" - - def test_hash_config_default_algorithm(self): - """Test HashConfig default algorithm.""" - config = HashConfig() - assert config.algorithm == "sha256" - assert config.type == "hash" - - def test_encrypt_config_defaults(self): - """Test EncryptConfig defaults.""" - config = EncryptConfig() - assert config.type == "encrypt" - assert config.key_name == "my_key" - - def test_redact_config_defaults(self): - """Test RedactConfig defaults.""" - config = RedactConfig() - assert config.type == "redact" - - def test_replace_config_defaults(self): - """Test ReplaceConfig defaults.""" - config = ReplaceConfig() - assert config.type == "replace" - assert config.new_value == "REPLACED" - - def test_decrypt_config_defaults(self): - """Test DecryptConfig defaults.""" - config = DecryptConfig() - assert config.type == "decrypt" - assert config.key_name == "my_key" - - def test_unstructured_retain_config_defaults(self): - """Test RetainConfig defaults.""" - config = RetainConfig() - assert config.type == "retain" - - -class TestPseudoTechniqueConfigDefaults: - """Tests for PseudoTechniqueConfig defaults.""" - - def test_pseudo_technique_default_to_hash(self): - """Test PseudoTechniqueConfig defaults to hash technique.""" - config = PseudoTechniqueConfig() - # For Dagster Config, technique may be a dict with the discriminator structure - if isinstance(config.technique, dict): - # Check if it has hash configuration - assert "hash" in 
config.technique or config.technique.get("type") == "hash" - else: - assert config.technique.type == "hash" - - def test_unstructured_pseudo_technique_default_to_hash(self): - """Test UnstructuredPseudoTechniqueConfig defaults to hash technique.""" - config = UnstructuredPseudoTechniqueConfig() - # For Dagster Config, technique may be a dict with the discriminator structure - if isinstance(config.technique, dict): - # Check if it has hash configuration - assert "hash" in config.technique or config.technique.get("type") == "hash" - else: - assert config.technique.type == "hash" - - -class TestConfigModelIntegration: - """Integration tests for config models.""" - - def test_structured_config_with_all_technique_types(self): - """Test structured config with all technique types.""" - config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=HashConfig(columns=["col1"]) - ), - PseudoTechniqueConfig( - technique=EncryptConfig(columns=["col2"], key_name="k1") - ), - PseudoTechniqueConfig( - technique=RedactConfig(columns=["col3"]) - ), - PseudoTechniqueConfig( - technique=ReplaceConfig(columns=["col4"], new_value="X") - ) - ] - ) - assert len(config.used_function) == 4 - techniques = {f.technique.type for f in config.used_function} - assert techniques == {"hash", "encrypt", "redact", "replace"} - - def test_unstructured_config_with_all_technique_types(self): - """Test unstructured config with all technique types.""" - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredHashConfig(pii=[PIIEntityEnum.EMAIL.value]) - ), - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredEncryptConfig( - pii=[PIIEntityEnum.PERSON.value], - key_name="k1" - ) - ), - UnstructuredPseudoTechniqueConfig( - technique=UnstructuredRedactConfig(pii=[PIIEntityEnum.PHONE_NUMBERS.value]) - ), - UnstructuredPseudoTechniqueConfig( - 
technique=UnstructuredReplaceConfig( - pii=[PIIEntityEnum.CREDIT_CARD.value], - new_value="X" - ) - ), - UnstructuredPseudoTechniqueConfig( - technique=RetainConfig(pii=[PIIEntityEnum.DATE_OF_BIRTH.value]) - ) - ] - ) - assert len(config.used_function) == 5 - techniques = {f.technique.type for f in config.used_function} - assert techniques == {"hash", "encrypt", "redact", "replace", "retain"} diff --git a/tests/field_level_pseudo_anonymisation/test_decrypt_structured.py b/tests/field_level_pseudo_anonymisation/test_decrypt_structured.py deleted file mode 100644 index 9ed013a..0000000 --- a/tests/field_level_pseudo_anonymisation/test_decrypt_structured.py +++ /dev/null @@ -1,1090 +0,0 @@ -""" -Test suite for data restoration (depseudonymization) operations. - -This test suite validates the data restoration feature against the following Acceptance Criteria: - -## Test Coverage Summary - -### Acceptance Criteria Coverage: -- AC1 (Data Restoration with Valid Key): 7 tests -- AC2 (Restoration Denial - Missing Key): 3 tests -- AC3 (Restoration Denial - Unauthorized Access): 2 tests -- AC4 (Restoration Denial - Invalid Key): 3 tests -- Additional Coverage: 3 tests - -### Test Pattern: -- Each test uses build_op_context with .model_dump() for configuration -- Tests validate dual outputs (data, metrics) -- Tests verify complete restoration of original values -- Tests validate security controls and error handling - -""" - -import pandas as pd -import pytest -from cryptography.fernet import Fernet - -from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( - AnonymisePseudonymizeStructuredConfig, - DepseudonymizeStructuredConfig, - EncryptConfig, - DecryptConfig, - PseudoTechniqueConfig, - DepseudoTechniqueConfig, -) - -# Import helper functions (fixtures are auto-discovered by pytest) -from .conftest import ( - run_encrypt_op, - run_decrypt_op, - clear_vault_key, - set_vault_key, - deny_vault_access, - get_vault_key, -) - - -# 
-------------------------------- Test Markers Configuration -------------------------------- - -# Register custom markers -pytest.mark.slow = pytest.mark.slow -pytest.mark.security = pytest.mark.security -pytest.mark.edge_case = pytest.mark.edge_case -pytest.mark.integration = pytest.mark.integration - - -# ---------------------- AC1: Data Restoration with Valid Key -------------------------------- - - -def test_ac1_restore_single_encrypted_field_with_valid_key( - sample_df, encrypt_config_single_field, decrypt_config_single_field -): - """ - AC1: Data Restoration using Secret Management Tool-Stored Decryption Key - - Scenario: Restore encrypted field with a valid key - Given: A pseudonymised dataset with encrypted email field - And: A valid decryption key stored in secret management tool - And: The participant provided the field that needs to be restored (email) - And: The participant is authorized - When: The participant requests data restoration - And: Provides the correct key name - Then: The system retrieves the key from secret management tool - And: Decrypts the dataset accurately - And: All original values are restored - And: A success message is presented to the user (via successful return) - And: The result is presented to the user - """ - # Clear any existing test key - clear_vault_key("test_restoration_key_single") - - # Step 1: Encrypt the data (pseudonymisation phase) - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Verify encryption occurred - assert not encrypted_df["email"].equals(sample_df["email"]), "Email field should be encrypted" - - # Verify key was created in Vault - key = get_vault_key("test_restoration_key_single") - assert key is not None, "Encryption key should exist in Vault" - - # Step 2: Restore the data (depseudonymisation phase) - restored_df, metrics = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) - - # Verify restoration succeeded - assert restored_df is not None, "Restored 
DataFrame should not be None" - assert metrics is not None, "Metrics should not be None" - - # Verify all original values are restored exactly - assert restored_df["email"].equals( - sample_df["email"] - ), "Email field should be restored to original values" - - # Verify each individual value - for idx, (original, restored) in enumerate(zip(sample_df["email"], restored_df["email"])): - assert ( - original == restored - ), f"Row {idx}: Original '{original}' should match restored '{restored}'" - - # Verify row count preserved - assert len(restored_df) == len(sample_df), "Row count should be preserved during restoration" - - # Verify non-encrypted columns remain unchanged - assert restored_df["name"].equals( - sample_df["name"] - ), "Non-encrypted fields should remain unchanged" - assert restored_df["age"].equals( - sample_df["age"] - ), "Non-encrypted fields should remain unchanged" - assert restored_df["department"].equals( - sample_df["department"] - ), "Non-encrypted fields should remain unchanged" - - -def test_ac1_restore_multiple_encrypted_fields_with_valid_key( - sample_df, encrypt_config_multiple_fields, decrypt_config_multiple_fields -): - """ - AC1: Data Restoration of multiple encrypted fields with a valid key - - Scenario: Restore multiple encrypted fields (name, email, ssn) with a valid key - Given: A pseudonymised dataset with multiple encrypted fields - And: A valid decryption key stored in secret management tool - And: The participant provided the fields that need to be restored - When: The participant requests data restoration - Then: All specified fields are decrypted accurately - And: All original values are restored - """ - clear_vault_key("test_restoration_key_multi") - - # Encrypt multiple fields - encrypted_df, _ = run_encrypt_op(encrypt_config_multiple_fields, sample_df.copy()) - - # Verify all specified fields were encrypted - assert not encrypted_df["name"].equals(sample_df["name"]), "Name should be encrypted" - assert not 
encrypted_df["email"].equals(sample_df["email"]), "Email should be encrypted" - assert not encrypted_df["ssn"].equals(sample_df["ssn"]), "SSN should be encrypted" - - # Restore all encrypted fields - restored_df, _ = run_decrypt_op(decrypt_config_multiple_fields, encrypted_df.copy()) - - # Verify all fields restored to original values - assert restored_df["name"].equals( - sample_df["name"] - ), "Name field should be restored to original values" - assert restored_df["email"].equals( - sample_df["email"] - ), "Email field should be restored to original values" - assert restored_df["ssn"].equals( - sample_df["ssn"] - ), "SSN field should be restored to original values" - - # Verify non-encrypted columns remain unchanged - assert restored_df["age"].equals( - sample_df["age"] - ), "Non-encrypted fields should remain unchanged" - assert restored_df["salary"].equals( - sample_df["salary"] - ), "Non-encrypted fields should remain unchanged" - - -def test_ac1_restore_partial_fields_leaves_others_encrypted( - sample_df, encrypt_config_multiple_fields -): - """ - AC1: Partial restoration - participant specifies only some fields to restore - - Scenario: Restore only selected fields while leaving others encrypted - Given: A pseudonymised dataset with multiple encrypted fields (name, email, ssn) - And: The participant specifies only some fields to restore (e.g., only email) - When: The participant requests partial restoration - Then: Only the specified fields are decrypted - And: Other encrypted fields remain encrypted - """ - clear_vault_key("test_restoration_key_multi") - - # Encrypt multiple fields - encrypted_df, _ = run_encrypt_op(encrypt_config_multiple_fields, sample_df.copy()) - - # Create config to restore only email field - partial_decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], # Only restore email - key_name="test_restoration_key_multi", - ) - ) - ] - ) - 
- # Restore only email field - restored_df, _ = run_decrypt_op(partial_decrypt_config, encrypted_df.copy()) - - # Verify email is restored - assert restored_df["email"].equals( - sample_df["email"] - ), "Email field should be restored to original values" - - # Verify other fields remain encrypted (different from original) - assert not restored_df["name"].equals(sample_df["name"]), "Name field should remain encrypted" - assert not restored_df["ssn"].equals(sample_df["ssn"]), "SSN field should remain encrypted" - - -def test_ac1_restore_preserves_data_types(sample_df): - """ - AC1: Data restoration preserves original data types for all fields - - Scenario: Restore encrypted numeric and string fields - Given: A dataset with mixed data types (strings, integers, floats) - When: Fields are encrypted and then restored - Then: Original data types are preserved after restoration - """ - # Create config to encrypt mixed types - encrypt_config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - columns=["name", "age", "salary"], - key_name="test_restoration_types", - ) - ) - ] - ) - - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["name", "age", "salary"], - key_name="test_restoration_types", - ) - ) - ] - ) - - clear_vault_key("test_restoration_types") - - # Encrypt and restore - encrypted_df, _ = run_encrypt_op(encrypt_config, sample_df.copy()) - restored_df, _ = run_decrypt_op(decrypt_config, encrypted_df.copy()) - - # Verify values are restored (as strings due to encryption/decryption) - # Note: Fernet encryption/decryption converts everything to strings - # This is expected behavior - original types are preserved via string representation - assert ( - restored_df["name"].tolist() == sample_df["name"].tolist() - ), "String values should be restored" - assert ( - restored_df["age"].tolist() == 
sample_df["age"].astype(str).tolist() - ), "Integer values should be restored as strings" - assert ( - restored_df["salary"].tolist() == sample_df["salary"].astype(str).tolist() - ), "Float values should be restored as strings" - - -def test_ac1_restore_empty_dataframe(encrypt_config_single_field, decrypt_config_single_field): - """ - AC1: Edge case - restore an empty dataset - - Scenario: Attempt to restore an empty pseudonymised dataset - Given: An empty DataFrame with correct schema - When: Restoration is attempted - Then: Operation completes successfully without errors - And: Returns an empty DataFrame - """ - clear_vault_key("test_restoration_key_single") - - # Create empty DataFrame with same schema - empty_df = pd.DataFrame(columns=["id", "name", "email", "ssn", "age", "salary", "department"]) - - # Encrypt (should handle empty DataFrame) - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, empty_df.copy()) - - # Restore (should also handle empty DataFrame) - restored_df, metrics = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) - - # Verify empty DataFrame returned - assert len(restored_df) == 0, "Restored DataFrame should be empty" - assert list(restored_df.columns) == list(empty_df.columns), "Column schema should be preserved" - - -def test_ac1_restore_with_special_characters( - encrypt_config_single_field, decrypt_config_single_field -): - """ - AC1: Data restoration with special characters and edge case values - - Scenario: Restore data containing special characters, unicode, etc. 
- Given: A dataset with special characters in string fields - When: Data is encrypted and then restored - Then: All special characters are preserved accurately - """ - clear_vault_key("test_restoration_key_single") - - # Create DataFrame with special characters - special_df = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "name": ["José García", "François Müller", "李明", "O'Brien"], - "email": [ - "josé@example.com", - "françois@example.com", - "li@example.cn", - "o'brien@example.ie", - ], - "ssn": ["123-45-6789", "234-56-7890", "345-67-8901", "456-78-9012"], - "age": [25, 30, 35, 40], - "salary": [50000.0, 60000.0, 70000.0, 80000.0], - "department": ["HR", "IT", "Finance", "IT"], - } - ) - - # Encrypt and restore - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, special_df.copy()) - restored_df, _ = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) - - # Verify special characters preserved - assert restored_df["email"].equals( - special_df["email"] - ), "Special characters should be preserved during restoration" - - for idx, (original, restored) in enumerate(zip(special_df["email"], restored_df["email"])): - assert ( - original == restored - ), f"Row {idx}: Special characters in '{original}' should be preserved" - - -# ------------------- AC2: Restoration Denial when Key is Missing ---------------------------- - - -def test_ac2_restore_fails_when_key_missing(sample_df, encrypt_config_single_field): - """ - AC2: Restoration Denial when Decryption Key is missing - - Scenario: Attempt to restore encrypted fields when decryption key is missing - Given: A pseudonymised dataset - And: The decryption key is missing from Vault - And: The participant provides the correct key name - When: The participant attempts to restore the data - Then: The system fails the restoration request - And: Logs the failed key retrieval for auditing (via exception) - And: An error message is presented to the user - """ - clear_vault_key("test_restoration_key_single") - - 
# Encrypt data first - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Delete the key from Vault to simulate missing key - clear_vault_key("test_restoration_key_single") - - # Create decrypt config with missing key - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - # Attempt restoration - should fail with clear error - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config, encrypted_df.copy()) - - # Verify error message is informative - error_message = str(exc_info.value) - assert ( - "not found" in error_message.lower() or "decrypt" in error_message.lower() - ), "Error message should indicate key not found for decrypt operation" - assert ( - "test_restoration_key_single" in error_message - ), "Error message should include the key name for auditing" - - -def test_ac2_restore_fails_with_nonexistent_key_name(sample_df, encrypt_config_single_field): - """ - AC2: Restoration fails when using a key name that never existed - - Scenario: Attempt to restore with a key name that was never created - Given: A pseudonymised dataset - And: A key name that does not exist in Vault - When: The participant attempts to restore the data - Then: The system fails the restoration request with appropriate error - """ - clear_vault_key("test_restoration_key_single") - - # Encrypt data with one key - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Try to decrypt with a different, non-existent key - decrypt_config_wrong_key = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", columns=["email"], key_name="nonexistent_key_name" - ) - ) - ] - ) - - # Attempt restoration - should fail - with pytest.raises(ValueError) as exc_info: - 
run_decrypt_op(decrypt_config_wrong_key, encrypted_df.copy()) - - error_message = str(exc_info.value) - assert "not found" in error_message.lower(), "Error message should indicate key not found" - - -def test_ac2_restore_fails_when_key_corrupted(sample_df, encrypt_config_single_field): - """ - AC2: Restoration Denial when Decryption Key is corrupted - - Scenario: Attempt to restore when key is corrupted in Vault - Given: A pseudonymised dataset - And: The decryption key is corrupted (invalid format) - When: The participant attempts to restore the data - Then: The system fails the restoration request - And: An appropriate error message is presented - """ - clear_vault_key("test_restoration_key_single") - - # Encrypt data first - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Corrupt the key by replacing it with invalid data - set_vault_key("test_restoration_key_single", "corrupted_invalid_key_data") - - # Create decrypt config - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - # Attempt restoration - should fail due to corrupted key - with pytest.raises(Exception) as exc_info: - run_decrypt_op(decrypt_config, encrypted_df.copy()) - - # Should raise either ValueError or Fernet-related exception - assert "Fernet" in str(type(exc_info.value)) or "ValueError" in str( - type(exc_info.value) - ), "Should raise Fernet or ValueError for corrupted key" - - -# ------------- AC3: Restoration Denial when Access is Unauthorized -------------------------- - - -def test_ac3_restore_fails_when_access_unauthorized(sample_df, encrypt_config_single_field): - """ - AC3: Restoration Denial when Decryption Key access is unauthorized - - Scenario: Attempt to restore encrypted fields without authorization - Given: A pseudonymised dataset - And: A decryption key in secret management 
tool - And: The participant is not authorized to access the key - When: The participant attempts to restore the data - Then: The system denies the participant access to the key - And: The system denies the initiation of the restoration process - And: The system logs the unauthorized access attempt (via exception) - And: An appropriate error message is presented to the user - """ - clear_vault_key("test_restoration_key_single") - - # Encrypt data first - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Set access control to deny access - deny_vault_access("test_restoration_key_single") - - # Create decrypt config - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - # Attempt restoration - should fail with ValueError (wrapping Forbidden) - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config, encrypted_df.copy()) - - # Verify error indicates access denial - error_message = str(exc_info.value) - assert ( - "access denied" in error_message.lower() or "error while reading" in error_message.lower() - ), "Error message should indicate access denial or error reading key" - assert ( - "test_restoration_key_single" in error_message - ), "Error message should include the key name for auditing" - - -def test_ac3_restore_multiple_keys_with_mixed_authorization(sample_df): - """ - AC3: Restoration with mixed authorization - some keys authorized, others not - - Scenario: Attempt to restore multiple fields where user has access to some keys but not others - Given: A pseudonymised dataset with multiple encrypted fields using different keys - And: The participant is authorized for some keys but not others - When: The participant attempts to restore all fields - Then: The system denies access when unauthorized key is encountered - """ - # Encrypt email with 
one key, ssn with another - encrypt_config_multi_keys = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["email"], key_name="authorized_key" - ) - ) - ] - ) - - clear_vault_key("authorized_key") - clear_vault_key("unauthorized_key") - - # Encrypt data - encrypted_df, _ = run_encrypt_op(encrypt_config_multi_keys, sample_df.copy()) - - # Manually encrypt another field with different key (simulating separate encryption) - encrypt_config_ssn = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["ssn"], key_name="unauthorized_key" - ) - ) - ] - ) - encrypted_df, _ = run_encrypt_op(encrypt_config_ssn, encrypted_df.copy()) - - # Deny access to unauthorized_key - deny_vault_access("unauthorized_key") - - # Try to decrypt both fields - decrypt_config_both = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", columns=["email"], key_name="authorized_key" - ) - ), - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", columns=["ssn"], key_name="unauthorized_key" - ) - ), - ] - ) - - # Should fail when trying to access unauthorized_key with ValueError (wrapping Forbidden) - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config_both, encrypted_df.copy()) - - # Verify error indicates access issue with unauthorized key - error_message = str(exc_info.value) - assert ( - "access denied" in error_message.lower() or "error while reading" in error_message.lower() - ), "Error message should indicate access denial" - assert "unauthorized_key" in error_message, "Error message should mention the unauthorized key" - - -# ------------------- AC4: Restoration Denial when Key is Invalid ---------------------------- - - -def test_ac4_restore_fails_with_wrong_key(sample_df): - """ - AC4: Restoration Denial when 
Decryption Key is invalid - - Scenario: Attempt to restore encrypted fields with a key that doesn't match the encryption key - Given: A pseudonymised dataset encrypted with key A - And: A different valid decryption key B is stored in secret management tool - And: The participant provides key B (which is not the correct key) - And: Key B does not correspond to the fields to be restored - When: The participant attempts to restore the data - Then: The system fails the restoration request - And: Logs the failed decryption attempt for auditing (via exception) - And: An error message is presented to the user - """ - # Encrypt with one key - encrypt_config_key_a = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["email"], key_name="encryption_key_a" - ) - ) - ] - ) - - clear_vault_key("encryption_key_a") - clear_vault_key("encryption_key_b") - - # Encrypt data with key A - encrypted_df, _ = run_encrypt_op(encrypt_config_key_a, sample_df.copy()) - - # Generate a different valid key B in Vault - different_key = Fernet.generate_key().decode() - set_vault_key("encryption_key_b", different_key) - - # Try to decrypt with key B (wrong key) - decrypt_config_key_b = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", columns=["email"], key_name="encryption_key_b" - ) - ) - ] - ) - - # Attempt restoration - should fail with InvalidToken or ValueError - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config_key_b, encrypted_df.copy()) - - # Verify error message indicates decryption failure - error_message = str(exc_info.value) - assert ( - "invalid" in error_message.lower() or "token" in error_message.lower() - ), "Error message should indicate invalid token or decryption failure" - assert ( - "encryption_key_b" in error_message - ), "Error message should include the key name for auditing" - - -def 
test_ac4_restore_fails_with_key_from_different_field(sample_df): - """ - AC4: Restoration fails when using a key intended for a different field - - Scenario: Attempt to restore field A using the key for field B - Given: A dataset with multiple fields encrypted with different keys - And: The participant provides the key for field B to decrypt field A - When: The participant attempts to restore field A - Then: The system fails the restoration request - """ - # Encrypt email and ssn with different keys - encrypt_config_email = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig(type="encrypt", columns=["email"], key_name="email_key") - ) - ] - ) - - encrypt_config_ssn = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig(type="encrypt", columns=["ssn"], key_name="ssn_key") - ) - ] - ) - - clear_vault_key("email_key") - clear_vault_key("ssn_key") - - # Encrypt both fields - encrypted_df, _ = run_encrypt_op(encrypt_config_email, sample_df.copy()) - encrypted_df, _ = run_encrypt_op(encrypt_config_ssn, encrypted_df.copy()) - - # Try to decrypt email field using ssn_key - decrypt_config_wrong_field = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], # Trying to decrypt email - key_name="ssn_key", # But using ssn's key - ) - ) - ] - ) - - # Should fail with InvalidToken - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config_wrong_field, encrypted_df.copy()) - - error_message = str(exc_info.value) - assert ( - "invalid" in error_message.lower() or "token" in error_message.lower() - ), "Error message should indicate invalid token" - - -def test_ac4_restore_fails_with_tampered_encrypted_data(sample_df, encrypt_config_single_field): - """ - AC4: Restoration fails when encrypted data has been tampered with - - Scenario: Attempt to restore encrypted data 
that has been modified - Given: A pseudonymised dataset - And: Some encrypted values have been tampered with - And: The correct decryption key is provided - When: The participant attempts to restore the data - Then: The system fails the restoration for tampered values - And: An appropriate error message is presented - """ - clear_vault_key("test_restoration_key_single") - - # Encrypt data - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Tamper with encrypted data (modify one encrypted value) - encrypted_df.loc[0, "email"] = "tampered_invalid_encrypted_data" - - # Create decrypt config - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - # Attempt restoration - should fail on tampered data - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config, encrypted_df.copy()) - - error_message = str(exc_info.value) - assert ( - "invalid" in error_message.lower() or "token" in error_message.lower() - ), "Error message should indicate invalid token due to tampering" - - -# ---------------- Additional Edge Cases and Integration Tests ------------------------------- - - -def test_integration_full_cycle_encrypt_decrypt_multiple_operations(sample_df): - """ - Integration test: Full cycle of multiple encrypt/decrypt operations - - Scenario: Complex workflow with multiple encryption and restoration operations - Given: A dataset - When: Multiple fields are encrypted at different times - And: Fields are restored in different orders - Then: All operations complete successfully - And: Final restored data matches original - """ - # Phase 1: Encrypt email - encrypt_config_1 = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key_1") - ) - ] - ) - 
clear_vault_key("key_1") - encrypted_df_1, _ = run_encrypt_op(encrypt_config_1, sample_df.copy()) - - # Phase 2: Encrypt name and ssn - encrypt_config_2 = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig(type="encrypt", columns=["name", "ssn"], key_name="key_2") - ) - ] - ) - clear_vault_key("key_2") - encrypted_df_2, _ = run_encrypt_op(encrypt_config_2, encrypted_df_1.copy()) - - # Phase 3: Restore email first - decrypt_config_1 = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig(type="decrypt", columns=["email"], key_name="key_1") - ) - ] - ) - restored_df_1, _ = run_decrypt_op(decrypt_config_1, encrypted_df_2.copy()) - assert restored_df_1["email"].equals(sample_df["email"]), "Email should be restored" - - # Phase 4: Restore name and ssn - decrypt_config_2 = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig(type="decrypt", columns=["name", "ssn"], key_name="key_2") - ) - ] - ) - restored_df_2, _ = run_decrypt_op(decrypt_config_2, restored_df_1.copy()) - - # Verify all fields restored - assert restored_df_2["email"].equals(sample_df["email"]), "Email should remain restored" - assert restored_df_2["name"].equals(sample_df["name"]), "Name should be restored" - assert restored_df_2["ssn"].equals(sample_df["ssn"]), "SSN should be restored" - - -def test_restore_with_null_values(encrypt_config_single_field, decrypt_config_single_field): - """ - Edge case: Restoration of dataset with null/NaN values - - Scenario: Dataset contains null values in encrypted fields - Given: A dataset with null values in fields to be encrypted - When: Data is encrypted and then restored - Then: Null values are handled appropriately - """ - clear_vault_key("test_restoration_key_single") - - # Create DataFrame with null values - df_with_nulls = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "name": ["Alice", "Bob", None, "David"], - 
"email": [ - "alice@example.com", - None, - "charlie@example.com", - "david@example.com", - ], - "ssn": ["123-45-6789", "234-56-7890", "345-67-8901", None], - "age": [25, 30, 35, 40], - "salary": [50000.0, 60000.0, 70000.0, 80000.0], - "department": ["HR", "IT", "Finance", "IT"], - } - ) - - # Note: Encryption of NaN/None values will convert them to string "nan" or "None" - # This is expected behavior - Fernet encryption requires string input - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, df_with_nulls.copy()) - restored_df, _ = run_decrypt_op(decrypt_config_single_field, encrypted_df.copy()) - - # Verify non-null values are restored correctly - assert restored_df.loc[0, "email"] == "alice@example.com" - assert restored_df.loc[2, "email"] == "charlie@example.com" - assert restored_df.loc[3, "email"] == "david@example.com" - - -def test_restore_large_dataset_performance(): - """ - Performance test: Restoration of large dataset - - Scenario: Restore a large dataset with many rows - Given: A large dataset with 10,000 rows - When: Data is encrypted and then restored - Then: Operation completes without errors or timeout - And: All values are restored correctly - """ - # Create large dataset - large_df = pd.DataFrame( - { - "id": range(1, 10001), - "email": [f"user{i}@example.com" for i in range(1, 10001)], - "name": [f"User {i}" for i in range(1, 10001)], - "ssn": [f"{i:03d}-{i:02d}-{i:04d}" for i in range(1, 10001)], - "age": [20 + (i % 50) for i in range(1, 10001)], - "salary": [30000 + (i * 10) for i in range(1, 10001)], - "department": [["HR", "IT", "Finance", "Sales"][i % 4] for i in range(1, 10001)], - } - ) - - encrypt_config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["email"], key_name="test_large_dataset" - ) - ) - ] - ) - - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - 
type="decrypt", columns=["email"], key_name="test_large_dataset" - ) - ) - ] - ) - - clear_vault_key("test_large_dataset") - - # Encrypt and restore - encrypted_df, _ = run_encrypt_op(encrypt_config, large_df.copy()) - restored_df, _ = run_decrypt_op(decrypt_config, encrypted_df.copy()) - - # Verify sample of values - assert len(restored_df) == 10000, "Should restore all 10,000 rows" - assert restored_df["email"].equals(large_df["email"]), "All emails should be restored" - - # Spot check specific values - assert restored_df.loc[0, "email"] == "user1@example.com" - assert restored_df.loc[5000, "email"] == "user5001@example.com" - assert restored_df.loc[9999, "email"] == "user10000@example.com" - - -@pytest.mark.edge_case -@pytest.mark.security -def test_restore_after_key_rotation(sample_df, encrypt_config_single_field): - """ - AC4: Restoration fails after key rotation (key changed in Vault) - - Scenario: Key is rotated in Vault after encryption - Given: Data encrypted with key version 1 - And: Key is rotated to version 2 in Vault - When: Participant attempts to restore using new key version - Then: Restoration fails with clear error message - """ - clear_vault_key("test_restoration_key_single") - - # Encrypt with original key - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Rotate key (replace with new key) - new_key = Fernet.generate_key().decode() - set_vault_key("test_restoration_key_single", new_key) - - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - # Should fail - key mismatch - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config, encrypted_df.copy()) - - assert ( - "invalid" in str(exc_info.value).lower() or "decrypt" in str(exc_info.value).lower() - ), "Should indicate invalid token due to key rotation" - - 
-@pytest.mark.edge_case -def test_restore_partially_encrypted_column(sample_df, encrypt_config_single_field): - """ - Edge case: Attempt to restore column where only some rows are encrypted - - Scenario: Column has mixed encrypted/plaintext values (data corruption scenario) - """ - clear_vault_key("test_restoration_key_single") - - # Encrypt data - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Corrupt by replacing some encrypted values with plaintext - encrypted_df.loc[0, "email"] = "plaintext@example.com" - encrypted_df.loc[2, "email"] = "another_plaintext@example.com" - - decrypt_config = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig( - type="decrypt", - columns=["email"], - key_name="test_restoration_key_single", - ) - ) - ] - ) - - # Should fail on plaintext values - with pytest.raises(ValueError) as exc_info: - run_decrypt_op(decrypt_config, encrypted_df.copy()) - - assert ( - "invalid" in str(exc_info.value).lower() or "decrypt" in str(exc_info.value).lower() - ), "Should indicate invalid token for plaintext values" - - -@pytest.mark.edge_case -def test_restore_with_missing_column_in_encrypted_data( - sample_df, encrypt_config_single_field, decrypt_config_single_field -): - """ - AC2: Restoration fails when specified column doesn't exist in encrypted dataset - """ - clear_vault_key("test_restoration_key_single") - - # First encrypt the sample data to create the key - encrypted_df, _ = run_encrypt_op(encrypt_config_single_field, sample_df.copy()) - - # Create encrypted DataFrame missing the 'email' column - incomplete_df = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - # Missing 'email' column that decrypt config expects - "age": [25, 30, 35], - "salary": [50000.0, 60000.0, 70000.0], - "department": ["HR", "IT", "Finance"], - } - ) - - with pytest.raises((ValueError, KeyError)) as exc_info: - 
run_decrypt_op(decrypt_config_single_field, incomplete_df) - - error_msg = str(exc_info.value) - assert ( - "email" in error_msg or "not present" in error_msg or "not found" in error_msg - ), f"Error should indicate missing column, got: {error_msg}" - - -@pytest.mark.integration -def test_restore_with_multiple_encryption_keys(sample_df): - """ - Integration test: Restore data encrypted with multiple different keys - - Scenario: Different fields encrypted with different keys - Given: name encrypted with key_a, email encrypted with key_b - When: Participant provides both keys for restoration - Then: Both fields are restored correctly - """ - clear_vault_key("key_a") - clear_vault_key("key_b") - - # Encrypt name with key_a - encrypt_config_name = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig(type="encrypt", columns=["name"], key_name="key_a") - ) - ] - ) - - # Encrypt email with key_b - encrypt_config_email = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key_b") - ) - ] - ) - - # Encrypt both fields - df_encrypted = sample_df.copy() - df_encrypted, _ = run_encrypt_op(encrypt_config_name, df_encrypted) - df_encrypted, _ = run_encrypt_op(encrypt_config_email, df_encrypted) - - # Decrypt name with key_a - decrypt_config_name = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig(type="decrypt", columns=["name"], key_name="key_a") - ) - ] - ) - - # Decrypt email with key_b - decrypt_config_email = DepseudonymizeStructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig(type="decrypt", columns=["email"], key_name="key_b") - ) - ] - ) - - # Restore both fields - df_restored = df_encrypted.copy() - df_restored, _ = run_decrypt_op(decrypt_config_name, df_restored) - df_restored, _ = run_decrypt_op(decrypt_config_email, df_restored) 
- - # Verify both fields restored - assert df_restored["name"].equals(sample_df["name"]), "Name field should be restored with key_a" - assert df_restored["email"].equals( - sample_df["email"] - ), "Email field should be restored with key_b" diff --git a/tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py b/tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py deleted file mode 100644 index 1ce8585..0000000 --- a/tests/field_level_pseudo_anonymisation/test_decrypt_unstructured.py +++ /dev/null @@ -1,288 +0,0 @@ -""" -Test suite for data restoration (depseudonymisation) of unstructured text. - -## Test Coverage Summary - -### Acceptance Criteria Coverage: -- AC1 (Data Restoration with Valid Key): 2 tests -- AC2 (Restoration Denial - Missing Key): 1 test -- AC3 (Restoration Denial - Unauthorized Access): 1 test -- AC4 (Restoration Denial - Invalid Key): 1 test -- Additional Coverage: 2 tests (edge cases) - -### Test Pattern: -- Each test uses build_op_context with .model_dump() for configuration -- Tests validate dual outputs (data, metrics) -- Tests verify complete restoration of original text -- Tests validate security controls and error handling -- Tests use descriptive names mapping to AC scenarios - -""" - -import pytest -from unittest.mock import patch -from cryptography.fernet import Fernet -from dagster import build_op_context - -from src.field_level_pseudo_anonymisation.unstructured_ops import ( - depseudonymize_unstructured, -) -from src.field_level_pseudo_anonymisation.config_models.unstructured_config import ( - DepseudonymizeUnstructuredConfig, - DecryptConfig, - DepseudoTechniqueConfig, -) - - -@pytest.fixture -def fernet_key() -> bytes: - """Generate a valid Fernet key for encryption in tests.""" - return Fernet.generate_key() - - -@pytest.fixture -def encrypted_text_data(fernet_key: bytes) -> dict: - """ - Create encrypted data for testing decryption. 
- - Returns a dict with: - - original_text: The unencrypted text - - encrypted_text: Text with PII values encrypted in {encrypt:...} format - """ - original_text = "My name is John Doe and my email is john.doe@example.com." - fernet = Fernet(fernet_key) - encrypted_name = fernet.encrypt(b"John Doe").decode() - encrypted_email = fernet.encrypt(b"john.doe@example.com").decode() - encrypted_text = ( - f"My name is {{encrypt:{encrypted_name}}} and my email is {{encrypt:{encrypted_email}}}." - ) - return { - "original_text": original_text, - "encrypted_text": encrypted_text, - } - - -# ---------------------- AC1: Data Restoration with Valid Key -------------------------------- - - -@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") -def test_ac1_restore_encrypted_pii_entities_with_valid_key( - mock_create_get_key, fernet_key: bytes, encrypted_text_data: dict -): - """AC1: Restore encrypted PII entities with a valid key from secret management tool.""" - # Arrange - Mock the Vault key retrieval to return the valid key - mock_create_get_key.return_value = fernet_key - config = DepseudonymizeUnstructuredConfig( - used_function=[ - DepseudoTechniqueConfig(technique=DecryptConfig(type="decrypt", key_name="test_key")) - ] - ) - context = build_op_context(op_config=config.model_dump()) - - # Act - Request data restoration - result_gen = depseudonymize_unstructured( - context, input_text=encrypted_text_data["encrypted_text"] - ) - data_output = next(result_gen) - metrics_output = next(result_gen) - - # Assert - Verify successful restoration - # 1. All original values are restored exactly - assert ( - data_output.value == encrypted_text_data["original_text"] - ), "Original text should be fully restored" - - # 2. Correct output structure - assert data_output.output_name == "data", "Output should be named 'data'" - - # 3. 
Metrics show correct number of restored entities - assert ( - metrics_output.value["total_depseudo_count"] == 2 - ), "Should restore 2 encrypted entities (name and email)" - - # 4. System retrieved key from secret management tool - mock_create_get_key.assert_called_once_with("decrypt", "test_key") - - -@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") -def test_ac1_restore_multiple_pii_types_with_valid_key(mock_create_get_key, fernet_key: bytes): - """AC1: Restore multiple encrypted PII entity types (name, email, phone) with a valid key.""" - # Arrange - Create text with multiple PII types encrypted - original_text = "Contact John Doe at john.doe@example.com or call 555-1234." - fernet = Fernet(fernet_key) - encrypted_name = fernet.encrypt(b"John Doe").decode() - encrypted_email = fernet.encrypt(b"john.doe@example.com").decode() - encrypted_phone = fernet.encrypt(b"555-1234").decode() - encrypted_text = ( - f"Contact {{encrypt:{encrypted_name}}} at " - f"{{encrypt:{encrypted_email}}} or call {{encrypt:{encrypted_phone}}}." 
- ) - - mock_create_get_key.return_value = fernet_key - config = DepseudonymizeUnstructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig(type="decrypt", key_name="multi_pii_key") - ) - ] - ) - context = build_op_context(op_config=config.model_dump()) - - # Act - result_gen = depseudonymize_unstructured(context, input_text=encrypted_text) - data_output = next(result_gen) - metrics_output = next(result_gen) - - # Assert - assert data_output.value == original_text, "All PII types should be restored" - assert ( - metrics_output.value["total_depseudo_count"] == 3 - ), "Should restore 3 encrypted entities (name, email, phone)" - mock_create_get_key.assert_called_once_with("decrypt", "multi_pii_key") - - -# ------------------- AC2: Restoration Denial when Key is Missing ---------------------------- - - -@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") -def test_ac2_restoration_denial_when_key_missing(mock_create_get_key, encrypted_text_data: dict): - """AC2: Deny restoration when decryption key is missing from secret management tool.""" - # Arrange - Mock Vault to indicate key is missing - mock_create_get_key.side_effect = ValueError( - "Fernet key 'non_existent_key' not found in Vault for decrypt." 
- ) - config = DepseudonymizeUnstructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig(type="decrypt", key_name="non_existent_key") - ) - ] - ) - context = build_op_context(op_config=config.model_dump()) - - # Act & Assert - Verify system fails the restoration request - with pytest.raises( - ValueError, - match="Fernet key 'non_existent_key' not found in Vault for decrypt.", - ) as exc_info: - list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"])) - - # Verify error message is clear and actionable - assert "not found in Vault" in str( - exc_info.value - ), "Error message should indicate key is missing from Vault" - - # Verify system attempted to retrieve the key (logged attempt) - mock_create_get_key.assert_called_once_with("decrypt", "non_existent_key") - - -# ------------- AC3: Restoration Denial when Access is Unauthorized -------------------------- - - -@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") -def test_ac3_restoration_denial_when_unauthorized_access( - mock_create_get_key, encrypted_text_data: dict -): - """AC3: Deny restoration when participant is not authorized to access the decryption key.""" - # Arrange - Mock Vault to deny access - mock_create_get_key.side_effect = ValueError("Access denied to secret: unauthorized_key") - config = DepseudonymizeUnstructuredConfig( - used_function=[ - DepseudoTechniqueConfig( - technique=DecryptConfig(type="decrypt", key_name="unauthorized_key") - ) - ] - ) - context = build_op_context(op_config=config.model_dump()) - - # Act & Assert - Verify system denies access - with pytest.raises(ValueError, match="Access denied to secret: unauthorized_key") as exc_info: - list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"])) - - # Verify error message clearly indicates access denial - assert "Access denied" in str( - exc_info.value - ), "Error message should clearly indicate 
access was denied" - - # Verify the unauthorized access attempt was logged (function was called) - mock_create_get_key.assert_called_once_with("decrypt", "unauthorized_key") - - -# ------------------- AC4: Restoration Denial when Key is Invalid ---------------------------- - - -@patch("src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key") -def test_ac4_restoration_denial_when_key_invalid(mock_create_get_key, encrypted_text_data: dict): - """AC4: Deny restoration when decryption key does not correspond to the encrypted fields.""" - # Arrange - Mock Vault to return a different (wrong) key - invalid_key = Fernet.generate_key() # A different, incorrect key - mock_create_get_key.return_value = invalid_key - config = DepseudonymizeUnstructuredConfig( - used_function=[ - DepseudoTechniqueConfig(technique=DecryptConfig(type="decrypt", key_name="wrong_key")) - ] - ) - context = build_op_context(op_config=config.model_dump()) - - # Act & Assert - Verify system fails the restoration - with pytest.raises(ValueError, match="Invalid Fernet token") as exc_info: - list(depseudonymize_unstructured(context, input_text=encrypted_text_data["encrypted_text"])) - - # Verify error message indicates decryption failure - assert "Invalid Fernet token" in str( - exc_info.value - ), "Error message should indicate the key is invalid for this data" - - # Verify key was retrieved (system attempted decryption) - mock_create_get_key.assert_called_once_with("decrypt", "wrong_key") - - -# -------------------------------- Additional Edge Cases ---------------------------------------- - - -def test_depseudonymize_unstructured_no_decrypt_config(): - """Edge case: Text is returned unchanged when no decryption techniques are configured.""" - # Arrange - original_text = "This text has no {encrypt:values} to decrypt." 
- config = DepseudonymizeUnstructuredConfig(used_function=[]) # No techniques - context = build_op_context(op_config=config.model_dump()) - - # Act - result_gen = depseudonymize_unstructured(context, input_text=original_text) - result_output = next(result_gen) - metrics_output = next(result_gen) - - # Assert - assert ( - result_output.value == original_text - ), "Text should remain unchanged when no decryption is configured" - assert ( - metrics_output.value["total_depseudo_count"] == 0 - ), "Should report zero decryptions performed" - - -def test_depseudonymize_unstructured_empty_text(): - """Edge case: Empty input text is returned unchanged with zero decryptions performed.""" - # Arrange - empty_text = "" - config = DepseudonymizeUnstructuredConfig( - used_function=[ - DepseudoTechniqueConfig(technique=DecryptConfig(type="decrypt", key_name="test_key")) - ] - ) - context = build_op_context(op_config=config.model_dump()) - - # Act - with patch( - "src.field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key" - ) as mock_key: - mock_key.return_value = Fernet.generate_key() - result_gen = depseudonymize_unstructured(context, input_text=empty_text) - result_output = next(result_gen) - metrics_output = next(result_gen) - - # Assert - assert result_output.value == "", "Empty text should remain empty" - assert ( - metrics_output.value["total_depseudo_count"] == 0 - ), "Should report zero decryptions for empty text" diff --git a/tests/field_level_pseudo_anonymisation/test_encrypt_structured.py b/tests/field_level_pseudo_anonymisation/test_encrypt_structured.py deleted file mode 100644 index b89fad3..0000000 --- a/tests/field_level_pseudo_anonymisation/test_encrypt_structured.py +++ /dev/null @@ -1,1119 +0,0 @@ -""" -Test suite for field-level pseudonymisation operations (encrypt technique). 
- -This test suite covers the encryption pseudonymisation technique for structured dataframes, -validating the following Acceptance Criteria: - -## Test Coverage Summary - -### Acceptance Criteria Coverage: -- AC1 (Supported Technique Applied Correctly): 7 tests -- AC2 (Invalid Execution Handling): 7 tests -- AC3 (DataFrame Compliance): 6 tests -- AC4 (Audit Logging - Success): 2 tests -- AC5 (Audit Logging - Failure): 3 tests -- Additional Coverage: 7 tests - -### Test Pattern: -- Each test uses build_op_context with config_to_dagster_dict for configuration -- Tests validate dual outputs (data, metrics) -- Vault access is mocked for isolation - -""" - -import pandas as pd -import pytest -from dagster import build_op_context -from cryptography.fernet import Fernet -from hvac.exceptions import InvalidPath -from unittest.mock import patch, MagicMock - -from template_code_location.field_level_pseudo_anonymisation.config_models.structured_config import ( - AnonymisePseudonymizeStructuredConfig, - EncryptConfig, - HashConfig, - PseudoTechniqueConfig, -) -from template_code_location.field_level_pseudo_anonymisation.ops import anonymize_pseudonymize_structured - -# Import helper functions (fixtures are auto-discovered by pytest) -from .conftest import ( - run_encrypt_op, - clear_vault_key, - get_vault_key, - config_to_dagster_dict, -) - - -# -------------------------------- Test Markers Configuration -------------------------------- - -# Register custom markers -pytest.mark.slow = pytest.mark.slow -pytest.mark.security = pytest.mark.security -pytest.mark.edge_case = pytest.mark.edge_case - - -# -------------------------------- Test-Specific Fixtures ---------------------------------------- - - -@pytest.fixture -def encrypt_single_column_config(): - """ - Configuration for encrypting a single column (email). - Tests basic encryption functionality. 
- """ - return AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["email"], key_name="test_email_key" - ) - ) - ] - ) - - -@pytest.fixture -def encrypt_multiple_columns_config(): - """ - Configuration for encrypting multiple columns (name, email). - Tests encryption across multiple fields. - """ - return AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["name", "email"], key_name="test_multi_key" - ) - ) - ] - ) - - -@pytest.fixture -def encrypt_mixed_types_config(): - """ - Configuration for encrypting columns with different data types. - Tests that encryption handles type conversion (int, float -> string). - """ - return AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - columns=["id", "age", "salary"], - key_name="test_numeric_key", - ) - ) - ] - ) - - -@pytest.fixture -def encrypt_with_unchanged_columns_config(): - """ - Configuration that encrypts some columns while leaving others unchanged. - Tests AC3 requirement for unchanged column preservation. - """ - return AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["email"], key_name="test_partial_key" - ) - ) - ] - ) - - -# -------------------------------- Test-Specific Fixtures ---------------------------------------- - - -def test_encrypt_single_column_applied_correctly(sample_df, encrypt_single_column_config): - """ - AC1: Tests that encryption is applied correctly to a single column. 
- - Scenario: The system applies encryption to the 'email' field - Given: A structured dataset with an email column - And: A valid encryption configuration for the email field - When: The participant triggers the execution - Then: The email field must be transformed with Fernet encryption - And: The encrypted values must be different from the original values - And: The encrypted values must be valid Fernet tokens (decodable) - """ - # Clear any existing test key - clear_vault_key("test_email_key") - - result_df, metrics = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) - - # Verify output structure - assert result_df is not None, "Result DataFrame should not be None" - assert metrics is not None, "Metrics should not be None" - - # Verify email column is encrypted (values changed) - assert not result_df["email"].equals( - sample_df["email"] - ), "Email column should be encrypted (values should change)" - - # Verify all encrypted values are different from originals - for orig, enc in zip(sample_df["email"], result_df["email"]): - assert orig != enc, f"Original value '{orig}' should be encrypted" - - # Verify encrypted values are valid Fernet tokens (can be decrypted) - key = get_vault_key("test_email_key") - f = Fernet(key) - for enc_value in result_df["email"]: - decrypted = f.decrypt(enc_value.encode()).decode() - assert ( - decrypted in sample_df["email"].values - ), f"Decrypted value '{decrypted}' should match an original email" - - # Verify row count is preserved - assert len(result_df) == len(sample_df), "Row count should be preserved" - - -def test_encrypt_multiple_columns_applied_correctly(sample_df, encrypt_multiple_columns_config): - """ - AC1: Tests that encryption is applied correctly to multiple columns. 
- - Scenario: The system applies encryption to multiple fields (name, email) - Given: A structured dataset with name and email columns - And: A valid encryption configuration for both fields - When: The participant triggers the execution - Then: Both fields must be transformed with Fernet encryption - And: Each field uses the same encryption key (as specified) - """ - clear_vault_key("test_multi_key") - - result_df, metrics = run_encrypt_op(encrypt_multiple_columns_config, sample_df.copy()) - - # Verify both columns are encrypted - assert not result_df["name"].equals(sample_df["name"]), "Name column should be encrypted" - assert not result_df["email"].equals(sample_df["email"]), "Email column should be encrypted" - - # Verify all values are encrypted - key = get_vault_key("test_multi_key") - f = Fernet(key) - - for enc_name in result_df["name"]: - decrypted = f.decrypt(enc_name.encode()).decode() - assert decrypted in sample_df["name"].values - - for enc_email in result_df["email"]: - decrypted = f.decrypt(enc_email.encode()).decode() - assert decrypted in sample_df["email"].values - - -def test_encrypt_numeric_columns_applied_correctly(sample_df, encrypt_mixed_types_config): - """ - AC1: Tests that encryption handles numeric data types correctly. 
- - Scenario: The system applies encryption to numeric fields (id, age, salary) - Given: A structured dataset with integer and float columns - And: A valid encryption configuration for numeric fields - When: The participant triggers the execution - Then: Numeric values must be converted to strings and encrypted - And: Original numeric values should be recoverable via decryption - """ - clear_vault_key("test_numeric_key") - - result_df, metrics = run_encrypt_op(encrypt_mixed_types_config, sample_df.copy()) - - # Verify all numeric columns are now string type (encrypted) - assert result_df["id"].dtype == object, "Encrypted id should be object/string type" - assert result_df["age"].dtype == object, "Encrypted age should be object/string type" - assert result_df["salary"].dtype == object, "Encrypted salary should be object/string type" - - # Verify original numeric values can be recovered - key = get_vault_key("test_numeric_key") - f = Fernet(key) - - for enc_id in result_df["id"]: - decrypted = int(f.decrypt(enc_id.encode()).decode()) - assert decrypted in sample_df["id"].values - - -def test_encrypt_key_generation_on_first_use(sample_df, encrypt_single_column_config): - """ - AC1: Tests that encryption key is automatically generated and stored in Vault. 
- - Scenario: First-time encryption generates a key automatically - Given: A structured dataset with valid configuration - And: No encryption key exists in Vault for the specified key_name - When: The participant triggers the execution - Then: The system must generate a new Fernet key - And: Store it in Vault at the specified path - And: Use it for encryption - """ - clear_vault_key("test_email_key") - - # Verify key doesn't exist before encryption - with pytest.raises(InvalidPath): - get_vault_key("test_email_key") - - result_df, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) - - # Verify key was created - key = get_vault_key("test_email_key") - assert key is not None, "Encryption key should be created in Vault" - assert len(key) == 44, "Fernet key should be 44 bytes (base64 encoded 32 bytes)" - - # Verify the key works for decryption - f = Fernet(key) - for enc_email in result_df["email"]: - decrypted = f.decrypt(enc_email.encode()).decode() - assert decrypted in sample_df["email"].values - - -def test_encrypt_uses_existing_vault_key(sample_df, encrypt_single_column_config): - """ - AC1: Tests that encryption uses an existing key from Vault if present. 
- - Scenario: Encryption reuses existing key for consistent pseudonymisation - Given: A structured dataset - And: An encryption key already exists in Vault - When: The participant triggers the execution - Then: The system must use the existing key (not generate a new one) - And: The same input produces the same encrypted output (deterministic with same key) - """ - clear_vault_key("test_email_key") - - # First encryption - generates key - result_df_1, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) - key_1 = get_vault_key("test_email_key") - - # Second encryption - should use same key - result_df_2, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) - key_2 = get_vault_key("test_email_key") - - # Verify same key is used - assert key_1 == key_2, "Encryption should reuse existing Vault key" - - -# ----------------------- AC2: Invalid Execution Handling ------------------------------------ - - -def test_encrypt_missing_column_error(encrypt_single_column_config): - """ - AC2: Tests graceful error handling when a specified column doesn't exist. 
- - Scenario: The system aborts gracefully when column is missing - Given: A structured dataset - And: A configuration specifying a non-existent column - When: The participant triggers the execution - Then: The system must raise a clear ValueError - And: The error message must indicate which columns are missing - """ - df_missing_column = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "age": [25, 30, 35], - # Missing 'email' column - } - ) - - with pytest.raises(ValueError) as exc_info: - run_encrypt_op(encrypt_single_column_config, df_missing_column) - - assert "not present in the DataFrame" in str( - exc_info.value - ), "Error message should indicate missing columns" - assert "email" in str(exc_info.value), "Error message should mention the missing 'email' column" - - -def test_encrypt_empty_dataframe_handled(encrypt_single_column_config): - """ - AC2: Tests graceful handling of empty DataFrame input. - - Scenario: The system processes empty DataFrame without errors - Given: An empty structured dataset (no rows) - And: A valid encryption configuration - When: The participant triggers the execution - Then: The system must return an empty DataFrame with correct schema - And: No errors should be raised - """ - clear_vault_key("test_email_key") - - empty_df = pd.DataFrame(columns=["id", "name", "email", "age", "salary", "department"]) - - result_df, metrics = run_encrypt_op(encrypt_single_column_config, empty_df) - - assert len(result_df) == 0, "Result should be empty" - assert "email" in result_df.columns, "Email column should exist in schema" - - -def test_encrypt_vault_connection_error(): - """ - AC2: Tests error handling when Vault is unreachable. 
- - Scenario: The system fails gracefully when Vault is unavailable - Given: A structured dataset with valid configuration - When: Vault service is unreachable or misconfigured - Then: The system must raise a clear error - And: The error message must indicate the Vault connection issue - - Note: This test requires Vault to be down or uses a bad URL. - For testing purposes, we simulate by using invalid credentials. - """ - # Create a mock client that raises an exception when accessing Vault - mock_client_instance = MagicMock() - mock_client_instance.secrets.kv.v2.read_secret_version.side_effect = Exception( - "Simulated Vault connection error" - ) - - with patch("hvac.Client", return_value=mock_client_instance): - df = pd.DataFrame( - { - "id": [1], - "name": ["Test"], - "email": ["test@example.com"], - "age": [30], - "salary": [50000.0], - "department": ["IT"], - } - ) - config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["email"], key_name="test_email_key" - ) - ) - ] - ) - with pytest.raises(ValueError) as exc_info: - run_encrypt_op(config, df) - - error_message = str(exc_info.value) - assert ( - "Simulated Vault connection error" in error_message - ), "Error should indicate Vault connection issue" - - -def test_encrypt_null_values_handled(encrypt_single_column_config): - """ - AC2: Tests handling of NULL/NaN values in encrypted columns. 
- - Scenario: The system handles null values appropriately - Given: A structured dataset with NULL values in the column to encrypt - And: A valid encryption configuration - When: The participant triggers the execution - Then: The system must process null values (encrypt "nan" string or handle appropriately) - And: Not raise an exception - """ - clear_vault_key("test_email_key") - - df_with_nulls = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "name": ["Alice", "Bob", "Charlie", "David"], - "email": ["alice@example.com", None, "charlie@example.com", pd.NA], - "age": [25, 30, 35, 40], - "salary": [50000.0, 60000.0, 70000.0, 80000.0], - "department": ["HR", "IT", "Finance", "IT"], - } - ) - - result_df, metrics = run_encrypt_op(encrypt_single_column_config, df_with_nulls) - - # Verify execution completed without errors - assert result_df is not None - assert len(result_df) == 4 - - # Verify null values were processed (encrypted as string "None" or "nan") - key = get_vault_key("test_email_key") - f = Fernet(key) - - # The null values get converted to string "None" or "nan" before encryption - for enc_email in result_df["email"]: - decrypted = f.decrypt(enc_email.encode()).decode() - # Decrypted value should be original or string representation of null - assert decrypted in [ - "alice@example.com", - "charlie@example.com", - "None", - "nan", - "", - ] - - -def test_encrypt_duplicate_column_configuration_error(): - """ - AC2: Tests that duplicate columns across techniques are rejected. 
- - Scenario: Configuration validation prevents duplicate column assignments - Given: A configuration that assigns the same column to multiple techniques - When: The configuration is validated - Then: The system must raise a ValueError during configuration creation - And: The error message must indicate duplicate column assignment - """ - with pytest.raises(ValueError) as exc_info: - AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig(type="encrypt", columns=["email"], key_name="key1") - ), - PseudoTechniqueConfig( - technique=HashConfig( - type="hash", - columns=["email"], # Duplicate column - algorithm="sha256", - ) - ), - ] - ) - - assert "Duplicate column" in str( - exc_info.value - ), "Error should indicate duplicate column configuration" - - -# ------------------ AC3: DataFrame Input and Output Compliance ------------------------------ - - -def test_encrypt_dataframe_input_output_format(sample_df, encrypt_single_column_config): - """ - AC3: Tests that input and output are both pandas DataFrames. 
- - Scenario: The system accepts DataFrame input and returns DataFrame output - Given: A structured dataset as pandas DataFrame - And: A valid encryption configuration - When: The participant triggers the execution - Then: The system must return a pandas DataFrame - And: The DataFrame structure must be preserved - """ - clear_vault_key("test_email_key") - - result_df, metrics = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) - - # Verify output is a DataFrame - assert isinstance(result_df, pd.DataFrame), "Output must be a pandas DataFrame" - - # Verify DataFrame structure preserved - assert list(result_df.columns) == list(sample_df.columns), "Column names should be preserved" - assert len(result_df) == len(sample_df), "Row count should be preserved" - - -def test_encrypt_data_types_transformed_correctly(sample_df, encrypt_mixed_types_config): - """ - AC3: Tests that data types are transformed appropriately after encryption. - - Scenario: Encrypted columns change to string type - Given: A structured dataset with various data types (int, float, str) - And: An encryption configuration for multiple columns - When: The participant triggers the execution - Then: All encrypted columns must be of type object/string - And: This transformation is valid and consistent with encryption technique - """ - clear_vault_key("test_numeric_key") - - # Store original types - original_types = sample_df.dtypes.to_dict() - - result_df, _ = run_encrypt_op(encrypt_mixed_types_config, sample_df.copy()) - - # Verify encrypted columns are now object/string type - assert result_df["id"].dtype == object, "Encrypted integer column should become object type" - assert result_df["age"].dtype == object, "Encrypted integer column should become object type" - assert result_df["salary"].dtype == object, "Encrypted float column should become object type" - - # Verify data types changed (not same as original) - assert result_df["id"].dtype != original_types["id"], "Data type should change 
after encryption" - - -def test_encrypt_unchanged_columns_preserved(sample_df, encrypt_with_unchanged_columns_config): - """ - AC3: Tests that columns not specified for encryption remain unchanged. - - Scenario: Non-encrypted columns remain identical - Given: A structured dataset with multiple columns - And: An encryption configuration for only one column (email) - When: The participant triggers the execution - Then: Columns not specified (id, name, age, salary, department) must remain unchanged - And: Their values and data types must be identical to the input - """ - clear_vault_key("test_partial_key") - - result_df, _ = run_encrypt_op(encrypt_with_unchanged_columns_config, sample_df.copy()) - - # Verify unchanged columns are identical - assert result_df["id"].equals(sample_df["id"]), "ID column should remain unchanged" - assert result_df["name"].equals(sample_df["name"]), "Name column should remain unchanged" - assert result_df["age"].equals(sample_df["age"]), "Age column should remain unchanged" - assert result_df["salary"].equals(sample_df["salary"]), "Salary column should remain unchanged" - assert result_df["department"].equals( - sample_df["department"] - ), "Department column should remain unchanged" - - # Verify encrypted column is changed - assert not result_df["email"].equals( - sample_df["email"] - ), "Email column should be encrypted (changed)" - - -def test_encrypt_schema_consistency(sample_df, encrypt_multiple_columns_config): - """ - AC3: Tests that DataFrame schema is consistent and coherent. 
- - Scenario: Output DataFrame has consistent schema - Given: A structured dataset - And: A multi-column encryption configuration - When: The participant triggers the execution - Then: Output DataFrame must have same column names as input - And: Column order must be preserved - And: No columns should be added or removed - """ - clear_vault_key("test_multi_key") - - result_df, _ = run_encrypt_op(encrypt_multiple_columns_config, sample_df.copy()) - - # Verify column names are identical - assert list(result_df.columns) == list(sample_df.columns), "Column names must be identical" - - # Verify column order is preserved - for i, col in enumerate(sample_df.columns): - assert result_df.columns[i] == col, f"Column order should be preserved at position {i}" - - # Verify no extra columns added - assert len(result_df.columns) == len( - sample_df.columns - ), "Number of columns should remain the same" - - -def test_encrypt_index_preservation(sample_df, encrypt_single_column_config): - """ - AC3: Tests that DataFrame index is preserved after encryption. 
- - Scenario: DataFrame index remains unchanged - Given: A structured dataset with default index - And: A valid encryption configuration - When: The participant triggers the execution - Then: The output DataFrame must preserve the original index - And: No extraneous index column should be added - """ - clear_vault_key("test_email_key") - - # Set custom index to verify preservation - sample_df_with_index = sample_df.copy() - sample_df_with_index.index = [10, 20, 30, 40, 50] - - result_df, _ = run_encrypt_op(encrypt_single_column_config, sample_df_with_index) - - # Verify index is preserved - assert list(result_df.index) == list( - sample_df_with_index.index - ), "DataFrame index should be preserved" - - -# ------------- AC4: Execution Audit & Logging - Positive Scenario --------------------------- - - -def test_encrypt_successful_execution_logging(sample_df, encrypt_single_column_config): - """ - AC4: Tests that successful execution produces appropriate logs/metadata. - - Scenario: Successful pseudonymisation execution is logged - Given: A structured dataset with valid configuration - When: The participant triggers the execution - And: The execution completes successfully - Then: The system must return metrics output - And: Metrics should confirm successful operation - - Note: Dagster automatically logs: - - Timestamp of execution (run start/end times) - - Workflow run identifier (run_id) - - Configuration parameters (captured in op_config) - - Success status (run status in Dagster UI) - - This test validates the op returns proper outputs for Dagster to log. 
- """ - clear_vault_key("test_email_key") - - op_config_dict = config_to_dagster_dict(encrypt_single_column_config) - context = build_op_context(op_config=op_config_dict) - - # Capture run context information - run_id = context.run_id - - # Execute the operation - result_df, metrics = anonymize_pseudonymize_structured(context, df=sample_df.copy()) - - # Verify outputs for logging - assert result_df is not None, "Data output should be present for logging" - assert metrics is not None, "Metrics output should be present for logging" - assert isinstance(metrics.value, dict), "Metrics should be a dict" - - # Verify run context is available (Dagster provides this automatically) - assert run_id is not None, "Run ID should be available for audit logging" - - # Verify configuration is captured (can be logged) - assert "used_function" in op_config_dict, "Configuration should be captured for audit" - # In Dagster format, technique is nested under the discriminator key - technique_config = op_config_dict["used_function"][0]["technique"] - assert "encrypt" in technique_config, "Encrypt technique should be present" - assert ( - technique_config["encrypt"]["key_name"] == "test_email_key" - ), "Key name should be logged (but not key value)" - - # Verify no PII is in metrics (compliance requirement) - metrics_str = str(metrics.value) - for email in sample_df["email"]: - assert email not in metrics_str, "PII values should not appear in metrics/logs" - - -def test_encrypt_configuration_parameters_logged(sample_df, encrypt_multiple_columns_config): - """ - AC4: Tests that configuration parameters are properly captured for audit. 
- - Scenario: Configuration details are available for compliance logging - Given: A multi-column encryption configuration - When: The participant triggers the execution - Then: The system must capture configuration parameters including: - - Selected technique (encrypt) - - Columns to encrypt - - Key name (but not key value) - And: These parameters should be accessible for audit logging - """ - clear_vault_key("test_multi_key") - - op_config_dict = config_to_dagster_dict(encrypt_multiple_columns_config) - context = build_op_context(op_config=op_config_dict) - - result_df, metrics = anonymize_pseudonymize_structured(context, df=sample_df.copy()) - - # Verify configuration details are captured - technique_config = op_config_dict["used_function"][0]["technique"] - assert "encrypt" in technique_config, "Encrypt technique should be present" - assert set(technique_config["encrypt"]["columns"]) == {"name", "email"} - assert technique_config["encrypt"]["key_name"] == "test_multi_key" - - # Verify encryption key itself is NOT in config (security) - config_str = str(op_config_dict) - try: - key = get_vault_key("test_multi_key") - assert ( - key.decode() not in config_str - ), "Encryption key value should never be in logged configuration" - except Exception: - pass # Key might not exist yet - - -# ------------- AC5: Execution Audit & Logging - Negative Scenario --------------------------- - - -def test_encrypt_failed_execution_logging(encrypt_single_column_config): - """ - AC5: Tests that failed execution provides error details for audit. 
- - Scenario: Failed pseudonymisation execution is logged with error details - Given: A structured dataset with valid configuration - When: The participant triggers the execution - And: The execution fails (e.g., missing column) - Then: The system must raise an exception with clear error message - And: The error message should indicate the failure reason - And: Configuration parameters should still be accessible for audit - And: No PII should be exposed in error messages - """ - df_missing_column = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - # Missing 'email' column - will cause failure - } - ) - - op_config_dict = config_to_dagster_dict(encrypt_single_column_config) - context = build_op_context(op_config=op_config_dict) - run_id = context.run_id - - # Execute and capture failure - with pytest.raises(ValueError) as exc_info: - # Need to consume the generator to trigger execution - list(anonymize_pseudonymize_structured(context, df=df_missing_column)) - - # Verify error details are available for logging - error_message = str(exc_info.value) - assert ( - "not present in the DataFrame" in error_message - ), "Error message should explain failure reason" - assert "email" in error_message, "Error message should mention the problematic column" - - # Verify run context is available for failure logging - assert run_id is not None, "Run ID should be available for failure audit" - - # Verify configuration is still accessible for audit - assert op_config_dict is not None, "Configuration should be accessible for failure audit" - - # Verify no actual data values in error message (PII protection) - for name in ["Alice", "Bob", "Charlie"]: - assert name not in error_message, "PII values should not appear in error messages" - - -def test_encrypt_stack_trace_available_on_failure(encrypt_single_column_config): - """ - AC5: Tests that stack trace is available for debugging failed executions. 
- - Scenario: Failed execution provides stack trace for troubleshooting - Given: A configuration that will cause failure - When: The execution fails - Then: Python exception with stack trace should be raised - And: Stack trace should be available for logging (Dagster captures this) - And: Stack trace should not contain PII values - """ - df_missing_column = pd.DataFrame({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}) - - try: - run_encrypt_op(encrypt_single_column_config, df_missing_column) - pytest.fail("Should have raised ValueError") - except ValueError: - # Verify exception information is available - import traceback - - stack_trace = traceback.format_exc() - - assert "ValueError" in stack_trace, "Exception type should be in stack trace" - assert ( - "not present in the DataFrame" in stack_trace - ), "Error message should be in stack trace" - - # Verify stack trace contains code location - assert ( - "ops.py" in stack_trace or "anonymize_pseudonymize_structured" in stack_trace - ), "Stack trace should indicate error location" - - -def test_encrypt_vault_error_logged_appropriately(sample_df): - """ - AC5: Tests that Vault-related errors are logged with appropriate detail. - - Scenario: Vault connection/authentication errors are captured - Given: A configuration with invalid Vault setup - When: The execution attempts to access Vault - And: Vault access fails - Then: The system must raise an error with Vault-specific details - And: The error should indicate the Vault-related nature of the failure - - Note: This test validates error handling structure; actual Vault errors - depend on Vault availability. 
- """ - # Create a mock client that raises an exception when accessing Vault - mock_client_instance = MagicMock() - mock_client_instance.secrets.kv.v2.read_secret_version.side_effect = Exception( - "Simulated Vault authentication error" - ) - - with patch("hvac.Client", return_value=mock_client_instance): - config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["email"], key_name="test_email_key" - ) - ) - ] - ) - with pytest.raises(ValueError) as exc_info: - run_encrypt_op(config, sample_df) - - error_message = str(exc_info.value) - assert ( - "Simulated Vault authentication error" in error_message - ), "Error should indicate Vault-related failure" - - -# --------------- Additional Edge Cases & Integration Tests ---------------------------------- - - -def test_encrypt_large_dataset_performance(encrypt_single_column_config): - """ - Additional test: Validates encryption works with larger datasets. - - Tests that encryption scales to realistic dataset sizes without errors. - """ - clear_vault_key("test_email_key") - - # Create a larger dataset (1000 rows) - large_df = pd.DataFrame( - { - "id": range(1000), - "name": [f"Person{i}" for i in range(1000)], - "email": [f"person{i}@example.com" for i in range(1000)], - "age": [25 + (i % 50) for i in range(1000)], - "salary": [50000.0 + (i * 100) for i in range(1000)], - "department": ["HR", "IT", "Finance"] * 333 + ["HR"], - } - ) - - # Save original values for comparison - original_emails = large_df["email"].copy() - - result_df, metrics = run_encrypt_op(encrypt_single_column_config, large_df) - - assert len(result_df) == 1000, "All rows should be processed" - assert not result_df["email"].equals(original_emails), "All email values should be encrypted" - - -def test_encrypt_special_characters_in_data(encrypt_single_column_config): - """ - Additional test: Validates encryption handles special characters correctly. 
- - Tests that encryption works with unicode, special chars, emojis, etc. - """ - clear_vault_key("test_email_key") - - df_special = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "name": ["Müller", "José", "李明", "🙂 John"], - "email": [ - "test@müller.de", - "josé@example.com", - "李明@example.cn", - "emoji@😀.com", - ], - "age": [25, 30, 35, 40], - "salary": [50000.0, 60000.0, 70000.0, 80000.0], - "department": ["HR", "IT", "Finance", "IT"], - } - ) - - # Save original values for comparison - original_emails = df_special["email"].copy().tolist() - - result_df, metrics = run_encrypt_op(encrypt_single_column_config, df_special) - - # Verify special characters are encrypted and recoverable - key = get_vault_key("test_email_key") - f = Fernet(key) - - decrypted_emails = [f.decrypt(enc.encode()).decode() for enc in result_df["email"]] - assert set(decrypted_emails) == set( - original_emails - ), "Special characters should be preserved through encryption/decryption" - - -def test_encrypt_deterministic_within_session(sample_df, encrypt_single_column_config): - """ - Additional test: Validates encryption produces consistent results with same key. - - Note: Fernet encryption includes a timestamp, so it's NOT deterministic. - This test validates that decryption recovers the original value consistently. 
- """ - clear_vault_key("test_email_key") - - # First encryption - result_df_1, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) - - # Get the key used - key = get_vault_key("test_email_key") - f = Fernet(key) - - # Verify first encryption decrypts correctly - decrypted_1 = [f.decrypt(enc.encode()).decode() for enc in result_df_1["email"]] - assert decrypted_1 == sample_df["email"].tolist(), "Decryption should recover original values" - - # Second encryption with same key (different encrypted values due to timestamp) - result_df_2, _ = run_encrypt_op(encrypt_single_column_config, sample_df.copy()) - - # Verify second encryption also decrypts correctly - decrypted_2 = [f.decrypt(enc.encode()).decode() for enc in result_df_2["email"]] - assert ( - decrypted_2 == sample_df["email"].tolist() - ), "Decryption should consistently recover original values" - - # Note: Encrypted values will be different due to Fernet's timestamp - assert not result_df_1["email"].equals( - result_df_2["email"] - ), "Fernet encryption includes timestamp, so outputs differ" - - -def test_encrypt_empty_string_values(encrypt_single_column_config): - """ - Additional test: Validates encryption handles empty strings correctly. 
- """ - clear_vault_key("test_email_key") - - df_empty_strings = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "", "Charlie"], - "email": ["alice@example.com", "", "charlie@example.com"], - "age": [25, 30, 35], - "salary": [50000.0, 60000.0, 70000.0], - "department": ["HR", "IT", "Finance"], - } - ) - - result_df, _ = run_encrypt_op(encrypt_single_column_config, df_empty_strings) - - # Verify empty strings are encrypted - key = get_vault_key("test_email_key") - f = Fernet(key) - - decrypted_emails = [f.decrypt(enc.encode()).decode() for enc in result_df["email"]] - assert "" in decrypted_emails, "Empty strings should be encrypted and recoverable" - - -@pytest.mark.edge_case -def test_encrypt_very_long_strings(encrypt_single_column_config): - """ - Edge case: Encryption of very long string values (e.g., 10KB+) - - Validates that Fernet encryption handles large strings without truncation. - """ - clear_vault_key("test_email_key") - - # Create DataFrame with very long strings - long_string = "x" * 10000 # 10KB string - df_long_strings = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "email": [ - f"{long_string}@example.com", - "bob@example.com", - "charlie@example.com", - ], - "age": [25, 30, 35], - "salary": [50000.0, 60000.0, 70000.0], - "department": ["HR", "IT", "Finance"], - } - ) - - result_df, _ = run_encrypt_op(encrypt_single_column_config, df_long_strings) - - # Verify long string is encrypted and recoverable - key = get_vault_key("test_email_key") - f = Fernet(key) - decrypted = f.decrypt(result_df.loc[0, "email"].encode()).decode() - assert ( - decrypted == f"{long_string}@example.com" - ), "Very long strings should be encrypted and recoverable" - - -@pytest.mark.edge_case -def test_encrypt_column_with_all_identical_values(encrypt_single_column_config): - """ - Edge case: Encryption when all values in a column are identical - - Validates that encryption produces different outputs for identical inputs - (due to 
Fernet's timestamp-based nonce). - """ - clear_vault_key("test_email_key") - - df_identical = pd.DataFrame( - { - "id": [1, 2, 3, 4, 5], - "name": ["Alice"] * 5, - "email": ["same@example.com"] * 5, # All identical - "age": [30] * 5, - "salary": [60000.0] * 5, - "department": ["IT"] * 5, - } - ) - - result_df, _ = run_encrypt_op(encrypt_single_column_config, df_identical) - - # Verify all encrypted values are unique (due to Fernet timestamp) - encrypted_values = result_df["email"].tolist() - assert ( - len(set(encrypted_values)) == 5 - ), "Fernet should produce unique ciphertexts even for identical plaintexts" - - # Verify all decrypt to same original value - key = get_vault_key("test_email_key") - f = Fernet(key) - decrypted_values = [f.decrypt(enc.encode()).decode() for enc in encrypted_values] - assert all( - val == "same@example.com" for val in decrypted_values - ), "All encrypted values should decrypt to same original" - - -@pytest.mark.edge_case -def test_encrypt_whitespace_only_values(encrypt_single_column_config): - """ - Edge case: Encryption of whitespace-only values - """ - clear_vault_key("test_email_key") - - df_whitespace = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "email": [" ", "\t\t", "\n\n"], # Various whitespace - "age": [25, 30, 35], - "salary": [50000.0, 60000.0, 70000.0], - "department": ["HR", "IT", "Finance"], - } - ) - - # Store original values before encryption - original_emails = df_whitespace["email"].tolist() - - result_df, _ = run_encrypt_op(encrypt_single_column_config, df_whitespace) - - # Verify whitespace values are encrypted and recoverable - key = get_vault_key("test_email_key") - f = Fernet(key) - encrypted_emails = result_df["email"].tolist() - - for orig_ws, enc_val in zip(original_emails, encrypted_emails): - decrypted = f.decrypt(enc_val.encode()).decode() - assert ( - decrypted == orig_ws - ), f"Whitespace value {repr(orig_ws)} should be preserved, but got {repr(decrypted)}" - - 
-@pytest.mark.edge_case -@pytest.mark.parametrize( - "column_type,test_values", - [ - ("integer", [1, 2, 3, 4, 5]), - ("float", [1.1, 2.2, 3.3, 4.4, 5.5]), - ("string", ["a", "b", "c", "d", "e"]), - ], -) -def test_encrypt_various_data_types(column_type, test_values): - """ - Parameterized test: Encryption across different pandas data types - """ - clear_vault_key("test_type_key") - - df = pd.DataFrame( - { - "id": range(len(test_values)), - "test_column": test_values, - "name": ["Person"] * len(test_values), - "email": ["test@example.com"] * len(test_values), - "age": [30] * len(test_values), - "salary": [60000.0] * len(test_values), - "department": ["IT"] * len(test_values), - } - ) - - config = AnonymisePseudonymizeStructuredConfig( - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", columns=["test_column"], key_name="test_type_key" - ) - ) - ] - ) - - result_df, _ = run_encrypt_op(config, df) - - # Verify encryption occurred (values changed to strings) - assert ( - result_df["test_column"].dtype == object - ), f"Encrypted {column_type} should become object type" - - # Verify decryption recovers original values - key = get_vault_key("test_type_key") - f = Fernet(key) - for idx, orig_val in enumerate(test_values): - decrypted = f.decrypt(result_df.loc[idx, "test_column"].encode()).decode() - assert decrypted == str( - orig_val - ), f"Decrypted value should match original {column_type} value" diff --git a/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py b/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py deleted file mode 100644 index 8d6a3cc..0000000 --- a/tests/field_level_pseudo_anonymisation/test_encrypt_unstructured.py +++ /dev/null @@ -1,853 +0,0 @@ -""" -Test suite for field-level pseudonymisation operations on unstructured data. 
- -This test suite validates the pseudonymisation of unstructured text with PII detection, -covering the following Acceptance Criteria: - -## Test Coverage Summary - -### Acceptance Criteria Coverage: -- AC1 (Pseudonymisation and Retention Applied Correctly): 8 tests -- AC2 (Invalid Execution Handling): 5 tests -- AC3 (Execution Audit & Logging - Positive Scenario): 3 tests -- AC4 (Execution Audit & Logging - Negative Scenario): 4 tests -- Additional Coverage: 3 tests - -### Test Pattern: -- Each test uses build_op_context with config_to_dagster_dict for configuration -- Tests validate dual outputs (data, metrics) -- Vault access is mocked for isolation -- Tests validate Scrubadub automatic PII detection -- Tests ensure placeholder replacement for unconfigured PII -""" - -import pytest -import re -from dagster import build_op_context -from unittest.mock import patch, MagicMock - -from template_code_location.field_level_pseudo_anonymisation.config_models.unstructured_config import ( - AnonymisePseudonymizeUnstructuredConfig, - EncryptConfig, - RetainConfig, - PseudoTechniqueConfig, -) -from template_code_location.field_level_pseudo_anonymisation.config_models import PIIEntityEnum, LanguageEnum -from template_code_location.field_level_pseudo_anonymisation.unstructured_ops import ( - anonymize_pseudonymize_unstructured, -) - -from .conftest import clear_vault_key - - -def config_to_dagster_dict_unstructured(config): - """Convert unstructured config to Dagster format.""" - config_dict = {"language": config.language.value, "used_function": []} - - for func_config in config.used_function: - technique = func_config.technique - technique_type = technique.type - technique_dict = technique.model_dump() - - if "pii" in technique_dict: - technique_dict["pii"] = [pii_enum.name for pii_enum in technique.pii] - - technique_dict_without_type = {k: v for k, v in technique_dict.items() if k != "type"} - - config_dict["used_function"].append( - {"technique": {technique_type: 
technique_dict_without_type}} - ) - - return config_dict - - -def run_unstructured_op(config, text): - """ - Helper to run unstructured pseudonymisation op. - - Returns: - tuple: (result_text: str, metrics_markdown: str) - """ - context = build_op_context(op_config=config_to_dagster_dict_unstructured(config)) - result_text, metrics = anonymize_pseudonymize_unstructured(context, text=text) - - # Extract actual values from Output objects - return result_text.value, metrics.value - - -def parse_metrics_markdown(metrics_md: str) -> dict: - """ - Parse markdown metrics into structured dict for easier testing. - - Args: - metrics_md: Markdown metrics string from op output - - Returns: - dict with keys: total_pii_detected, pii_by_type, techniques_applied, language - """ - result = { - "total_pii_detected": 0, - "pii_by_type": {}, - "techniques_applied": {}, - "language": "", - } - - # Extract total PII detected - total_match = re.search(r"\*\*Total PII Detected\*\*:\s*(\d+)", metrics_md) - if total_match: - result["total_pii_detected"] = int(total_match.group(1)) - - # Extract language - lang_match = re.search(r"\*\*Language\*\*:\s*(\w+)", metrics_md) - if lang_match: - result["language"] = lang_match.group(1) - - # Extract PII by type from table - pii_table_section = re.search( - r"### PII by Type\n\| Entity Type \| Count \|\n\|[^\n]+\n((?:\|[^\n]+\n)+)", - metrics_md, - ) - if pii_table_section: - for line in pii_table_section.group(1).strip().split("\n"): - parts = [p.strip() for p in line.split("|") if p.strip()] - if len(parts) == 2: - entity_type, count = parts - result["pii_by_type"][entity_type] = int(count) - - # Extract techniques applied - techniques_section = re.search(r"### Techniques Applied\n((?:- \*\*[^\n]+\n)+)", metrics_md) - if techniques_section: - for line in techniques_section.group(1).strip().split("\n"): - tech_match = re.match(r"-\s*\*\*(.+?)\*\*:\s*(.+)", line) - if tech_match: - pii_type, technique = tech_match.groups() - 
result["techniques_applied"][pii_type] = technique - - return result - - -# -------------------------------- Fixtures ---------------------------------------- - - -@pytest.fixture -def sample_text_en(): - """English text with various PII types.""" - return """ - John Smith works at Acme Corporation. His email is john.smith@example.com - and his phone number is +1-555-123-4567. He lives in New York City at - 123 Main Street, Apartment 4B. His SSN is 123-45-6789. - """ - - -@pytest.fixture -def sample_text_multi_person(): - """Text with multiple person names.""" - return """ - The meeting included Alice Johnson, Bob Williams, and Charlie Brown. - They discussed the project with Maria Garcia and David Wilson. - """ - - -@pytest.fixture -def sample_text_mixed_pii(): - """Text with multiple PII types for AC1 comprehensive testing.""" - return """ - Contact Information: - Name: Dr. Emily Watson - Email: emily.watson@hospital.com - Phone: +44-20-7946-0958 - Website: https://patient-portal.hospital.com/records - """ - - -@pytest.fixture -def encrypt_person_config(): - """Configuration to encrypt PERSON entities.""" - return AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_person_key", - ) - ) - ], - ) - - -@pytest.fixture -def retain_person_config(): - """Configuration to retain PERSON entities unchanged.""" - return AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig(technique=RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])) - ], - ) - - -@pytest.fixture -def mixed_technique_config(): - """Configuration with encryption and retention for AC1 testing.""" - return AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON, 
PIIEntityEnum.EMAIL], - key_name="test_mixed_key", - ) - ), - PseudoTechniqueConfig( - technique=RetainConfig(type="retain", pii=[PIIEntityEnum.PHONE_NUMBERS]) - ), - ], - ) - - -# ================================================================================================ -# AC1: Pseudonymisation and Retention Are Applied Correctly -# ================================================================================================ - - -def test_ac1_encrypt_configured_pii_types(sample_text_mixed_pii, encrypt_person_config): - """AC1: Test that configured PII types are encrypted correctly.""" - clear_vault_key("test_person_key") - - result_text, metrics_md = run_unstructured_op(encrypt_person_config, sample_text_mixed_pii) - metrics = parse_metrics_markdown(metrics_md) - - # Verify person name is encrypted (not in plaintext) - assert "Emily Watson" not in result_text, "Configured PERSON PII should be encrypted" - - # Verify encryption token is present - assert "{encrypt:" in result_text, "Encrypted token should be present in result" - - # Verify PII was detected and processed - assert metrics["total_pii_detected"] > 0, "System should detect PII entities" - assert "PERSON" in metrics["pii_by_type"], "PERSON type should be in detected PII" - - # Verify text structure is preserved (surrounding text intact) - assert "Contact Information:" in result_text, "Non-PII text structure should be preserved" - - -def test_ac1_retain_configured_pii_unchanged(sample_text_multi_person): - """AC1: Test that PII types marked for retention remain unchanged.""" - retain_config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig(technique=RetainConfig(type="retain", pii=[PIIEntityEnum.PERSON])) - ], - ) - - result_text, metrics_md = run_unstructured_op(retain_config, sample_text_multi_person) - metrics = parse_metrics_markdown(metrics_md) - - # Verify retained PII types remain in plaintext - assert "Alice Johnson" in 
result_text, "Retained PERSON PII should remain unchanged" - assert "Bob Williams" in result_text, "Retained PERSON PII should remain unchanged" - - # Verify technique applied is 'retain' - assert ( - "retain" in metrics["techniques_applied"].get("PERSON", "").lower() - ), "Retain technique should be recorded for PERSON type" - - -def test_ac1_unconfigured_pii_replaced_with_placeholders(sample_text_mixed_pii): - """AC1: Test that unconfigured PII types are replaced with placeholders.""" - encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_person_only_key", - ) - ) - ], - ) - - clear_vault_key("test_person_only_key") - - result_text, metrics_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii) - - # Verify person is encrypted (configured) - assert "Emily Watson" not in result_text, "Configured PERSON should be encrypted" - - # Verify unconfigured PII types have placeholders - assert ( - "{{" in result_text and "}}" in result_text - ), "Unconfigured PII should be replaced with placeholders" - - # Verify original unconfigured PII values are not in result - assert ( - "emily.watson@hospital.com" not in result_text - ), "Unconfigured EMAIL should be replaced with placeholder" - - # Verify placeholder format - assert ( - "{{EMAIL}}" in result_text or "{{URL}}" in result_text - ), "Placeholders should indicate entity type" - - -def test_ac1_mixed_techniques_applied_correctly(sample_text_mixed_pii, mixed_technique_config): - """AC1: Test that multiple techniques (encrypt, retain) are applied correctly.""" - clear_vault_key("test_mixed_key") - - result_text, metrics_md = run_unstructured_op(mixed_technique_config, sample_text_mixed_pii) - metrics = parse_metrics_markdown(metrics_md) - - # Verify encrypted PII types (PERSON, EMAIL) - assert "Emily Watson" not in result_text, "Configured 
PERSON should be encrypted" - assert "emily.watson@hospital.com" not in result_text, "Configured EMAIL should be encrypted" - - # Verify retained PII type (PHONE_NUMBERS) - assert "+44-20-7946-0958" in result_text, "Configured PHONE_NUMBERS should be retained" - - # Verify metrics reflect different techniques - assert ( - "encrypt" in metrics["techniques_applied"].get("PERSON", "").lower() - ), "Encrypt technique should be applied to PERSON" - assert ( - "encrypt" in metrics["techniques_applied"].get("EMAIL", "").lower() - ), "Encrypt technique should be applied to EMAIL" - assert ( - "retain" in metrics["techniques_applied"].get("PHONE_NUMBERS", "").lower() - ), "Retain technique should be applied to PHONE_NUMBERS" - - -def test_ac1_multiple_instances_same_pii_type(sample_text_multi_person, encrypt_person_config): - """AC1: Test that all instances of a configured PII type are processed.""" - clear_vault_key("test_person_key") - - result_text, metrics_md = run_unstructured_op(encrypt_person_config, sample_text_multi_person) - metrics = parse_metrics_markdown(metrics_md) - - # Verify all person names are encrypted - person_names = [ - "Alice Johnson", - "Bob Williams", - "Charlie Brown", - "Maria Garcia", - "David Wilson", - ] - for name in person_names: - assert name not in result_text, f"All PERSON instances should be encrypted: {name}" - - # Verify metrics count multiple instances - assert metrics["pii_by_type"].get("PERSON", 0) >= len( - person_names - ), f"Should detect at least {len(person_names)} PERSON entities" - - -def test_ac1_empty_text_returns_empty(encrypt_person_config): - """AC1: Test that empty or null text input raises a ValueError.""" - clear_vault_key("test_person_key") - - with pytest.raises(ValueError) as exc_info: - run_unstructured_op(encrypt_person_config, "") - - assert "empty" in str(exc_info.value).lower(), "Error should indicate empty input" - - -def test_ac1_text_without_pii_remains_unchanged(): - """AC1: Test that text without any PII 
remains unchanged after processing.""" - no_pii_text = """ - The weather today is sunny with a high of 25 degrees Celsius. - The conference starts at 9:00 AM in Room 301. - """ - - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_no_pii_key", - ) - ) - ], - ) - - clear_vault_key("test_no_pii_key") - - result_text, metrics_md = run_unstructured_op(config, no_pii_text) - metrics = parse_metrics_markdown(metrics_md) - - assert result_text.strip() == no_pii_text.strip(), "Text without PII should remain unchanged" - assert metrics["total_pii_detected"] == 0, "No PII should be detected" - - -def test_ac1_placeholder_format_indicates_entity_type(sample_text_mixed_pii): - """AC1: Test that placeholders for unconfigured PII indicate the entity type.""" - encrypt_person_only = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_placeholder_key", - ) - ) - ], - ) - - clear_vault_key("test_placeholder_key") - - result_text, metrics_md = run_unstructured_op(encrypt_person_only, sample_text_mixed_pii) - metrics = parse_metrics_markdown(metrics_md) - - # Verify placeholder format (scrubadub uses {{TYPE}} format) - placeholder_pattern = r"\{\{[A-Z_]+\}\}" - placeholders = re.findall(placeholder_pattern, result_text) - - assert ( - len(placeholders) > 0 - ), "Result should contain entity-type placeholders for unconfigured PII" - - # Verify metrics track which PII types were detected - assert len(metrics["pii_by_type"]) > 0, "Metrics should list detected PII types" - - -# ================================================================================================ -# AC2: Invalid Execution Handling -# 
================================================================================================ - - -def test_ac2_graceful_abort_on_scrubadub_failure(): - """AC2: Test graceful abort when the PII detection engine (Scrubadub) fails.""" - text = "Test user John Smith with email john@example.com" - - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_abort_key", - ) - ) - ], - ) - - clear_vault_key("test_abort_key") - - # Mock Scrubadub to fail at the right import path - with patch( - "field_level_pseudo_anonymisation.unstructured_ops.scrubadub.Scrubber" - ) as mock_scrubber_class: - mock_scrubber = MagicMock() - mock_scrubber.iter_filth.side_effect = RuntimeError("Scrubadub internal error") - mock_scrubber_class.return_value = mock_scrubber - - with pytest.raises(RuntimeError) as exc_info: - run_unstructured_op(config, text) - - error_msg = str(exc_info.value).lower() - assert ( - "pii" in error_msg - or "detection" in error_msg - or "scrubadub" in error_msg - or "failed" in error_msg - ), "Error message should indicate PII detection failure" - - -def test_ac2_graceful_abort_on_encryption_failure(sample_text_en): - """AC2: Test graceful abort when an encryption technique fails during execution.""" - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_encrypt_fail_key", - ) - ) - ], - ) - - clear_vault_key("test_encrypt_fail_key") - - # Mock encrypt function at correct path - it's imported from techniques module - encrypt_path = ( - "field_level_pseudo_anonymisation" - ".techniques.anonymisation_pseudonymisation_techniques.encrypt" - ) - with patch(encrypt_path) as mock_encrypt: - mock_encrypt.side_effect = Exception("Encryption algorithm failure") - 
- with pytest.raises(RuntimeError) as exc_info: - run_unstructured_op(config, sample_text_en) - - error_msg = str(exc_info.value).lower() - assert ( - "encrypt" in error_msg or "failed" in error_msg or "technique" in error_msg - ), "Error message should indicate encryption failure" - - -def test_ac2_null_text_input_raises_error(encrypt_person_config): - """AC2: Test that a null (None) text input is rejected with an error.""" - clear_vault_key("test_person_key") - - # Dagster will raise DagsterTypeCheckDidNotPass before op executes - from dagster import DagsterTypeCheckDidNotPass - - with pytest.raises((ValueError, DagsterTypeCheckDidNotPass, TypeError)): - run_unstructured_op(encrypt_person_config, None) - - -def test_ac2_invalid_language_configuration(): - """AC2: Test that an unsupported language in the config raises a validation error.""" - # This should fail at config creation due to Pydantic validation - with pytest.raises((ValueError, TypeError)): - AnonymisePseudonymizeUnstructuredConfig( - language="invalid_lang", # Should fail Pydantic validation - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", pii=[PIIEntityEnum.PERSON], key_name="test_key" - ) - ) - ], - ) - - -def test_ac2_very_large_text_processing(): - """AC2: Test that very large text inputs are processed successfully without memory errors.""" - # Create large text with repeated PII patterns - large_text = ( - """ - John Smith works at company. Email: john.smith@example.com. 
- """ - * 1000 - ) # ~60KB of text with repeated PII - - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL], - key_name="test_large_text_key", - ) - ) - ], - ) - - clear_vault_key("test_large_text_key") - - result_text, metrics_md = run_unstructured_op(config, large_text) - metrics = parse_metrics_markdown(metrics_md) - - # Verify processing completed - assert result_text is not None, "Large text should be processed successfully" - assert len(result_text) > 0, "Result should not be empty" - assert metrics["total_pii_detected"] > 0, "PII should be detected in large text" - - -# ================================================================================================ -# AC3: Execution Audit & Logging - Positive Scenario -# ================================================================================================ - - -def test_ac3_successful_execution_logs_timestamp_and_run_id(sample_text_en, encrypt_person_config): - """AC3: Test that successful execution context contains a run ID for logging.""" - clear_vault_key("test_person_key") - - op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config) - context = build_op_context(op_config=op_config_dict) - - # Capture run context - run_id = context.run_id - - # Execute operation - result_text, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_en) - - # Verify run identifier is available for logging - assert run_id is not None, "Run ID must be available for audit logging" - - # Verify outputs are returned (for Dagster to log) - assert result_text is not None, "Result text should be available for logging" - assert metrics is not None, "Metrics should be available for logging" - - -def test_ac3_successful_execution_logs_configuration_parameters( - sample_text_en, mixed_technique_config -): - """AC3: Test that the 
used configuration is accessible for logging on success.""" - clear_vault_key("test_mixed_key") - - op_config_dict = config_to_dagster_dict_unstructured(mixed_technique_config) - context = build_op_context(op_config=op_config_dict) - - result_text, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_en) - - # Verify configuration is captured and accessible - assert "used_function" in op_config_dict, "Configuration must be accessible for logging" - assert len(op_config_dict["used_function"]) == 2, "Multiple techniques should be captured" - - # Verify techniques are logged - techniques = [func["technique"] for func in op_config_dict["used_function"]] - assert any( - "encrypt" in str(tech) for tech in techniques - ), "Encrypt technique should be in configuration" - assert any( - "retain" in str(tech) for tech in techniques - ), "Retain technique should be in configuration" - - # Verify metrics contain technique information (in markdown string) - metrics_str = metrics.value - assert ( - "Techniques Applied" in metrics_str - ), "Applied techniques should be in metrics for logging" - - -def test_ac3_successful_execution_logs_no_raw_pii(sample_text_mixed_pii, encrypt_person_config): - """AC3: Test that logs and metrics from a successful run do not contain raw PII.""" - clear_vault_key("test_person_key") - - op_config_dict = config_to_dagster_dict_unstructured(encrypt_person_config) - context = build_op_context(op_config=op_config_dict) - - result_text, metrics = anonymize_pseudonymize_unstructured(context, text=sample_text_mixed_pii) - - # Verify raw PII values are not in metrics - metrics_str = metrics.value - - sensitive_values = ["Emily Watson", "emily.watson@hospital.com", "+44-20-7946-0958"] - - for pii_value in sensitive_values: - assert ( - pii_value not in metrics_str - ), f"Raw PII value should not appear in metrics: {pii_value}" - - # Verify configuration logs do not contain raw PII - config_str = str(op_config_dict) - for pii_value in 
sensitive_values: - assert ( - pii_value not in config_str - ), f"Raw PII value should not appear in configuration logs: {pii_value}" - - -# ================================================================================================ -# AC4: Execution Audit & Logging - Negative Scenario -# ================================================================================================ - - -def test_ac4_failed_execution_logs_error_details(): - """AC4: Negative execution should surface clear error details (encryption key failure).""" - text = "Test user John Smith" - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_fail_log_key", - ) - ) - ], - ) - clear_vault_key("test_fail_log_key") - ctx = build_op_context(op_config=config_to_dagster_dict_unstructured(config)) - - # Patch the key retrieval used inside unstructured_ops to force failure - with patch( - "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key", - side_effect=RuntimeError("Encryption key retrieval failed"), - ): - with pytest.raises(RuntimeError) as exc_info: - # Consume the generator to trigger execution and raise the exception - list(anonymize_pseudonymize_unstructured(ctx, text=text)) - - msg = str(exc_info.value).lower() - assert "key" in msg and "failed" in msg, "Error message should mention key failure" - - -def test_ac4_failed_execution_logs_configuration_used(): - """AC4: Test that the attempted configuration is available for logging on failure.""" - text = "Test data with person John Doe" - - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_config_fail_key", - ) - ) - ], - ) - - clear_vault_key("test_config_fail_key") - - op_config_dict = 
config_to_dagster_dict_unstructured(config) - context = build_op_context(op_config=op_config_dict) - - # Mock _initialize_scrubber to fail - with patch( - "field_level_pseudo_anonymisation.unstructured_ops._initialize_scrubber" - ) as mock_init_scrubber: - mock_init_scrubber.side_effect = Exception("Scrubber module not available") - - with pytest.raises((RuntimeError, Exception)) as exc_info: - list(anonymize_pseudonymize_unstructured(context, text=text)) - - # Verify configuration is still accessible despite failure - assert op_config_dict is not None, "Configuration must be accessible for failure audit" - assert ( - "used_function" in op_config_dict - ), "Technique configuration should be available for diagnosis" - - # Verify error was raised with proper message - error_msg = str(exc_info.value).lower() - assert ( - "pii" in error_msg - or "detection" in error_msg - or "failed" in error_msg - or "scrubber" in error_msg - or "module" in error_msg - ), "Error should indicate detection/processing failed" - - -def test_ac4_failed_execution_logs_failure_reason(): - """AC4: Test that the reason for a failure is clearly indicated in the error message.""" - text = "User: Alice Smith, Email: alice@example.com" - - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.en, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL], - key_name="test_failure_reason_key", - ) - ) - ], - ) - - clear_vault_key("test_failure_reason_key") - - # Mock key retrieval function to fail - with patch( - "field_level_pseudo_anonymisation.unstructured_ops.create_get_encryption_key" - ) as mock_get_key: - mock_get_key.side_effect = RuntimeError("Vault connection timeout") - - with pytest.raises(RuntimeError) as exc_info: - run_unstructured_op(config, text) - - # Verify failure reason is in error message - error_msg = str(exc_info.value).lower() - assert ( - "encrypt" in error_msg - or "key" in 
error_msg - or "timeout" in error_msg - or "failed" in error_msg - ), "Error should indicate key retrieval/encryption failure" - - -# ================================================================================================ -# Additional Tests - Edge Cases and Integration -# ================================================================================================ - - -def test_multi_language_support_italian(): - """Additional test: Verify that Italian text is processed correctly.""" - italian_text = """ - Il dottor Marco Rossi lavora presso l'ospedale. - Email: marco.rossi@ospedale.it - Telefono: +39-06-12345678 - """ - - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.it, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON], - key_name="test_italian_key", - ) - ) - ], - ) - - clear_vault_key("test_italian_key") - - result_text, metrics_md = run_unstructured_op(config, italian_text) - metrics = parse_metrics_markdown(metrics_md) - - # Verify processing occurred - assert result_text != italian_text, "Italian text should be processed" - assert metrics["total_pii_detected"] > 0, "PII should be detected in Italian text" - - -def test_special_characters_in_text(): - """Additional test: Verify handling of text with special Unicode characters.""" - special_text = """ - User: João da Silva 🇧🇷 - Email: joão@empresa.com.br - Message: "Hello, World!" 
— Testing special chars: €, £, ¥, ©, ® - """ - - config = AnonymisePseudonymizeUnstructuredConfig( - language=LanguageEnum.pt, - used_function=[ - PseudoTechniqueConfig( - technique=EncryptConfig( - type="encrypt", - pii=[PIIEntityEnum.PERSON, PIIEntityEnum.EMAIL], - key_name="test_special_chars_key", - ) - ) - ], - ) - - clear_vault_key("test_special_chars_key") - - result_text, metrics_md = run_unstructured_op(config, special_text) - - # Verify processing completed without encoding errors - assert result_text is not None, "Special characters should not cause processing failure" - assert len(result_text) > 0, "Result should not be empty" - - -def test_deterministic_encryption_within_session(sample_text_en, encrypt_person_config): - """Additional test: Verify encryption format consistency across runs.""" - clear_vault_key("test_person_key") - - result1, metrics_md1 = run_unstructured_op(encrypt_person_config, sample_text_en) - result2, metrics_md2 = run_unstructured_op(encrypt_person_config, sample_text_en) - - # Both should have encryption tokens - assert "{encrypt:" in result1, "First run should produce encrypted tokens" - assert "{encrypt:" in result2, "Second run should produce encrypted tokens" - - # Verify consistent PII detection - metrics1 = parse_metrics_markdown(metrics_md1) - metrics2 = parse_metrics_markdown(metrics_md2) - - assert ( - metrics1["total_pii_detected"] == metrics2["total_pii_detected"] - ), "PII detection should be consistent across runs" - - # Verify token format is consistent (Fernet base64 pattern) - token_pattern = r"\{encrypt:gAAAAAB[A-Za-z0-9+/=_-]+\}" - tokens1 = re.findall(token_pattern, result1) - tokens2 = re.findall(token_pattern, result2) - - assert len(tokens1) == len(tokens2), "Same number of encryption tokens should be generated" diff --git a/tests/field_level_pseudo_anonymisation/test_jobs.py b/tests/field_level_pseudo_anonymisation/test_jobs.py deleted file mode 100644 index 616c3d5..0000000 --- 
a/tests/field_level_pseudo_anonymisation/test_jobs.py +++ /dev/null @@ -1,58 +0,0 @@ -from template_code_location.field_level_pseudo_anonymisation.jobs import ( - anonymize_pseudonymize_structured_job, - anonymize_pseudonymize_structured_job_s3, - depseudonymize_structured_job, - depseudonymize_structured_job_s3, - anonymize_pseudonymize_unstructured_job_s3, - anonymize_pseudonymize_unstructured_job, - depseudonymize_unstructured_job_s3, - depseudonymize_unstructured_job -) - - -def test_anonymize_pseudonymize_structured_job_is_callable(): - """Test anonymize_pseudonymize_structured_job is a valid Dagster job""" - assert callable(anonymize_pseudonymize_structured_job) - assert hasattr(anonymize_pseudonymize_structured_job, 'execute_in_process') - - -def test_anonymize_pseudonymize_structured_job_s3_is_callable(): - """Test anonymize_pseudonymize_structured_job_s3 is a valid Dagster job""" - assert callable(anonymize_pseudonymize_structured_job_s3) - assert hasattr(anonymize_pseudonymize_structured_job_s3, 'execute_in_process') - - -def test_depseudonymize_structured_job_is_callable(): - """Test depseudonymize_structured_job is a valid Dagster job""" - assert callable(depseudonymize_structured_job) - assert hasattr(depseudonymize_structured_job, 'execute_in_process') - - -def test_depseudonymize_structured_job_s3_is_callable(): - """Test depseudonymize_structured_job_s3 is a valid Dagster job""" - assert callable(depseudonymize_structured_job_s3) - assert hasattr(depseudonymize_structured_job_s3, 'execute_in_process') - - -def test_anonymize_pseudonymize_unstructured_job_is_callable(): - """Test anonymize_pseudonymize_unstructured_job is a valid Dagster job""" - assert callable(anonymize_pseudonymize_unstructured_job) - assert hasattr(anonymize_pseudonymize_unstructured_job, 'execute_in_process') - - -def test_anonymize_pseudonymize_unstructured_job_s3_is_callable(): - """Test anonymize_pseudonymize_unstructured_job_s3 is a valid Dagster job""" - assert 
callable(anonymize_pseudonymize_unstructured_job_s3) - assert hasattr(anonymize_pseudonymize_unstructured_job_s3, 'execute_in_process') - - -def test_depseudonymize_unstructured_job_is_callable(): - """Test depseudonymize_unstructured_job is a valid Dagster job""" - assert callable(depseudonymize_unstructured_job) - assert hasattr(depseudonymize_unstructured_job, 'execute_in_process') - - -def test_depseudonymize_unstructured_job_s3_is_callable(): - """Test depseudonymize_unstructured_job_s3 is a valid Dagster job""" - assert callable(depseudonymize_unstructured_job_s3) - assert hasattr(depseudonymize_unstructured_job_s3, 'execute_in_process') From 3ff92fc1134fd79871c2136b222a5caf0136e4ba Mon Sep 17 00:00:00 2001 From: ILay Date: Wed, 6 May 2026 11:57:52 +0200 Subject: [PATCH 13/15] pin code-locations to develop --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5eb1ab4..2d1fc57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,9 +53,9 @@ exclude-dependencies = ["transformers", "spacy-transformers"] [tool.uv.sources] torch = { index = "pytorch-cpu" } util-services = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/util-services.git", rev = "v0.5.0" } -data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", branch = "feature/SIMPL-24642" } -dataframe-level-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", branch = "feature/SIMPL-24642" } -field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", branch = "feature/SIMPL-24642" } +data-processing = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/data-processing.git", branch = "develop" } +dataframe-level-anonymisation = { git = 
"https://code.europa.eu/simpl/simpl-open/development/data-services/dataframe-level-anonymisation.git", branch = "develop" } +field-level-pseudo-anonymisation = { git = "https://code.europa.eu/simpl/simpl-open/development/data-services/field-level-pseudo-anonymisation.git", branch = "develop" } [[tool.uv.index]] name = "pytorch-cpu" From 9aaee17d20fb4ffea9d31454f7f1885901cb16d8 Mon Sep 17 00:00:00 2001 From: ILay Date: Wed, 6 May 2026 15:12:52 +0200 Subject: [PATCH 14/15] clean dependencies --- pyproject.toml | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2d1fc57..f85da15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,35 +10,6 @@ requires-python = ">=3.12" dependencies = [ # Dagster core "dagster>=1.8.13", - "dagster-webserver>=1.8.13", - "dagster-postgres>=0.24.13", - # Data processing - "pandas>=2.1.4", - "pyarrow>=23.0", - "numpy>=2.0.1", - "lxml>=6.0", - "xmltodict>=1.0", - "rdflib>=7.6", - "openpyxl>=3.1.0", - "xlrd>=2.0.1", - "tabulate>=0.9", - "pyspellchecker>=0.8.4", - "PyGeodesy>=24.6.11", - # Validation - "great_expectations>=1.16", - "pandera>=0.31", - "pydantic>=2.6.0,<3.0.0", - # Scraping - "scrapy>=2.15", - "BeautifulSoup4>=4.14", - # Anonymisation libraries - "pycanon==1.0.1.post2", - "anjana>=1.0.0", - # Field-level pseudo-anonymisation - "scrubadub>=2.0.0", - "scrubadub_spacy>=1.0.0", - "hvac>=2.0.0", - "cryptography>=42.0.0", # Util services — resolved via [tool.uv.sources] (git) "util-services", # Code location packages — resolved via [tool.uv.sources] (git) From 9ebba755ad2302a994c8aee4388cc368d8917cee Mon Sep 17 00:00:00 2001 From: ILay Date: Wed, 6 May 2026 15:18:02 +0200 Subject: [PATCH 15/15] update Development Guide to clarify project layout and external dependencies --- documents/Development Guide.md | 97 ++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/documents/Development Guide.md b/documents/Development 
Guide.md index 23c60d7..6582768 100644 --- a/documents/Development Guide.md +++ b/documents/Development Guide.md @@ -9,81 +9,100 @@ By following a *code-first approach*, developers ensure consistency, traceabilit Development must always begin in a local environment. This allows developers to rapidly iterate, test business logic, and validate DAG (Directed Acyclic Graph) structures without impacting production data. ### 2.1 Project Layout -This repository (`template-code-location`) serves as the **single consolidated code location** for all data services workflows. It contains the jobs, ops, and configurations previously spread across `data-processing`, `dataframe-level-anonymisation`, and `field-level-pseudo-anonymisation`. +This repository (`template-code-location`) serves as the **single consolidated code location** for all data services workflows. It imports jobs and ops from three external packages (`data-processing`, `dataframe-level-anonymisation`, and `field-level-pseudo-anonymisation`) which are installed as Git dependencies, and also provides a place for custom template jobs/ops. 
```text template-code-location/ ├── src/ │ └── template_code_location/ +│ ├── __init__.py │ ├── repository.py # Unified entry point (all jobs/sensors/resources) -│ ├── data_processing/ # Data cleaning & transformation ops/jobs -│ │ ├── config_models/ -│ │ ├── jobs.py -│ │ └── ops.py -│ ├── dataframe_level_anonymisation/ # k-anonymity, l-diversity, t-closeness -│ │ ├── config_models/ -│ │ ├── jobs.py -│ │ ├── ops.py -│ │ └── utils.py -│ ├── field_level_pseudo_anonymisation/ # Field-level encryption/hashing/redaction -│ │ ├── config_models/ -│ │ ├── techniques/ -│ │ ├── jobs.py -│ │ ├── ops.py -│ │ ├── unstructured_ops.py -│ │ └── utils.py -│ ├── jobs/ # Template example jobs -│ └── ops/ # Template example ops -├── tests/ # All tests (migrated from source repos) +│ ├── jobs/ # Custom jobs specific to this code location +│ │ ├── __init__.py +│ │ └── jobs.py +│ └── ops/ # Custom ops specific to this code location +│ ├── __init__.py +│ └── ops.py +├── tests/ # Unit & integration tests ├── Dockerfile -├── pyproject.toml +├── pyproject.toml # Dependencies & external package sources └── README.md ``` -### 2.2 Code Examples (Ops, Jobs, and Definitions) +### 2.2 External Dependencies (Git Packages) + +The heavy-lifting logic lives in separate repositories, pulled in as installable Python packages via `pyproject.toml` and `[tool.uv.sources]`: + +| Package | Purpose | Source | +|---------|---------|--------| +| `data-processing` | Data cleaning & transformation jobs | Git (branch: `develop`) | +| `dataframe-level-anonymisation` | k-anonymity, l-diversity, t-closeness | Git (branch: `develop`) | +| `field-level-pseudo-anonymisation` | Field-level encryption/hashing/redaction | Git (branch: `develop`) | +| `util-services` | Shared resources, sensors, and logging | Git (tag: `v0.5.0`) | + +These packages expose their jobs and ops which are then imported and registered in `repository.py`. 
+ +### 2.3 Code Examples (Ops, Jobs, and Definitions) The orchestration logic should be modular. Here is a practical example of how to construct a workflow. -**1. Defining Ops (ops.py)** +**1. Defining Ops (`ops/ops.py`)** Ops are the core units of computation. Keep them focused on a single task. + ```python from dagster import op @op -def fetch_raw_data() -> list: - """Fetches raw data from an external source.""" +def fetch_data() -> list: + """Fetches raw data from a source.""" return [{"id": 1, "value": "A"}, {"id": 2, "value": "B"}] @op def process_data(data: list) -> dict: - """Transforms raw data into an aggregated format.""" - return {"processed_count": len(data), "status": "success"} + """Processes raw data and returns a summary.""" + return {"count": len(data), "status": "success"} ``` -**2. Assembling Jobs (jobs.py)** + +**2. Assembling Jobs (`jobs/jobs.py`)** Jobs link ops together to form a dependency graph (workflow). + ```python from dagster import job -from .ops import fetch_raw_data, process_data +from ..ops.ops import fetch_data, process_data @job def data_processing_job(): - """A workflow that fetches and processes data.""" - raw_data = fetch_raw_data() - process_data(raw_data) + """A simple job that fetches and processes data.""" + raw = fetch_data() + process_data(raw) ``` -**3. Registering Definitions (repository.py)** -This file acts as the entry point for the Simpl-Open orchestration platform to discover your code. + +**3. Registering Definitions (`repository.py`)** +This file acts as the entry point for the Simpl-Open orchestration platform to discover your code. It imports jobs from local modules as well as from external packages. 
+ ```python from dagster import Definitions -from .jobs import data_processing_job +from util_services.resources import s3_resource +from util_services.sensors import notify_success, notify_failure, notify_canceled +from util_services.custom_json_logger import simpl_json_logger + +# External package jobs +from data_processing.jobs import remove_duplicates_job_s3, fill_missing_values_job_s3 +from dataframe_level_anonymisation.jobs import k_anonymity_job_s3, l_diversity_job_s3 +from field_level_pseudo_anonymisation.jobs import anonymize_pseudonymize_structured_job_s3 + +# Local template jobs +from template_code_location.jobs.jobs import data_processing_job -# The platform will load this Definitions object defs = Definitions( - jobs=[data_processing_job] - # You can also declare schedules, sensors, and resources here + jobs=[data_processing_job, remove_duplicates_job_s3, ...], + sensors=[notify_success, notify_failure, notify_canceled], + resources={"s3": s3_resource.configured({"resource_name": "selfS3"})}, + loggers={"simpl": simpl_json_logger}, ) ``` -### 2.3 Best Practices & Constraints +### 2.4 Best Practices & Constraints + - **Separation of Concerns**: Keep orchestration logic (how ops connect) strictly separate from heavy business logic (which should ideally live in separate Python modules/classes). - **Naming Conventions**: Use snake_case for jobs and ops. Code locations should be named based on the domain they represent (e.g., inventory_sync_service). - **Dependency Management**: All dependencies must be explicitly declared in pyproject.toml or requirements.txt.