feat(SIMPL-24642): migrate tests from 3 source repos with updated imports
This commit is contained in:
1
tests/data_processing/__init__.py
Normal file
1
tests/data_processing/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
53
tests/data_processing/conftest.py
Normal file
53
tests/data_processing/conftest.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Pytest configuration and shared fixtures."""
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from unittest.mock import MagicMock, patch
|
||||
import sys
|
||||
from dagster import build_op_context
|
||||
|
||||
# Stub the optional ``spellchecker`` package only when it cannot be imported.
# Assigning the mock unconditionally would shadow a real installation for the
# whole test session, so the genuine module is kept whenever it is available.
try:
    import spellchecker  # noqa: F401
except ImportError:
    sys.modules['spellchecker'] = MagicMock()
|
||||
|
||||
|
||||
@pytest.fixture
def mock_context():
    """Provide an op context from ``build_op_context()`` called with no arguments."""
    return build_op_context()
|
||||
|
||||
|
||||
@pytest.fixture
def sample_dataframe():
    """Return a five-row frame with Name, Age, City and Status columns.

    Rows 0, 2 and 4 are identical, the text columns mix upper and lower
    case, and one ``Age`` entry is missing.
    """
    columns = {
        'Name': ['John Doe', 'jane smith', 'John Doe', 'bob johnson', 'John Doe'],
        'Age': [25, 30, 25, None, 25],
        'City': ['New York', 'los angeles', 'New York', 'chicago', 'New York'],
        'Status': ['Active', 'INACTIVE', 'Active', 'penDing', 'Active'],
    }
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.fixture
def sample_dataframe_with_typos():
    """Return a three-row frame whose Name and Description cells contain misspellings."""
    columns = {
        'Name': ['jon doe', 'jane smith', 'bob jonson'],
        'Description': ['developer', 'analst', 'enginer'],
    }
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.fixture
def empty_dataframe():
    """Return a DataFrame that has neither rows nor columns."""
    frame = pd.DataFrame()
    return frame
|
||||
|
||||
|
||||
@pytest.fixture
def dataframe_with_missing_values():
    """Return a five-row frame with gaps in every column.

    ``Column1`` is numeric with two gaps, ``Column2`` is text with two gaps,
    and ``Column3`` contains nothing but missing values.
    """
    columns = {
        'Column1': [1, None, 3, None, 5],
        'Column2': ['a', 'b', None, 'd', None],
        'Column3': [None] * 5,
    }
    return pd.DataFrame(columns)
|
||||
7
tests/data_processing/conftest_utils.py
Normal file
7
tests/data_processing/conftest_utils.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""Configuration utilities for testing."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Put the sibling-of-parent ``src`` directory first on the import path.
# NOTE(review): this file lives in tests/data_processing/, so the path below
# resolves to tests/src -- confirm that is still the intended location.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(__file__), os.pardir, 'src'),
)
|
||||
202
tests/data_processing/test_config_models.py
Normal file
202
tests/data_processing/test_config_models.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""Unit tests for configuration models."""
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
from template_code_location.data_processing.config_models import (
|
||||
FillMissingConfiguration,
|
||||
ColumnsSelectConfiguration,
|
||||
SpellCheckConfiguration,
|
||||
AggregationConfiguration
|
||||
)
|
||||
|
||||
|
||||
class TestColumnsSelectConfiguration:
    """Checks on the ``columns`` field of ColumnsSelectConfiguration."""

    def test_default_columns(self):
        """Without arguments, ``columns`` is expected to be ``['Name']``."""
        assert ColumnsSelectConfiguration().columns == ['Name']

    def test_custom_columns(self):
        """A caller-supplied list of names is expected back as given."""
        model = ColumnsSelectConfiguration(columns=['Col1', 'Col2', 'Col3'])
        expected = ['Col1', 'Col2', 'Col3']
        assert model.columns == expected

    def test_empty_columns_list(self):
        """An empty list is accepted and stays empty."""
        model = ColumnsSelectConfiguration(columns=[])
        assert model.columns == []

    def test_single_column(self):
        """A one-element list is expected back unchanged."""
        model = ColumnsSelectConfiguration(columns=['SingleCol'])
        expected = ['SingleCol']
        assert model.columns == expected

    def test_columns_with_special_characters(self):
        """Names containing ``-``, ``_`` and ``.`` are expected back unchanged."""
        model = ColumnsSelectConfiguration(columns=['Col-1', 'Col_2', 'Col.3'])
        expected = ['Col-1', 'Col_2', 'Col.3']
        assert model.columns == expected

    def test_duplicate_columns_are_removed(self):
        """Repeated names are expected to be dropped, keeping first-seen order."""
        model = ColumnsSelectConfiguration(columns=['A', 'B', 'A', 'C', 'B'])
        expected = ['A', 'B', 'C']
        assert model.columns == expected

    def test_duplicate_default_behavior(self):
        """A list made of one repeated name is expected to collapse to a single entry."""
        model = ColumnsSelectConfiguration(columns=['Name', 'Name', 'Name'])
        expected = ['Name']
        assert model.columns == expected
|
||||
|
||||
|
||||
class TestFillMissingConfiguration:
    """Checks on the ``fill_map`` field of FillMissingConfiguration."""

    def test_default_fill_map(self):
        """Without arguments, ``fill_map`` is expected to map 'Age' to 'UNKNOWN_AGE'."""
        expected = {'Age': 'UNKNOWN_AGE'}
        assert FillMissingConfiguration().fill_map == expected

    def test_custom_fill_map(self):
        """A three-entry mapping is expected back equal to what was passed."""
        mapping = {'Age': '0', 'Name': 'UNKNOWN', 'City': 'N/A'}
        model = FillMissingConfiguration(fill_map=mapping)
        assert model.fill_map == mapping

    def test_empty_fill_map(self):
        """An empty mapping is accepted and stays empty."""
        model = FillMissingConfiguration(fill_map={})
        assert model.fill_map == {}

    def test_fill_map_with_numeric_values(self):
        """Replacement values that are digit strings are expected back unchanged."""
        mapping = {'Age': '0', 'Score': '-1', 'Count': '999'}
        model = FillMissingConfiguration(fill_map=mapping)
        assert model.fill_map == mapping

    def test_fill_map_with_string_values(self):
        """Plain-text replacement values are expected back unchanged."""
        mapping = {'Name': 'Unknown', 'Email': 'no-email'}
        model = FillMissingConfiguration(fill_map=mapping)
        assert model.fill_map == mapping

    def test_fill_map_mixed_types(self):
        """Integer-like, text and float-like strings can share one mapping."""
        mapping = {'IntCol': '0', 'StrCol': 'Unknown', 'FloatCol': '0.0'}
        model = FillMissingConfiguration(fill_map=mapping)
        assert model.fill_map == mapping
|
||||
|
||||
|
||||
class TestSpellCheckConfiguration:
    """Checks on the ``columns`` and ``language`` fields of SpellCheckConfiguration."""

    def test_default_spell_check_config(self):
        """Without arguments, columns is ``['Name']`` and language is ``'en'``."""
        model = SpellCheckConfiguration()
        assert model.columns == ['Name']
        assert model.language == 'en'

    def test_custom_spell_check_config(self):
        """Explicit columns and language are both expected back as given."""
        model = SpellCheckConfiguration(columns=['Description', 'Notes'], language='es')
        assert model.columns == ['Description', 'Notes']
        assert model.language == 'es'

    def test_spell_check_all_languages(self):
        """Each of the seven listed language codes is accepted and stored."""
        for code in ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl']:
            model = SpellCheckConfiguration(language=code)
            assert model.language == code

    def test_spell_check_invalid_language(self):
        """A language value of 'invalid' is expected to raise ValidationError."""
        with pytest.raises(ValidationError):
            SpellCheckConfiguration(language='invalid')

    def test_spell_check_multiple_columns(self):
        """A four-name column list is expected back equal to what was passed."""
        names = ['Col1', 'Col2', 'Col3', 'Col4']
        model = SpellCheckConfiguration(columns=names)
        assert model.columns == names

    def test_spell_check_empty_columns(self):
        """An empty column list is accepted; language keeps the value ``'en'``."""
        model = SpellCheckConfiguration(columns=[])
        assert model.columns == []
        assert model.language == 'en'

    def test_spell_check_inheritance(self):
        """An instance is also a ColumnsSelectConfiguration and has both attributes."""
        model = SpellCheckConfiguration()
        assert isinstance(model, ColumnsSelectConfiguration)
        for attribute in ('columns', 'language'):
            assert hasattr(model, attribute)

    @pytest.mark.parametrize("language", ['en', 'es', 'it', 'fr', 'pt', 'de', 'nl'])
    def test_spell_check_languages_parametrized(self, language):
        """The parametrized language code is accepted and stored."""
        assert SpellCheckConfiguration(language=language).language == language
|
||||
|
||||
class TestAggregationConfiguration:
    """Checks on the ``columns`` and ``operation`` fields of AggregationConfiguration."""

    def test_aggregation_default_config(self):
        """Without arguments, columns is ``['Name']`` and operation is ``'sum'``."""
        model = AggregationConfiguration()
        assert model.columns == ['Name']
        assert model.operation == 'sum'

    @pytest.mark.parametrize("op", ["sum", "mean", "min", "max", "count"])
    def test_aggregation_valid_operations(self, op):
        """Each parametrized operation name is accepted and stored."""
        assert AggregationConfiguration(operation=op).operation == op

    def test_aggregation_invalid_operation(self):
        """An unknown operation raises ValidationError carrying the expected message."""
        with pytest.raises(ValidationError) as raised:
            AggregationConfiguration(operation="invalid_op")

        message = str(raised.value)
        assert "Invalid aggregation operation 'invalid_op'" in message

    def test_aggregation_custom_columns(self):
        """Explicit columns and operation are both expected back as given."""
        model = AggregationConfiguration(columns=['Price', 'Quantity'], operation='mean')
        assert model.columns == ['Price', 'Quantity']
        assert model.operation == 'mean'

    def test_aggregation_inheritance(self):
        """An instance is also a ColumnsSelectConfiguration and has both attributes."""
        model = AggregationConfiguration()
        assert isinstance(model, ColumnsSelectConfiguration)
        for attribute in ('columns', 'operation'):
            assert hasattr(model, attribute)

    def test_aggregation_model_dump(self):
        """``model_dump()`` exposes the 'columns' and 'operation' values that were set."""
        dumped = AggregationConfiguration(columns=['Value'], operation='max').model_dump()
        assert dumped['columns'] == ['Value']
        assert dumped['operation'] == 'max'
|
||||
185
tests/data_processing/test_integration.py
Normal file
185
tests/data_processing/test_integration.py
Normal file
@@ -0,0 +1,185 @@
|
||||
"""Integration tests for data processing jobs."""
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from unittest.mock import patch, MagicMock
|
||||
from template_code_location.data_processing.ops import (
|
||||
remove_duplicates,
|
||||
fill_missing_values,
|
||||
standardize_categorical_values,
|
||||
correct_typos
|
||||
)
|
||||
from template_code_location.data_processing.config_models import (
|
||||
FillMissingConfiguration,
|
||||
ColumnsSelectConfiguration,
|
||||
SpellCheckConfiguration
|
||||
)
|
||||
|
||||
|
||||
class TestPipelineIntegration:
    """Run several data-processing ops back to back on one DataFrame."""

    def test_pipeline_remove_duplicates_then_standardize(self, mock_context):
        """De-duplicate first, then standardize the Name and City columns."""
        raw = pd.DataFrame({
            'Name': [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson'],
            'City': ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago'],
        })

        # Rows 0 and 2 are identical, so three rows are expected to remain.
        deduplicated = remove_duplicates(mock_context, raw)
        assert deduplicated.shape[0] == 3

        columns_config = ColumnsSelectConfiguration(columns=['Name', 'City'])
        standardized = standardize_categorical_values(
            mock_context, columns_config, deduplicated
        )

        assert standardized['Name'].iloc[0] == 'john doe'
        assert standardized['City'].iloc[0] == 'new york'

    def test_pipeline_fill_missing_then_standardize(self, mock_context):
        """Fill the gap in Value, then standardize Category."""
        raw = pd.DataFrame({
            'Category': [' ACTIVE ', None, ' PENDING '],
            'Value': ['1', '2', None],
        })

        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'Value': '0'}),
            raw,
        )
        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Category']),
            filled,
        )

        assert standardized['Category'].iloc[0] == 'active'
        # The fill result is checked on the intermediate frame.
        assert filled['Value'].iloc[2] == '0'

    def test_pipeline_all_operations(self, mock_context):
        """De-duplicate, fill missing values, then standardize, checking each stage."""
        raw = pd.DataFrame({
            'Name': [' john doe ', 'JANE SMITH', ' john doe ', None],
            'Value': ['1', None, '1', '2'],
        })

        # Stage 1: rows 0 and 2 are identical.
        deduplicated = remove_duplicates(mock_context, raw)
        assert deduplicated.shape[0] == 3

        # Stage 2: no gaps may remain in Value.
        fill_settings = FillMissingConfiguration(fill_map={'Value': '0'})
        filled = fill_missing_values(mock_context, fill_settings, deduplicated)
        assert filled['Value'].isna().sum() == 0

        # Stage 3: Name is standardized.
        name_settings = ColumnsSelectConfiguration(columns=['Name'])
        standardized = standardize_categorical_values(mock_context, name_settings, filled)
        assert standardized['Name'].iloc[0] == 'john doe'

    def test_pipeline_with_large_dataset(self, mock_context):
        """De-duplicate 1000 distinct rows to which 100 repeated rows were appended."""
        size = 1000
        full_cycles, leftover = divmod(size, 3)
        base = pd.DataFrame({
            'ID': list(range(size)),
            'Name': [f'User_{i % 50}' for i in range(size)],
            'Status': ['ACTIVE', 'INACTIVE', 'PENDING'] * full_cycles + ['ACTIVE'] * leftover,
            'Score': [i % 100 for i in range(size)],
        })

        # Re-append the first 100 rows so the frame contains exact duplicates.
        with_repeats = pd.concat([base, base.head(100)], ignore_index=True)

        cleaned = remove_duplicates(mock_context, with_repeats)

        row_count, column_count = cleaned.shape
        assert row_count == 1000
        assert column_count == 4
|
||||
|
||||
|
||||
class TestErrorHandling:
    """Edge-case inputs for the data-processing ops."""

    def test_operation_with_corrupted_data(self, mock_context):
        """remove_duplicates returns at least one row for NaN and infinite floats."""
        special_values = [float('nan'), float('inf'), float('-inf'), 0, 1, 2]
        frame = pd.DataFrame({'Col': special_values})

        outcome = remove_duplicates(mock_context, frame)

        assert outcome.shape[0] > 0

    def test_operation_preserves_index(self, mock_context):
        """remove_duplicates on a label-indexed frame leaves three rows."""
        frame = pd.DataFrame({'Col': [1, 2, 1, 3]}, index=['a', 'b', 'c', 'd'])

        outcome = remove_duplicates(mock_context, frame)

        # Only the row count is checked; the resulting index is not inspected.
        assert outcome.shape[0] == 3

    def test_standardize_with_unicode_characters(self, mock_context):
        """Standardizing names with accented characters keeps all three rows."""
        frame = pd.DataFrame({'Name': ['José', 'François', 'Müller']})
        settings = ColumnsSelectConfiguration(columns=['Name'])

        outcome = standardize_categorical_values(mock_context, settings, frame)

        assert outcome.shape[0] == 3

    def test_fill_with_same_key_multiple_times(self, mock_context):
        """A fill map with one entry per column fills each column with its own value."""
        frame = pd.DataFrame({
            'A': ['1', None, '3'],
            'B': [None, None, 'c'],
            'C': [None, '2', None],
        })
        replacements = {'A': '-1', 'B': 'EMPTY', 'C': '0'}
        settings = FillMissingConfiguration(fill_map=replacements)

        outcome = fill_missing_values(mock_context, settings, frame)

        assert outcome.loc[1, 'A'] == '-1'
        assert outcome.loc[0, 'B'] == 'EMPTY'
        assert outcome.loc[0, 'C'] == '0'
|
||||
|
||||
|
||||
class TestDataTypePreservation:
    """Column dtypes before and after an op are compared."""

    def test_remove_duplicates_preserves_dtypes(self, mock_context):
        """The int32 and float64 columns keep their dtype through remove_duplicates."""
        frame = pd.DataFrame({
            'int32': pd.array([1, 2, 1], dtype='int32'),
            'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'),
            'str': ['a', 'b', 'a'],
        })

        outcome = remove_duplicates(mock_context, frame)

        # The 'str' column is present in the input but not compared.
        for column in ('int32', 'float64'):
            assert outcome[column].dtype == frame[column].dtype

    def test_fill_missing_preserves_column_types_where_possible(self, mock_context):
        """Filling column A leaves the dtype of the untouched column B unchanged."""
        frame = pd.DataFrame({
            'A': pd.array(['1', None, '3'], dtype='string'),
            'B': ['x', 'y', 'z'],
        })
        settings = FillMissingConfiguration(fill_map={'A': '0'})

        outcome = fill_missing_values(mock_context, settings, frame)

        assert outcome['A'].loc[1] == '0'
        assert outcome['B'].dtype == frame['B'].dtype
|
||||
56
tests/data_processing/test_jobs.py
Normal file
56
tests/data_processing/test_jobs.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from template_code_location.data_processing.jobs import (
|
||||
remove_duplicates_job_s3,
|
||||
fill_missing_values_job_s3,
|
||||
standardize_categorical_values_job_s3,
|
||||
correct_typos_job_s3,
|
||||
normalize_numeric_min_max_job_s3,
|
||||
normalize_datetime_job_s3,
|
||||
normalize_coordinates_job_s3,
|
||||
add_global_aggregations_job_s3
|
||||
)
|
||||
|
||||
|
||||
def test_remove_duplicates_job_s3_is_callable():
    """remove_duplicates_job_s3 is callable and has an ``execute_in_process`` attribute."""
    job = remove_duplicates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
|
||||
|
||||
def test_fill_missing_values_job_s3_is_callable():
    """fill_missing_values_job_s3 is callable and has an ``execute_in_process`` attribute."""
    job = fill_missing_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
|
||||
|
||||
def test_standardize_categorical_values_job_s3_is_callable():
    """standardize_categorical_values_job_s3 is callable and has ``execute_in_process``."""
    job = standardize_categorical_values_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
|
||||
|
||||
def test_correct_typos_job_s3_is_callable():
    """correct_typos_job_s3 is callable and has an ``execute_in_process`` attribute."""
    job = correct_typos_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
|
||||
|
||||
def test_normalize_numeric_min_max_job_s3_is_callable():
    """normalize_numeric_min_max_job_s3 is callable and has ``execute_in_process``."""
    job = normalize_numeric_min_max_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
|
||||
|
||||
def test_normalize_datetime_job_s3_is_callable():
    """normalize_datetime_job_s3 is callable and has an ``execute_in_process`` attribute."""
    job = normalize_datetime_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
|
||||
def test_normalize_coordinates_job_s3_is_callable():
    """normalize_coordinates_job_s3 is callable and has an ``execute_in_process`` attribute."""
    job = normalize_coordinates_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
|
||||
def test_add_global_aggregations_job_s3_is_callable():
    """add_global_aggregations_job_s3 is callable and has ``execute_in_process``."""
    job = add_global_aggregations_job_s3
    assert callable(job)
    assert hasattr(job, 'execute_in_process')
|
||||
700
tests/data_processing/test_ops.py
Normal file
700
tests/data_processing/test_ops.py
Normal file
@@ -0,0 +1,700 @@
|
||||
"""Unit tests for data processing operations."""
|
||||
|
||||
import pytest
|
||||
import pandas as pd
|
||||
from template_code_location.data_processing.ops import (
|
||||
remove_duplicates,
|
||||
fill_missing_values,
|
||||
standardize_categorical_values,
|
||||
correct_typos,
|
||||
normalize_datetime,
|
||||
normalize_numeric_min_max,
|
||||
normalize_coordinates,
|
||||
add_global_aggregations
|
||||
)
|
||||
from template_code_location.data_processing.config_models import (
|
||||
FillMissingConfiguration,
|
||||
ColumnsSelectConfiguration,
|
||||
SpellCheckConfiguration,
|
||||
AggregationConfiguration,
|
||||
CoordinatesNormalizationConfiguration
|
||||
)
|
||||
|
||||
|
||||
class TestRemoveDuplicates:
    """Checks on the remove_duplicates op."""

    def test_remove_duplicates_basic(self, mock_context, sample_dataframe):
        """The shared sample frame shrinks to three rows."""
        outcome = remove_duplicates(mock_context, sample_dataframe)

        # The 'John Doe' row occurs three times; the other two rows once each.
        assert outcome.shape[0] == 3
        assert len(outcome) < len(sample_dataframe)

    def test_remove_duplicates_no_duplicates(self, mock_context):
        """A frame without repeated rows comes back equal to the input."""
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})

        outcome = remove_duplicates(mock_context, frame)

        assert outcome.shape[0] == 3
        pd.testing.assert_frame_equal(outcome, frame)

    def test_remove_duplicates_all_duplicates(self, mock_context):
        """Three identical rows collapse into one."""
        frame = pd.DataFrame({'A': [1, 1, 1], 'B': ['x', 'x', 'x']})

        outcome = remove_duplicates(mock_context, frame)

        assert outcome.shape[0] == 1

    def test_remove_duplicates_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields a result with zero rows and zero columns."""
        outcome = remove_duplicates(mock_context, empty_dataframe)

        row_count, column_count = outcome.shape
        assert row_count == 0
        assert column_count == 0

    def test_remove_duplicates_preserves_data_types(self, mock_context):
        """Integer, text and float columns keep their dtype."""
        frame = pd.DataFrame({
            'int_col': [1, 2, 1],
            'str_col': ['a', 'b', 'a'],
            'float_col': [1.5, 2.5, 1.5],
        })

        outcome = remove_duplicates(mock_context, frame)

        for column in ('int_col', 'str_col', 'float_col'):
            assert outcome[column].dtype == frame[column].dtype
|
||||
|
||||
|
||||
class TestFillMissingValues:
    """Checks on the fill_missing_values op."""

    def test_fill_missing_values_basic(self, mock_context, dataframe_with_missing_values):
        """Columns named in the fill map end up without missing entries."""
        settings = FillMissingConfiguration(fill_map={'Column1': '0', 'Column2': 'N/A'})

        outcome = fill_missing_values(mock_context, settings, dataframe_with_missing_values)

        for column in ('Column1', 'Column2'):
            assert outcome[column].isna().sum() == 0

    def test_fill_missing_values_with_different_values(self, mock_context):
        """Each column receives its own replacement value."""
        frame = pd.DataFrame({'A': [1, None, 3], 'B': [None, 'b', 'c']})
        settings = FillMissingConfiguration(fill_map={'A': '-1', 'B': 'UNKNOWN'})

        outcome = fill_missing_values(mock_context, settings, frame)

        assert outcome.loc[1, 'A'] == '-1'
        assert outcome.loc[0, 'B'] == 'UNKNOWN'

    def test_fill_missing_values_partial_columns(self, mock_context):
        """Columns absent from the fill map keep their missing entries."""
        frame = pd.DataFrame({'A': [1, None, 3], 'B': [None, 'b', 'c']})
        settings = FillMissingConfiguration(fill_map={'A': '999'})

        outcome = fill_missing_values(mock_context, settings, frame)

        assert outcome.loc[1, 'A'] == '999'
        # 'B' was not configured, so its gap must still be there.
        assert pd.isna(outcome.loc[0, 'B'])

    def test_fill_missing_values_no_missing(self, mock_context):
        """A frame without gaps comes back equal to the input."""
        frame = pd.DataFrame({'A': ['1', '2', '3'], 'B': ['a', 'b', 'c']})
        settings = FillMissingConfiguration(fill_map={'A': '0'})

        outcome = fill_missing_values(mock_context, settings, frame)

        pd.testing.assert_frame_equal(outcome, frame)

    def test_fill_missing_values_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame with an empty fill map yields zero rows."""
        settings = FillMissingConfiguration(fill_map={})

        outcome = fill_missing_values(mock_context, settings, empty_dataframe)

        assert outcome.shape[0] == 0
|
||||
|
||||
|
||||
class TestStandardizeCategoricalValues:
    """Checks on the standardize_categorical_values op."""

    def test_standardize_categorical_basic(self, mock_context, sample_dataframe):
        """Name, City and Status values come back lower-cased."""
        settings = ColumnsSelectConfiguration(columns=['Name', 'City', 'Status'])

        outcome = standardize_categorical_values(mock_context, settings, sample_dataframe)

        assert outcome['Name'].iloc[0] == 'john doe'
        assert outcome['City'].iloc[1] == 'los angeles'
        assert outcome['Status'].iloc[1] == 'inactive'

    def test_standardize_categorical_single_column(self, mock_context):
        """Padded, mixed-case city names come back trimmed and lower-cased."""
        frame = pd.DataFrame({'City': [' NEW YORK ', 'LOS ANGELES', ' chicago ']})
        settings = ColumnsSelectConfiguration(columns=['City'])

        outcome = standardize_categorical_values(mock_context, settings, frame)

        cities = outcome['City']
        assert cities.iloc[0] == 'new york'
        assert cities.iloc[1] == 'los angeles'
        assert cities.iloc[2] == 'chicago'

    def test_standardize_categorical_missing_column(self, mock_context, sample_dataframe):
        """A configured column absent from the frame does not stop 'Name' being processed."""
        settings = ColumnsSelectConfiguration(columns=['NonExistent', 'Name'])

        outcome = standardize_categorical_values(mock_context, settings, sample_dataframe)

        assert outcome['Name'].iloc[0] == 'john doe'

    def test_standardize_categorical_with_missing_values(self, mock_context):
        """A missing cell is expected to become an empty string."""
        frame = pd.DataFrame({'Category': [' ACTIVE ', None, ' pending ']})
        settings = ColumnsSelectConfiguration(columns=['Category'])

        outcome = standardize_categorical_values(mock_context, settings, frame)

        categories = outcome['Category']
        assert categories.iloc[0] == 'active'
        assert categories.iloc[1] == ''
        assert categories.iloc[2] == 'pending'

    def test_standardize_categorical_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields zero rows even when columns are configured."""
        settings = ColumnsSelectConfiguration(columns=['A', 'B'])

        outcome = standardize_categorical_values(mock_context, settings, empty_dataframe)

        assert outcome.shape[0] == 0

    def test_standardize_categorical_numeric_columns(self, mock_context):
        """Integer cells are expected to come back as ``str`` values."""
        frame = pd.DataFrame({'NumCol': [1, 2, 3]})
        settings = ColumnsSelectConfiguration(columns=['NumCol'])

        outcome = standardize_categorical_values(mock_context, settings, frame)

        first_cell = outcome['NumCol'].iloc[0]
        assert first_cell == '1'
        assert isinstance(first_cell, str)
|
||||
|
||||
|
||||
class TestCorrectTypos:
    """Checks on the correct_typos op."""

    def test_correct_typos_basic(self, mock_context):
        """Running on a three-row Name column keeps three rows."""
        frame = pd.DataFrame({'Name': ['jon', 'jayne', 'bob']})
        settings = SpellCheckConfiguration(columns=['Name'], language='en')

        outcome = correct_typos(mock_context, settings, frame)

        # Only the row count is checked, not the corrected text.
        assert outcome.shape[0] == 3

    def test_correct_typos_missing_column(self, mock_context):
        """A configured column absent from the frame leaves the frame equal to the input."""
        frame = pd.DataFrame({'Name': ['jon', 'jayne']})
        settings = SpellCheckConfiguration(columns=['NonExistent'], language='en')

        outcome = correct_typos(mock_context, settings, frame)

        pd.testing.assert_frame_equal(outcome, frame)

    def test_correct_typos_with_missing_values(self, mock_context):
        """An empty-string cell is expected to come back as an empty string."""
        frame = pd.DataFrame({'Text': ['helo', '', 'wrld']})
        settings = SpellCheckConfiguration(columns=['Text'], language='en')

        outcome = correct_typos(mock_context, settings, frame)

        assert outcome.loc[1, 'Text'] == ''

    def test_correct_typos_empty_dataframe(self, mock_context, empty_dataframe):
        """An empty frame yields zero rows."""
        settings = SpellCheckConfiguration(columns=['A'], language='en')

        outcome = correct_typos(mock_context, settings, empty_dataframe)

        assert outcome.shape[0] == 0

    def test_correct_typos_different_languages(self, mock_context):
        """The op runs for 'en', 'es' and 'it' and keeps both rows each time."""
        frame = pd.DataFrame({'Text': ['ciao', 'mondo']})

        for code in ['en', 'es', 'it']:
            settings = SpellCheckConfiguration(columns=['Text'], language=code)
            outcome = correct_typos(mock_context, settings, frame)

            assert outcome.shape[0] == 2

    def test_correct_typos_numeric_values(self, mock_context):
        """Running on an integer column keeps all three rows."""
        frame = pd.DataFrame({'Values': [123, 456, 789]})
        settings = SpellCheckConfiguration(columns=['Values'], language='en')

        outcome = correct_typos(mock_context, settings, frame)

        assert outcome.shape[0] == 3
|
||||
|
||||
class TestNormalizeDatetime:
|
||||
"""Tests for the normalize_datetime operation."""
|
||||
|
||||
def test_normalize_datetime_basic(self, mock_context):
    """Two parseable timestamps produce a ``date_col_iso`` column of 'Z'-suffixed ISO strings."""
    frame = pd.DataFrame({'date_col': ['2023-01-01 10:00:00', '2023-12-31T23:59:59']})
    settings = ColumnsSelectConfiguration(columns=['date_col'])

    # The op receives a copy of the input frame.
    outcome = normalize_datetime(mock_context, settings, frame.copy())

    assert 'date_col_iso' in outcome.columns
    iso_values = outcome['date_col_iso']
    assert iso_values.iloc[0] == '2023-01-01T10:00:00Z'
    assert iso_values.iloc[1] == '2023-12-31T23:59:59Z'
|
||||
|
||||
def test_normalize_datetime_missing_column(self, mock_context, sample_dataframe):
    """A configured column absent from the frame leaves the frame equal to the input."""
    settings = ColumnsSelectConfiguration(columns=['non_existent_column'])
    working_copy = sample_dataframe.copy()

    outcome = normalize_datetime(mock_context, settings, working_copy)

    pd.testing.assert_frame_equal(outcome, sample_dataframe)
|
||||
|
||||
def test_normalize_datetime_unparseable_values(self, mock_context):
    """A column holding only non-date text gets no ``_iso`` companion column."""
    frame = pd.DataFrame({'invalid_col': ['not-a-date', 'completely-random-text']})
    settings = ColumnsSelectConfiguration(columns=['invalid_col'])

    outcome = normalize_datetime(mock_context, settings, frame.copy())

    assert 'invalid_col_iso' not in outcome.columns
|
||||
|
||||
def test_normalize_datetime_mixed_and_nulls(self, mock_context):
|
||||
"""Test column with mixed valid dates, invalid dates, and NaNs."""
|
||||
df = pd.DataFrame({
|
||||
'mixed_col': ['2023-05-01', None, 'invalid-date']
|
||||
})
|
||||
config = ColumnsSelectConfiguration(columns=['mixed_col'])
|
||||
|
||||
result = normalize_datetime(mock_context, config, df.copy())
|
||||
|
||||
assert 'mixed_col_iso' in result.columns
|
||||
assert result['mixed_col_iso'].iloc[0] == '2023-05-01T00:00:00Z'
|
||||
|
||||
assert result['mixed_col_iso'].iloc[1] == ""
|
||||
assert result['mixed_col_iso'].iloc[2] == ""
|
||||
|
||||
def test_normalize_datetime_empty_dataframe(self, mock_context, empty_dataframe):
|
||||
"""Test with an empty DataFrame."""
|
||||
config = ColumnsSelectConfiguration(columns=['some_col'])
|
||||
|
||||
result = normalize_datetime(mock_context, config, empty_dataframe)
|
||||
|
||||
assert result.empty
|
||||
|
||||
def test_normalize_datetime_epoch_only(self, mock_context, capsys):
|
||||
"""If parsing a column yields only the Unix epoch date, it should be skipped."""
|
||||
df = pd.DataFrame({
|
||||
'weird_col': ['0', 0, '0000', '']
|
||||
})
|
||||
|
||||
config = ColumnsSelectConfiguration(columns=['weird_col'])
|
||||
|
||||
result = normalize_datetime(mock_context, config, df.copy())
|
||||
|
||||
assert 'weird_col_iso' not in result.columns
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert "all normalized values are '1970-01-01'" in captured.err
|
||||
|
||||
def test_normalize_datetime_all_1970_skipped(self, mock_context, capsys):
|
||||
"""If all formatted values are '1970-01-01', the column should be skipped with a warning."""
|
||||
df = pd.DataFrame({
|
||||
'ts_col': ['1970-01-01 05:30:00', '1970-01-01 12:00:00']
|
||||
})
|
||||
|
||||
config = ColumnsSelectConfiguration(columns=['ts_col'])
|
||||
|
||||
result = normalize_datetime(mock_context, config, df.copy())
|
||||
|
||||
assert 'ts_col_iso' not in result.columns
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert "all normalized values are '1970-01-01'" in captured.err
|
||||
|
||||
def test_normalize_datetime_integer_age_column_skipped(self, mock_context, capsys):
|
||||
"""If an integer column like 'age' is passed, all values become 1970-01-01 and should be skipped."""
|
||||
df = pd.DataFrame({
|
||||
'age': [66, 45, 40, 43, 20, 26, 69, 21, 46]
|
||||
})
|
||||
|
||||
config = ColumnsSelectConfiguration(columns=['age'])
|
||||
|
||||
result = normalize_datetime(mock_context, config, df.copy())
|
||||
|
||||
assert 'age_iso' not in result.columns
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert "all normalized values are '1970-01-01'" in captured.err
|
||||
|
||||
class TestNormalizeNumericMinMax:
    """Test suite for the normalize_numeric_min_max operation."""

    def test_normalize_numeric_basic(self, mock_context):
        """Evenly spaced scores map onto [0, 1] with the midpoint at 0.5."""
        cfg = ColumnsSelectConfiguration(columns=["score"])
        frame = pd.DataFrame({"score": [10, 20, 30, 40, 50]})

        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())

        assert "score_norm" in out.columns
        scaled = out["score_norm"]
        assert scaled.min() == 0.0
        assert scaled.max() == 1.0

        # 30 sits exactly halfway between 10 and 50.
        assert scaled.iloc[2] == 0.5

    def test_normalize_numeric_missing_column(self, mock_context):
        """A configured column that is absent produces no '_norm' column."""
        cfg = ColumnsSelectConfiguration(columns=["missing_col"])
        frame = pd.DataFrame({"existing": [1, 2, 3]})

        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())

        assert "missing_col_norm" not in out.columns

    def test_normalize_numeric_constant_values(self, mock_context):
        """A column whose min equals its max gets no '_norm' column."""
        cfg = ColumnsSelectConfiguration(columns=["constant"])
        frame = pd.DataFrame({"constant": [10, 10, 10]})

        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())

        assert "constant_norm" not in out.columns

    def test_normalize_numeric_with_nans(self, mock_context):
        """Missing cells stay missing while the remaining values are scaled."""
        cfg = ColumnsSelectConfiguration(columns=["with_nans"])
        frame = pd.DataFrame({"with_nans": [10, None, 50]})

        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())

        assert "with_nans_norm" in out.columns
        scaled = out["with_nans_norm"]
        assert scaled.iloc[0] == 0.0
        assert scaled.iloc[2] == 1.0
        assert pd.isna(scaled.iloc[1])

    def test_normalize_numeric_multiple_columns(self, mock_context):
        """Every configured column receives its own '_norm' column."""
        cfg = ColumnsSelectConfiguration(columns=["A", "B"])
        frame = pd.DataFrame({"A": [1, 2], "B": [10, 20]})

        out = normalize_numeric_min_max(mock_context, cfg, frame.copy())

        assert "A_norm" in out.columns
        assert "B_norm" in out.columns
|
||||
|
||||
class TestNormalizeCoordinates:
    """Test suite for the normalize_coordinates operation."""

    def test_normalize_coordinates_basic(self, mock_context):
        """Decimal coordinates are rounded to four places and no row is dropped."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame({"lat": [45.123456, 46.0], "lon": [9.123456, 10.0]})

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        first_row = out.iloc[0]
        assert out["lat"].iloc[0] == 45.1235
        assert out["lon"].iloc[0] == 9.1235
        assert first_row is not None

        assert len(out) == 2

    def test_normalize_coordinates_filtering(self, mock_context):
        """Rows with an out-of-range latitude or longitude are removed."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame(
            {
                "lat": [45.0, 100.0, -91.0, 0.0],  # 100 and -91 are out of range
                "lon": [9.0, 0.0, 0.0, 200.0],  # 200 is out of range
            }
        )

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert len(out) == 1
        assert out["lat"].iloc[0] == 45.0

    def test_normalize_coordinates_invalid_types(self, mock_context):
        """Numeric strings are converted; unparseable and null latitudes drop the row."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame(
            {
                "lat": ["45.5", "invalid", None],
                "lon": ["9.5", "10.0", "11.0"],
            }
        )

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert len(out) == 1
        surviving_lat = out["lat"].iloc[0]
        assert isinstance(surviving_lat, float)

    def test_normalize_coordinates_empty_df(self, mock_context, empty_dataframe):
        """A frame with lat/lon columns but no rows comes back empty."""
        # NOTE(review): the empty_dataframe fixture is requested but never
        # used; the frame is built locally, presumably so that the lat/lon
        # columns exist.
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame(columns=["lat", "lon"])

        out = normalize_coordinates(mock_context, cfg, frame)

        assert len(out) == 0
        assert out.empty

    def test_normalize_coordinates_default_config(self, mock_context):
        """A configuration built without arguments works on 'lat'/'lon' columns."""
        cfg = CoordinatesNormalizationConfiguration()
        frame = pd.DataFrame({"lat": [45.123456, 46.0], "lon": [9.123456, 10.0]})

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert out["lat"].iloc[0] == 45.1235
        assert out["lon"].iloc[0] == 9.1235
        assert len(out) == 2

    def test_normalize_coordinates_null_config_values(self, mock_context):
        """Passing None for both column names falls back to 'lat' and 'lon'."""
        frame = pd.DataFrame({"lat": [45.123456, 46.0], "lon": [9.123456, 10.0]})
        cfg = CoordinatesNormalizationConfiguration(latColumn=None, lonColumn=None)

        # The fallback must already be visible on the configuration object.
        assert cfg.latColumn == "lat"
        assert cfg.lonColumn == "lon"

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert out["lat"].iloc[0] == 45.1235
        assert out["lon"].iloc[0] == 9.1235
        assert len(out) == 2

    def test_normalize_coordinates_dms_degree_symbol(self, mock_context):
        """DMS strings written with degree/minute/second symbols are converted."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame(
            {
                "lat": ["40°26'46\"N", "51°30'26\"N"],
                "lon": ["79°58'56\"W", "0°7'39\"W"],
            }
        )

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert len(out) == 2
        # 40°26'46"N is roughly 40.4461
        lat_error = abs(out["lat"].iloc[0] - 40.4461)
        assert lat_error < 0.001
        # 79°58'56"W is roughly -79.9822
        lon_error = abs(out["lon"].iloc[0] - (-79.9822))
        assert lon_error < 0.001

    def test_normalize_coordinates_dms_spaced_format(self, mock_context):
        """DMS strings in space-separated form such as '40 26 46 N' are converted."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame({"lat": ["40 26 46 N"], "lon": ["79 58 56 W"]})

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert len(out) == 1
        lat_error = abs(out["lat"].iloc[0] - 40.4461)
        assert lat_error < 0.001
        lon_error = abs(out["lon"].iloc[0] - (-79.9822))
        assert lon_error < 0.001

    def test_normalize_coordinates_dms_already_decimal(self, mock_context):
        """String columns that already hold decimal values are parsed as numbers."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame({"lat": ["45.5", "46.0"], "lon": ["9.5", "10.0"]})

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert len(out) == 2
        assert out["lat"].iloc[0] == 45.5
        assert out["lon"].iloc[0] == 9.5

    def test_normalize_coordinates_dms_mixed_valid_invalid(self, mock_context):
        """A mix of DMS, decimal and unparseable values keeps only the usable rows."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame(
            {
                "lat": ["40°26'46\"N", "not_a_coord", "51.5"],
                "lon": ["79°58'56\"W", "10.0", "0.1"],
            }
        )

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        # The row whose latitude is "not_a_coord" is expected to be dropped.
        assert len(out) == 2

    def test_normalize_coordinates_dms_out_of_range(self, mock_context):
        """DMS values that convert to an out-of-range latitude are filtered out."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame(
            {
                "lat": ["91°0'0\"N", "45°0'0\"N"],
                "lon": ["0°0'0\"E", "9°0'0\"E"],
            }
        )

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        # The first row has a latitude of 91 degrees, outside [-90, 90].
        assert len(out) == 1
        lat_error = abs(out["lat"].iloc[0] - 45.0)
        assert lat_error < 0.001

    def test_normalize_coordinates_dms_south_and_east(self, mock_context):
        """Southern latitudes come out negative and eastern longitudes positive."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")
        frame = pd.DataFrame({"lat": ["33°51'54\"S"], "lon": ["151°12'36\"E"]})

        out = normalize_coordinates(mock_context, cfg, frame.copy())

        assert len(out) == 1
        # 33°51'54"S is roughly -33.865
        assert out["lat"].iloc[0] < 0
        assert abs(out["lat"].iloc[0] - (-33.865)) < 0.001
        # 151°12'36"E is roughly 151.21
        assert out["lon"].iloc[0] > 0
        assert abs(out["lon"].iloc[0] - 151.21) < 0.01

    def test_normalize_coordinates_autodetect_numeric_vs_dms(self, mock_context):
        """The same configuration handles a numeric frame and a DMS string frame."""
        cfg = CoordinatesNormalizationConfiguration(latColumn="lat", lonColumn="lon")

        # Numeric columns: values are only rounded.
        numeric_frame = pd.DataFrame(
            {"lat": [45.123456, 46.0], "lon": [9.123456, 10.0]}
        )
        numeric_out = normalize_coordinates(mock_context, cfg, numeric_frame.copy())

        assert numeric_out["lat"].iloc[0] == 45.1235
        assert len(numeric_out) == 2

        # String columns in DMS notation: values are converted to decimal degrees.
        dms_frame = pd.DataFrame({"lat": ["40°26'46\"N"], "lon": ["79°58'56\"W"]})
        dms_out = normalize_coordinates(mock_context, cfg, dms_frame.copy())

        assert len(dms_out) == 1
        assert abs(dms_out["lat"].iloc[0] - 40.4461) < 0.001
|
||||
|
||||
class TestAddGlobalAggregations:
    """Test suite for the add_global_aggregations operation."""

    def test_add_global_aggregations_success(self, mock_context):
        """Grouping by 'category' with 'sum' yields one summed row per group."""
        cfg = AggregationConfiguration(columns=["category"], operation="sum")
        frame = pd.DataFrame(
            {
                "category": ["A", "A", "B"],
                "value": [10, 20, 100],
                "ignored_str": ["x", "y", "z"],
            }
        )

        out = add_global_aggregations(mock_context, cfg, frame.copy())

        assert len(out) == 2
        sum_a = out.loc[out["category"] == "A", "value"].values[0]
        sum_b = out.loc[out["category"] == "B", "value"].values[0]
        assert sum_a == 30
        assert sum_b == 100
        assert "ignored_str" not in out.columns
        mock_context.log.info.assert_called()

    def test_add_global_aggregations_missing_column(self, mock_context):
        """A grouping column that is absent triggers a warning on the context log."""
        cfg = AggregationConfiguration(columns=["missing_col"], operation="count")
        frame = pd.DataFrame({"value": [1, 2, 3]})

        out = add_global_aggregations(mock_context, cfg, frame.copy())

        mock_context.log.warning.assert_any_call(
            "Column 'missing_col' not found, skipping aggregation."
        )
        assert len(out) == 1

    def test_add_global_aggregations_unsupported_op(self, mock_context):
        """An unknown operation name raises and logs an 'unsupported' warning."""
        cfg = AggregationConfiguration(columns=["category"], operation="unsupported")
        frame = pd.DataFrame({"category": ["A"], "value": [1]})

        with pytest.raises(Exception):
            add_global_aggregations(mock_context, cfg, frame.copy())

        # Checked after the context manager so that the assertion actually runs.
        mock_context.log.warning.assert_any_call("Unsupported aggregation 'unsupported'")

    def test_add_global_aggregations_only_numeric_kept(self, mock_context):
        """Only the grouping column and numeric columns remain in the result."""
        cfg = AggregationConfiguration(columns=["group"], operation="mean")
        frame = pd.DataFrame(
            {
                "group": ["A", "A"],
                "num": [1, 2],
                "text": ["hello", "world"],
            }
        )

        out = add_global_aggregations(mock_context, cfg, frame.copy())

        remaining = out.columns
        assert "text" not in remaining
        assert "num" in remaining
        assert "group" in remaining
|
||||
Reference in New Issue
Block a user