Files
template-code-location/tests/data_processing/test_integration.py

186 lines
6.6 KiB
Python

"""Integration tests for data processing jobs."""
import pytest
import pandas as pd
from unittest.mock import patch, MagicMock
from template_code_location.data_processing.ops import (
remove_duplicates,
fill_missing_values,
standardize_categorical_values,
correct_typos
)
from template_code_location.data_processing.config_models import (
FillMissingConfiguration,
ColumnsSelectConfiguration,
SpellCheckConfiguration
)
class TestPipelineIntegration:
    """End-to-end checks that chain several data processing ops together."""

    def test_pipeline_remove_duplicates_then_standardize(self, mock_context):
        """Drop the repeated row, then standardize both text columns."""
        raw = pd.DataFrame({
            'Name': [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson'],
            'City': ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago'],
        })

        # Rows 0 and 2 are identical, so three rows are expected to remain.
        deduped = remove_duplicates(mock_context, raw)
        assert deduped.shape[0] == 3

        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Name', 'City']),
            deduped,
        )

        # The first surviving row must come back trimmed and lower-cased.
        first_row = standardized.iloc[0]
        assert first_row['Name'] == 'john doe'
        assert first_row['City'] == 'new york'

    def test_pipeline_fill_missing_then_standardize(self, mock_context):
        """Fill the missing 'Value' entry, then standardize 'Category'."""
        frame = pd.DataFrame({
            'Category': [' ACTIVE ', None, ' PENDING '],
            'Value': ['1', '2', None],
        })

        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'Value': '0'}),
            frame,
        )
        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Category']),
            filled,
        )

        assert standardized['Category'].iloc[0] == 'active'
        # Deliberately inspected on the fill-step output, after the
        # standardization step has already run on it.
        assert filled['Value'].iloc[2] == '0'

    def test_pipeline_all_operations(self, mock_context):
        """Run de-duplication, filling and standardization back to back."""
        source = pd.DataFrame({
            'Name': [' john doe ', 'JANE SMITH', ' john doe ', None],
            'Value': ['1', None, '1', '2'],
        })

        # Stage 1: rows 0 and 2 are identical, leaving three rows.
        deduped = remove_duplicates(mock_context, source)
        assert deduped.shape[0] == 3

        # Stage 2: no missing entries may remain in 'Value'.
        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'Value': '0'}),
            deduped,
        )
        assert filled['Value'].isna().sum() == 0

        # Stage 3: the first name must be trimmed and lower-cased.
        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Name']),
            filled,
        )
        assert standardized['Name'].iloc[0] == 'john doe'

    def test_pipeline_with_large_dataset(self, mock_context):
        """De-duplicate a 1000-row frame that had 100 rows appended twice."""
        size = 1000
        full_cycles, leftover = divmod(size, 3)
        statuses = (
            ['ACTIVE', 'INACTIVE', 'PENDING'] * full_cycles
            + ['ACTIVE'] * leftover
        )
        base = pd.DataFrame({
            'ID': list(range(size)),
            'Name': [f'User_{i % 50}' for i in range(size)],
            'Status': statuses,
            'Score': [i % 100 for i in range(size)],
        })

        # Re-append the first 100 rows so the frame holds known duplicates.
        with_dupes = pd.concat([base, base.head(100)], ignore_index=True)

        cleaned = remove_duplicates(mock_context, with_dupes)

        # Only the unique rows survive and all four columns are kept.
        assert cleaned.shape == (size, 4)
class TestErrorHandling:
    """Edge-case inputs: special floats, custom indexes, unicode, multi-fill."""

    def test_operation_with_corrupted_data(self, mock_context):
        """remove_duplicates must return rows for NaN and infinite values."""
        special_values = [float('nan'), float('inf'), -float('inf')]
        ordinary_values = [0, 1, 2]
        frame = pd.DataFrame({'Col': special_values + ordinary_values})

        deduped = remove_duplicates(mock_context, frame)

        # Only requires that the result is non-empty.
        assert deduped.shape[0] > 0

    def test_operation_preserves_index(self, mock_context):
        """remove_duplicates on a frame with a custom string index."""
        labels = ['a', 'b', 'c', 'd']
        frame = pd.DataFrame({'Col': [1, 2, 1, 3]}, index=labels)

        deduped = remove_duplicates(mock_context, frame)

        # The index may have been reset, so only the row count is verified.
        assert deduped.shape[0] == 3

    def test_standardize_with_unicode_characters(self, mock_context):
        """Standardization must keep every row for non-ASCII names."""
        names = ['José', 'François', 'Müller']
        frame = pd.DataFrame({'Name': names})

        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Name']),
            frame,
        )

        # Only the row count is checked here, not the resulting values.
        assert standardized.shape[0] == 3

    def test_fill_with_same_key_multiple_times(self, mock_context):
        """Fill three columns at once, each with its own replacement value."""
        frame = pd.DataFrame({
            'A': ['1', None, '3'],
            'B': [None, None, 'c'],
            'C': [None, '2', None],
        })
        replacements = {
            'A': '-1',
            'B': 'EMPTY',
            'C': '0',
        }

        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map=replacements),
            frame,
        )

        # One originally-missing cell per column: (row label, column) -> value.
        expected_cells = {
            (1, 'A'): '-1',
            (0, 'B'): 'EMPTY',
            (0, 'C'): '0',
        }
        for (row, column), value in expected_cells.items():
            assert filled.loc[row, column] == value
class TestDataTypePreservation:
    """Checks that ops leave column dtypes unchanged."""

    def test_remove_duplicates_preserves_dtypes(self, mock_context):
        """Numeric column dtypes must be identical after de-duplication."""
        frame = pd.DataFrame({
            'int32': pd.array([1, 2, 1], dtype='int32'),
            'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'),
            'str': ['a', 'b', 'a'],
        })

        deduped = remove_duplicates(mock_context, frame)

        # Only the two numeric columns are compared; 'str' is not asserted.
        for column in ('int32', 'float64'):
            assert deduped[column].dtype == frame[column].dtype

    def test_fill_missing_preserves_column_types_where_possible(self, mock_context):
        """Filling column 'A' must not change the dtype of column 'B'."""
        frame = pd.DataFrame({
            'A': pd.array(['1', None, '3'], dtype='string'),
            'B': ['x', 'y', 'z'],
        })

        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'A': '0'}),
            frame,
        )

        # The missing entry at label 1 takes the configured fill value.
        assert filled['A'].loc[1] == '0'
        assert filled['B'].dtype == frame['B'].dtype