186 lines
6.6 KiB
Python
186 lines
6.6 KiB
Python
"""Integration tests for data processing jobs."""
|
|
|
|
import pytest
|
|
import pandas as pd
|
|
from unittest.mock import patch, MagicMock
|
|
from template_code_location.data_processing.ops import (
|
|
remove_duplicates,
|
|
fill_missing_values,
|
|
standardize_categorical_values,
|
|
correct_typos
|
|
)
|
|
from template_code_location.data_processing.config_models import (
|
|
FillMissingConfiguration,
|
|
ColumnsSelectConfiguration,
|
|
SpellCheckConfiguration
|
|
)
|
|
|
|
|
|
class TestPipelineIntegration:
    """End-to-end checks that chain several data processing ops together."""

    def test_pipeline_remove_duplicates_then_standardize(self, mock_context):
        """Drop duplicate rows first, then standardize both text columns."""
        raw = pd.DataFrame(
            {
                'Name': [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson'],
                'City': ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago'],
            }
        )

        # Rows 0 and 2 are identical, so three rows are expected to remain.
        deduped = remove_duplicates(mock_context, raw)
        assert deduped.shape[0] == 3

        columns_config = ColumnsSelectConfiguration(columns=['Name', 'City'])
        standardized = standardize_categorical_values(
            mock_context, columns_config, deduped
        )

        # The first row is expected to come back trimmed and lower-cased.
        first_name = standardized['Name'].iloc[0]
        first_city = standardized['City'].iloc[0]
        assert first_name == 'john doe'
        assert first_city == 'new york'

    def test_pipeline_fill_missing_then_standardize(self, mock_context):
        """Fill the gap in ``Value``, then standardize ``Category``."""
        raw = pd.DataFrame(
            {
                'Category': [' ACTIVE ', None, ' PENDING '],
                'Value': ['1', '2', None],
            }
        )

        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'Value': '0'}),
            raw,
        )
        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Category']),
            filled,
        )

        assert standardized['Category'].iloc[0] == 'active'
        # The fill result is verified on the intermediate frame returned by
        # fill_missing_values, not on the standardized frame.
        assert filled['Value'].iloc[2] == '0'

    def test_pipeline_all_operations(self, mock_context):
        """Run deduplication, filling and standardization back to back."""
        raw = pd.DataFrame(
            {
                'Name': [' john doe ', 'JANE SMITH', ' john doe ', None],
                'Value': ['1', None, '1', '2'],
            }
        )

        # Stage 1: rows 0 and 2 are identical, so three rows should survive.
        deduped = remove_duplicates(mock_context, raw)
        assert deduped.shape[0] == 3

        # Stage 2: after filling, 'Value' must contain no missing entries.
        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'Value': '0'}),
            deduped,
        )
        missing_values = filled['Value'].isna().sum()
        assert missing_values == 0

        # Stage 3: only 'Name' is standardized.
        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Name']),
            filled,
        )
        assert standardized['Name'].iloc[0] == 'john doe'

    def test_pipeline_with_large_dataset(self, mock_context):
        """Deduplicate a 1000-row frame that has 100 rows appended twice."""
        size = 1000
        row_ids = range(size)

        # Repeat the three statuses as often as they fit and pad the rest
        # with 'ACTIVE' so the column has exactly ``size`` entries.
        full_cycles, leftover = divmod(size, 3)
        statuses = ['ACTIVE', 'INACTIVE', 'PENDING'] * full_cycles
        statuses += ['ACTIVE'] * leftover

        base = pd.DataFrame(
            {
                'ID': list(row_ids),
                'Name': [f'User_{i % 50}' for i in row_ids],
                'Status': statuses,
                'Score': [i % 100 for i in row_ids],
            }
        )

        # Append a copy of the first 100 rows to create exact duplicates.
        repeated_rows = base.head(100)
        with_duplicates = pd.concat([base, repeated_rows], ignore_index=True)

        cleaned = remove_duplicates(mock_context, with_duplicates)

        # All 1000 unique rows and all 4 columns must still be present.
        assert cleaned.shape == (1000, 4)
|
|
|
|
|
|
class TestErrorHandling:
    """Edge cases: special floats, custom index, unicode text, multi-column fills."""

    def test_operation_with_corrupted_data(self, mock_context):
        """Deduplicate a column containing NaN and both infinities."""
        nan = float('nan')
        inf = float('inf')
        frame = pd.DataFrame({'Col': [nan, inf, -inf, 0, 1, 2]})

        result = remove_duplicates(mock_context, frame)

        # Only requires that the op returns a non-empty frame for this input.
        remaining_rows = result.shape[0]
        assert remaining_rows > 0

    def test_operation_preserves_index(self, mock_context):
        """Deduplicate a frame that uses a string index."""
        labels = ['a', 'b', 'c', 'd']
        frame = pd.DataFrame({'Col': [1, 2, 1, 3]}, index=labels)

        result = remove_duplicates(mock_context, frame)

        # The op may reset the index, so only the row count is verified:
        # the value 1 appears twice, leaving three unique rows.
        assert result.shape[0] == 3

    def test_standardize_with_unicode_characters(self, mock_context):
        """Standardize names that contain accented characters."""
        names = ['José', 'François', 'Müller']
        frame = pd.DataFrame({'Name': names})

        result = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Name']),
            frame,
        )

        # Only the row count is verified, not the standardized text.
        assert result.shape[0] == 3

    def test_fill_with_same_key_multiple_times(self, mock_context):
        """Fill three columns at once, each with its own replacement value."""
        frame = pd.DataFrame(
            {
                'A': ['1', None, '3'],
                'B': [None, None, 'c'],
                'C': [None, '2', None],
            }
        )
        fill_map = {'A': '-1', 'B': 'EMPTY', 'C': '0'}

        result = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map=fill_map),
            frame,
        )

        # One originally-missing cell per column, keyed by (row label, column).
        expected_fills = {(1, 'A'): '-1', (0, 'B'): 'EMPTY', (0, 'C'): '0'}
        for (row, column), fill_value in expected_fills.items():
            assert result.loc[row, column] == fill_value
|
|
|
|
|
|
class TestDataTypePreservation:
    """Checks that ops leave column dtypes unchanged where expected."""

    def test_remove_duplicates_preserves_dtypes(self, mock_context):
        """Numeric column dtypes must match before and after deduplication."""
        frame = pd.DataFrame(
            {
                'int32': pd.array([1, 2, 1], dtype='int32'),
                'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'),
                'str': ['a', 'b', 'a'],
            }
        )

        result = remove_duplicates(mock_context, frame)

        # Only the two numeric columns are compared; 'str' is not checked.
        for column in ('int32', 'float64'):
            assert result[column].dtype == frame[column].dtype

    def test_fill_missing_preserves_column_types_where_possible(self, mock_context):
        """Filling column A must not alter the dtype of untouched column B."""
        frame = pd.DataFrame(
            {
                'A': pd.array(['1', None, '3'], dtype='string'),
                'B': ['x', 'y', 'z'],
            }
        )

        result = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'A': '0'}),
            frame,
        )

        # The missing entry at row label 1 of 'A' should now hold the fill value.
        filled_cell = result['A'].loc[1]
        assert filled_cell == '0'

        dtype_before = frame['B'].dtype
        assert result['B'].dtype == dtype_before
|