"""Integration tests for data processing jobs."""

from unittest.mock import patch, MagicMock

import pandas as pd
import pytest

from template_code_location.data_processing.ops import (
    remove_duplicates,
    fill_missing_values,
    standardize_categorical_values,
    correct_typos,
)
from template_code_location.data_processing.config_models import (
    FillMissingConfiguration,
    ColumnsSelectConfiguration,
    SpellCheckConfiguration,
)


class TestPipelineIntegration:
    """Chain several data processing ops together and check the combined result."""

    def test_pipeline_remove_duplicates_then_standardize(self, mock_context):
        """Drop duplicate rows first, then standardize the text columns."""
        names = [' JOHN DOE ', 'jane smith', ' JOHN DOE ', 'bob johnson']
        cities = ['NEW YORK', 'los angeles', 'NEW YORK', 'chicago']
        raw = pd.DataFrame({'Name': names, 'City': cities})

        # Rows 0 and 2 are identical, so three rows are expected to remain.
        deduped = remove_duplicates(mock_context, raw)
        assert len(deduped.index) == 3

        # The first row is expected to come back trimmed and lower-cased.
        columns_cfg = ColumnsSelectConfiguration(columns=['Name', 'City'])
        cleaned = standardize_categorical_values(mock_context, columns_cfg, deduped)
        first_row = cleaned.iloc[0]
        assert first_row['Name'] == 'john doe'
        assert first_row['City'] == 'new york'

    def test_pipeline_fill_missing_then_standardize(self, mock_context):
        """Fill missing entries first, then standardize a categorical column."""
        raw = pd.DataFrame({
            'Category': [' ACTIVE ', None, ' PENDING '],
            'Value': ['1', '2', None],
        })

        # Only 'Value' has a configured replacement.
        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map={'Value': '0'}),
            raw,
        )

        standardized = standardize_categorical_values(
            mock_context,
            ColumnsSelectConfiguration(columns=['Category']),
            filled,
        )

        assert standardized['Category'].iloc[0] == 'active'
        # The missing 'Value' in the last row should now hold the fill value.
        assert filled['Value'].iloc[2] == '0'

    def test_pipeline_all_operations(self, mock_context):
        """Run dedupe, fill and standardize back to back on one frame."""
        frame = pd.DataFrame({
            'Name': [' john doe ', 'JANE SMITH', ' john doe ', None],
            'Value': ['1', None, '1', '2'],
        })

        # Stage 1: rows 0 and 2 are identical, leaving three rows.
        frame = remove_duplicates(mock_context, frame)
        assert frame.shape[0] == 3

        # Stage 2: afterwards no missing 'Value' entries may remain.
        value_fill = FillMissingConfiguration(fill_map={'Value': '0'})
        frame = fill_missing_values(mock_context, value_fill, frame)
        missing_values = frame['Value'].isna().sum()
        assert missing_values == 0

        # Stage 3: the first name is expected trimmed and lower-cased.
        name_only = ColumnsSelectConfiguration(columns=['Name'])
        frame = standardize_categorical_values(mock_context, name_only, frame)
        assert frame['Name'].iloc[0] == 'john doe'

    def test_pipeline_with_large_dataset(self, mock_context):
        """Deduplicate a 1000-row frame that has 100 repeated rows appended."""
        size = 1000
        full_cycles, leftover = divmod(size, 3)
        statuses = ['ACTIVE', 'INACTIVE', 'PENDING'] * full_cycles
        statuses += ['ACTIVE'] * leftover

        base = pd.DataFrame({
            'ID': list(range(size)),
            'Name': [f'User_{i % 50}' for i in range(size)],
            'Status': statuses,
            'Score': [i % 100 for i in range(size)],
        })

        # Append copies of the first 100 rows so there is something to remove.
        with_dupes = pd.concat([base, base.head(100)], ignore_index=True)

        cleaned = remove_duplicates(mock_context, with_dupes)

        row_count, column_count = cleaned.shape
        assert row_count == 1000
        assert column_count == 4


class TestErrorHandling:
    """Edge cases and unusual inputs for the individual ops."""

    def test_operation_with_corrupted_data(self, mock_context):
        """Deduplicate a column containing NaN and infinite values."""
        special_values = [float('nan'), float('inf'), -float('inf'), 0, 1, 2]
        frame = pd.DataFrame({'Col': special_values})

        # Special float values must not leave the result empty.
        deduped = remove_duplicates(mock_context, frame)
        assert deduped.shape[0] > 0

    def test_operation_preserves_index(self, mock_context):
        """Deduplicate a frame that carries a non-default string index."""
        frame = pd.DataFrame(
            {'Col': [1, 2, 1, 3]},
            index=['a', 'b', 'c', 'd'],
        )

        deduped = remove_duplicates(mock_context, frame)

        # The index may have been reset, so only the row count is checked.
        assert deduped.shape[0] == 3

    def test_standardize_with_unicode_characters(self, mock_context):
        """Standardize names that contain accented characters."""
        frame = pd.DataFrame({'Name': ['José', 'François', 'Müller']})
        name_only = ColumnsSelectConfiguration(columns=['Name'])

        standardized = standardize_categorical_values(mock_context, name_only, frame)

        # Only the row count is verified here.
        assert standardized.shape[0] == 3

    def test_fill_with_same_key_multiple_times(self, mock_context):
        """Fill three columns, each with its own replacement value."""
        frame = pd.DataFrame({
            'A': ['1', None, '3'],
            'B': [None, None, 'c'],
            'C': [None, '2', None],
        })
        replacements = {'A': '-1', 'B': 'EMPTY', 'C': '0'}

        filled = fill_missing_values(
            mock_context,
            FillMissingConfiguration(fill_map=replacements),
            frame,
        )

        assert filled.loc[1, 'A'] == '-1'
        assert filled.loc[0, 'B'] == 'EMPTY'
        assert filled.loc[0, 'C'] == '0'


class TestDataTypePreservation:
    """Checks that ops leave column dtypes unchanged."""

    def test_remove_duplicates_preserves_dtypes(self, mock_context):
        """Numeric column dtypes must be the same before and after dedupe."""
        frame = pd.DataFrame({
            'int32': pd.array([1, 2, 1], dtype='int32'),
            'float64': pd.array([1.5, 2.5, 1.5], dtype='float64'),
            'str': ['a', 'b', 'a'],
        })

        deduped = remove_duplicates(mock_context, frame)

        for column in ('int32', 'float64'):
            assert deduped[column].dtype == frame[column].dtype

    def test_fill_missing_preserves_column_types_where_possible(self, mock_context):
        """Filling one column must not change the dtype of an untouched one."""
        frame = pd.DataFrame({
            'A': pd.array(['1', None, '3'], dtype='string'),
            'B': ['x', 'y', 'z'],
        })
        fill_a = FillMissingConfiguration(fill_map={'A': '0'})

        filled = fill_missing_values(mock_context, fill_a, frame)

        assert filled['A'].loc[1] == '0'
        assert filled['B'].dtype == frame['B'].dtype