Skip to main content
Glama
test_data_discovery.py12.9 kB
"""Test cases for data discovery module.""" import os import tempfile import pytest import pandas as pd import numpy as np from datetime import datetime, date from src.data_discovery import DataDiscoveryEngine from src.file_operations import ExcelFileManager class TestDataDiscoveryEngine: """Test cases for DataDiscoveryEngine class.""" def setup_method(self): """Set up test fixtures.""" self.discovery = DataDiscoveryEngine() self.file_manager = ExcelFileManager() self.temp_dir = tempfile.mkdtemp() # Create comprehensive test data self.test_data = pd.DataFrame({ 'id': [1, 2, 3, 4, 5], 'name': ['Alice', 'Bob', 'Charlie', None, 'Eve'], 'age': [25.5, 30, 35, 40, None], 'salary': [50000, 60000, 75000, 80000, 55000], 'is_active': [True, False, True, True, False], 'join_date': pd.to_datetime(['2020-01-15', '2019-06-20', '2021-03-10', '2018-12-01', '2022-05-30']), 'rating': [4.5, 3.8, 4.2, 4.9, 3.5] }) self.test_file = os.path.join(self.temp_dir, 'test_data.xlsx') self.test_data.to_excel(self.test_file, index=False, sheet_name='Employees') # Create multi-sheet file self.multi_sheet_file = os.path.join(self.temp_dir, 'multi_sheet.xlsx') with pd.ExcelWriter(self.multi_sheet_file) as writer: self.test_data.to_excel(writer, sheet_name='Employees', index=False) pd.DataFrame({ 'category': ['A', 'B', 'A', 'C', 'B'], 'value': [10, 20, 15, 25, 12] }).to_excel(writer, sheet_name='Categories', index=False) def teardown_method(self): """Clean up test fixtures.""" # Clean up temp files for file in [self.test_file, self.multi_sheet_file]: if os.path.exists(file): os.remove(file) os.rmdir(self.temp_dir) # Clear cache self.file_manager.clear_cache() def test_get_schema_success(self): """Test successful schema retrieval.""" result = self.discovery.get_schema(self.test_file) assert result['success'] is True assert result['total_columns'] == 7 assert result['total_rows'] == 5 assert len(result['schema']) == 7 # Check specific column schema name_col = next(col for col in result['schema'] if col['column_name'] == 'name') assert name_col['semantic_type'] == 'text' assert name_col['nullable'] is True assert name_col['null_count'] == 1 age_col = next(col for col in result['schema'] if col['column_name'] == 'age') assert age_col['semantic_type'] == 'float' assert age_col['nullable'] is True is_active_col = next(col for col in result['schema'] if col['column_name'] == 'is_active') assert is_active_col['semantic_type'] == 'boolean' join_date_col = next(col for col in result['schema'] if col['column_name'] == 'join_date') assert join_date_col['semantic_type'] == 'datetime' def test_get_schema_specific_sheet(self): """Test schema retrieval for specific sheet.""" result = self.discovery.get_schema(self.multi_sheet_file, 'Categories') assert result['success'] is True assert result['total_columns'] == 2 assert result['sheet_name'] == 'Categories' category_col = next(col for col in result['schema'] if col['column_name'] == 'category') assert category_col['semantic_type'] == 'text' assert category_col['unique_count'] == 3 def test_get_sample_data_head(self): """Test getting head sample data.""" result = self.discovery.get_sample_data(self.test_file, rows=3, sample_type="head") assert result['success'] is True assert result['sample_type'] == 'head' assert result['requested_rows'] == 3 assert result['returned_rows'] == 3 assert len(result['data']) == 3 assert result['data'][0]['id'] == 1 assert result['data'][0]['name'] == 'Alice' def test_get_sample_data_tail(self): """Test getting tail sample data.""" result = self.discovery.get_sample_data(self.test_file, rows=2, sample_type="tail") assert result['success'] is True assert result['sample_type'] == 'tail' assert result['returned_rows'] == 2 assert result['data'][0]['id'] == 4 assert result['data'][1]['id'] == 5 def test_get_sample_data_random(self): """Test getting random sample data.""" result = self.discovery.get_sample_data(self.test_file, rows=3, sample_type="random") assert result['success'] is True assert result['sample_type'] == 'random' assert result['returned_rows'] == 3 # Should have 3 different IDs ids = [row['id'] for row in result['data']] assert len(set(ids)) == 3 def test_get_sample_data_invalid_type(self): """Test invalid sample type.""" result = self.discovery.get_sample_data(self.test_file, sample_type="invalid") assert 'error' in result assert 'Invalid sample_type' in result['error'] def test_get_sample_data_invalid_rows(self): """Test invalid row count.""" result = self.discovery.get_sample_data(self.test_file, rows=0) assert 'error' in result assert 'Number of rows must be positive' in result['error'] def test_get_column_info_numeric(self): """Test detailed column info for numeric column.""" result = self.discovery.get_column_info(self.test_file, 'salary') assert result['success'] is True assert result['column_name'] == 'salary' assert result['total_count'] == 5 assert result['null_count'] == 0 assert result['unique_count'] == 5 assert result['min'] == 50000 assert result['max'] == 80000 assert 'mean' in result assert 'median' in result assert 'std' in result assert 'q25' in result assert 'q75' in result assert 'sample_values' in result def test_get_column_info_text(self): """Test detailed column info for text column.""" result = self.discovery.get_column_info(self.test_file, 'name') assert result['success'] is True assert result['column_name'] == 'name' assert result['null_count'] == 1 assert result['unique_count'] == 4 # One null assert 'top_values' in result assert 'sample_values' in result # Should not have numeric stats assert 'mean' not in result assert 'std' not in result def test_get_column_info_nonexistent(self): """Test column info for non-existent column.""" result = self.discovery.get_column_info(self.test_file, 'nonexistent') assert 'error' in result assert 'Column \'nonexistent\' not found' in result['error'] def test_get_dimensions(self): """Test getting dataset dimensions.""" result = self.discovery.get_dimensions(self.test_file) assert result['success'] is True assert result['rows'] == 5 assert result['columns'] == 7 assert result['total_cells'] == 35 assert result['memory_usage_mb'] >= 0 # Small datasets might have very small memory usage assert len(result['column_names']) == 7 assert 'id' in result['column_names'] assert 'dtypes' in result assert result['empty_rows'] == 0 assert result['empty_columns'] == 0 def test_describe_dataset_numeric_only(self): """Test dataset description with numeric columns only.""" result = self.discovery.describe_dataset(self.test_file, include_all=False) assert result['success'] is True assert result['total_rows'] == 5 assert result['total_columns'] == 7 assert 'numeric_summary' in result assert 'categorical_summary' not in result # Check numeric columns are included assert 'age' in result['numeric_summary'] assert 'salary' in result['numeric_summary'] assert 'rating' in result['numeric_summary'] # Check salary statistics salary_stats = result['numeric_summary']['salary'] assert salary_stats['count'] == 5 assert salary_stats['min'] == 50000 assert salary_stats['max'] == 80000 # Check data quality metrics assert 'data_quality' in result assert result['data_quality']['total_missing_values'] > 0 # We have nulls assert result['data_quality']['duplicate_rows'] == 0 assert result['data_quality']['complete_rows'] == 3 # Only 3 rows without nulls def test_describe_dataset_include_all(self): """Test dataset description including categorical columns.""" result = self.discovery.describe_dataset(self.test_file, include_all=True) assert result['success'] is True assert 'numeric_summary' in result assert 'categorical_summary' in result # Check categorical summary assert 'name' in result['categorical_summary'] name_stats = result['categorical_summary']['name'] assert name_stats['count'] == 4 # 4 non-null values assert name_stats['unique'] == 4 def test_file_not_found_errors(self): """Test error handling for non-existent files.""" nonexistent_file = '/nonexistent/file.xlsx' schema_result = self.discovery.get_schema(nonexistent_file) assert 'error' in schema_result sample_result = self.discovery.get_sample_data(nonexistent_file) assert 'error' in sample_result column_result = self.discovery.get_column_info(nonexistent_file, 'test') assert 'error' in column_result dim_result = self.discovery.get_dimensions(nonexistent_file) assert 'error' in dim_result desc_result = self.discovery.describe_dataset(nonexistent_file) assert 'error' in desc_result class TestDataDiscoveryIntegration: """Integration tests using the actual sample data file.""" def setup_method(self): """Set up integration test fixtures.""" self.discovery = DataDiscoveryEngine() self.sample_file = 'data/food_reviews.xlsx' def teardown_method(self): """Clean up after integration tests.""" self.discovery.file_manager.clear_cache() def test_sample_file_schema(self): """Test schema analysis of sample file.""" if os.path.exists(self.sample_file): result = self.discovery.get_schema(self.sample_file) assert result['success'] is True assert result['total_columns'] > 0 assert result['total_rows'] > 0 print(f"Sample file schema: {result['total_rows']} rows, {result['total_columns']} columns") for col in result['schema'][:3]: # Print first 3 columns print(f" {col['column_name']}: {col['semantic_type']} (nulls: {col['null_count']})") else: pytest.skip("Sample file not found") def test_sample_file_sample_data(self): """Test sample data from sample file.""" if os.path.exists(self.sample_file): result = self.discovery.get_sample_data(self.sample_file, rows=3) assert result['success'] is True assert len(result['data']) <= 3 assert len(result['columns']) > 0 print(f"Sample data columns: {result['columns']}") print(f"First row: {result['data'][0] if result['data'] else 'No data'}") else: pytest.skip("Sample file not found") def test_sample_file_description(self): """Test dataset description of sample file.""" if os.path.exists(self.sample_file): result = self.discovery.describe_dataset(self.sample_file, include_all=True) assert result['success'] is True assert 'data_quality' in result print(f"Data quality: {result['data_quality']}") if 'numeric_summary' in result: print(f"Numeric columns: {list(result['numeric_summary'].keys())}") if 'categorical_summary' in result: print(f"Categorical columns: {list(result['categorical_summary'].keys())}") else: pytest.skip("Sample file not found") if __name__ == '__main__': """Run tests directly.""" pytest.main([__file__, '-v'])

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ASH1998/excel-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server