import pandas as pd
import numpy as np


def cleanse_data(df: pd.DataFrame, remove_duplicates: bool, missing_strategy: str):
    """Return a cleansed copy of *df* together with its before/after shapes.

    The input DataFrame is never modified; all work happens on a copy.

    Args:
        df: pandas DataFrame to clean.
        remove_duplicates: when true, fully duplicated rows are dropped
            (first occurrence kept) before missing values are handled.
        missing_strategy: how to treat missing values:
            - 'drop': drop every row containing at least one missing value.
            - 'impute_mean' / 'impute_median': fill missing values in
              numeric columns with that column's mean / median.
              Non-numeric columns are left untouched.
            - 'impute_mode': fill missing values in every column with that
              column's most frequent value (the first one if there are
              ties). Columns with no non-missing values are left as is.
            Any other value leaves missing values untouched.

    Returns:
        A 3-tuple ``(df_clean, original_shape, cleaned_shape)`` where the
        shapes are ``(rows, cols)`` tuples taken before and after cleansing.
    """
    df = df.copy()
    original_shape = df.shape

    if remove_duplicates:
        df = df.drop_duplicates()

    if missing_strategy == 'drop':
        df = df.dropna()
    elif missing_strategy in ('impute_mean', 'impute_median'):
        # Decide once which statistic to use instead of re-checking the
        # strategy for every column.
        use_mean = missing_strategy == 'impute_mean'
        for col in df.select_dtypes(include=[np.number]).columns:
            fill_value = df[col].mean() if use_mean else df[col].median()
            # For an all-missing column the statistic is NaN, so this
            # fillna is a no-op and the column stays as it was.
            df[col] = df[col].fillna(fill_value)
    elif missing_strategy == 'impute_mode':
        for col in df.columns:
            modes = df[col].mode()
            # mode() is empty when the column has no non-missing values.
            if not modes.empty:
                # Positional access: take the first mode regardless of how
                # the resulting Series happens to be labelled.
                df[col] = df[col].fillna(modes.iloc[0])

    return df, original_shape, df.shape