# 数据清洗与预处理

## 1. 技术分析

### 1.1 数据清洗概述

数据清洗是数据分析的基础步骤：

```text
数据质量问题
    缺失值: 数据不完整
    异常值: 数据不合理
    重复值: 数据重复
    不一致: 格式不统一

清洗步骤:
    检测问题
    处理问题
    验证结果
```

### 1.2 数据预处理

```text
预处理技术
    数据转换: 格式转换
    特征缩放: 标准化/归一化
    编码处理: 类别编码
    特征选择: 选择重要特征

预处理目标:
    提高数据质量
    适配模型要求
    提升模型性能
```

### 1.3 数据质量问题分类

| 问题类型 | 表现 | 处理方法 |
| --- | --- | --- |
| 缺失值 | NaN/空值 | 删除/填充 |
| 异常值 | 离群点 | 删除/修正 |
| 重复值 | 相同记录 | 删除 |
| 格式问题 | 类型错误 | 转换 |

## 2. 核心功能实现

### 2.1 缺失值处理

```python
import pandas as pd
import numpy as np


class MissingValueHandler:
    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.fill_values = {}

    def fit(self, df):
        if self.strategy == 'mean':
            self.fill_values = df.mean(numeric_only=True).to_dict()
        elif self.strategy == 'median':
            self.fill_values = df.median(numeric_only=True).to_dict()
        elif self.strategy == 'mode':
            self.fill_values = df.mode().iloc[0].to_dict()
        elif self.strategy == 'constant':
            for col in df.columns:
                if df[col].dtype == 'object':
                    self.fill_values[col] = 'Unknown'
                else:
                    self.fill_values[col] = 0

    def transform(self, df):
        return df.fillna(self.fill_values)

    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)


# 使用示例
handler = MissingValueHandler(strategy='median')
cleaned_df = handler.fit_transform(df)
```

### 2.2 异常值检测

```python
class OutlierDetector:
    def __init__(self, method='iqr', threshold=1.5):
        self.method = method
        self.threshold = threshold
        self.bounds = {}

    def fit(self, df, columns=None):
        if columns is None:
            columns = df.select_dtypes(include=[np.number]).columns

        for col in columns:
            if self.method == 'iqr':
                q1 = df[col].quantile(0.25)
                q3 = df[col].quantile(0.75)
                iqr = q3 - q1
                self.bounds[col] = {
                    'lower': q1 - self.threshold * iqr,
                    'upper': q3 + self.threshold * iqr
                }
            elif self.method == 'zscore':
                mean = df[col].mean()
                std = df[col].std()
                self.bounds[col] = {
                    'lower': mean - self.threshold * std,
                    'upper': mean + self.threshold * std
                }

    def detect(self, df):
        outliers = pd.DataFrame()
        for col, bounds in self.bounds.items():
            mask = (df[col] < bounds['lower']) | (df[col] > bounds['upper'])
            outliers = pd.concat([outliers, df[mask]])
        return outliers.drop_duplicates()

    def remove_outliers(self, df):
        cleaned_df = df.copy()
        for col, bounds in self.bounds.items():
            mask = (df[col] >= bounds['lower']) & (df[col] <= bounds['upper'])
            cleaned_df = cleaned_df[mask]
        return cleaned_df
```

### 2.3 特征缩放

```python
class FeatureScaler:
    def __init__(self, method='standardization'):
        self.method = method
        self.params = {}

    def fit(self, df, columns=None):
        if columns is None:
            columns = df.select_dtypes(include=[np.number]).columns

        if self.method == 'standardization':
            for col in columns:
                self.params[col] = {
                    'mean': df[col].mean(),
                    'std': df[col].std()
                }
        elif self.method == 'minmax':
            for col in columns:
                self.params[col] = {
                    'min': df[col].min(),
                    'max': df[col].max()
                }
        elif self.method == 'normalization':
            for col in columns:
                self.params[col] = {
                    'mean': df[col].mean(),
                    'max_abs': df[col].abs().max()
                }

    def transform(self, df):
        transformed = df.copy()
        for col, params in self.params.items():
            if self.method == 'standardization':
                transformed[col] = (df[col] - params['mean']) / params['std']
            elif self.method == 'minmax':
                transformed[col] = (df[col] - params['min']) / (params['max'] - params['min'])
            elif self.method == 'normalization':
                transformed[col] = df[col] / params['max_abs']
        return transformed

    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)
```

### 2.4 类别编码

```python
class CategoricalEncoder:
    def __init__(self, method='onehot'):
        self.method = method
        self.mappings = {}

    def fit(self, df, columns=None):
        if columns is None:
            columns = df.select_dtypes(include=['object']).columns

        for col in columns:
            unique_vals = df[col].unique()
            if self.method == 'onehot':
                self.mappings[col] = {val: i for i, val in enumerate(unique_vals)}
            elif self.method == 'label':
                self.mappings[col] = {val: i for i, val in enumerate(sorted(unique_vals))}
            elif self.method == 'frequency':
                freq = df[col].value_counts(normalize=True)
                self.mappings[col] = freq.to_dict()

    def transform(self, df):
        transformed = df.copy()
        for col, mapping in self.mappings.items():
            if self.method == 'onehot':
                for val in mapping:
                    transformed[f'{col}_{val}'] = (df[col] == val).astype(int)
                transformed = transformed.drop(col, axis=1)
            elif self.method == 'label':
                transformed[col] = df[col].map(mapping)
            elif self.method == 'frequency':
                transformed[col] = df[col].map(mapping)
        return transformed

    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)
```

## 3. 性能对比

### 3.1 缺失值处理对比

| 方法 | 优点 | 缺点 | 适用场景 |
| --- | --- | --- | --- |
| 删除 | 简单 | 数据丢失 | 缺失少 |
| 均值填充 | 保留数据 | 低估方差 | 正态分布 |
| 中位数填充 | 抗异常值 | 信息损失 | 偏态分布 |

### 3.2 异常值检测对比

| 方法 | 检测能力 | 复杂度 | 适用数据 |
| --- | --- | --- | --- |
| IQR | 稳健 | 低 | 任意分布 |
| Z-score | 敏感 | 低 | 正态分布 |
| DBSCAN | 多维度 | 高 | 复杂数据 |

### 3.3 特征缩放对比

| 方法 | 范围 | 适用场景 |
| --- | --- | --- |
| 标准化 | (-∞, ∞) | 回归、SVM |
| 归一化 | [-1, 1] | 神经网络 |
| 最小最大 | [0, 1] | 图像数据 |

## 4. 最佳实践

### 4.1 数据清洗流程

```python
def data_cleaning_pipeline(df):
    # 1. 检查重复值
    initial_rows = len(df)
    df = df.drop_duplicates()
    print(f'Removed {initial_rows - len(df)} duplicates')

    # 2. 处理缺失值
    handler = MissingValueHandler(strategy='median')
    df = handler.fit_transform(df)
    print(f'Handled missing values')

    # 3. 检测异常值
    detector = OutlierDetector(method='iqr')
    detector.fit(df)
    outliers = detector.detect(df)
    print(f'Found {len(outliers)} outliers')

    # 4. 特征缩放
    scaler = FeatureScaler(method='standardization')
    df = scaler.fit_transform(df)
    print(f'Applied feature scaling')

    # 5. 类别编码
    encoder = CategoricalEncoder(method='onehot')
    df = encoder.fit_transform(df)
    print(f'Applied categorical encoding')

    return df
```

### 4.2 数据验证

```python
def validate_cleaning(df, original_df):
    checks = [
        ('无缺失值', df.isnull().sum().sum() == 0),
        ('无重复值', df.duplicated().sum() == 0),
        ('数值范围合理', (df > -3).all().all() and (df < 3).all().all()),
        ('行数合理', len(df) > len(original_df) * 0.8)
    ]

    print('数据清洗验证:')
    for check_name, result in checks:
        status = '✓' if result else '✗'
        print(f'{status} {check_name}')
```

## 5. 总结

数据清洗是数据分析的关键步骤：

- 缺失值处理：填充或删除
- 异常值检测：识别并处理离群点
- 特征缩放：标准化或归一化
- 类别编码：转换分类变量

对比数据如下：

- IQR方法最稳健
- 中位数填充适合偏态数据
- 标准化适合大多数ML算法
- 推荐使用Pipeline组合多个步骤

数据质量决定分析结果的可靠性，必须重视数据清洗工作。