import numpy as np
from scipy import stats
from collections import defaultdict


def detect_outliers_3sigma(series, threshold=3):
    """Detect outliers with the k-sigma rule (3-sigma by default).

    A point is flagged when it lies strictly outside
    ``mean ± threshold * std``, where ``std`` is computed with ``np.std``.

    Args:
        series: pandas Series of numeric values (``series.name`` is read and
            the series is indexed with a boolean mask).
        threshold: Number of standard deviations that defines the bounds.

    Returns:
        dict with the input series, its display name, ``mean``, ``std``,
        ``upper_bound``, ``lower_bound``, the flagged values (``outliers``),
        their positional indices (``outlier_indices``) and ``threshold``.
    """
    mean = np.mean(series)
    std = np.std(series)
    upper_bound = mean + threshold * std
    lower_bound = mean - threshold * std

    outliers = (series > upper_bound) | (series < lower_bound)
    outlier_indices = np.where(outliers)[0]

    return {
        'series': series,
        # Falls back to a generic label when the series has no (truthy) name.
        'series_name': series.name if series.name else '序列',
        'mean': mean,
        'std': std,
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        'outliers': series[outliers],
        'outlier_indices': outlier_indices,
        'threshold': threshold
    }


def detect_outliers_iqr(series, k=1.5):
    """Detect outliers with the interquartile-range (Tukey fence) rule.

    A point is flagged when it lies strictly outside
    ``[Q1 - k * IQR, Q3 + k * IQR]``.

    Args:
        series: pandas Series of numeric values.
        k: Fence multiplier applied to the IQR.

    Returns:
        dict with the input series, its display name, ``q1``, ``q3``, ``iqr``,
        ``upper_bound``, ``lower_bound``, the flagged values (``outliers``),
        their positional indices (``outlier_indices``) and ``k``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    lower_bound = q1 - k * iqr

    outliers = (series > upper_bound) | (series < lower_bound)
    outlier_indices = np.where(outliers)[0]

    return {
        'series': series,
        # Falls back to a generic label when the series has no (truthy) name.
        'series_name': series.name if series.name else '序列',
        'q1': q1,
        'q3': q3,
        'iqr': iqr,
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        'outliers': series[outliers],
        'outlier_indices': outlier_indices,
        'k': k
    }


def detect_outliers_grubbs(series, alpha=0.05):
    """Detect outliers by applying the two-sided Grubbs test iteratively.

    On each pass the point farthest from the mean of the remaining data is
    tested; if its statistic ``G = max|x - mean| / s`` exceeds the critical
    value it is removed and the test is repeated on what is left.

    Args:
        series: pandas Series of numeric values.
        alpha: Significance level of each individual test.

    Returns:
        dict with the input series, its display name, and
        ``mean`` / ``std`` / ``upper_bound`` / ``lower_bound`` taken from the
        last test that was evaluated (bounds are ``mean ± critical * std``),
        the flagged values (``outliers``), their positions in the original
        series in detection order (``outlier_indices``) and ``alpha``.
        With fewer than three points no test can be run: nothing is flagged
        and the bounds are NaN.
    """
    values = series.values
    n = len(values)
    # Original positions of the points still in ``values``; needed because
    # indices into the shrinking array drift after every deletion.
    positions = np.arange(n)
    outlier_indices = []

    # Defaults for series that are too short to be tested at all.
    mean = np.mean(values) if n else np.nan
    std = np.std(values, ddof=1) if n > 1 else np.nan
    critical = np.nan

    while n > 2:
        mean = np.mean(values)
        # Grubbs' statistic and its critical value are defined in terms of
        # the sample standard deviation (n - 1 denominator).
        std = np.std(values, ddof=1)
        t = stats.t.ppf(1 - alpha / (2 * n), n - 2)
        critical = (n - 1) / np.sqrt(n) * np.sqrt(t**2 / (n - 2 + t**2))

        if std == 0:
            # All remaining points are identical; nothing can be an outlier.
            break

        abs_dev = np.abs(values - mean)
        max_idx = int(np.argmax(abs_dev))
        g = abs_dev[max_idx] / std
        if not g > critical:
            break

        outlier_indices.append(int(positions[max_idx]))
        values = np.delete(values, max_idx)
        positions = np.delete(positions, max_idx)
        n -= 1

    upper_bound = mean + critical * std
    lower_bound = mean - critical * std

    return {
        'series': series,
        # Falls back to a generic label when the series has no (truthy) name.
        'series_name': series.name if series.name else '序列',
        'mean': mean,
        'std': std,
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        # Positional lookup: outlier_indices are positions, not index labels.
        'outliers': series.iloc[outlier_indices],
        'outlier_indices': outlier_indices,
        'alpha': alpha
    }
def detect_outliers_gesd(series, alpha=0.05, max_outliers=None):
    """Detect outliers with the generalized ESD (GESD) test.

    For ``i = 1 .. max_outliers`` the point farthest from the mean of the
    remaining data is removed and its statistic ``R_i = max|x - mean| / s``
    is compared with the critical value ``lambda_i``.  The number of
    outliers is the largest ``i`` for which ``R_i > lambda_i``; the outliers
    are the first that many removed points.

    Args:
        series: pandas Series of numeric values.
        alpha: Significance level of the test.
        max_outliers: Upper bound on the number of outliers to look for.
            Defaults to 10% of the data (``len(series) // 10``).

    Returns:
        dict with the input series, its display name, ``mean`` and ``std`` of
        the full series, ``upper_bound`` / ``lower_bound``, the flagged values
        (``outliers``), their positions in the original series in removal
        order (``outlier_indices``), ``alpha``, ``max_outliers`` and the
        per-step ``r_values`` / ``lambda_values``.
    """
    n = len(series)
    if max_outliers is None:
        max_outliers = n // 10  # by default test at most 10% of the points

    # Relabel with 0..n-1 so that labels ARE the original positions; this
    # keeps the bookkeeping correct for any index (non-default, duplicated).
    remaining = series.reset_index(drop=True)

    # The t quantile needs n - i - 1 >= 1 degrees of freedom.
    max_steps = min(max_outliers, n - 2)

    candidates = []
    r_values = []
    lambda_values = []
    n_outliers = 0

    for i in range(1, max_steps + 1):
        # Sample standard deviation (ddof=1), as the GESD statistic requires.
        std = remaining.std()
        if not std > 0:
            # Zero or undefined spread: no further statistic can be formed.
            break

        abs_dev = (remaining - remaining.mean()).abs()
        pos = abs_dev.idxmax()
        r = abs_dev.loc[pos] / std

        p = 1 - alpha / (2 * (n - i + 1))
        t = stats.t.ppf(p, n - i - 1)
        lambda_val = (n - i) * t / np.sqrt((n - i - 1 + t**2) * (n - i + 1))

        r_values.append(r)
        lambda_values.append(lambda_val)
        candidates.append(int(pos))
        remaining = remaining.drop(pos)

        # Do not stop at the first non-significant step: a later step may
        # still be significant once masking points have been removed.
        if r > lambda_val:
            n_outliers = i

    outlier_indices = candidates[:n_outliers]

    # NOTE(review): these bounds are kept exactly as before; they look like
    # display limits rather than a statistical fence — confirm with callers.
    if outlier_indices:
        upper_bound = series.iloc[outlier_indices].max() + 0.1 * series.std()
        lower_bound = series.iloc[outlier_indices].min() - 0.1 * series.std()
    else:
        upper_bound = series.mean() + 3 * series.std()
        lower_bound = series.mean() - 3 * series.std()

    return {
        'series': series,
        # Falls back to a generic label when the series has no (truthy) name.
        'series_name': series.name if series.name else '序列',
        'mean': np.mean(series),
        'std': np.std(series),
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        # Positional lookup: outlier_indices are positions, not index labels.
        'outliers': series.iloc[outlier_indices],
        'outlier_indices': outlier_indices,
        'alpha': alpha,
        'max_outliers': max_outliers,
        'r_values': r_values,
        'lambda_values': lambda_values
    }