"""Outlier detection helpers for pandas Series.

Four detectors are provided -- 3-sigma, IQR, Grubbs and generalized ESD.
Each takes a pandas Series and returns a dict describing the bounds that were
used and the points that were flagged.  ``outlier_indices`` always holds
0-based *positions* in the input series (not index labels).
"""
from collections import defaultdict

import numpy as np
from scipy import stats

# Label reported when the input series carries no usable name.
_DEFAULT_SERIES_NAME = '序列'


def _series_name(series):
    """Return the name of *series*, or the default label when it has none.

    Only ``None`` and the empty string count as "no name"; a falsy but valid
    name such as the integer column label ``0`` is returned unchanged.
    """
    name = series.name
    if name is None or (isinstance(name, str) and not name):
        return _DEFAULT_SERIES_NAME
    return name


def _valid_observations(series):
    """Return ``(values, positions)`` for the non-NaN entries of *series*.

    ``values`` is a float ndarray and ``positions`` holds the 0-based position
    of each value in the original series.
    """
    data = np.asarray(series, dtype=float)
    positions = np.flatnonzero(~np.isnan(data))
    return data[positions], positions


def _esd_critical(m, alpha):
    """Return the two-sided critical value of the extreme studentized deviate.

    This is the Grubbs critical value for a sample of *m* observations; the
    GESD lambda at step ``i`` is the same quantity with ``m = n - i + 1``.
    """
    t = stats.t.ppf(1 - alpha / (2 * m), m - 2)
    return (m - 1) / np.sqrt(m) * np.sqrt(t**2 / (m - 2 + t**2))


def detect_outliers_3sigma(series, threshold=3):
    """Flag points lying more than ``threshold`` standard deviations from the mean.

    Args:
        series: pandas Series of numeric observations.
        threshold: Half-width of the accepted band, in standard deviations.

    Returns:
        dict with the input series, its display name, ``mean``, ``std``,
        ``upper_bound``/``lower_bound``, the flagged ``outliers`` (a Series),
        their positions in ``outlier_indices`` (ndarray) and ``threshold``.
    """
    mean = np.mean(series)
    std = np.std(series)

    half_width = threshold * std
    upper_bound = mean + half_width
    lower_bound = mean - half_width

    outliers = (series > upper_bound) | (series < lower_bound)

    return {
        'series': series,
        'series_name': _series_name(series),
        'mean': mean,
        'std': std,
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        'outliers': series[outliers],
        'outlier_indices': np.where(outliers)[0],
        'threshold': threshold
    }


def detect_outliers_iqr(series, k=1.5):
    """Flag points lying more than ``k`` IQRs outside the quartiles.

    Args:
        series: pandas Series of numeric observations.
        k: Multiplier applied to the inter-quartile range.

    Returns:
        dict with the input series, its display name, ``q1``, ``q3``, ``iqr``,
        ``upper_bound``/``lower_bound``, the flagged ``outliers`` (a Series),
        their positions in ``outlier_indices`` (ndarray) and ``k``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1

    upper_bound = q3 + k * iqr
    lower_bound = q1 - k * iqr

    outliers = (series > upper_bound) | (series < lower_bound)

    return {
        'series': series,
        'series_name': _series_name(series),
        'q1': q1,
        'q3': q3,
        'iqr': iqr,
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        'outliers': series[outliers],
        'outlier_indices': np.where(outliers)[0],
        'k': k
    }


def detect_outliers_grubbs(series, alpha=0.05):
    """Detect outliers by applying the two-sided Grubbs test repeatedly.

    The most extreme remaining point is tested and removed until the test no
    longer rejects or fewer than three points remain.  NaN entries are ignored.

    Args:
        series: pandas Series of numeric observations.
        alpha: Significance level of each individual test.

    Returns:
        dict with the input series, its display name, and ``mean``, ``std``
        (sample standard deviation) and ``upper_bound``/``lower_bound`` taken
        from the last test that was evaluated, plus the flagged ``outliers``
        (a Series), their positions in ``outlier_indices`` (list, in order of
        detection) and ``alpha``.  When fewer than three valid observations
        are available no test can be run and the bounds are NaN.
    """
    remaining, positions = _valid_observations(series)
    outlier_indices = []

    # Defaults for inputs too small to test, so the result is always complete.
    mean = np.mean(remaining) if remaining.size else np.nan
    std = np.std(remaining, ddof=1) if remaining.size > 1 else np.nan
    critical = np.nan

    while remaining.size > 2:
        mean = np.mean(remaining)
        # Grubbs' statistic is defined with the sample standard deviation.
        std = np.std(remaining, ddof=1)
        critical = _esd_critical(remaining.size, alpha)
        if not std > 0:
            # Constant (or non-finite) data: the statistic is undefined.
            break

        abs_dev = np.abs(remaining - mean)
        worst = int(np.argmax(abs_dev))
        if not abs_dev[worst] / std > critical:
            break

        # `positions` shrinks together with `remaining`, so the recorded value
        # is always the position in the *original* series.
        outlier_indices.append(int(positions[worst]))
        remaining = np.delete(remaining, worst)
        positions = np.delete(positions, worst)

    upper_bound = mean + critical * std
    lower_bound = mean - critical * std

    return {
        'series': series,
        'series_name': _series_name(series),
        'mean': mean,
        'std': std,
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        'outliers': series.iloc[outlier_indices],
        'outlier_indices': outlier_indices,
        'alpha': alpha
    }


def detect_outliers_gesd(series, alpha=0.05, max_outliers=None):
    """Detect outliers with the generalized ESD (Rosner) procedure.

    Up to ``max_outliers`` extreme points are removed one at a time; at each
    step the test statistic ``R_i`` and critical value ``lambda_i`` are
    recorded.  The number of outliers is the largest ``i`` with
    ``R_i > lambda_i``, which protects against masking by neighbouring
    outliers.  NaN entries are ignored.

    Args:
        series: pandas Series of numeric observations.
        alpha: Significance level.
        max_outliers: Upper bound on the number of outliers to look for.
            Defaults to 10% of the valid observations.

    Returns:
        dict with the input series, its display name, ``mean``, ``std``,
        ``upper_bound``/``lower_bound``, the flagged ``outliers`` (a Series),
        their positions in ``outlier_indices`` (list, in order of removal),
        ``alpha``, ``max_outliers`` and the per-step ``r_values`` and
        ``lambda_values``.
    """
    remaining, positions = _valid_observations(series)
    n = remaining.size
    if max_outliers is None:
        max_outliers = n // 10  # by default treat at most 10% of points as outliers

    candidates = []
    r_values = []
    lambda_values = []
    n_outliers = 0

    # Step i tests n - i + 1 points and needs n - i - 1 >= 1 degrees of freedom.
    last_step = min(max_outliers, n - 2)
    for i in range(1, last_step + 1):
        mean = np.mean(remaining)
        std = np.std(remaining, ddof=1)
        if not std > 0:
            # Remaining data is constant (or non-finite): nothing left to test.
            break

        abs_dev = np.abs(remaining - mean)
        worst = int(np.argmax(abs_dev))
        r = abs_dev[worst] / std
        lambda_val = _esd_critical(remaining.size, alpha)
        r_values.append(r)
        lambda_values.append(lambda_val)

        candidates.append(int(positions[worst]))
        if r > lambda_val:
            n_outliers = i

        # Always remove the extreme point, even when this step is not
        # significant: a later step may still be (masking).
        remaining = np.delete(remaining, worst)
        positions = np.delete(positions, worst)

    outlier_indices = candidates[:n_outliers]

    spread = series.std()
    if outlier_indices:
        # NOTE(review): these bounds bracket the flagged points with a small
        # margin rather than separating inliers from outliers -- confirm this
        # is what consumers of 'upper_bound'/'lower_bound' expect.
        flagged = series.iloc[outlier_indices]
        upper_bound = flagged.max() + 0.1 * spread
        lower_bound = flagged.min() - 0.1 * spread
    else:
        center = series.mean()
        upper_bound = center + 3 * spread
        lower_bound = center - 3 * spread

    return {
        'series': series,
        'series_name': _series_name(series),
        'mean': np.mean(series),
        'std': np.std(series),
        'upper_bound': upper_bound,
        'lower_bound': lower_bound,
        'outliers': series.iloc[outlier_indices],
        'outlier_indices': outlier_indices,
        'alpha': alpha,
        'max_outliers': max_outliers,
        'r_values': r_values,
        'lambda_values': lambda_values
    }