# -*- coding: UTF-8 -*-
"""
@Project -> File   :IoD_data_analysis_tool -> outlierprocessing
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:24
@Desc   :Outlier detection and handling helpers built on pandas.
"""
from typing import Union

import pandas as pd


def MAD(data: Union[pd.Series, pd.DataFrame], n: float = 2.5,
        constant=1.4826, axis=0):
    """Clip values that lie more than ``n`` scaled MADs from the median.

    MAD = constant * median(|Xi - median(X)|)

    :param data: values to clip (Series, or DataFrame).
    :param n: half-width of the accepted band, in multiples of the scaled MAD.
    :param constant: factor the raw median absolute deviation is multiplied by.
    :param axis: forwarded to ``DataFrame.clip`` only; unused for a Series.
    :return: a copy of ``data`` with values limited to
        ``[median - n * MAD, median + n * MAD]``. Note that this is the
        clipped data itself, not an index of outlier positions.
    """
    center = data.median()
    scaled_mad = (data - center).abs().median() * constant
    offset = n * scaled_mad
    lower, upper = center - offset, center + offset
    if isinstance(data, pd.DataFrame):
        # NOTE(review): ``data.median()`` reduces each column, so the bounds
        # are keyed by column label, while ``axis`` selects the axis the
        # bounds are aligned on in ``clip``. The default ``axis=0`` aligns
        # them with the row labels -- per-column clipping presumably needs
        # ``axis=1``; confirm against callers before changing the default.
        return data.clip(lower=lower, upper=upper, axis=axis)
    return data.clip(lower=lower, upper=upper)


def three_sigma(data: pd.Series):
    """Return the index labels of values outside ``mean +/- 3 * std``.

    :param data: numeric Series to inspect.
    :return: ``pandas.Index`` with the labels of the flagged values.
    """
    mean = data.mean()
    std = data.std()
    lower, upper = mean - 3 * std, mean + 3 * std
    return data.index[(data < lower) | (data > upper)]


def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75,
             k: float = 1.5):
    """Return the index labels of values outside the box-plot whiskers.

    The whiskers are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where
    ``IQR = Q3 - Q1``.

    :param data: numeric Series to inspect.
    :param q1: quantile used as the lower edge of the box.
    :param q3: quantile used as the upper edge of the box.
    :param k: whisker length as a multiple of the IQR.
    :return: ``pandas.Index`` with the labels of the flagged values.
    """
    # Ask for each quantile as a scalar: looking the values up by label in a
    # two-element quantile Series yields a Series (not a scalar) when
    # q1 == q3, which breaks the comparisons below.
    lower_quartile = data.quantile(q1)
    upper_quartile = data.quantile(q3)
    iqr = upper_quartile - lower_quartile
    lower_whisker_limit = lower_quartile - k * iqr
    upper_whisker_limit = upper_quartile + k * iqr
    return data.index[(data < lower_whisker_limit) | (data > upper_whisker_limit)]


def regex_match(data: pd.Series, *patterns):
    """Return the index labels of values matching any of ``patterns``.

    Values are converted with ``astype(str)`` before matching, so missing
    values are matched against their string form. The patterns are combined
    into one regular expression with ``|``.

    :param data: Series to search.
    :param patterns: regular-expression strings.
    :return: ``pandas.Index`` with the labels of the matching values.
    """
    pattern = '|'.join(patterns)
    return data.index[data.astype(str).str.contains(pattern, regex=True)]


def empty(data: Union[pd.Series, pd.DataFrame]):
    """Report whether ``data`` contains at least one missing value.

    :param data: Series or DataFrame to check.
    :return: ``True`` if any element is null, otherwise ``False``.
    """
    # Reduce over the underlying boolean array. The builtin ``any`` iterates
    # a DataFrame's column labels rather than its cells, which gave answers
    # unrelated to the data.
    return bool(data.isnull().to_numpy().any())