52 lines
1.3 KiB
Python
52 lines
1.3 KiB
Python
# -*- coding: UTF-8 -*-
|
||
"""
|
||
@Project -> File :IoD_data_analysis_tool -> outlierprocessing
|
||
@IDE :PyCharm
|
||
@Author :rengengchen
|
||
@Date :2022/4/26 10:24
|
||
@Desc :Outlier handling helpers: MAD clipping, three-sigma, box-plot (IQR), regex matching and null checks
|
||
"""
|
||
from typing import Union
|
||
|
||
import pandas as pd
|
||
|
||
|
||
def MAD(data: pd.Series, n: float = 2.5, constant=1.4826, axis=0):
    """
    Clip values that lie further than ``n`` scaled MADs from the median.

    MAD = median(|Xi - median(X)|), multiplied by ``constant``.

    @param data: pandas Series (a DataFrame is handled by a separate branch)
    @param n: number of scaled MADs allowed on each side of the median
    @param constant: multiplier applied to the raw MAD
    @param axis: forwarded to ``clip`` for DataFrame input only
    @return ``data`` clipped to [median - n * MAD, median + n * MAD]
    """
    center = data.median()
    deviation = (data - center).abs()
    half_width = n * (deviation.median() * constant)
    bounds = {"lower": center - half_width, "upper": center + half_width}

    if not isinstance(data, pd.DataFrame):
        return data.clip(**bounds)

    # NOTE(review): for a DataFrame the bounds are Series produced by
    # ``median()`` with its default axis, while ``axis`` tells ``clip`` which
    # axis to align them with -- confirm the default axis=0 is what callers
    # want here.
    return data.clip(axis=axis, **bounds)
|
||
|
||
|
||
def three_sigma(data: pd.Series, n: float = 3):
    """
    Locate values lying outside mean +/- ``n`` standard deviations.

    @param data: pandas Series of numeric values
    @param n: number of standard deviations allowed on each side of the mean;
              the default of 3 keeps the original three-sigma behaviour
    @return pandas.Index holding the labels of the values strictly below the
            lower bound or strictly above the upper bound
    """
    miu = data.mean()
    # Standard deviation as computed by pandas' ``Series.std`` defaults.
    sigma = data.std()
    low = miu - n * sigma
    up = miu + n * sigma
    # NaN values compare False on both sides and are therefore never flagged.
    return data.index[(data < low) | (data > up)]
|
||
|
||
|
||
def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
    """
    Locate values outside the box-plot whiskers.

    The whisker limits are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where
    ``IQR = Q3 - Q1`` and Q1/Q3 are the ``q1``/``q3`` quantiles of ``data``.

    @param data: pandas Series of numeric values
    @param q1: quantile used as the lower edge of the box
    @param q3: quantile used as the upper edge of the box
    @param k: whisker length expressed as a multiple of the IQR
    @return pandas.Index holding the labels of the values strictly below the
            lower whisker limit or strictly above the upper whisker limit
    """
    # Unpack by position rather than by label: looking the quantiles up with
    # q[q1] / q[q3] returns a Series instead of a scalar when q1 == q3
    # (duplicate labels), which makes the comparisons below raise.
    lower_quartile, upper_quartile = data.quantile(q=[q1, q3]).to_numpy()
    iqr = upper_quartile - lower_quartile
    lower_whisker_limit = lower_quartile - k * iqr
    upper_whisker_limit = upper_quartile + k * iqr
    return data.index[(data < lower_whisker_limit) | (data > upper_whisker_limit)]
|
||
|
||
|
||
def regex_match(data: pd.Series, *patterns):
    """
    Locate values whose string form matches any of the given regexes.

    @param data: pandas Series; every value is converted with ``astype(str)``
                 before matching, so non-string values are matched on their
                 string representation
    @param patterns: regular expressions, combined into one alternation
                     with ``|``
    @return pandas.Index holding the labels of the matching values
    """
    combined = '|'.join(patterns)
    as_text = data.astype(str)
    hits = as_text.str.contains(combined, regex=True)
    return data.index[hits]
|
||
|
||
|
||
def empty(data: Union[pd.Series, pd.DataFrame]):
    """
    Report whether ``data`` contains at least one missing value.

    @param data: pandas Series or DataFrame to inspect
    @return True if any element is null, otherwise False
    """
    # Reduce over the underlying values: iterating a DataFrame yields its
    # column labels, so the builtin any() would test the labels' truthiness
    # instead of the null mask.
    return bool(data.isnull().to_numpy().any())
|