52 lines
1.3 KiB
Python
52 lines
1.3 KiB
Python
|
# -*- coding: UTF-8 -*-
|
|||
|
"""
@Project -> File :IoD_data_analysis_tool -> outlierprocessing
@IDE :PyCharm
@Author :rengengchen
@Date :2022/4/26 10:24
@Desc :Outlier helpers for pandas data: MAD clipping, three-sigma and
       box-plot (IQR) detection, regex matching and a null-value check.
"""
|
|||
|
from typing import Union
|
|||
|
|
|||
|
import pandas as pd
|
|||
|
|
|||
|
|
|||
|
def MAD(data: Union[pd.Series, pd.DataFrame], n: float = 2.5, constant=1.4826, axis=0):
    """
    Clip values lying more than ``n`` scaled MADs away from the median.

    MAD = constant * median(|Xi - median(X)|)

    For a DataFrame the median and MAD are computed per column and every
    column is clipped to its own ``median +/- n * MAD`` interval.

    Note: when the MAD is 0 (more than half of the values are identical)
    the interval collapses and every value is clipped to the median.

    @param data: Series or DataFrame of numeric values
    @param n: number of scaled MADs tolerated on each side of the median
    @param constant: scale factor applied to the raw MAD (1.4826 is the
        usual consistency factor for normally distributed data)
    @param axis: kept for backward compatibility and ignored; the bounds
        are per-column, so they are always aligned along the columns
    @return: object of the same type and shape as ``data`` with the
        out-of-range values replaced by the nearest bound
    """
    center = data.median()
    scaled_mad = (data - center).abs().median() * constant
    offset = n * scaled_mad
    lower = center - offset
    upper = center + offset
    if isinstance(data, pd.DataFrame):
        # ``lower``/``upper`` are Series indexed by column label, so they
        # must be aligned along the columns (axis=1).  Aligning them
        # along the rows (the former default axis=0) matches them against
        # the row index, which fails or misaligns.
        return data.clip(lower=lower, upper=upper, axis=1)
    return data.clip(lower=lower, upper=upper)
|
|||
|
|
|||
|
|
|||
|
def three_sigma(data: pd.Series):
    """
    Locate values lying more than three standard deviations from the mean.

    @param data: Series of numeric values
    @return: pandas.Index holding the labels of the values strictly below
        ``mean - 3 * std`` or strictly above ``mean + 3 * std``
    """
    center = data.mean()
    spread = data.std()
    floor = center - 3 * spread
    ceiling = center + 3 * spread
    below = data.lt(floor)
    above = data.gt(ceiling)
    return data.index[below | above]
|
|||
|
|
|||
|
|
|||
|
def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
    """
    Locate values falling outside the box-plot whiskers.

    @param data: Series of numeric values
    @param q1: quantile used as the lower edge of the box
    @param q3: quantile used as the upper edge of the box
    @param k: whisker length expressed as a multiple of ``Q3 - Q1``
    @return: pandas.Index holding the labels of the values strictly below
        ``Q1 - k * IQR`` or strictly above ``Q3 + k * IQR``
    """
    quantiles = data.quantile(q=[q1, q3])
    box_low = quantiles[q1]
    box_high = quantiles[q3]
    spread = box_high - box_low
    lower_fence = box_low - k * spread
    upper_fence = box_high + k * spread
    is_low = data.lt(lower_fence)
    is_high = data.gt(upper_fence)
    return data.index[is_low | is_high]
|
|||
|
|
|||
|
|
|||
|
def regex_match(data: pd.Series, *patterns):
    """
    Locate entries whose string form matches at least one regex pattern.

    @param data: Series of values; each one is converted with ``str``
        before matching
    @param patterns: regular expressions, combined into one alternation
    @return: pandas.Index holding the labels of the matching entries
    """
    combined = '|'.join(patterns)
    as_text = data.astype(str)
    hits = as_text.str.contains(combined, regex=True)
    return data.index[hits]
|
|||
|
|
|||
|
|
|||
|
def empty(data: Union[pd.Series, pd.DataFrame]):
    """
    Tell whether ``data`` contains at least one missing value.

    @param data: Series or DataFrame to inspect
    @return: True if any element is null (NaN/None/NaT), otherwise False
    """
    # The builtin ``any`` iterates a DataFrame by column label, so it would
    # report the truthiness of the labels instead of the null mask.  Reduce
    # the underlying boolean array so both Series and DataFrame are handled.
    return bool(data.isnull().to_numpy().any())
|