util/lib/analysis_package/preprocess/outlier.py

52 lines
1.3 KiB
Python
Raw Normal View History

2024-05-12 12:18:24 +00:00
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> outlierprocessing
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:24
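@Desc Outlier detection and handling helpers: MAD clipping, three-sigma rule, box-plot (IQR) rule, regex matching and null checks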
"""
from typing import Union
import pandas as pd
def MAD(data: Union[pd.Series, pd.DataFrame], n: float = 2.5, constant: float = 1.4826, axis=0):
    """
    Clip values that lie more than ``n`` scaled MADs away from the median.

    MAD = median(|Xi - median(X)|), multiplied by ``constant``.
    For a DataFrame the median and the MAD are computed per column.

    @param data     Series or DataFrame of numeric values
    @param n        half-width of the accepted band, expressed in scaled MADs
    @param constant scale factor applied to the raw MAD
    @param axis     kept for backward compatibility only; the bounds are
                    per-column values, so for a DataFrame they are always
                    aligned with the columns
    @return object of the same type as ``data`` with every value limited to
            [median - n * MAD, median + n * MAD]
    """
    center = data.median()
    scaled_mad = (data - center).abs().median() * constant
    offset = n * scaled_mad
    lower = center - offset
    upper = center + offset
    if isinstance(data, pd.DataFrame):
        # The bounds are Series labelled by column name. Aligning them along
        # the row index (axis=0) would match no labels and blank out the
        # frame, so they are always matched against the columns.
        return data.clip(lower=lower, upper=upper, axis=1)
    return data.clip(lower=lower, upper=upper)
def three_sigma(data: pd.Series, n: float = 3):
    """
    Locate values lying more than ``n`` standard deviations from the mean.

    @param data Series of numeric values
    @param n    number of standard deviations tolerated on each side of the
                mean (3 gives the classic three-sigma rule)
    @return pandas.Index holding the labels of the values outside
            [mean - n * std, mean + n * std]
    """
    center = data.mean()
    spread = n * data.std()
    # Strict comparisons: values exactly on a bound are not reported.
    outside = (data < center - spread) | (data > center + spread)
    return data.index[outside]
def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
    """
    Locate values falling outside the box-plot whiskers.

    @param data Series of numeric values
    @param q1   quantile used as the lower edge of the box
    @param q3   quantile used as the upper edge of the box
    @param k    whisker length as a multiple of the inter-quantile range
    @return pandas.Index holding the labels of the values below
            Q1 - k * IQR or above Q3 + k * IQR
    """
    quantiles = data.quantile(q=[q1, q3])
    # Read the two results by position: a label lookup returns a Series
    # instead of a scalar when q1 == q3 (duplicate labels), which then
    # breaks the comparisons below.
    lower_q = quantiles.iloc[0]
    upper_q = quantiles.iloc[1]
    iqr = upper_q - lower_q
    lower_whisker_limit = lower_q - k * iqr
    upper_whisker_limit = upper_q + k * iqr
    outside = (data < lower_whisker_limit) | (data > upper_whisker_limit)
    return data.index[outside]
def regex_match(data: pd.Series, *patterns):
    """
    Locate values whose string form matches at least one regular expression.

    @param data     Series of values; each value is converted with ``str``
                    before matching
    @param patterns regular expressions, combined into a single alternation
    @return pandas.Index holding the labels of the matching values
    """
    combined = '|'.join(patterns)
    as_text = data.astype(str)
    hits = as_text.str.contains(combined, regex=True)
    return data.index[hits]
def empty(data: Union[pd.Series, pd.DataFrame]):
    """
    Tell whether ``data`` contains at least one missing value.

    @param data Series or DataFrame to inspect
    @return True if any element is null, otherwise False
    """
    # Reduce the boolean mask itself: the built-in any() iterates a
    # DataFrame by column label, so it would test the labels' truthiness
    # instead of the cells.
    return bool(data.isnull().to_numpy().any())