util/lib/analysis_package/preprocess/outlier.py

52 lines
1.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> outlierprocessing
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:24
@Desc
"""
from typing import Union
import pandas as pd
def MAD(data: Union[pd.Series, pd.DataFrame], n: float = 2.5, constant=1.4826, axis=0):
    """
    Clip values that lie more than ``n`` scaled MADs away from the median.

    MAD = constant * median(|Xi - median(X)|)

    @param data      Series or DataFrame to clip. For a DataFrame the median
                     and the MAD are computed separately for every column.
    @param n         half-width of the accepted band, in units of scaled MAD
    @param constant  factor the raw median absolute deviation is multiplied by
    @param axis      retained so existing calls keep working. The bounds are
                     per-column statistics, so for a DataFrame they are always
                     aligned with the columns regardless of this value.
    @return the clipped data, same type and shape as ``data``
    """
    center = data.median()
    scaled_mad = (data - center).abs().median() * constant
    offset = n * scaled_mad
    lower = center - offset
    upper = center + offset
    if isinstance(data, pd.DataFrame):
        # ``lower``/``upper`` are Series labelled by column name. Aligning them
        # along the row index (the former default, axis=0) matches no labels,
        # so the bounds must be aligned along the columns.
        return data.clip(lower=lower, upper=upper, axis=1)
    return data.clip(lower=lower, upper=upper)
def three_sigma(data: pd.Series):
    """
    Locate values outside the band mean +/- 3 * standard deviation.

    @param data  values to inspect
    @return pandas.Index holding the labels of the values that are strictly
            below ``mean - 3 * std`` or strictly above ``mean + 3 * std``
    """
    center = data.mean()
    spread = data.std()
    lower_bound = center - 3 * spread
    upper_bound = center + 3 * spread
    too_low = data.lt(lower_bound)
    too_high = data.gt(upper_bound)
    return data.index[too_low | too_high]
def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
    """
    Locate values outside the box-plot whiskers.

    The whiskers sit ``k`` times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile.

    @param data  values to inspect
    @param q1    lower quantile level
    @param q3    upper quantile level
    @param k     multiplier applied to the inter-quantile range
    @return pandas.Index holding the labels of the values strictly below the
            lower whisker or strictly above the upper whisker
    """
    quantiles = data.quantile(q=[q1, q3])
    lower_quantile = quantiles[q1]
    upper_quantile = quantiles[q3]
    spread = upper_quantile - lower_quantile
    whisker_low = lower_quantile - k * spread
    whisker_high = upper_quantile + k * spread
    below = data < whisker_low
    above = data > whisker_high
    return data.index[below | above]
def regex_match(data: pd.Series, *patterns):
    """
    Locate the entries whose string form matches at least one pattern.

    Every value is converted with ``astype(str)`` before matching, so
    non-string values are tested against their string representation.

    @param data      values to test
    @param patterns  regular expressions; an entry is selected when any one of
                     them is found anywhere in its string form
    @return pandas.Index holding the labels of the matching entries; an empty
            index when no pattern is supplied
    """
    if not patterns:
        # Joining zero patterns yields the empty regex, which matches every
        # string and would report every row as a hit.
        return data.index[:0]
    combined = '|'.join(patterns)
    hits = data.astype(str).str.contains(combined, regex=True)
    return data.index[hits]
def empty(data: Union[pd.Series, pd.DataFrame]):
    """
    Tell whether ``data`` contains at least one missing value.

    @param data  Series or DataFrame to inspect
    @return True when any element is null, otherwise False
    """
    # Reduce over the boolean values themselves: iterating a DataFrame yields
    # its column labels, so the builtin any() would test the labels instead.
    return bool(data.isnull().values.any())