util/lib/analysis_package/preprocess/data_insight.py

134 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# file: data_insight
# author: shenwentao, wangkanglong
# description: data-insight helpers - duplicate/null counting, value-range validation and dtype checks
# date: 2022-03-30 16:45
# IDE: PyCharm
import pandas as pd
import datetime
from typing import List, Union
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype
from iod_data_analysis_tool.utils.assertion import assert_range
class DuplicateInsight:
    """Helpers that report how many duplicated rows a dataset contains."""

    @staticmethod
    def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
        """
        Count the rows of ``data`` that are flagged as duplicates.

        :param data: source data; must provide a ``duplicated`` method
        :param subset: columns to compare, forwarded as the ``subset`` argument of ``duplicated``
        :param keep: which occurrences to leave unflagged, forwarded as the ``keep`` argument
                     of ``duplicated``
        :return: one-row DataFrame whose single column ``duplicate_num`` holds the count
        """
        # Boolean mask: True for every row considered a duplicate.
        duplicate_mask = data.duplicated(subset, keep=keep)
        return pd.DataFrame({'duplicate_num': [duplicate_mask.sum()]})
class NullInsight:
    """Helpers that report how many missing values a dataset contains."""

    @staticmethod
    def num_null(data, column: str = None) -> pd.DataFrame:
        """
        Count missing values, either for every column or for one chosen column.

        :param data: source data
        :param column: column to inspect; when ``None`` every column is counted
        :return: DataFrame with a single ``null_num`` column, indexed by column name
        """
        if column is None:
            # Per-column counts; the resulting Series is indexed by column name.
            per_column_counts = data.isna().sum()
            return pd.DataFrame(per_column_counts, columns=['null_num'])

        missing_count = data[column].isna().sum()
        return pd.DataFrame([missing_count], columns=['null_num'], index=[column])
class ValidationInsight:
    """
    User-defined validity checks: count how many values of a column fall inside or outside
    a range supplied by the caller (numeric bounds, a set of allowed values, or a date span).
    """

    @staticmethod
    def validation_continuous_range(data: pd.DataFrame, column: str,
                                    min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
        """
        Count the values of a continuous numeric column that lie below, above and within a range.

        Values that compare false against both bounds (for example NaN) are reported in
        ``num_within_range`` because that count is derived by subtraction.

        :param data: source data
        :param column: column to inspect
        :param min_val: lower bound of the range
        :param max_val: upper bound of the range
        :return: one-row DataFrame (index ``result``) with the columns ``column``,
                 ``num_lt_min``, ``num_gt_max`` and ``num_within_range``
        """
        # NOTE(review): assert_range is a project helper; presumably it rejects min_val > max_val.
        assert_range(min_val, max_val)
        series = data[column]
        # Boolean masks instead of a string-built DataFrame.query expression: they also work
        # for column names that are not valid identifiers and for bounds such as inf.
        below = int((series < min_val).sum())
        above = int((series > max_val).sum())
        nums = {
            'column': column,
            'num_lt_min': below,
            'num_gt_max': above,
            'num_within_range': data.shape[0] - below - above,
        }
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
        """
        Count the values of a categorical column that belong / do not belong to an allowed set.

        :param data: source data
        :param column: column to inspect
        :param values: the allowed discrete values, i.e. the "range" of the column
        :return: one-row DataFrame (index ``result``) with the columns ``column``,
                 ``num_within_range`` and ``num_out_range``
        """
        within = int(data[column].isin(values).sum())
        nums = {
            'column': column,
            'num_within_range': within,
            'num_out_range': len(data[column]) - within,
        }
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_date_range(data, column: str, start_date: datetime.date,
                              end_date: datetime.date) -> pd.DataFrame:
        """
        Count the values of a date column that lie before, after and within a date span.

        Precondition stated by the original authors: the column holds ``datetime.date`` values.

        :param data: source data
        :param column: column to inspect
        :param start_date: first date of the span
        :param end_date: last date of the span
        :return: one-row DataFrame (index ``result``) with the columns ``column``,
                 ``date_lt_start``, ``date_gt_end`` and ``date_within_range``
        """
        # NOTE(review): assert_range is a project helper; presumably it rejects start_date > end_date.
        assert_range(start_date, end_date)
        dates = data[column]
        before_start = int((dates < start_date).sum())
        after_end = int((dates > end_date).sum())
        nums = {
            'column': column,
            'date_lt_start': before_start,
            'date_gt_end': after_end,
            # Subtract BOTH out-of-range counts (the earlier code subtracted date_lt_start twice).
            'date_within_range': data.shape[0] - before_start - after_end,
        }
        return pd.DataFrame([nums], index=['result'])
class TypeInsight:
    """
    Lets the caller check whether a column's data type is the one they expect.
    """
    # TODO: a timestamp checker is still missing
    # Maps the user-facing type name to the pandas dtype predicate used for the check.
    _checkers = {
        'int': is_integer_dtype,
        'float': is_float_dtype,
        'string': is_string_dtype,
        'bool': is_bool_dtype,
        'datetime': is_datetime64_dtype
    }

    @staticmethod
    def type_check(data, column: str, check_type: str) -> pd.DataFrame:
        """
        Report whether a column has the requested data type.

        :param data: source data
        :param column: column to inspect
        :param check_type: type name to test for, one of
                           {'int', 'float', 'string', 'bool', 'datetime'}
        :return: one-row DataFrame indexed by ``column`` whose ``result`` column is True when
                 the dtype predicate matches and False otherwise
        """
        # A name outside _checkers raises KeyError from this lookup.
        dtype_predicate = TypeInsight._checkers[check_type]
        matches = bool(dtype_predicate(data[column]))
        return pd.DataFrame([matches], columns=['result'], index=[column])