134 lines
4.9 KiB
Python
134 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding:utf-8 -*-
|
||
# file: data_insight
|
||
# author: shenwentao, wangkanglong
|
||
# description:
|
||
# date: 2022-03-30 16:45
|
||
# IDE: PyCharm
|
||
|
||
import pandas as pd
|
||
import datetime
|
||
from typing import List, Union
|
||
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype
|
||
|
||
from iod_data_analysis_tool.utils.assertion import assert_range
|
||
|
||
|
||
class DuplicateInsight:
    """Insights about duplicated rows in a data set."""

    @staticmethod
    def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
        """
        Count the rows of ``data`` that are flagged as duplicates.

        :param data: source data
        :param subset: column label(s) to consider when identifying duplicates;
            forwarded to ``duplicated`` as its ``subset`` argument
        :param keep: which occurrence (if any) is left unmarked;
            forwarded to ``duplicated`` as its ``keep`` argument
        :return: single-row DataFrame with one column, ``duplicate_num``
        """
        # Boolean mask: True for every row that ``duplicated`` marks.
        duplicate_mask = data.duplicated(subset=subset, keep=keep)
        return pd.DataFrame({'duplicate_num': [duplicate_mask.sum()]})
|
||
|
||
|
||
class NullInsight:
    """Insights about missing values in a data set."""

    @staticmethod
    def num_null(data, column: str = None) -> pd.DataFrame:
        """
        Count the missing values in ``data``.

        :param data: source data
        :param column: column to inspect; when omitted, every column is counted
        :return: DataFrame with a single ``null_num`` column, indexed by column
            name (one row for ``column``, or one row per column of ``data``)
        """
        if column is None:
            # Per-column counts; the Series index (column names) becomes the row index.
            return pd.DataFrame(data.isna().sum(), columns=['null_num'])

        missing_count = data[column].isna().sum()
        return pd.DataFrame([missing_count], columns=['null_num'], index=[column])
|
||
|
||
|
||
class ValidationInsight:
    """
    Validate data against user-defined ranges (e.g. to spot bad values),
    with one method per kind of data: continuous, categorical and date.
    """

    @staticmethod
    def _range_counts(values, lower, upper):
        """
        Count how many entries of ``values`` fall below, above and inside
        the closed range ``[lower, upper]``.

        :param values: column of values to inspect (a pandas Series)
        :param lower: lower bound of the range
        :param upper: upper bound of the range
        :return: tuple ``(below, above, remainder)`` of plain ints, where
            ``remainder`` is ``len(values) - below - above``
        """
        below = int((values < lower).sum())
        above = int((values > upper).sum())
        # Entries that compare False on both sides (including missing values)
        # end up in the remainder, matching the historical counting rule.
        return below, above, len(values) - below - above

    @staticmethod
    def validation_continuous_range(data: pd.DataFrame, column: str,
                                    min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
        """
        Count how many values of a continuous numeric column lie below,
        above and within ``[min_val, max_val]``.

        :param data: source data
        :param column: column to inspect
        :param min_val: lower bound of the range
        :param max_val: upper bound of the range
        :return: single-row DataFrame (index ``result``) with the columns
            ``column``, ``num_lt_min``, ``num_gt_max`` and ``num_within_range``
        """
        # Project helper; presumably rejects min_val > max_val -- confirm against its definition.
        assert_range(min_val, max_val)
        # Boolean masks instead of DataFrame.query strings: works for any column
        # label (spaces, keywords, non-identifiers) and never evaluates caller
        # supplied text as an expression.
        below, above, within = ValidationInsight._range_counts(data[column], min_val, max_val)
        nums = {
            'column': column,
            'num_lt_min': below,
            'num_gt_max': above,
            'num_within_range': within,
        }
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
        """
        Count how many values of a categorical column belong to the set of
        allowed ``values`` and how many do not.

        :param data: source data
        :param column: column to inspect
        :param values: allowed discrete values, i.e. the "range" of the column
        :return: single-row DataFrame (index ``result``) with the columns
            ``column``, ``num_within_range`` and ``num_out_range``
        """
        column_values = data[column]
        within = int(column_values.isin(values).sum())
        nums = {
            'column': column,
            'num_within_range': within,
            'num_out_range': len(column_values) - within,
        }
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_date_range(data, column: str, start_date: datetime.date,
                              end_date: datetime.date) -> pd.DataFrame:
        """
        Count how many values of a date column lie before, after and within
        ``[start_date, end_date]``. Precondition: the column holds
        ``datetime.date`` values.

        :param data: source data
        :param column: column to inspect
        :param start_date: first date of the range
        :param end_date: last date of the range
        :return: single-row DataFrame (index ``result``) with the columns
            ``column``, ``date_lt_start``, ``date_gt_end`` and ``date_within_range``
        """
        # Project helper; presumably rejects start_date > end_date -- confirm against its definition.
        assert_range(start_date, end_date)
        # The within-range count must exclude BOTH tails; the previous code
        # subtracted the "before start" count twice and ignored "after end".
        before, after, within = ValidationInsight._range_counts(data[column], start_date, end_date)
        nums = {
            'column': column,
            'date_lt_start': before,
            'date_gt_end': after,
            'date_within_range': within,
        }
        return pd.DataFrame([nums], index=['result'])
|
||
|
||
|
||
class TypeInsight:
    """
    Let users check whether a column's data type is the one they expect.
    """

    # TODO: a timestamp checker is still missing
    # Maps each supported ``check_type`` name to the pandas dtype predicate used for it.
    _checkers = {
        'int': is_integer_dtype,
        'float': is_float_dtype,
        'string': is_string_dtype,
        'bool': is_bool_dtype,
        'datetime': is_datetime64_dtype
    }

    @staticmethod
    def type_check(data, column: str, check_type: str) -> pd.DataFrame:
        """
        Check whether ``data[column]`` has the requested data type.

        :param data: source data
        :param column: column to inspect
        :param check_type: type to test for, one of
            {'int', 'float', 'string', 'bool', 'datetime'}
        :return: single-cell DataFrame (column ``result``, indexed by ``column``)
            holding True when the dtype predicate accepts the column, else False
        :raises KeyError: if ``check_type`` is not a supported type name
        """
        try:
            checker = TypeInsight._checkers[check_type]
        except KeyError:
            # Same exception type as before, but with a message that cannot be
            # mistaken for a missing column label.
            supported = ', '.join(sorted(TypeInsight._checkers))
            raise KeyError(
                f'unsupported check_type {check_type!r}; supported types: {supported}'
            ) from None

        matches = bool(checker(data[column]))
        return pd.DataFrame([matches], columns=['result'], index=[column])
|