134 lines
4.9 KiB
Python
134 lines
4.9 KiB
Python
|
#!/usr/bin/env python3
|
|||
|
# -*- coding:utf-8 -*-
|
|||
|
# file: data_insight
|
|||
|
# author: shenwentao, wangkanglong
|
|||
|
# description:
|
|||
|
# date: 2022-03-30 16:45
|
|||
|
# IDE: PyCharm
|
|||
|
|
|||
|
import pandas as pd
|
|||
|
import datetime
|
|||
|
from typing import List, Union
|
|||
|
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype
|
|||
|
|
|||
|
from iod_data_analysis_tool.utils.assertion import assert_range
|
|||
|
|
|||
|
|
|||
|
class DuplicateInsight:
    """Insights about duplicated rows in a data set."""

    @staticmethod
    def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
        """
        Count the rows of ``data`` that are flagged as duplicates.

        :param data: source data; must expose ``duplicated`` (presumably a pandas DataFrame)
        :param subset: selected column(s); forwarded as the ``subset`` argument of ``data.duplicated``
        :param keep: which occurrence (if any) is left unmarked; forwarded as the ``keep``
            argument of ``data.duplicated``
        :return: one-row DataFrame whose single column ``duplicate_num`` holds the count
        """
        # ``duplicated`` yields one boolean per row; summing counts the True entries.
        duplicate_mask = data.duplicated(subset, keep=keep)
        duplicate_count = duplicate_mask.sum()
        return pd.DataFrame([duplicate_count], columns=['duplicate_num'])
|
|||
|
|
|||
|
|
|||
|
class NullInsight:
    """Insights about missing (null) values in a data set."""

    @staticmethod
    def num_null(data, column: str = None) -> pd.DataFrame:
        """
        Count the null values in ``data``.

        :param data: source data
        :param column: selected column; when ``None`` every column is counted
        :return: DataFrame with a single column ``null_num``, indexed by column name
            (one row for ``column``, or one row per column of ``data``)
        """
        if column is None:
            # Column-wise sum of the null mask gives one count per column.
            nulls_per_column = data.isna().sum()
            return pd.DataFrame(nulls_per_column, columns=['null_num'])

        null_count = data[column].isna().sum()
        return pd.DataFrame([null_count], columns=['null_num'], index=[column])
|
|||
|
|
|||
|
|
|||
|
class ValidationInsight:
    """
    Validate data against user-defined ranges (e.g. to spot bad values), with one
    method per kind of data: continuous numbers, categorical values and dates.
    """

    @staticmethod
    def validation_continuous_range(data: pd.DataFrame, column: str,
                                    min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
        """
        Count how many values of a continuous numeric column fall below, above and
        within ``[min_val, max_val]``.

        Rows that are neither below ``min_val`` nor above ``max_val`` are counted as
        within range; this includes NaN, which fails both comparisons.

        :param data: source data
        :param column: selected column
        :param min_val: lower bound of the range
        :param max_val: upper bound of the range
        :return: one-row DataFrame (index ``result``) with the columns ``column``,
            ``num_lt_min``, ``num_gt_max`` and ``num_within_range``
        """
        # Bounds are checked by the project helper first (presumably rejects an
        # inverted range -- confirm against its definition).
        assert_range(min_val, max_val)

        # Boolean masks instead of a string-built ``query`` expression, so column
        # names that are not plain identifiers (spaces, keywords, ...) work too.
        values = data[column]
        below = int((values < min_val).sum())
        above = int((values > max_val).sum())
        nums = {
            'column': column,
            'num_lt_min': below,
            'num_gt_max': above,
            'num_within_range': len(values) - below - above,
        }
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
        """
        Count how many values of a categorical column are inside / outside the
        allowed set ``values``.

        :param data: source data
        :param column: selected column
        :param values: user-defined allowed values, i.e. the "range" of the column
        :return: one-row DataFrame (index ``result``) with the columns ``column``,
            ``num_within_range`` and ``num_out_range``
        """
        selected = data[column]
        within = int(selected.isin(values).sum())
        nums = {
            'column': column,
            'num_within_range': within,
            'num_out_range': len(selected) - within,
        }
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_date_range(data, column: str, start_date: datetime.date,
                              end_date: datetime.date) -> pd.DataFrame:
        """
        Count how many values of a date column fall before, after and within
        ``[start_date, end_date]``. Precondition: the column holds ``datetime.date``
        values.

        :param data: source data
        :param column: selected column
        :param start_date: first date of the range
        :param end_date: last date of the range
        :return: one-row DataFrame (index ``result``) with the columns ``column``,
            ``date_lt_start``, ``date_gt_end`` and ``date_within_range``
        """
        # Bounds are checked by the project helper first (presumably rejects an
        # inverted range -- confirm against its definition).
        assert_range(start_date, end_date)

        dates = data[column]
        before = int((dates < start_date).sum())
        after = int((dates > end_date).sum())
        nums = {
            'column': column,
            'date_lt_start': before,
            'date_gt_end': after,
            # Both out-of-range counts must be removed from the total: the rows
            # before the start AND the rows after the end.
            'date_within_range': len(dates) - before - after,
        }
        return pd.DataFrame([nums], index=['result'])
|
|||
|
|
|||
|
|
|||
|
class TypeInsight:
    """
    Lets the user check whether the data type of a column is the one they expect.
    """

    # TODO: a timestamp checker is still missing
    # Maps each supported ``check_type`` name to the pandas dtype predicate used for it.
    _checkers = {
        'int': is_integer_dtype,
        'float': is_float_dtype,
        'string': is_string_dtype,
        'bool': is_bool_dtype,
        'datetime': is_datetime64_dtype,
    }

    @staticmethod
    def type_check(data, column: str, check_type: str) -> pd.DataFrame:
        """
        Check whether a column has the data type the user requires.

        :param data: source data
        :param column: selected column
        :param check_type: data type to test for, one of
            {'int', 'float', 'string', 'bool', 'datetime'}; any other value raises ``KeyError``
        :return: one-row DataFrame indexed by ``column`` whose ``result`` column is
            ``True`` when the column matches ``check_type`` and ``False`` otherwise
        """
        dtype_predicate = TypeInsight._checkers[check_type]
        matches = True if dtype_predicate(data[column]) else False
        return pd.DataFrame([matches], columns=['result'], index=[column])
|