commit 707997d4e1
init

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@@ -0,0 +1,12 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N802" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/utils.iml" filepath="$PROJECT_DIR$/.idea/utils.iml" />
    </modules>
  </component>
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

@@ -0,0 +1,5 @@
Module responsibilities:
1. continuous: feature analysis for numerical data
2. categorical: feature analysis for discrete (categorical) data
3. timeseries: analysis methods for time-series data
4. pre-process: parse the configuration file and apply preprocessing (such as filling null values or sampling) before the data moves on to the next step

@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :__init__.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/3 17:07
"""

@@ -0,0 +1,30 @@
Categorical module:
Feature analysis for discrete (categorical) data.

Analysis methods:

1> Descriptive statistics:
- the set of categories contained in the column
- the number of categories
- frequency table
- contingency table

2> Chi-square test of independence

3> Information entropy

4> Mutual information

Features:

Iterates over multiple categorical columns and analyses each in turn.

Runtime environment:
Python 3.7.10 or later
- numpy
- pandas
- matplotlib
- sklearn
- scipy.stats

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/7/4 16:34
@Desc   :
"""

@@ -0,0 +1,180 @@
# -*- coding: utf-8 -*-
# @Time    : 2022/3/17 17:36
# @Author  : Leng Yang
# @FileName: categorical_process.py
# @Software: PyCharm


import pandas as pd
import numpy as np
from sklearn import metrics
from scipy.stats import chi2_contingency, chi2


def test():
    pass


class CategorySelfDescribe(object):
    """
    Descriptive statistics for a single categorical column.
    """

    def __init__(self):
        pass

    @staticmethod
    def category_describe(data: pd.Series) -> pd.DataFrame:
        """
        Describe the category names contained in the column and the number of distinct categories.

        :param data: input data as a pd.Series
        :return: pd.DataFrame holding the list of category names and the number of categories

        Examples
        --------
        >>> data1 = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨'], '温度': ['高', '高', '高', '低']})
        >>> CategorySelfDescribe().category_describe(data1['天气'])
          categories  types
        0  [晴, 阴, 雨]    3.0
        """
        results = pd.DataFrame()
        results = results.append({'categories': data.unique(), 'types': len(data.unique())}, ignore_index=True)
        return results

    @staticmethod
    def category_frequency(data: pd.Series) -> pd.DataFrame:
        """
        Frequency table.

        :param data: input data as a pd.Series
        :return: pd.DataFrame, the frequency table

        Examples
        --------
        >>> data1 = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨', '雨', '雨', '阴', '晴', '晴', '雨', '晴', '阴', '阴', '雨'],
        ...                       '温度': ['高', '高', '高', '低', '低', '低', '低', '低', '低', '低', '低', '低', '高', '低']})
        >>> CategorySelfDescribe().category_frequency(data1['天气'])
          unique_values  count  frequency
        0             晴      5   0.357143
        1             雨      5   0.357143
        2             阴      4   0.285714
        """
        df_freq = data.value_counts(ascending=False).rename_axis('unique_values').reset_index(name='count')
        df_freq['frequency'] = df_freq['count'] / len(data)
        return df_freq


class CategorySelfAnalyse(object):
    """
    Statistical analysis of a single categorical column.
    """

    def __init__(self):
        pass

    @staticmethod
    def entropy(data: pd.Series) -> float:
        """
        Compute information entropy.

        :param data: input data as a pd.Series
        :return: float, the information entropy
        """
        prob = pd.value_counts(data) / len(data)
        return sum(np.log2(prob) * prob * (-1))


class CategoryMutualDescribe(object):
    """
    Descriptive statistics for two different categorical columns.
    """

    def __init__(self):
        pass

    @staticmethod
    def crosstab(row_data: pd.Series, col_data: pd.Series) -> pd.DataFrame:
        """
        Contingency-table analysis of two different categorical columns.

        :param row_data: first categorical column; its categories become the table rows
        :param col_data: second categorical column; its categories become the table columns
        :return: pd.DataFrame, the contingency table

        Examples
        --------
        >>> data1 = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨'], '温度': ['高', '高', '高', '低']})
        >>> CategoryMutualDescribe().crosstab(data1['天气'], data1['温度'])
        温度  高  低
        天气
        晴   2  0
        阴   1  0
        雨   0  1
        """
        return pd.crosstab(row_data, col_data)


class MutualCategoricalAnalyse(object):
    """
    Statistical analysis of two categorical columns.
    """

    def __init__(self):
        pass

    @staticmethod
    def info_gain(df: pd.DataFrame, attr_col: str, data_col: str) -> float:
        """
        Compute information gain: Gain(D, A) = Ent(D) - Ent(D|A),
        i.e. the entropy reduction from splitting dataset D on attribute A.

        :param df: input data as a DataFrame
        :param attr_col: column name of the attribute A
        :param data_col: column name of the dataset D
        :return: float, the information gain
        """
        # e1: conditional entropy of D given each value of A
        e1 = df.groupby(attr_col).apply(lambda x: CategorySelfAnalyse.entropy(x[data_col]))
        p1 = pd.value_counts(df[attr_col]) / len(df[attr_col])  # p(x)
        e2 = sum(e1 * p1)  # Ent(D|A)
        return CategorySelfAnalyse.entropy(df[data_col]) - e2

    @staticmethod
    def normalized_mutual_information(data1: pd.Series, data2: pd.Series) -> float:
        """
        Mutual Information between two clusterings. The Mutual Information is a measure of the similarity
        between two labels of the same data.
        Normalized Mutual Information (NMI) is a normalization of the Mutual
        Information (MI) score to scale the results between 0 (no mutual
        information) and 1 (perfect correlation).

        :param data1: first categorical column
        :param data2: second categorical column
        :return: nmi : float, score between 0.0 and 1.0; 1.0 stands for a perfectly complete labeling
        """
        return metrics.normalized_mutual_info_score(data1, data2)

    @staticmethod
    def chi2_independence(data1: pd.Series, data2: pd.Series, alpha=0.05) -> pd.DataFrame:
        """
        Chi-square test of independence.

        :param alpha: significance level used to determine the critical value
        :param data1: first categorical column
        :param data2: second categorical column
        :return: pd.DataFrame with the following columns:
            g: the chi-square statistic
            p: the p-value; if p is smaller than alpha, the null hypothesis is rejected
            dof: degrees of freedom
            re: decision flag, 1 means the null hypothesis is rejected, 0 means it is accepted
            expctd: array of expected frequencies, same shape as the input table
        """
        data = CategoryMutualDescribe.crosstab(data1, data2)
        result = pd.DataFrame(columns=['g', 'p', 'dof', 'expctd'])
        g, p, dof, expctd = chi2_contingency(data)
        result = result.append({'g': g, 'p': p, 'dof': dof, 'expctd': expctd}, ignore_index=True)
        if dof == 0:
            raise ValueError('The degrees of freedom should be at least 1')
        elif dof == 1:
            cv = chi2.isf(alpha * 0.5, dof)  # critical value
        else:
            cv = chi2.isf(alpha * 0.5, dof - 1)

        if g > cv:
            result.loc[0, 're'] = 1  # reject the null hypothesis
        else:
            result.loc[0, 're'] = 0  # accept the null hypothesis
        return result
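
A minimal usage sketch for the classes above (hypothetical toy data mirroring the doctests; note that `DataFrame.append`, used inside, requires pandas < 2.0):

# Hypothetical demo of the categorical module.
import pandas as pd
from categorical_process import (CategorySelfDescribe, CategorySelfAnalyse,
                                 MutualCategoricalAnalyse)

df = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨', '雨', '晴'],
                   '温度': ['高', '高', '高', '低', '低', '高']})

print(CategorySelfDescribe.category_frequency(df['天气']))    # frequency table
print(CategorySelfAnalyse.entropy(df['天气']))                # entropy of one column
print(MutualCategoricalAnalyse.info_gain(df, '天气', '温度'))   # Gain(D=温度, A=天气)
print(MutualCategoricalAnalyse.chi2_independence(df['天气'], df['温度']))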

@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :__init__.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/5 11:52
"""

@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :__init__.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/5 11:52
"""

@@ -0,0 +1,127 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :producer_consumer.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/5 11:53
"""
import multiprocessing
from typing import Iterable, Callable

from tqdm import tqdm


class Stop:
    pass


class AbstractPCConcurrencySystem:
    """
    @todo keep track of the started processes
    @todo manage the number of processes
    """

    def __init__(self, num_producer: int = 1, num_consumer: int = 1, num_callback: int = 0,
                 len_task_queue: int = 0, len_result_queue: int = 0, len_callback_queue: int = 0,
                 producer_lock=None, consumer_lock=None, callback_lock=None,
                 meta=None, enable_progressbar=False, num_total_result=None):
        self.task_queue = multiprocessing.Queue(len_task_queue)

        self.num_producer = num_producer
        self.num_consumer = num_consumer
        self.num_callback = num_callback
        self.producer_lock = producer_lock or multiprocessing.Lock()
        self.consumer_lock = consumer_lock or multiprocessing.Lock()
        self.meta = meta
        self.enable_progressbar = enable_progressbar
        if enable_progressbar and self.num_callback == 0:
            self.num_callback = 1
        self.result_queue = multiprocessing.Queue(len_result_queue)
        if self.num_callback:
            self.callback_lock = callback_lock or multiprocessing.Lock()
            self.num_total_result = num_total_result
            self.callback_queue = multiprocessing.Queue(len_callback_queue)

    def get_result(self):
        return self.callback_queue.get()

    def produce(self):
        """
        Must return an iterable object or a Stop object.
        """
        raise NotImplementedError

    def consume(self, consumer_params):
        """
        @return: task result or Stop()
        """
        raise NotImplementedError

    def callback(self, result):
        return result

    def _produce(self):
        producer = self.produce()
        if isinstance(producer, Iterable):
            for params in producer:
                self.task_queue.put(params, block=True)
            stop = Stop()
            for _ in range(self.num_consumer):
                self.task_queue.put(stop, block=True)
        elif isinstance(producer, Callable):
            while True:
                task = producer()
                if isinstance(task, Stop):
                    break
                self.task_queue.put(task, block=True)

    def _consume(self):
        consumer_params = self.task_queue.get(block=True)
        while not isinstance(consumer_params, Stop):
            info = self.consume(consumer_params)
            self.result_queue.put(info)
            consumer_params = self.task_queue.get(block=True)
        self.result_queue.put(Stop())

    def _callback(self):
        if self.enable_progressbar:
            bar = tqdm(total=self.num_total_result)
        over_flag = 0
        while over_flag < self.num_consumer:
            result = self.result_queue.get(block=True)
            if isinstance(result, Stop):
                over_flag += 1
            else:
                callback = self.callback(result)
                self.callback_queue.put(callback)
                if self.enable_progressbar:
                    bar.update(1)
        else:
            if self.enable_progressbar:
                bar.close()

    def run(self):
        consumers = []
        callbackers = []
        # create and start the producers
        for i in range(self.num_producer):
            multiprocessing.Process(target=self._produce, name=f'producer_{i}').start()
        # create and start the consumers
        for i in range(self.num_consumer):
            p = multiprocessing.Process(target=self._consume, name=f'consumer_{i}')
            consumers.append(p)
            p.start()
        # handle the results
        if self.num_callback:
            for i in range(self.num_callback):
                p = multiprocessing.Process(target=self._callback, name=f'callback_{i}')
                callbackers.append(p)
                p.start()
        return self

    def close(self):
        self.task_queue.close()
        self.result_queue.close()
        self.callback_queue.close()
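
A minimal subclass sketch showing the intended produce/consume contract (hypothetical task: squaring numbers; the class must live at module level so worker processes can import it):

# Hypothetical demo of AbstractPCConcurrencySystem.
class SquareSystem(AbstractPCConcurrencySystem):

    def produce(self):
        # An iterable of task parameters; the base class appends one Stop() per consumer.
        return range(10)

    def consume(self, consumer_params):
        return consumer_params ** 2


if __name__ == '__main__':
    system = SquareSystem(num_consumer=2, num_callback=1).run()
    results = sorted(system.get_result() for _ in range(10))
    print(results)  # [0, 1, 4, 9, ..., 81]
    system.close()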

@@ -0,0 +1,28 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :distribute_task.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/8 16:55
"""
import math
import multiprocessing


def equally_distributing_task(target, tasks, *args, results=None, num_processors=8):
    len_tasks = len(tasks)
    process_offset = math.ceil(len_tasks / num_processors)
    for i in range(num_processors):
        sub_tasks = tasks[i * process_offset: (i + 1) * process_offset]
        if sub_tasks:
            # check against None, so that an empty shared list still gets passed through
            if results is not None:
                multiprocessing.Process(target=target,
                                        args=(sub_tasks, results, *args)).start()
            else:
                multiprocessing.Process(target=target,
                                        args=(sub_tasks, *args)).start()
        else:
            break
    return results
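
A usage sketch (hypothetical worker function; a `multiprocessing.Manager().list()` is one way to collect results across processes):

# Hypothetical usage of equally_distributing_task: sum chunks in parallel.
import multiprocessing
import time

def worker(sub_tasks, results):
    results.append(sum(sub_tasks))

if __name__ == '__main__':
    with multiprocessing.Manager() as manager:
        shared = manager.list()
        equally_distributing_task(worker, list(range(100)), results=shared, num_processors=4)
        time.sleep(1)  # the helper starts processes but does not join them
        print(sum(shared))  # 4950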

@@ -0,0 +1,48 @@
CrimeRate,Youth,Southern,Education,ExpenditureYear0,LabourForce,Males,MoreMales,StateSize,YouthUnemployment,MatureUnemployment,HighYouthUnemploy,Wage,BelowWage,CrimeRate10,Youth10,Education10,ExpenditureYear10,LabourForce10,Males10,MoreMales10,StateSize10,YouthUnemploy10,MatureUnemploy10,HighYouthUnemploy10,Wage10,BelowWage10
45.5,135,0,12.4,69,540,965,0,6,80,22,1,564,139,26.5,135,12.5,71,564,974,0,6,82,20,1,632,142
52.3,140,0,10.9,55,535,1045,1,6,135,40,1,453,200,35.9,135,10.9,54,540,1039,1,7,138,39,1,521,210
56.6,157,1,11.2,47,512,962,0,22,97,34,0,288,276,37.1,153,11,44,529,959,0,24,98,33,0,359,256
60.3,139,1,11.9,46,480,968,0,19,135,53,0,457,249,42.7,139,11.8,41,497,983,0,20,131,50,0,510,235
64.2,126,0,12.2,106,599,989,0,40,78,25,1,593,171,46.7,125,12.2,97,602,989,0,42,79,24,1,660,162
67.6,128,0,13.5,67,624,972,0,28,77,25,1,507,206,47.9,128,13.8,60,621,983,0,28,81,24,1,571,199
70.5,130,0,14.1,63,641,984,0,14,70,21,1,486,196,50.6,153,14.1,57,641,993,0,14,71,23,1,556,176
73.2,143,0,12.9,66,537,977,0,10,114,35,1,487,166,55.9,143,13,63,549,973,0,11,119,36,1,561,168
75,141,0,12.9,56,523,968,0,4,107,37,0,489,170,61.8,153,12.9,54,538,968,0,5,110,36,1,550,126
78.1,133,0,11.4,51,599,1024,1,7,99,27,1,425,225,65.4,134,11.2,47,600,1024,1,7,97,28,1,499,215
79.8,142,1,12.9,45,533,969,0,18,94,33,0,318,250,71.4,142,13.1,44,552,969,0,19,93,36,0,378,247
82.3,123,0,12.5,97,526,948,0,113,124,50,0,572,158,75.4,134,12.4,87,529,949,0,117,125,49,0,639,146
83.1,135,0,13.6,62,595,986,0,22,77,27,0,529,190,77.3,137,13.7,61,599,993,0,23,80,28,0,591,189
84.9,121,0,13.2,118,547,964,0,25,84,29,0,689,126,78.6,132,13.3,115,538,968,0,25,82,30,0,742,127
85.6,166,1,11.4,58,521,973,0,46,72,26,0,396,237,80.6,153,11.2,54,543,983,0,47,76,25,1,568,246
88,140,0,12.9,71,632,1029,1,7,100,24,1,526,174,82.2,130,12.9,68,620,1024,1,8,104,25,1,570,182
92.3,126,0,12.7,74,602,984,0,34,102,33,1,557,195,87.5,134,12.9,67,599,982,0,33,107,34,1,621,199
94.3,130,0,13.3,128,536,934,0,51,78,34,0,627,135,92.9,127,13.3,128,530,949,0,52,79,33,0,692,140
95.3,125,0,12,90,586,964,0,97,105,43,0,617,163,94.1,134,11.9,81,571,971,0,99,106,41,0,679,162
96.8,151,1,10,58,510,950,0,33,108,41,0,394,261,96.2,161,10.1,56,515,1001,1,32,110,40,0,465,254
97.4,152,1,10.8,57,530,986,0,30,92,43,0,405,264,97.8,152,11,53,541,989,0,30,92,41,0,470,243
98.7,162,1,12.1,75,522,996,0,40,73,27,0,496,224,99.9,162,12,70,533,992,0,41,80,28,0,562,229
99.9,149,1,10.7,61,515,953,0,36,86,35,0,395,251,101.4,150,10.7,54,520,952,0,35,84,32,0,476,249
103,177,1,11,58,638,974,0,24,76,28,0,382,254,103.5,164,10.9,56,638,978,0,25,79,28,0,456,257
104.3,134,0,12.5,75,595,972,0,47,83,31,0,580,172,104.5,133,12.7,71,599,982,0,50,87,32,0,649,182
105.9,130,0,13.4,90,623,1049,1,3,113,40,0,588,160,106.4,153,13.4,91,622,1050,1,3,119,41,0,649,159
106.6,157,1,11.1,65,553,955,0,39,81,28,0,421,239,107.8,156,11.2,62,562,956,0,39,85,29,0,499,243
107.2,148,0,13.7,72,601,998,0,9,84,20,1,590,144,110.1,134,13.9,66,602,999,0,9,87,15,0,656,151
108.3,126,0,13.8,97,542,990,0,18,102,35,0,589,166,110.5,126,13.8,97,549,993,0,19,103,34,1,659,160
109.4,135,1,11.4,123,537,978,0,31,89,34,0,631,165,113.5,134,11.3,115,529,978,0,32,93,35,0,703,175
112.1,142,1,10.9,81,497,956,0,33,116,47,0,427,247,116.3,147,10.7,77,501,962,0,33,117,44,0,500,256
114.3,127,1,12.8,82,519,982,0,4,97,38,0,620,168,119.7,125,12.9,79,510,945,0,4,99,39,0,696,170
115.1,131,0,13.7,78,574,1038,1,7,142,42,1,540,176,124.5,134,13.6,73,581,1029,1,7,143,41,1,615,177
117.2,136,0,12.9,95,574,1012,1,29,111,37,1,622,162,127.8,140,13,96,581,1011,1,29,115,36,1,691,169
119.7,119,0,11.9,166,521,938,0,168,92,36,0,637,154,129.8,120,11.9,157,524,935,0,180,93,27,1,698,169
121.6,147,1,13.9,63,560,972,0,23,76,24,1,462,233,130.7,139,14,64,571,970,0,24,78,24,1,511,220
123.4,145,1,11.7,82,560,981,0,96,88,31,0,488,228,132.5,154,11.8,74,563,980,0,99,89,29,1,550,230
127.2,132,0,10.4,87,564,953,0,43,83,32,0,513,227,134.6,135,10.2,83,560,948,0,44,83,32,0,589,234
132.4,152,0,12,82,571,1018,1,10,103,28,1,537,215,137.5,151,12.1,76,567,1079,1,11,105,27,1,617,204
135.5,125,0,12.5,113,567,985,0,78,130,58,0,626,166,140.5,140,12.5,105,571,993,0,77,131,59,0,684,174
137.8,141,0,14.2,109,591,985,0,18,91,20,1,578,174,145.7,142,14.2,101,590,987,0,19,94,19,1,649,180
140.8,150,0,12,109,531,964,0,9,87,38,0,559,153,150.6,153,12,98,539,982,0,10,88,36,0,635,151
145.4,131,1,12.2,115,542,969,0,50,79,35,0,472,206,157.3,131,12.1,109,548,976,0,52,82,34,0,539,219
149.3,143,0,12.3,103,583,1012,1,13,96,36,0,557,194,162.7,142,12.2,95,612,1003,1,13,97,36,0,625,196
154.3,124,0,12.3,121,580,966,0,101,77,35,0,657,170,169.6,134,12.2,116,580,987,0,104,79,36,0,719,172
157.7,136,0,15.1,149,577,994,0,157,102,39,0,673,167,177.2,140,15.2,141,578,995,0,160,110,40,0,739,169
161.8,131,0,13.2,160,631,1071,1,3,102,41,0,674,152,178.2,132,13.2,143,632,1058,1,4,100,40,0,748,150

@@ -0,0 +1,29 @@
# **Numerical data analysis and process tools**

### **Project Description**:

- Numerical data correlation analysis and processing, with image visualization to aid understanding.

#### Numerical analysis tools

- Spearman correlation determines whether there is a monotonic component in the relationship between two features; it also applies to non-linear relationships and ordinal data.

#### Numerical process tools

- Detect outliers using the interquartile range (IQR).
- Detect highly correlated features and remove them.

#### How to use the tools

Input numerical-only data (data type: DataFrame).

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/7/4 16:34
@Desc   :
"""

@@ -0,0 +1,38 @@
import os

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import logging

logger = logging.getLogger(__name__)


def Spearman_rank_test(data_frame, feature_a, feature_b, save_path, file_name, sample_size=4000):
    """
    Spearman correlation determines whether there is a monotonic
    component in the relationship between two features; it also applies
    to non-linear relationships and ordinal data.

    @param data_frame: input data as a DataFrame
    @param feature_a: first feature for Spearman's rank test
    @param feature_b: second feature for Spearman's rank test
    @param sample_size: size of the sample used to represent the population
    @param save_path: output path
    @param file_name: output name
    """
    a = data_frame[feature_a].sample(n=sample_size, random_state=1)
    b = data_frame[feature_b].sample(n=sample_size, random_state=1)
    coef, p = spearmanr(a, b)
    logger.info("Spearman's correlation coefficient is: " + str(coef))
    alpha = 0.05
    plt.scatter(a, b)
    plt.xlabel("Feature A")
    plt.ylabel("Feature B")
    plt.title("Spearman's Rank Test")
    plt.savefig(os.path.join(save_path, file_name))
    if p > alpha:
        logger.info("Features are uncorrelated (failed to reject H0) p=" + str(p))
    else:
        logger.info("Features have a monotonic relationship (reject H0) p=" + str(p))
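
A usage sketch, assuming the crime-rate CSV shipped above (column names taken from its header; the file name and output path are hypothetical):

# Hypothetical call using the bundled crime-rate data.
import pandas as pd

df = pd.read_csv('crime_rates.csv')  # hypothetical file name for the CSV above
Spearman_rank_test(df, 'CrimeRate', 'ExpenditureYear0',
                   save_path='.', file_name='spearman.png',
                   sample_size=40)  # the demo CSV has only 47 rows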

@@ -0,0 +1,155 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> correlation
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/7/4 16:48
@Desc   :
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# alias the scipy function so the local wrapper below does not shadow it
# and end up calling itself recursively
from scipy.stats import spearmanr as scipy_spearmanr


def spearmanr(a: pd.Series, b: pd.Series = None, axis=0, nan_policy='propagate',
              alternative='two-sided', sample_size=4000, random_state=None):
    """Calculate a Spearman correlation coefficient with associated p-value.

    The Spearman rank-order correlation coefficient is a nonparametric measure
    of the monotonicity of the relationship between two datasets. Unlike the
    Pearson correlation, the Spearman correlation does not assume that both
    datasets are normally distributed. Like other correlation coefficients,
    this one varies between -1 and +1 with 0 implying no correlation.
    Correlations of -1 or +1 imply an exact monotonic relationship. Positive
    correlations imply that as x increases, so does y. Negative correlations
    imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    a, b : 1D or 2D array_like, b is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
        Both arrays need to have the same length in the ``axis`` dimension.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain observations.
        If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan.
        The following options are available (default is 'propagate'):

        * 'propagate': returns nan
        * 'raise': throws an error
        * 'omit': performs the calculations ignoring nan values

    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

    sample_size : int, optional
        Number of items from column to return. Default is 4000.

    random_state : int, array-like, BitGenerator, np.random.RandomState, optional
        If int, array-like, or BitGenerator (NumPy>=1.17), seed for
        random number generator.
        If np.random.RandomState, use as numpy RandomState object.

    Returns
    -------
    correlation : float or ndarray (2-D square)
        Spearman correlation matrix or correlation coefficient (if only 2
        variables are given as parameters). Correlation matrix is square with
        length equal to total number of variables (columns or rows) in ``a``
        and ``b`` combined.
    pvalue : float
        The p-value for a hypothesis test whose null hypothesis
        is that two sets of data are uncorrelated. See `alternative` above
        for alternative hypotheses. `pvalue` has the same
        shape as `correlation`.

    References
    ----------
    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
       Probability and Statistics Tables and Formulae. Chapman & Hall: New
       York. 2000.
       Section 14.7

    Examples
    --------
    >>> from scipy import stats
    >>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])
    SpearmanrResult(correlation=0.82078..., pvalue=0.08858...)
    >>> rng = np.random.default_rng()
    >>> x2n = rng.standard_normal((100, 2))
    >>> y2n = rng.standard_normal((100, 2))
    >>> stats.spearmanr(x2n)
    SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
    >>> stats.spearmanr(x2n[:,0], x2n[:,1])
    SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
    >>> rho, pval = stats.spearmanr(x2n, y2n)
    >>> rho
    array([[ 1.        , -0.07960396, -0.08314431,  0.09662166],
           [-0.07960396,  1.        , -0.14448245,  0.16738074],
           [-0.08314431, -0.14448245,  1.        ,  0.03234323],
           [ 0.09662166,  0.16738074,  0.03234323,  1.        ]])
    >>> pval
    array([[0.        , 0.43111687, 0.41084066, 0.33891628],
           [0.43111687, 0.        , 0.15151618, 0.09600687],
           [0.41084066, 0.15151618, 0.        , 0.74938561],
           [0.33891628, 0.09600687, 0.74938561, 0.        ]])
    >>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1)
    >>> rho
    array([[ 1.        , -0.07960396, -0.08314431,  0.09662166],
           [-0.07960396,  1.        , -0.14448245,  0.16738074],
           [-0.08314431, -0.14448245,  1.        ,  0.03234323],
           [ 0.09662166,  0.16738074,  0.03234323,  1.        ]])
    >>> stats.spearmanr(x2n, y2n, axis=None)
    SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
    >>> stats.spearmanr(x2n.ravel(), y2n.ravel())
    SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)

    >>> rng = np.random.default_rng()
    >>> xint = rng.integers(10, size=(100, 2))
    >>> stats.spearmanr(xint)
    SpearmanrResult(correlation=0.09800224850707953, pvalue=0.3320271757932076)

    """
    # a = a.sample(n=sample_size, random_state=random_state)
    # if b:
    #     b = b.sample(n=sample_size, random_state=random_state)
    return scipy_spearmanr(a, b, axis=axis, nan_policy=nan_policy, alternative=alternative)


def corr(df, method='pearson', drop=False, threshold=0, plot=True, filepath=None, figsize=None):
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    cmap = sns.diverging_palette(250, 15, s=95, l=40, n=9, center="light", as_cmap=True)
    cov = df.corr(method=method)
    if drop:
        uncorr = ~np.any(np.abs(np.tril(cov, k=-1)) > threshold, axis=1)
        cov = cov[uncorr]
        cov = cov[cov.index]
    if plot or filepath:
        mask = np.triu(np.ones_like(cov, dtype=bool))
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(cov, mask=mask, center=0, annot=True, fmt='.2f', cmap=cmap, square=True, ax=ax)
        plt.title("相关性矩阵")  # "Correlation matrix"
        if filepath:
            plt.savefig(filepath)
        if plot:
            plt.show()
    return cov
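
A quick sketch of `corr` on a toy frame (hypothetical data; with `drop=True` a row is pruned when any of its lower-triangle |r| values exceeds `threshold`):

# Hypothetical demo: build a frame with two nearly identical columns,
# then ask corr() to drop one of them.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = rng.normal(size=200)
df = pd.DataFrame({'x': x,
                   'almost_x': x + rng.normal(scale=0.01, size=200),
                   'noise': rng.normal(size=200)})
reduced = corr(df, drop=True, threshold=0.9, plot=False)
print(reduced.columns.tolist())  # ['x', 'noise'] — 'almost_x' was dropped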

@@ -0,0 +1,48 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time     : 2022/3/25 9:09
# @Software : PyCharm
# @File     : process_tool.py
# @Author   : QT
# @Email    : taoqimin@sics.ac.cn
import numpy as np
from tqdm import tqdm
import logging

logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

console = logging.StreamHandler()
console.setLevel(logging.INFO)

logger.addHandler(console)


class NumericProcess:
    @staticmethod
    def drop_feature(data_frame, thresh_hold):
        """
        Detect and drop highly correlated features.
        When two variables are highly correlated, they usually cause problems
        such as multicollinearity, so one of each correlated pair is removed.

        @param data_frame: input DataFrame
        @param thresh_hold: a number between 0 and 1; features whose absolute
            pairwise correlation exceeds it are dropped
        """
        matrix = data_frame.corr().abs()
        mask = np.triu(np.ones_like(matrix, dtype=bool))
        reduced_matrix = matrix.mask(mask)
        feature_drop = [c for c in tqdm(reduced_matrix) if
                        any(reduced_matrix[c] > thresh_hold)]
        data_frame.drop(feature_drop, axis=1, inplace=True)
        logger.info("The following features are dropped due to multicollinearity: " + str(feature_drop))
        return data_frame
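
A minimal sketch of `drop_feature` (hypothetical data; note it mutates the input frame in place):

# Hypothetical demo: drop one column of a highly correlated pair.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
x = rng.normal(size=500)
df = pd.DataFrame({'a': x,
                   'b': x * 2 + rng.normal(scale=0.01, size=500),
                   'c': rng.normal(size=500)})
NumericProcess.drop_feature(df, thresh_hold=0.95)
print(df.columns.tolist())  # ['b', 'c'] — 'a' was dropped (highly correlated with 'b')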

@@ -0,0 +1,20 @@
Parse the configuration file and apply preprocessing (such as filling null values or sampling) before the data moves on to the next step.

Part of the Pre-process Lib's preprocessing features are finished so far, as follows:

- data_insight
  - DuplicateInsight - detection of duplicate data
  - NullInsight - detection of null values
  - ValidationInsight - data validity checks
- data_process
  - FilteringProcessor - data filtering

In addition:

- TypeInsight - the validation method for date values is not yet finished

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:40
@Desc   :
"""

@@ -0,0 +1,133 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# file: data_insight
# author: shenwentao, wangkanglong
# description:
# date: 2022-03-30 16:45
# IDE: PyCharm

import pandas as pd
import datetime
from typing import List, Union
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype

from iod_data_analysis_tool.utils.assertion import assert_range


class DuplicateInsight:

    @staticmethod
    def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
        """
        Count duplicate rows, with a user-defined notion of duplication.
        :param data: source data
        :param subset: selected columns/fields, same as the subset parameter of pd.DataFrame.duplicated
        :param keep: determines which duplicates (if any) to mark, same as the keep parameter of pd.DataFrame.duplicated
        :return: the count result
        """
        result = data.duplicated(subset, keep=keep).sum()
        return pd.DataFrame([result], columns=['duplicate_num'])


class NullInsight:

    @staticmethod
    def num_null(data, column: str = None) -> pd.DataFrame:
        """
        Count null values in the data.
        :param data: source data
        :param column: selected column/field
        :return: the count result
        """
        if column is not None:
            return pd.DataFrame([data[column].isna().sum()], columns=['null_num'], index=[column])
        else:
            return pd.DataFrame(data.isna().sum(), columns=['null_num'])


class ValidationInsight:
    """
    User-defined data validity checks, e.g. for bad values, by constraining the range of each data type.
    """

    @staticmethod
    def validation_continuous_range(data: pd.DataFrame, column: str,
                                    min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
        """
        Validate a continuous numerical column against a user-defined range;
        returns counts of values inside and outside the range.
        :param data: source data
        :param column: selected column/field
        :param min_val: lower bound of the range
        :param max_val: upper bound of the range
        :return: the count result
        """
        assert_range(min_val, max_val)
        nums = dict()
        nums['column'] = column
        nums['num_lt_min'] = data.query(f'{column} < {min_val}').shape[0]
        nums['num_gt_max'] = data.query(f'{column} > {max_val}').shape[0]
        nums['num_within_range'] = data.shape[0] - nums['num_lt_min'] - nums['num_gt_max']
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
        """
        Validate a categorical column against a user-defined range;
        returns counts of values inside and outside the range.
        :param data: source data
        :param column: selected column/field
        :param values: the user-defined set of admissible values, i.e. the "range"
        :return: the count result
        """
        nums = dict()
        nums['column'] = column
        nums['num_within_range'] = data[data[column].isin(values)].shape[0]
        nums['num_out_range'] = len(data[column]) - nums['num_within_range']
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_date_range(data, column: str, start_date: datetime.date,
                              end_date: datetime.date) -> pd.DataFrame:
        """
        Validate a date column against a user-defined range; returns counts of
        values inside and outside the range. Precondition: the data type is datetime.date.
        :param data: source data
        :param column: selected column/field
        :param start_date: start date
        :param end_date: end date
        :return: the count result
        """
        assert_range(start_date, end_date)
        nums = dict()
        nums['column'] = column
        nums['date_lt_start'] = sum(data[column] < start_date)
        nums['date_gt_end'] = sum(data[column] > end_date)
        nums['date_within_range'] = data.shape[0] - nums['date_lt_start'] - nums['date_gt_end']
        return pd.DataFrame([nums], index=['result'])


class TypeInsight:
    """
    Lets the user check that a column's data type matches the expected one.
    """

    # TODO: a timestamp checker is still missing
    _checkers = {
        'int': is_integer_dtype,
        'float': is_float_dtype,
        'string': is_string_dtype,
        'bool': is_bool_dtype,
        'datetime': is_datetime64_dtype
    }

    @staticmethod
    def type_check(data, column: str, check_type: str) -> pd.DataFrame:
        """
        Check whether the column has the expected data type.
        :param data: source data
        :param column: selected column/field
        :param check_type: the type to check for, one of {'int', 'float', 'string', 'bool', 'datetime'}
        :return: the check result
        """
        flag = True
        if not TypeInsight._checkers[check_type](data[column]):
            flag = False
        return pd.DataFrame([flag], columns=['result'], index=[column])
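
A short sketch exercising the insight classes (hypothetical toy frame; `assert_range` comes from the project's own utils module):

# Hypothetical demo of the insight classes.
import pandas as pd

df = pd.DataFrame({'age': [25, 31, 31, 140, None]})
print(DuplicateInsight.num_duplicate(df))                                 # 1 duplicate row
print(NullInsight.num_null(df, 'age'))                                    # 1 null
print(ValidationInsight.validation_continuous_range(df, 'age', 0, 120))   # 140 exceeds the range
print(TypeInsight.type_check(df, 'age', 'float'))                         # True (the NaN forces float)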

@@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> normalizer
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:40
@Desc   :
"""
import pandas as pd
from scipy.stats import zscore as scipy_zscore


def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
    """
    Zi = (Xi - μ) / σ
    """
    return scipy_zscore(a, axis, ddof, nan_policy)

@@ -0,0 +1,51 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> outlierprocessing
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:24
@Desc   :
"""
from typing import Union

import pandas as pd


def MAD(data: pd.Series, n: float = 2.5, constant=1.4826, axis=0):
    """
    MAD = median(|Xi - median(X)|)
    Clips the data to the interval [median - n * MAD, median + n * MAD].
    @return the clipped data, same type as the input
    """
    x = data.median()
    MC = (data - x).abs().median()
    MAD = MC * constant
    offset = n * MAD
    if isinstance(data, pd.DataFrame):
        return data.clip(lower=x - offset, upper=x + offset, axis=axis)
    else:
        return data.clip(lower=x - offset, upper=x + offset)


def three_sigma(data: pd.Series):
    miu = data.mean()
    sigma = data.std()
    low = miu - 3 * sigma
    up = miu + 3 * sigma
    return data.index[(data < low) | (data > up)]


def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
    q = data.quantile(q=[q1, q3])
    IQR = q[q3] - q[q1]
    lower_whisker_limit = q[q1] - k * IQR
    upper_whisker_limit = q[q3] + k * IQR
    return data.index[(data < lower_whisker_limit) | (data > upper_whisker_limit)]


def regex_match(data: pd.Series, *patterns):
    pattern = '|'.join(patterns)
    return data.index[data.astype(str).str.contains(pattern, regex=True)]


def empty(data: Union[pd.Series, pd.DataFrame]):
    # works for both Series and DataFrame inputs
    return bool(data.isnull().values.any())
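
A quick sketch of the IQR (box-plot) detector against the 3-sigma rule on a toy series (hypothetical data):

# Hypothetical demo: find the index of an obvious outlier.
import pandas as pd

s = pd.Series([10, 11, 9, 10, 12, 10, 11, 100])
print(box_plot(s))      # Index([7]) — 100 falls outside the whiskers
print(three_sigma(s))   # may be empty: a single huge point inflates sigma itself
print(MAD(s).tolist())  # 100 is clipped toward the median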

@@ -0,0 +1,24 @@
## Analysis methods for time-series data

--------

| Module | Methods |
| ---- | ---- |
| Basics | |
| Stationarity | |
| Anomaly detection | |
| Frequency detection | |
| Periodicity detection | |
| Other | |

### Basics

### Stationarity

### Anomaly detection

### Frequency detection

### Periodicity detection

### Other

@@ -0,0 +1,26 @@
import pandas as pd


def describe_datetime_info(data: pd.Series, datetime_is_numeric: bool = False) -> pd.Series:
    """
    If the values are strings without a date part, the date is populated
    with today's date.
    @param data: data
    @param datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric. This affects statistics
        calculated for the column. For DataFrame input, this also
        controls whether datetime columns are included by default.
    @return: Summary statistics of the Series.
    @example: Describing a datetime ``Series``.

    >>> s = pd.read_csv()
    >>> s.describe()
    count                       1427132
    unique                        25111
    top      2022-04-26 09:25:00.260000
    freq                          32994
    first           2022-04-26 09:25:00
    last     2022-04-26 09:34:46.340000
    Name: TradTime, dtype: object
    """
    return pd.to_datetime(data).describe(datetime_is_numeric=datetime_is_numeric)

@@ -0,0 +1,62 @@
import pandas as pd
import numpy as np
from time_base import Time_base

import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula as smf

import scipy.stats as scs


class stationaryTest(Time_base):
    """
    Stationarity tests for time series.
    """
    def __init__(self):
        pass

    def test_stationary(self, x, window_size):
        """
        Stationarity test for a time series.
        x : the time-series data
        window_size : the window size
        """
        x_ma = self.moving_average(x, window_size)
        x_std = self.moving_std(x, window_size)
        x_max = self.moving_max(x, window_size)
        x_min = self.moving_min(x, window_size)
        x_median = self.moving_median(x, window_size)
        x_normalized = self.normalize(x)
        x_ma_normalized = self.normalize(x_ma)
        x_std_normalized = self.normalize(x_std)
        x_max_normalized = self.normalize(x_max)
        x_min_normalized = self.normalize(x_min)
        x_median_normalized = self.normalize(x_median)
        x_normalized_ma_normalized = self.normalize(x_normalized - x_ma_normalized)
        x_normalized_std_normalized = self.normalize(x_normalized - x_std_normalized)
        x_normalized_max_normalized = self.normalize(x_normalized - x_max_normalized)
        x_normalized_min_normalized = self.normalize(x_normalized - x_min_normalized)
        x_normalized_median_normalized = self.normalize(x_normalized - x_median_normalized)
        x_normalized_ma_normalized_std_normalized = self.normalize(x_normalized_ma_normalized - x_std)

        return (x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized,
                x_min_normalized, x_median_normalized, x_normalized_ma_normalized,
                x_normalized_std_normalized, x_normalized_max_normalized,
                x_normalized_min_normalized, x_normalized_median_normalized,
                x_normalized_ma_normalized_std_normalized)

    def adf_test(self, x, window_size):
        """
        ADF stationarity test on each derived series.
        x : the time-series data
        window_size : the window size
        """
        (x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized,
         x_min_normalized, x_median_normalized, x_normalized_ma_normalized,
         x_normalized_std_normalized, x_normalized_max_normalized,
         x_normalized_min_normalized, x_normalized_median_normalized,
         x_normalized_ma_normalized_std_normalized) = self.test_stationary(x, window_size)
        adf_test_normalized = smt.adfuller(x_normalized)
        adf_test_ma_normalized = smt.adfuller(x_ma_normalized)
        adf_test_std_normalized = smt.adfuller(x_std_normalized)
        adf_test_max_normalized = smt.adfuller(x_max_normalized)
        adf_test_min_normalized = smt.adfuller(x_min_normalized)
        adf_test_median_normalized = smt.adfuller(x_median_normalized)
        adf_test_normalized_ma_normalized = smt.adfuller(x_normalized_ma_normalized)
        adf_test_normalized_std_normalized = smt.adfuller(x_normalized_std_normalized)
        adf_test_normalized_max_normalized = smt.adfuller(x_normalized_max_normalized)
        adf_test_normalized_min_normalized = smt.adfuller(x_normalized_min_normalized)
        return (adf_test_normalized, adf_test_ma_normalized, adf_test_std_normalized,
                adf_test_max_normalized, adf_test_min_normalized, adf_test_median_normalized,
                adf_test_normalized_ma_normalized, adf_test_normalized_std_normalized,
                adf_test_normalized_max_normalized, adf_test_normalized_min_normalized)

@@ -0,0 +1,133 @@

import pandas as pd
import numpy as np


class Time_base(object):
    """
    Basic building blocks for time-series analysis.
    """
    def __init__(self):
        pass

    @staticmethod
    def normalize(x):
        """
        Normalize the time series.
        x : the time-series data
        """
        x = np.array(x)
        return np.log2(x / np.sqrt(np.sum(x**2)))

    @staticmethod
    def lag(x, lag):
        """
        Lagged series.
        x : the time-series data
        lag : the lag length
        """
        return pd.Series(x).shift(lag)

    @staticmethod
    def moving_average(x, window_size):
        """
        Rolling mean window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).mean()

    @staticmethod
    def moving_median(x, window_size):
        """
        Rolling median window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).median()

    @staticmethod
    def moving_std(x, window_size):
        """
        Rolling standard-deviation window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).std()

    @staticmethod
    def moving_max(x, window_size):
        """
        Rolling maximum window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).max()

    @staticmethod
    def moving_min(x, window_size):
        """
        Rolling minimum window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).min()

    @staticmethod
    def moving_sum(x, window_size):
        """
        Rolling sum window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).sum()

    @staticmethod
    def moving_quantile(x, window_size, quantile):
        """
        Rolling quantile window.
        x : the time-series data
        window_size : the window size
        quantile : the quantile
        """
        return pd.Series(x).rolling(window_size).quantile(quantile)

    @staticmethod
    def moving_corr(x, y, window_size):
        """
        Rolling correlation window.
        x : the time-series data
        y : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).corr(pd.Series(y))

    @staticmethod
    def moving_cov(x, y, window_size):
        """
        Rolling covariance window.
        x : the time-series data
        y : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).cov(pd.Series(y))

    @staticmethod
    def moving_skew(x, window_size):
        """
        Rolling skewness window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).skew()

    @staticmethod
    def moving_kurt(x, window_size):
        """
        Rolling kurtosis window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).kurt()
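
A minimal sketch of the rolling helpers (hypothetical series):

# Hypothetical demo of the Time_base rolling helpers.
import numpy as np

tb = Time_base()
x = np.sin(np.linspace(0, 10, 50))
print(tb.moving_average(x, 5).tail())  # 5-point moving average
print(tb.moving_std(x, 5).tail())      # 5-point rolling standard deviation
print(tb.lag(x, 1).head())             # series shifted by one step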

@@ -0,0 +1,53 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> ID_code
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/5/17 16:00
@Desc   :
"""
import re

re_ID = re.compile(r'^\d{6}(?:18|19|20)?\d{2}(?:0[1-9]|1[012])(?:(?:[0-2][1-9])|10|20|30|31)\d{3}[0-9xX]$')


def validate_identity_code(code: str):
    """
    Validate the format of a Chinese resident identity card number.
    :param code:
    :return:
    """
    city = {'11': "北京", '12': "天津", '13': "河北", '14': "山西", '15': "内蒙古", '21': "辽宁", '22': "吉林", '23': "黑龙江",
            '31': "上海", '32': "江苏", '33': "浙江", '34': "安徽", '35': "福建", '36': "江西", '37': "山东", '41': "河南", '42': "湖北",
            '43': "湖南", '44': "广东", '45': "广西", '46': "海南", '50': "重庆", '51': "四川", '52': "贵州", '53': "云南", '54': "西藏",
            '61': "陕西", '62': "甘肃", '63': "青海", '64': "宁夏", '65': "新疆", '71': "台湾", '81': "香港", '82': "澳门", '91': "国外"}
    tip = ""
    p = True

    if re_ID.match(code) is None:
        tip = "身份证号格式错误"  # malformed ID number
        p = False
    elif code[:2] not in city:
        tip = "地址编码错误"  # invalid region code
        p = False
    else:
        # an 18-digit ID must also pass the final check digit
        if len(code) == 18:
            # ∑(ai × Wi) (mod 11)
            # weighting factors
            factor = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
            # check digits
            parity = [1, 0, 'X', 9, 8, 7, 6, 5, 4, 3, 2]
            checksum = 0
            for i in range(17):
                ai = int(code[i])
                wi = factor[i]
                checksum += ai * wi
            if str(parity[checksum % 11]) != code[17].upper():
                tip = "校验位错误"  # wrong check digit
                p = False
    return p, tip
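
A quick sketch (the ID number below is the well-known made-up sample format, not a real person's):

# Hypothetical demo with a sample ID number.
ok, tip = validate_identity_code('11010519491231002X')
print(ok, tip)  # True, '' — format, region code, and check digit all pass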

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/5/17 15:59
@Desc   :
"""

@@ -0,0 +1,97 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> timeutil
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:02
@Desc   :
"""
import datetime
import types
import typing

from dateutil import parser


class cnparserinfo(parser.parserinfo):
    """
    Parse Chinese date formats.
    Usage:
        from dateutil import parser
        parser.parse('1998年12月11日 8点20分30秒', cnparserinfo())
    """
    parser.parserinfo.JUMP.extend('年月日')
    WEEKDAYS = [list(weekdays) for weekdays in parser.parserinfo.WEEKDAYS]
    WEEKDAYS[0].extend(('星期一', '周一'))
    WEEKDAYS[1].extend(('星期二', '周二'))
    WEEKDAYS[2].extend(('星期三', '周三'))
    WEEKDAYS[3].extend(('星期四', '周四'))
    WEEKDAYS[4].extend(('星期五', '周五'))
    WEEKDAYS[5].extend(('星期六', '周六'))
    WEEKDAYS[6].extend(('星期天', '周日', '周天', '周末'))
    WEEKDAYS = [tuple(weekdays) for weekdays in WEEKDAYS]

    # MONTHS = [list(months) for months in parser.parserinfo.MONTHS]
    # MONTHS[0].extend(('一月', '1月'))
    # MONTHS[1].extend(('二月', '2月'))
    # MONTHS[2].extend(('三月', '3月'))
    # MONTHS[3].extend(('四月', '4月'))
    # MONTHS[4].extend(('五月', '5月'))
    # MONTHS[5].extend(('六月', '6月'))
    # MONTHS[6].extend(('七月', '7月'))
    # MONTHS[7].extend(('八月', '8月'))
    # MONTHS[8].extend(('九月', '9月'))
    # MONTHS[9].extend(('十月', '10月'))
    # MONTHS[10].extend(('十一月', '11月'))
    # MONTHS[11].extend(('十二月', '12月'))
    # MONTHS = [tuple(months) for months in MONTHS]

    HMS = [list(hms) for hms in parser.parserinfo.HMS]
    HMS[0].extend('时点')
    HMS[1].append('分')
    HMS[2].append('秒')
    HMS = [tuple(hms) for hms in HMS]

    AMPM = [list(ampm) for ampm in parser.parserinfo.AMPM]
    AMPM[0].append('上午')
    AMPM[1].append('下午')
    AMPM = [tuple(ampm) for ampm in AMPM]

    def __init__(self, dayfirst=False, yearfirst=False):
        super().__init__(dayfirst, yearfirst)


def utctimestamp():
    """
    @return: the UTC timestamp
    """
    return int(datetime.datetime.utcnow().timestamp())


def timestamp2datetime(ts: float):
    return datetime.datetime.fromtimestamp(ts)


def timestamp2str(ts: float, fmt: str = '%F %H:%M:%S'):
    """
    @param ts: timestamp
    @param fmt: format
    """
    return datetime.datetime.strftime(timestamp2datetime(ts), fmt)


cnparser = cnparserinfo()


def str2datetime(datetime_str: str, fmt: str = None):
    if fmt:
        return datetime.datetime.strptime(datetime_str, fmt)
    return parser.parse(datetime_str, cnparser)


def int2date(date_int: int):
    return str2datetime(str(date_int), '%Y%m%d')


def date2int(a: typing.Union[datetime.datetime, datetime.date]):
    return int(a.strftime('%Y%m%d'))
@ -0,0 +1,81 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> file_util
@IDE :PyCharm
@Author :rengengchen
@Date :2022/5/10 17:21
@Desc :
"""
import os
import shutil

import paramiko


def list_files(dir_paths):
    files = []
    for root, dir_path, filepath in walk(dir_paths):
        if filepath:
            files.append(os.path.join(root, filepath))
    return files


def walk(dir_paths):
    # os.walk already recurses; the original also fed every subdirectory back
    # through a queue, which made each subtree be walked multiple times
    if isinstance(dir_paths, str):
        dir_paths = [dir_paths]
    for dir_path in dir_paths:
        for root, dirs, files in os.walk(dir_path):
            for dirname in dirs:
                yield root, dirname, None
            for filename in files:
                yield root, None, filename


def copy(s, t):
    if os.path.isfile(s):
        shutil.copy(s, t)
    else:
        if not os.path.exists(t):
            os.mkdir(t)
        s = os.path.abspath(s)
        t = os.path.abspath(t)
        for root, dirname, filename in walk(s):
            if dirname:
                # mirror the directory under its parent in the target tree
                # (joining t and dirname directly only worked one level deep)
                os.makedirs(os.path.join(root.replace(s, t), dirname), exist_ok=True)
            else:
                shutil.copy(os.path.join(root, filename), os.path.join(root.replace(s, t), filename))
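# Usage sketch (paths are placeholders, not from the original file):
#   list_files('/tmp/project')           # every file under the tree
#   copy('/tmp/project', '/tmp/backup')  # recursive copy built on walk()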
class RemoteFileUtil:

    def __init__(self, ip, username, password, port=22, local_dir=None, remote_dir=None):
        tran = paramiko.Transport((ip, port))
        tran.connect(username=username, password=password)
        # from_transport() already returns the SFTP client; the stray
        # .getfo() call in the original would have raised at construction
        self.sftp = paramiko.SFTPClient.from_transport(tran)
        self.local_dir = local_dir
        self.remote_dir = remote_dir

    def ls(self, remote_dir=None):
        if remote_dir is None:
            remote_dir = self.remote_dir
        return self.sftp.listdir_attr(remote_dir)

    def upload_file(self, local_filepath=None, remote_filepath=None, filename=None):
        if local_filepath is None:
            local_filepath = os.path.join(self.local_dir, filename)
        if remote_filepath is None:
            remote_filepath = os.path.join(self.remote_dir, filename)
        self.sftp.put(local_filepath, remote_filepath)

    def download_file(self, local_filepath=None, remote_filepath=None, filename=None):
        if local_filepath is None:
            local_filepath = os.path.join(self.local_dir, filename)
        if remote_filepath is None:
            remote_filepath = os.path.join(self.remote_dir, filename)
        self.sftp.get(remote_filepath, local_filepath)
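# Usage sketch (host and credentials are placeholders, not from the original file):
#   util = RemoteFileUtil('192.0.2.10', 'user', 'secret',
#                         local_dir='/tmp', remote_dir='/data')
#   util.upload_file(filename='report.csv')    # /tmp/report.csv -> /data/report.csv
#   util.download_file(filename='result.csv')  # /data/result.csv -> /tmp/result.csv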
@ -0,0 +1,82 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> pd_util
@IDE :PyCharm
@Author :rengengchen
@Date :2022/7/13 11:00
@Desc :
"""
from __future__ import annotations

import os
from typing import Callable, Hashable, Sequence

import pandas as pd
# Sequence/IndexLabel are not re-exported as pd.Sequence/pd.IndexLabel,
# so they are imported from typing and pandas._typing directly
from pandas._typing import CompressionOptions, FilePath, IndexLabel, StorageOptions, WriteBuffer
from pandas.core.generic import bool_t


class to_same_csv:
    """Callable that writes successive frames into one CSV, header only once."""

    def __init__(self,
                 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
                 sep: str = ",",
                 na_rep: str = "",
                 float_format: str | None = None,
                 columns: Sequence[Hashable] | None = None,
                 header: bool_t | list[str] = True,
                 index: bool_t = False,
                 index_label: IndexLabel | None = None,
                 mode: str = "w",
                 encoding: str = 'utf8',
                 compression: CompressionOptions = "infer",
                 quoting: int | None = None,
                 quotechar: str = '"',
                 line_terminator: str | None = None,
                 chunksize: int | None = None,
                 date_format: str | None = None,
                 doublequote: bool_t = True,
                 escapechar: str | None = None,
                 decimal: str = ".",
                 errors: str = "strict",
                 storage_options: StorageOptions = None,
                 prepare: Callable = None):
        self.not_first = False
        self.mode = mode
        # when appending to a file that already exists, never rewrite the header
        if self.mode == 'a' and isinstance(path_or_buf, str) and os.path.exists(path_or_buf):
            header = False
        self.header = header
        self.prepare = prepare
        self.kwargs = {'path_or_buf': path_or_buf,
                       'sep': sep,
                       'na_rep': na_rep,
                       'float_format': float_format,
                       'columns': columns,
                       'index': index,
                       'index_label': index_label,
                       'encoding': encoding,
                       'compression': compression,
                       'quoting': quoting,
                       'quotechar': quotechar,
                       'line_terminator': line_terminator,
                       'chunksize': chunksize,
                       'date_format': date_format,
                       'doublequote': doublequote,
                       'escapechar': escapechar,
                       'decimal': decimal,
                       'errors': errors,
                       'storage_options': storage_options}

    def __call__(self, df_or_series: pd.Series | pd.DataFrame):
        if self.not_first:
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
        else:
            if self.prepare:
                result = self.prepare(df_or_series)
                # compare with None: truth-testing a DataFrame raises ValueError
                if result is not None:
                    df_or_series = result
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
            # switch to append mode after the first write; the original never
            # set not_first, so prepare() was re-run on every call
            self.mode = 'a'
            self.header = False
            self.not_first = True
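# Usage sketch (filenames are placeholders, not from the original file):
#   writer = to_same_csv('out.csv')
#   for chunk in pd.read_csv('big.csv', chunksize=100_000):
#       writer(chunk)  # header written once, later chunks appended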
@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> phone_util
@IDE :PyCharm
@Author :rengengchen
@Date :2022/5/17 15:59
@Desc :
"""
import re

# mainland-China mobile numbers: an allocated 3-digit prefix plus 8 digits
re_phone = re.compile(r'^(?:(?:13[0-9])'
                      r'|(?:14(?:0|[5-7]|9))'
                      r'|(?:15(?:[0-3]|[5-9]))'
                      r'|(?:16(?:2|[5-7]))'
                      r'|(?:17[0-8])'
                      r'|(?:18[0-9])'
                      r'|(?:19(?:[0-3]|[5-9])))\d{8}$')
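# Usage sketch (numbers are made up):
#   bool(re_phone.match('13812345678'))  # True
#   bool(re_phone.match('12345678901'))  # False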
@ -0,0 +1,61 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File :project_util.py
@IDE :PyCharm
@Author :rengengchen
@Time :2022/9/15 9:45
"""
import compileall
import os
import re
import shutil
from os.path import join

from lib.analysis_package.utils.file_util import walk

# strips the interpreter tag: module.cpython-39.pyc -> module.pyc
re_pyc = re.compile(r'cpython-\d+\.')


def compile_project(source, target=None):
    """
    Compile a project into .pyc files under a target directory.
    @param source: project path
    @param target: directory for the compiled files (defaults to in-place)
    """
    source = os.path.abspath(source)
    if target is None:
        target = source
    else:
        target = os.path.abspath(target)
    compileall.compile_dir(source)
    pycache_paths = set()
    if target == source:
        # in place: hoist each .pyc out of __pycache__, then drop the sources
        for root, dirname, filename in walk(source):
            if filename and root[-11:] == '__pycache__':
                pycache_paths.add(root)
                shutil.move(join(root, filename), join(root, '../', re_pyc.sub('', filename)))
            if filename and filename.endswith('.py'):
                os.remove(join(root, filename))
    else:
        len_t = len(target)
        for root, dirname, filename in walk(source):
            t_root = root.replace(source, target)
            # skip anything that already sits inside the target tree
            if target == root[:len_t]:
                continue
            if dirname and dirname != '__pycache__':
                t_root = join(t_root, dirname)
                if not os.path.exists(t_root) and join(source, dirname) != target:
                    os.makedirs(t_root)
            elif filename and not filename.endswith('.py'):
                if root[-11:] == '__pycache__':
                    pycache_paths.add(root)
                    t_root = t_root[:-11]
                    shutil.move(join(root, filename), join(t_root, re_pyc.sub('', filename)))
                else:
                    # non-Python resources are copied verbatim
                    shutil.copyfile(join(root, filename), join(t_root, filename))
    for p in pycache_paths:
        os.rmdir(p)
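# Usage sketch (paths are placeholders, not from the original file):
#   compile_project('lib/analysis_package')          # compile in place
#   compile_project('lib/analysis_package', 'dist')  # pyc-only tree in dist/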
Binary file not shown.
@ -0,0 +1,14 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :scrapyproject -> package_project
@IDE :PyCharm
@Author :rengengchen
@Date :2021/5/12 10:46
@Desc :
"""
import shutil
import subprocess

# pass the command as a list: a bare string without shell=True fails on POSIX
subprocess.call(['python', 'setup.py', 'bdist_wheel'])
shutil.rmtree(r'build')
shutil.rmtree(r'analysis_package.egg-info')
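# Usage sketch (wheel name inferred from setup.py's NAME/VERSION, unverified):
#   python package_project.py
#   # expected artifact: dist/analysis_package-0.1.3-py3-none-any.whl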
@ -0,0 +1,36 @@
# coding:utf-8
from setuptools import setup, find_packages

PACKAGE = "analysis_package"
NAME = "analysis_package"
DESCRIPTION = "general analysis function"
AUTHOR = "iod"
AUTHOR_EMAIL = "rengengchen@sics.ac.cn"
URL = ""
VERSION = '0.1.3'

setup(
    name=NAME,
    version=VERSION,
    description=DESCRIPTION,
    author=AUTHOR,
    author_email=AUTHOR_EMAIL,
    license="BSD",
    url=URL,
    include_package_data=True,
    packages=find_packages(),
    classifiers=[
        'Programming Language :: Python',
        'Operating System :: OS Independent',
    ],
    install_requires=[
        'pandas',
        'scipy',
        'numpy',
        'matplotlib',
        'seaborn',
        'tqdm',
        'scikit-learn',
    ],
    zip_safe=False,
)