wystan_rin 2024-05-12 20:18:24 +08:00
commit 707997d4e1
43 changed files with 1696 additions and 0 deletions

.idea/.gitignore vendored Normal file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml


@ -0,0 +1,12 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N802" />
</list>
</option>
</inspection_tool>
</profile>
</component>


@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/utils.iml" filepath="$PROJECT_DIR$/.idea/utils.iml" />
</modules>
</component>
</project>

.idea/utils.iml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

lib/README.md Normal file

@ -0,0 +1,5 @@
Module responsibilities:
1. continuous: feature analysis for numerical data
2. categorical: feature analysis for discrete data
3. timeseries: analysis methods for time-series data
4. pre-process: parse the configuration file and apply preprocessing (e.g. filling null values, sampling) before the data moves to the next step

lib/__init__.py Normal file


@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project IoD_data_analysis_tool
@File __init__.py.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/3 17:07
'''


@ -0,0 +1,30 @@
Categorical module:
    Feature analysis for discrete (categorical) data
    Analysis methods:
        1> Descriptive statistics:
            - the categories contained in a column
            - the number of categories
            - frequency table
            - contingency table
        2> Chi-square test of independence
        3> Information entropy
        4> Mutual information
    Features:
        Loop over multiple categorical columns and analyse each
    Runtime environment:
        python 3.7.10 or above
        - numpy
        - pandas
        - matplotlib
        - sklearn
        - scipy.stats
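
For orientation, a minimal usage sketch (an editor's illustration: the import path and sample values are assumed; the classes are defined in categorical_process.py below):

import pandas as pd
from categorical_process import CategorySelfDescribe, MutualCategoricalAnalyse  # assumed path

df = pd.DataFrame({'weather': ['sunny', 'rainy', 'sunny', 'cloudy'],
                   'temp':    ['high',  'low',   'high',  'low']})
print(CategorySelfDescribe.category_describe(df['weather']))    # categories and their count
print(CategorySelfDescribe.category_frequency(df['weather']))   # frequency table
print(MutualCategoricalAnalyse.chi2_independence(df['weather'], df['temp']))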


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/7/4 16:34
@Desc
"""


@ -0,0 +1,180 @@
# -*- coding: utf-8 -*-
# @Time : 2022/3/17 17:36
# @Author : Leng Yang
# @FileName: categorical_process.py
# @Software: PyCharm
import pandas as pd
import numpy as np
from sklearn import metrics
from scipy.stats import chi2_contingency, chi2
def test():
pass
class CategorySelfDescribe(object):
"""
    Descriptive statistics
"""
def __init__(self):
pass
@staticmethod
    def category_describe(data: pd.Series) -> pd.DataFrame:
        """
        Describe the categories contained in the column and the number of distinct categories.
        :param data: input data as a pd.Series
        :return: pd.DataFrame holding the list of category names and the number of distinct categories
        Examples
        --------
        (category values are illustrative placeholders; the originals were lost in extraction)
        >>> data1 = pd.DataFrame({'weather': ['sunny', 'rainy', 'sunny', 'cloudy']})
        >>> CategorySelfDescribe.category_describe(data1['weather'])
                       categories  types
        0  [sunny, rainy, cloudy]      3
        """
        # pd.DataFrame.append was removed in pandas 2.0; build the frame directly
        return pd.DataFrame([{'categories': data.unique(), 'types': len(data.unique())}])
@staticmethod
    def category_frequency(data: pd.Series) -> pd.DataFrame:
        """
        Frequency table.
        :param data: input data as a pd.Series
        :return: pd.DataFrame, the frequency table
        Examples
        --------
        (category values are illustrative placeholders; the originals were lost in extraction)
        >>> data1 = pd.DataFrame({'weather': ['sunny'] * 5 + ['rainy'] * 5 + ['cloudy'] * 4})
        >>> CategorySelfDescribe.category_frequency(data1['weather'])
          unique_values  count  frequency
        0         sunny      5   0.357143
        1         rainy      5   0.357143
        2        cloudy      4   0.285714
        """
        df_freq = data.value_counts(ascending=False).rename_axis('unique_values').reset_index(name='count')
        df_freq['frequency'] = df_freq['count'] / len(data)
        return df_freq
class CategorySelfAnalyse(object):
"""
    Statistical analysis of a single categorical column
"""
def __init__(self):
pass
@staticmethod
    def entropy(data: pd.Series) -> float:
        """
        Compute the information (Shannon) entropy.
        :param data: input data as a pd.Series
        :return: float, the information entropy
        """
        prob = data.value_counts() / len(data)  # pd.value_counts() was removed in pandas 2.x
        return sum(np.log2(prob) * prob * (-1))
class CategoryMutualDescribe(object):
"""
    Descriptive statistics for two different categorical columns
"""
def __init__(self):
pass
@staticmethod
    def crosstab(row_data: pd.Series, col_data: pd.Series) -> pd.DataFrame:
        """
        Contingency-table analysis of two categorical columns.
        :param row_data: categorical data 1, whose categories form the table rows
        :param col_data: categorical data 2, whose categories form the table columns
        :return: pd.DataFrame, the contingency table
        Examples
        --------
        (category values are illustrative placeholders; the originals were lost in extraction)
        >>> data1 = pd.DataFrame({'weather': ['sunny', 'rainy', 'sunny', 'cloudy'],
        ...                       'temp':    ['high',  'low',   'high',  'low']})
        >>> CategoryMutualDescribe.crosstab(data1['weather'], data1['temp'])
        temp     high  low
        weather
        cloudy      0    1
        rainy       0    1
        sunny       2    0
        """
        return pd.crosstab(row_data, col_data)
class MutualCategoricalAnalyse(object):
"""
    Statistical analysis of two categorical columns
"""
def __init__(self):
pass
@staticmethod
    def info_gain(df: pd.DataFrame, attr_col: str, data_col: str) -> float:
        """
        Compute the information gain Gain(D,A) = Ent(D) - Ent(D|A),
        i.e. the entropy reduction from splitting dataset D by feature A.
        :param df: input data as a DataFrame
        :param attr_col: column name of the splitting feature
        :param data_col: column name of the target data
        :return: float, the information gain
        """
        # e1: conditional entropy of the target within each feature value
        e1 = df.groupby(attr_col).apply(lambda x: CategorySelfAnalyse.entropy(x[data_col]))
        p1 = df[attr_col].value_counts() / len(df[attr_col])  # p(x)
        e2 = sum(e1 * p1)  # Ent(D|A)
        return CategorySelfAnalyse.entropy(df[data_col]) - e2
@staticmethod
def normalized_mutual_information(data1: pd.Series, data2: pd.Series) -> float:
"""
Mutual Information between two clusterings. The Mutual Information is a measure of the similarity
between two labels of the same data.
Normalized Mutual Information (NMI) is a normalization of the Mutual
Information (MI) score to scale the results between 0 (no mutual
information) and 1 (perfect correlation).
        :param data1: categorical data 1
        :param data2: categorical data 2
:return: nmi : float, score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
"""
return metrics.normalized_mutual_info_score(data1, data2)
@staticmethod
    def chi2_independence(data1: pd.Series, data2: pd.Series, alpha=0.05) -> pd.DataFrame:
        """
        Chi-square test of independence.
        :param alpha: significance level, used to determine the critical value
        :param data1: categorical data 1
        :param data2: categorical data 2
        :return: pd.DataFrame with the following columns:
            g: the chi-square statistic
            p: the p-value; if it is smaller than alpha, the null hypothesis can be rejected
            dof: degrees of freedom
            re: verdict flag, 1 = reject the null hypothesis, 0 = accept it
            expctd: expected (theoretical) values with the same shape as the input table
        """
        data = CategoryMutualDescribe.crosstab(data1, data2)
        g, p, dof, expctd = chi2_contingency(data)
        # pd.DataFrame.append was removed in pandas 2.0; build the frame directly
        result = pd.DataFrame([{'g': g, 'p': p, 'dof': dof, 'expctd': expctd}])
        if dof == 0:
            raise ValueError('The degrees of freedom should be at least 1')
        elif dof == 1:
            cv = chi2.isf(alpha * 0.5, dof)  # critical value
        else:
            cv = chi2.isf(alpha * 0.5, dof - 1)
        if g > cv:
            result.loc[0, 're'] = 1  # reject the null hypothesis
        else:
            result.loc[0, 're'] = 0  # accept the null hypothesis
        return result
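
As a quick sanity check of the entropy helper above (an editor's sketch, not part of the commit), the formula H(X) = -Σ p(x)·log2 p(x) gives 1 bit for a balanced two-class series:

import numpy as np
import pandas as pd

s = pd.Series(['a', 'a', 'b', 'b'])   # p(a) = p(b) = 0.5
prob = s.value_counts() / len(s)
print((-prob * np.log2(prob)).sum())  # 1.0, the maximum for two classes
print(CategorySelfAnalyse.entropy(s)) # same value via the class above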


@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File __init__.py.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/5 11:52
"""


@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File __init__.py.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/5 11:52
"""


@ -0,0 +1,127 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File producer_consumer.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/5 11:53
"""
import multiprocessing
from typing import Iterable, Callable
from tqdm import tqdm
class Stop:
pass
class AbstractPCConcurrencySystem:
"""
    @todo maintenance of the started processes
    @todo control of the number of processes
"""
def __init__(self, num_producer: int = 1, num_consumer: int = 1, num_callback: int = 0,
len_task_queue: int = 0, len_result_queue: int = 0, len_callback_queue: int = 0,
producer_lock=None, consumer_lock=None, callback_lock=None,
meta=None, enable_progressbar=False, num_total_result=None):
self.task_queue = multiprocessing.Queue(len_task_queue)
self.num_producer = num_producer
self.num_consumer = num_consumer
self.num_callback = num_callback
self.producer_lock = producer_lock or multiprocessing.Lock()
self.consumer_lock = consumer_lock or multiprocessing.Lock()
self.meta = meta
self.enable_progressbar = enable_progressbar
if enable_progressbar and self.num_callback == 0:
self.num_callback = 1
self.result_queue = multiprocessing.Queue(len_result_queue)
if self.num_callback:
self.callback_lock = callback_lock or multiprocessing.Lock()
self.num_total_result = num_total_result
self.callback_queue = multiprocessing.Queue(len_callback_queue)
def get_result(self):
return self.callback_queue.get()
def produce(self):
"""
        Must return either an iterable of task parameters, or a callable that
        returns one task per call and a Stop instance when it is finished.
"""
raise NotImplementedError
def consume(self, consumer_params):
"""
@return: task result or Stop()
"""
raise NotImplementedError
def callback(self, result):
return result
def _produce(self):
producer = self.produce()
if isinstance(producer, Iterable):
for params in producer:
self.task_queue.put(params, block=True)
stop = Stop()
for _ in range(self.num_consumer):
self.task_queue.put(stop, block=True)
        elif isinstance(producer, Callable):
            while True:
                task = producer()
                if isinstance(task, Stop):
                    break
                self.task_queue.put(task, block=True)
            # propagate the stop signal to every consumer, as in the iterable branch
            for _ in range(self.num_consumer):
                self.task_queue.put(Stop(), block=True)
def _consume(self):
consumer_params = self.task_queue.get(block=True)
while not isinstance(consumer_params, Stop):
info = self.consume(consumer_params)
self.result_queue.put(info)
consumer_params = self.task_queue.get(block=True)
self.result_queue.put(Stop())
def _callback(self):
if self.enable_progressbar:
bar = tqdm(total=self.num_total_result)
over_flag = 0
while over_flag < self.num_consumer:
result = self.result_queue.get(block=True)
if isinstance(result, Stop):
over_flag += 1
else:
callback = self.callback(result)
self.callback_queue.put(callback)
if self.enable_progressbar:
bar.update(1)
else:
if self.enable_progressbar:
bar.close()
def run(self):
consumers = []
callbackers = []
        # create and start the producers
for i in range(self.num_producer):
multiprocessing.Process(target=self._produce, name=f'producer_{i}').start()
        # create and start the consumers
for i in range(self.num_consumer):
p = multiprocessing.Process(target=self._consume, name=f'consumer_{i}')
consumers.append(p)
p.start()
        # handle the results
if self.num_callback:
for i in range(self.num_callback):
p = multiprocessing.Process(target=self._callback, name=f'callback_{i}')
callbackers.append(p)
p.start()
return self
def close(self):
self.task_queue.close()
self.result_queue.close()
self.callback_queue.close()
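
A minimal subclass sketch showing the intended usage of the class above (the Doubler example is the editor's illustration, not part of the commit):

class Doubler(AbstractPCConcurrencySystem):
    def produce(self):
        return range(10)   # an iterable of task parameters

    def consume(self, n):
        return n * 2       # runs in the consumer processes

if __name__ == '__main__':
    system = Doubler(num_consumer=2, num_callback=1).run()
    print(sorted(system.get_result() for _ in range(10)))  # [0, 2, ..., 18]
    system.close()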


@ -0,0 +1,28 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File distribute_task.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/8 16:55
"""
import math
import multiprocessing
def equally_distributing_task(target, tasks, *args, results=None, num_processors=8):
len_tasks = len(tasks)
process_offset = math.ceil(len_tasks / num_processors)
for i in range(num_processors):
sub_tasks = tasks[i * process_offset: (i + 1) * process_offset]
if sub_tasks:
if results:
multiprocessing.Process(target=target,
args=(sub_tasks, results, *args)).start()
else:
multiprocessing.Process(target=target,
args=(sub_tasks, *args)).start()
else:
break
return results
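
Usage sketch for the helper above (the worker function and the Manager list are illustrative assumptions):

from multiprocessing import Manager

def worker(sub_tasks, results):
    results.extend(t * t for t in sub_tasks)

if __name__ == '__main__':
    results = Manager().list()
    # note: the helper starts the workers but does not join them,
    # so wait for them to finish before reading `results`
    equally_distributing_task(worker, list(range(16)), results=results, num_processors=4)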


@ -0,0 +1,48 @@
CrimeRate,Youth,Southern,Education,ExpenditureYear0,LabourForce,Males,MoreMales,StateSize,YouthUnemployment,MatureUnemployment,HighYouthUnemploy,Wage,BelowWage,CrimeRate10,Youth10,Education10,ExpenditureYear10,LabourForce10,Males10,MoreMales10,StateSize10,YouthUnemploy10,MatureUnemploy10,HighYouthUnemploy10,Wage10,BelowWage10
45.5,135,0,12.4,69,540,965,0,6,80,22,1,564,139,26.5,135,12.5,71,564,974,0,6,82,20,1,632,142
52.3,140,0,10.9,55,535,1045,1,6,135,40,1,453,200,35.9,135,10.9,54,540,1039,1,7,138,39,1,521,210
56.6,157,1,11.2,47,512,962,0,22,97,34,0,288,276,37.1,153,11,44,529,959,0,24,98,33,0,359,256
60.3,139,1,11.9,46,480,968,0,19,135,53,0,457,249,42.7,139,11.8,41,497,983,0,20,131,50,0,510,235
64.2,126,0,12.2,106,599,989,0,40,78,25,1,593,171,46.7,125,12.2,97,602,989,0,42,79,24,1,660,162
67.6,128,0,13.5,67,624,972,0,28,77,25,1,507,206,47.9,128,13.8,60,621,983,0,28,81,24,1,571,199
70.5,130,0,14.1,63,641,984,0,14,70,21,1,486,196,50.6,153,14.1,57,641,993,0,14,71,23,1,556,176
73.2,143,0,12.9,66,537,977,0,10,114,35,1,487,166,55.9,143,13,63,549,973,0,11,119,36,1,561,168
75,141,0,12.9,56,523,968,0,4,107,37,0,489,170,61.8,153,12.9,54,538,968,0,5,110,36,1,550,126
78.1,133,0,11.4,51,599,1024,1,7,99,27,1,425,225,65.4,134,11.2,47,600,1024,1,7,97,28,1,499,215
79.8,142,1,12.9,45,533,969,0,18,94,33,0,318,250,71.4,142,13.1,44,552,969,0,19,93,36,0,378,247
82.3,123,0,12.5,97,526,948,0,113,124,50,0,572,158,75.4,134,12.4,87,529,949,0,117,125,49,0,639,146
83.1,135,0,13.6,62,595,986,0,22,77,27,0,529,190,77.3,137,13.7,61,599,993,0,23,80,28,0,591,189
84.9,121,0,13.2,118,547,964,0,25,84,29,0,689,126,78.6,132,13.3,115,538,968,0,25,82,30,0,742,127
85.6,166,1,11.4,58,521,973,0,46,72,26,0,396,237,80.6,153,11.2,54,543,983,0,47,76,25,1,568,246
88,140,0,12.9,71,632,1029,1,7,100,24,1,526,174,82.2,130,12.9,68,620,1024,1,8,104,25,1,570,182
92.3,126,0,12.7,74,602,984,0,34,102,33,1,557,195,87.5,134,12.9,67,599,982,0,33,107,34,1,621,199
94.3,130,0,13.3,128,536,934,0,51,78,34,0,627,135,92.9,127,13.3,128,530,949,0,52,79,33,0,692,140
95.3,125,0,12,90,586,964,0,97,105,43,0,617,163,94.1,134,11.9,81,571,971,0,99,106,41,0,679,162
96.8,151,1,10,58,510,950,0,33,108,41,0,394,261,96.2,161,10.1,56,515,1001,1,32,110,40,0,465,254
97.4,152,1,10.8,57,530,986,0,30,92,43,0,405,264,97.8,152,11,53,541,989,0,30,92,41,0,470,243
98.7,162,1,12.1,75,522,996,0,40,73,27,0,496,224,99.9,162,12,70,533,992,0,41,80,28,0,562,229
99.9,149,1,10.7,61,515,953,0,36,86,35,0,395,251,101.4,150,10.7,54,520,952,0,35,84,32,0,476,249
103,177,1,11,58,638,974,0,24,76,28,0,382,254,103.5,164,10.9,56,638,978,0,25,79,28,0,456,257
104.3,134,0,12.5,75,595,972,0,47,83,31,0,580,172,104.5,133,12.7,71,599,982,0,50,87,32,0,649,182
105.9,130,0,13.4,90,623,1049,1,3,113,40,0,588,160,106.4,153,13.4,91,622,1050,1,3,119,41,0,649,159
106.6,157,1,11.1,65,553,955,0,39,81,28,0,421,239,107.8,156,11.2,62,562,956,0,39,85,29,0,499,243
107.2,148,0,13.7,72,601,998,0,9,84,20,1,590,144,110.1,134,13.9,66,602,999,0,9,87,15,0,656,151
108.3,126,0,13.8,97,542,990,0,18,102,35,0,589,166,110.5,126,13.8,97,549,993,0,19,103,34,1,659,160
109.4,135,1,11.4,123,537,978,0,31,89,34,0,631,165,113.5,134,11.3,115,529,978,0,32,93,35,0,703,175
112.1,142,1,10.9,81,497,956,0,33,116,47,0,427,247,116.3,147,10.7,77,501,962,0,33,117,44,0,500,256
114.3,127,1,12.8,82,519,982,0,4,97,38,0,620,168,119.7,125,12.9,79,510,945,0,4,99,39,0,696,170
115.1,131,0,13.7,78,574,1038,1,7,142,42,1,540,176,124.5,134,13.6,73,581,1029,1,7,143,41,1,615,177
117.2,136,0,12.9,95,574,1012,1,29,111,37,1,622,162,127.8,140,13,96,581,1011,1,29,115,36,1,691,169
119.7,119,0,11.9,166,521,938,0,168,92,36,0,637,154,129.8,120,11.9,157,524,935,0,180,93,27,1,698,169
121.6,147,1,13.9,63,560,972,0,23,76,24,1,462,233,130.7,139,14,64,571,970,0,24,78,24,1,511,220
123.4,145,1,11.7,82,560,981,0,96,88,31,0,488,228,132.5,154,11.8,74,563,980,0,99,89,29,1,550,230
127.2,132,0,10.4,87,564,953,0,43,83,32,0,513,227,134.6,135,10.2,83,560,948,0,44,83,32,0,589,234
132.4,152,0,12,82,571,1018,1,10,103,28,1,537,215,137.5,151,12.1,76,567,1079,1,11,105,27,1,617,204
135.5,125,0,12.5,113,567,985,0,78,130,58,0,626,166,140.5,140,12.5,105,571,993,0,77,131,59,0,684,174
137.8,141,0,14.2,109,591,985,0,18,91,20,1,578,174,145.7,142,14.2,101,590,987,0,19,94,19,1,649,180
140.8,150,0,12,109,531,964,0,9,87,38,0,559,153,150.6,153,12,98,539,982,0,10,88,36,0,635,151
145.4,131,1,12.2,115,542,969,0,50,79,35,0,472,206,157.3,131,12.1,109,548,976,0,52,82,34,0,539,219
149.3,143,0,12.3,103,583,1012,1,13,96,36,0,557,194,162.7,142,12.2,95,612,1003,1,13,97,36,0,625,196
154.3,124,0,12.3,121,580,966,0,101,77,35,0,657,170,169.6,134,12.2,116,580,987,0,104,79,36,0,719,172
157.7,136,0,15.1,149,577,994,0,157,102,39,0,673,167,177.2,140,15.2,141,578,995,0,160,110,40,0,739,169
161.8,131,0,13.2,160,631,1071,1,3,102,41,0,674,152,178.2,132,13.2,143,632,1058,1,4,100,40,0,748,150


@ -0,0 +1,29 @@
# **Numerical data analysis and process tools**
### **Project Description**:
- Numerical data correlation analysis and processing, using image visualization to aid understanding.
#### Numerical analysis tools part
- Spearman_correlation determines whether there is a monotonic component between two features;
unlike Pearson's r, it can also be applied to non-linear (monotonic) relationships and to ordinal data.
#### Numerical process tools part
- Detecting outliers by using the interquartile range (IQR); a minimal sketch follows this file.
- Removing features that are highly correlated with other features.
#### How to use the tools
Input numerical-only data (data type: DataFrame).
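
For reference, the IQR rule mentioned above flags values outside [Q1 - 1.5·IQR, Q3 + 1.5·IQR]; a minimal sketch (sample values are made up):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
print(s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)])  # flags the outlier 100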


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/7/4 16:34
@Desc
"""


@ -0,0 +1,38 @@
import os
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import logging
logger = logging.getLogger(__name__)
def Spearman_rank_test(data_frame, feature_a, feature_b, save_path, file_name, sample_size=4000):
    """
    Spearman correlation determines whether there is a monotonic
    component between two features; it can be applied to non-linear
    (monotonic) relationships and to ordinal data.
    @param data_frame: Input DataFrame holding both features
    @param feature_a: First feature for Spearman's rank test
    @param feature_b: Second feature for Spearman's rank test
    @param sample_size: Sample size used to represent the population
    @param save_path: output path
    @param file_name: output name
    """
    a = data_frame[feature_a].sample(n=sample_size, random_state=1)
    b = data_frame[feature_b].sample(n=sample_size, random_state=1)
    coef, p = spearmanr(a, b)
    logger.info("Spearman's correlation coefficient is: " + str(coef))
    alpha = 0.05
    plt.scatter(a, b)
    plt.xlabel("Feature A")
    plt.ylabel("Feature B")
    plt.title("Spearman's Rank Test")
    plt.savefig(os.path.join(save_path, file_name))
    if p > alpha:
        logger.info("Features are uncorrelated (failed to reject H0) p=" + str(p))
    else:
        logger.info("Features have a monotonic relationship (reject H0) p=" + str(p))


@ -0,0 +1,155 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> correlation
@IDE PyCharm
@Author rengengchen
@Date 2022/7/4 16:48
@Desc
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr as scipy_spearmanr  # aliased so the wrapper below does not shadow it and recurse
def spearmanr(a: pd.Series, b: pd.Series = None, axis=0, nan_policy='propagate',
alternative='two-sided', sample_size=4000, random_state=None):
"""Calculate a Spearman correlation coefficient with associated p-value.
The Spearman rank-order correlation coefficient is a nonparametric measure
of the monotonicity of the relationship between two datasets. Unlike the
Pearson correlation, the Spearman correlation does not assume that both
datasets are normally distributed. Like other correlation coefficients,
this one varies between -1 and +1 with 0 implying no correlation.
Correlations of -1 or +1 imply an exact monotonic relationship. Positive
correlations imply that as x increases, so does y. Negative correlations
imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system
producing datasets that have a Spearman correlation at least as extreme
as the one computed from these datasets. The p-values are not entirely
reliable but are probably reasonable for datasets larger than 500 or so.
Parameters
----------
a, b : 1D or 2D array_like, b is optional
One or two 1-D or 2-D arrays containing multiple variables and
observations. When these are 1-D, each represents a vector of
observations of a single variable. For the behavior in the 2-D case,
see under ``axis``, below.
Both arrays need to have the same length in the ``axis`` dimension.
axis : int or None, optional
If axis=0 (default), then each column represents a variable, with
observations in the rows. If axis=1, the relationship is transposed:
each row represents a variable, while the columns contain observations.
If axis=None, then both arrays will be raveled.
nan_policy : {'propagate', 'raise', 'omit'}, optional
Defines how to handle when input contains nan.
The following options are available (default is 'propagate'):
* 'propagate': returns nan
* 'raise': throws an error
* 'omit': performs the calculations ignoring nan values
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis. Default is 'two-sided'.
The following options are available:
* 'two-sided': the correlation is nonzero
* 'less': the correlation is negative (less than zero)
* 'greater': the correlation is positive (greater than zero)
sample_size : int, optional
Number of items from column to return. Default is 4000.
random_state : int, array-like, BitGenerator, np.random.RandomState, optional
If int, array-like, or BitGenerator (NumPy>=1.17), seed for
random number generator
If np.random.RandomState, use as numpy RandomState object.
Returns
-------
correlation : float or ndarray (2-D square)
Spearman correlation matrix or correlation coefficient (if only 2
        variables are given as parameters). Correlation matrix is square with
length equal to total number of variables (columns or rows) in ``a``
and ``b`` combined.
pvalue : float
        The p-value for a hypothesis test whose null hypothesis
is that two sets of data are uncorrelated. See `alternative` above
for alternative hypotheses. `pvalue` has the same
shape as `correlation`.
References
----------
.. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
Probability and Statistics Tables and Formulae. Chapman & Hall: New
York. 2000.
Section 14.7
Examples
--------
>>> from scipy import stats
>>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])
SpearmanrResult(correlation=0.82078..., pvalue=0.08858...)
>>> rng = np.random.default_rng()
>>> x2n = rng.standard_normal((100, 2))
>>> y2n = rng.standard_normal((100, 2))
>>> stats.spearmanr(x2n)
SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
>>> stats.spearmanr(x2n[:,0], x2n[:,1])
SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
>>> rho, pval = stats.spearmanr(x2n, y2n)
>>> rho
array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
[-0.07960396, 1. , -0.14448245, 0.16738074],
[-0.08314431, -0.14448245, 1. , 0.03234323],
[ 0.09662166, 0.16738074, 0.03234323, 1. ]])
>>> pval
array([[0. , 0.43111687, 0.41084066, 0.33891628],
[0.43111687, 0. , 0.15151618, 0.09600687],
[0.41084066, 0.15151618, 0. , 0.74938561],
[0.33891628, 0.09600687, 0.74938561, 0. ]])
>>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1)
>>> rho
array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
[-0.07960396, 1. , -0.14448245, 0.16738074],
[-0.08314431, -0.14448245, 1. , 0.03234323],
[ 0.09662166, 0.16738074, 0.03234323, 1. ]])
>>> stats.spearmanr(x2n, y2n, axis=None)
SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
>>> stats.spearmanr(x2n.ravel(), y2n.ravel())
SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
>>> rng = np.random.default_rng()
>>> xint = rng.integers(10, size=(100, 2))
>>> stats.spearmanr(xint)
SpearmanrResult(correlation=0.09800224850707953, pvalue=0.3320271757932076)
"""
    # a = a.sample(n=sample_size, random_state=random_state)
    # if b is not None:
    #     b = b.sample(n=sample_size, random_state=random_state)
    return scipy_spearmanr(a, b, axis=axis, nan_policy=nan_policy, alternative=alternative)
def corr(df, method='pearson', drop=False, threshold=0, plot=True, filepath=None, figsize=None):
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
cmap = sns.diverging_palette(250, 15, s=95, l=40, n=9, center="light", as_cmap=True)
cov = df.corr(method=method)
if drop:
uncorr = ~np.any(np.abs(np.tril(cov, k=-1)) > threshold, axis=1)
cov = cov[uncorr]
cov = cov[cov.index]
if plot or filepath:
mask = np.triu(np.ones_like(cov, dtype=bool))
fig, ax = plt.subplots(figsize=figsize)
sns.heatmap(cov, mask=mask, center=0, annot=True, fmt='.2f', cmap=cmap, square=True, ax=ax)
plt.title("相关性矩阵")
if filepath:
plt.savefig(filepath)
if plot:
plt.show()
return cov
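
Usage sketch for corr() above (synthetic data; with drop=True the near-duplicate column should be removed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=['x', 'y', 'z'])
df['x2'] = df['x'] * 2 + rng.normal(scale=0.01, size=200)  # near-duplicate of x

reduced = corr(df, drop=True, threshold=0.9, plot=False)
print(reduced.columns.tolist())  # the x/x2 pair is reduced to one column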


@ -0,0 +1,48 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/3/25 9:09
# @Software : PyCharm
# @File : process_tool.py
# @Author : QT
# @Email : taoqimin@sics.ac.cn
import numpy as np
from tqdm import tqdm
import logging
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)
class NumericProcess:
@staticmethod
    def drop_feature(data_frame, thresh_hold):
        """
        Detect and drop highly correlated features.
        When two variables are highly correlated, they usually cause problems
        such as multicollinearity, so the correlated features are removed.
        @param data_frame: Input dataframe
        @param thresh_hold: A number between 0 and 1; a feature is dropped when its
            absolute correlation with another feature exceeds this level
        """
        matrix = data_frame.corr().abs()
        mask = np.triu(np.ones_like(matrix, dtype=bool))
        reduced_matrix = matrix.mask(mask)
        feature_drop = [c for c in tqdm(reduced_matrix) if
                        any(reduced_matrix[c] > thresh_hold)]
        data_frame.drop(feature_drop, axis=1, inplace=True)
        logger.info("The following features are dropped due to multicollinearity: " + str(feature_drop))
        return data_frame
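
Usage sketch (synthetic data; one column of the nearly identical pair is dropped):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'a': rng.normal(size=100)})
df['b'] = df['a'] + rng.normal(scale=0.01, size=100)  # almost identical to 'a'
df['c'] = rng.normal(size=100)

print(NumericProcess.drop_feature(df, 0.95).columns.tolist())  # 'a' is dropped, its pair 'b' is kept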


@ -0,0 +1,20 @@
Parse the configuration file and apply preprocessing (e.g. filling null values, sampling) before the data moves to the next step.
The following preprocessing features of the Pre-process Lib are complete so far:
- data_insight
    - DuplicateInsight - detection of duplicate data
    - NullInsight - detection of null values
    - ValidationInsight - data validity checks
- data_process
    - FilteringProcessor - data filtering
In addition:
- TypeInsight - not finished yet; the check for date values is still missing


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:40
@Desc
"""


@ -0,0 +1,133 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# file: data_insight
# author: shenwentao, wangkanglong
# description:
# date: 2022-03-30 16:45
# IDE: PyCharm
import pandas as pd
import datetime
from typing import List, Union
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype
from iod_data_analysis_tool.utils.assertion import assert_range
class DuplicateInsight:
@staticmethod
def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
"""
        User-defined count of duplicate rows
        :param data: source data
        :param subset: selected columns/fields, same as the subset parameter of pd.DataFrame.duplicated
        :param keep: which duplicates to mark, same as the keep parameter of pd.DataFrame.duplicated
        :return: the count result
"""
result = data.duplicated(subset, keep=keep).sum()
return pd.DataFrame([result], columns=['duplicate_num'])
class NullInsight:
@staticmethod
def num_null(data, column: str = None) -> pd.DataFrame:
"""
        User-defined count of null values in the data
        :param data: source data
        :param column: selected column/field
        :return: the count result
"""
if column is not None:
return pd.DataFrame([data[column].isna().sum()], columns=['null_num'], index=[column])
else:
return pd.DataFrame(data.isna().sum(), columns=['null_num'])
class ValidationInsight:
"""
    Custom validation of data validity (e.g. bad values in the data), limiting the allowed range per data type
"""
@staticmethod
def validation_continuous_range(data: pd.DataFrame, column: str,
min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
"""
        User-defined validation of continuous numerical data; counts values inside and outside the given range
        :param data: source data
        :param column: selected column/field
        :param min_val: lower bound of the range
        :param max_val: upper bound of the range
        :return: the count result
"""
assert_range(min_val, max_val)
nums = dict()
nums['column'] = column
nums['num_lt_min'] = data.query(f'{column} < {min_val}').shape[0]
nums['num_gt_max'] = data.query(f'{column} > {max_val}').shape[0]
nums['num_within_range'] = data.shape[0] - nums['num_lt_min'] - nums['num_gt_max']
return pd.DataFrame([nums], index=['result'])
@staticmethod
def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
"""
        User-defined validation of categorical data; counts values inside and outside the given set
        :param data: source data
        :param column: selected column/field
        :param values: user-defined set of allowed discrete values, i.e. the "range"
        :return: the count result
"""
nums = dict()
nums['column'] = column
nums['num_within_range'] = data[data[column].isin(values)].shape[0]
nums['num_out_range'] = len(data[column]) - nums['num_within_range']
return pd.DataFrame([nums], index=['result'])
@staticmethod
def validation_date_range(data, column: str, start_date: datetime.date,
end_date: datetime.date) -> pd.DataFrame:
"""
        User-defined validation of a date range; counts values inside and outside the range (assumes the data type is datetime.date)
        :param data: source data
        :param column: selected column/field
        :param start_date: start date
        :param end_date: end date
        :return: the count result
"""
assert_range(start_date, end_date)
nums = dict()
nums['column'] = column
nums['date_lt_start'] = sum(data[column] < start_date)
nums['date_gt_end'] = sum(data[column] > end_date)
        nums['date_within_range'] = data.shape[0] - nums['date_lt_start'] - nums['date_gt_end']
return pd.DataFrame([nums], index=['result'])
class TypeInsight:
"""
    Lets the user check whether the data's type matches what they expect
"""
    # TODO: a timestamp checker is still missing
_checkers = {
'int': is_integer_dtype,
'float': is_float_dtype,
'string': is_string_dtype,
'bool': is_bool_dtype,
'datetime': is_datetime64_dtype
}
@staticmethod
def type_check(data, column: str, check_type: str) -> pd.DataFrame:
"""
        Check whether a column's data type is the one the user expects
        :param data: source data
        :param column: selected column/field
        :param check_type: dtype to check, one of {'int', 'float', 'string', 'bool', 'datetime'}
        :return: the check result
"""
        flag = bool(TypeInsight._checkers[check_type](data[column]))
return pd.DataFrame([flag], columns=['result'], index=[column])
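
Usage sketch for the insight classes above (the sample frame is made up):

import pandas as pd

df = pd.DataFrame({'age': [25, -3, 40, 130, 40]})
print(DuplicateInsight.num_duplicate(df))  # one duplicated row (the second 40)
print(NullInsight.num_null(df))            # no nulls
print(ValidationInsight.validation_continuous_range(df, 'age', 0, 120))
# -> num_lt_min=1 (-3), num_gt_max=1 (130), num_within_range=3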


@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> normalizer
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:40
@Desc
"""
from scipy.stats import zscore as scipy_zscore
def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
    """
    Zi = (Xi - μ) / σ
    """
    return scipy_zscore(a, axis=axis, ddof=ddof, nan_policy=nan_policy)


@ -0,0 +1,51 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> outlierprocessing
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:24
@Desc
"""
from typing import Union
import pandas as pd
def MAD(data: pd.Series, n: float = 2.5, constant=1.4826, axis=0):
    """
    MAD = median(|Xi - median(X)|)
    Clips values farther than n * constant * MAD from the median.
    @return the clipped Series/DataFrame
    """
    x = data.median()
    MC = (data - x).abs().median()  # raw median absolute deviation
    MAD = MC * constant             # scaled to be comparable with sigma for normal data
    offset = n * MAD
    if isinstance(data, pd.DataFrame):
        return data.clip(lower=x - offset, upper=x + offset, axis=axis)
    else:
        return data.clip(lower=x - offset, upper=x + offset)
def three_sigma(data: pd.Series):
miu = data.mean()
sigma = data.std()
low = miu - 3 * sigma
up = miu + 3 * sigma
return data.index[(data < low) | (data > up)]
def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
q = data.quantile(q=[q1, q3])
IQR = q[q3] - q[q1]
lower_whisker_limit = q[q1] - k * IQR
upper_whisker_limit = q[q3] + k * IQR
return data.index[(data < lower_whisker_limit) | (data > upper_whisker_limit)]
def regex_match(data: pd.Series, *patterns):
pattern = '|'.join(patterns)
return data.index[data.astype(str).str.contains(pattern, regex=True)]
def empty(data: Union[pd.Series, pd.DataFrame]):
    # any(data.isnull()) would iterate over column labels for a DataFrame
    return bool(data.isnull().values.any())
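
A quick check of the outlier helpers above (values are made up):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
print(box_plot(s))     # Index([4]): position of the outlier 100
print(three_sigma(s))  # empty here; 3-sigma is far less sensitive on tiny samples
print(MAD(s))          # 100 is clipped toward the median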


@ -0,0 +1,24 @@
## Analysis methods for time-series data
--------
| Module | Methods |
| ---- | ---- |
| Basics | |
| Stationarity | |
| Anomaly detection | |
| Frequency detection | |
| Periodicity detection | |
| Others | |
### Basics
### Stationarity
### Anomaly detection
### Frequency detection
### Periodicity detection
### Others


@ -0,0 +1,26 @@
import pandas as pd
def describe_datetime_info(data: pd.Series, datetime_is_numeric: bool = False) -> pd.Series:
"""
    If the data are strings that do not include a date part, the date will be
    populated with today's date.
@param data: data
@param datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric. This affects statistics
calculated for the column. For DataFrame input, this also
controls whether datetime columns are included by default.
@return: Summary statistics of the Series.
    @example: Describing a datetime ``Series``.
>>> s = pd.read_csv()
>>> s.describe()
count 1427132
unique 25111
top 2022-04-26 09:25:00.260000
freq 32994
first 2022-04-26 09:25:00
last 2022-04-26 09:34:46.340000
Name: TradTime, dtype: object
"""
return pd.to_datetime(data).describe(datetime_is_numeric=datetime_is_numeric)


@ -0,0 +1,62 @@
from time_base import Time_base  # the base class defined in time_base.py
from statsmodels.tsa.stattools import adfuller
class stationaryTest(Time_base):
"""
    Stationarity tests for time series
"""
def __init__(self):
pass
def test_stationary(self, x, window_size):
"""
        Stationarity test for a time series
        x : time-series data
        window_size : window size
"""
x_ma = self.moving_average(x, window_size)
x_std = self.moving_std(x, window_size)
x_max = self.moving_max(x, window_size)
x_min = self.moving_min(x, window_size)
x_median = self.moving_median(x, window_size)
x_normalized = self.normalize(x)
x_ma_normalized = self.normalize(x_ma)
x_std_normalized = self.normalize(x_std)
x_max_normalized = self.normalize(x_max)
x_min_normalized = self.normalize(x_min)
x_median_normalized = self.normalize(x_median)
x_normalized_ma_normalized = self.normalize(x_normalized - x_ma_normalized)
x_normalized_std_normalized = self.normalize(x_normalized - x_std_normalized)
x_normalized_max_normalized = self.normalize(x_normalized - x_max_normalized)
x_normalized_min_normalized = self.normalize(x_normalized - x_min_normalized)
x_normalized_median_normalized = self.normalize(x_normalized - x_median_normalized)
x_normalized_ma_normalized_std_normalized = self.normalize(x_normalized_ma_normalized - x_std)
return x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized, x_min_normalized, x_median_normalized, x_normalized_ma_normalized, x_normalized_std_normalized, x_normalized_max_normalized, x_normalized_min_normalized, x_normalized_median_normalized, x_normalized_ma_normalized_std_normalized
def adf_test(self, x, window_size):
"""
        ADF stationarity test for a time series
        x : time-series data
        window_size : window size
"""
x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized, x_min_normalized, x_median_normalized, x_normalized_ma_normalized, x_normalized_std_normalized, x_normalized_max_normalized, x_normalized_min_normalized, x_normalized_median_normalized, x_normalized_ma_normalized_std_normalized = self.test_stationary(x, window_size)
        adf_test_normalized = adfuller(x_normalized)
        adf_test_ma_normalized = adfuller(x_ma_normalized)
        adf_test_std_normalized = adfuller(x_std_normalized)
        adf_test_max_normalized = adfuller(x_max_normalized)
        adf_test_min_normalized = adfuller(x_min_normalized)
        adf_test_median_normalized = adfuller(x_median_normalized)
        adf_test_normalized_ma_normalized = adfuller(x_normalized_ma_normalized)
        adf_test_normalized_std_normalized = adfuller(x_normalized_std_normalized)
        adf_test_normalized_max_normalized = adfuller(x_normalized_max_normalized)
        adf_test_normalized_min_normalized = adfuller(x_normalized_min_normalized)
return adf_test_normalized, adf_test_ma_normalized, adf_test_std_normalized, adf_test_max_normalized, adf_test_min_normalized, adf_test_median_normalized, adf_test_normalized_ma_normalized, adf_test_normalized_std_normalized, adf_test_normalized_max_normalized, adf_test_normalized_min_normalized


@ -0,0 +1,133 @@
import pandas as pd
import numpy as np
class Time_base(object):
    """
    Time-series basics module
    """
    def __init__(self):
        pass
    @staticmethod
    def normalize(x):
        """
        Normalize the time-series data
        x : time-series data
        """
        x = np.array(x)
        return np.log2(x / np.sqrt(np.sum(x**2)))
    @staticmethod
    def lag(x, lag):
        """
        Lag the series
        x : time-series data
        lag : number of periods to shift
        """
        return pd.Series(x).shift(lag)
    @staticmethod
    def moving_average(x, window_size):
        """
        Rolling mean
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).mean()
    @staticmethod
    def moving_median(x, window_size):
        """
        Rolling median
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).median()
    @staticmethod
    def moving_std(x, window_size):
        """
        Rolling standard deviation
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).std()
    @staticmethod
    def moving_max(x, window_size):
        """
        Rolling maximum
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).max()
    @staticmethod
    def moving_min(x, window_size):
        """
        Rolling minimum
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).min()
    @staticmethod
    def moving_sum(x, window_size):
        """
        Rolling sum
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).sum()
    @staticmethod
    def moving_quantile(x, window_size, quantile):
        """
        Rolling quantile
        x : time-series data
        window_size : window size
        quantile : quantile level
        """
        return pd.Series(x).rolling(window_size).quantile(quantile)
    @staticmethod
    def moving_corr(x, y, window_size):
        """
        Rolling correlation
        x : time-series data
        y : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).corr(pd.Series(y))
    @staticmethod
    def moving_cov(x, y, window_size):
        """
        Rolling covariance
        x : time-series data
        y : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).cov(pd.Series(y))
    @staticmethod
    def moving_skew(x, window_size):
        """
        Rolling skewness
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).skew()
    @staticmethod
    def moving_kurt(x, window_size):
        """
        Rolling kurtosis
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).kurt()


@ -0,0 +1,53 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> ID_code
@IDE PyCharm
@Author rengengchen
@Date 2022/5/17 16:00
@Desc
"""
import re
re_ID = re.compile(r'^\d{6}(?:18|19|20)?\d{2}(?:0[1-9]|1[012])(?:(?:[0-2][1-9])|10|20|30|31)\d{3}[0-9xX]$')
def validate_identity_code(code: str):
    """
    Validate the format of a Chinese resident ID number.
    :param code: the ID number to check
    :return: (passed, message)
    """
    city = {'11': "北京", '12': "天津", '13': "河北", '14': "山西", '15': "内蒙古", '21': "辽宁", '22': "吉林", '23': "黑龙江",
            '31': "上海", '32': "江苏", '33': "浙江", '34': "安徽", '35': "福建", '36': "江西", '37': "山东", '41': "河南", '42': "湖北",
            '43': "湖南", '44': "广东", '45': "广西", '46': "海南", '50': "重庆", '51': "四川", '52': "贵州", '53': "云南", '54': "西藏",
            '61': "陕西", '62': "甘肃", '63': "青海", '64': "宁夏", '65': "新疆", '71': "台湾", '81': "香港", '82': "澳门", '91': "国外"}
    tip = ""
    p = True
    if re_ID.match(code) is None:
        tip = "malformed ID number"
        p = False
    elif code[:2] not in city:  # indexing city[...] directly would raise KeyError
        tip = "invalid region code"
        p = False
    else:
        # an 18-digit ID ends with a check digit that must be verified
        if len(code) == 18:
            # check digit = parity[∑(ai × Wi) mod 11]
            # weighting factors
            factor = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
            # check digits
            parity = ['1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2']
            checksum = 0
            for i in range(17):
                checksum += int(code[i]) * factor[i]
            if parity[checksum % 11] != code[17].upper():
                tip = "check digit mismatch"
                p = False
    return p, tip
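
Usage sketch (the input is deliberately malformed, so the format branch fires):

ok, tip = validate_identity_code('123')
print(ok, tip)  # False malformed ID number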


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/5/17 15:59
@Desc
"""


@ -0,0 +1,97 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> timeutil
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:02
@Desc
"""
import datetime
import typing
from dateutil import parser
class cnparserinfo(parser.parserinfo):
"""
    Match Chinese date formats
    Usage:
from dateutil import parser
parser.parse('1998年12月11日 8点20分30秒', cnparserinfo())
"""
parser.parserinfo.JUMP.extend('年月日')
WEEKDAYS = [list(weekdays) for weekdays in parser.parserinfo.WEEKDAYS]
WEEKDAYS[0].extend(('星期一', '周一'))
WEEKDAYS[1].extend(('星期二', '周二'))
WEEKDAYS[2].extend(('星期三', '周三'))
WEEKDAYS[3].extend(('星期四', '周四'))
WEEKDAYS[4].extend(('星期五', '周五'))
WEEKDAYS[5].extend(('星期六', '周六'))
WEEKDAYS[6].extend(('星期天', '周日', '周天', '周末'))
WEEKDAYS = [tuple(weekdays) for weekdays in WEEKDAYS]
# MONTHS = [list(months) for months in parser.parserinfo.MONTHS]
# MONTHS[0].extend(('一月', '1月'))
# MONTHS[1].extend(('二月', '2月'))
# MONTHS[2].extend(('三月', '3月'))
# MONTHS[3].extend(('四月', '4月'))
# MONTHS[4].extend(('五月', '5月'))
# MONTHS[5].extend(('六月', '6月'))
# MONTHS[6].extend(('七月', '7月'))
# MONTHS[7].extend(('八月', '8月'))
# MONTHS[8].extend(('九月', '9月'))
# MONTHS[9].extend(('十月', '10月'))
# MONTHS[10].extend(('十一月', '11月'))
# MONTHS[11].extend(('十二月', '12月'))
# MONTHS = [tuple(months) for months in MONTHS]
HMS = [list(hms) for hms in parser.parserinfo.HMS]
HMS[0].extend('时点')
    HMS[1].append('分')
    HMS[2].append('秒')
HMS = [tuple(hms) for hms in HMS]
AMPM = [list(ampm) for ampm in parser.parserinfo.AMPM]
AMPM[0].append('上午')
AMPM[1].append('下午')
AMPM = [tuple(ampm) for ampm in AMPM]
def __init__(self, dayfirst=False, yearfirst=False):
super().__init__(dayfirst, yearfirst)
def utctimestamp():
    """
    @return: current Unix timestamp (UTC)
    """
    # utcnow() is naive, so its .timestamp() would be interpreted as local time
    return int(datetime.datetime.now(datetime.timezone.utc).timestamp())
def timestamp2datetime(ts: float):
return datetime.datetime.fromtimestamp(ts)
def timestamp2str(ts: float, fmt: str = '%F %H:%M:%S'):
"""
@param ts: timestamp
@param fmt: format
"""
return datetime.datetime.strftime(timestamp2datetime(ts), fmt)
cnparser = cnparserinfo()
def str2datetime(datetime_str: str, fmt: str = None):
if fmt:
return datetime.datetime.strptime(datetime_str, fmt)
return parser.parse(datetime_str, cnparser)
def int2date(date_int: int):
return str2datetime(str(date_int), '%Y%m%d')
def date2int(a: typing.Union[datetime.datetime, datetime.date]):
return int(a.strftime('%Y%m%d'))


@ -0,0 +1,81 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> file_util
@IDE PyCharm
@Author rengengchen
@Date 2022/5/10 17:21
@Desc
"""
import os
import queue
import shutil
import paramiko
def list_files(dir_paths):
files = []
for root, dir_path, filepath in walk(dir_paths):
if filepath:
files.append(os.path.join(root, filepath))
return files
def walk(dir_paths):
    dir_queue = queue.Queue()
    if isinstance(dir_paths, str):
        dir_paths = [dir_paths]
    for dir_path in dir_paths:
        dir_queue.put(dir_path)
    while not dir_queue.empty():
        dirname = dir_queue.get()
        for root, dirs, files in os.walk(dirname):
            for subdir in dirs:
                dir_queue.put(os.path.join(root, subdir))
                yield root, subdir, None
            for filename in files:
                yield root, None, filename
            # subdirectories are walked via the queue, so prune them here to
            # stop os.walk from descending into them a second time
            dirs.clear()
def copy(s, t):
    if os.path.isfile(s):
        shutil.copy(s, t)
    else:
        if not os.path.exists(t):
            os.mkdir(t)
        s = os.path.abspath(s)
        t = os.path.abspath(t)
        for root, dirname, filename in walk(s):
            if dirname:
                # mirror the directory tree relative to the source root
                os.makedirs(os.path.join(root.replace(s, t), dirname), exist_ok=True)
            else:
                shutil.copy(os.path.join(root, filename), os.path.join(root.replace(s, t), filename))
class RemoteFileUtil:
def __init__(self, ip, username, password, port=22, local_dir=None, remote_dir=None):
tran = paramiko.Transport((ip, port))
tran.connect(username=username, password=password)
        self.sftp = paramiko.SFTPClient.from_transport(tran)
self.local_dir = local_dir
self.remote_dir = remote_dir
def ls(self, remote_dir=None):
if remote_dir is None:
remote_dir = self.remote_dir
return self.sftp.listdir_attr(remote_dir)
def upload_file(self, local_filepath=None, remote_filepath=None, filename=None):
if local_filepath is None:
local_filepath = os.path.join(self.local_dir, filename)
if remote_filepath is None:
remote_filepath = os.path.join(self.remote_dir, filename)
self.sftp.put(local_filepath, remote_filepath)
def download_file(self, local_filepath=None, remote_filepath=None, filename=None):
if local_filepath is None:
local_filepath = os.path.join(self.local_dir, filename)
if remote_filepath is None:
remote_filepath = os.path.join(self.remote_dir, filename)
self.sftp.get(remote_filepath, local_filepath)


@ -0,0 +1,82 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> pd_util
@IDE PyCharm
@Author rengengchen
@Date 2022/7/13 11:00
@Desc
"""
from __future__ import annotations
import os
from typing import Callable, Hashable, Sequence
import pandas as pd
from pandas._typing import CompressionOptions, FilePath, IndexLabel, StorageOptions, WriteBuffer
from pandas.core.generic import bool_t
class to_same_csv:
def __init__(self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | None = None,
                 columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = False,
                 index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str = 'utf8',
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
line_terminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: str = "strict",
storage_options: StorageOptions = None,
prepare: Callable = None):
self.not_first = False
self.mode = mode
if self.mode == 'a' and isinstance(path_or_buf, str) and os.path.exists(path_or_buf):
header = False
self.header = header
self.prepare = prepare
self.kwargs = {'path_or_buf': path_or_buf,
'sep': sep,
'na_rep': na_rep,
'float_format': float_format,
'columns': columns,
'index': index,
'index_label': index_label,
'encoding': encoding,
'compression': compression,
'quoting': quoting,
'quotechar': quotechar,
'line_terminator': line_terminator,
'chunksize': chunksize,
'date_format': date_format,
'doublequote': doublequote,
'escapechar': escapechar,
'decimal': decimal,
'errors': errors,
'storage_options': storage_options}
    def __call__(self, df_or_series: pd.Series | pd.DataFrame):
        if self.not_first:
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
        else:
            # first write: run the optional prepare hook once, then switch to appending
            if self.prepare:
                result = self.prepare(df_or_series)
                if result:
                    df_or_series = result
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
            self.mode = 'a'
            self.header = False
            self.not_first = True
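
Usage sketch for to_same_csv above: stream chunks into a single CSV, writing the header only once (the file names are placeholders):

import pandas as pd

writer = to_same_csv('combined.csv', index=False)
for chunk in pd.read_csv('big_input.csv', chunksize=10_000):
    writer(chunk)  # the first call writes the header, later calls append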


@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> phone_util
@IDE PyCharm
@Author rengengchen
@Date 2022/5/17 15:59
@Desc
"""
import re
re_phone = re.compile(r'^(?:(?:13[0-9])'
r'|(?:14(?:0|[5-7]|9))'
r'|(?:15(?:[0-3]|[5-9]))'
r'|(?:16(?:2|[5-7]))'
r'|(?:17[0-8])'
r'|(?:18[0-9])'
r'|(?:19(?:[0-3]|[5-9])))\d{8}$')
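
A quick check of the pattern (numbers are made up):

print(bool(re_phone.match('13812345678')))  # True: valid 138 prefix + 8 digits
print(bool(re_phone.match('12345678901')))  # False: 123 is not a valid prefix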


@ -0,0 +1,61 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File project_util.py
@IDE PyCharm
@Author rengengchen
@Time 2022/9/15 9:45
"""
import compileall
import os
import re
import shutil
from os.path import join
from lib.analysis_package.utils.file_util import walk
re_pyc = re.compile(r'cpython-\d+\.')
def compile_project(source, target=None):
    """
    Compile the project into pyc files in the given directory.
    @param source: project path
    @param target: directory for the compiled files
    """
    source = os.path.abspath(source)
    if target is None:
        target = source
    else:
        target = os.path.abspath(target)
    compileall.compile_dir(source)
    pycache_paths = set()
    if target == source:
        for root, dirname, filename in walk(source):
            if not filename:
                continue  # directory entries carry no file to move
            if root[-11:] == '__pycache__':
                pycache_paths.add(root)
                shutil.move(join(root, filename), join(root, '../', re_pyc.sub('', filename)))
            elif filename.endswith('py'):
                os.remove(join(root, filename))
    else:
        len_t = len(target)
        for root, dirname, filename in walk(source):
            t_root = root.replace(source, target)
            if target == root[:len_t]:
                continue
            if dirname:
                if dirname != '__pycache__':
                    t_root = join(t_root, dirname)
                    if not os.path.exists(t_root) and join(source, dirname) != target:
                        os.makedirs(t_root)
            elif not filename.endswith('py'):
                if root[-11:] == '__pycache__':
                    pycache_paths.add(root)
                    t_root = t_root[:-11]
                shutil.move(join(root, filename), join(t_root, re_pyc.sub('', filename)))
            else:
                shutil.copyfile(join(root, filename), join(t_root, filename))
    for p in pycache_paths:
        os.rmdir(p)

Binary file not shown.

lib/package_project.py Normal file

@ -0,0 +1,14 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File scrapyproject -> package_project
@IDE PyCharm
@Author rengengchen
@Date 2021/5/12 10:46
@Desc
"""
import shutil
import subprocess
subprocess.call(['python', 'setup.py', 'bdist_wheel'])  # list form works outside Windows too
shutil.rmtree(r'build')
shutil.rmtree(r'analysis_package.egg-info')

lib/setup.py Normal file

@ -0,0 +1,36 @@
# coding:utf-8
from setuptools import setup, find_packages
PACKAGE = "analysis_package"
NAME = "analysis_package"
DESCRIPTION = "general analysis function"
AUTHOR = "iod"
AUTHOR_EMAIL = "rengengchen@sics.ac.cn"
URL = ""
VERSION = '0.1.3'
setup(
name=NAME,
version=VERSION,
description=DESCRIPTION,
author=AUTHOR,
author_email=AUTHOR_EMAIL,
license="BSD",
url=URL,
include_package_data=True,
packages=find_packages(),
classifiers=[
'Programming Language :: Python',
'Operating System :: OS Independent',
],
install_requires=[
'pandas',
'scipy',
'numpy',
'matplotlib',
'seaborn',
'tqdm',
'scikit-learn',
],
zip_safe=False,
)