init
commit 707997d4e1
@ -0,0 +1,8 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
|
@ -0,0 +1,12 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||
<option name="ignoredErrors">
|
||||
<list>
|
||||
<option value="N802" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
|
@ -0,0 +1,6 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/utils.iml" filepath="$PROJECT_DIR$/.idea/utils.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,5 @@
|
|||
Module responsibilities:

1. continuous: feature analysis for numerical data
2. categorical: feature analysis for discrete (categorical) data
3. timeseries: analysis methods for time-series data
4. pre-process: parse the configuration file and apply some preprocessing (e.g. filling null values, sampling) before the data moves on to the next step
|
|
@ -0,0 +1,9 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
'''
|
||||
@Project :IoD_data_analysis_tool
|
||||
@File :__init__.py.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2022/8/3 17:07
|
||||
'''
|
|
@ -0,0 +1,30 @@
|
|||
Categorical module:
Feature analysis for discrete (categorical) data

Analysis methods:

1> Descriptive statistics:
- the categories contained in the column
- the number of categories
- frequency table
- contingency table

2> Chi-square test of independence

3> Information entropy

4> Mutual information

Features:

Loop over multiple categorical columns and analyse each of them

Runtime environment:
Python 3.7.10 or later
- numpy
- pandas
- matplotlib
- sklearn
- scipy.stats
|
|
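A minimal usage sketch of the classes defined in categorical_process.py below; the import path is an assumption (the diff only shows the file name), and the toy data are made up for illustration:

```python
import pandas as pd

# assumed import path; adjust to wherever categorical_process.py lives in the package
from categorical_process import (CategorySelfDescribe, CategorySelfAnalyse,
                                 CategoryMutualDescribe, MutualCategoricalAnalyse)

df = pd.DataFrame({'weather': ['sunny', 'sunny', 'overcast', 'rain', 'rain', 'overcast'],
                   'temperature': ['high', 'high', 'high', 'low', 'low', 'low']})

print(CategorySelfDescribe.category_describe(df['weather']))    # categories and their count
print(CategorySelfDescribe.category_frequency(df['weather']))   # frequency table
print(CategorySelfAnalyse.entropy(df['weather']))               # information entropy
print(CategoryMutualDescribe.crosstab(df['weather'], df['temperature']))   # contingency table
print(MutualCategoricalAnalyse.info_gain(df, 'weather', 'temperature'))    # information gain
print(MutualCategoricalAnalyse.chi2_independence(df['weather'], df['temperature']))
```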
@ -0,0 +1,8 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> __init__.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/7/4 16:34
|
||||
@Desc :
|
||||
"""
|
|
@ -0,0 +1,180 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2022/3/17 17:36
|
||||
# @Author : Leng Yang
|
||||
# @FileName: categorical_process.py
|
||||
# @Software: PyCharm
|
||||
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn import metrics
|
||||
from scipy.stats import chi2_contingency, chi2
|
||||
|
||||
|
||||
def test():
|
||||
pass
|
||||
|
||||
|
||||
class CategorySelfDescribe(object):
|
||||
"""
|
||||
描述性统计量
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def category_describe(data: pd.Series) -> pd.DataFrame:
|
||||
"""
|
||||
描述该列数据包含的分类名称和分类种类数量
|
||||
:param data: 输入数据,格式为pd.Series
|
||||
:return: pd.DataFrame, 返回dataframe形式包含分类名称列表和分类种类数量
|
||||
Examples
|
||||
--------
|
||||
>>> data1 = pd.DataFrame({'天气':['晴','晴','阴','雨'], '温度':['高','高','高','低']})
|
||||
>>> CategorySelfDescribe().category_describe(data1['天气'])
|
||||
categories types
|
||||
0 [晴, 阴, 雨] 3.0
|
||||
"""
|
||||
# build the result frame directly (DataFrame.append was removed in pandas 2.0)
results = pd.DataFrame([{'categories': data.unique(), 'types': len(data.unique())}])
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def category_frequency(data: pd.Series) -> pd.DataFrame:
|
||||
"""
|
||||
频数表
|
||||
:param data: 输入数据,格式为pd.Series
|
||||
:return: pd.DataFrame, 返回频数表
|
||||
Examples
|
||||
--------
|
||||
>>> data1 = pd.DataFrame({'天气':['晴','晴','阴','雨','雨','雨','阴','晴','晴','雨','晴','阴','阴','雨'],
|
||||
'温度':['高','高','高','低','低','低','低','低','低','低','低','低','高','低']})
|
||||
>>> CategorySelfDescribe().category_frequency(data1['天气'])
|
||||
unique_values count frequency
|
||||
0 晴 5 0.357143
|
||||
1 雨 5 0.357143
|
||||
2 阴 4 0.285714
|
||||
|
||||
"""
|
||||
df_freq = data.value_counts(ascending=False).rename_axis('unique_values').reset_index(name='count')
|
||||
df_freq['frequency'] = df_freq['count'] / len(data)
|
||||
return df_freq
|
||||
|
||||
|
||||
class CategorySelfAnalyse(object):
|
||||
"""
|
||||
对单列分类数据进行统计分析
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def entropy(data: pd.Series) -> float:
|
||||
"""
|
||||
计算信息熵
|
||||
:param data: 输入数据,格式为pd.Series
|
||||
:return: float, 信息熵
|
||||
"""
|
||||
prob = pd.value_counts(data) / len(data)
|
||||
return sum(np.log2(prob) * prob * (-1))
|
||||
|
||||
|
||||
class CategoryMutualDescribe(object):
|
||||
"""
|
||||
对两列不同的分类数据进行描述性统计
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def crosstab(row_data: pd.Series, col_data: pd.Series) -> pd.DataFrame:
|
||||
"""
|
||||
对两列不同的分类数据进行列联表分析
|
||||
:param row_data: categorical数据1, 数据1分类作为列联表的行
|
||||
:param col_data: categorical数据2, 数据2分类作为列联表的列
|
||||
:return: pd.DataFrame, 列联表
|
||||
Examples
|
||||
--------
|
||||
>>> data1 = pd.DataFrame({'天气':['晴','晴','阴','雨'], '温度':['高','高','高','低']})
|
||||
>>> CategoryMutualDescribe().crosstab(data1['天气'], data1['温度'])
|
||||
温度 高 低
|
||||
天气
|
||||
晴 2 0
|
||||
阴 1 0
|
||||
雨 0 1
|
||||
"""
|
||||
return pd.crosstab(row_data, col_data)
|
||||
|
||||
|
||||
class MutualCategoricalAnalyse(object):
|
||||
"""
|
||||
对两列分类数据进行统计分析
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def info_gain(df: pd.DataFrame, attr_col: str, data_col: str) -> float:
|
||||
"""
|
||||
计算信息增益: Gain(D,A) = Ent(D) - Ent(D|A)
|
||||
使用某个特征A划分数据集D
|
||||
:param df: 输入数据,格式为dataframe
|
||||
:param attr_col: 特征数据列名
|
||||
:param data_col: 数据集列名
|
||||
:return: float, 信息增益
|
||||
"""
|
||||
# e: 条件信息熵
|
||||
e1 = df.groupby(attr_col).apply(lambda x: CategorySelfAnalyse.entropy(x[data_col]))
|
||||
p1 = pd.value_counts(df[attr_col]) / len(df[attr_col]) # p(x)
|
||||
e2 = sum(e1 * p1) # Ent(D|A)
|
||||
return CategorySelfAnalyse.entropy(df[data_col]) - e2
|
||||
|
||||
@staticmethod
|
||||
def normalized_mutual_information(data1: pd.Series, data2: pd.Series) -> float:
|
||||
"""
|
||||
Mutual Information between two clusterings. The Mutual Information is a measure of the similarity
|
||||
between two labels of the same data.
|
||||
Normalized Mutual Information (NMI) is a normalization of the Mutual
|
||||
Information (MI) score to scale the results between 0 (no mutual
|
||||
information) and 1 (perfect correlation).
|
||||
|
||||
:param data1: 分类数据1
|
||||
:param data2: 分类数据2
|
||||
:return: nmi : float, score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
|
||||
"""
|
||||
return metrics.normalized_mutual_info_score(data1, data2)
|
||||
|
||||
@staticmethod
|
||||
def chi2_independence(data1: pd.Series, data2: pd.Series, alpha=0.05) -> pd.DataFrame:
|
||||
"""
|
||||
卡方独立性检验
|
||||
:param alpha: 置信度,用来确定临界值
|
||||
:param data1: categorical数据1
|
||||
:param data2: categorical数据2
|
||||
:return: pd.DataFrame,内容如下:
|
||||
g: 卡方值,也就是统计量
|
||||
p: P值(统计学名词),与置信度对比,也可进行假设检验,P值小于置信度,即可拒绝原假设
|
||||
dof: 自由度
|
||||
re: 判读变量,1表示拒绝原假设,0表示接受原假设
|
||||
expctd: 原数据数组同维度的对应理论值
|
||||
"""
|
||||
data = CategoryMutualDescribe.crosstab(data1, data2)
|
||||
g, p, dof, expctd = chi2_contingency(data)
# build the result frame directly (DataFrame.append was removed in pandas 2.0)
result = pd.DataFrame([{'g': g, 'p': p, 'dof': dof, 'expctd': expctd}])
|
||||
if dof == 0:
|
||||
raise ValueError('自由度应该大于等于1')
|
||||
elif dof == 1:
|
||||
cv = chi2.isf(alpha * 0.5, dof) # critical value
|
||||
else:
|
||||
cv = chi2.isf(alpha * 0.5, dof - 1)
|
||||
|
||||
if g > cv:
|
||||
result.loc[0, 're'] = 1 # 表示拒绝原假设
|
||||
else:
|
||||
result.loc[0, 're'] = 0 # 表示接受原假设
|
||||
return result
|
|
@ -0,0 +1,9 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :IoD_data_analysis_tool
|
||||
@File :__init__.py.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2022/8/5 11:52
|
||||
"""
|
|
@ -0,0 +1,9 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :IoD_data_analysis_tool
|
||||
@File :__init__.py.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2022/8/5 11:52
|
||||
"""
|
|
@ -0,0 +1,127 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :IoD_data_analysis_tool
|
||||
@File :producer_consumer.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2022/8/5 11:53
|
||||
"""
|
||||
import multiprocessing
|
||||
from typing import Iterable, Callable
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
class Stop:
|
||||
pass
|
||||
|
||||
|
||||
class AbstractPCConcurrencySystem:
|
||||
"""
|
||||
@todo 对启动进程的维护
|
||||
@todo 进程数量
|
||||
"""
|
||||
|
||||
def __init__(self, num_producer: int = 1, num_consumer: int = 1, num_callback: int = 0,
|
||||
len_task_queue: int = 0, len_result_queue: int = 0, len_callback_queue: int = 0,
|
||||
producer_lock=None, consumer_lock=None, callback_lock=None,
|
||||
meta=None, enable_progressbar=False, num_total_result=None):
|
||||
self.task_queue = multiprocessing.Queue(len_task_queue)
|
||||
|
||||
self.num_producer = num_producer
|
||||
self.num_consumer = num_consumer
|
||||
self.num_callback = num_callback
|
||||
self.producer_lock = producer_lock or multiprocessing.Lock()
|
||||
self.consumer_lock = consumer_lock or multiprocessing.Lock()
|
||||
self.meta = meta
|
||||
self.enable_progressbar = enable_progressbar
|
||||
if enable_progressbar and self.num_callback == 0:
|
||||
self.num_callback = 1
|
||||
self.result_queue = multiprocessing.Queue(len_result_queue)
|
||||
if self.num_callback:
|
||||
self.callback_lock = callback_lock or multiprocessing.Lock()
|
||||
self.num_total_result = num_total_result
|
||||
self.callback_queue = multiprocessing.Queue(len_callback_queue)
|
||||
|
||||
def get_result(self):
|
||||
return self.callback_queue.get()
|
||||
|
||||
def produce(self):
|
||||
"""
|
||||
Must return an iterable object or a Stop object.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def consume(self, consumer_params):
|
||||
"""
|
||||
@return: task result or Stop()
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def callback(self, result):
|
||||
return result
|
||||
|
||||
def _produce(self):
|
||||
producer = self.produce()
|
||||
if isinstance(producer, Iterable):
|
||||
for params in producer:
|
||||
self.task_queue.put(params, block=True)
|
||||
stop = Stop()
|
||||
for _ in range(self.num_consumer):
|
||||
self.task_queue.put(stop, block=True)
|
||||
elif isinstance(producer, Callable):
|
||||
while True:
task = producer()
if isinstance(task, Stop):
break
self.task_queue.put(task, block=True)
# mirror the iterable branch: signal every consumer to stop once production ends
for _ in range(self.num_consumer):
self.task_queue.put(Stop(), block=True)
|
||||
|
||||
def _consume(self):
|
||||
consumer_params = self.task_queue.get(block=True)
|
||||
while not isinstance(consumer_params, Stop):
|
||||
info = self.consume(consumer_params)
|
||||
self.result_queue.put(info)
|
||||
consumer_params = self.task_queue.get(block=True)
|
||||
self.result_queue.put(Stop())
|
||||
|
||||
def _callback(self):
|
||||
if self.enable_progressbar:
|
||||
bar = tqdm(total=self.num_total_result)
|
||||
over_flag = 0
|
||||
while over_flag < self.num_consumer:
|
||||
result = self.result_queue.get(block=True)
|
||||
if isinstance(result, Stop):
|
||||
over_flag += 1
|
||||
else:
|
||||
callback = self.callback(result)
|
||||
self.callback_queue.put(callback)
|
||||
if self.enable_progressbar:
|
||||
bar.update(1)
|
||||
else:
|
||||
if self.enable_progressbar:
|
||||
bar.close()
|
||||
|
||||
def run(self):
|
||||
consumers = []
|
||||
callbackers = []
|
||||
# 创建并启动生产者
|
||||
for i in range(self.num_producer):
|
||||
multiprocessing.Process(target=self._produce, name=f'producer_{i}').start()
|
||||
# 创建并启动消费者
|
||||
for i in range(self.num_consumer):
|
||||
p = multiprocessing.Process(target=self._consume, name=f'consumer_{i}')
|
||||
consumers.append(p)
|
||||
p.start()
|
||||
# 处理结果
|
||||
if self.num_callback:
|
||||
for i in range(self.num_callback):
|
||||
p = multiprocessing.Process(target=self._callback, name=f'callback_{i}')
|
||||
callbackers.append(p)
|
||||
p.start()
|
||||
return self
|
||||
|
||||
def close(self):
|
||||
self.task_queue.close()
|
||||
self.result_queue.close()
|
||||
self.callback_queue.close()
|
|
@ -0,0 +1,28 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :IoD_data_analysis_tool
|
||||
@File :distribute_task.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2022/8/8 16:55
|
||||
"""
|
||||
import math
|
||||
import multiprocessing
|
||||
|
||||
|
||||
def equally_distributing_task(target, tasks, *args, results=None, num_processors=8):
|
||||
len_tasks = len(tasks)
|
||||
process_offset = math.ceil(len_tasks / num_processors)
|
||||
for i in range(num_processors):
|
||||
sub_tasks = tasks[i * process_offset: (i + 1) * process_offset]
|
||||
if sub_tasks:
|
||||
if results:
|
||||
multiprocessing.Process(target=target,
|
||||
args=(sub_tasks, results, *args)).start()
|
||||
else:
|
||||
multiprocessing.Process(target=target,
|
||||
args=(sub_tasks, *args)).start()
|
||||
else:
|
||||
break
|
||||
return results
|
|
@ -0,0 +1,48 @@
|
|||
CrimeRate,Youth,Southern,Education,ExpenditureYear0,LabourForce,Males,MoreMales,StateSize,YouthUnemployment,MatureUnemployment,HighYouthUnemploy,Wage,BelowWage,CrimeRate10,Youth10,Education10,ExpenditureYear10,LabourForce10,Males10,MoreMales10,StateSize10,YouthUnemploy10,MatureUnemploy10,HighYouthUnemploy10,Wage10,BelowWage10
|
||||
45.5,135,0,12.4,69,540,965,0,6,80,22,1,564,139,26.5,135,12.5,71,564,974,0,6,82,20,1,632,142
|
||||
52.3,140,0,10.9,55,535,1045,1,6,135,40,1,453,200,35.9,135,10.9,54,540,1039,1,7,138,39,1,521,210
|
||||
56.6,157,1,11.2,47,512,962,0,22,97,34,0,288,276,37.1,153,11,44,529,959,0,24,98,33,0,359,256
|
||||
60.3,139,1,11.9,46,480,968,0,19,135,53,0,457,249,42.7,139,11.8,41,497,983,0,20,131,50,0,510,235
|
||||
64.2,126,0,12.2,106,599,989,0,40,78,25,1,593,171,46.7,125,12.2,97,602,989,0,42,79,24,1,660,162
|
||||
67.6,128,0,13.5,67,624,972,0,28,77,25,1,507,206,47.9,128,13.8,60,621,983,0,28,81,24,1,571,199
|
||||
70.5,130,0,14.1,63,641,984,0,14,70,21,1,486,196,50.6,153,14.1,57,641,993,0,14,71,23,1,556,176
|
||||
73.2,143,0,12.9,66,537,977,0,10,114,35,1,487,166,55.9,143,13,63,549,973,0,11,119,36,1,561,168
|
||||
75,141,0,12.9,56,523,968,0,4,107,37,0,489,170,61.8,153,12.9,54,538,968,0,5,110,36,1,550,126
|
||||
78.1,133,0,11.4,51,599,1024,1,7,99,27,1,425,225,65.4,134,11.2,47,600,1024,1,7,97,28,1,499,215
|
||||
79.8,142,1,12.9,45,533,969,0,18,94,33,0,318,250,71.4,142,13.1,44,552,969,0,19,93,36,0,378,247
|
||||
82.3,123,0,12.5,97,526,948,0,113,124,50,0,572,158,75.4,134,12.4,87,529,949,0,117,125,49,0,639,146
|
||||
83.1,135,0,13.6,62,595,986,0,22,77,27,0,529,190,77.3,137,13.7,61,599,993,0,23,80,28,0,591,189
|
||||
84.9,121,0,13.2,118,547,964,0,25,84,29,0,689,126,78.6,132,13.3,115,538,968,0,25,82,30,0,742,127
|
||||
85.6,166,1,11.4,58,521,973,0,46,72,26,0,396,237,80.6,153,11.2,54,543,983,0,47,76,25,1,568,246
|
||||
88,140,0,12.9,71,632,1029,1,7,100,24,1,526,174,82.2,130,12.9,68,620,1024,1,8,104,25,1,570,182
|
||||
92.3,126,0,12.7,74,602,984,0,34,102,33,1,557,195,87.5,134,12.9,67,599,982,0,33,107,34,1,621,199
|
||||
94.3,130,0,13.3,128,536,934,0,51,78,34,0,627,135,92.9,127,13.3,128,530,949,0,52,79,33,0,692,140
|
||||
95.3,125,0,12,90,586,964,0,97,105,43,0,617,163,94.1,134,11.9,81,571,971,0,99,106,41,0,679,162
|
||||
96.8,151,1,10,58,510,950,0,33,108,41,0,394,261,96.2,161,10.1,56,515,1001,1,32,110,40,0,465,254
|
||||
97.4,152,1,10.8,57,530,986,0,30,92,43,0,405,264,97.8,152,11,53,541,989,0,30,92,41,0,470,243
|
||||
98.7,162,1,12.1,75,522,996,0,40,73,27,0,496,224,99.9,162,12,70,533,992,0,41,80,28,0,562,229
|
||||
99.9,149,1,10.7,61,515,953,0,36,86,35,0,395,251,101.4,150,10.7,54,520,952,0,35,84,32,0,476,249
|
||||
103,177,1,11,58,638,974,0,24,76,28,0,382,254,103.5,164,10.9,56,638,978,0,25,79,28,0,456,257
|
||||
104.3,134,0,12.5,75,595,972,0,47,83,31,0,580,172,104.5,133,12.7,71,599,982,0,50,87,32,0,649,182
|
||||
105.9,130,0,13.4,90,623,1049,1,3,113,40,0,588,160,106.4,153,13.4,91,622,1050,1,3,119,41,0,649,159
|
||||
106.6,157,1,11.1,65,553,955,0,39,81,28,0,421,239,107.8,156,11.2,62,562,956,0,39,85,29,0,499,243
|
||||
107.2,148,0,13.7,72,601,998,0,9,84,20,1,590,144,110.1,134,13.9,66,602,999,0,9,87,15,0,656,151
|
||||
108.3,126,0,13.8,97,542,990,0,18,102,35,0,589,166,110.5,126,13.8,97,549,993,0,19,103,34,1,659,160
|
||||
109.4,135,1,11.4,123,537,978,0,31,89,34,0,631,165,113.5,134,11.3,115,529,978,0,32,93,35,0,703,175
|
||||
112.1,142,1,10.9,81,497,956,0,33,116,47,0,427,247,116.3,147,10.7,77,501,962,0,33,117,44,0,500,256
|
||||
114.3,127,1,12.8,82,519,982,0,4,97,38,0,620,168,119.7,125,12.9,79,510,945,0,4,99,39,0,696,170
|
||||
115.1,131,0,13.7,78,574,1038,1,7,142,42,1,540,176,124.5,134,13.6,73,581,1029,1,7,143,41,1,615,177
|
||||
117.2,136,0,12.9,95,574,1012,1,29,111,37,1,622,162,127.8,140,13,96,581,1011,1,29,115,36,1,691,169
|
||||
119.7,119,0,11.9,166,521,938,0,168,92,36,0,637,154,129.8,120,11.9,157,524,935,0,180,93,27,1,698,169
|
||||
121.6,147,1,13.9,63,560,972,0,23,76,24,1,462,233,130.7,139,14,64,571,970,0,24,78,24,1,511,220
|
||||
123.4,145,1,11.7,82,560,981,0,96,88,31,0,488,228,132.5,154,11.8,74,563,980,0,99,89,29,1,550,230
|
||||
127.2,132,0,10.4,87,564,953,0,43,83,32,0,513,227,134.6,135,10.2,83,560,948,0,44,83,32,0,589,234
|
||||
132.4,152,0,12,82,571,1018,1,10,103,28,1,537,215,137.5,151,12.1,76,567,1079,1,11,105,27,1,617,204
|
||||
135.5,125,0,12.5,113,567,985,0,78,130,58,0,626,166,140.5,140,12.5,105,571,993,0,77,131,59,0,684,174
|
||||
137.8,141,0,14.2,109,591,985,0,18,91,20,1,578,174,145.7,142,14.2,101,590,987,0,19,94,19,1,649,180
|
||||
140.8,150,0,12,109,531,964,0,9,87,38,0,559,153,150.6,153,12,98,539,982,0,10,88,36,0,635,151
|
||||
145.4,131,1,12.2,115,542,969,0,50,79,35,0,472,206,157.3,131,12.1,109,548,976,0,52,82,34,0,539,219
|
||||
149.3,143,0,12.3,103,583,1012,1,13,96,36,0,557,194,162.7,142,12.2,95,612,1003,1,13,97,36,0,625,196
|
||||
154.3,124,0,12.3,121,580,966,0,101,77,35,0,657,170,169.6,134,12.2,116,580,987,0,104,79,36,0,719,172
|
||||
157.7,136,0,15.1,149,577,994,0,157,102,39,0,673,167,177.2,140,15.2,141,578,995,0,160,110,40,0,739,169
|
||||
161.8,131,0,13.2,160,631,1071,1,3,102,41,0,674,152,178.2,132,13.2,143,632,1058,1,4,100,40,0,748,150
|
|
|
@ -0,0 +1,29 @@
|
|||
# **Numerical data analysis and process tools**
|
||||
|
||||
|
||||
### **Project Description**:
|
||||
|
||||
- Numerical data correlation analysis and processing, using image visualization to help understanding.
|
||||
|
||||
|
||||
|
||||
#### Numerical analysis tools part
|
||||
|
||||
- Spearman_correlation determines whether there is a monotonic component between two features,
  which makes it applicable to non-linear relationships and ordinal data.
|
||||
|
||||
#### Numerical process tools part
|
||||
|
||||
- Detecting outlier by using the Interquartile range(IQR).
|
||||
- Removing highly correlated features.
|
||||
|
||||
|
||||
|
||||
#### How to use the tools
|
||||
|
||||
Input numerical-only data (data type: DataFrame).
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
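A short usage sketch pairing the IQR outlier detector with the correlated-feature dropper defined later in this commit; the module names (process_tool, outlierprocessing) are taken from the file headers and the import paths are assumptions:

```python
import numpy as np
import pandas as pd

# assumed import paths, based on the @File headers in this commit
from process_tool import NumericProcess
from outlierprocessing import box_plot

rng = np.random.default_rng(0)
df = pd.DataFrame({'a': rng.normal(size=200), 'c': rng.normal(size=200)})
df['b'] = df['a'] * 2 + rng.normal(scale=0.01, size=200)   # nearly collinear with 'a'

outliers = box_plot(df['c'])                    # index labels flagged by the IQR rule
reduced = NumericProcess.drop_feature(df, 0.9)  # drops one of the highly correlated pair
print(list(reduced.columns), len(outliers))
```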
@ -0,0 +1,8 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> __init__.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/7/4 16:34
|
||||
@Desc :
|
||||
"""
|
|
@ -0,0 +1,38 @@
|
|||
import os
|
||||
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy.stats import spearmanr
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def Spearman_rank_test(data_frame, feature_a, feature_b, save_path, file_name, sample_size=4000):
|
||||
"""
|
||||
Spearman_correlation determines whether there is a
monotonic component between two features, which makes it
applicable to non-linear relationships and ordinal data.
|
||||
|
||||
@param feature_a: Input first feature for Spearman's rank test
|
||||
@param feature_b: Input second feature for Spearman's rank test
|
||||
@param sample_size: Choose a sample for representing the population
|
||||
@param save_path: output path
@param file_name: output name
|
||||
|
||||
"""
|
||||
a = data_frame[feature_a].sample(n=sample_size, random_state=1)
|
||||
b = data_frame[feature_b].sample(n=sample_size, random_state=1)
|
||||
coef, p = spearmanr(a, b)
|
||||
logger.info("Spearmans' correlation coefficient is:" + str(coef))
|
||||
alpha = 0.05
|
||||
plt.scatter(a, b)
|
||||
plt.xlabel("Feature A")
|
||||
plt.ylabel("Feature B")
|
||||
plt.title("Spearman's Rank Test")
|
||||
plt.savefig(os.path.join(save_path, file_name))
|
||||
if p > alpha:
|
||||
logger.info("Feature are uncorrelated(failed to reject H0) p=" + str(p))
|
||||
else:
|
||||
logger.info("Features have a monotonic relationship(reject H0) p=" + str(p))
|
|
@ -0,0 +1,155 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> correlation
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/7/4 16:48
|
||||
@Desc :
|
||||
"""
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from scipy.stats import spearmanr as scipy_spearmanr  # aliased so the wrapper below does not shadow it
|
||||
|
||||
|
||||
def spearmanr(a: pd.Series, b: pd.Series = None, axis=0, nan_policy='propagate',
|
||||
alternative='two-sided', sample_size=4000, random_state=None):
|
||||
"""Calculate a Spearman correlation coefficient with associated p-value.
|
||||
|
||||
The Spearman rank-order correlation coefficient is a nonparametric measure
|
||||
of the monotonicity of the relationship between two datasets. Unlike the
|
||||
Pearson correlation, the Spearman correlation does not assume that both
|
||||
datasets are normally distributed. Like other correlation coefficients,
|
||||
this one varies between -1 and +1 with 0 implying no correlation.
|
||||
Correlations of -1 or +1 imply an exact monotonic relationship. Positive
|
||||
correlations imply that as x increases, so does y. Negative correlations
|
||||
imply that as x increases, y decreases.
|
||||
|
||||
The p-value roughly indicates the probability of an uncorrelated system
|
||||
producing datasets that have a Spearman correlation at least as extreme
|
||||
as the one computed from these datasets. The p-values are not entirely
|
||||
reliable but are probably reasonable for datasets larger than 500 or so.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a, b : 1D or 2D array_like, b is optional
|
||||
One or two 1-D or 2-D arrays containing multiple variables and
|
||||
observations. When these are 1-D, each represents a vector of
|
||||
observations of a single variable. For the behavior in the 2-D case,
|
||||
see under ``axis``, below.
|
||||
Both arrays need to have the same length in the ``axis`` dimension.
|
||||
axis : int or None, optional
|
||||
If axis=0 (default), then each column represents a variable, with
|
||||
observations in the rows. If axis=1, the relationship is transposed:
|
||||
each row represents a variable, while the columns contain observations.
|
||||
If axis=None, then both arrays will be raveled.
|
||||
nan_policy : {'propagate', 'raise', 'omit'}, optional
|
||||
Defines how to handle when input contains nan.
|
||||
The following options are available (default is 'propagate'):
|
||||
|
||||
* 'propagate': returns nan
|
||||
* 'raise': throws an error
|
||||
* 'omit': performs the calculations ignoring nan values
|
||||
|
||||
alternative : {'two-sided', 'less', 'greater'}, optional
|
||||
Defines the alternative hypothesis. Default is 'two-sided'.
|
||||
The following options are available:
|
||||
|
||||
* 'two-sided': the correlation is nonzero
|
||||
* 'less': the correlation is negative (less than zero)
|
||||
* 'greater': the correlation is positive (greater than zero)
|
||||
|
||||
sample_size : int, optional
|
||||
Number of items from column to return. Default is 4000.
|
||||
|
||||
random_state : int, array-like, BitGenerator, np.random.RandomState, optional
|
||||
If int, array-like, or BitGenerator (NumPy>=1.17), seed for
|
||||
random number generator
|
||||
If np.random.RandomState, use as numpy RandomState object.
|
||||
|
||||
Returns
|
||||
-------
|
||||
correlation : float or ndarray (2-D square)
|
||||
Spearman correlation matrix or correlation coefficient (if only 2
|
||||
variables are given as parameters). Correlation matrix is square with
|
||||
length equal to total number of variables (columns or rows) in ``a``
|
||||
and ``b`` combined.
|
||||
pvalue : float
|
||||
The p-value for a hypothesis test whose null hypothesis
|
||||
is that two sets of data are uncorrelated. See `alternative` above
|
||||
for alternative hypotheses. `pvalue` has the same
|
||||
shape as `correlation`.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
|
||||
Probability and Statistics Tables and Formulae. Chapman & Hall: New
|
||||
York. 2000.
|
||||
Section 14.7
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy import stats
|
||||
>>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])
|
||||
SpearmanrResult(correlation=0.82078..., pvalue=0.08858...)
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> x2n = rng.standard_normal((100, 2))
|
||||
>>> y2n = rng.standard_normal((100, 2))
|
||||
>>> stats.spearmanr(x2n)
|
||||
SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
|
||||
>>> stats.spearmanr(x2n[:,0], x2n[:,1])
|
||||
SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
|
||||
>>> rho, pval = stats.spearmanr(x2n, y2n)
|
||||
>>> rho
|
||||
array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
|
||||
[-0.07960396, 1. , -0.14448245, 0.16738074],
|
||||
[-0.08314431, -0.14448245, 1. , 0.03234323],
|
||||
[ 0.09662166, 0.16738074, 0.03234323, 1. ]])
|
||||
>>> pval
|
||||
array([[0. , 0.43111687, 0.41084066, 0.33891628],
|
||||
[0.43111687, 0. , 0.15151618, 0.09600687],
|
||||
[0.41084066, 0.15151618, 0. , 0.74938561],
|
||||
[0.33891628, 0.09600687, 0.74938561, 0. ]])
|
||||
>>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1)
|
||||
>>> rho
|
||||
array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
|
||||
[-0.07960396, 1. , -0.14448245, 0.16738074],
|
||||
[-0.08314431, -0.14448245, 1. , 0.03234323],
|
||||
[ 0.09662166, 0.16738074, 0.03234323, 1. ]])
|
||||
>>> stats.spearmanr(x2n, y2n, axis=None)
|
||||
SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
|
||||
>>> stats.spearmanr(x2n.ravel(), y2n.ravel())
|
||||
SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
|
||||
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> xint = rng.integers(10, size=(100, 2))
|
||||
>>> stats.spearmanr(xint)
|
||||
SpearmanrResult(correlation=0.09800224850707953, pvalue=0.3320271757932076)
|
||||
|
||||
"""
|
||||
# a = a.sample(n=sample_size, random_state=random_state)
|
||||
# if b:
|
||||
# b = b.sample(n=sample_size, random_state=random_state)
|
||||
return scipy_spearmanr(a, b, axis=axis, nan_policy=nan_policy, alternative=alternative)
|
||||
|
||||
|
||||
def corr(df, method='pearson', drop=False, threshold=0, plot=True, filepath=None, figsize=None):
|
||||
plt.rcParams['font.sans-serif'] = ['SimHei']
|
||||
plt.rcParams['axes.unicode_minus'] = False
|
||||
cmap = sns.diverging_palette(250, 15, s=95, l=40, n=9, center="light", as_cmap=True)
|
||||
cov = df.corr(method=method)
|
||||
if drop:
|
||||
uncorr = ~np.any(np.abs(np.tril(cov, k=-1)) > threshold, axis=1)
|
||||
cov = cov[uncorr]
|
||||
cov = cov[cov.index]
|
||||
if plot or filepath:
|
||||
mask = np.triu(np.ones_like(cov, dtype=bool))
|
||||
fig, ax = plt.subplots(figsize=figsize)
|
||||
sns.heatmap(cov, mask=mask, center=0, annot=True, fmt='.2f', cmap=cmap, square=True, ax=ax)
|
||||
plt.title("相关性矩阵")
|
||||
if filepath:
|
||||
plt.savefig(filepath)
|
||||
if plot:
|
||||
plt.show()
|
||||
return cov
|
|
@ -0,0 +1,48 @@
|
|||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2022/3/25 9:09
|
||||
# @Software : PyCharm
|
||||
# @File : process_tool.py
|
||||
# @Author : QT
|
||||
# @Email : taoqimin@sics.ac.cn
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(level=logging.INFO)
|
||||
handler = logging.FileHandler("log.txt")
|
||||
handler.setLevel(logging.INFO)
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
handler.setFormatter(formatter)
|
||||
logger.addHandler(handler)
|
||||
|
||||
console = logging.StreamHandler()
|
||||
console.setLevel(logging.INFO)
|
||||
|
||||
|
||||
logger.addHandler(console)
|
||||
|
||||
|
||||
class NumericProcess:
|
||||
@staticmethod
|
||||
def drop_feature(data_frame, thresh_hold):
|
||||
"""
|
||||
A function for detecting and dropping highly correlated features.
|
||||
when two variables are highly correlated, it usually causes problems
such as multicollinearity. The following function will be used to
|
||||
remove the correlated features.
|
||||
|
||||
@param data_frame: Input dataframe
|
||||
@param thresh_hold: A number between 0 and 1; features whose absolute correlation exceeds it are dropped
|
||||
|
||||
"""
|
||||
|
||||
matrix = data_frame.corr().abs()
|
||||
mask = np.triu(np.ones_like(matrix, dtype=bool))
|
||||
reduced_matrix = matrix.mask(mask)
|
||||
feature_drop = [c for c in tqdm(reduced_matrix) if
|
||||
any(reduced_matrix[c] > thresh_hold)]
|
||||
data_frame.drop(feature_drop, axis=1, inplace=True)
|
||||
logger.info("The following features are dropped due to Multicollinearity:" + str(feature_drop))
|
||||
return data_frame
|
|
@ -0,0 +1,20 @@
|
|||
Parse the configuration file and apply some preprocessing (e.g. filling null values, sampling) before the data moves on to the next step.

Part of the preprocessing functionality of the Pre-process Lib is finished so far, as follows:

- data_insight
  - DuplicateInsight - detection of duplicated records
  - NullInsight - detection of null values
  - ValidationInsight - data validity checks
- data_process
  - FilteringProcessor - data filtering

In addition:

- TypeInsight - the check for date types is not finished yet

Still in progress.
|
|
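A sketch of how the insight classes defined in data_insight (later in this commit) might be called; the import path is an assumption, and data_insight itself expects iod_data_analysis_tool.utils.assertion to be importable:

```python
import pandas as pd

# assumed import path; data_insight also imports iod_data_analysis_tool.utils.assertion
from data_insight import DuplicateInsight, NullInsight, ValidationInsight

df = pd.DataFrame({'age': [21.0, 35.0, 35.0, None, 130.0],
                   'city': ['SH', 'BJ', 'BJ', 'SZ', 'Mars']})

print(DuplicateInsight.num_duplicate(df))                                # duplicated rows
print(NullInsight.num_null(df, column='age'))                            # nulls in one column
print(ValidationInsight.validation_continuous_range(df, 'age', 0, 120))  # out-of-range ages
print(ValidationInsight.validation_categorical_range(df, 'city', ['SH', 'BJ', 'SZ']))
```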
@ -0,0 +1,8 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> __init__.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/4/26 10:40
|
||||
@Desc :
|
||||
"""
|
|
@ -0,0 +1,133 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding:utf-8 -*-
|
||||
# file: data_insight
|
||||
# author: shenwentao, wangkanglong
|
||||
# description:
|
||||
# date: 2022-03-30 16:45
|
||||
# IDE: PyCharm
|
||||
|
||||
import pandas as pd
|
||||
import datetime
|
||||
from typing import List, Union
|
||||
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype
|
||||
|
||||
from iod_data_analysis_tool.utils.assertion import assert_range
|
||||
|
||||
|
||||
class DuplicateInsight:
|
||||
|
||||
@staticmethod
|
||||
def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
|
||||
"""
|
||||
用户自定义重复数据的计数
|
||||
:param data: 来源数据
|
||||
:param subset: 选中列/字段,同pd.DataFrame里的duplicated函数subset参数
:param keep: 确定要标记的重复项(如果有)。同pd.DataFrame里的duplicated函数keep参数
|
||||
:return: 返回计数结果
|
||||
"""
|
||||
result = data.duplicated(subset, keep=keep).sum()
|
||||
return pd.DataFrame([result], columns=['duplicate_num'])
|
||||
|
||||
|
||||
class NullInsight:
|
||||
|
||||
@staticmethod
|
||||
def num_null(data, column: str = None) -> pd.DataFrame:
|
||||
"""
|
||||
用户自定义计数数据中的空值
|
||||
:param data: 来源数据
|
||||
:param column: 选中列/字段
|
||||
:return: 返回计数结果
|
||||
"""
|
||||
if column is not None:
|
||||
return pd.DataFrame([data[column].isna().sum()], columns=['null_num'], index=[column])
|
||||
else:
|
||||
return pd.DataFrame(data.isna().sum(), columns=['null_num'])
|
||||
|
||||
|
||||
class ValidationInsight:
|
||||
"""
|
||||
自定义验证数据有效性,比如数据里有坏数,针对不同类型的数据限定范围
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def validation_continuous_range(data: pd.DataFrame, column: str,
|
||||
min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
|
||||
"""
|
||||
用户自定义对连续数值型数据进行验证,返回数据在指定范围内外的计数结果
|
||||
:param data: 来源数据
|
||||
:param column: 选中列/字段
|
||||
:param min_val: 范围最小值
|
||||
:param max_val: 范围最大值
|
||||
:return: 计数结果
|
||||
"""
|
||||
assert_range(min_val, max_val)
|
||||
nums = dict()
|
||||
nums['column'] = column
|
||||
nums['num_lt_min'] = data.query(f'{column} < {min_val}').shape[0]
|
||||
nums['num_gt_max'] = data.query(f'{column} > {max_val}').shape[0]
|
||||
nums['num_within_range'] = data.shape[0] - nums['num_lt_min'] - nums['num_gt_max']
|
||||
return pd.DataFrame([nums], index=['result'])
|
||||
|
||||
@staticmethod
|
||||
def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
|
||||
"""
|
||||
用户自定义对离散型数据进行验证,返回数据在指定范围内外的计数结果
|
||||
:param data: 来源数据
|
||||
:param column: 选中列/字段
|
||||
:param values: 用户自定义的离散值,也就是数值所在的"范围"
|
||||
:return: 计数结果
|
||||
"""
|
||||
nums = dict()
|
||||
nums['column'] = column
|
||||
nums['num_within_range'] = data[data[column].isin(values)].shape[0]
|
||||
nums['num_out_range'] = len(data[column]) - nums['num_within_range']
|
||||
return pd.DataFrame([nums], index=['result'])
|
||||
|
||||
@staticmethod
|
||||
def validation_date_range(data, column: str, start_date: datetime.date,
|
||||
end_date: datetime.date) -> pd.DataFrame:
|
||||
"""
|
||||
用户自定义对日期型数据范围进行验证,返回数据在指定范围内外的计数结果,前提:数据类型是 datetime.date
|
||||
:param data: 来源数据
|
||||
:param column: 选中列/字段
|
||||
:param start_date: 开始日期
|
||||
:param end_date: 结束日期
|
||||
:return: 计数结果
|
||||
"""
|
||||
assert_range(start_date, end_date)
|
||||
nums = dict()
|
||||
nums['column'] = column
|
||||
nums['date_lt_start'] = sum(data[column] < start_date)
|
||||
nums['date_gt_end'] = sum(data[column] > end_date)
|
||||
nums['date_within_range'] = data.shape[0] - nums['date_lt_start'] - nums['date_gt_end']
|
||||
return pd.DataFrame([nums], index=['result'])
|
||||
|
||||
|
||||
class TypeInsight:
|
||||
"""
|
||||
使用户能够检测数据的数据类型是否为自己所预期的
|
||||
"""
|
||||
|
||||
# TODO: 还缺一个timestamp checker
|
||||
_checkers = {
|
||||
'int': is_integer_dtype,
|
||||
'float': is_float_dtype,
|
||||
'string': is_string_dtype,
|
||||
'bool': is_bool_dtype,
|
||||
'datetime': is_datetime64_dtype
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def type_check(data, column: str, check_type: str) -> pd.DataFrame:
|
||||
"""
|
||||
用户检测数据类型是否为自己所需要的类型
|
||||
:param data: 来源数据
|
||||
:param column: 选中的列/字段
|
||||
:param check_type: 选择检测的数据类型,{'int', 'float', 'string', 'bool', 'datetime'}
|
||||
:return: 检测结果
|
||||
"""
|
||||
flag = True
|
||||
if not TypeInsight._checkers[check_type](data[column]):
|
||||
flag = False
|
||||
return pd.DataFrame([flag], columns=['result'], index=[column])
|
|
@ -0,0 +1,17 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> normalizer
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/4/26 10:40
|
||||
@Desc :
|
||||
"""
|
||||
import pandas as pd
|
||||
from scipy.stats import zscore as scipy_zscore
|
||||
|
||||
|
||||
def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
|
||||
"""
|
||||
Zi = (Xi - μ) / σ
|
||||
"""
|
||||
return scipy_zscore(a, axis, ddof, nan_policy)
|
|
@ -0,0 +1,51 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> outlierprocessing
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/4/26 10:24
|
||||
@Desc :
|
||||
"""
|
||||
from typing import Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def MAD(data: pd.Series, n: float = 2.5, constant=1.4826, axis=0):
|
||||
"""
|
||||
MAD = median(|Xi - median(X)|)
|
||||
@return the input clipped to [median - n*MAD, median + n*MAD]
|
||||
"""
|
||||
x = data.median()
|
||||
MC = (data - x).abs().median()
|
||||
MAD = MC * constant
|
||||
offset = n * MAD
|
||||
if isinstance(data, pd.DataFrame):
|
||||
return data.clip(lower=x - offset, upper=x + offset, axis=axis)
|
||||
else:
|
||||
return data.clip(lower=x - offset, upper=x + offset)
|
||||
|
||||
|
||||
def three_sigma(data: pd.Series):
|
||||
miu = data.mean()
|
||||
sigma = data.std()
|
||||
low = miu - 3 * sigma
|
||||
up = miu + 3 * sigma
|
||||
return data.index[(data < low) | (data > up)]
|
||||
|
||||
|
||||
def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
|
||||
q = data.quantile(q=[q1, q3])
|
||||
IQR = q[q3] - q[q1]
|
||||
lower_whisker_limit = q[q1] - k * IQR
|
||||
upper_whisker_limit = q[q3] + k * IQR
|
||||
return data.index[(data < lower_whisker_limit) | (data > upper_whisker_limit)]
|
||||
|
||||
|
||||
def regex_match(data: pd.Series, *patterns):
|
||||
pattern = '|'.join(patterns)
|
||||
return data.index[data.astype(str).str.contains(pattern, regex=True)]
|
||||
|
||||
|
||||
def empty(data: Union[pd.Series, pd.DataFrame]):
|
||||
# checking the underlying values works for both Series and DataFrame
return bool(data.isnull().values.any())
|
|
@ -0,0 +1,24 @@
|
|||
## Analysis methods for time-series data

--------

|Module|Methods involved|
| ---- | ---- |
|Basics| |
|Stationarity| |
|Anomaly detection| |
|Frequency detection| |
|Periodicity detection| |
|Other| |

### Basics

### Stationarity

### Anomaly detection

### Frequency detection

### Periodicity detection

### Other
|
|
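A brief sketch of the rolling-window helpers in the base module (Time_base, defined later in this commit); the import path is an assumption:

```python
import numpy as np

# assumed import path, based on the time_base module added in this commit
from time_base import Time_base

rng = np.random.default_rng(0)
x = np.sin(np.linspace(0, 20, 200)) + rng.normal(scale=0.1, size=200)

ma = Time_base.moving_average(x, window_size=10)   # rolling mean
sd = Time_base.moving_std(x, window_size=10)       # rolling standard deviation
lag1 = Time_base.lag(x, lag=1)                     # series shifted by one step
print(ma.iloc[-1], sd.iloc[-1], lag1.iloc[-1])
```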
@ -0,0 +1,26 @@
|
|||
import pandas as pd
|
||||
|
||||
|
||||
def describe_datetime_info(data: pd.Series, datetime_is_numeric: bool = False) -> pd.Series:
|
||||
"""
|
||||
If the values are strings without a date component, the missing date
will be populated with today's date.
|
||||
@param data: data
|
||||
@param datetime_is_numeric : bool, default False
|
||||
Whether to treat datetime dtypes as numeric. This affects statistics
|
||||
calculated for the column. For DataFrame input, this also
|
||||
controls whether datetime columns are included by default.
|
||||
@return: Summary statistics of the Series.
|
||||
@example: Describing a numeric ``Series``.
|
||||
|
||||
>>> s = pd.read_csv()
|
||||
>>> s.describe()
|
||||
count 1427132
|
||||
unique 25111
|
||||
top 2022-04-26 09:25:00.260000
|
||||
freq 32994
|
||||
first 2022-04-26 09:25:00
|
||||
last 2022-04-26 09:34:46.340000
|
||||
Name: TradTime, dtype: object
|
||||
"""
|
||||
return pd.to_datetime(data).describe(datetime_is_numeric=datetime_is_numeric)
|
|
@ -0,0 +1,62 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from time_base import Time_base
|
||||
|
||||
import statsmodels.api as sm
|
||||
import statsmodels.tsa.api as smt
|
||||
import statsmodels.formula as smf
|
||||
|
||||
import scipy.stats as scs
|
||||
|
||||
|
||||
class stationaryTest(Time_base):
|
||||
"""
|
||||
时间序列稳定性检验
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def test_stationary(self, x, window_size):
|
||||
"""
|
||||
时间序列稳定性检验
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
x_ma = self.moving_average(x, window_size)
|
||||
x_std = self.moving_std(x, window_size)
|
||||
x_max = self.moving_max(x, window_size)
|
||||
x_min = self.moving_min(x, window_size)
|
||||
x_median = self.moving_median(x, window_size)
|
||||
x_normalized = self.normalize(x)
|
||||
x_ma_normalized = self.normalize(x_ma)
|
||||
x_std_normalized = self.normalize(x_std)
|
||||
x_max_normalized = self.normalize(x_max)
|
||||
x_min_normalized = self.normalize(x_min)
|
||||
x_median_normalized = self.normalize(x_median)
|
||||
x_normalized_ma_normalized = self.normalize(x_normalized - x_ma_normalized)
|
||||
x_normalized_std_normalized = self.normalize(x_normalized - x_std_normalized)
|
||||
x_normalized_max_normalized = self.normalize(x_normalized - x_max_normalized)
|
||||
x_normalized_min_normalized = self.normalize(x_normalized - x_min_normalized)
|
||||
x_normalized_median_normalized = self.normalize(x_normalized - x_median_normalized)
|
||||
x_normalized_ma_normalized_std_normalized = self.normalize(x_normalized_ma_normalized - x_std)
|
||||
|
||||
return x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized, x_min_normalized, x_median_normalized, x_normalized_ma_normalized, x_normalized_std_normalized, x_normalized_max_normalized, x_normalized_min_normalized, x_normalized_median_normalized, x_normalized_ma_normalized_std_normalized
|
||||
|
||||
def adf_test(self, x, window_size):
|
||||
"""
|
||||
时间序列稳定性检验
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized, x_min_normalized, x_median_normalized, x_normalized_ma_normalized, x_normalized_std_normalized, x_normalized_max_normalized, x_normalized_min_normalized, x_normalized_median_normalized, x_normalized_ma_normalized_std_normalized = self.test_stationary(x, window_size)
|
||||
adf_test_normalized = smt.adfuller(x_normalized)
|
||||
adf_test_ma_normalized = smt.adfuller(x_ma_normalized)
|
||||
adf_test_std_normalized = smt.adfuller(x_std_normalized)
|
||||
adf_test_max_normalized = smt.adfuller(x_max_normalized)
|
||||
adf_test_min_normalized = smt.adfuller(x_min_normalized)
|
||||
adf_test_median_normalized = smt.adfuller(x_median_normalized)
|
||||
adf_test_normalized_ma_normalized = smt.adfuller(x_normalized_ma_normalized)
|
||||
adf_test_normalized_std_normalized = smt.adfuller(x_normalized_std_normalized)
|
||||
adf_test_normalized_max_normalized = smt.adfuller(x_normalized_max_normalized)
|
||||
adf_test_normalized_min_normalized = smt.adfuller(x_normalized_min_normalized)
|
||||
return adf_test_normalized, adf_test_ma_normalized, adf_test_std_normalized, adf_test_max_normalized, adf_test_min_normalized, adf_test_median_normalized, adf_test_normalized_ma_normalized, adf_test_normalized_std_normalized, adf_test_normalized_max_normalized, adf_test_normalized_min_normalized
|
|
@ -0,0 +1,133 @@
|
|||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Time_base(object):
|
||||
"""
|
||||
时间序列基础模块
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def normalize(x):
|
||||
"""
|
||||
将时间序列数据归一化
|
||||
x : 时间序列数据
|
||||
"""
|
||||
x = np.array(x)
|
||||
return np.log2(x / np.sqrt(np.sum(x**2)))
|
||||
|
||||
@staticmethod
|
||||
def lag(x, lag):
|
||||
"""
|
||||
滞后
|
||||
x : 时间序列数据
|
||||
lag : 滞后时间
|
||||
"""
|
||||
return pd.Series(x).shift(lag)
|
||||
|
||||
@staticmethod
|
||||
def moving_average(x, window_size):
|
||||
"""
|
||||
移动平均窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).mean()
|
||||
|
||||
@staticmethod
|
||||
def moving_median(x, window_size):
|
||||
"""
|
||||
移动中值窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).median()
|
||||
|
||||
@staticmethod
|
||||
def moving_std(x, window_size):
|
||||
"""
|
||||
移动标准差窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).std()
|
||||
|
||||
@staticmethod
|
||||
def moving_max(x, window_size):
|
||||
"""
|
||||
移动最大值窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).max()
|
||||
|
||||
@staticmethod
|
||||
def moving_min(x, window_size):
|
||||
"""
|
||||
移动最小值窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).min()
|
||||
|
||||
@staticmethod
|
||||
def moving_sum(x, window_size):
|
||||
"""
|
||||
移动和窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).sum()
|
||||
|
||||
@staticmethod
|
||||
def moving_quantile(x, window_size, quantile):
|
||||
"""
|
||||
移动分位数窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
quantile : 分位数
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).quantile(quantile)
|
||||
|
||||
@staticmethod
|
||||
def moving_corr(x, y, window_size):
|
||||
"""
|
||||
移动相关窗口
|
||||
x : 时间序列数据
|
||||
y : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).corr(pd.Series(y))
|
||||
|
||||
@staticmethod
|
||||
def moving_cov(x, y, window_size):
|
||||
"""
|
||||
移动协方差窗口
|
||||
x : 时间序列数据
|
||||
y : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).cov(pd.Series(y))
|
||||
|
||||
@staticmethod
|
||||
def moving_skew(x, window_size):
|
||||
"""
|
||||
移动偏度窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).skew()
|
||||
|
||||
@staticmethod
|
||||
def moving_kurt(x, window_size):
|
||||
"""
|
||||
移动峰度窗口
|
||||
x : 时间序列数据
|
||||
window_size : 窗口大小
|
||||
"""
|
||||
return pd.Series(x).rolling(window_size).kurt()
|
||||
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> ID_code
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/5/17 16:00
|
||||
@Desc :
|
||||
"""
|
||||
import re
|
||||
|
||||
re_ID = re.compile(r'^\d{6}(?:18|19|20)?\d{2}(?:0[1-9]|1[012])(?:(?:[0-2][1-9])|10|20|30|31)\d{3}[0-9xX]$')
|
||||
|
||||
|
||||
def validate_identity_code(code: str):
|
||||
"""
|
||||
身份证格式校验
|
||||
:param code:
|
||||
:return:
|
||||
"""
|
||||
city = {'11': "北京", '12': "天津", '13': "河北", '14': "山西", '15': "内蒙古", '21': "辽宁", '22': "吉林", '23': "黑龙江 ",
|
||||
'31': "上海", '32': "江苏", '33': "浙江", '34': "安徽", '35': "福建", '36': "江西", '37': "山东", '41': "河南", '42': "湖北 ",
|
||||
'43': "湖南", '44': "广东", '45': "广西", '46': "海南", '50': "重庆", '51': "四川", '52': "贵州", '53': "云南", '54': "西藏 ",
|
||||
'61': "陕西", '62': "甘肃", '63': "青海", '64': "宁夏", '65': "新疆", '71': "台湾", '81': "香港", '82': "澳门", '91': "国外 "}
|
||||
tip = ""
|
||||
p = True
|
||||
|
||||
if re_ID.match(code) is None:
|
||||
tip = "身份证号格式错误"
|
||||
p = False
|
||||
|
||||
|
||||
elif code[:2] not in city:
|
||||
tip = "地址编码错误"
|
||||
p = False
|
||||
else:
|
||||
# 18位身份证需要验证最后一位校验位
|
||||
if len(code) == 18:
|
||||
code = list(code)
|
||||
# ∑(ai × Wi)(mod 11)
|
||||
# 加权因子
|
||||
factor = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
|
||||
# 校验位
|
||||
parity = [1, 0, 'X', 9, 8, 7, 6, 5, 4, 3, 2]
|
||||
checksum = 0
for i in range(17):
ai = int(code[i])
wi = factor[i]
checksum += ai * wi
# compare against the check digit (which may be 'X'/'x')
if str(parity[checksum % 11]) != code[17].upper():
|
||||
tip = "校验位错误"
|
||||
p = False
|
||||
return p, tip
|
|
@ -0,0 +1,8 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> __init__.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/5/17 15:59
|
||||
@Desc :
|
||||
"""
|
|
@ -0,0 +1,97 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> timeutil
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/4/26 10:02
|
||||
@Desc :
|
||||
"""
|
||||
import datetime
|
||||
import types
|
||||
import typing
|
||||
|
||||
from dateutil import parser
|
||||
|
||||
|
||||
class cnparserinfo(parser.parserinfo):
|
||||
"""
|
||||
匹配中文日期格式
|
||||
用法:
|
||||
from dateutil import parser
|
||||
parser.parse('1998年12月11日 8点20分30秒', cnparserinfo())
|
||||
"""
|
||||
parser.parserinfo.JUMP.extend('年月日')
|
||||
WEEKDAYS = [list(weekdays) for weekdays in parser.parserinfo.WEEKDAYS]
|
||||
WEEKDAYS[0].extend(('星期一', '周一'))
|
||||
WEEKDAYS[1].extend(('星期二', '周二'))
|
||||
WEEKDAYS[2].extend(('星期三', '周三'))
|
||||
WEEKDAYS[3].extend(('星期四', '周四'))
|
||||
WEEKDAYS[4].extend(('星期五', '周五'))
|
||||
WEEKDAYS[5].extend(('星期六', '周六'))
|
||||
WEEKDAYS[6].extend(('星期天', '周日', '周天', '周末'))
|
||||
WEEKDAYS = [tuple(weekdays) for weekdays in WEEKDAYS]
|
||||
|
||||
# MONTHS = [list(months) for months in parser.parserinfo.MONTHS]
|
||||
# MONTHS[0].extend(('一月', '1月'))
|
||||
# MONTHS[1].extend(('二月', '2月'))
|
||||
# MONTHS[2].extend(('三月', '3月'))
|
||||
# MONTHS[3].extend(('四月', '4月'))
|
||||
# MONTHS[4].extend(('五月', '5月'))
|
||||
# MONTHS[5].extend(('六月', '6月'))
|
||||
# MONTHS[6].extend(('七月', '7月'))
|
||||
# MONTHS[7].extend(('八月', '8月'))
|
||||
# MONTHS[8].extend(('九月', '9月'))
|
||||
# MONTHS[9].extend(('十月', '10月'))
|
||||
# MONTHS[10].extend(('十一月', '11月'))
|
||||
# MONTHS[11].extend(('十二月', '12月'))
|
||||
# MONTHS = [tuple(months) for months in MONTHS]
|
||||
|
||||
HMS = [list(hms) for hms in parser.parserinfo.HMS]
|
||||
HMS[0].extend('时点')
|
||||
HMS[1].append('分')
|
||||
HMS[2].append('秒')
|
||||
HMS = [tuple(hms) for hms in HMS]
|
||||
|
||||
AMPM = [list(ampm) for ampm in parser.parserinfo.AMPM]
|
||||
AMPM[0].append('上午')
|
||||
AMPM[1].append('下午')
|
||||
AMPM = [tuple(ampm) for ampm in AMPM]
|
||||
|
||||
def __init__(self, dayfirst=False, yearfirst=False):
|
||||
super().__init__(dayfirst, yearfirst)
|
||||
|
||||
|
||||
def utctimestamp():
|
||||
"""
|
||||
@return: utc时间戳
|
||||
"""
|
||||
return int(datetime.datetime.utcnow().timestamp())
|
||||
|
||||
|
||||
def timestamp2datetime(ts: float):
|
||||
return datetime.datetime.fromtimestamp(ts)
|
||||
|
||||
|
||||
def timestamp2str(ts: float, fmt: str = '%F %H:%M:%S'):
|
||||
"""
|
||||
@param ts: timestamp
|
||||
@param fmt: format
|
||||
"""
|
||||
return datetime.datetime.strftime(timestamp2datetime(ts), fmt)
|
||||
|
||||
|
||||
cnparser = cnparserinfo()
|
||||
|
||||
|
||||
def str2datetime(datetime_str: str, fmt: str = None):
|
||||
if fmt:
|
||||
return datetime.datetime.strptime(datetime_str, fmt)
|
||||
return parser.parse(datetime_str, cnparser)
|
||||
|
||||
|
||||
def int2date(date_int: int):
|
||||
return str2datetime(str(date_int), '%Y%m%d')
|
||||
|
||||
|
||||
def date2int(a: typing.Union[datetime.datetime, datetime.date]):
|
||||
return int(a.strftime('%Y%m%d'))
|
|
@ -0,0 +1,81 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> file_util
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/5/10 17:21
|
||||
@Desc :
|
||||
"""
|
||||
import os
|
||||
import queue
|
||||
import shutil
|
||||
|
||||
import paramiko
|
||||
|
||||
|
||||
def list_files(dir_paths):
|
||||
files = []
|
||||
for root, dir_path, filepath in walk(dir_paths):
|
||||
if filepath:
|
||||
files.append(os.path.join(root, filepath))
|
||||
return files
|
||||
|
||||
|
||||
def walk(dir_paths):
|
||||
dir_queue = queue.Queue()
|
||||
if isinstance(dir_paths, str):
|
||||
dir_paths = [dir_paths]
|
||||
for dir_path in dir_paths:
|
||||
dir_queue.put(dir_path)
|
||||
while not dir_queue.empty():
|
||||
dirname = dir_queue.get()
|
||||
for root, dirs, files in os.walk(dirname):
|
||||
for dirname in dirs:
|
||||
dir_queue.put(os.path.join(root, dirname))
|
||||
yield root, dirname, None
|
||||
for filename in files:
|
||||
yield root, None, filename
|
||||
|
||||
|
||||
def copy(s, t):
|
||||
if os.path.isfile(s):
|
||||
shutil.copy(s, t)
|
||||
else:
|
||||
if not os.path.exists(t):
|
||||
os.mkdir(t)
|
||||
s = os.path.abspath(s)
|
||||
t = os.path.abspath(t)
|
||||
for root, dirname, filename in walk(s):
|
||||
if dirname:
|
||||
os.mkdir(os.path.join(t, dirname))
|
||||
else:
|
||||
shutil.copy(os.path.join(root, filename), os.path.join(root.replace(s, t), filename))
|
||||
|
||||
|
||||
class RemoteFileUtil:
|
||||
|
||||
def __init__(self, ip, username, password, port=22, local_dir=None, remote_dir=None):
|
||||
tran = paramiko.Transport((ip, port))
|
||||
tran.connect(username=username, password=password)
|
||||
self.sftp = paramiko.SFTPClient.from_transport(tran)
|
||||
self.local_dir = local_dir
|
||||
self.remote_dir = remote_dir
|
||||
|
||||
def ls(self, remote_dir=None):
|
||||
if remote_dir is None:
|
||||
remote_dir = self.remote_dir
|
||||
return self.sftp.listdir_attr(remote_dir)
|
||||
|
||||
def upload_file(self, local_filepath=None, remote_filepath=None, filename=None):
|
||||
if local_filepath is None:
|
||||
local_filepath = os.path.join(self.local_dir, filename)
|
||||
if remote_filepath is None:
|
||||
remote_filepath = os.path.join(self.remote_dir, filename)
|
||||
self.sftp.put(local_filepath, remote_filepath)
|
||||
|
||||
def download_file(self, local_filepath=None, remote_filepath=None, filename=None):
|
||||
if local_filepath is None:
|
||||
local_filepath = os.path.join(self.local_dir, filename)
|
||||
if remote_filepath is None:
|
||||
remote_filepath = os.path.join(self.remote_dir, filename)
|
||||
self.sftp.get(remote_filepath, local_filepath)
|
|
@ -0,0 +1,82 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> pd_util
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/7/13 11:00
|
||||
@Desc :
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from functools import partial
|
||||
from multiprocessing import Pool
|
||||
from typing import Hashable, Callable
|
||||
|
||||
import pandas as pd
|
||||
from pandas._typing import CompressionOptions, FilePath, StorageOptions, WriteBuffer
|
||||
from pandas.core.generic import bool_t
|
||||
|
||||
|
||||
class to_same_csv:
|
||||
|
||||
def __init__(self,
|
||||
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
|
||||
sep: str = ",",
|
||||
na_rep: str = "",
|
||||
float_format: str | None = None,
|
||||
columns: pd.Sequence[Hashable] | None = None,
|
||||
header: bool_t | list[str] = True,
|
||||
index: bool_t = False,
|
||||
index_label: pd.IndexLabel | None = None,
|
||||
mode: str = "w",
|
||||
encoding: str = 'utf8',
|
||||
compression: CompressionOptions = "infer",
|
||||
quoting: int | None = None,
|
||||
quotechar: str = '"',
|
||||
line_terminator: str | None = None,
|
||||
chunksize: int | None = None,
|
||||
date_format: str | None = None,
|
||||
doublequote: bool_t = True,
|
||||
escapechar: str | None = None,
|
||||
decimal: str = ".",
|
||||
errors: str = "strict",
|
||||
storage_options: StorageOptions = None,
|
||||
prepare: Callable = None):
|
||||
self.not_first = False
|
||||
self.mode = mode
|
||||
if self.mode == 'a' and isinstance(path_or_buf, str) and os.path.exists(path_or_buf):
|
||||
header = False
|
||||
self.header = header
|
||||
self.prepare = prepare
|
||||
self.kwargs = {'path_or_buf': path_or_buf,
|
||||
'sep': sep,
|
||||
'na_rep': na_rep,
|
||||
'float_format': float_format,
|
||||
'columns': columns,
|
||||
'index': index,
|
||||
'index_label': index_label,
|
||||
'encoding': encoding,
|
||||
'compression': compression,
|
||||
'quoting': quoting,
|
||||
'quotechar': quotechar,
|
||||
'line_terminator': line_terminator,
|
||||
'chunksize': chunksize,
|
||||
'date_format': date_format,
|
||||
'doublequote': doublequote,
|
||||
'escapechar': escapechar,
|
||||
'decimal': decimal,
|
||||
'errors': errors,
|
||||
'storage_options': storage_options}
|
||||
|
||||
def __call__(self, df_or_series: pd.Series | pd.DataFrame):
|
||||
if self.not_first:
|
||||
df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
|
||||
else:
|
||||
if self.prepare:
|
||||
result = self.prepare(df_or_series)
|
||||
if result:
|
||||
df_or_series = result
|
||||
df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
|
||||
self.mode = 'a'
self.header = False
self.not_first = True
|
|
@ -0,0 +1,17 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :IoD_data_analysis_tool -> phone_util
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2022/5/17 15:59
|
||||
@Desc :
|
||||
"""
|
||||
import re
|
||||
|
||||
re_phone = re.compile(r'^(?:(?:13[0-9])'
|
||||
r'|(?:14(?:0|[5-7]|9))'
|
||||
r'|(?:15(?:[0-3]|[5-9]))'
|
||||
r'|(?:16(?:2|[5-7]))'
|
||||
r'|(?:17[0-8])'
|
||||
r'|(?:18[0-9])'
|
||||
r'|(?:19(?:[0-3]|[5-9])))\d{8}$')
|
|
@ -0,0 +1,61 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :IoD_data_analysis_tool
|
||||
@File :project_util.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2022/9/15 9:45
|
||||
"""
|
||||
import compileall
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from os.path import join
|
||||
|
||||
from lib.analysis_package.utils.file_util import walk
|
||||
|
||||
re_pyc = re.compile(r'cpython-\d+\.')
|
||||
|
||||
|
||||
def compile_project(source, target=None):
|
||||
"""
|
||||
编译项目为pyc文件到指定目录
|
||||
@param source: 项目路径
|
||||
@param target: 编译文件存放路径
|
||||
"""
|
||||
source = os.path.abspath(source)
|
||||
if target is None:
|
||||
target = source
|
||||
else:
|
||||
target = os.path.abspath(target)
|
||||
compileall.compile_dir(source)
|
||||
pycache_paths = set()
|
||||
if target == source:
|
||||
for root, dirname, filename in walk(source):
|
||||
if root[-11:] == '__pycache__':
|
||||
pycache_paths.add(root)
|
||||
shutil.move(join(root, filename), join(root, '../', re_pyc.sub('', filename)))
|
||||
if filename and filename.endswith('py'):
|
||||
os.remove(join(root, filename))
|
||||
else:
|
||||
if target is None:
|
||||
target = join(source, 'dist')
|
||||
len_t = len(target)
|
||||
for root, dirname, filename in walk(source):
|
||||
t_root = root.replace(source, target)
|
||||
if target == root[:len_t]:
|
||||
continue
|
||||
if dirname and dirname != '__pycache__':
|
||||
t_root = join(t_root, dirname)
|
||||
if not os.path.exists(t_root) and join(source, dirname) != target:
|
||||
os.makedirs(t_root)
|
||||
elif filename and not filename.endswith('py'):
|
||||
if root[-11:] == '__pycache__':
|
||||
pycache_paths.add(root)
|
||||
t_root = t_root[:-11]
|
||||
shutil.move(join(root, filename), join(t_root, re_pyc.sub('', filename)))
|
||||
else:
|
||||
shutil.copyfile(join(root, filename), join(t_root, filename))
|
||||
for p in pycache_paths:
|
||||
os.rmdir(p)
|
Binary file not shown.
|
@ -0,0 +1,14 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project -> File :scrapyproject -> package_project
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Date :2021/5/12 10:46
|
||||
@Desc :
|
||||
"""
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
subprocess.call('python setup.py bdist_wheel')
|
||||
shutil.rmtree(r'build')
|
||||
shutil.rmtree(r'analysis_package.egg-info')
|
|
@ -0,0 +1,36 @@
|
|||
# coding:utf-8
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
PACKAGE = "analysis_package"
|
||||
NAME = "analysis_package"
|
||||
DESCRIPTION = "general analysis function"
|
||||
AUTHOR = "iod"
|
||||
AUTHOR_EMAIL = "rengengchen@sics.ac.cn"
|
||||
URL = ""
|
||||
VERSION = '0.1.3'
|
||||
|
||||
setup(
|
||||
name=NAME,
|
||||
version=VERSION,
|
||||
description=DESCRIPTION,
|
||||
author=AUTHOR,
|
||||
author_email=AUTHOR_EMAIL,
|
||||
license="BSD",
|
||||
url=URL,
|
||||
include_package_data=True,
|
||||
packages=find_packages(),
|
||||
classifiers=[
|
||||
'Programming Language :: Python',
|
||||
'Operating System :: OS Independent',
|
||||
],
|
||||
install_requires=[
|
||||
'pandas',
|
||||
'scipy',
|
||||
'numpy',
|
||||
'matplotlib',
|
||||
'seaborn',
|
||||
'tqdm',
|
||||
'scikit-learn',
|
||||
],
|
||||
zip_safe=False,
|
||||
)
|