wystan_rin 2024-05-12 20:18:24 +08:00
commit 707997d4e1
43 changed files with 1696 additions and 0 deletions

.idea/.gitignore vendored Normal file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml


@ -0,0 +1,12 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N802" />
</list>
</option>
</inspection_tool>
</profile>
</component>


@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/utils.iml" filepath="$PROJECT_DIR$/.idea/utils.iml" />
</modules>
</component>
</project>

.idea/utils.iml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

lib/README.md Normal file

@ -0,0 +1,5 @@
Module responsibilities:
1. continuous: feature analysis for numerical data
2. categorical: feature analysis for discrete data
3. timeseries: analysis methods for time-series data
4. pre-process: parse the configuration file and apply preprocessing (e.g. filling null values, sampling) before the data moves to the next step

lib/__init__.py Normal file


@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project IoD_data_analysis_tool
@File __init__.py.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/3 17:07
'''


@ -0,0 +1,30 @@
Categorical module:
    Feature analysis for discrete (categorical) data
    Analysis methods:
        1> Descriptive statistics:
            - the categories contained in a column
            - the number of categories
            - frequency table
            - contingency table
        2> Chi-square test of independence
        3> Information entropy
        4> Mutual information
    Features:
        Loop over multiple categorical columns and analyse each
    Runtime environment:
        python 3.7.10 or above
        - numpy
        - pandas
        - matplotlib
        - sklearn
        - scipy.stats
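
For orientation, a minimal usage sketch (an editor's illustration: the import path and sample values are assumed; the classes are defined in categorical_process.py below):

import pandas as pd
from categorical_process import CategorySelfDescribe, MutualCategoricalAnalyse  # assumed path

df = pd.DataFrame({'weather': ['sunny', 'rainy', 'sunny', 'cloudy'],
                   'temp':    ['high',  'low',   'high',  'low']})
print(CategorySelfDescribe.category_describe(df['weather']))    # categories and their count
print(CategorySelfDescribe.category_frequency(df['weather']))   # frequency table
print(MutualCategoricalAnalyse.chi2_independence(df['weather'], df['temp']))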


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/7/4 16:34
@Desc
"""


@ -0,0 +1,180 @@
# -*- coding: utf-8 -*-
# @Time : 2022/3/17 17:36
# @Author : Leng Yang
# @FileName: categorical_process.py
# @Software: PyCharm
import pandas as pd
import numpy as np
from sklearn import metrics
from scipy.stats import chi2_contingency, chi2
def test():
pass
class CategorySelfDescribe(object):
"""
    Descriptive statistics
"""
def __init__(self):
pass
@staticmethod
    def category_describe(data: pd.Series) -> pd.DataFrame:
        """
        Describe the categories contained in the column and the number of distinct categories.
        :param data: input data as a pd.Series
        :return: pd.DataFrame holding the list of category names and the number of distinct categories
        Examples
        --------
        (category values are illustrative placeholders; the originals were lost in extraction)
        >>> data1 = pd.DataFrame({'weather': ['sunny', 'rainy', 'sunny', 'cloudy']})
        >>> CategorySelfDescribe.category_describe(data1['weather'])
                       categories  types
        0  [sunny, rainy, cloudy]      3
        """
        # pd.DataFrame.append was removed in pandas 2.0; build the frame directly
        return pd.DataFrame([{'categories': data.unique(), 'types': len(data.unique())}])
@staticmethod
    def category_frequency(data: pd.Series) -> pd.DataFrame:
        """
        Frequency table.
        :param data: input data as a pd.Series
        :return: pd.DataFrame, the frequency table
        Examples
        --------
        (category values are illustrative placeholders; the originals were lost in extraction)
        >>> data1 = pd.DataFrame({'weather': ['sunny'] * 5 + ['rainy'] * 5 + ['cloudy'] * 4})
        >>> CategorySelfDescribe.category_frequency(data1['weather'])
          unique_values  count  frequency
        0         sunny      5   0.357143
        1         rainy      5   0.357143
        2        cloudy      4   0.285714
        """
        df_freq = data.value_counts(ascending=False).rename_axis('unique_values').reset_index(name='count')
        df_freq['frequency'] = df_freq['count'] / len(data)
        return df_freq
class CategorySelfAnalyse(object):
"""
    Statistical analysis of a single categorical column
"""
def __init__(self):
pass
@staticmethod
    def entropy(data: pd.Series) -> float:
        """
        Compute the information (Shannon) entropy.
        :param data: input data as a pd.Series
        :return: float, the information entropy
        """
        prob = data.value_counts() / len(data)  # pd.value_counts() was removed in pandas 2.x
        return sum(np.log2(prob) * prob * (-1))
class CategoryMutualDescribe(object):
"""
    Descriptive statistics for two different categorical columns
"""
def __init__(self):
pass
@staticmethod
    def crosstab(row_data: pd.Series, col_data: pd.Series) -> pd.DataFrame:
        """
        Contingency-table analysis of two categorical columns.
        :param row_data: categorical data 1, whose categories form the table rows
        :param col_data: categorical data 2, whose categories form the table columns
        :return: pd.DataFrame, the contingency table
        Examples
        --------
        (category values are illustrative placeholders; the originals were lost in extraction)
        >>> data1 = pd.DataFrame({'weather': ['sunny', 'rainy', 'sunny', 'cloudy'],
        ...                       'temp':    ['high',  'low',   'high',  'low']})
        >>> CategoryMutualDescribe.crosstab(data1['weather'], data1['temp'])
        temp     high  low
        weather
        cloudy      0    1
        rainy       0    1
        sunny       2    0
        """
        return pd.crosstab(row_data, col_data)
class MutualCategoricalAnalyse(object):
"""
    Statistical analysis of two categorical columns
"""
def __init__(self):
pass
@staticmethod
    def info_gain(df: pd.DataFrame, attr_col: str, data_col: str) -> float:
        """
        Compute the information gain Gain(D,A) = Ent(D) - Ent(D|A),
        i.e. the entropy reduction from splitting dataset D by feature A.
        :param df: input data as a DataFrame
        :param attr_col: column name of the splitting feature
        :param data_col: column name of the target data
        :return: float, the information gain
        """
        # e1: conditional entropy of the target within each feature value
        e1 = df.groupby(attr_col).apply(lambda x: CategorySelfAnalyse.entropy(x[data_col]))
        p1 = df[attr_col].value_counts() / len(df[attr_col])  # p(x)
        e2 = sum(e1 * p1)  # Ent(D|A)
        return CategorySelfAnalyse.entropy(df[data_col]) - e2
@staticmethod
def normalized_mutual_information(data1: pd.Series, data2: pd.Series) -> float:
"""
Mutual Information between two clusterings. The Mutual Information is a measure of the similarity
between two labels of the same data.
Normalized Mutual Information (NMI) is a normalization of the Mutual
Information (MI) score to scale the results between 0 (no mutual
information) and 1 (perfect correlation).
        :param data1: categorical data 1
        :param data2: categorical data 2
:return: nmi : float, score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling
"""
return metrics.normalized_mutual_info_score(data1, data2)
@staticmethod
    def chi2_independence(data1: pd.Series, data2: pd.Series, alpha=0.05) -> pd.DataFrame:
        """
        Chi-square test of independence.
        :param alpha: significance level, used to determine the critical value
        :param data1: categorical data 1
        :param data2: categorical data 2
        :return: pd.DataFrame with the following columns:
            g: the chi-square statistic
            p: the p-value; if it is smaller than alpha, the null hypothesis can be rejected
            dof: degrees of freedom
            re: verdict flag, 1 = reject the null hypothesis, 0 = accept it
            expctd: expected (theoretical) values with the same shape as the input table
        """
        data = CategoryMutualDescribe.crosstab(data1, data2)
        g, p, dof, expctd = chi2_contingency(data)
        # pd.DataFrame.append was removed in pandas 2.0; build the frame directly
        result = pd.DataFrame([{'g': g, 'p': p, 'dof': dof, 'expctd': expctd}])
        if dof == 0:
            raise ValueError('The degrees of freedom should be at least 1')
        elif dof == 1:
            cv = chi2.isf(alpha * 0.5, dof)  # critical value
        else:
            cv = chi2.isf(alpha * 0.5, dof - 1)
        if g > cv:
            result.loc[0, 're'] = 1  # reject the null hypothesis
        else:
            result.loc[0, 're'] = 0  # accept the null hypothesis
        return result
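
As a quick sanity check of the entropy helper above (an editor's sketch, not part of the commit), the formula H(X) = -Σ p(x)·log2 p(x) gives 1 bit for a balanced two-class series:

import numpy as np
import pandas as pd

s = pd.Series(['a', 'a', 'b', 'b'])   # p(a) = p(b) = 0.5
prob = s.value_counts() / len(s)
print((-prob * np.log2(prob)).sum())  # 1.0, the maximum for two classes
print(CategorySelfAnalyse.entropy(s)) # same value via the class above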


@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File __init__.py.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/5 11:52
"""


@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File __init__.py.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/5 11:52
"""


@ -0,0 +1,127 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File producer_consumer.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/5 11:53
"""
import multiprocessing
from typing import Iterable, Callable
from tqdm import tqdm
class Stop:
pass
class AbstractPCConcurrencySystem:
"""
    @todo maintenance of the started processes
    @todo control of the number of processes
"""
def __init__(self, num_producer: int = 1, num_consumer: int = 1, num_callback: int = 0,
len_task_queue: int = 0, len_result_queue: int = 0, len_callback_queue: int = 0,
producer_lock=None, consumer_lock=None, callback_lock=None,
meta=None, enable_progressbar=False, num_total_result=None):
self.task_queue = multiprocessing.Queue(len_task_queue)
self.num_producer = num_producer
self.num_consumer = num_consumer
self.num_callback = num_callback
self.producer_lock = producer_lock or multiprocessing.Lock()
self.consumer_lock = consumer_lock or multiprocessing.Lock()
self.meta = meta
self.enable_progressbar = enable_progressbar
if enable_progressbar and self.num_callback == 0:
self.num_callback = 1
self.result_queue = multiprocessing.Queue(len_result_queue)
if self.num_callback:
self.callback_lock = callback_lock or multiprocessing.Lock()
self.num_total_result = num_total_result
self.callback_queue = multiprocessing.Queue(len_callback_queue)
def get_result(self):
return self.callback_queue.get()
def produce(self):
"""
        Must return either an iterable of task parameters, or a callable that
        returns one task per call and a Stop instance when it is finished.
"""
raise NotImplementedError
def consume(self, consumer_params):
"""
@return: task result or Stop()
"""
raise NotImplementedError
def callback(self, result):
return result
def _produce(self):
producer = self.produce()
if isinstance(producer, Iterable):
for params in producer:
self.task_queue.put(params, block=True)
stop = Stop()
for _ in range(self.num_consumer):
self.task_queue.put(stop, block=True)
        elif isinstance(producer, Callable):
            while True:
                task = producer()
                if isinstance(task, Stop):
                    break
                self.task_queue.put(task, block=True)
            # propagate the stop signal to every consumer, as in the iterable branch
            for _ in range(self.num_consumer):
                self.task_queue.put(Stop(), block=True)
def _consume(self):
consumer_params = self.task_queue.get(block=True)
while not isinstance(consumer_params, Stop):
info = self.consume(consumer_params)
self.result_queue.put(info)
consumer_params = self.task_queue.get(block=True)
self.result_queue.put(Stop())
def _callback(self):
if self.enable_progressbar:
bar = tqdm(total=self.num_total_result)
over_flag = 0
while over_flag < self.num_consumer:
result = self.result_queue.get(block=True)
if isinstance(result, Stop):
over_flag += 1
else:
callback = self.callback(result)
self.callback_queue.put(callback)
if self.enable_progressbar:
bar.update(1)
else:
if self.enable_progressbar:
bar.close()
def run(self):
consumers = []
callbackers = []
        # create and start the producers
for i in range(self.num_producer):
multiprocessing.Process(target=self._produce, name=f'producer_{i}').start()
        # create and start the consumers
for i in range(self.num_consumer):
p = multiprocessing.Process(target=self._consume, name=f'consumer_{i}')
consumers.append(p)
p.start()
        # handle the results
if self.num_callback:
for i in range(self.num_callback):
p = multiprocessing.Process(target=self._callback, name=f'callback_{i}')
callbackers.append(p)
p.start()
return self
def close(self):
self.task_queue.close()
self.result_queue.close()
self.callback_queue.close()
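
A minimal subclass sketch showing the intended usage of the class above (the Doubler example is the editor's illustration, not part of the commit):

class Doubler(AbstractPCConcurrencySystem):
    def produce(self):
        return range(10)   # an iterable of task parameters

    def consume(self, n):
        return n * 2       # runs in the consumer processes

if __name__ == '__main__':
    system = Doubler(num_consumer=2, num_callback=1).run()
    print(sorted(system.get_result() for _ in range(10)))  # [0, 2, ..., 18]
    system.close()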


@ -0,0 +1,28 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File distribute_task.py
@IDE PyCharm
@Author rengengchen
@Time 2022/8/8 16:55
"""
import math
import multiprocessing
def equally_distributing_task(target, tasks, *args, results=None, num_processors=8):
len_tasks = len(tasks)
process_offset = math.ceil(len_tasks / num_processors)
for i in range(num_processors):
sub_tasks = tasks[i * process_offset: (i + 1) * process_offset]
if sub_tasks:
if results:
multiprocessing.Process(target=target,
args=(sub_tasks, results, *args)).start()
else:
multiprocessing.Process(target=target,
args=(sub_tasks, *args)).start()
else:
break
return results
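
Usage sketch for the helper above (the worker function and the Manager list are illustrative assumptions):

from multiprocessing import Manager

def worker(sub_tasks, results):
    results.extend(t * t for t in sub_tasks)

if __name__ == '__main__':
    results = Manager().list()
    # note: the helper starts the workers but does not join them,
    # so wait for them to finish before reading `results`
    equally_distributing_task(worker, list(range(16)), results=results, num_processors=4)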


@ -0,0 +1,48 @@
CrimeRate,Youth,Southern,Education,ExpenditureYear0,LabourForce,Males,MoreMales,StateSize,YouthUnemployment,MatureUnemployment,HighYouthUnemploy,Wage,BelowWage,CrimeRate10,Youth10,Education10,ExpenditureYear10,LabourForce10,Males10,MoreMales10,StateSize10,YouthUnemploy10,MatureUnemploy10,HighYouthUnemploy10,Wage10,BelowWage10
45.5,135,0,12.4,69,540,965,0,6,80,22,1,564,139,26.5,135,12.5,71,564,974,0,6,82,20,1,632,142
52.3,140,0,10.9,55,535,1045,1,6,135,40,1,453,200,35.9,135,10.9,54,540,1039,1,7,138,39,1,521,210
56.6,157,1,11.2,47,512,962,0,22,97,34,0,288,276,37.1,153,11,44,529,959,0,24,98,33,0,359,256
60.3,139,1,11.9,46,480,968,0,19,135,53,0,457,249,42.7,139,11.8,41,497,983,0,20,131,50,0,510,235
64.2,126,0,12.2,106,599,989,0,40,78,25,1,593,171,46.7,125,12.2,97,602,989,0,42,79,24,1,660,162
67.6,128,0,13.5,67,624,972,0,28,77,25,1,507,206,47.9,128,13.8,60,621,983,0,28,81,24,1,571,199
70.5,130,0,14.1,63,641,984,0,14,70,21,1,486,196,50.6,153,14.1,57,641,993,0,14,71,23,1,556,176
73.2,143,0,12.9,66,537,977,0,10,114,35,1,487,166,55.9,143,13,63,549,973,0,11,119,36,1,561,168
75,141,0,12.9,56,523,968,0,4,107,37,0,489,170,61.8,153,12.9,54,538,968,0,5,110,36,1,550,126
78.1,133,0,11.4,51,599,1024,1,7,99,27,1,425,225,65.4,134,11.2,47,600,1024,1,7,97,28,1,499,215
79.8,142,1,12.9,45,533,969,0,18,94,33,0,318,250,71.4,142,13.1,44,552,969,0,19,93,36,0,378,247
82.3,123,0,12.5,97,526,948,0,113,124,50,0,572,158,75.4,134,12.4,87,529,949,0,117,125,49,0,639,146
83.1,135,0,13.6,62,595,986,0,22,77,27,0,529,190,77.3,137,13.7,61,599,993,0,23,80,28,0,591,189
84.9,121,0,13.2,118,547,964,0,25,84,29,0,689,126,78.6,132,13.3,115,538,968,0,25,82,30,0,742,127
85.6,166,1,11.4,58,521,973,0,46,72,26,0,396,237,80.6,153,11.2,54,543,983,0,47,76,25,1,568,246
88,140,0,12.9,71,632,1029,1,7,100,24,1,526,174,82.2,130,12.9,68,620,1024,1,8,104,25,1,570,182
92.3,126,0,12.7,74,602,984,0,34,102,33,1,557,195,87.5,134,12.9,67,599,982,0,33,107,34,1,621,199
94.3,130,0,13.3,128,536,934,0,51,78,34,0,627,135,92.9,127,13.3,128,530,949,0,52,79,33,0,692,140
95.3,125,0,12,90,586,964,0,97,105,43,0,617,163,94.1,134,11.9,81,571,971,0,99,106,41,0,679,162
96.8,151,1,10,58,510,950,0,33,108,41,0,394,261,96.2,161,10.1,56,515,1001,1,32,110,40,0,465,254
97.4,152,1,10.8,57,530,986,0,30,92,43,0,405,264,97.8,152,11,53,541,989,0,30,92,41,0,470,243
98.7,162,1,12.1,75,522,996,0,40,73,27,0,496,224,99.9,162,12,70,533,992,0,41,80,28,0,562,229
99.9,149,1,10.7,61,515,953,0,36,86,35,0,395,251,101.4,150,10.7,54,520,952,0,35,84,32,0,476,249
103,177,1,11,58,638,974,0,24,76,28,0,382,254,103.5,164,10.9,56,638,978,0,25,79,28,0,456,257
104.3,134,0,12.5,75,595,972,0,47,83,31,0,580,172,104.5,133,12.7,71,599,982,0,50,87,32,0,649,182
105.9,130,0,13.4,90,623,1049,1,3,113,40,0,588,160,106.4,153,13.4,91,622,1050,1,3,119,41,0,649,159
106.6,157,1,11.1,65,553,955,0,39,81,28,0,421,239,107.8,156,11.2,62,562,956,0,39,85,29,0,499,243
107.2,148,0,13.7,72,601,998,0,9,84,20,1,590,144,110.1,134,13.9,66,602,999,0,9,87,15,0,656,151
108.3,126,0,13.8,97,542,990,0,18,102,35,0,589,166,110.5,126,13.8,97,549,993,0,19,103,34,1,659,160
109.4,135,1,11.4,123,537,978,0,31,89,34,0,631,165,113.5,134,11.3,115,529,978,0,32,93,35,0,703,175
112.1,142,1,10.9,81,497,956,0,33,116,47,0,427,247,116.3,147,10.7,77,501,962,0,33,117,44,0,500,256
114.3,127,1,12.8,82,519,982,0,4,97,38,0,620,168,119.7,125,12.9,79,510,945,0,4,99,39,0,696,170
115.1,131,0,13.7,78,574,1038,1,7,142,42,1,540,176,124.5,134,13.6,73,581,1029,1,7,143,41,1,615,177
117.2,136,0,12.9,95,574,1012,1,29,111,37,1,622,162,127.8,140,13,96,581,1011,1,29,115,36,1,691,169
119.7,119,0,11.9,166,521,938,0,168,92,36,0,637,154,129.8,120,11.9,157,524,935,0,180,93,27,1,698,169
121.6,147,1,13.9,63,560,972,0,23,76,24,1,462,233,130.7,139,14,64,571,970,0,24,78,24,1,511,220
123.4,145,1,11.7,82,560,981,0,96,88,31,0,488,228,132.5,154,11.8,74,563,980,0,99,89,29,1,550,230
127.2,132,0,10.4,87,564,953,0,43,83,32,0,513,227,134.6,135,10.2,83,560,948,0,44,83,32,0,589,234
132.4,152,0,12,82,571,1018,1,10,103,28,1,537,215,137.5,151,12.1,76,567,1079,1,11,105,27,1,617,204
135.5,125,0,12.5,113,567,985,0,78,130,58,0,626,166,140.5,140,12.5,105,571,993,0,77,131,59,0,684,174
137.8,141,0,14.2,109,591,985,0,18,91,20,1,578,174,145.7,142,14.2,101,590,987,0,19,94,19,1,649,180
140.8,150,0,12,109,531,964,0,9,87,38,0,559,153,150.6,153,12,98,539,982,0,10,88,36,0,635,151
145.4,131,1,12.2,115,542,969,0,50,79,35,0,472,206,157.3,131,12.1,109,548,976,0,52,82,34,0,539,219
149.3,143,0,12.3,103,583,1012,1,13,96,36,0,557,194,162.7,142,12.2,95,612,1003,1,13,97,36,0,625,196
154.3,124,0,12.3,121,580,966,0,101,77,35,0,657,170,169.6,134,12.2,116,580,987,0,104,79,36,0,719,172
157.7,136,0,15.1,149,577,994,0,157,102,39,0,673,167,177.2,140,15.2,141,578,995,0,160,110,40,0,739,169
161.8,131,0,13.2,160,631,1071,1,3,102,41,0,674,152,178.2,132,13.2,143,632,1058,1,4,100,40,0,748,150


@ -0,0 +1,29 @@
# **Numerical data analysis and process tools**
### **Project Description**:
- Numerical data correlation analysis and processing, using image visualization to aid understanding.
#### Numerical analysis tools part
- Spearman_correlation determines whether there is a monotonic component between two features;
unlike Pearson's r, it can also be applied to non-linear (monotonic) relationships and to ordinal data.
#### Numerical process tools part
- Detecting outliers by using the interquartile range (IQR); a minimal sketch follows this file.
- Removing features that are highly correlated with other features.
#### How to use the tools
Input numerical-only data (data type: DataFrame).
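
For reference, the IQR rule mentioned above flags values outside [Q1 - 1.5·IQR, Q3 + 1.5·IQR]; a minimal sketch (sample values are made up):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
print(s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)])  # flags the outlier 100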


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/7/4 16:34
@Desc
"""


@ -0,0 +1,38 @@
import os
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import logging
logger = logging.getLogger(__name__)
def Spearman_rank_test(data_frame, feature_a, feature_b, save_path, file_name, sample_size=4000):
    """
    Spearman correlation determines whether there is a monotonic
    component between two features; it can be applied to non-linear
    (monotonic) relationships and to ordinal data.
    @param data_frame: Input DataFrame holding both features
    @param feature_a: First feature for Spearman's rank test
    @param feature_b: Second feature for Spearman's rank test
    @param sample_size: Sample size used to represent the population
    @param save_path: output path
    @param file_name: output name
    """
    a = data_frame[feature_a].sample(n=sample_size, random_state=1)
    b = data_frame[feature_b].sample(n=sample_size, random_state=1)
    coef, p = spearmanr(a, b)
    logger.info("Spearman's correlation coefficient is: " + str(coef))
    alpha = 0.05
    plt.scatter(a, b)
    plt.xlabel("Feature A")
    plt.ylabel("Feature B")
    plt.title("Spearman's Rank Test")
    plt.savefig(os.path.join(save_path, file_name))
    if p > alpha:
        logger.info("Features are uncorrelated (failed to reject H0) p=" + str(p))
    else:
        logger.info("Features have a monotonic relationship (reject H0) p=" + str(p))


@ -0,0 +1,155 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> correlation
@IDE PyCharm
@Author rengengchen
@Date 2022/7/4 16:48
@Desc
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr as scipy_spearmanr  # aliased so the wrapper below does not shadow it and recurse
def spearmanr(a: pd.Series, b: pd.Series = None, axis=0, nan_policy='propagate',
alternative='two-sided', sample_size=4000, random_state=None):
"""Calculate a Spearman correlation coefficient with associated p-value.
The Spearman rank-order correlation coefficient is a nonparametric measure
of the monotonicity of the relationship between two datasets. Unlike the
Pearson correlation, the Spearman correlation does not assume that both
datasets are normally distributed. Like other correlation coefficients,
this one varies between -1 and +1 with 0 implying no correlation.
Correlations of -1 or +1 imply an exact monotonic relationship. Positive
correlations imply that as x increases, so does y. Negative correlations
imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system
producing datasets that have a Spearman correlation at least as extreme
as the one computed from these datasets. The p-values are not entirely
reliable but are probably reasonable for datasets larger than 500 or so.
Parameters
----------
a, b : 1D or 2D array_like, b is optional
One or two 1-D or 2-D arrays containing multiple variables and
observations. When these are 1-D, each represents a vector of
observations of a single variable. For the behavior in the 2-D case,
see under ``axis``, below.
Both arrays need to have the same length in the ``axis`` dimension.
axis : int or None, optional
If axis=0 (default), then each column represents a variable, with
observations in the rows. If axis=1, the relationship is transposed:
each row represents a variable, while the columns contain observations.
If axis=None, then both arrays will be raveled.
nan_policy : {'propagate', 'raise', 'omit'}, optional
Defines how to handle when input contains nan.
The following options are available (default is 'propagate'):
* 'propagate': returns nan
* 'raise': throws an error
* 'omit': performs the calculations ignoring nan values
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis. Default is 'two-sided'.
The following options are available:
* 'two-sided': the correlation is nonzero
* 'less': the correlation is negative (less than zero)
* 'greater': the correlation is positive (greater than zero)
sample_size : int, optional
Number of items from column to return. Default is 4000.
random_state : int, array-like, BitGenerator, np.random.RandomState, optional
If int, array-like, or BitGenerator (NumPy>=1.17), seed for
random number generator
If np.random.RandomState, use as numpy RandomState object.
Returns
-------
correlation : float or ndarray (2-D square)
Spearman correlation matrix or correlation coefficient (if only 2
        variables are given as parameters). Correlation matrix is square with
length equal to total number of variables (columns or rows) in ``a``
and ``b`` combined.
pvalue : float
        The p-value for a hypothesis test whose null hypothesis
is that two sets of data are uncorrelated. See `alternative` above
for alternative hypotheses. `pvalue` has the same
shape as `correlation`.
References
----------
.. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
Probability and Statistics Tables and Formulae. Chapman & Hall: New
York. 2000.
Section 14.7
Examples
--------
>>> from scipy import stats
>>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])
SpearmanrResult(correlation=0.82078..., pvalue=0.08858...)
>>> rng = np.random.default_rng()
>>> x2n = rng.standard_normal((100, 2))
>>> y2n = rng.standard_normal((100, 2))
>>> stats.spearmanr(x2n)
SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
>>> stats.spearmanr(x2n[:,0], x2n[:,1])
SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
>>> rho, pval = stats.spearmanr(x2n, y2n)
>>> rho
array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
[-0.07960396, 1. , -0.14448245, 0.16738074],
[-0.08314431, -0.14448245, 1. , 0.03234323],
[ 0.09662166, 0.16738074, 0.03234323, 1. ]])
>>> pval
array([[0. , 0.43111687, 0.41084066, 0.33891628],
[0.43111687, 0. , 0.15151618, 0.09600687],
[0.41084066, 0.15151618, 0. , 0.74938561],
[0.33891628, 0.09600687, 0.74938561, 0. ]])
>>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1)
>>> rho
array([[ 1. , -0.07960396, -0.08314431, 0.09662166],
[-0.07960396, 1. , -0.14448245, 0.16738074],
[-0.08314431, -0.14448245, 1. , 0.03234323],
[ 0.09662166, 0.16738074, 0.03234323, 1. ]])
>>> stats.spearmanr(x2n, y2n, axis=None)
SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
>>> stats.spearmanr(x2n.ravel(), y2n.ravel())
SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
>>> rng = np.random.default_rng()
>>> xint = rng.integers(10, size=(100, 2))
>>> stats.spearmanr(xint)
SpearmanrResult(correlation=0.09800224850707953, pvalue=0.3320271757932076)
"""
    # a = a.sample(n=sample_size, random_state=random_state)
    # if b is not None:
    #     b = b.sample(n=sample_size, random_state=random_state)
    return scipy_spearmanr(a, b, axis=axis, nan_policy=nan_policy, alternative=alternative)
def corr(df, method='pearson', drop=False, threshold=0, plot=True, filepath=None, figsize=None):
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
cmap = sns.diverging_palette(250, 15, s=95, l=40, n=9, center="light", as_cmap=True)
cov = df.corr(method=method)
if drop:
uncorr = ~np.any(np.abs(np.tril(cov, k=-1)) > threshold, axis=1)
cov = cov[uncorr]
cov = cov[cov.index]
if plot or filepath:
mask = np.triu(np.ones_like(cov, dtype=bool))
fig, ax = plt.subplots(figsize=figsize)
sns.heatmap(cov, mask=mask, center=0, annot=True, fmt='.2f', cmap=cmap, square=True, ax=ax)
plt.title("相关性矩阵")
if filepath:
plt.savefig(filepath)
if plot:
plt.show()
return cov
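
Usage sketch for corr() above (synthetic data; with drop=True the near-duplicate column should be removed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=['x', 'y', 'z'])
df['x2'] = df['x'] * 2 + rng.normal(scale=0.01, size=200)  # near-duplicate of x

reduced = corr(df, drop=True, threshold=0.9, plot=False)
print(reduced.columns.tolist())  # the x/x2 pair is reduced to one column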


@ -0,0 +1,48 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/3/25 9:09
# @Software : PyCharm
# @File : process_tool.py
# @Author : QT
# @Email : taoqimin@sics.ac.cn
import numpy as np
from tqdm import tqdm
import logging
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)
class NumericProcess:
@staticmethod
    def drop_feature(data_frame, thresh_hold):
        """
        Detect and drop highly correlated features.
        When two variables are highly correlated, they usually cause problems
        such as multicollinearity, so the correlated features are removed.
        @param data_frame: Input dataframe
        @param thresh_hold: A number between 0 and 1; a feature is dropped when its
            absolute correlation with another feature exceeds this level
        """
        matrix = data_frame.corr().abs()
        mask = np.triu(np.ones_like(matrix, dtype=bool))
        reduced_matrix = matrix.mask(mask)
        feature_drop = [c for c in tqdm(reduced_matrix) if
                        any(reduced_matrix[c] > thresh_hold)]
        data_frame.drop(feature_drop, axis=1, inplace=True)
        logger.info("The following features are dropped due to multicollinearity: " + str(feature_drop))
        return data_frame
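
Usage sketch (synthetic data; one column of the nearly identical pair is dropped):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'a': rng.normal(size=100)})
df['b'] = df['a'] + rng.normal(scale=0.01, size=100)  # almost identical to 'a'
df['c'] = rng.normal(size=100)

print(NumericProcess.drop_feature(df, 0.95).columns.tolist())  # 'a' is dropped, its pair 'b' is kept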


@ -0,0 +1,20 @@
Parse the configuration file and apply preprocessing (e.g. filling null values, sampling) before the data moves to the next step.
The following preprocessing features of the Pre-process Lib are complete so far:
- data_insight
    - DuplicateInsight - detection of duplicate data
    - NullInsight - detection of null values
    - ValidationInsight - data validity checks
- data_process
    - FilteringProcessor - data filtering
In addition:
- TypeInsight - not finished yet; the check for date values is still missing


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:40
@Desc
"""


@ -0,0 +1,133 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# file: data_insight
# author: shenwentao, wangkanglong
# description:
# date: 2022-03-30 16:45
# IDE: PyCharm
import pandas as pd
import datetime
from typing import List, Union
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype
from iod_data_analysis_tool.utils.assertion import assert_range
class DuplicateInsight:
@staticmethod
def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
"""
        User-defined count of duplicate rows
        :param data: source data
        :param subset: selected columns/fields, same as the subset parameter of pd.DataFrame.duplicated
        :param keep: which duplicates to mark, same as the keep parameter of pd.DataFrame.duplicated
        :return: the count result
"""
result = data.duplicated(subset, keep=keep).sum()
return pd.DataFrame([result], columns=['duplicate_num'])
class NullInsight:
@staticmethod
def num_null(data, column: str = None) -> pd.DataFrame:
"""
        User-defined count of null values in the data
        :param data: source data
        :param column: selected column/field
        :return: the count result
"""
if column is not None:
return pd.DataFrame([data[column].isna().sum()], columns=['null_num'], index=[column])
else:
return pd.DataFrame(data.isna().sum(), columns=['null_num'])
class ValidationInsight:
"""
    Custom validation of data validity (e.g. bad values in the data), limiting the allowed range per data type
"""
@staticmethod
def validation_continuous_range(data: pd.DataFrame, column: str,
min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
"""
        User-defined validation of continuous numerical data; counts values inside and outside the given range
        :param data: source data
        :param column: selected column/field
        :param min_val: lower bound of the range
        :param max_val: upper bound of the range
        :return: the count result
"""
assert_range(min_val, max_val)
nums = dict()
nums['column'] = column
nums['num_lt_min'] = data.query(f'{column} < {min_val}').shape[0]
nums['num_gt_max'] = data.query(f'{column} > {max_val}').shape[0]
nums['num_within_range'] = data.shape[0] - nums['num_lt_min'] - nums['num_gt_max']
return pd.DataFrame([nums], index=['result'])
@staticmethod
def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
"""
        User-defined validation of categorical data; counts values inside and outside the given set
        :param data: source data
        :param column: selected column/field
        :param values: user-defined set of allowed discrete values, i.e. the "range"
        :return: the count result
"""
nums = dict()
nums['column'] = column
nums['num_within_range'] = data[data[column].isin(values)].shape[0]
nums['num_out_range'] = len(data[column]) - nums['num_within_range']
return pd.DataFrame([nums], index=['result'])
@staticmethod
def validation_date_range(data, column: str, start_date: datetime.date,
end_date: datetime.date) -> pd.DataFrame:
"""
        User-defined validation of a date range; counts values inside and outside the range (assumes the data type is datetime.date)
        :param data: source data
        :param column: selected column/field
        :param start_date: start date
        :param end_date: end date
        :return: the count result
"""
assert_range(start_date, end_date)
nums = dict()
nums['column'] = column
nums['date_lt_start'] = sum(data[column] < start_date)
nums['date_gt_end'] = sum(data[column] > end_date)
        nums['date_within_range'] = data.shape[0] - nums['date_lt_start'] - nums['date_gt_end']
return pd.DataFrame([nums], index=['result'])
class TypeInsight:
"""
    Lets the user check whether the data's type matches what they expect
"""
    # TODO: a timestamp checker is still missing
_checkers = {
'int': is_integer_dtype,
'float': is_float_dtype,
'string': is_string_dtype,
'bool': is_bool_dtype,
'datetime': is_datetime64_dtype
}
@staticmethod
def type_check(data, column: str, check_type: str) -> pd.DataFrame:
"""
        Check whether a column's data type is the one the user expects
        :param data: source data
        :param column: selected column/field
        :param check_type: dtype to check, one of {'int', 'float', 'string', 'bool', 'datetime'}
        :return: the check result
"""
        flag = bool(TypeInsight._checkers[check_type](data[column]))
return pd.DataFrame([flag], columns=['result'], index=[column])
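
Usage sketch for the insight classes above (the sample frame is made up):

import pandas as pd

df = pd.DataFrame({'age': [25, -3, 40, 130, 40]})
print(DuplicateInsight.num_duplicate(df))  # one duplicated row (the second 40)
print(NullInsight.num_null(df))            # no nulls
print(ValidationInsight.validation_continuous_range(df, 'age', 0, 120))
# -> num_lt_min=1 (-3), num_gt_max=1 (130), num_within_range=3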


@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> normalizer
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:40
@Desc
"""
from scipy.stats import zscore as scipy_zscore
def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
    """
    Zi = (Xi - μ) / σ
    """
    return scipy_zscore(a, axis=axis, ddof=ddof, nan_policy=nan_policy)


@ -0,0 +1,51 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> outlierprocessing
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:24
@Desc
"""
from typing import Union
import pandas as pd
def MAD(data: pd.Series, n: float = 2.5, constant=1.4826, axis=0):
    """
    MAD = median(|Xi - median(X)|)
    Clips values farther than n * constant * MAD from the median.
    @return the clipped Series/DataFrame
    """
    x = data.median()
    MC = (data - x).abs().median()  # raw median absolute deviation
    MAD = MC * constant             # scaled to be comparable with sigma for normal data
    offset = n * MAD
    if isinstance(data, pd.DataFrame):
        return data.clip(lower=x - offset, upper=x + offset, axis=axis)
    else:
        return data.clip(lower=x - offset, upper=x + offset)
def three_sigma(data: pd.Series):
miu = data.mean()
sigma = data.std()
low = miu - 3 * sigma
up = miu + 3 * sigma
return data.index[(data < low) | (data > up)]
def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
q = data.quantile(q=[q1, q3])
IQR = q[q3] - q[q1]
lower_whisker_limit = q[q1] - k * IQR
upper_whisker_limit = q[q3] + k * IQR
return data.index[(data < lower_whisker_limit) | (data > upper_whisker_limit)]
def regex_match(data: pd.Series, *patterns):
pattern = '|'.join(patterns)
return data.index[data.astype(str).str.contains(pattern, regex=True)]
def empty(data: Union[pd.Series, pd.DataFrame]):
    # any(data.isnull()) would iterate over column labels for a DataFrame
    return bool(data.isnull().values.any())
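
A quick check of the outlier helpers above (values are made up):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])
print(box_plot(s))     # Index([4]): position of the outlier 100
print(three_sigma(s))  # empty here; 3-sigma is far less sensitive on tiny samples
print(MAD(s))          # 100 is clipped toward the median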


@ -0,0 +1,24 @@
## Analysis methods for time-series data
--------
| Module | Methods |
| ---- | ---- |
| Basics | |
| Stationarity | |
| Anomaly detection | |
| Frequency detection | |
| Periodicity detection | |
| Others | |
### Basics
### Stationarity
### Anomaly detection
### Frequency detection
### Periodicity detection
### Others


@ -0,0 +1,26 @@
import pandas as pd
def describe_datetime_info(data: pd.Series, datetime_is_numeric: bool = False) -> pd.Series:
"""
    If the data are strings that do not include a date part, the date will be
    populated with today's date.
@param data: data
@param datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric. This affects statistics
calculated for the column. For DataFrame input, this also
controls whether datetime columns are included by default.
@return: Summary statistics of the Series.
    @example: Describing a datetime ``Series``.
>>> s = pd.read_csv()
>>> s.describe()
count 1427132
unique 25111
top 2022-04-26 09:25:00.260000
freq 32994
first 2022-04-26 09:25:00
last 2022-04-26 09:34:46.340000
Name: TradTime, dtype: object
"""
return pd.to_datetime(data).describe(datetime_is_numeric=datetime_is_numeric)


@ -0,0 +1,62 @@
from time_base import Time_base  # the base class defined in time_base.py
from statsmodels.tsa.stattools import adfuller
class stationaryTest(Time_base):
"""
    Stationarity tests for time series
"""
def __init__(self):
pass
def test_stationary(self, x, window_size):
"""
        Stationarity test for a time series
        x : time-series data
        window_size : window size
"""
x_ma = self.moving_average(x, window_size)
x_std = self.moving_std(x, window_size)
x_max = self.moving_max(x, window_size)
x_min = self.moving_min(x, window_size)
x_median = self.moving_median(x, window_size)
x_normalized = self.normalize(x)
x_ma_normalized = self.normalize(x_ma)
x_std_normalized = self.normalize(x_std)
x_max_normalized = self.normalize(x_max)
x_min_normalized = self.normalize(x_min)
x_median_normalized = self.normalize(x_median)
x_normalized_ma_normalized = self.normalize(x_normalized - x_ma_normalized)
x_normalized_std_normalized = self.normalize(x_normalized - x_std_normalized)
x_normalized_max_normalized = self.normalize(x_normalized - x_max_normalized)
x_normalized_min_normalized = self.normalize(x_normalized - x_min_normalized)
x_normalized_median_normalized = self.normalize(x_normalized - x_median_normalized)
x_normalized_ma_normalized_std_normalized = self.normalize(x_normalized_ma_normalized - x_std)
return x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized, x_min_normalized, x_median_normalized, x_normalized_ma_normalized, x_normalized_std_normalized, x_normalized_max_normalized, x_normalized_min_normalized, x_normalized_median_normalized, x_normalized_ma_normalized_std_normalized
def adf_test(self, x, window_size):
"""
        ADF stationarity test for a time series
        x : time-series data
        window_size : window size
"""
x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized, x_min_normalized, x_median_normalized, x_normalized_ma_normalized, x_normalized_std_normalized, x_normalized_max_normalized, x_normalized_min_normalized, x_normalized_median_normalized, x_normalized_ma_normalized_std_normalized = self.test_stationary(x, window_size)
        adf_test_normalized = adfuller(x_normalized)
        adf_test_ma_normalized = adfuller(x_ma_normalized)
        adf_test_std_normalized = adfuller(x_std_normalized)
        adf_test_max_normalized = adfuller(x_max_normalized)
        adf_test_min_normalized = adfuller(x_min_normalized)
        adf_test_median_normalized = adfuller(x_median_normalized)
        adf_test_normalized_ma_normalized = adfuller(x_normalized_ma_normalized)
        adf_test_normalized_std_normalized = adfuller(x_normalized_std_normalized)
        adf_test_normalized_max_normalized = adfuller(x_normalized_max_normalized)
        adf_test_normalized_min_normalized = adfuller(x_normalized_min_normalized)
return adf_test_normalized, adf_test_ma_normalized, adf_test_std_normalized, adf_test_max_normalized, adf_test_min_normalized, adf_test_median_normalized, adf_test_normalized_ma_normalized, adf_test_normalized_std_normalized, adf_test_normalized_max_normalized, adf_test_normalized_min_normalized


@ -0,0 +1,133 @@
import pandas as pd
import numpy as np
class Time_base(object):
    """
    Time-series basics module
    """
    def __init__(self):
        pass
    @staticmethod
    def normalize(x):
        """
        Normalize the time-series data
        x : time-series data
        """
        x = np.array(x)
        return np.log2(x / np.sqrt(np.sum(x**2)))
    @staticmethod
    def lag(x, lag):
        """
        Lag the series
        x : time-series data
        lag : number of periods to shift
        """
        return pd.Series(x).shift(lag)
    @staticmethod
    def moving_average(x, window_size):
        """
        Rolling mean
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).mean()
    @staticmethod
    def moving_median(x, window_size):
        """
        Rolling median
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).median()
    @staticmethod
    def moving_std(x, window_size):
        """
        Rolling standard deviation
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).std()
    @staticmethod
    def moving_max(x, window_size):
        """
        Rolling maximum
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).max()
    @staticmethod
    def moving_min(x, window_size):
        """
        Rolling minimum
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).min()
    @staticmethod
    def moving_sum(x, window_size):
        """
        Rolling sum
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).sum()
    @staticmethod
    def moving_quantile(x, window_size, quantile):
        """
        Rolling quantile
        x : time-series data
        window_size : window size
        quantile : quantile level
        """
        return pd.Series(x).rolling(window_size).quantile(quantile)
    @staticmethod
    def moving_corr(x, y, window_size):
        """
        Rolling correlation
        x : time-series data
        y : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).corr(pd.Series(y))
    @staticmethod
    def moving_cov(x, y, window_size):
        """
        Rolling covariance
        x : time-series data
        y : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).cov(pd.Series(y))
    @staticmethod
    def moving_skew(x, window_size):
        """
        Rolling skewness
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).skew()
    @staticmethod
    def moving_kurt(x, window_size):
        """
        Rolling kurtosis
        x : time-series data
        window_size : window size
        """
        return pd.Series(x).rolling(window_size).kurt()


@ -0,0 +1,53 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> ID_code
@IDE PyCharm
@Author rengengchen
@Date 2022/5/17 16:00
@Desc
"""
import re
re_ID = re.compile(r'^\d{6}(?:18|19|20)?\d{2}(?:0[1-9]|1[012])(?:(?:[0-2][1-9])|10|20|30|31)\d{3}[0-9xX]$')
def validate_identity_code(code: str):
    """
    Validate the format of a Chinese resident ID number.
    :param code: the ID number to check
    :return: (passed, message)
    """
    city = {'11': "北京", '12': "天津", '13': "河北", '14': "山西", '15': "内蒙古", '21': "辽宁", '22': "吉林", '23': "黑龙江",
            '31': "上海", '32': "江苏", '33': "浙江", '34': "安徽", '35': "福建", '36': "江西", '37': "山东", '41': "河南", '42': "湖北",
            '43': "湖南", '44': "广东", '45': "广西", '46': "海南", '50': "重庆", '51': "四川", '52': "贵州", '53': "云南", '54': "西藏",
            '61': "陕西", '62': "甘肃", '63': "青海", '64': "宁夏", '65': "新疆", '71': "台湾", '81': "香港", '82': "澳门", '91': "国外"}
    tip = ""
    p = True
    if re_ID.match(code) is None:
        tip = "malformed ID number"
        p = False
    elif code[:2] not in city:  # indexing city[...] directly would raise KeyError
        tip = "invalid region code"
        p = False
    else:
        # an 18-digit ID ends with a check digit that must be verified
        if len(code) == 18:
            # check digit = parity[∑(ai × Wi) mod 11]
            # weighting factors
            factor = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
            # check digits
            parity = ['1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2']
            checksum = 0
            for i in range(17):
                checksum += int(code[i]) * factor[i]
            if parity[checksum % 11] != code[17].upper():
                tip = "check digit mismatch"
                p = False
    return p, tip
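
Usage sketch (the input is deliberately malformed, so the format branch fires):

ok, tip = validate_identity_code('123')
print(ok, tip)  # False malformed ID number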


@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> __init__.py
@IDE PyCharm
@Author rengengchen
@Date 2022/5/17 15:59
@Desc
"""


@ -0,0 +1,97 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> timeutil
@IDE PyCharm
@Author rengengchen
@Date 2022/4/26 10:02
@Desc
"""
import datetime
import typing
from dateutil import parser
class cnparserinfo(parser.parserinfo):
"""
    Match Chinese date formats
    Usage:
from dateutil import parser
parser.parse('1998年12月11日 8点20分30秒', cnparserinfo())
"""
parser.parserinfo.JUMP.extend('年月日')
WEEKDAYS = [list(weekdays) for weekdays in parser.parserinfo.WEEKDAYS]
WEEKDAYS[0].extend(('星期一', '周一'))
WEEKDAYS[1].extend(('星期二', '周二'))
WEEKDAYS[2].extend(('星期三', '周三'))
WEEKDAYS[3].extend(('星期四', '周四'))
WEEKDAYS[4].extend(('星期五', '周五'))
WEEKDAYS[5].extend(('星期六', '周六'))
WEEKDAYS[6].extend(('星期天', '周日', '周天', '周末'))
WEEKDAYS = [tuple(weekdays) for weekdays in WEEKDAYS]
# MONTHS = [list(months) for months in parser.parserinfo.MONTHS]
# MONTHS[0].extend(('一月', '1月'))
# MONTHS[1].extend(('二月', '2月'))
# MONTHS[2].extend(('三月', '3月'))
# MONTHS[3].extend(('四月', '4月'))
# MONTHS[4].extend(('五月', '5月'))
# MONTHS[5].extend(('六月', '6月'))
# MONTHS[6].extend(('七月', '7月'))
# MONTHS[7].extend(('八月', '8月'))
# MONTHS[8].extend(('九月', '9月'))
# MONTHS[9].extend(('十月', '10月'))
# MONTHS[10].extend(('十一月', '11月'))
# MONTHS[11].extend(('十二月', '12月'))
# MONTHS = [tuple(months) for months in MONTHS]
HMS = [list(hms) for hms in parser.parserinfo.HMS]
HMS[0].extend('时点')
    HMS[1].append('分')
    HMS[2].append('秒')
HMS = [tuple(hms) for hms in HMS]
AMPM = [list(ampm) for ampm in parser.parserinfo.AMPM]
AMPM[0].append('上午')
AMPM[1].append('下午')
AMPM = [tuple(ampm) for ampm in AMPM]
def __init__(self, dayfirst=False, yearfirst=False):
super().__init__(dayfirst, yearfirst)
def utctimestamp():
    """
    @return: current Unix timestamp (UTC)
    """
    # utcnow() is naive, so its .timestamp() would be interpreted as local time
    return int(datetime.datetime.now(datetime.timezone.utc).timestamp())
def timestamp2datetime(ts: float):
return datetime.datetime.fromtimestamp(ts)
def timestamp2str(ts: float, fmt: str = '%F %H:%M:%S'):
"""
@param ts: timestamp
@param fmt: format
"""
return datetime.datetime.strftime(timestamp2datetime(ts), fmt)
cnparser = cnparserinfo()
def str2datetime(datetime_str: str, fmt: str = None):
if fmt:
return datetime.datetime.strptime(datetime_str, fmt)
return parser.parse(datetime_str, cnparser)
def int2date(date_int: int):
return str2datetime(str(date_int), '%Y%m%d')
def date2int(a: typing.Union[datetime.datetime, datetime.date]):
return int(a.strftime('%Y%m%d'))


@ -0,0 +1,81 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> file_util
@IDE PyCharm
@Author rengengchen
@Date 2022/5/10 17:21
@Desc
"""
import os
import queue
import shutil
import paramiko
def list_files(dir_paths):
files = []
for root, dir_path, filepath in walk(dir_paths):
if filepath:
files.append(os.path.join(root, filepath))
return files
def walk(dir_paths):
    dir_queue = queue.Queue()
    if isinstance(dir_paths, str):
        dir_paths = [dir_paths]
    for dir_path in dir_paths:
        dir_queue.put(dir_path)
    while not dir_queue.empty():
        dirname = dir_queue.get()
        for root, dirs, files in os.walk(dirname):
            for subdir in dirs:
                dir_queue.put(os.path.join(root, subdir))
                yield root, subdir, None
            for filename in files:
                yield root, None, filename
            # subdirectories are walked via the queue, so prune them here to
            # stop os.walk from descending into them a second time
            dirs.clear()
def copy(s, t):
    if os.path.isfile(s):
        shutil.copy(s, t)
    else:
        if not os.path.exists(t):
            os.mkdir(t)
        s = os.path.abspath(s)
        t = os.path.abspath(t)
        for root, dirname, filename in walk(s):
            if dirname:
                # mirror the directory tree relative to the source root
                os.makedirs(os.path.join(root.replace(s, t), dirname), exist_ok=True)
            else:
                shutil.copy(os.path.join(root, filename), os.path.join(root.replace(s, t), filename))
class RemoteFileUtil:
def __init__(self, ip, username, password, port=22, local_dir=None, remote_dir=None):
tran = paramiko.Transport((ip, port))
tran.connect(username=username, password=password)
        self.sftp = paramiko.SFTPClient.from_transport(tran)
self.local_dir = local_dir
self.remote_dir = remote_dir
def ls(self, remote_dir=None):
if remote_dir is None:
remote_dir = self.remote_dir
return self.sftp.listdir_attr(remote_dir)
def upload_file(self, local_filepath=None, remote_filepath=None, filename=None):
if local_filepath is None:
local_filepath = os.path.join(self.local_dir, filename)
if remote_filepath is None:
remote_filepath = os.path.join(self.remote_dir, filename)
self.sftp.put(local_filepath, remote_filepath)
def download_file(self, local_filepath=None, remote_filepath=None, filename=None):
if local_filepath is None:
local_filepath = os.path.join(self.local_dir, filename)
if remote_filepath is None:
remote_filepath = os.path.join(self.remote_dir, filename)
self.sftp.get(remote_filepath, local_filepath)


@ -0,0 +1,82 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> pd_util
@IDE PyCharm
@Author rengengchen
@Date 2022/7/13 11:00
@Desc
"""
from __future__ import annotations
import os
from typing import Callable, Hashable, Sequence
import pandas as pd
from pandas._typing import CompressionOptions, FilePath, IndexLabel, StorageOptions, WriteBuffer
from pandas.core.generic import bool_t
class to_same_csv:
def __init__(self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | None = None,
                 columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = False,
                 index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str = 'utf8',
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
line_terminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: str = "strict",
storage_options: StorageOptions = None,
prepare: Callable = None):
self.not_first = False
self.mode = mode
if self.mode == 'a' and isinstance(path_or_buf, str) and os.path.exists(path_or_buf):
header = False
self.header = header
self.prepare = prepare
self.kwargs = {'path_or_buf': path_or_buf,
'sep': sep,
'na_rep': na_rep,
'float_format': float_format,
'columns': columns,
'index': index,
'index_label': index_label,
'encoding': encoding,
'compression': compression,
'quoting': quoting,
'quotechar': quotechar,
'line_terminator': line_terminator,
'chunksize': chunksize,
'date_format': date_format,
'doublequote': doublequote,
'escapechar': escapechar,
'decimal': decimal,
'errors': errors,
'storage_options': storage_options}
    def __call__(self, df_or_series: pd.Series | pd.DataFrame):
        if self.not_first:
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
        else:
            # first write: run the optional prepare hook once, then switch to appending
            if self.prepare:
                result = self.prepare(df_or_series)
                if result:
                    df_or_series = result
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
            self.mode = 'a'
            self.header = False
            self.not_first = True
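
Usage sketch for to_same_csv above: stream chunks into a single CSV, writing the header only once (the file names are placeholders):

import pandas as pd

writer = to_same_csv('combined.csv', index=False)
for chunk in pd.read_csv('big_input.csv', chunksize=10_000):
    writer(chunk)  # the first call writes the header, later calls append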


@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File IoD_data_analysis_tool -> phone_util
@IDE PyCharm
@Author rengengchen
@Date 2022/5/17 15:59
@Desc
"""
import re
re_phone = re.compile(r'^(?:(?:13[0-9])'
r'|(?:14(?:0|[5-7]|9))'
r'|(?:15(?:[0-3]|[5-9]))'
r'|(?:16(?:2|[5-7]))'
r'|(?:17[0-8])'
r'|(?:18[0-9])'
r'|(?:19(?:[0-3]|[5-9])))\d{8}$')
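
A quick check of the pattern (numbers are made up):

print(bool(re_phone.match('13812345678')))  # True: valid 138 prefix + 8 digits
print(bool(re_phone.match('12345678901')))  # False: 123 is not a valid prefix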


@ -0,0 +1,61 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project IoD_data_analysis_tool
@File project_util.py
@IDE PyCharm
@Author rengengchen
@Time 2022/9/15 9:45
"""
import compileall
import os
import re
import shutil
from os.path import join
from lib.analysis_package.utils.file_util import walk
re_pyc = re.compile(r'cpython-\d+\.')
def compile_project(source, target=None):
    """
    Compile the project into pyc files in the given directory.
    @param source: project path
    @param target: directory for the compiled files
    """
    source = os.path.abspath(source)
    if target is None:
        target = source
    else:
        target = os.path.abspath(target)
    compileall.compile_dir(source)
    pycache_paths = set()
    if target == source:
        for root, dirname, filename in walk(source):
            if not filename:
                continue  # directory entries carry no file to move
            if root[-11:] == '__pycache__':
                pycache_paths.add(root)
                shutil.move(join(root, filename), join(root, '../', re_pyc.sub('', filename)))
            elif filename.endswith('py'):
                os.remove(join(root, filename))
    else:
        len_t = len(target)
        for root, dirname, filename in walk(source):
            t_root = root.replace(source, target)
            if target == root[:len_t]:
                continue
            if dirname:
                if dirname != '__pycache__':
                    t_root = join(t_root, dirname)
                    if not os.path.exists(t_root) and join(source, dirname) != target:
                        os.makedirs(t_root)
            elif not filename.endswith('py'):
                if root[-11:] == '__pycache__':
                    pycache_paths.add(root)
                    t_root = t_root[:-11]
                shutil.move(join(root, filename), join(t_root, re_pyc.sub('', filename)))
            else:
                shutil.copyfile(join(root, filename), join(t_root, filename))
    for p in pycache_paths:
        os.rmdir(p)

Binary file not shown.

lib/package_project.py Normal file

@ -0,0 +1,14 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File scrapyproject -> package_project
@IDE PyCharm
@Author rengengchen
@Date 2021/5/12 10:46
@Desc
"""
import shutil
import subprocess
subprocess.call(['python', 'setup.py', 'bdist_wheel'])  # list form works outside Windows too
shutil.rmtree(r'build')
shutil.rmtree(r'analysis_package.egg-info')

lib/setup.py Normal file

@ -0,0 +1,36 @@
# coding:utf-8
from setuptools import setup, find_packages
PACKAGE = "analysis_package"
NAME = "analysis_package"
DESCRIPTION = "general analysis function"
AUTHOR = "iod"
AUTHOR_EMAIL = "rengengchen@sics.ac.cn"
URL = ""
VERSION = '0.1.3'
setup(
name=NAME,
version=VERSION,
description=DESCRIPTION,
author=AUTHOR,
author_email=AUTHOR_EMAIL,
license="BSD",
url=URL,
include_package_data=True,
packages=find_packages(),
classifiers=[
'Programming Language :: Python',
'Operating System :: OS Independent',
],
install_requires=[
'pandas',
'scipy',
'numpy',
'matplotlib',
'seaborn',
'tqdm',
'scikit-learn',
],
zip_safe=False,
)