commit 707997d4e1
init

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@@ -0,0 +1,12 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N802" />
        </list>
      </option>
    </inspection_tool>
  </profile>
</component>

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/utils.iml" filepath="$PROJECT_DIR$/.idea/utils.iml" />
    </modules>
  </component>
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

@@ -0,0 +1,5 @@
Module responsibilities:
1. continuous: feature analysis for numerical data
2. categorical: feature analysis for discrete (categorical) data
3. timeseries: analysis methods for time-series data
4. pre-process: parse the configuration file and apply preprocessing (such as filling null values or sampling) before the data moves on to the next step

@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :__init__.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/3 17:07
"""

@@ -0,0 +1,30 @@
Categorical module:
Feature analysis for discrete (categorical) data.

Analysis methods:

1> Descriptive statistics:
- the set of categories contained in the column
- the number of categories
- frequency table
- contingency table

2> Chi-square test of independence

3> Information entropy

4> Mutual information

Features:

Iterates over multiple categorical columns and analyses each in turn.

Runtime environment:
Python 3.7.10 or later
- numpy
- pandas
- matplotlib
- sklearn
- scipy.stats

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/7/4 16:34
@Desc   :
"""

@@ -0,0 +1,180 @@
# -*- coding: utf-8 -*-
# @Time    : 2022/3/17 17:36
# @Author  : Leng Yang
# @FileName: categorical_process.py
# @Software: PyCharm


import pandas as pd
import numpy as np
from sklearn import metrics
from scipy.stats import chi2_contingency, chi2


def test():
    pass


class CategorySelfDescribe(object):
    """
    Descriptive statistics for a single categorical column.
    """

    def __init__(self):
        pass

    @staticmethod
    def category_describe(data: pd.Series) -> pd.DataFrame:
        """
        Describe the category names contained in the column and the number of distinct categories.

        :param data: input data as a pd.Series
        :return: pd.DataFrame holding the list of category names and the number of categories

        Examples
        --------
        >>> data1 = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨'], '温度': ['高', '高', '高', '低']})
        >>> CategorySelfDescribe().category_describe(data1['天气'])
          categories  types
        0  [晴, 阴, 雨]    3.0
        """
        results = pd.DataFrame()
        results = results.append({'categories': data.unique(), 'types': len(data.unique())}, ignore_index=True)
        return results

    @staticmethod
    def category_frequency(data: pd.Series) -> pd.DataFrame:
        """
        Frequency table.

        :param data: input data as a pd.Series
        :return: pd.DataFrame, the frequency table

        Examples
        --------
        >>> data1 = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨', '雨', '雨', '阴', '晴', '晴', '雨', '晴', '阴', '阴', '雨'],
        ...                       '温度': ['高', '高', '高', '低', '低', '低', '低', '低', '低', '低', '低', '低', '高', '低']})
        >>> CategorySelfDescribe().category_frequency(data1['天气'])
          unique_values  count  frequency
        0             晴      5   0.357143
        1             雨      5   0.357143
        2             阴      4   0.285714
        """
        df_freq = data.value_counts(ascending=False).rename_axis('unique_values').reset_index(name='count')
        df_freq['frequency'] = df_freq['count'] / len(data)
        return df_freq


class CategorySelfAnalyse(object):
    """
    Statistical analysis of a single categorical column.
    """

    def __init__(self):
        pass

    @staticmethod
    def entropy(data: pd.Series) -> float:
        """
        Compute information entropy.

        :param data: input data as a pd.Series
        :return: float, the information entropy
        """
        prob = pd.value_counts(data) / len(data)
        return sum(np.log2(prob) * prob * (-1))


class CategoryMutualDescribe(object):
    """
    Descriptive statistics for two different categorical columns.
    """

    def __init__(self):
        pass

    @staticmethod
    def crosstab(row_data: pd.Series, col_data: pd.Series) -> pd.DataFrame:
        """
        Contingency-table analysis of two different categorical columns.

        :param row_data: first categorical column; its categories become the table rows
        :param col_data: second categorical column; its categories become the table columns
        :return: pd.DataFrame, the contingency table

        Examples
        --------
        >>> data1 = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨'], '温度': ['高', '高', '高', '低']})
        >>> CategoryMutualDescribe().crosstab(data1['天气'], data1['温度'])
        温度  高  低
        天气
        晴   2  0
        阴   1  0
        雨   0  1
        """
        return pd.crosstab(row_data, col_data)


class MutualCategoricalAnalyse(object):
    """
    Statistical analysis of two categorical columns.
    """

    def __init__(self):
        pass

    @staticmethod
    def info_gain(df: pd.DataFrame, attr_col: str, data_col: str) -> float:
        """
        Compute information gain: Gain(D, A) = Ent(D) - Ent(D|A),
        i.e. the entropy reduction from splitting dataset D on attribute A.

        :param df: input data as a DataFrame
        :param attr_col: column name of the attribute A
        :param data_col: column name of the dataset D
        :return: float, the information gain
        """
        # e1: conditional entropy of D given each value of A
        e1 = df.groupby(attr_col).apply(lambda x: CategorySelfAnalyse.entropy(x[data_col]))
        p1 = pd.value_counts(df[attr_col]) / len(df[attr_col])  # p(x)
        e2 = sum(e1 * p1)  # Ent(D|A)
        return CategorySelfAnalyse.entropy(df[data_col]) - e2

    @staticmethod
    def normalized_mutual_information(data1: pd.Series, data2: pd.Series) -> float:
        """
        Mutual Information between two clusterings. The Mutual Information is a measure of the similarity
        between two labels of the same data.
        Normalized Mutual Information (NMI) is a normalization of the Mutual
        Information (MI) score to scale the results between 0 (no mutual
        information) and 1 (perfect correlation).

        :param data1: first categorical column
        :param data2: second categorical column
        :return: nmi : float, score between 0.0 and 1.0; 1.0 stands for a perfectly complete labeling
        """
        return metrics.normalized_mutual_info_score(data1, data2)

    @staticmethod
    def chi2_independence(data1: pd.Series, data2: pd.Series, alpha=0.05) -> pd.DataFrame:
        """
        Chi-square test of independence.

        :param alpha: significance level used to determine the critical value
        :param data1: first categorical column
        :param data2: second categorical column
        :return: pd.DataFrame with the following columns:
            g: the chi-square statistic
            p: the p-value; if p is smaller than alpha, the null hypothesis is rejected
            dof: degrees of freedom
            re: decision flag, 1 means the null hypothesis is rejected, 0 means it is accepted
            expctd: array of expected frequencies, same shape as the input table
        """
        data = CategoryMutualDescribe.crosstab(data1, data2)
        result = pd.DataFrame(columns=['g', 'p', 'dof', 'expctd'])
        g, p, dof, expctd = chi2_contingency(data)
        result = result.append({'g': g, 'p': p, 'dof': dof, 'expctd': expctd}, ignore_index=True)
        if dof == 0:
            raise ValueError('The degrees of freedom should be at least 1')
        elif dof == 1:
            cv = chi2.isf(alpha * 0.5, dof)  # critical value
        else:
            cv = chi2.isf(alpha * 0.5, dof - 1)

        if g > cv:
            result.loc[0, 're'] = 1  # reject the null hypothesis
        else:
            result.loc[0, 're'] = 0  # accept the null hypothesis
        return result
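
A minimal usage sketch for the classes above (hypothetical toy data mirroring the doctests; note that `DataFrame.append`, used inside, requires pandas < 2.0):

# Hypothetical demo of the categorical module.
import pandas as pd
from categorical_process import (CategorySelfDescribe, CategorySelfAnalyse,
                                 MutualCategoricalAnalyse)

df = pd.DataFrame({'天气': ['晴', '晴', '阴', '雨', '雨', '晴'],
                   '温度': ['高', '高', '高', '低', '低', '高']})

print(CategorySelfDescribe.category_frequency(df['天气']))    # frequency table
print(CategorySelfAnalyse.entropy(df['天气']))                # entropy of one column
print(MutualCategoricalAnalyse.info_gain(df, '天气', '温度'))   # Gain(D=温度, A=天气)
print(MutualCategoricalAnalyse.chi2_independence(df['天气'], df['温度']))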

@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :__init__.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/5 11:52
"""

@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :__init__.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/5 11:52
"""

@@ -0,0 +1,127 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :producer_consumer.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/5 11:53
"""
import multiprocessing
from typing import Iterable, Callable

from tqdm import tqdm


class Stop:
    pass


class AbstractPCConcurrencySystem:
    """
    @todo keep track of the started processes
    @todo manage the number of processes
    """

    def __init__(self, num_producer: int = 1, num_consumer: int = 1, num_callback: int = 0,
                 len_task_queue: int = 0, len_result_queue: int = 0, len_callback_queue: int = 0,
                 producer_lock=None, consumer_lock=None, callback_lock=None,
                 meta=None, enable_progressbar=False, num_total_result=None):
        self.task_queue = multiprocessing.Queue(len_task_queue)

        self.num_producer = num_producer
        self.num_consumer = num_consumer
        self.num_callback = num_callback
        self.producer_lock = producer_lock or multiprocessing.Lock()
        self.consumer_lock = consumer_lock or multiprocessing.Lock()
        self.meta = meta
        self.enable_progressbar = enable_progressbar
        if enable_progressbar and self.num_callback == 0:
            self.num_callback = 1
        self.result_queue = multiprocessing.Queue(len_result_queue)
        if self.num_callback:
            self.callback_lock = callback_lock or multiprocessing.Lock()
            self.num_total_result = num_total_result
            self.callback_queue = multiprocessing.Queue(len_callback_queue)

    def get_result(self):
        return self.callback_queue.get()

    def produce(self):
        """
        Must return an iterable object or a Stop object.
        """
        raise NotImplementedError

    def consume(self, consumer_params):
        """
        @return: task result or Stop()
        """
        raise NotImplementedError

    def callback(self, result):
        return result

    def _produce(self):
        producer = self.produce()
        if isinstance(producer, Iterable):
            for params in producer:
                self.task_queue.put(params, block=True)
            stop = Stop()
            for _ in range(self.num_consumer):
                self.task_queue.put(stop, block=True)
        elif isinstance(producer, Callable):
            while True:
                task = producer()
                if isinstance(task, Stop):
                    break
                self.task_queue.put(task, block=True)

    def _consume(self):
        consumer_params = self.task_queue.get(block=True)
        while not isinstance(consumer_params, Stop):
            info = self.consume(consumer_params)
            self.result_queue.put(info)
            consumer_params = self.task_queue.get(block=True)
        self.result_queue.put(Stop())

    def _callback(self):
        if self.enable_progressbar:
            bar = tqdm(total=self.num_total_result)
        over_flag = 0
        while over_flag < self.num_consumer:
            result = self.result_queue.get(block=True)
            if isinstance(result, Stop):
                over_flag += 1
            else:
                callback = self.callback(result)
                self.callback_queue.put(callback)
                if self.enable_progressbar:
                    bar.update(1)
        else:
            if self.enable_progressbar:
                bar.close()

    def run(self):
        consumers = []
        callbackers = []
        # create and start the producers
        for i in range(self.num_producer):
            multiprocessing.Process(target=self._produce, name=f'producer_{i}').start()
        # create and start the consumers
        for i in range(self.num_consumer):
            p = multiprocessing.Process(target=self._consume, name=f'consumer_{i}')
            consumers.append(p)
            p.start()
        # handle the results
        if self.num_callback:
            for i in range(self.num_callback):
                p = multiprocessing.Process(target=self._callback, name=f'callback_{i}')
                callbackers.append(p)
                p.start()
        return self

    def close(self):
        self.task_queue.close()
        self.result_queue.close()
        self.callback_queue.close()
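
A minimal subclass sketch showing the intended produce/consume contract (hypothetical task: squaring numbers; the class must live at module level so worker processes can import it):

# Hypothetical demo of AbstractPCConcurrencySystem.
class SquareSystem(AbstractPCConcurrencySystem):

    def produce(self):
        # An iterable of task parameters; the base class appends one Stop() per consumer.
        return range(10)

    def consume(self, consumer_params):
        return consumer_params ** 2


if __name__ == '__main__':
    system = SquareSystem(num_consumer=2, num_callback=1).run()
    results = sorted(system.get_result() for _ in range(10))
    print(results)  # [0, 1, 4, 9, ..., 81]
    system.close()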

@@ -0,0 +1,28 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File    :distribute_task.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2022/8/8 16:55
"""
import math
import multiprocessing


def equally_distributing_task(target, tasks, *args, results=None, num_processors=8):
    len_tasks = len(tasks)
    process_offset = math.ceil(len_tasks / num_processors)
    for i in range(num_processors):
        sub_tasks = tasks[i * process_offset: (i + 1) * process_offset]
        if sub_tasks:
            # check against None, so that an empty shared list still gets passed through
            if results is not None:
                multiprocessing.Process(target=target,
                                        args=(sub_tasks, results, *args)).start()
            else:
                multiprocessing.Process(target=target,
                                        args=(sub_tasks, *args)).start()
        else:
            break
    return results
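
A usage sketch (hypothetical worker function; a `multiprocessing.Manager().list()` is one way to collect results across processes):

# Hypothetical usage of equally_distributing_task: sum chunks in parallel.
import multiprocessing
import time

def worker(sub_tasks, results):
    results.append(sum(sub_tasks))

if __name__ == '__main__':
    with multiprocessing.Manager() as manager:
        shared = manager.list()
        equally_distributing_task(worker, list(range(100)), results=shared, num_processors=4)
        time.sleep(1)  # the helper starts processes but does not join them
        print(sum(shared))  # 4950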

@@ -0,0 +1,48 @@
CrimeRate,Youth,Southern,Education,ExpenditureYear0,LabourForce,Males,MoreMales,StateSize,YouthUnemployment,MatureUnemployment,HighYouthUnemploy,Wage,BelowWage,CrimeRate10,Youth10,Education10,ExpenditureYear10,LabourForce10,Males10,MoreMales10,StateSize10,YouthUnemploy10,MatureUnemploy10,HighYouthUnemploy10,Wage10,BelowWage10
45.5,135,0,12.4,69,540,965,0,6,80,22,1,564,139,26.5,135,12.5,71,564,974,0,6,82,20,1,632,142
52.3,140,0,10.9,55,535,1045,1,6,135,40,1,453,200,35.9,135,10.9,54,540,1039,1,7,138,39,1,521,210
56.6,157,1,11.2,47,512,962,0,22,97,34,0,288,276,37.1,153,11,44,529,959,0,24,98,33,0,359,256
60.3,139,1,11.9,46,480,968,0,19,135,53,0,457,249,42.7,139,11.8,41,497,983,0,20,131,50,0,510,235
64.2,126,0,12.2,106,599,989,0,40,78,25,1,593,171,46.7,125,12.2,97,602,989,0,42,79,24,1,660,162
67.6,128,0,13.5,67,624,972,0,28,77,25,1,507,206,47.9,128,13.8,60,621,983,0,28,81,24,1,571,199
70.5,130,0,14.1,63,641,984,0,14,70,21,1,486,196,50.6,153,14.1,57,641,993,0,14,71,23,1,556,176
73.2,143,0,12.9,66,537,977,0,10,114,35,1,487,166,55.9,143,13,63,549,973,0,11,119,36,1,561,168
75,141,0,12.9,56,523,968,0,4,107,37,0,489,170,61.8,153,12.9,54,538,968,0,5,110,36,1,550,126
78.1,133,0,11.4,51,599,1024,1,7,99,27,1,425,225,65.4,134,11.2,47,600,1024,1,7,97,28,1,499,215
79.8,142,1,12.9,45,533,969,0,18,94,33,0,318,250,71.4,142,13.1,44,552,969,0,19,93,36,0,378,247
82.3,123,0,12.5,97,526,948,0,113,124,50,0,572,158,75.4,134,12.4,87,529,949,0,117,125,49,0,639,146
83.1,135,0,13.6,62,595,986,0,22,77,27,0,529,190,77.3,137,13.7,61,599,993,0,23,80,28,0,591,189
84.9,121,0,13.2,118,547,964,0,25,84,29,0,689,126,78.6,132,13.3,115,538,968,0,25,82,30,0,742,127
85.6,166,1,11.4,58,521,973,0,46,72,26,0,396,237,80.6,153,11.2,54,543,983,0,47,76,25,1,568,246
88,140,0,12.9,71,632,1029,1,7,100,24,1,526,174,82.2,130,12.9,68,620,1024,1,8,104,25,1,570,182
92.3,126,0,12.7,74,602,984,0,34,102,33,1,557,195,87.5,134,12.9,67,599,982,0,33,107,34,1,621,199
94.3,130,0,13.3,128,536,934,0,51,78,34,0,627,135,92.9,127,13.3,128,530,949,0,52,79,33,0,692,140
95.3,125,0,12,90,586,964,0,97,105,43,0,617,163,94.1,134,11.9,81,571,971,0,99,106,41,0,679,162
96.8,151,1,10,58,510,950,0,33,108,41,0,394,261,96.2,161,10.1,56,515,1001,1,32,110,40,0,465,254
97.4,152,1,10.8,57,530,986,0,30,92,43,0,405,264,97.8,152,11,53,541,989,0,30,92,41,0,470,243
98.7,162,1,12.1,75,522,996,0,40,73,27,0,496,224,99.9,162,12,70,533,992,0,41,80,28,0,562,229
99.9,149,1,10.7,61,515,953,0,36,86,35,0,395,251,101.4,150,10.7,54,520,952,0,35,84,32,0,476,249
103,177,1,11,58,638,974,0,24,76,28,0,382,254,103.5,164,10.9,56,638,978,0,25,79,28,0,456,257
104.3,134,0,12.5,75,595,972,0,47,83,31,0,580,172,104.5,133,12.7,71,599,982,0,50,87,32,0,649,182
105.9,130,0,13.4,90,623,1049,1,3,113,40,0,588,160,106.4,153,13.4,91,622,1050,1,3,119,41,0,649,159
106.6,157,1,11.1,65,553,955,0,39,81,28,0,421,239,107.8,156,11.2,62,562,956,0,39,85,29,0,499,243
107.2,148,0,13.7,72,601,998,0,9,84,20,1,590,144,110.1,134,13.9,66,602,999,0,9,87,15,0,656,151
108.3,126,0,13.8,97,542,990,0,18,102,35,0,589,166,110.5,126,13.8,97,549,993,0,19,103,34,1,659,160
109.4,135,1,11.4,123,537,978,0,31,89,34,0,631,165,113.5,134,11.3,115,529,978,0,32,93,35,0,703,175
112.1,142,1,10.9,81,497,956,0,33,116,47,0,427,247,116.3,147,10.7,77,501,962,0,33,117,44,0,500,256
114.3,127,1,12.8,82,519,982,0,4,97,38,0,620,168,119.7,125,12.9,79,510,945,0,4,99,39,0,696,170
115.1,131,0,13.7,78,574,1038,1,7,142,42,1,540,176,124.5,134,13.6,73,581,1029,1,7,143,41,1,615,177
117.2,136,0,12.9,95,574,1012,1,29,111,37,1,622,162,127.8,140,13,96,581,1011,1,29,115,36,1,691,169
119.7,119,0,11.9,166,521,938,0,168,92,36,0,637,154,129.8,120,11.9,157,524,935,0,180,93,27,1,698,169
121.6,147,1,13.9,63,560,972,0,23,76,24,1,462,233,130.7,139,14,64,571,970,0,24,78,24,1,511,220
123.4,145,1,11.7,82,560,981,0,96,88,31,0,488,228,132.5,154,11.8,74,563,980,0,99,89,29,1,550,230
127.2,132,0,10.4,87,564,953,0,43,83,32,0,513,227,134.6,135,10.2,83,560,948,0,44,83,32,0,589,234
132.4,152,0,12,82,571,1018,1,10,103,28,1,537,215,137.5,151,12.1,76,567,1079,1,11,105,27,1,617,204
135.5,125,0,12.5,113,567,985,0,78,130,58,0,626,166,140.5,140,12.5,105,571,993,0,77,131,59,0,684,174
137.8,141,0,14.2,109,591,985,0,18,91,20,1,578,174,145.7,142,14.2,101,590,987,0,19,94,19,1,649,180
140.8,150,0,12,109,531,964,0,9,87,38,0,559,153,150.6,153,12,98,539,982,0,10,88,36,0,635,151
145.4,131,1,12.2,115,542,969,0,50,79,35,0,472,206,157.3,131,12.1,109,548,976,0,52,82,34,0,539,219
149.3,143,0,12.3,103,583,1012,1,13,96,36,0,557,194,162.7,142,12.2,95,612,1003,1,13,97,36,0,625,196
154.3,124,0,12.3,121,580,966,0,101,77,35,0,657,170,169.6,134,12.2,116,580,987,0,104,79,36,0,719,172
157.7,136,0,15.1,149,577,994,0,157,102,39,0,673,167,177.2,140,15.2,141,578,995,0,160,110,40,0,739,169
161.8,131,0,13.2,160,631,1071,1,3,102,41,0,674,152,178.2,132,13.2,143,632,1058,1,4,100,40,0,748,150

@@ -0,0 +1,29 @@
# **Numerical data analysis and process tools**

### **Project Description**:

- Numerical data correlation analysis and processing, with image visualization to aid understanding.

#### Numerical analysis tools

- Spearman correlation determines whether there is a monotonic component in the relationship between two features; it also applies to non-linear relationships and ordinal data.

#### Numerical process tools

- Detect outliers using the interquartile range (IQR).
- Detect highly correlated features and remove them.

#### How to use the tools

Input numerical-only data (data type: DataFrame).

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/7/4 16:34
@Desc   :
"""

@@ -0,0 +1,38 @@
import os

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import logging

logger = logging.getLogger(__name__)


def Spearman_rank_test(data_frame, feature_a, feature_b, save_path, file_name, sample_size=4000):
    """
    Spearman correlation determines whether there is a monotonic
    component in the relationship between two features; it also applies
    to non-linear relationships and ordinal data.

    @param data_frame: input data as a DataFrame
    @param feature_a: first feature for Spearman's rank test
    @param feature_b: second feature for Spearman's rank test
    @param sample_size: size of the sample used to represent the population
    @param save_path: output path
    @param file_name: output name
    """
    a = data_frame[feature_a].sample(n=sample_size, random_state=1)
    b = data_frame[feature_b].sample(n=sample_size, random_state=1)
    coef, p = spearmanr(a, b)
    logger.info("Spearman's correlation coefficient is: " + str(coef))
    alpha = 0.05
    plt.scatter(a, b)
    plt.xlabel("Feature A")
    plt.ylabel("Feature B")
    plt.title("Spearman's Rank Test")
    plt.savefig(os.path.join(save_path, file_name))
    if p > alpha:
        logger.info("Features are uncorrelated (failed to reject H0) p=" + str(p))
    else:
        logger.info("Features have a monotonic relationship (reject H0) p=" + str(p))
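
A usage sketch, assuming the crime-rate CSV shipped above (column names taken from its header; the file name and output path are hypothetical):

# Hypothetical call using the bundled crime-rate data.
import pandas as pd

df = pd.read_csv('crime_rates.csv')  # hypothetical file name for the CSV above
Spearman_rank_test(df, 'CrimeRate', 'ExpenditureYear0',
                   save_path='.', file_name='spearman.png',
                   sample_size=40)  # the demo CSV has only 47 rows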

@@ -0,0 +1,155 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> correlation
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/7/4 16:48
@Desc   :
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# alias the scipy function so the local wrapper below does not shadow it
# and end up calling itself recursively
from scipy.stats import spearmanr as scipy_spearmanr


def spearmanr(a: pd.Series, b: pd.Series = None, axis=0, nan_policy='propagate',
              alternative='two-sided', sample_size=4000, random_state=None):
    """Calculate a Spearman correlation coefficient with associated p-value.

    The Spearman rank-order correlation coefficient is a nonparametric measure
    of the monotonicity of the relationship between two datasets. Unlike the
    Pearson correlation, the Spearman correlation does not assume that both
    datasets are normally distributed. Like other correlation coefficients,
    this one varies between -1 and +1 with 0 implying no correlation.
    Correlations of -1 or +1 imply an exact monotonic relationship. Positive
    correlations imply that as x increases, so does y. Negative correlations
    imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    a, b : 1D or 2D array_like, b is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
        Both arrays need to have the same length in the ``axis`` dimension.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain observations.
        If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan.
        The following options are available (default is 'propagate'):

        * 'propagate': returns nan
        * 'raise': throws an error
        * 'omit': performs the calculations ignoring nan values

    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

    sample_size : int, optional
        Number of items from column to return. Default is 4000.

    random_state : int, array-like, BitGenerator, np.random.RandomState, optional
        If int, array-like, or BitGenerator (NumPy>=1.17), seed for
        random number generator.
        If np.random.RandomState, use as numpy RandomState object.

    Returns
    -------
    correlation : float or ndarray (2-D square)
        Spearman correlation matrix or correlation coefficient (if only 2
        variables are given as parameters). Correlation matrix is square with
        length equal to total number of variables (columns or rows) in ``a``
        and ``b`` combined.
    pvalue : float
        The p-value for a hypothesis test whose null hypothesis
        is that two sets of data are uncorrelated. See `alternative` above
        for alternative hypotheses. `pvalue` has the same
        shape as `correlation`.

    References
    ----------
    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
       Probability and Statistics Tables and Formulae. Chapman & Hall: New
       York. 2000.
       Section 14.7

    Examples
    --------
    >>> from scipy import stats
    >>> stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])
    SpearmanrResult(correlation=0.82078..., pvalue=0.08858...)
    >>> rng = np.random.default_rng()
    >>> x2n = rng.standard_normal((100, 2))
    >>> y2n = rng.standard_normal((100, 2))
    >>> stats.spearmanr(x2n)
    SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
    >>> stats.spearmanr(x2n[:,0], x2n[:,1])
    SpearmanrResult(correlation=-0.07960396039603959, pvalue=0.4311168705769747)
    >>> rho, pval = stats.spearmanr(x2n, y2n)
    >>> rho
    array([[ 1.        , -0.07960396, -0.08314431,  0.09662166],
           [-0.07960396,  1.        , -0.14448245,  0.16738074],
           [-0.08314431, -0.14448245,  1.        ,  0.03234323],
           [ 0.09662166,  0.16738074,  0.03234323,  1.        ]])
    >>> pval
    array([[0.        , 0.43111687, 0.41084066, 0.33891628],
           [0.43111687, 0.        , 0.15151618, 0.09600687],
           [0.41084066, 0.15151618, 0.        , 0.74938561],
           [0.33891628, 0.09600687, 0.74938561, 0.        ]])
    >>> rho, pval = stats.spearmanr(x2n.T, y2n.T, axis=1)
    >>> rho
    array([[ 1.        , -0.07960396, -0.08314431,  0.09662166],
           [-0.07960396,  1.        , -0.14448245,  0.16738074],
           [-0.08314431, -0.14448245,  1.        ,  0.03234323],
           [ 0.09662166,  0.16738074,  0.03234323,  1.        ]])
    >>> stats.spearmanr(x2n, y2n, axis=None)
    SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)
    >>> stats.spearmanr(x2n.ravel(), y2n.ravel())
    SpearmanrResult(correlation=0.044981624540613524, pvalue=0.5270803651336189)

    >>> rng = np.random.default_rng()
    >>> xint = rng.integers(10, size=(100, 2))
    >>> stats.spearmanr(xint)
    SpearmanrResult(correlation=0.09800224850707953, pvalue=0.3320271757932076)

    """
    # a = a.sample(n=sample_size, random_state=random_state)
    # if b:
    #     b = b.sample(n=sample_size, random_state=random_state)
    return scipy_spearmanr(a, b, axis=axis, nan_policy=nan_policy, alternative=alternative)


def corr(df, method='pearson', drop=False, threshold=0, plot=True, filepath=None, figsize=None):
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    cmap = sns.diverging_palette(250, 15, s=95, l=40, n=9, center="light", as_cmap=True)
    cov = df.corr(method=method)
    if drop:
        uncorr = ~np.any(np.abs(np.tril(cov, k=-1)) > threshold, axis=1)
        cov = cov[uncorr]
        cov = cov[cov.index]
    if plot or filepath:
        mask = np.triu(np.ones_like(cov, dtype=bool))
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(cov, mask=mask, center=0, annot=True, fmt='.2f', cmap=cmap, square=True, ax=ax)
        plt.title("相关性矩阵")  # "Correlation matrix"
        if filepath:
            plt.savefig(filepath)
        if plot:
            plt.show()
    return cov
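
A quick sketch of `corr` on a toy frame (hypothetical data; with `drop=True` a row is pruned when any of its lower-triangle |r| values exceeds `threshold`):

# Hypothetical demo: build a frame with two nearly identical columns,
# then ask corr() to drop one of them.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = rng.normal(size=200)
df = pd.DataFrame({'x': x,
                   'almost_x': x + rng.normal(scale=0.01, size=200),
                   'noise': rng.normal(size=200)})
reduced = corr(df, drop=True, threshold=0.9, plot=False)
print(reduced.columns.tolist())  # ['x', 'noise'] — 'almost_x' was dropped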

@@ -0,0 +1,48 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time     : 2022/3/25 9:09
# @Software : PyCharm
# @File     : process_tool.py
# @Author   : QT
# @Email    : taoqimin@sics.ac.cn
import numpy as np
from tqdm import tqdm
import logging

logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

console = logging.StreamHandler()
console.setLevel(logging.INFO)

logger.addHandler(console)


class NumericProcess:
    @staticmethod
    def drop_feature(data_frame, thresh_hold):
        """
        Detect and drop highly correlated features.
        When two variables are highly correlated, they usually cause problems
        such as multicollinearity, so one of each correlated pair is removed.

        @param data_frame: input DataFrame
        @param thresh_hold: a number between 0 and 1; features whose absolute
            pairwise correlation exceeds it are dropped
        """
        matrix = data_frame.corr().abs()
        mask = np.triu(np.ones_like(matrix, dtype=bool))
        reduced_matrix = matrix.mask(mask)
        feature_drop = [c for c in tqdm(reduced_matrix) if
                        any(reduced_matrix[c] > thresh_hold)]
        data_frame.drop(feature_drop, axis=1, inplace=True)
        logger.info("The following features are dropped due to multicollinearity: " + str(feature_drop))
        return data_frame
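
A minimal sketch of `drop_feature` (hypothetical data; note it mutates the input frame in place):

# Hypothetical demo: drop one column of a highly correlated pair.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
x = rng.normal(size=500)
df = pd.DataFrame({'a': x,
                   'b': x * 2 + rng.normal(scale=0.01, size=500),
                   'c': rng.normal(size=500)})
NumericProcess.drop_feature(df, thresh_hold=0.95)
print(df.columns.tolist())  # ['b', 'c'] — 'a' was dropped (highly correlated with 'b')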

@@ -0,0 +1,20 @@
Parse the configuration file and apply preprocessing (such as filling null values or sampling) before the data moves on to the next step.

Part of the Pre-process Lib's preprocessing features are finished so far, as follows:

- data_insight
  - DuplicateInsight - detection of duplicate data
  - NullInsight - detection of null values
  - ValidationInsight - data validity checks
- data_process
  - FilteringProcessor - data filtering

In addition:

- TypeInsight - the validation method for date values is not yet finished

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:40
@Desc   :
"""

@@ -0,0 +1,133 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# file: data_insight
# author: shenwentao, wangkanglong
# description:
# date: 2022-03-30 16:45
# IDE: PyCharm

import pandas as pd
import datetime
from typing import List, Union
from pandas.core.dtypes.api import is_bool_dtype, is_float_dtype, is_integer_dtype, is_string_dtype, is_datetime64_dtype

from iod_data_analysis_tool.utils.assertion import assert_range


class DuplicateInsight:

    @staticmethod
    def num_duplicate(data, subset=None, keep='first') -> pd.DataFrame:
        """
        Count duplicate rows, with a user-defined notion of duplication.
        :param data: source data
        :param subset: selected columns/fields, same as the subset parameter of pd.DataFrame.duplicated
        :param keep: determines which duplicates (if any) to mark, same as the keep parameter of pd.DataFrame.duplicated
        :return: the count result
        """
        result = data.duplicated(subset, keep=keep).sum()
        return pd.DataFrame([result], columns=['duplicate_num'])


class NullInsight:

    @staticmethod
    def num_null(data, column: str = None) -> pd.DataFrame:
        """
        Count null values in the data.
        :param data: source data
        :param column: selected column/field
        :return: the count result
        """
        if column is not None:
            return pd.DataFrame([data[column].isna().sum()], columns=['null_num'], index=[column])
        else:
            return pd.DataFrame(data.isna().sum(), columns=['null_num'])


class ValidationInsight:
    """
    User-defined data validity checks, e.g. for bad values, by constraining the range of each data type.
    """

    @staticmethod
    def validation_continuous_range(data: pd.DataFrame, column: str,
                                    min_val: Union[int, float], max_val: Union[int, float]) -> pd.DataFrame:
        """
        Validate a continuous numerical column against a user-defined range;
        returns counts of values inside and outside the range.
        :param data: source data
        :param column: selected column/field
        :param min_val: lower bound of the range
        :param max_val: upper bound of the range
        :return: the count result
        """
        assert_range(min_val, max_val)
        nums = dict()
        nums['column'] = column
        nums['num_lt_min'] = data.query(f'{column} < {min_val}').shape[0]
        nums['num_gt_max'] = data.query(f'{column} > {max_val}').shape[0]
        nums['num_within_range'] = data.shape[0] - nums['num_lt_min'] - nums['num_gt_max']
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_categorical_range(data, column: str, values: List) -> pd.DataFrame:
        """
        Validate a categorical column against a user-defined range;
        returns counts of values inside and outside the range.
        :param data: source data
        :param column: selected column/field
        :param values: the user-defined set of admissible values, i.e. the "range"
        :return: the count result
        """
        nums = dict()
        nums['column'] = column
        nums['num_within_range'] = data[data[column].isin(values)].shape[0]
        nums['num_out_range'] = len(data[column]) - nums['num_within_range']
        return pd.DataFrame([nums], index=['result'])

    @staticmethod
    def validation_date_range(data, column: str, start_date: datetime.date,
                              end_date: datetime.date) -> pd.DataFrame:
        """
        Validate a date column against a user-defined range; returns counts of
        values inside and outside the range. Precondition: the data type is datetime.date.
        :param data: source data
        :param column: selected column/field
        :param start_date: start date
        :param end_date: end date
        :return: the count result
        """
        assert_range(start_date, end_date)
        nums = dict()
        nums['column'] = column
        nums['date_lt_start'] = sum(data[column] < start_date)
        nums['date_gt_end'] = sum(data[column] > end_date)
        nums['date_within_range'] = data.shape[0] - nums['date_lt_start'] - nums['date_gt_end']
        return pd.DataFrame([nums], index=['result'])


class TypeInsight:
    """
    Lets the user check that a column's data type matches the expected one.
    """

    # TODO: a timestamp checker is still missing
    _checkers = {
        'int': is_integer_dtype,
        'float': is_float_dtype,
        'string': is_string_dtype,
        'bool': is_bool_dtype,
        'datetime': is_datetime64_dtype
    }

    @staticmethod
    def type_check(data, column: str, check_type: str) -> pd.DataFrame:
        """
        Check whether the column has the expected data type.
        :param data: source data
        :param column: selected column/field
        :param check_type: the type to check for, one of {'int', 'float', 'string', 'bool', 'datetime'}
        :return: the check result
        """
        flag = True
        if not TypeInsight._checkers[check_type](data[column]):
            flag = False
        return pd.DataFrame([flag], columns=['result'], index=[column])
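
A short sketch exercising the insight classes (hypothetical toy frame; `assert_range` comes from the project's own utils module):

# Hypothetical demo of the insight classes.
import pandas as pd

df = pd.DataFrame({'age': [25, 31, 31, 140, None]})
print(DuplicateInsight.num_duplicate(df))                                 # 1 duplicate row
print(NullInsight.num_null(df, 'age'))                                    # 1 null
print(ValidationInsight.validation_continuous_range(df, 'age', 0, 120))   # 140 exceeds the range
print(TypeInsight.type_check(df, 'age', 'float'))                         # True (the NaN forces float)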

@@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> normalizer
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:40
@Desc   :
"""
import pandas as pd
from scipy.stats import zscore as scipy_zscore


def zscore(a, axis=0, ddof=0, nan_policy='propagate'):
    """
    Zi = (Xi - μ) / σ
    """
    return scipy_zscore(a, axis, ddof, nan_policy)

@@ -0,0 +1,51 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> outlierprocessing
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:24
@Desc   :
"""
from typing import Union

import pandas as pd


def MAD(data: pd.Series, n: float = 2.5, constant=1.4826, axis=0):
    """
    MAD = median(|Xi - median(X)|)
    Clips the data to the interval [median - n * MAD, median + n * MAD].
    @return the clipped data, same type as the input
    """
    x = data.median()
    MC = (data - x).abs().median()
    MAD = MC * constant
    offset = n * MAD
    if isinstance(data, pd.DataFrame):
        return data.clip(lower=x - offset, upper=x + offset, axis=axis)
    else:
        return data.clip(lower=x - offset, upper=x + offset)


def three_sigma(data: pd.Series):
    miu = data.mean()
    sigma = data.std()
    low = miu - 3 * sigma
    up = miu + 3 * sigma
    return data.index[(data < low) | (data > up)]


def box_plot(data: pd.Series, q1: float = 0.25, q3: float = 0.75, k: float = 1.5):
    q = data.quantile(q=[q1, q3])
    IQR = q[q3] - q[q1]
    lower_whisker_limit = q[q1] - k * IQR
    upper_whisker_limit = q[q3] + k * IQR
    return data.index[(data < lower_whisker_limit) | (data > upper_whisker_limit)]


def regex_match(data: pd.Series, *patterns):
    pattern = '|'.join(patterns)
    return data.index[data.astype(str).str.contains(pattern, regex=True)]


def empty(data: Union[pd.Series, pd.DataFrame]):
    # works for both Series and DataFrame inputs
    return bool(data.isnull().values.any())
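
A quick sketch of the IQR (box-plot) detector against the 3-sigma rule on a toy series (hypothetical data):

# Hypothetical demo: find the index of an obvious outlier.
import pandas as pd

s = pd.Series([10, 11, 9, 10, 12, 10, 11, 100])
print(box_plot(s))      # Index([7]) — 100 falls outside the whiskers
print(three_sigma(s))   # may be empty: a single huge point inflates sigma itself
print(MAD(s).tolist())  # 100 is clipped toward the median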

@@ -0,0 +1,24 @@
## Analysis methods for time-series data

--------

| Module | Methods |
| ---- | ---- |
| Basics | |
| Stationarity | |
| Anomaly detection | |
| Frequency detection | |
| Periodicity detection | |
| Other | |

### Basics

### Stationarity

### Anomaly detection

### Frequency detection

### Periodicity detection

### Other

@@ -0,0 +1,26 @@
import pandas as pd


def describe_datetime_info(data: pd.Series, datetime_is_numeric: bool = False) -> pd.Series:
    """
    If the values are strings without a date part, the date is populated
    with today's date.
    @param data: data
    @param datetime_is_numeric : bool, default False
        Whether to treat datetime dtypes as numeric. This affects statistics
        calculated for the column. For DataFrame input, this also
        controls whether datetime columns are included by default.
    @return: Summary statistics of the Series.
    @example: Describing a datetime ``Series``.

    >>> s = pd.read_csv()
    >>> s.describe()
    count                       1427132
    unique                        25111
    top      2022-04-26 09:25:00.260000
    freq                          32994
    first           2022-04-26 09:25:00
    last     2022-04-26 09:34:46.340000
    Name: TradTime, dtype: object
    """
    return pd.to_datetime(data).describe(datetime_is_numeric=datetime_is_numeric)

@@ -0,0 +1,62 @@
import pandas as pd
import numpy as np
from time_base import Time_base

import statsmodels.api as sm
import statsmodels.tsa.api as smt
import statsmodels.formula as smf

import scipy.stats as scs


class stationaryTest(Time_base):
    """
    Stationarity tests for time series.
    """
    def __init__(self):
        pass

    def test_stationary(self, x, window_size):
        """
        Stationarity test for a time series.
        x : the time-series data
        window_size : the window size
        """
        x_ma = self.moving_average(x, window_size)
        x_std = self.moving_std(x, window_size)
        x_max = self.moving_max(x, window_size)
        x_min = self.moving_min(x, window_size)
        x_median = self.moving_median(x, window_size)
        x_normalized = self.normalize(x)
        x_ma_normalized = self.normalize(x_ma)
        x_std_normalized = self.normalize(x_std)
        x_max_normalized = self.normalize(x_max)
        x_min_normalized = self.normalize(x_min)
        x_median_normalized = self.normalize(x_median)
        x_normalized_ma_normalized = self.normalize(x_normalized - x_ma_normalized)
        x_normalized_std_normalized = self.normalize(x_normalized - x_std_normalized)
        x_normalized_max_normalized = self.normalize(x_normalized - x_max_normalized)
        x_normalized_min_normalized = self.normalize(x_normalized - x_min_normalized)
        x_normalized_median_normalized = self.normalize(x_normalized - x_median_normalized)
        x_normalized_ma_normalized_std_normalized = self.normalize(x_normalized_ma_normalized - x_std)

        return (x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized,
                x_min_normalized, x_median_normalized, x_normalized_ma_normalized,
                x_normalized_std_normalized, x_normalized_max_normalized,
                x_normalized_min_normalized, x_normalized_median_normalized,
                x_normalized_ma_normalized_std_normalized)

    def adf_test(self, x, window_size):
        """
        ADF stationarity test on each derived series.
        x : the time-series data
        window_size : the window size
        """
        (x_normalized, x_ma_normalized, x_std_normalized, x_max_normalized,
         x_min_normalized, x_median_normalized, x_normalized_ma_normalized,
         x_normalized_std_normalized, x_normalized_max_normalized,
         x_normalized_min_normalized, x_normalized_median_normalized,
         x_normalized_ma_normalized_std_normalized) = self.test_stationary(x, window_size)
        adf_test_normalized = smt.adfuller(x_normalized)
        adf_test_ma_normalized = smt.adfuller(x_ma_normalized)
        adf_test_std_normalized = smt.adfuller(x_std_normalized)
        adf_test_max_normalized = smt.adfuller(x_max_normalized)
        adf_test_min_normalized = smt.adfuller(x_min_normalized)
        adf_test_median_normalized = smt.adfuller(x_median_normalized)
        adf_test_normalized_ma_normalized = smt.adfuller(x_normalized_ma_normalized)
        adf_test_normalized_std_normalized = smt.adfuller(x_normalized_std_normalized)
        adf_test_normalized_max_normalized = smt.adfuller(x_normalized_max_normalized)
        adf_test_normalized_min_normalized = smt.adfuller(x_normalized_min_normalized)
        return (adf_test_normalized, adf_test_ma_normalized, adf_test_std_normalized,
                adf_test_max_normalized, adf_test_min_normalized, adf_test_median_normalized,
                adf_test_normalized_ma_normalized, adf_test_normalized_std_normalized,
                adf_test_normalized_max_normalized, adf_test_normalized_min_normalized)

@@ -0,0 +1,133 @@

import pandas as pd
import numpy as np


class Time_base(object):
    """
    Basic building blocks for time-series analysis.
    """
    def __init__(self):
        pass

    @staticmethod
    def normalize(x):
        """
        Normalize the time series.
        x : the time-series data
        """
        x = np.array(x)
        return np.log2(x / np.sqrt(np.sum(x**2)))

    @staticmethod
    def lag(x, lag):
        """
        Lagged series.
        x : the time-series data
        lag : the lag length
        """
        return pd.Series(x).shift(lag)

    @staticmethod
    def moving_average(x, window_size):
        """
        Rolling mean window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).mean()

    @staticmethod
    def moving_median(x, window_size):
        """
        Rolling median window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).median()

    @staticmethod
    def moving_std(x, window_size):
        """
        Rolling standard-deviation window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).std()

    @staticmethod
    def moving_max(x, window_size):
        """
        Rolling maximum window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).max()

    @staticmethod
    def moving_min(x, window_size):
        """
        Rolling minimum window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).min()

    @staticmethod
    def moving_sum(x, window_size):
        """
        Rolling sum window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).sum()

    @staticmethod
    def moving_quantile(x, window_size, quantile):
        """
        Rolling quantile window.
        x : the time-series data
        window_size : the window size
        quantile : the quantile
        """
        return pd.Series(x).rolling(window_size).quantile(quantile)

    @staticmethod
    def moving_corr(x, y, window_size):
        """
        Rolling correlation window.
        x : the time-series data
        y : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).corr(pd.Series(y))

    @staticmethod
    def moving_cov(x, y, window_size):
        """
        Rolling covariance window.
        x : the time-series data
        y : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).cov(pd.Series(y))

    @staticmethod
    def moving_skew(x, window_size):
        """
        Rolling skewness window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).skew()

    @staticmethod
    def moving_kurt(x, window_size):
        """
        Rolling kurtosis window.
        x : the time-series data
        window_size : the window size
        """
        return pd.Series(x).rolling(window_size).kurt()
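
A minimal sketch of the rolling helpers (hypothetical series):

# Hypothetical demo of the Time_base rolling helpers.
import numpy as np

tb = Time_base()
x = np.sin(np.linspace(0, 10, 50))
print(tb.moving_average(x, 5).tail())  # 5-point moving average
print(tb.moving_std(x, 5).tail())      # 5-point rolling standard deviation
print(tb.lag(x, 1).head())             # series shifted by one step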

@@ -0,0 +1,53 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> ID_code
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/5/17 16:00
@Desc   :
"""
import re

re_ID = re.compile(r'^\d{6}(?:18|19|20)?\d{2}(?:0[1-9]|1[012])(?:(?:[0-2][1-9])|10|20|30|31)\d{3}[0-9xX]$')


def validate_identity_code(code: str):
    """
    Validate the format of a Chinese resident identity card number.
    :param code:
    :return:
    """
    city = {'11': "北京", '12': "天津", '13': "河北", '14': "山西", '15': "内蒙古", '21': "辽宁", '22': "吉林", '23': "黑龙江",
            '31': "上海", '32': "江苏", '33': "浙江", '34': "安徽", '35': "福建", '36': "江西", '37': "山东", '41': "河南", '42': "湖北",
            '43': "湖南", '44': "广东", '45': "广西", '46': "海南", '50': "重庆", '51': "四川", '52': "贵州", '53': "云南", '54': "西藏",
            '61': "陕西", '62': "甘肃", '63': "青海", '64': "宁夏", '65': "新疆", '71': "台湾", '81': "香港", '82': "澳门", '91': "国外"}
    tip = ""
    p = True

    if re_ID.match(code) is None:
        tip = "身份证号格式错误"  # malformed ID number
        p = False
    elif code[:2] not in city:
        tip = "地址编码错误"  # invalid region code
        p = False
    else:
        # an 18-digit ID must also pass the final check digit
        if len(code) == 18:
            # ∑(ai × Wi) (mod 11)
            # weighting factors
            factor = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
            # check digits
            parity = [1, 0, 'X', 9, 8, 7, 6, 5, 4, 3, 2]
            checksum = 0
            for i in range(17):
                ai = int(code[i])
                wi = factor[i]
                checksum += ai * wi
            if str(parity[checksum % 11]) != code[17].upper():
                tip = "校验位错误"  # wrong check digit
                p = False
    return p, tip
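
A quick sketch (the ID number below is the well-known made-up sample format, not a real person's):

# Hypothetical demo with a sample ID number.
ok, tip = validate_identity_code('11010519491231002X')
print(ok, tip)  # True, '' — format, region code, and check digit all pass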

@@ -0,0 +1,8 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> __init__.py
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/5/17 15:59
@Desc   :
"""

@@ -0,0 +1,97 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> timeutil
@IDE    :PyCharm
@Author :rengengchen
@Date   :2022/4/26 10:02
@Desc   :
"""
import datetime
import types
import typing

from dateutil import parser


class cnparserinfo(parser.parserinfo):
    """
    Parse Chinese date formats.
    Usage:
        from dateutil import parser
        parser.parse('1998年12月11日 8点20分30秒', cnparserinfo())
    """
    parser.parserinfo.JUMP.extend('年月日')
    WEEKDAYS = [list(weekdays) for weekdays in parser.parserinfo.WEEKDAYS]
    WEEKDAYS[0].extend(('星期一', '周一'))
    WEEKDAYS[1].extend(('星期二', '周二'))
    WEEKDAYS[2].extend(('星期三', '周三'))
    WEEKDAYS[3].extend(('星期四', '周四'))
    WEEKDAYS[4].extend(('星期五', '周五'))
    WEEKDAYS[5].extend(('星期六', '周六'))
    WEEKDAYS[6].extend(('星期天', '周日', '周天', '周末'))
    WEEKDAYS = [tuple(weekdays) for weekdays in WEEKDAYS]

    # MONTHS = [list(months) for months in parser.parserinfo.MONTHS]
    # MONTHS[0].extend(('一月', '1月'))
    # MONTHS[1].extend(('二月', '2月'))
    # MONTHS[2].extend(('三月', '3月'))
    # MONTHS[3].extend(('四月', '4月'))
    # MONTHS[4].extend(('五月', '5月'))
    # MONTHS[5].extend(('六月', '6月'))
    # MONTHS[6].extend(('七月', '7月'))
    # MONTHS[7].extend(('八月', '8月'))
    # MONTHS[8].extend(('九月', '9月'))
    # MONTHS[9].extend(('十月', '10月'))
    # MONTHS[10].extend(('十一月', '11月'))
    # MONTHS[11].extend(('十二月', '12月'))
    # MONTHS = [tuple(months) for months in MONTHS]

    HMS = [list(hms) for hms in parser.parserinfo.HMS]
    HMS[0].extend('时点')
    HMS[1].append('分')
    HMS[2].append('秒')
    HMS = [tuple(hms) for hms in HMS]

    AMPM = [list(ampm) for ampm in parser.parserinfo.AMPM]
    AMPM[0].append('上午')
    AMPM[1].append('下午')
    AMPM = [tuple(ampm) for ampm in AMPM]

    def __init__(self, dayfirst=False, yearfirst=False):
        super().__init__(dayfirst, yearfirst)


def utctimestamp():
    """
    @return: the UTC timestamp
    """
    return int(datetime.datetime.utcnow().timestamp())


def timestamp2datetime(ts: float):
    return datetime.datetime.fromtimestamp(ts)


def timestamp2str(ts: float, fmt: str = '%F %H:%M:%S'):
    """
    @param ts: timestamp
    @param fmt: format
    """
    return datetime.datetime.strftime(timestamp2datetime(ts), fmt)


cnparser = cnparserinfo()


def str2datetime(datetime_str: str, fmt: str = None):
    if fmt:
        return datetime.datetime.strptime(datetime_str, fmt)
    return parser.parse(datetime_str, cnparser)


def int2date(date_int: int):
    return str2datetime(str(date_int), '%Y%m%d')


def date2int(a: typing.Union[datetime.datetime, datetime.date]):
    return int(a.strftime('%Y%m%d'))
@ -0,0 +1,81 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> file_util
@IDE :PyCharm
@Author :rengengchen
@Date :2022/5/10 17:21
@Desc :
"""
import os
import shutil

import paramiko


def list_files(dir_paths):
    files = []
    for root, dir_path, filepath in walk(dir_paths):
        if filepath:
            files.append(os.path.join(root, filepath))
    return files


def walk(dir_paths):
    # os.walk already recurses; the original also fed every subdirectory back
    # through a queue, which made each subtree be walked multiple times
    if isinstance(dir_paths, str):
        dir_paths = [dir_paths]
    for dir_path in dir_paths:
        for root, dirs, files in os.walk(dir_path):
            for dirname in dirs:
                yield root, dirname, None
            for filename in files:
                yield root, None, filename


def copy(s, t):
    if os.path.isfile(s):
        shutil.copy(s, t)
    else:
        if not os.path.exists(t):
            os.mkdir(t)
        s = os.path.abspath(s)
        t = os.path.abspath(t)
        for root, dirname, filename in walk(s):
            if dirname:
                # mirror the directory under its parent in the target tree
                # (joining t and dirname directly only worked one level deep)
                os.makedirs(os.path.join(root.replace(s, t), dirname), exist_ok=True)
            else:
                shutil.copy(os.path.join(root, filename), os.path.join(root.replace(s, t), filename))
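# Usage sketch (paths are placeholders, not from the original file):
#   list_files('/tmp/project')           # every file under the tree
#   copy('/tmp/project', '/tmp/backup')  # recursive copy built on walk()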
class RemoteFileUtil:

    def __init__(self, ip, username, password, port=22, local_dir=None, remote_dir=None):
        tran = paramiko.Transport((ip, port))
        tran.connect(username=username, password=password)
        # from_transport() already returns the SFTP client; the stray
        # .getfo() call in the original would have raised at construction
        self.sftp = paramiko.SFTPClient.from_transport(tran)
        self.local_dir = local_dir
        self.remote_dir = remote_dir

    def ls(self, remote_dir=None):
        if remote_dir is None:
            remote_dir = self.remote_dir
        return self.sftp.listdir_attr(remote_dir)

    def upload_file(self, local_filepath=None, remote_filepath=None, filename=None):
        if local_filepath is None:
            local_filepath = os.path.join(self.local_dir, filename)
        if remote_filepath is None:
            remote_filepath = os.path.join(self.remote_dir, filename)
        self.sftp.put(local_filepath, remote_filepath)

    def download_file(self, local_filepath=None, remote_filepath=None, filename=None):
        if local_filepath is None:
            local_filepath = os.path.join(self.local_dir, filename)
        if remote_filepath is None:
            remote_filepath = os.path.join(self.remote_dir, filename)
        self.sftp.get(remote_filepath, local_filepath)
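# Usage sketch (host and credentials are placeholders, not from the original file):
#   util = RemoteFileUtil('192.0.2.10', 'user', 'secret',
#                         local_dir='/tmp', remote_dir='/data')
#   util.upload_file(filename='report.csv')    # /tmp/report.csv -> /data/report.csv
#   util.download_file(filename='result.csv')  # /data/result.csv -> /tmp/result.csv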
@ -0,0 +1,82 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> pd_util
@IDE :PyCharm
@Author :rengengchen
@Date :2022/7/13 11:00
@Desc :
"""
from __future__ import annotations

import os
from typing import Callable, Hashable, Sequence

import pandas as pd
# Sequence/IndexLabel are not re-exported as pd.Sequence/pd.IndexLabel,
# so they are imported from typing and pandas._typing directly
from pandas._typing import CompressionOptions, FilePath, IndexLabel, StorageOptions, WriteBuffer
from pandas.core.generic import bool_t


class to_same_csv:
    """Callable that writes successive frames into one CSV, header only once."""

    def __init__(self,
                 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
                 sep: str = ",",
                 na_rep: str = "",
                 float_format: str | None = None,
                 columns: Sequence[Hashable] | None = None,
                 header: bool_t | list[str] = True,
                 index: bool_t = False,
                 index_label: IndexLabel | None = None,
                 mode: str = "w",
                 encoding: str = 'utf8',
                 compression: CompressionOptions = "infer",
                 quoting: int | None = None,
                 quotechar: str = '"',
                 line_terminator: str | None = None,
                 chunksize: int | None = None,
                 date_format: str | None = None,
                 doublequote: bool_t = True,
                 escapechar: str | None = None,
                 decimal: str = ".",
                 errors: str = "strict",
                 storage_options: StorageOptions = None,
                 prepare: Callable = None):
        self.not_first = False
        self.mode = mode
        # when appending to a file that already exists, never rewrite the header
        if self.mode == 'a' and isinstance(path_or_buf, str) and os.path.exists(path_or_buf):
            header = False
        self.header = header
        self.prepare = prepare
        self.kwargs = {'path_or_buf': path_or_buf,
                       'sep': sep,
                       'na_rep': na_rep,
                       'float_format': float_format,
                       'columns': columns,
                       'index': index,
                       'index_label': index_label,
                       'encoding': encoding,
                       'compression': compression,
                       'quoting': quoting,
                       'quotechar': quotechar,
                       'line_terminator': line_terminator,
                       'chunksize': chunksize,
                       'date_format': date_format,
                       'doublequote': doublequote,
                       'escapechar': escapechar,
                       'decimal': decimal,
                       'errors': errors,
                       'storage_options': storage_options}

    def __call__(self, df_or_series: pd.Series | pd.DataFrame):
        if self.not_first:
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
        else:
            if self.prepare:
                result = self.prepare(df_or_series)
                # compare with None: truth-testing a DataFrame raises ValueError
                if result is not None:
                    df_or_series = result
            df_or_series.to_csv(mode=self.mode, header=self.header, **self.kwargs)
            # switch to append mode after the first write; the original never
            # set not_first, so prepare() was re-run on every call
            self.mode = 'a'
            self.header = False
            self.not_first = True
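# Usage sketch (filenames are placeholders, not from the original file):
#   writer = to_same_csv('out.csv')
#   for chunk in pd.read_csv('big.csv', chunksize=100_000):
#       writer(chunk)  # header written once, later chunks appended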
@ -0,0 +1,17 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :IoD_data_analysis_tool -> phone_util
@IDE :PyCharm
@Author :rengengchen
@Date :2022/5/17 15:59
@Desc :
"""
import re

# mainland-China mobile numbers: an allocated 3-digit prefix plus 8 digits
re_phone = re.compile(r'^(?:(?:13[0-9])'
                      r'|(?:14(?:0|[5-7]|9))'
                      r'|(?:15(?:[0-3]|[5-9]))'
                      r'|(?:16(?:2|[5-7]))'
                      r'|(?:17[0-8])'
                      r'|(?:18[0-9])'
                      r'|(?:19(?:[0-3]|[5-9])))\d{8}$')
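# Usage sketch (numbers are made up):
#   bool(re_phone.match('13812345678'))  # True
#   bool(re_phone.match('12345678901'))  # False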
@ -0,0 +1,61 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :IoD_data_analysis_tool
@File :project_util.py
@IDE :PyCharm
@Author :rengengchen
@Time :2022/9/15 9:45
"""
import compileall
import os
import re
import shutil
from os.path import join

from lib.analysis_package.utils.file_util import walk

# strips the interpreter tag: module.cpython-39.pyc -> module.pyc
re_pyc = re.compile(r'cpython-\d+\.')


def compile_project(source, target=None):
    """
    Compile a project into .pyc files under a target directory.
    @param source: project path
    @param target: directory for the compiled files (defaults to in-place)
    """
    source = os.path.abspath(source)
    if target is None:
        target = source
    else:
        target = os.path.abspath(target)
    compileall.compile_dir(source)
    pycache_paths = set()
    if target == source:
        # in place: hoist each .pyc out of __pycache__, then drop the sources
        for root, dirname, filename in walk(source):
            if filename and root[-11:] == '__pycache__':
                pycache_paths.add(root)
                shutil.move(join(root, filename), join(root, '../', re_pyc.sub('', filename)))
            if filename and filename.endswith('.py'):
                os.remove(join(root, filename))
    else:
        len_t = len(target)
        for root, dirname, filename in walk(source):
            t_root = root.replace(source, target)
            # skip anything that already sits inside the target tree
            if target == root[:len_t]:
                continue
            if dirname and dirname != '__pycache__':
                t_root = join(t_root, dirname)
                if not os.path.exists(t_root) and join(source, dirname) != target:
                    os.makedirs(t_root)
            elif filename and not filename.endswith('.py'):
                if root[-11:] == '__pycache__':
                    pycache_paths.add(root)
                    t_root = t_root[:-11]
                    shutil.move(join(root, filename), join(t_root, re_pyc.sub('', filename)))
                else:
                    # non-Python resources are copied verbatim
                    shutil.copyfile(join(root, filename), join(t_root, filename))
    for p in pycache_paths:
        os.rmdir(p)
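# Usage sketch (paths are placeholders, not from the original file):
#   compile_project('lib/analysis_package')          # compile in place
#   compile_project('lib/analysis_package', 'dist')  # pyc-only tree in dist/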
Binary file not shown.
@ -0,0 +1,14 @@
# -*- coding: UTF-8 -*-
"""
@Project -> File :scrapyproject -> package_project
@IDE :PyCharm
@Author :rengengchen
@Date :2021/5/12 10:46
@Desc :
"""
import shutil
import subprocess

# pass the command as a list: a bare string without shell=True fails on POSIX
subprocess.call(['python', 'setup.py', 'bdist_wheel'])
shutil.rmtree(r'build')
shutil.rmtree(r'analysis_package.egg-info')
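# Usage sketch (wheel name inferred from setup.py's NAME/VERSION, unverified):
#   python package_project.py
#   # expected artifact: dist/analysis_package-0.1.3-py3-none-any.whl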
@ -0,0 +1,36 @@
# coding:utf-8
from setuptools import setup, find_packages

PACKAGE = "analysis_package"
NAME = "analysis_package"
DESCRIPTION = "general analysis function"
AUTHOR = "iod"
AUTHOR_EMAIL = "rengengchen@sics.ac.cn"
URL = ""
VERSION = '0.1.3'

setup(
    name=NAME,
    version=VERSION,
    description=DESCRIPTION,
    author=AUTHOR,
    author_email=AUTHOR_EMAIL,
    license="BSD",
    url=URL,
    include_package_data=True,
    packages=find_packages(),
    classifiers=[
        'Programming Language :: Python',
        'Operating System :: OS Independent',
    ],
    install_requires=[
        'pandas',
        'scipy',
        'numpy',
        'matplotlib',
        'seaborn',
        'tqdm',
        'scikit-learn',
    ],
    zip_safe=False,
)