util/lib/analysis_package/continuous/process_tool.py

49 lines
1.6 KiB
Python

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/3/25 9:09
# @Software : PyCharm
# @File : process_tool.py
# @Author : QT
# @Email : taoqimin@sics.ac.cn
import numpy as np
from tqdm import tqdm
import logging
# Module-level logger: records at INFO and above are written both to the file
# "log.txt" (relative to the current working directory) and to the console.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)

# File handler: the only handler that carries the timestamp/name/level format.
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# Console handler: no formatter is attached, so the logging default format
# (the bare message) is used for console output.
console = logging.StreamHandler()
console.setLevel(logging.INFO)

# Register each handler exactly once. Logger.addHandler() ignores a handler
# that is already attached, so a repeated call would be dead code.
logger.addHandler(handler)
logger.addHandler(console)
class NumericProcess:
    """Preprocessing helpers for data frames of numeric features."""

    @staticmethod
    def drop_feature(data_frame, thresh_hold):
        """
        Detect and drop highly correlated features.

        When two variables are highly correlated it usually causes problems
        such as multicollinearity. This function removes one feature from
        every pair whose absolute correlation exceeds the threshold.

        For each pair, the feature that appears earlier in the column order
        is the one removed, so the last member of a correlated group is kept.

        @param data_frame: Input dataframe. It is modified in place.
        @param thresh_hold: Correlation cut-off. It is compared against the
            absolute value of the correlation, so meaningful values lie
            between 0 and 1.
        @return: The same data_frame object, with the correlated columns
            removed.
        """
        # Work on absolute values so strong negative correlation counts too.
        matrix = data_frame.corr().abs()

        # Blank out the diagonal and the upper triangle: every pair is then
        # inspected once, and each column only sees the features after it.
        mask = np.triu(np.ones_like(matrix, dtype=bool))
        reduced_matrix = matrix.mask(mask)

        # Masked cells are NaN and compare as False, so they never trigger
        # a drop. Series.any() keeps the check vectorized.
        feature_drop = [
            column for column in tqdm(reduced_matrix)
            if (reduced_matrix[column] > thresh_hold).any()
        ]

        data_frame.drop(feature_drop, axis=1, inplace=True)
        # Lazy %-style arguments: the text is only built if the record is
        # actually emitted, and the resulting message is unchanged.
        logger.info("The following features are dropped due to Multicollinearity:%s", feature_drop)
        return data_frame