#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/3/25 9:09
# @Software : PyCharm
# @File : process_tool.py
# @Author : QT
# @Email : taoqimin@sics.ac.cn

import numpy as np
|
|
from tqdm import tqdm
|
|
import logging
|
|
|
|
# Module-level logger: INFO and above goes both to "log.txt" and to the console.
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)

# File handler: timestamped, fully qualified records written to log.txt
# (path is relative to the current working directory).
handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# Console handler: no formatter is attached to it, so it uses the logging
# module's default record layout.
console = logging.StreamHandler()
console.setLevel(logging.INFO)

# Attach each handler exactly once. The original code registered the file
# handler twice; Logger.addHandler ignores an already-attached handler, so
# the second call was a no-op and has been dropped.
logger.addHandler(handler)
logger.addHandler(console)
|
|
|
|
|
|
class NumericProcess:
    """Static helpers for pre-processing numeric feature tables."""

    @staticmethod
    def drop_feature(data_frame, thresh_hold):
        """
        Drop features that are highly correlated with another feature.

        Strongly correlated predictors tend to cause multicollinearity, so
        one member of every such pair is removed. The pairwise correlation
        matrix of ``data_frame`` is computed, its absolute value is taken,
        and only the strict lower triangle is kept (the diagonal and the
        upper triangle are masked out). A column is dropped when any of its
        remaining entries is strictly greater than ``thresh_hold``.

        The columns are removed from ``data_frame`` in place; the list of
        removed columns is written to the module logger.

        @param data_frame: Input dataframe; it is modified in place
        @param thresh_hold: Cut-off compared against the absolute correlation
                            value; columns exceeding it are dropped
        @return: The same ``data_frame`` object, with the columns removed
        """
        abs_corr = data_frame.corr().abs()

        # True on and above the diagonal: hides self-correlation and the
        # mirrored duplicate of every pair.
        upper_with_diag = np.triu(np.ones_like(abs_corr, dtype=bool))
        lower_only = abs_corr.mask(upper_with_diag)

        feature_drop = []
        for column in tqdm(lower_only):
            exceeds = lower_only[column] > thresh_hold
            if any(exceeds):
                feature_drop.append(column)

        data_frame.drop(columns=feature_drop, inplace=True)

        message = "The following features are dropped due to Multicollinearity:" + str(feature_drop)
        logger.info(message)
        return data_frame
|