#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/3/25 9:09
# @Software : PyCharm
# @File : process_tool.py
# @Author : QT
# @Email : taoqimin@sics.ac.cn
import logging

import numpy as np

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws a progress bar; degrade to plain iteration without it.
    def tqdm(iterable, *args, **kwargs):
        """Return *iterable* unchanged (fallback when tqdm is not installed)."""
        return iterable

# Module logger: records go to "log.txt" (timestamped format) and to the
# console (default format).
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)

handler = logging.FileHandler("log.txt")
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

console = logging.StreamHandler()
console.setLevel(logging.INFO)

# Register each handler exactly once; attaching the file handler twice would
# write every record to log.txt twice.
logger.addHandler(handler)
logger.addHandler(console)


class NumericProcess:
    """Helpers for preprocessing numeric data frames."""

    @staticmethod
    def drop_feature(data_frame, thresh_hold: float):
        """
        Detect and drop highly correlated features.

        When two variables are highly correlated they usually cause problems
        such as multicollinearity. This function removes one feature of every
        such pair: a column is dropped when its absolute correlation with any
        column that comes after it exceeds ``thresh_hold``.

        The input frame is modified in place and also returned.

        @param data_frame: Input dataframe (mutated in place)
        @param thresh_hold: Cut-off compared against the ABSOLUTE correlation,
                            so the meaningful range is 0 to 1; a column is
                            dropped when the value is strictly greater
        @return: The same ``data_frame`` object with the correlated columns removed
        """
        matrix = data_frame.corr().abs()

        # Hide the diagonal and the upper triangle so each pair is inspected
        # once and a column is never compared with itself.
        upper = np.triu(np.ones(matrix.shape, dtype=bool))
        reduced_matrix = matrix.mask(upper)

        # Comparisons against the masked (NaN) cells are False, so only the
        # remaining lower-triangle entries can trigger a drop.
        feature_drop = [
            c for c in tqdm(reduced_matrix)
            if (reduced_matrix[c] > thresh_hold).any()
        ]

        data_frame.drop(feature_drop, axis=1, inplace=True)
        logger.info(
            "The following features are dropped due to Multicollinearity:%s",
            feature_drop,
        )
        return data_frame