import os
import logging

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

logger = logging.getLogger(__name__)


def _save_scatter_plot(x_values, y_values, output_path):
    """Draw a scatter plot of the two samples and write it to output_path."""
    # A dedicated figure keeps points drawn by earlier calls (or by unrelated
    # pyplot usage) out of this image.
    fig, ax = plt.subplots()
    try:
        ax.scatter(x_values, y_values)
        ax.set_xlabel("Feature A")
        ax.set_ylabel("Feature B")
        ax.set_title("Spearman's Rank Test")
        fig.savefig(output_path)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # release it even when saving fails.
        plt.close(fig)


def Spearman_rank_test(data_frame, feature_a, feature_b, save_path, file_name, sample_size=4000):
    """
    Run Spearman's rank correlation test on a random sample of two features.

    The sampled values are passed to scipy's ``spearmanr``; the coefficient
    and the outcome of comparing the p-value against alpha = 0.05 are written
    to the module logger, and a scatter plot of the sampled pairs is saved to
    ``os.path.join(save_path, file_name)``. Nothing is returned.

    @param data_frame: Table holding both features; indexed by feature name
    @param feature_a: Input first feature for Spearman's rank test
    @param feature_b: Input second feature for Spearman's rank test
    @param save_path: Directory the scatter plot is written to
    @param file_name: File name of the scatter plot inside save_path
    @param sample_size: Number of rows sampled to represent the population;
        capped at the number of available rows
    """
    column_a = data_frame[feature_a]
    column_b = data_frame[feature_b]

    # Asking for more rows than exist makes ``sample`` raise, so cap the
    # request. None is passed through untouched so pandas applies its default.
    n_rows = sample_size
    if n_rows is not None:
        n_rows = min(n_rows, len(column_a))

    # Both draws use the same size and seed; for columns of the same frame
    # this selects the same rows, which keeps the (a, b) pairs aligned.
    a = column_a.sample(n=n_rows, random_state=1)
    b = column_b.sample(n=n_rows, random_state=1)

    coef, p = spearmanr(a, b)
    logger.info("Spearmans' correlation coefficient is:%s", coef)

    _save_scatter_plot(a, b, os.path.join(save_path, file_name))

    alpha = 0.05
    if np.isnan(p):
        # A NaN p-value compares False against alpha, which would otherwise
        # be reported as a significant monotonic relationship.
        logger.warning(
            "Spearman's rank test is undefined for this sample (p=%s)", p
        )
    elif p > alpha:
        logger.info("Feature are uncorrelated(failed to reject H0) p=%s", p)
    else:
        logger.info("Features have a monotonic relationship(reject H0) p=%s", p)