util/lib/analysis_package/continuous/analyzer.py

39 lines
1.3 KiB
Python

import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import logging
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def Spearman_rank_test(data_frame, feature_a, feature_b, save_path, file_name, sample_size=4000):
    """
    Run Spearman's rank correlation test on two columns and save a scatter plot.

    Spearman's correlation measures how well the relationship between two
    features can be described by a monotonic function. It works on ranks, so
    it suits ordinal data and continuous data whose relationship is not
    necessarily linear.

    The coefficient and the outcome of the hypothesis test (significance
    level 0.05, H0: the features are uncorrelated) are written to the module
    logger; nothing is returned.

    @param data_frame: Table holding both features, indexable by column name
    @param feature_a: Input first feature for Spearman's rank test
    @param feature_b: Input second feature for Spearman's rank test
    @param save_path: Output directory for the scatter plot
    @param file_name: Output file name for the scatter plot
    @param sample_size: Number of rows sampled to represent the population;
        capped at the number of available rows
    """
    column_a = data_frame[feature_a]
    column_b = data_frame[feature_b]

    # Asking for more rows than exist makes .sample() raise ValueError, so
    # fall back to using every available row in that case.
    n_rows = min(sample_size, len(column_a))

    # Same seed on same-length columns selects the same rows from both,
    # which keeps the (a, b) observations paired.
    a = column_a.sample(n=n_rows, random_state=1)
    b = column_b.sample(n=n_rows, random_state=1)

    coef, p = spearmanr(a, b)
    logger.info("Spearmans' correlation coefficient is:%s", coef)

    # Draw on a dedicated figure and always close it; plotting on pyplot's
    # implicit current figure would overlay points from earlier calls and
    # keep every figure alive in memory.
    fig, ax = plt.subplots()
    try:
        ax.scatter(a, b)
        ax.set_xlabel("Feature A")
        ax.set_ylabel("Feature B")
        ax.set_title("Spearman's Rank Test")
        fig.savefig(os.path.join(save_path, file_name))
    finally:
        plt.close(fig)

    alpha = 0.05
    if np.isnan(p):
        # A NaN p-value (e.g. constant column or too few rows) compares False
        # against alpha and would otherwise be reported as significant.
        logger.warning("Spearman's rank test is undefined for these features p=%s", p)
    elif p > alpha:
        logger.info("Feature are uncorrelated(failed to reject H0) p=%s", p)
    else:
        logger.info("Features have a monotonic relationship(reject H0) p=%s", p)