156 lines
7.1 KiB
Python
156 lines
7.1 KiB
Python
|
# -*- coding: UTF-8 -*-
|
|||
|
"""
|
|||
|
@Project -> File :IoD_data_analysis_tool -> correlation
|
|||
|
@IDE :PyCharm
|
|||
|
@Author :rengengchen
|
|||
|
@Date :2022/7/4 16:48
|
|||
|
@Desc :
|
|||
|
"""
|
|||
|
import matplotlib.pyplot as plt
|
|||
|
import numpy as np
|
|||
|
import pandas as pd
|
|||
|
import seaborn as sns
|
|||
|
from scipy.stats import spearmanr
|
|||
|
|
|||
|
|
|||
|
def spearmanr(a: pd.Series, b: pd.Series = None, axis=0, nan_policy='propagate',
              alternative='two-sided', sample_size=4000, random_state=None):
    """Calculate a Spearman correlation coefficient with associated p-value.

    Thin wrapper around ``scipy.stats.spearmanr``: ``a``, ``b``, ``axis``,
    ``nan_policy`` and ``alternative`` are forwarded unchanged.

    The Spearman rank-order correlation coefficient is a nonparametric measure
    of the monotonicity of the relationship between two datasets. Unlike the
    Pearson correlation, the Spearman correlation does not assume that both
    datasets are normally distributed. Like other correlation coefficients,
    this one varies between -1 and +1 with 0 implying no correlation.
    Correlations of -1 or +1 imply an exact monotonic relationship. Positive
    correlations imply that as x increases, so does y. Negative correlations
    imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    a, b : 1D or 2D array_like, b is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
        Both arrays need to have the same length in the ``axis`` dimension.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain observations.
        If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan.
        The following options are available (default is 'propagate'):

        * 'propagate': returns nan
        * 'raise': throws an error
        * 'omit': performs the calculations ignoring nan values

    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

    sample_size : int, optional
        Intended number of observations to sample before computing the
        correlation. Default is 4000. Subsampling is currently disabled, so
        this argument is accepted but has no effect.
    random_state : int, array-like, BitGenerator, np.random.RandomState, optional
        Intended seed / random state for the subsampling step. Subsampling is
        currently disabled, so this argument is accepted but has no effect.

    Returns
    -------
    correlation : float or ndarray (2-D square)
        Spearman correlation matrix or correlation coefficient (if only 2
        variables are given as parameters). Correlation matrix is square with
        length equal to total number of variables (columns or rows) in ``a``
        and ``b`` combined.
    pvalue : float
        The p-value for a hypothesis test whose null hypothesis
        is that two sets of data are uncorrelated. See `alternative` above
        for alternative hypotheses. `pvalue` has the same
        shape as `correlation`.

    References
    ----------
    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
       Probability and Statistics Tables and Formulae. Chapman & Hall: New
       York. 2000.
       Section 14.7

    Examples
    --------
    >>> rho, pval = spearmanr(pd.Series([1, 2, 3, 4, 5]), pd.Series([5, 6, 7, 8, 7]))
    >>> print(f"{rho:.5f}")
    0.82078
    >>> print(f"{pval:.5f}")
    0.08859
    """
    # This function shadows the module-level name ``spearmanr``, so calling
    # ``spearmanr(...)`` here would recurse forever. Import the SciPy
    # implementation under a private alias and delegate to it explicitly.
    from scipy.stats import spearmanr as _scipy_spearmanr

    # NOTE: subsampling with ``sample_size`` / ``random_state`` is not applied;
    # the full inputs are passed straight through to SciPy.
    return _scipy_spearmanr(a, b, axis=axis, nan_policy=nan_policy, alternative=alternative)
|
|||
|
|
|||
|
|
|||
|
def corr(df, method='pearson', drop=False, threshold=0, plot=True, filepath=None, figsize=None):
    """Compute the pairwise correlation matrix of ``df`` and optionally plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; the matrix is computed with ``df.corr(method=method)``.
    method : str or callable, optional
        Correlation method forwarded to ``DataFrame.corr``. Default 'pearson'.
    drop : bool, optional
        If True, remove every variable whose absolute correlation with any
        variable preceding it (in column order) is greater than ``threshold``.
        The variable is removed from both the rows and the columns.
    threshold : float, optional
        Absolute-correlation cut-off used when ``drop`` is True. Default 0.
    plot : bool, optional
        If True, display a heatmap of the lower triangle with ``plt.show()``.
    filepath : str or path-like, optional
        If given, the heatmap is saved to this location.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    pandas.DataFrame
        The (possibly reduced) correlation matrix.

    Notes
    -----
    The matplotlib ``rcParams`` for the sans-serif font and the minus sign are
    only changed when a figure is actually produced (``plot`` or ``filepath``).
    """
    corr_matrix = df.corr(method=method)

    if drop:
        # The strictly lower triangle compares each variable (row) only with
        # the variables that come before it, so the first member of a
        # correlated group is always kept.
        lower_abs = np.abs(np.tril(corr_matrix, k=-1))
        keep = ~np.any(lower_abs > threshold, axis=1)
        corr_matrix = corr_matrix.loc[keep, keep]

    if plot or filepath:
        # SimHei provides the CJK glyphs needed by the title below; disabling
        # unicode_minus keeps negative tick labels renderable with that font.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        cmap = sns.diverging_palette(250, 15, s=95, l=40, n=9, center="light", as_cmap=True)

        # Hide the upper triangle and the diagonal: they only mirror the
        # lower triangle / hold the trivial self-correlation.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(corr_matrix, mask=mask, center=0, annot=True, fmt='.2f',
                    cmap=cmap, square=True, ax=ax)
        ax.set_title("相关性矩阵")  # "Correlation matrix"

        if filepath:
            fig.savefig(filepath)
        if plot:
            plt.show()
        else:
            # Nothing will display this figure; release it so repeated
            # save-only calls do not accumulate open figures.
            plt.close(fig)

    return corr_matrix
|