# util/lib/analysis_package/continuous/correlation.py

# -*- coding: UTF-8 -*-
"""
@Project -> File   ：IoD_data_analysis_tool -> correlation
@IDE    ：PyCharm
@Author ：rengengchen
@Date   ：2022/7/4 16:48
@Desc   ：Correlation utilities: a Spearman rank-correlation wrapper and a correlation-matrix heatmap helper.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr


def spearmanr(a: pd.Series, b: pd.Series = None, axis=0, nan_policy='propagate',
              alternative='two-sided', sample_size=4000, random_state=None):
    """Calculate a Spearman correlation coefficient with associated p-value.

    Thin wrapper that forwards to :func:`scipy.stats.spearmanr`.

    The Spearman rank-order correlation coefficient is a nonparametric
    measure of the monotonicity of the relationship between two datasets.
    It varies between -1 and +1, with 0 implying no correlation and -1 / +1
    implying an exact monotonic relationship.

    Parameters
    ----------
    a, b : 1D or 2D array_like, b is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the 2-D case see ``axis``.
        Both arrays need to have the same length in the ``axis`` dimension.
    axis : int or None, optional
        If axis=0 (default), each column represents a variable, with
        observations in the rows. If axis=1, each row represents a variable
        and the columns contain observations. If axis=None, both arrays are
        raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        How to handle NaN in the input (default is 'propagate'):

        * 'propagate': returns nan
        * 'raise': throws an error
        * 'omit': performs the calculations ignoring nan values

    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis (default is 'two-sided'):

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

    sample_size : int, optional
        Accepted for interface compatibility; currently NOT used. No
        sub-sampling is performed and all observations are passed to SciPy.
    random_state : int, array-like, BitGenerator, np.random.RandomState, optional
        Accepted for interface compatibility; currently NOT used (see
        ``sample_size``).

    Returns
    -------
    result
        The object returned by :func:`scipy.stats.spearmanr`. It unpacks as
        ``(correlation, pvalue)``: a float pair when only two variables are
        given, otherwise two square matrices whose side equals the total
        number of variables in ``a`` and ``b`` combined.

    Examples
    --------
    >>> rho, pval = spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
    >>> print(f"{rho:.4f} {pval:.4f}")
    0.8208 0.0886
    """
    # The ``def`` above rebinds the module-level name ``spearmanr`` to this
    # wrapper, shadowing the SciPy function imported at the top of the file.
    # Calling ``spearmanr`` here would therefore recurse until RecursionError,
    # so the SciPy implementation is imported under a distinct private alias.
    from scipy.stats import spearmanr as _scipy_spearmanr

    # NOTE: sub-sampling with ``sample_size`` / ``random_state`` is not
    # implemented. Sampling ``a`` and ``b`` independently would break the
    # pairing of observations unless both draws select identical positions.
    return _scipy_spearmanr(a, b, axis=axis, nan_policy=nan_policy, alternative=alternative)


def corr(df, method='pearson', drop=False, threshold=0, plot=True, filepath=None, figsize=None):
    """Compute the pairwise correlation matrix of ``df`` and optionally plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; ``df.corr(method=method)`` is used to build the matrix.
    method : str or callable, optional
        Correlation method forwarded to ``DataFrame.corr`` (default
        ``'pearson'``).
    drop : bool, optional
        If True, remove every variable whose absolute correlation with any
        variable that precedes it (in column order) exceeds ``threshold``.
        The comparison is made against all preceding variables, including
        ones that are themselves removed.
    threshold : float, optional
        Absolute-correlation cut-off used when ``drop`` is True (default 0).
    plot : bool, optional
        If True, display the heatmap with ``plt.show()``.
    filepath : str or path-like, optional
        If given (truthy), the heatmap is saved to this location.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    pandas.DataFrame
        The (possibly reduced) square correlation matrix.

    Notes
    -----
    When a heatmap is drawn, the global matplotlib ``rcParams`` entries
    ``font.sans-serif`` and ``axes.unicode_minus`` are overwritten so the
    Chinese title and minus signs can render. Nothing plotting-related is
    touched when neither ``plot`` nor ``filepath`` is set.
    """
    corr_matrix = df.corr(method=method)

    if drop:
        # Strictly-lower triangle: row i holds the correlations of variable i
        # with the variables that come before it.
        strict_lower = np.tril(corr_matrix.to_numpy(), k=-1)
        keep = ~(np.abs(strict_lower) > threshold).any(axis=1)
        corr_matrix = corr_matrix.loc[keep, keep]

    if plot or filepath:
        # Font able to render the CJK title; keep '-' as an ASCII minus so it
        # is not replaced by a glyph the font may lack.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        cmap = sns.diverging_palette(250, 15, s=95, l=40, n=9, center="light", as_cmap=True)

        # Hide the upper triangle and the diagonal (redundant in a symmetric matrix).
        mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(corr_matrix, mask=mask, center=0, annot=True, fmt='.2f',
                    cmap=cmap, square=True, ax=ax)
        ax.set_title("相关性矩阵")  # title text means "Correlation matrix"

        if filepath:
            fig.savefig(filepath)
        if plot:
            plt.show()
        else:
            # Save-only path: release the figure so repeated calls do not
            # accumulate open figures in pyplot's registry.
            plt.close(fig)

    return corr_matrix