Source code for stats

'''
This module contains functions for statistical analysis.
'''
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare, rankdata, chi2, norm
from scikit_posthocs import posthoc_nemenyi_friedman as posthoc_nemenyi
import pingouin as pg
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
import pyreadr
import subprocess
import os
from math import factorial
from itertools import combinations

# Create a custom function to convert the DataFrame to good data for a heatmap
# the values < 0.05 and "good" for the algorithm in the column are going to be 2 - value
# the values < 0.05 and "bad" for the algorithm in the row are going to be value
# the rest of values are going to be 1
# this function returns a DataFrame with the same shape as the original one
# always keep track of the name of the row and column of each cell,
# because the heatmap will be created using the row and column names
# a value is good if means[row_name] > means[col_name]
# a value is bad if means[row_name] < means[col_name]
def to_heatmap_custom(df, means):
    heatmap_data = pd.DataFrame(1, index=df.index, columns=df.columns)
    for i, row_name in enumerate(df.index):
        for j, col_name in enumerate(df.columns):
            value = df.loc[row_name, col_name]
            if value < 0.05:
                heatmap_data.loc[row_name, col_name] = 2 - value if means[i] > means[j] else value 
            if i == j:
                heatmap_data.loc[row_name, col_name] = 3      
    return heatmap_data

def to_image_custom(df_original, df_heatmap_data, filename, df_p_values = None):
    print("Entering to_image_custom")
    # Create a colormap from the list of colors
    cmap = ListedColormap(['red', 'gray', 'green', 'white'])
    sns.set(font_scale=1.2)
    plt.figure(figsize=(12, 10))
    col_labels = df_original.columns
    row_labels = df_original.index
    plt.tick_params(axis='both', which='major', labelsize=10, labelbottom=False, bottom=False, top=False, labeltop=True)
    # Create a copy of the p-values DataFrame to format the non-empty values
    p_values_formatted = df_p_values.copy()
    # Iterate over rows and columns to format non-empty values
    for i, row_name in enumerate(df_p_values.index):
        for j, col_name in enumerate(df_p_values.columns):
            val = df_p_values.loc[row_name, col_name]
            if val != "":
                p_values_formatted.loc[row_name, col_name] = f"{float(val):.2f}"

    if df_p_values is not None:
        sns.heatmap(df_heatmap_data, fmt="", linewidths=.5, cbar=False, cmap=cmap, xticklabels=col_labels, yticklabels=row_labels, annot=p_values_formatted.values)  # Use df_p_values as annotations
    else:
        sns.heatmap(df_heatmap_data, fmt="d", linewidths=.5, cbar=False, cmap=cmap, xticklabels=col_labels, yticklabels=row_labels)
    # save the figure
    plt.savefig(filename, bbox_inches='tight')
    plt.show()

def rankMatrix(data, decreasing=True):
    def rank_rows(row):
        if decreasing:
            return rankdata(-row, method='average')
        else:
            return rankdata(row, method='average')
    
    ranked_data = data.apply(rank_rows, axis=1)
    return ranked_data

def friedmanTest(data):
    N, k = data.shape
    mr = rankMatrix(data).mean(axis=0)
    
    friedman_stat = 12 * N / (k * (k + 1)) * (np.sum(mr**2) - k * (k + 1)**2 / 4)
    p_value = 1 - chi2.cdf(friedman_stat, df=k - 1)

    result = {
        "Friedman's chi-squared": friedman_stat,
        "df": k - 1,
        "p-value": p_value,
        "method": "Friedman's rank sum test",
        "data_name": str(data.columns.values)
    }
    return result

def processControlColumn(data, control):
    if control is not None:
        if isinstance(control, str):
            if control in data.columns:
                control = data.columns.get_loc(control)
            else:
                raise ValueError("The name of the column to be used as control does not exist in the data matrix")
        elif control > data.shape[1] - 1 or control < 0:
            raise ValueError(f"Non-valid value for the control parameter. It has to be either the name of a column or a number between 0 and {data.shape[1] - 1}")
    return control

def generatePairs(k, control):
    if control is None:
        pairs = [(i, j) for i in range(k) for j in range(i + 1, k)]
    else:
        pairs = [(control, i) for i in range(k) if i != control]
    return pairs

def buildPvalMatrix(pvalues, k, pairs, cnames, control):
    if control is None:
        matrix_raw = pd.DataFrame(np.nan, index=cnames, columns=cnames)
        for (i, j), pval in zip(pairs, pvalues):
            matrix_raw.iloc[i, j] = pval
            matrix_raw.iloc[j, i] = pval
    else:
        matrix_raw = pd.DataFrame(np.nan, columns=cnames, index=[0])
        for _, j in pairs:
            matrix_raw.iloc[0, j] = pvalues[j]
    return matrix_raw

def correctForMonotocity(pvalues):
    for i in range(1, len(pvalues)):
        pvalues[i] = max(pvalues[i], pvalues[i - 1])
    return pvalues

def friedmanPost(data, control=None):
    k = data.shape[1]
    N = data.shape[0]
    control = processControlColumn(data, control)
    pairs = generatePairs(k, control)
    
    mean_rank = rankdata(-data.values, axis=1, method='average').mean(axis=0)
    sd = np.sqrt(k * (k + 1) / (6 * N))
    pvalues = [2 * (1 - norm.cdf(abs(mean_rank[i] - mean_rank[j]) / sd)) for i, j in pairs]

    matrix_raw = buildPvalMatrix(pvalues, k, pairs, data.columns, control)
    return matrix_raw

def setUpperBound(vector, bound):
    return np.minimum(vector, bound)

def countRecursively(k):
    res = [0]
    if k > 1:
        res += countRecursively(k - 1)
        for j in range(2, k + 1):
            additional_values = [int((x + factorial(j) / (2 * factorial(j - 2)))) for x in countRecursively(k - j)]
            res += additional_values
    return sorted(set(res))

def adjustShaffer(raw_matrix):
    if not isinstance(raw_matrix, (pd.DataFrame, np.ndarray)):
        raise ValueError("This correction method requires a DataFrame or ndarray with the p-values of all the pairwise comparisons.")
    
    if raw_matrix.shape[0] != raw_matrix.shape[1]:
        raise ValueError("This correction method requires a square matrix or DataFrame with the p-values of all the pairwise comparisons.")
    
    k = raw_matrix.shape[0]
    pairs = generatePairs(k, None)
    raw_pvalues = [raw_matrix.iloc[i, j] for i, j in pairs]

    sk = countRecursively(k)[1:]  # Exclude 0 from the count

    # Replicate elements of sk (excluding the last one) according to the differences between elements
    replicated = np.repeat(sk[:-1], np.diff(sk))

    # Get the last element of sk
    last_element = sk[-1]

    # Combine the replicated part with the last element
    t_i = np.concatenate([replicated, [last_element]])
    t_i = list(t_i)
    t_i.reverse()

    o = np.argsort(raw_pvalues)
    adj_pvalues = np.array(raw_pvalues)[o] * t_i
    adj_pvalues = setUpperBound(adj_pvalues, 1) 
    adj_pvalues = correctForMonotocity(adj_pvalues)

    # Restoring original order
    adj_pvalues = adj_pvalues[o.argsort()]

    # Regenerating the matrix
    adj_matrix = raw_matrix.copy()
    for (i, j), pval in zip(pairs, adj_pvalues):
        adj_matrix.iloc[i, j] = adj_matrix.iloc[j, i] = pval

    return adj_matrix

[docs]def friedman_shaffer_scmamp(df):
    '''
    This function performs the Friedman test and the Shaffer post hoc test
    using the scmamp package.

    :param df: a pandas DataFrame with the results of the experiments
    :type df: pandas.DataFrame
    :return: a dictionary with the results of the Friedman test and the Shaffer post hoc test
    :rtype: dict
    '''
    # Create a R script to perform the Friedman test and the Shaffer post hoc test
    r_script = """
    library(scmamp)

    data <- read.csv("temp_data.csv", header = TRUE, row.names = NULL)
    htest <- friedmanTest(data)
    saveRDS(htest, file = "temp_htest.rds")
    raw.pvalues <- friedmanPost(data)
    saveRDS(raw.pvalues, file = "temp_raw_pvalues.rds")
    adjusted.pvalues <- adjustShaffer(raw.pvalues)
    saveRDS(adjusted.pvalues, file = "temp_adjusted_pvalues.rds")
    """
    # save the script to a temporary file
    with open('temp_script.R', 'w') as f:
        f.write(r_script)
    # we do not need the DataFrame index
    df = df.reset_index(drop=True)
    df.to_csv('temp_data.csv', index=False)
    subprocess.run(["Rscript", "temp_script.R"])

    # Read the RDS file temp_htest.rds into a Python dictionary
    htest_R = pyreadr.read_r("temp_htest.rds")
    Friedman_test = dict()
    Friedman_test["statistic"] = htest_R[None]['statistic'][0]
    Friedman_test["parameter"] = htest_R[None]['parameter'][0]
    Friedman_test["pvalue"] = htest_R[None]['p.value'][0]
    Friedman_test["method"] = htest_R[None]['method'][0]
    Friedman_test["data_name"] = htest_R[None]['data.name'][0]

    # Read the RDS file temp_raw_pvalues.rds into a Python dataframe
    raw_pvalues = pyreadr.read_r("temp_raw_pvalues.rds")
    raw_pvalues = raw_pvalues[None]
    raw_pvalues.columns = df.columns
    raw_pvalues.index = df.columns

    # Read the RDS file temp_adjusted_pvalues.rds into a Python dataframe
    adjusted_pvalues = pyreadr.read_r("temp_adjusted_pvalues.rds")
    adjusted_pvalues = adjusted_pvalues[None]
    adjusted_pvalues.columns = df.columns
    adjusted_pvalues.index = df.columns

    # Clean up
    os.remove('temp_data.csv')
    os.remove('temp_script.R')
    os.remove('temp_htest.rds')
    os.remove('temp_raw_pvalues.rds')
    os.remove('temp_adjusted_pvalues.rds')

    results = {
        'Friedman_test': Friedman_test,
        'raw_pvalues': raw_pvalues,
        'adjusted_pvalues': adjusted_pvalues
    }
    return results



def determine_color(val, row_name, col_name, means):
    if val < 0.05:
        if means[row_name] > means[col_name]:
            return 'green'
        else:
            return 'red'
    else:
        return 'gray'
    
def format_text(val):
    return f"{val:.2f}"

def friedman_dunn_test(dataframe):
    # Make a copy of the dataframe to avoid modifying the original one
    df = dataframe.copy()
    # Transpose the DataFrame to get the algorithms as rows and datasets as columns
    df = df.T
    # Perform the Friedman test
    friedman_result = pg.friedman(df)

    # Display the Friedman test results
    print(friedman_result)
    # Get the p-value from the Friedman test result
    friedman_pvalue = friedman_result['p-unc'][0]

    # Perform the Bonferroni-Dunn post hoc test
    posthoc_dunn = pg.pairwise_ttests(data=df, parametric=False, padjust='bonferroni')

    # Display the post hoc test results
    print(posthoc_dunn)


def friedman_nemenyi_test(dataframe, already_transposed=False, filename="nemenyi.jpg"):
    # Make a copy of the dataframe to avoid modifying the original one
    df = dataframe.copy()
    
    # Replace '_' (not preceded by a backslash) with '\_' in column names for LaTeX compatibility
    df.columns = df.columns.str.replace(r'(?<!\\)_', r'\_')
    
    # Compute mean values for each algorithm
    means = df.mean()

    means_2 = means.copy()
    means_2.reset_index(drop=True, inplace=True)
    means_2.index = means_2.index.astype(str)

    # Perform Friedman test
    ranks = df.rank(axis=1, method='average', ascending=False)
    friedman_result = friedmanchisquare(*ranks.values.T)

    # Compute mean ranks
    mean_ranks = ranks.mean()

    # Transpose the DataFrame to get the algorithms as rows and datasets as columns
    if not already_transposed:
        df = df.T

    print(df)

    # Perform Nemenyi posthoc test
    nemenyi_result = posthoc_nemenyi(df.values)
    print("nemenyi_result")
    print(nemenyi_result)
    print(df.index)
    nemenyi_result.columns = df.columns
    nemenyi_result.index = df.columns

    # Create a DataFrame of the Nemenyi results
    nemenyi_df = pd.DataFrame(nemenyi_result)
    print("nemenyi_df")
    print(nemenyi_df)

    # Create a custom function to convert the DataFrame to good data for a heatmap
    # the values < 0.05 and "good" for the algorithm in the column are going to be 2 - value
    # the values < 0.05 and "bad" for the algorithm in the row are going to be value
    # the rest of values are going to be 1
    # this function returns a DataFrame with the same shape as the original one
    # always keep track of the name of the row and column of each cell,
    # because the heatmap will be created using the row and column names
    # a value is good if means[row_name] > means[col_name]
    # a value is bad if means[row_name] < means[col_name]
    def to_heatmap_custom(df, means):
        heatmap_data = pd.DataFrame(1, index=df.index, columns=df.columns)
        for i, row_name in enumerate(df.index):
            for j, col_name in enumerate(df.columns):
                value = df.loc[row_name, col_name]
                if value < 0.05:
                    heatmap_data.loc[row_name, col_name] = 2 - value if means[i] > means[j] else value 
                if i == j:
                    heatmap_data.loc[row_name, col_name] = 3      
        return heatmap_data

    # Create a custom function to convert the DataFrame to a LaTeX table
    def to_latex_custom(df, means):
        header = ' & ' + ' & '.join(df.columns) + ' \\\\ \\hline'
        rows = []
        for row_name, row in df.iterrows():
            formatted_values = []
            for col_name, val in row.iteritems():
                if val < 0.05:
                    direction = '+' if means[row_name] > means[col_name] else '-'
                    formatted_values.append(f"\\bf{{{val:.3f} {direction}}}")
                else:
                    formatted_values.append(f"{val:.3f}")
            rows.append(f"{row_name} & " + ' & '.join(formatted_values) + ' \\\\ \\hline')
        return '\\begin{tabular}{|' + 'c|' * (df.shape[1] + 1) + '}\n' + header + '\n' + '\n'.join(rows) + '\n\\end{tabular}'

    def to_image_custom(df_original, df_heatmap_data, filename):
        # Create a colormap from the list of colors
        cmap = ListedColormap(['red', 'gray', 'green', 'white'])
        sns.set(font_scale=1.2)
        plt.figure(figsize=(12, 10))
        col_labels = df_original.columns
        row_labels = df_original.index
        plt.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = False, labeltop=True)
        sns.heatmap(df_heatmap_data, fmt="d", linewidths=.5, cbar=False, cmap=cmap, xticklabels=col_labels, yticklabels=row_labels)
        # save the figure
        plt.savefig(filename, bbox_inches='tight')
        plt.show()

    # Convert the DataFrame to a LaTeX table
    nemenyi_table = to_latex_custom(nemenyi_df, means)

    nemenyi_df_2 = nemenyi_df.copy()
    nemenyi_df_2.reset_index(drop=True, inplace=True)
    nemenyi_df_2.columns = nemenyi_df_2.index
    # change the column names to strings
    nemenyi_df_2.columns = nemenyi_df_2.columns.astype(str)

    nemenyi_table_2 = to_latex_custom(nemenyi_df_2, means_2)
    nemenyi_df.columns = df.columns
    nemenyi_df.index = df.columns
    print("nemenyi_df_before_ordering")
    print(nemenyi_df)
    ordered_columns = ["RANSAC-LP-957", "RANSAC-LP-747", "RANSAC-LP-558", "RANSAC-LP-388", "RANSAC-LP-239", "RANSAC-LP-109", 
                    "RANSAC-957", "RANSAC-747", "RANSAC-558", "RANSAC-388", "RANSAC-239", "RANSAC-109"]
    nemenyi_df = nemenyi_df.loc[ordered_columns,ordered_columns]
    nemenyi_df_heatmap_data = to_heatmap_custom(nemenyi_df, means)
    print("nemenyi_df_heatmap_data")
    print(nemenyi_df_heatmap_data)
    to_image_custom(nemenyi_df, nemenyi_df_heatmap_data, filename)

    results = {
        'friedman_result': friedman_result,
        'mean_ranks': mean_ranks,
        'nemenyi_result': nemenyi_result,
        'nemenyi_df': nemenyi_df,
        'nemenyi_table': nemenyi_table,
        'nemenyi_df_2': nemenyi_df_2,
        'nemenyi_table_2': nemenyi_table_2
    }

    return results