Source code for sihnpy.imbalance_mapping

""" Imbalance mapping module
"""

import pandas as pd
import numpy as np

[docs]def odregression_single(index, x, y):
    """Function computing orthogonal regression. Was developped as an adaptation of the Pracma
    library in R. Results were tested against Pracma and scipy's ODR implementation.

    The ODR implementation uses Singular Value Decomposition (SVD) to find the model values. Note
    that `sihnpy`'s imbalance mapping really only requires the orthogonal distances from the models.
    As such, there isn't a lot of focus on the other model measures in this implementation.

    Also note that the function, as it is a direct translation from Pracma, can accept
    multiple independant variables. However, this wasn't tested as the primary goal of the
    implementation was to use ODR for covariance between 2 brain regions. Use at your risks!

    For more customization options, I recommend using SciPy's version which includes a lot more
    options: https://docs.scipy.org/doc/scipy/reference/odr.html#module-scipy.odr

    I'm far from a mathematician; I have adapted the code exactly, but I don't always fully
    understand what Pracma's developer did in some instances. As such, some steps are not always
    clear in terms of what they do. For more info, refer to Pracma's docs:
    https://rdrr.io/cran/pracma/man/odregress.html

    Parameters
    ----------
    index : pandas.DataFrame.index
        Expects the index of the dataframe containing the IDs to output with the data. A 
        `numpy.ndarray` or `list` of index values can also be accepted, but the final
        column will not have a title.
    x : numpy.ndarray
        Expects a numpy array containing the values of the predictor. I recommend feeding
        the `pandas.Series` with the added `.values` method to this argument.
    y : numpy.ndarray
        Expects a numpy array containing the values of the outcome. I recommend feeding
        the `pandas.Series` with the added `.values` method to this argument.

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with individual-level model-computed measures and a Dataframe
        with model variables (slope, intercept, total sum of squares)
    """

    #Prep objects we need
    variables_mat = np.column_stack((x, y)) #Concat independant and dependant variables
    n_indvar = np.shape(variables_mat)[1] - 1 #Grab number of independant variables

    #Create matrix of same shape as variables_mat, but each row is the average value of each column
    mean_mat = np.full((len(variables_mat), 2), np.mean(variables_mat, axis=0))

    #Execute singular value decomposition on the variables minus the mean
    # (i.e., right singular values when substracting the mean from the values).
    v_mat = np.linalg.svd(variables_mat - mean_mat)[2]

    #Compute intercept and slope of the regression. This is based on the right singular values from
    # the SVD.
    intercept = -v_mat[0:n_indvar, n_indvar] / v_mat[n_indvar, n_indvar]
    slope = np.mean(np.matmul(variables_mat, v_mat[n_indvar,:])) / v_mat[n_indvar, n_indvar]

    #Compute model measures (fit, residuals, orthogonal distances, sum of squares)
    fitted_values = np.matmul(
    ## Numpy will whine if not setting type to object.
        np.array([variables_mat[:,0], 1], dtype=object), 
        np.array([intercept, slope], dtype=object)
    )
    residuals = variables_mat[:,1] - fitted_values 

    normal = v_mat[:,n_indvar] #Not 100% sure what this does, but necessary to get the distances
    abs_orthogonal_distances = np.abs(np.matmul(variables_mat - mean_mat, normal)) #Orthogonal distances
    sig_orthogonal_distances = abs_orthogonal_distances*np.sign(residuals) #Same as above, but with the sign
    sum_square = sum(abs_orthogonal_distances**2) 

    #Store the values for export
    individual_values = pd.DataFrame(
        data={"participant_id":index,
            "y_values":variables_mat[:,n_indvar],
            "fitted_values":fitted_values,
            "residuals":residuals,
            "absolute_orthogonal_distances":abs_orthogonal_distances,
            "signed_orthogonal_distances":sig_orthogonal_distances}
    ).set_index('participant_id')

    model_fit = pd.DataFrame(
        data={"slope":slope, "intercept":intercept,"total_sum_of_squares":sum_square},
        index=['values']
    ).T #Flip the table so the metrics are the index instead of columns

    return individual_values, model_fit

[docs]def _pre_mapping(data):
    """Create the 3D array necessary to store the orthogonal distances.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe containing the data to input to imbalance mapping.

    Returns
    -------
    numpy.ndarray
        A 3D numpy.ndarray of size AxAxP, where A is the number of regions
        and P is the number of participants.
    """

    num_regions = len(data.columns.values) #Compute number of brain regions
    num_participants = len(data) #Number of participants includes

    #Final array to hold the data per participant
    residual_array = np.empty((num_regions, num_regions, num_participants))

    return residual_array

[docs]def imbalance_mapping(data, type='sign'):
    """Imbalance mapping function. For each column in the original data, compute covariance
    (i.e., regression) using orthogonal distance regression. Orthogonal distance is computed
    for each participant individually for each regression. We store the orthogonal distance
    for each participant, for each pair of brain region in a 3D symmetric matrix.

    The script gives the option of using either "absolute" or the "signed" distances. The sign
    add the distinction of showing whether a participant is below or above the regression line.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe where the index is the participant IDs and the columns are the brain data.
    type : str, optional
        Type of orthogonal to compute (signed or absolute), by default 'sign'.

    Returns
    -------
    numpy.ndarray
        Returns a numpy.ndarray of size AxAxP, where A is the number of region and P is the number
        of participants. The array is filled with the orthogonal distances resulting from the
        orthogonal regression.
    """

    #Compute variables needed to run
    residual_array = _pre_mapping(data) #3D matrix. Axis 0 and 1 are the brain regions, 
    # axis 2 are participants

    #Loop over the data
    for i, region_i in enumerate(data):
        print(f'Computing region {i+1}/{len(data.columns)}')
        data_i = data[region_i]

        for j, region_j in enumerate(data):
            data_j = data[region_j]

            if type == 'abs':
                #Extract orthogonal distances from output of ODR
                odr_dist = odregression_single(index=data.index,
                        x=data_i.values, y=data_j.values)[0]['absolute_orthogonal_distances'].values
            else:
                odr_dist = odregression_single(index=data.index,
                        x=data_i.values, y=data_j.values)[0]['signed_orthogonal_distances'].values

            #Store the residuals in the 3D array.
            residual_array[i,j,:] = odr_dist #Order of the 3th dimension will match the input order

    return residual_array

[docs]def _by_person(residual_array):
    """Hidden function computing the mean orthogonal distance by participant. The 3rd dimension
    of the residual_array numpy.ndarray is the matrices of each individual participants. We extract
    the upper triangle of each matrix and compute the mean. This is equivalent to the Figure 1D in
    Nadig et al. (2021).

    Parameters
    ----------
    residual_array : numpy.ndarray
        numpy.ndarray of size AxAxP, where A is the number of region and P is the number
        of participants. The array is filled with the orthogonal distances resulting from the
        orthogonal regression.

    Returns
    -------
    list
        Returns a list of size P, where P is the number of participants. Each list element is
        the mean imbalance in 1 individual.
    """

    list_means = []

    for person in range(0, residual_array.shape[2]):
        #Extract person-wise array
        person_array = residual_array[:,:,person]

        #Extract upper triangle
        person_array_mod = person_array[np.triu_indices_from(person_array, k=1)]

        #Compute individuals' mean imbalance
        list_means.append(np.mean(person_array_mod))

    return list_means

[docs]def _by_region(residual_array):
    """Hidden function computing the mean orthogonal distance by region across all participants. 
    In numpy terms, this is equivalent to averaging over the first and third dimension of the 
    matrix. The diagonal of the matrices on the third dimension are perfect correlations (i.e.,
    correlation of region R to region R) so we remove them first. The result is equivalent to
    Figure 1E in Nadig et al. (2021).

    Parameters
    ----------
    residual_array : numpy.ndarray
        numpy.ndarray of size AxAxP, where A is the number of region and P is the number
        of participants. The array is filled with the orthogonal distances resulting from the
        orthogonal regression.

    Returns
    -------
    np.ndarray
        Returns a numpy array of size Ax1, where A is the number of regions.
    """

    resid_copy = residual_array.copy()

    #Fill the diagonals of each person with missing values (as we don't want the correlation if i==j)
    for person in range(0, residual_array.shape[2]):
        np.fill_diagonal(resid_copy[:,:,person], np.NaN)

    return np.nanmean(resid_copy, axis=(0,2))

[docs]def _by_person_by_region(residual_array):
    """Hidden function computing the mean orthogonal distance by region for each participant. 
    In numpy terms, this is equivalent to averaging over the first dimension of the 
    matrix. The diagonal of the matrices on the third dimension are perfect correlations (i.e.,
    correlation of region R to region R) so we remove them first. 

    Parameters
    ----------
    residual_array : numpy.ndarray
        numpy.ndarray of size AxAxP, where A is the number of region and P is the number
        of participants. The array is filled with the orthogonal distances resulting from the
        orthogonal regression.

    Returns
    -------
    np.ndarray
        Returns a numpy array of size PxA, where P is the number of participants and A
        is the number of regions.
    """

    resid_copy = residual_array.copy()

    #Fill the diagonals of each person with missing values (as we don't want the correlation if i==j)
    for person in range(0, residual_array.shape[2]):
        np.fill_diagonal(resid_copy[:,:,person], np.NaN)  

    return np.nanmean(resid_copy, axis=0).T

[docs]def imbalance_stats(data, residual_array):
    """ Function computing the imbalance measures described in Nadig et al. (2021). We additionally
    compute an average imbalance by region by person.

    The function outputs three dataframes.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe containing the data used for computing the imbalance. We use it to extract
        the index and use that for the new dataframes
    residual_array : np.ndarray
        numpy.ndarray of size AxAxP, where A is the number of region and P is the number
        of participants. The array is filled with the orthogonal distances resulting from the
        orthogonal regression.

    Returns
    -------
    pandas.DataFrame
        Returns three dataframes containing the measures computed for the imbalance analysis.
    """

    #Compute imbalance metrics
    list_imbalance_person = _by_person(residual_array=residual_array)
    avg_by_region = _by_region(residual_array=residual_array)
    avg_imb_by_pers_by_region = _by_person_by_region(residual_array=residual_array)

    #Store the metrics in dataframes
    avg_imb_by_region = pd.DataFrame(index=data.columns.values, 
                                data=avg_by_region, columns=['avg_imbalance_by_region'])
    avg_imb_by_person = pd.DataFrame(index=data.index, 
                                data=list_imbalance_person, columns=['avg_imbalance_by_person'])
    avg_imb_by_pers_by_region = pd.DataFrame(index=data.index,
                                data=avg_imb_by_pers_by_region,
                                columns=data.columns.values)

    return avg_imb_by_region, avg_imb_by_person, avg_imb_by_pers_by_region

[docs]def export(data, residual_array, output_path, avg_imb_by_region, avg_imb_by_person, avg_imb_by_pers_by_region, name, all=False):
    """Function to export the results of the imbalance mapping to files. If requested by the user
    `sihnpy` will also output the individual orthogonal distances matrices.

    Parameters
    ----------
    data : pandas.DataFrame
        Original dataframe containing the data to use with the imbalance mapping.
    residual_array : numpy.ndarray
        numpy.ndarray of size AxAxP, where A is the number of region and P is the number
        of participants. The array is filled with the orthogonal distances resulting from the
        orthogonal regression.
    output_path : str
        Local path where the data should be output.
    avg_imb_by_region : pandas.DataFrame
        Dataframe containing the average imbalance by region
    avg_imb_by_person : pandas.DataFrame
        Dataframe containing the average imbalance by person
    avg_imb_by_pers_by_region : pandas.DataFrame
        Dataframe containing the average imbalance by region, by person
    name : str
        String to add as a suffix for all the output (user's choice)
    all : bool, optional
        Whether the user wants to output all individual participants' matrices, by default False
    """

    avg_imb_by_region.to_csv(f"{output_path}/avg_imbalance_by_region_{name}.csv")
    avg_imb_by_person.to_csv(f"{output_path}/avg_imbalance_by_person_{name}.csv")
    avg_imb_by_pers_by_region.to_csv(f"{output_path}/avg_imbalance_by_person_by_region_{name}.csv")

    if all is True: #Output all the individual imbalance mapping if user requests
        for i, person in enumerate(data.index.values):
            resid_i = residual_array[:,:,i]  #Extract individual matrix for each participant
            np.fill_diagonal(resid_i, np.NaN) #Fill the diagonal with missing values
            #Save the matrices
            np.savetxt(f"{output_path}/imbalance_{person}_{name}.txt", resid_i, fmt='%1.3f')