Source code for sihnpy.imbalance_mapping

""" Imbalance mapping module
"""

import pandas as pd
import numpy as np

[docs]def odregression_single(index, x, y): """Function computing orthogonal regression. Was developped as an adaptation of the Pracma library in R. Results were tested against Pracma and scipy's ODR implementation. The ODR implementation uses Singular Value Decomposition (SVD) to find the model values. Note that `sihnpy`'s imbalance mapping really only requires the orthogonal distances from the models. As such, there isn't a lot of focus on the other model measures in this implementation. Also note that the function, as it is a direct translation from Pracma, can accept multiple independant variables. However, this wasn't tested as the primary goal of the implementation was to use ODR for covariance between 2 brain regions. Use at your risks! For more customization options, I recommend using SciPy's version which includes a lot more options: https://docs.scipy.org/doc/scipy/reference/odr.html#module-scipy.odr I'm far from a mathematician; I have adapted the code exactly, but I don't always fully understand what Pracma's developer did in some instances. As such, some steps are not always clear in terms of what they do. For more info, refer to Pracma's docs: https://rdrr.io/cran/pracma/man/odregress.html Parameters ---------- index : pandas.DataFrame.index Expects the index of the dataframe containing the IDs to output with the data. A `numpy.ndarray` or `list` of index values can also be accepted, but the final column will not have a title. x : numpy.ndarray Expects a numpy array containing the values of the predictor. I recommend feeding the `pandas.Series` with the added `.values` method to this argument. y : numpy.ndarray Expects a numpy array containing the values of the outcome. I recommend feeding the `pandas.Series` with the added `.values` method to this argument. Returns ------- pandas.DataFrame Returns a dataframe with individual-level model-computed measures and a Dataframe with model variables (slope, intercept, total sum of squares) """ #Prep objects we need variables_mat = np.column_stack((x, y)) #Concat independant and dependant variables n_indvar = np.shape(variables_mat)[1] - 1 #Grab number of independant variables #Create matrix of same shape as variables_mat, but each row is the average value of each column mean_mat = np.full((len(variables_mat), 2), np.mean(variables_mat, axis=0)) #Execute singular value decomposition on the variables minus the mean # (i.e., right singular values when substracting the mean from the values). v_mat = np.linalg.svd(variables_mat - mean_mat)[2] #Compute intercept and slope of the regression. This is based on the right singular values from # the SVD. intercept = -v_mat[0:n_indvar, n_indvar] / v_mat[n_indvar, n_indvar] slope = np.mean(np.matmul(variables_mat, v_mat[n_indvar,:])) / v_mat[n_indvar, n_indvar] #Compute model measures (fit, residuals, orthogonal distances, sum of squares) fitted_values = np.matmul( ## Numpy will whine if not setting type to object. np.array([variables_mat[:,0], 1], dtype=object), np.array([intercept, slope], dtype=object) ) residuals = variables_mat[:,1] - fitted_values normal = v_mat[:,n_indvar] #Not 100% sure what this does, but necessary to get the distances abs_orthogonal_distances = np.abs(np.matmul(variables_mat - mean_mat, normal)) #Orthogonal distances sig_orthogonal_distances = abs_orthogonal_distances*np.sign(residuals) #Same as above, but with the sign sum_square = sum(abs_orthogonal_distances**2) #Store the values for export individual_values = pd.DataFrame( data={"participant_id":index, "y_values":variables_mat[:,n_indvar], "fitted_values":fitted_values, "residuals":residuals, "absolute_orthogonal_distances":abs_orthogonal_distances, "signed_orthogonal_distances":sig_orthogonal_distances} ).set_index('participant_id') model_fit = pd.DataFrame( data={"slope":slope, "intercept":intercept,"total_sum_of_squares":sum_square}, index=['values'] ).T #Flip the table so the metrics are the index instead of columns return individual_values, model_fit
[docs]def _pre_mapping(data): """Create the 3D array necessary to store the orthogonal distances. Parameters ---------- data : pandas.DataFrame Dataframe containing the data to input to imbalance mapping. Returns ------- numpy.ndarray A 3D numpy.ndarray of size AxAxP, where A is the number of regions and P is the number of participants. """ num_regions = len(data.columns.values) #Compute number of brain regions num_participants = len(data) #Number of participants includes #Final array to hold the data per participant residual_array = np.empty((num_regions, num_regions, num_participants)) return residual_array
[docs]def imbalance_mapping(data, type='sign'): """Imbalance mapping function. For each column in the original data, compute covariance (i.e., regression) using orthogonal distance regression. Orthogonal distance is computed for each participant individually for each regression. We store the orthogonal distance for each participant, for each pair of brain region in a 3D symmetric matrix. The script gives the option of using either "absolute" or the "signed" distances. The sign add the distinction of showing whether a participant is below or above the regression line. Parameters ---------- data : pandas.DataFrame Dataframe where the index is the participant IDs and the columns are the brain data. type : str, optional Type of orthogonal to compute (signed or absolute), by default 'sign'. Returns ------- numpy.ndarray Returns a numpy.ndarray of size AxAxP, where A is the number of region and P is the number of participants. The array is filled with the orthogonal distances resulting from the orthogonal regression. """ #Compute variables needed to run residual_array = _pre_mapping(data) #3D matrix. Axis 0 and 1 are the brain regions, # axis 2 are participants #Loop over the data for i, region_i in enumerate(data): print(f'Computing region {i+1}/{len(data.columns)}') data_i = data[region_i] for j, region_j in enumerate(data): data_j = data[region_j] if type == 'abs': #Extract orthogonal distances from output of ODR odr_dist = odregression_single(index=data.index, x=data_i.values, y=data_j.values)[0]['absolute_orthogonal_distances'].values else: odr_dist = odregression_single(index=data.index, x=data_i.values, y=data_j.values)[0]['signed_orthogonal_distances'].values #Store the residuals in the 3D array. residual_array[i,j,:] = odr_dist #Order of the 3th dimension will match the input order return residual_array
[docs]def _by_person(residual_array): """Hidden function computing the mean orthogonal distance by participant. The 3rd dimension of the residual_array numpy.ndarray is the matrices of each individual participants. We extract the upper triangle of each matrix and compute the mean. This is equivalent to the Figure 1D in Nadig et al. (2021). Parameters ---------- residual_array : numpy.ndarray numpy.ndarray of size AxAxP, where A is the number of region and P is the number of participants. The array is filled with the orthogonal distances resulting from the orthogonal regression. Returns ------- list Returns a list of size P, where P is the number of participants. Each list element is the mean imbalance in 1 individual. """ list_means = [] for person in range(0, residual_array.shape[2]): #Extract person-wise array person_array = residual_array[:,:,person] #Extract upper triangle person_array_mod = person_array[np.triu_indices_from(person_array, k=1)] #Compute individuals' mean imbalance list_means.append(np.mean(person_array_mod)) return list_means
[docs]def _by_region(residual_array): """Hidden function computing the mean orthogonal distance by region across all participants. In numpy terms, this is equivalent to averaging over the first and third dimension of the matrix. The diagonal of the matrices on the third dimension are perfect correlations (i.e., correlation of region R to region R) so we remove them first. The result is equivalent to Figure 1E in Nadig et al. (2021). Parameters ---------- residual_array : numpy.ndarray numpy.ndarray of size AxAxP, where A is the number of region and P is the number of participants. The array is filled with the orthogonal distances resulting from the orthogonal regression. Returns ------- np.ndarray Returns a numpy array of size Ax1, where A is the number of regions. """ resid_copy = residual_array.copy() #Fill the diagonals of each person with missing values (as we don't want the correlation if i==j) for person in range(0, residual_array.shape[2]): np.fill_diagonal(resid_copy[:,:,person], np.NaN) return np.nanmean(resid_copy, axis=(0,2))
[docs]def _by_person_by_region(residual_array): """Hidden function computing the mean orthogonal distance by region for each participant. In numpy terms, this is equivalent to averaging over the first dimension of the matrix. The diagonal of the matrices on the third dimension are perfect correlations (i.e., correlation of region R to region R) so we remove them first. Parameters ---------- residual_array : numpy.ndarray numpy.ndarray of size AxAxP, where A is the number of region and P is the number of participants. The array is filled with the orthogonal distances resulting from the orthogonal regression. Returns ------- np.ndarray Returns a numpy array of size PxA, where P is the number of participants and A is the number of regions. """ resid_copy = residual_array.copy() #Fill the diagonals of each person with missing values (as we don't want the correlation if i==j) for person in range(0, residual_array.shape[2]): np.fill_diagonal(resid_copy[:,:,person], np.NaN) return np.nanmean(resid_copy, axis=0).T
[docs]def imbalance_stats(data, residual_array): """ Function computing the imbalance measures described in Nadig et al. (2021). We additionally compute an average imbalance by region by person. The function outputs three dataframes. Parameters ---------- data : pandas.DataFrame Dataframe containing the data used for computing the imbalance. We use it to extract the index and use that for the new dataframes residual_array : np.ndarray numpy.ndarray of size AxAxP, where A is the number of region and P is the number of participants. The array is filled with the orthogonal distances resulting from the orthogonal regression. Returns ------- pandas.DataFrame Returns three dataframes containing the measures computed for the imbalance analysis. """ #Compute imbalance metrics list_imbalance_person = _by_person(residual_array=residual_array) avg_by_region = _by_region(residual_array=residual_array) avg_imb_by_pers_by_region = _by_person_by_region(residual_array=residual_array) #Store the metrics in dataframes avg_imb_by_region = pd.DataFrame(index=data.columns.values, data=avg_by_region, columns=['avg_imbalance_by_region']) avg_imb_by_person = pd.DataFrame(index=data.index, data=list_imbalance_person, columns=['avg_imbalance_by_person']) avg_imb_by_pers_by_region = pd.DataFrame(index=data.index, data=avg_imb_by_pers_by_region, columns=data.columns.values) return avg_imb_by_region, avg_imb_by_person, avg_imb_by_pers_by_region
[docs]def export(data, residual_array, output_path, avg_imb_by_region, avg_imb_by_person, avg_imb_by_pers_by_region, name, all=False): """Function to export the results of the imbalance mapping to files. If requested by the user `sihnpy` will also output the individual orthogonal distances matrices. Parameters ---------- data : pandas.DataFrame Original dataframe containing the data to use with the imbalance mapping. residual_array : numpy.ndarray numpy.ndarray of size AxAxP, where A is the number of region and P is the number of participants. The array is filled with the orthogonal distances resulting from the orthogonal regression. output_path : str Local path where the data should be output. avg_imb_by_region : pandas.DataFrame Dataframe containing the average imbalance by region avg_imb_by_person : pandas.DataFrame Dataframe containing the average imbalance by person avg_imb_by_pers_by_region : pandas.DataFrame Dataframe containing the average imbalance by region, by person name : str String to add as a suffix for all the output (user's choice) all : bool, optional Whether the user wants to output all individual participants' matrices, by default False """ avg_imb_by_region.to_csv(f"{output_path}/avg_imbalance_by_region_{name}.csv") avg_imb_by_person.to_csv(f"{output_path}/avg_imbalance_by_person_{name}.csv") avg_imb_by_pers_by_region.to_csv(f"{output_path}/avg_imbalance_by_person_by_region_{name}.csv") if all is True: #Output all the individual imbalance mapping if user requests for i, person in enumerate(data.index.values): resid_i = residual_array[:,:,i] #Extract individual matrix for each participant np.fill_diagonal(resid_i, np.NaN) #Fill the diagonal with missing values #Save the matrices np.savetxt(f"{output_path}/imbalance_{person}_{name}.txt", resid_i, fmt='%1.3f')