Source code for sihnpy.spatial_extent

import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from scipy import stats
from matplotlib import pyplot as plt

# Spatial extent - Threshold derivation

[docs]def gmm_estimation(data_to_estimate, fix=False): """Function estimating a 1- and a 2-cluster solution Gaussian Mixture Model. The Bayesian Information Criteria is output and compared between the two models. Parameters ---------- data_to_estimate : pandas.DataFrame Data where each column needs to be fed to the GMM. Any column where a GMM should NOT be estimated should have been removed fix : bool, optional Whether `sihnpy` should remove regions where 1-component fits better the data than a 2-component model (using smallest Bayesian Information Criteria), by default False Returns ------- dict, pandas.DataFrame Returns a dictionary with the GMM objects from `scikit-learn` and a `pandas.DataFrame` where columns were removed if fix is applied. """ data = data_to_estimate.copy() #Copy the data to avoid modifying in place gm_estimations = {} #Dict to store GMM models col_rem_id = [] #List of columns to remove, if needed for col in data_to_estimate: print(f'GMM estimation for {col}') #For each column, sort the dataframe from lowest to highest and convert to 1D np.ndarray sorted_df = data.sort_values(by=col) roi_suvr = sorted_df[col].to_numpy().reshape(-1,1) #Need to reshape -1,1 since we feed #only 1 feature to the GMM #Estimate the GMM models (1 and 2 components) gm1 = GaussianMixture(n_components=1, random_state=667).fit(roi_suvr) gm2 = GaussianMixture(n_components=2, random_state=667).fit(roi_suvr) print(f"1-component: BIC = {gm1.bic(roi_suvr)} | 2-components: BIC = {gm2.bic(roi_suvr)} ") #Store GMM estimation object gm_estimations[col] = gm2 #In the case that 1 distribution works better, here are the options if gm1.bic(roi_suvr) <= gm2.bic(roi_suvr): print("---GMM estimation suggests that 1 component is a better fit to the data") #If we want to remove the column with 1 component, save the ID here. if fix is True: print(f"----Fix is True: Region {col} will be removed from further calculation") col_rem_id.append(col) del gm_estimations[col] else: print(f"----Fix is False: Region {col} will be kept in the data") #Remove columns, if errors in estimation AND fix is true clean_data = data.drop(col_rem_id, axis=1) return gm_estimations, clean_data
[docs]def _gmm_avg_sd(gm_obj): """Quick function extracting and returning the average and SD values of the two components from the GMM estimation Parameters ---------- gm_obj : sklearn.mixture.GaussianMixture Takes a GMM object as input Returns ------- dict Returns a dictionary for each GMM object, with the mean and SDs of each component. """ dict_gmm_measures = {} dict_gmm_measures[f'mean_comp1'] = gm_obj.means_[0][0] dict_gmm_measures[f'mean_comp2'] = gm_obj.means_[1][0] dict_gmm_measures[f'sd_comp1'] = np.sqrt(gm_obj.covariances_[0][0])[0] dict_gmm_measures[f'sd_comp2'] = np.sqrt(gm_obj.covariances_[1][0])[0] return dict_gmm_measures
[docs]def gmm_measures(cleaned_data, gm_objects, fix=False): """For all data kept after GMM estimation, this function computes the averages and SDs for both components.We then check that the order of the clusters is right and the measures are also used for the histograms in the `spex.gmm_histograms` function. Parameters ---------- cleaned_data : pandas.DataFrame Dataframe output from `spex.gmm_estimation`. gm_objects : dict Dictionary of the `sklearn.mixture.GaussianMixture` objects to extract measures from. fix : bool, optional If the mean of component 2 is lower than the mean of component 1, it suggests that the components are inverted. If fix is True, we remove the region from further calculations, by default False Returns ------- pandas.DataFrame, dict, dict Returns a Dataframe with clean data (if columns were removed by the fix), one dictionary with `sklearn.mixture.GaussianMixture` objects cleaned (if some estimations were removed) by fix and one dictionary with the averages/SDs of the two components, for regions kept. """ tmp_df = cleaned_data.copy() #To avoid modifying the input. final_gm_estimations = gm_objects.copy() #To avoid modifying the input rem_cols = [] #For fixing if needed gmm_measures = {} #To store the GMM averages and SDs for histograms for col, gm_obj in final_gm_estimations.items(): #Return averages and SDs of the components gmm_measures[col] = _gmm_avg_sd(gm_obj) #Check that average of second component is higher than the first component if gmm_measures[col][f'mean_comp1'] > gmm_measures[col][f'mean_comp2']: print(f"Average of first component of {col} is higher than second component.") if fix is True: print(f'- Fix is true, removing {col}') rem_cols.append(col) #Final fixes, if the user decides to remove a column, remove from everything final_data = tmp_df.drop(labels=rem_cols, axis=1) for key in rem_cols: #For each region to remove del final_gm_estimations[key] #Remove from GMM estimation del gmm_measures[key] #Remove from GMM measures return final_data, final_gm_estimations, gmm_measures
[docs]def gmm_probs(final_data, final_gm_estimations, fix=False): """Function extracting the probability to be in the "second" component (high abnormal values). Parameters ---------- final_data : pandas.DataFrame Cleaned dataframe output by `spex.gmm_measures` final_gm_estimations : dict Cleaned dictionary of `sklearn.mixture.GaussianMixture` objects output by `spex.gmm_measures` fix : bool, optional If inverted distributions are not removed in `spex.gmm_measures`, they can be manually inverted here by setting to True, by default False Returns ------- pandas.DataFrame Dataframe of the shape, index and columns from `final_data`. Contains probabilities of belonging to the "abnormal" distribution for each participant, for each region. """ probs_df = pd.DataFrame() #To store final data for col in final_data: sorted_df = final_data.sort_values(by=col) #Sort input index first so output matches probs = final_gm_estimations[col]\ .predict_proba(sorted_df[col].to_numpy().reshape(-1,1)) #Find probability of each cluster from the GMM #Check whether components' means are inverted if final_gm_estimations[col].means_[1][0] < final_gm_estimations[col].means_[0][0]: print(f'-Means for components of {col} are inverted.') if fix is True: print(f'----Fix is True: Inverting the components...') tmp_df = pd.DataFrame(data=probs[:,0], index=sorted_df.index, columns=[col]) #Extract and store the probability of cluster 1 when inversion issue. else: print(f'----Fix is False: Leaving as is.') tmp_df = pd.DataFrame(data=probs[:,1], index=sorted_df.index, columns=[col]) #Extract and store the probability of cluster 2 else: #In any other case (no fixing, or fixing but components not inverted...) #Grab the probabilities of the second cluster tmp_df = pd.DataFrame(data=probs[:,1], index=sorted_df.index, columns=[col]) #Extract and store the probability of cluster 2 probs_df = pd.concat([probs_df, tmp_df], axis=1) #Store probabilities for export return probs_df
[docs]def _gmm_density_histogram(regional_data, regional_gmm_measures, col, dist_2=True): """Histogram of the value DENSITIES with overlayed density function for each GMM cluster. Density is the count of each bin, divided by the total number of counts and the bin width. (Ref: Matplotlib documentation) This option is necessary to see the density curves. Parameters ---------- regional_data : pandas.Series Single column from the `final_data` object representing the data in one region. regional_gmm_measures : dict Dictionary containing the mean and SD of each component. col : str String containing the name of the region. Used mostly for labels on the graphs. dist_2 : bool, optional Whether we want to plot one or two density functions (True == two), by default True Returns ------- matplotlib.pyplot.figure Returns matplotlib figure """ fig = plt.figure() #Instantiate figure plt.hist(regional_data, 50, density=True, facecolor='b', alpha=0.75) #Create histogram for the data plt.plot(np.sort(regional_data), stats.norm.pdf(np.sort(regional_data), regional_gmm_measures['mean_comp1'], regional_gmm_measures['sd_comp1']), color='green', linewidth=4) #Plots the density of component 1 if dist_2 is True: plt.plot(np.sort(regional_data), stats.norm.pdf(np.sort(regional_data), regional_gmm_measures['mean_comp2'], regional_gmm_measures['sd_comp2']), color='red', linewidth=4) #Plots the density of component 2 plt.xlabel(f'Distribution of values {col}') plt.ylabel('Density of binned values') return fig
[docs]def _gmm_raw_histogram(regional_data, col): """Generates a simple histogram of the values in a given region. Can plot both the probabilities and the raw values, as needed. Parameters ---------- regional_data : pandas.Series Single column of data for a single region (data or probabilities) col : str Name of the region of interest Returns ------- matplotlib.pyplot.figure Returns matplotlib figure """ fig = plt.figure() #Instantiate figure plt.hist(regional_data, 50, density=False, facecolor='b', alpha=0.75) plt.xlabel(f'Distribution of values {col}') plt.ylabel('Frequency of binned values') return fig
[docs]def gmm_histograms(final_data, gmm_measures, probs_df, dist_2=True, type="density"): """Optional function plotting histograms from the raw data, with overlayed density functions for both clusters. Parameters ---------- final_data : pandas.DataFrame Dataframe from `spex.gmm_measures` with final columns to plot. gmm_measures : dict Nested dictionary containing the mean and SDs of each component, for each region. probs_df : pandas.DataFrame Dataframe of the probabilities of belonging to the "abnormal" distribution, from the `spex.gmm_probs` function. dist_2 : bool, optional Whether we want to plot one or two density functions (True == two) if we plot density, by default True type : str, optional Type of histogram to plot ("density", "raw", "probs", "all"), by default "density". Returns ------- dict Returns a dictionary of matplotlib figures. """ dict_fig = {} #For each region for col in final_data: if type == "density": #Density plot hist_dens = _gmm_density_histogram(final_data[col], gmm_measures[col], col=col, dist_2=dist_2) dict_fig[f'hist_density_{col}'] = hist_dens #Store in dictionary for export plt.close(hist_dens) #Close figure window to reduce memory usage elif type == "raw": hist_raw = _gmm_raw_histogram(final_data[col], col=col) dict_fig[f'hist_raw_{col}'] = hist_raw plt.close(hist_raw) elif type == "probs": hist_probs = _gmm_raw_histogram(probs_df[col], col=col) dict_fig[f'hist_probs_{col}'] = hist_probs plt.close(hist_probs) elif type == "all": hist_dens = _gmm_density_histogram(final_data[col], gmm_measures[col], col=col, dist_2=dist_2) hist_raw = _gmm_raw_histogram(final_data[col], col=col) hist_probs = _gmm_raw_histogram(probs_df[col], col=col) dict_fig[f'hist_density_{col}'] = hist_dens dict_fig[f'hist_raw_{col}'] = hist_raw dict_fig[f'hist_probs_{col}'] = hist_probs plt.close(hist_dens) plt.close(hist_raw) plt.close(hist_probs) return dict_fig
[docs]def gmm_threshold_deriv(final_data, probs_df, prob_threshs, improb=None): """Function deriving the actual thresholds based on the probabilities of belonging to the "abnormal" distribution. Depending on the threshold value used, the probability of belonging to a given component can be inverted (e.g., the 50% probability threshold may have a higher value than the 90% threshold.). This usually happens when the second component is very spread out and overlaps with the first component. If that is the case, the use of the `improb` argument is recommended. Also note that to give more flexibility to the user, `sihnpy` allows for a list of thresholds to be given to derive multiple thresholds. However, `sihnpy` doesn't check whether the order of the thresholds make sense (e.g., that 50% comes before 90%) and assumes the user put them in the right order. It is up to the user to check this once the thresholds are derived. Parameters ---------- final_data : pandas.DataFrame Final data derived from `spex.gmm_measures`. probs_df : pandas.DataFrame Dataframe containing the probabilities of belonging to the "abnormal" distribution, from the `spex.gmm_probs` function. prob_threshs : list of float List of thresholds to apply to the data. Thresholds have to range between 0 and 1. improb : float, optional Value below which an "abnormal" value is improbable or impossible. Useful in the case that the GMM is very spread out, by default None Returns ------- pandas.DataFrame Dataframe where rows are the regions and columns are the thresholds derived from the probabilities. """ if not isinstance(prob_threshs, list): return "Error: The threshold derivation is expecting a list of values, even if only 1 threshold is given." #Create empty dataframe to store all the thresholds thresh_df = pd.DataFrame(index=final_data.columns.values, data=None) #Sort the thresholds to get the lowest to the highest probability. prob_threshs.sort() #For each threshold... for thresh in prob_threshs: #For each region... for col in final_data: #Sort the indices of the raw values and probabilities to make sure they match raw_sorted = final_data[col].sort_index(level=0) probs_sorted = probs_df[col].sort_index(level=0) #Quick check that the raw values and probabilities have the same index # (i.e., number of participants) if raw_sorted.index.equals(probs_sorted.index) is False: return "Error: The raw data and probability data don't share the same index." #Find the closest probability to the probability threshold id_prob_min = (np.abs(probs_sorted - thresh)).argmin() #Find the raw value of the participant with closest probability to the threshold thresh_value = raw_sorted.iloc[id_prob_min] #In some cases, the probability assigned by the GMM causes the thresholds # to be dramatically low. We can fix it by forcing values that would # be improbable and ignoring values that are improbable when creating # the threshold. This step is optional if improb is not None: #If the threshold is below the biologically plausible value if thresh_value < improb: print(f"Threshold for {col} is improbable. Fixing.") fix = False i = 0 #While the value is improbable, keep trying to find a value while fix is False: #Set the old probability to 0 to ignore it probs_sorted.iloc[id_prob_min] = 0 #Recompute the closest probability and find threshold id_prob_min = (np.abs(probs_sorted - thresh)).argmin() thresh_value = raw_sorted.iloc[id_prob_min] #If the value is higher than the improbable value, we are done if thresh_value > improb: #Save the new value and switch the loop off thresh_df.loc[col, f'thresh_{thresh}'] = thresh_value fix = True #In case no values go over the improbable threshold after a few tries, # we forcibly set that threshold to missing. else: i += 1 if i == 10: print(f"---Can't fix thresholds for {col}. Setting to missing.") thresh_df.loc[col, f'thresh_{thresh}'] = np.NaN fix = True else: #If the value is not improbable, simply save it thresh_df.loc[col, f'thresh_{thresh}'] = thresh_value else: #If the correction is turned off, we simply save the value regardless thresh_df.loc[col, f'thresh_{thresh}'] = thresh_value return thresh_df
[docs]def export_histograms(hist_dict_fig, output_path, name): """ Exporting the histograms to file, if requested by user. Will export ALL histograms saved to the dictionary Parameters ---------- hist_dict_fig : dict Dictionary of histogram figures from `spex.gmm_histograms` output_path : str String of the path to where the output should go name : str Name that should be tacked at the end of the file name, depending on the user's conventions. """ for type_hist, hist in hist_dict_fig.items(): hist.savefig(f'{output_path}/{type_hist}_{name}.png', dpi=500)
[docs]def export_threshs(final_data, probs_data, thresh_df, output_path, name): """ Wrapper function exporting the final data used and the probability data to files. Parameters ---------- final_data : pandas.DataFrame Final data derived from `spex.gmm_measures`. probs_df : pandas.DataFrame Dataframe containing the probabilities of belonging to the "abnormal" distribution, from the `spex.gmm_probs` function. thresh_df : pandas.DataFrame Dataframe containing the thresholds we just derived output_path : str String of the path to where the output should go name : str Name that should be tacked at the end of the file name, depending on the user's conventions. """ final_data.to_csv(f"{output_path}/final_data_derived_{name}.csv") probs_data.to_csv(f"{output_path}/probabilities_clust2_{name}.csv") thresh_df.to_csv(f"{output_path}/thresholds_{name}.csv")
# Spatial extent - Threshold application
[docs]def apply_clean(data_to_apply, thresh_data, index_name=None): """Function doing basic cleaning on the spatial extent and thresholds; just sorts the rows and make sure they match between the thresholds and data to apply. Parameters ---------- data_to_apply : pandas.DataFrame Data on which we want to apply thresholds. Columns should match rows of `thresh_data`. thresh_data : pandas.DataFrame Thresholds to be applied to the data. Rows should match columns of `data_to_apply`. index_name : str, optional String indicating the name of the column that should be considered as the `pandas.DataFrame.Index`. By default, assume it's already set; by default None Returns ------- pandas.DataFrame Returns `pandas.DataFrame` of the data, where the columns of the data shares the same order as the rows of the thresholds. """ #If the index name is given by the user, we set the index first or it will be discarded later. if index_name is not None: #Set index, keep columns that match the index of the threshold file and sort columns data_to_apply_clean = data_to_apply\ .set_index(index_name)\ .filter(items=thresh_data.index, axis=1)\ .sort_index(axis=1) else: data_to_apply_clean = data_to_apply\ .filter(items=thresh_data.index, axis=1)\ .sort_index(axis=1) #Filter rows to keep only those that appear in the columns of the data and sort the index thresh_data_clean = thresh_data\ .filter(items=data_to_apply_clean.columns, axis=0)\ .sort_index(axis=0) if len(thresh_data_clean) != len(data_to_apply_clean.columns): return f"Error: Rows in the threshold data ({len(thresh_data_clean)}) doesn't equal to the columns in the data to apply ({len(data_to_apply.columns)})" return data_to_apply_clean, thresh_data_clean
[docs]def apply_masks(data_to_apply_clean, thresh_data_clean): """Function applying the thresholds to the data, resulting in binary masks. The binary masks have the same shape as the original data (rows are participants, columns are regions). The number of masks depends on the number of thresholds (columns) in `thresh_data_clean`. Parameters ---------- data_to_apply_clean : pandas.DataFrame Data to which we want to apply the spatial extent, where columns are regions and rows are participants. From `spex.apply_clean`. thresh_data_clean : pandas.DataFrame Dataframe containing the threshold data, where rows are regions and columns are thresholds. From `spex.apply_clean` Returns ------- dict Returns a dictionary of `pandas.DataFrame`s, where each `DataFrame` contains binary values for each region, for each participant. """ dict_masks = {} #For each threshold value... for threshold in thresh_data_clean: #Create an empty DF to store all the regional binary values tmp_df = pd.DataFrame(index=data_to_apply_clean.index) #For each region in the DF we want to apply the thresholds to for region in data_to_apply_clean: #If the region has a threshold available if region in thresh_data_clean.index: #Apply the threshold, where 1 is above or equal to threshold tmp_df[region] = np.where(data_to_apply_clean[region] >= thresh_data_clean.loc[region, threshold], 1, 0) #Store the data in a dictionary, classified by thresholds dict_masks[f'{threshold}'] = tmp_df return dict_masks
[docs]def apply_index(data_to_apply_clean, dict_masks): """Create the spatial extent index, which is the sum of regions that are above the threshold. In the case where multiple thresholds are available we output the sum of each thresholds individually, as well as the total sum of all thresholds together. Parameters ---------- data_to_apply_clean : pandas.DataFrame Original dataframe cleaned with `spex.apply_clean`. Only used to get the index to ensure the spatial extent is the same order. dict_masks : dict Dictionary containing all the binary masks from `spex.apply_masks` Returns ------- pandas.DataFrame Dataframe containing the spatial extent index for each threshold. """ #Create empty dataframe with the same index as the original data spex_metrics = pd.DataFrame(index=data_to_apply_clean.index) #Compute the spatial extent, by summing rows (i.e., within each participant) for threshold_val, masks in dict_masks.items(): spex_metrics[f'spatial_extent_{threshold_val}'] = masks.sum(axis=1) #Sum the rows to get spatial extent index #If more than 1 threshold, we also compute a sum of regions for all thresholds if len(spex_metrics.columns) > 1: spex_metrics[f'spatial_extent_sum_all'] = spex_metrics.sum(axis=1) return spex_metrics
[docs]def apply_ind_mask(data_to_apply_clean, dict_masks): """Another way to leverage the spatial extent is by creating individualized spatial extent masks. The idea is that simply add weights to the original data, based on the probability of being abnormal in a given region. For instance, if a participant has a 90% probability of being positive, vs a 50% probability of being positive, we give more weight to the 90% probability value by multiplying it by a different constant. Parameters ---------- data_to_apply_clean : pandas.DataFrame Original dataframe cleaned with `spex.apply_clean`. dict_masks : dict Dictionary containing all the binary masks from `spex.apply_masks` Returns ------- dict Dictionary of individualized spatial extent masks. """ spex_ind_masks = {} dict_masks_copy = dict_masks.copy() tmp_data = pd.DataFrame() #If more than one mask, create a sum of masks if len(dict_masks) > 1: for masks in dict_masks.values(): tmp_data = tmp_data.add(masks, fill_value=0) #If original value was missing, the result will be missing. We force 0 to be everywhere #Store that new mask in the dictionary for output dict_masks_copy['mask_spatial_extent_sum_all'] = tmp_data #For each mask, compute the individualized mask for threshold_vals, masks in dict_masks_copy.items(): spex_ind_masks[threshold_vals] = data_to_apply_clean.multiply(masks)\ .replace(to_replace={0:np.NaN}) return spex_ind_masks
[docs]def export_spex_metrics(spex_metrics, output_path, name): """Function to export the spatial extent metrics. Parameters ---------- spex_metrics : pandas.DataFrame Dataframe containing the spatial extent indices. output_path : str Path where the dataframe should be output. name : str String that should be tacked at the end of the file name based on user convention. """ spex_metrics.to_csv(f"{output_path}/spex_metrics_{name}.csv")
[docs]def export_spex_bin_masks(dict_masks, output_path, name): """Function to export the binary masks. Parameters ---------- dict_masks : dict Dictionary of binary masks where the thresholds were applied. output_path : str Path where the dataframe should be output. name : str String that should be tacked at the end of the file name based on user convention. """ for name_mask, masks in dict_masks.items(): masks.to_csv(f"{output_path}/spex_bin_mask_{name_mask}_{name}.csv")
[docs]def export_spex_ind_masks(spex_ind_masks, output_path, name): """Function to export the individualized spatial extent masks Parameters ---------- spex_ind_masks : dict Dictionary of individualized spatial extent masks output_path : str Path where the dataframe should be output. name : str String that should be tacked at the end of the file name based on user convention. """ for name_mask, masks in spex_ind_masks.items(): masks.to_csv(f"{output_path}/spex_ind_mask_{name_mask}_{name}.csv")