# Source code for sihnpy.sliding_window

import os
import math

import pandas as pd
import numpy as np

# Sliding-window

def bins(data, var, w_size, s_size, collapse=False):
    """Estimate the number of windows needed for a sliding-window analysis.

    Parameters
    ----------
    data : pandas.DataFrame
        Data of the sample containing the variable `var` to use for sorting and sliding.
    var : str
        Name (string) of the column to use for sorting
    w_size : int
        Integer representing the window size (i.e., number of participants per window).
        Must be at least 1.
    s_size : int
        Integer representing the step size (i.e., number of non-overlapping participants per
        window). Must be at least 1.
    collapse : bool, optional
        Switch determining if the last window has a larger (truthy) or smaller (falsy) number
        of participants, by default False

    Returns
    -------
    int or str
        The number of windows to use based on the data and parameters provided (always at
        least 1). If `var` contains missing values, a message (string) explaining that the
        number of windows could not be computed is returned instead.

    Raises
    ------
    ValueError
        If `w_size` or `s_size` is smaller than 1.
    """

    # A zero step would divide by zero below; negative sizes give meaningless counts.
    if w_size < 1 or s_size < 1:
        raise ValueError(
            f"w_size and s_size must be at least 1 (got w_size={w_size}, s_size={s_size})"
        )

    # Check missing values. If missing, we can't compute.
    if data[var].isnull().values.any():
        return f"Couldn't compute number of bins: there are missing values in '{var}'"

    # Only the number of participants matters here, so no sorting is needed.
    n_sub = len(data)

    # Number of steps needed after the first window to reach the end of the sample.
    n_steps = math.ceil((n_sub - w_size) / s_size)

    if collapse:
        # The leftover participants are folded into the last window, making it larger.
        # NOTE(review): when (n_sub - w_size) is an exact multiple of s_size this still drops
        # one window relative to collapse=False -- confirm that this is intended.
        print("Collapse is True: the last window may have a larger number of participants")
        n_bin = n_steps
    else:
        # The leftover participants get their own, possibly smaller, last window.
        print("Collapse is False: the last window may have a smaller number of participants")
        n_bin = n_steps + 1

    # When the sample is not larger than one window, the formula yields 0 or a negative
    # count; a single window holding every participant is the only sensible answer.
    n_bin = max(n_bin, 1)
    print(f'Number of windows: {n_bin}')

    return n_bin

def build_windows(data, var, w_size, s_size, n_bin):
    """Derive which participants belong to each window.

    Participants are sorted by `var` in ascending order and then sliced by position. Every
    window except the last holds `w_size` consecutive participants, and each window starts
    `s_size` participants after the previous one. The last window takes every participant
    from its starting position to the end of the sample.

    Note: In the original script, the code creating "bin_list" has an extra +1. This was because R
    is 1-indexed. However, Python is 0-indexed, so it needs to start at 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Data of the sample containing the variable `var` to use for sorting and sliding.
    var : str
        Name (string) of the column to use for sorting
    w_size : int
        Integer representing the window size (i.e., number of participants per window)
    s_size : int
        Integer representing the step size (i.e., number of non-overlapping participants per
        window)
    n_bin : int
        Number of windows to derive

    Returns
    -------
    dict or str
        Dictionary where the keys are the names of the windows
        (``ww<w_size>_sts<s_size>_w<NN>``) and the values are column-less pandas.DataFrame
        objects whose index holds the IDs of the participants in each window. If `var`
        contains missing values, a message (string) is returned instead.
    """

    # Check missing values. If missing, we can't compute.
    if data[var].isnull().values.any():
        return f"Couldn't compute number of bins: there are missing values in '{var}'"

    # Sort the variable of interest, keep only the sorting variable
    sorted_df = data.sort_values(by=var, axis=0, ascending=True).filter(items=[var], axis=1)

    w_store = {}  # Store the windows once computed

    for win_idx in range(n_bin):
        # 1-based ID, matching the original R code and giving friendlier printing/naming.
        win_id = win_idx + 1
        print(f"Creating bin {win_id}")

        start = s_size * win_idx
        # The last window has no upper bound: it grabs all remaining participants.
        stop = None if win_id == n_bin else start + w_size
        members = sorted_df.iloc[start:stop]

        # Zero-pad the window number to two digits so that names sort correctly.
        bin_name = f"ww{w_size}_sts{s_size}_w{win_id:02d}"

        # Dropping every column keeps only the index (the participant IDs).
        w_store[bin_name] = members.filter(items=[])

    return w_store

def data_by_window(w_store, data):
    """Split the data into one dataframe per window.

    Parameters
    ----------
    w_store : dict
        Dictionary containing the window labels (keys) and, for each window, a
        pandas.DataFrame whose index holds the IDs of the participants in that window.
    data : pandas.DataFrame
        Dataframe containing the data to split in windows. Its index must use the same IDs
        as the dataframes stored in `w_store`.

    Returns
    -------
    dict
        Dictionary where the keys are the labels of the windows and the values are the dataframes
        split for each window.
    """

    w_data = {}  # Dict to store the data in windows

    for label, win_ids in w_store.items():
        print(f'Reconstructing data for window {label}')
        # Left join on the index: every ID of the window is kept, in the window's order,
        # and the matching rows of `data` are attached to it.
        w_data[label] = win_ids.merge(data, left_index=True, right_index=True, how='left')

    return w_data

def sum_by_window(w_data, var):
    """Compute summary measures of a variable for each window.

    Typically used on the variable that was used for the sliding-window, but it can be used on
    other variables in the data, as long as the variables are continuous.

    Parameters
    ----------
    w_data : dict
        Dictionary containing the data for each window.
    var : str
        String representing the name of the variable to generate stats for.

    Returns
    -------
    pandas.DataFrame
        Dataframe indexed by window label (index named ``window``) with one row per window
        and the columns ``mean_<var>``, ``median_<var>``, ``sd_<var>``, ``min_<var>`` and
        ``max_<var>``. It is empty (but has the same columns) when `w_data` is empty.
    """

    # Fixing the columns up front keeps the output well-formed even when there is no window.
    columns = ['window', f"mean_{var}", f"median_{var}", f"sd_{var}", f"min_{var}",
               f"max_{var}"]

    # Collect one record per window and build the dataframe once, rather than
    # concatenating a new dataframe at every iteration.
    rows = []
    for label, win_data in w_data.items():
        values = win_data[var]
        rows.append({'window': label,
                     f"mean_{var}": values.mean(),
                     f"median_{var}": values.median(),
                     f"sd_{var}": values.std(),
                     f"min_{var}": values.min(),
                     f"max_{var}": values.max()})

    w_summary = pd.DataFrame(rows, columns=columns)

    return w_summary.set_index('window')

def export_data(w_data, w_summary, var, path, name):
    """Export the sliding-window information to disk.

    For each window, two files are written in `path`: ``full_data_<label>_<name>.csv`` (the
    full data of the window) and ``ids_<label>_<name>.txt`` (the IDs in the window, one per
    line). The summary is written to ``summary_<var>_by_window_<name>.csv``.

    Parameters
    ----------
    w_data : dict
        Dictionary where the keys are the window labels and the values are the
        pandas.DataFrame of each window.
    w_summary : pandas.DataFrame
        Dataframe of summary measures by window.
    var : str
        Name of the summarized variable; only used to build the summary file name.
    path : str
        Directory where the files are written.
    name : str
        Suffix appended to the name of every file written.
    """

    # Save data for individual windows
    for label, win_df in w_data.items():

        # Save full data for each window
        win_df.to_csv(os.path.join(path, f'full_data_{label}_{name}.csv'))

        # Save IDs for each window. np.savetxt writes in text mode, which already converts
        # "\n" to the platform line ending; passing os.linesep would give "\r\r\n" on Windows.
        np.savetxt(os.path.join(path, f'ids_{label}_{name}.txt'), win_df.index.values,
                   fmt='%s')

    # Save the summaries by window
    w_summary.to_csv(os.path.join(path, f'summary_{var}_by_window_{name}.csv'))