Source code for sciSOM.SOM_recall.recall

import numpy as np
from typing import Any, Union, Dict
#import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import numpy.lib.recfunctions as rfn
import viff

# Lets organize this a bit better
# Need to move image manipulation functions to a separate file

[docs]
def assign_labels(data: np.ndarray, 
                  ref_img: np.ndarray, 
                  xdim: int, ydim: int, 
                  cut_out: int) -> tuple[np.ndarray, np.ndarray]:
    '''
    Assigns labels to the data based on the reference image from the SOM
    
    This functions takes in the data and classifications based on an image gives the
    unique labels as well as the data set bacl with the new classification
    PS this version only takes in S1s and S2s and ignores unclassified samples, 
    another version will be made to deal with the unclassified samples.
    
    Parameters
    ----------

    data : np.ndarray
        can be either peaks or peak_basics
    ref_img : np.ndarray
        will be the image extracted from the SOM classification of each data point
    xdim : int
        width of the image cube
    ydim : int
        height of the image cube
    cut_out : removes the n last digits of the image vector if necesarry
    
    Returns
    -------
    colorp : np.ndarray
        list of unique colors in the image
    data_new : np.ndarray
        structured array with the new classification
    '''

    # This relays on reference images which is a NeuroScope implementation
    # 

    from PIL import Image
    data_new = data
    img = Image.open(ref_img)
    imgGray = img.convert('L')
    img_color = np.array(img) #still in the x,y,3 format
    img_color_2d = img_color.reshape((xdim*ydim,3))
    label = -1 * np.ones(img_color.shape[:-1])
    colorp = np.unique(img_color_2d, axis = 0)
    for i, color in enumerate(colorp):  # argwhere
        label[np.all((img_color == color), axis = 2)] = i #assignes each color a number
    label_vec = label.reshape((xdim*ydim))
    if cut_out != 0:
        label_vec_nonzero = label_vec[:-cut_out]
    elif cut_out == 0:
        label_vec_nonzero = label_vec
    #s2_data = data[data['type'] == 2]
    #s1_data = data[data['type'] == 1]
    print(label_vec_nonzero)
    print(len(label_vec_nonzero))
    print(type(label_vec_nonzero))
    data_new['type'] = label_vec_nonzero.astype(int)

    return colorp, data_new




[docs]
def affine_transform(data: np.ndarray, 
                     target_min: Union[float, np.ndarray], 
                     target_max: Union[float, np.ndarray]) -> np.ndarray:
    """
    Takes a set of data an applies a affine transfrom to scale it. 
    The first axis is expected to be the number of data samples, the second
    axis is expected to be the number of features.

    Parameters
    ----------

    data : np.ndarray
        Input data to apply the affine transform to.
    target_min : float or np.ndarray
        Minimum of the target space
    target_max : float or np.ndarray
        Maximum of the target space

    Returns
    -------
    normalized_data : np.ndarray
        Data after the affine transformation
    """
    _, dim = np.shape(data)
    data_min = np.min(data, axis = 0)
    data_max = np.max(data, axis = 0)  

    if np.isscalar(target_min):
        target_min = np.repeat(target_min, dim)
    if np.isscalar(target_max):
        target_max = np.repeat(target_max, dim) 

    if (data_max == data_min).any():
        raise ZeroDivisionError('Data has no variance')
    
    normalized_data = ((data - data_min)/(data_max-data_min))*(target_max-target_min) + target_min
    return normalized_data

    
# I should make a separate file for image manipulation functions

[docs]
def select_middle_pixel(img_as_np_array: np.ndarray, 
                        pxl_per_block: int = 12) -> np.ndarray:
    """
    Selects the middle pixel of each cell in the image.

    Image resulting from NS have cells of about 12 pixels, we want to reduce the
    image to 1 pixel per cell, so we will take the middle pixel.
    Since images have their 0 index at the top and np arrays start at the bottom
    we have to filp the image across the y-axis.

    Parameters
    ----------
    img_as_np_array : np.ndarray
        Image as a numpy array
    pxl_per_block : int
        Number of pixels per block in the image, defualt set to 12

    Returns
    -------
    SOM_img_clusters : np.ndarray
        Image with 1 pixel per cell
    """
    [width, height, depth] = img_as_np_array.shape
    #img_flipped = np.flip(img_as_np_array, 0) # image indexing start at the top and go down
                                              # this fixes this issue.
    img_flipped = img_as_np_array
    SOM_width = int(width/pxl_per_block)
    SOM_height = int(height/pxl_per_block)
    
    SOM_img_clusters = np.zeros([SOM_width, SOM_height, depth])
    
    for col in np.arange(SOM_width):
        #print(f'col number is : {col}')
        for row in np.arange(SOM_height):
            #print(f'Number in computation is {pxl_per_block/2 + (row*12)}')
            SOM_img_clusters[col, row, :] = img_flipped[int(pxl_per_block/2) + (col*12), 
                                                        int(pxl_per_block/2) + (row*12), :]
            
    return SOM_img_clusters



[docs]
def recall_populations(dataset: np.ndarray, 
                       weight_cube: np.ndarray, 
                       SOM_cls_img: np.ndarray, 
                       norm_factors: np.ndarray) -> np.ndarray:
    """
    Recalls data from a SOM weight cube and assigns a population label to each data point.

    Master function that should let the user provide a weightcube,
    a reference img as a np.array, a dataset and a set of normalization factors.
    In theory, if these 5 things are provided, this function should output
    the original data back with one added field with the name "SOM_type"
    Here we will assume that the data has been preprocessed in the SOM
    input format.

    Parameters
    ----------

    weight_cube : np.array
        SOM weight cube (3D array)
    SOM_cls_img : 
      SOM reference image as a numpy array
    dataset :          
        Data to preform the recall on should be a structured array
    normfactos :       
        A set of numbers (equal to dimensionality of the data) 
        to normalize the data so we can preform a recall

    Returns
    -------
    output_data : np.ndarray
        Data with the SOM classification added as a field
    """
    [SOM_xdim, SOM_ydim, SOM_zdim] = weight_cube.shape
    [IMG_xdim, IMG_ydim, IMG_zdim] = SOM_cls_img.shape
    unique_colors = np.unique(np.reshape(SOM_cls_img, [SOM_xdim * SOM_ydim, 3]), axis=0)
    # Checks that the reference image matches the weight cube
    assert SOM_xdim == IMG_xdim, f'Dimensions mismatch between SOM weight cube ({SOM_xdim}) and reference image ({IMG_xdim})'
    assert SOM_ydim == IMG_ydim, f'Dimensions mismatch between SOM weight cube ({SOM_ydim}) and reference image ({IMG_ydim})'

    # Get the deciles representation of data for recall
    #decile_transform_check = data_to_log_decile_log_area_aft(dataset, norm_factors)
    # preform a recall of the dataset with the weight cube
    # assign each population color a number (can do from previous function)
    ref_map = generate_color_ref_map(SOM_cls_img, unique_colors, SOM_xdim, SOM_ydim)
    SOM_cls_array = np.empty(len(dataset))
    SOM_cls_array[:] = np.nan
    # Make new numpy structured array to save the SOM cls data
    data_with_SOM_cls = rfn.append_fields(dataset, 'SOM_type', SOM_cls_array)
    # preforms the recall and assigns SOM_type label
    output_data = SOM_cls_recall(data_with_SOM_cls, dataset, weight_cube, ref_map)
    return output_data['SOM_type']




[docs]
def generate_color_ref_map(color_image: np.ndarray, 
                           unique_colors: np.ndarray) -> np.ndarray:
    """
    Generate a map where the color image representing the labels of the
    som weight cube.

    Parameters
    ----------

    color_image : np.ndarray
        image made by the remap compressed to the SOM size
    unique_colors : np.ndarray
        unique colors found in the image (also represent # of clusters)

    Returns
    -------
    ref_map : np.ndarray
        reference map for the SOM
    """
    xdim, ydim, _ = np.shape((color_image))
    ref_map = np.zeros((xdim, ydim))
    for color in np.arange(len(unique_colors)):
        mask = np.all(np.equal(color_image, unique_colors[color, :]), axis=2)
        indices = np.argwhere(mask)  # generates a 2d mask
        for loc in np.arange(len(indices)):
            ref_map[indices[loc][0], indices[loc][1]] = color
    return ref_map




[docs]
def SOM_cls_recall(array_to_fill: np.ndarray, 
                   data_in_SOM_fmt: np.ndarray, 
                   weight_cube: np.ndarray, 
                   reference_map: np.ndarray) -> np.ndarray:
    """
    Takes the data, the weight cube and the classification map and assignes each
    data point a label based on their cluster.

    Parameters
    ----------
    array_to_fill : np.ndarray
        structured array to fill with the classification
    data_in_SOM_fmt : np.ndarray
        data to classify in the SOM format
    weight_cube : np.ndarray
        SOM weight cube
    reference_map : np.ndarray
        reference map for the SOM

    Returns
    -------
    array_to_fill : np.ndarray
        structured array with the SOM classification added
    """

    # Want to make it so it works with different metrics in the future
    [SOM_xdim, SOM_ydim, _] = weight_cube.shape
    distances = cdist(weight_cube.reshape(-1, weight_cube.shape[-1]), data_in_SOM_fmt, metric='euclidean')
    w_neuron = np.argmin(distances, axis=0)
    x_idx, y_idx = np.unravel_index(w_neuron, (SOM_xdim, SOM_ydim))
    array_to_fill['SOM_type'] = reference_map[x_idx, y_idx]
    return array_to_fill



[docs]
def SOM_location_recall(normalized_data: np.ndarray, 
                        weight_cube: np.ndarray,) -> np.ndarray:
    """
    Takes the data, the weight cube and the classification map and assignes each
    data point a label based on their cluster.

    Parameters
    ----------
    array_to_fill : np.ndarray
        structured array to fill with the classification
    data_in_SOM_fmt : np.ndarray
        data to classify in the SOM format
    weight_cube : np.ndarray
        SOM weight cube
    reference_map : np.ndarray
        reference map for the SOM

    Returns
    -------
    array_to_fill : np.ndarray
        structured array with the SOM classification added
    """

    # Want to make it so it works with different metrics in the future
    array_to_fill = np.empty(len(normalized_data), 2)
    [SOM_xdim, SOM_ydim, _] = weight_cube.shape
    distances = cdist(weight_cube.reshape(-1, weight_cube.shape[-1]), normalized_data, metric='euclidean')
    w_neuron = np.argmin(distances, axis=0)
    x_idx, y_idx = np.unravel_index(w_neuron, (SOM_xdim, SOM_ydim))
    array_to_fill = np.vstack((x_idx, y_idx))
    return array_to_fill.transpose()



[docs]
def create_mapping_dict(output_classes: Union[list, np.ndarray], 
                        dataset_classes: Union[list, np.ndarray]) -> Dict:
    """
    Create a mapping dictionary from output classes to dataset classes.

    Parameters
    ----------
    output_classes : np.ndarray or list
        List of output classes from the neural network.
    dataset_classes : np.ndarray or list
        List of corresponding dataset classes.

    Returns
    -------
    mapping_dict : dict
        A dictionary mapping output classes to dataset classes.
    """
    if len(output_classes) != len(dataset_classes):
        raise ValueError("The number of output classes must match the number of dataset classes.")
    return dict(zip(output_classes, dataset_classes))



[docs]
def map_output_to_dataset(output_classes: np.ndarray, mapping_dict: np.ndarray) -> np.ndarray:

    """
    Map output classes to dataset classes using the mapping dictionary.

    Parameters
    ----------
    output_array : np.ndarray
        Array of output classes from the neural network.
    mapping_dict : dict
        Dictionary mapping output classes to dataset classes.

    Returns
    -------
    mapped_array : np.ndarray
        Array of dataset classes corresponding to the output classes.
    """
    mapped_array = np.vectorize(mapping_dict.get)(output_classes)
    return mapped_array

            
    

[docs]
def normalize_data_recall(peaklet_data, normalization_factor):
    """
    Use this function to do operation with an already trained SOM
    Converts peaklet data into the current best inputs for the SOM,
    log10(deciles) + log10(area) + AFT
    Since we are dealing with logs, anything less than 1 will be set to 1

    peaklet_data:           straxen datatype peaklets (peaks also work)
    normalization_factors:  numbers needed to normalize data so recalls work
    """
    pass