# Source code for omicsdata.ssm.supervariants

#################################################################################
# supervariants.py
#
# Source file containing functions for creating and manipulating 'supervariants'.
#################################################################################

import numpy as np
import numpy.ma as ma
from collections import namedtuple 

from .common import extract_vids
from .constants import Variants_Keys

# Immutable record type for a single variant. The field order below is the
# positional order of the tuple.
Variant = namedtuple('Variant', [
    'id',           # unique identifier of the variant
    'var_reads',    # variant read counts
    'ref_reads',    # reference read counts
    'total_reads',  # total read counts
    'vaf',          # variant allele frequency
    'omega_v',      # variant read probability
])

def convert_variant_dict_to_tuple(variant):
    """Builds a 'Variant' namedtuple from the dictionary describing a single variant

    Parameters
    ----------
    variant : dictionary
        The data for one variant. It must contain a key for every field of the 'Variant' namedtuple:
        'id',
        'var_reads',
        'ref_reads',
        'total_reads',
        'vaf',
        'omega_v'.
        Any other keys in the dictionary are ignored.

    Returns
    -------
    namedtuple
        A 'Variant' namedtuple where each field holds the value stored under the same-named key
        of the inputted dictionary

    Raises
    ------
    KeyError
        If the dictionary is missing one of the 'Variant' fields
    """
    # look the values up in field order so they can be passed positionally
    field_values = [variant[field] for field in Variant._fields]
    return Variant(*field_values)

def convert_all_variants_to_tuples(variants):
    """Converts a dictionary of variant dictionaries into a list of 'Variant' namedtuples

    Parameters
    ----------
    variants : dictionary
        A dictionary where the keys are unique variant 'id' values and each value is the dictionary
        for that variant. Every variant dictionary must contain all of the 'Variant' fields:
        'id', 'var_reads', 'ref_reads', 'total_reads', 'vaf', 'omega_v'. Other keys (e.g. 'name')
        are ignored.

    Returns
    -------
    list
        A list with one 'Variant' namedtuple per entry of the variants input, in the iteration order
        of the dictionary. Each namedtuple has the fields:
        'id', 'var_reads', 'ref_reads', 'total_reads', 'vaf', 'omega_v'

    Raises
    ------
    KeyError
        If any variant dictionary is missing one of the 'Variant' fields
    """
    # iterate the values directly; the keys are not needed for the conversion
    return [convert_variant_dict_to_tuple(variant) for variant in variants.values()]

def make_supervar(name, variants, fill_chr_pos=False):
    """Collapses a list of variants into a single supervariant

    Parameters
    ----------
    name : str
        The value stored as both the id and the name of the supervariant

    variants : list
        A non-empty list of 'variant' dictionaries. Each dictionary is read for its
        Variants_Keys.TOTAL_READS, Variants_Keys.VAR_READS and Variants_Keys.OMEGA_V entries
        (one value per sample), and additionally for Variants_Keys.NAME when fill_chr_pos is set.

    fill_chr_pos : bool
        A flag to fill the chromosome and position fields of the supervariant. They are only filled
        if every variant name splits on '_' into exactly two parts, i.e. matches the pattern
        '{chromosome}_{position}'; otherwise both fields are left as None.

    Returns
    -------
    dictionary
        A dictionary summarizing the inputted variants, keyed by the Variants_Keys constants:
        ID and NAME (both set to name), CHROM and POS (arrays of the chromosome/position strings
        of the member variants, or None), OMEGA_V (0.5 for every sample), VAR_READS and
        TOTAL_READS (rescaled read counts summed over the variants, as int32 arrays),
        REF_READS (TOTAL_READS minus VAR_READS) and VAF (VAR_READS divided by TOTAL_READS).

    Raises
    ------
    AssertionError
        If variants is empty
    """
    assert len(variants) > 0, "Cannot make supervariants from an empty list of variants"

    # stack the per-variant entries: one row per variant, one column per sample
    total_reads = np.array([variant[Variants_Keys.TOTAL_READS] for variant in variants])
    var_reads = np.array([variant[Variants_Keys.VAR_READS] for variant in variants])
    read_probs = np.array([variant[Variants_Keys.OMEGA_V] for variant in variants])

    # rescale the read counts so the supervariant can use an omega_v of 0.5,
    # capping the variant reads at the rescaled totals
    _, n_samples = total_reads.shape
    scaled_total = 2 * total_reads * read_probs
    capped_var = np.minimum(var_reads, scaled_total)
    fixed_omega_v = np.full(n_samples, 0.5)

    chrom = None
    pos = None

    # only fill chromosome/position when asked to and every name has exactly two '_' separated parts
    if fill_chr_pos:
        name_parts = [variant[Variants_Keys.NAME].split("_") for variant in variants]
        if all(len(parts) == 2 for parts in name_parts):
            chrom = np.array([parts[0] for parts in name_parts])
            pos = np.array([parts[1] for parts in name_parts])

    # sum over the variants (axis 0), leaving one integer count per sample
    summed_var = np.round(np.sum(capped_var, axis=0)).astype(np.int32)
    summed_total = np.round(np.sum(scaled_total, axis=0)).astype(np.int32)

    supervariant = {
        Variants_Keys.ID:          name,
        Variants_Keys.NAME:        name,
        Variants_Keys.CHROM:       chrom,
        Variants_Keys.POS:         pos,
        Variants_Keys.OMEGA_V:     fixed_omega_v,
        Variants_Keys.VAR_READS:   summed_var,
        Variants_Keys.TOTAL_READS: summed_total,
        Variants_Keys.REF_READS:   summed_total - summed_var,
    }

    # mask the samples that have zero total reads before dividing by the totals
    masked_total = ma.masked_equal(summed_total, 0)
    supervariant[Variants_Keys.VAF] = np.array(summed_var / masked_total)

    return supervariant

def clusters_to_supervars(clusters, variants, fill_chr_pos=False):
    """Builds one supervariant per cluster of variants

    Parameters
    ----------
    clusters: list
        A list of lists, where each sublist contains the 'id' values of the variants that belong
        to that cluster. No sublist may be empty.

    variants : dictionary
        A dictionary where the keys are unique variant 'id' values and each value is the dictionary
        holding the data for that variant. Every id appearing in clusters must be a key of this
        dictionary.

    fill_chr_pos : bool
        A flag passed through to make_supervar asking it to fill the chromosome and position fields
        of each supervariant. This will only work if all variant names match the pattern
        '{chromosome}_{position}'

    Returns
    -------
    dictionary
        A dictionary of supervariants. The keys are the supervariant names 'S1', 'S2', ... numbered
        in the order the clusters were given, and each value is the dictionary returned by
        make_supervar for that cluster.

    Raises
    ------
    AssertionError
        If any cluster is empty
    """
    supervars = {}

    # supervariants are numbered from 1 in the order of the inputted clusters
    for number, member_ids in enumerate(clusters, start=1):
        assert len(member_ids) > 0, "Cannot make a supervariant from an empty list"
        name = 'S%s' % number
        members = [variants[vid] for vid in member_ids]
        supervars[name] = make_supervar(name, members, fill_chr_pos)

    return supervars

def make_superclusters(supervars):
    """Builds the trivial clustering that places every supervariant in a cluster of its own

    Parameters
    ----------
    supervars : dictionary
        A dictionary of supervariants, where the keys are the supervariant 'id' values and the values
        are the dictionaries containing the data for each supervariant

    Returns
    -------
    list
        A list of single-element lists, one for each id returned by extract_vids(supervars) and in
        that same order
    """
    return [[svid] for svid in extract_vids(supervars)]

def supervars_to_binom_params(supervars):
    """Collects the binomial parameters of all supervariants into arrays

    Parameters
    ----------
    supervars : dictionary
        A dictionary of supervariants, where the keys are the supervariant 'id' values and the values
        are the dictionaries containing the data for each supervariant

    Returns
    -------
    ndarray
        The variant reads (Variants_Keys.VAR_READS entry) of every supervariant, one row per
        supervariant in the order given by extract_vids(supervars).
    ndarray
        The reference reads (Variants_Keys.REF_READS entry) of every supervariant, with the same
        row layout. Note that these are the reference reads, not the total reads.
    ndarray
        The variant read probability (Variants_Keys.OMEGA_V entry) of every supervariant, with the
        same row layout. Every value is 0.5.

    Raises
    ------
    AssertionError
        If any supervariant has an omega_v value other than 0.5
    """
    svids = extract_vids(supervars)

    def stack(key):
        # one row per supervariant, following the order of svids
        return np.array([supervars[svid][key] for svid in svids])

    var_reads = stack(Variants_Keys.VAR_READS)
    ref_reads = stack(Variants_Keys.REF_READS)
    omega_v = stack(Variants_Keys.OMEGA_V)

    # the supervariants handled here are all expected to carry an omega_v of 0.5
    assert np.all(omega_v == 0.5), "supervariant omega_v is incorrect"
    return var_reads, ref_reads, omega_v