Source code for omicsdata.ssm.supervariants

#################################################################################
# supervariants.py
#
# Source file containing functions for creating and manipulating 'supervariants'.
#################################################################################

import numpy as np
import numpy.ma as ma
from collections import namedtuple 

from .common import extract_vids
from .constants import Variants_Keys

# Immutable record describing a single variant. The field order below is the
# positional order of the tuple:
#   id, var_reads, ref_reads, total_reads, vaf, omega_v
Variant = namedtuple(
  'Variant',
  'id var_reads ref_reads total_reads vaf omega_v',
)

def convert_variant_dict_to_tuple(variant):
  """Build a 'Variant' namedtuple from a single variant dictionary

  Parameters
  ----------
  variant : dictionary
    A dictionary describing one variant. It must contain every field of
    'Variant' as a key ('id', 'var_reads', 'ref_reads', 'total_reads', 'vaf',
    'omega_v'). Any additional keys are ignored.

  Returns
  -------
  namedtuple
    A 'Variant' whose fields hold the values stored under the same-named keys
    of the input dictionary

  Raises
  ------
  KeyError
    If the dictionary is missing one of the 'Variant' fields
  """
  # Look the values up in field order so they can be passed positionally.
  field_values = (variant[field] for field in Variant._fields)
  return Variant(*field_values)
def convert_all_variants_to_tuples(variants):
  """Convert every variant dictionary in a mapping into a 'Variant' namedtuple

  Parameters
  ----------
  variants : dictionary
    A dictionary whose values are variant dictionaries. The keys are not used
    beyond determining iteration order. Each variant dictionary must contain
    all of the 'Variant' fields: 'id', 'var_reads', 'ref_reads', 'total_reads',
    'vaf', 'omega_v'

  Returns
  -------
  list
    A list with one 'Variant' namedtuple per entry of the input, in the
    iteration order of the input dictionary
  """
  return [convert_variant_dict_to_tuple(entry) for entry in variants.values()]
def make_supervar(name, variants, fill_chr_pos=False):
  """Collapse a list of variants into a single supervariant

  The total reads of every variant are rescaled by 2*omega_v so that the
  resulting supervariant can use an omega_v of 0.5 in every sample. Variant
  reads are capped at the rescaled total reads before the per-sample sums are
  taken.

  Parameters
  ----------
  name : str
    The value stored under both the Variants_Keys.ID and Variants_Keys.NAME
    keys of the supervariant
  variants : list
    A non-empty list of variant dictionaries. Each one is read through the
    keys Variants_Keys.TOTAL_READS, Variants_Keys.VAR_READS and
    Variants_Keys.OMEGA_V (one value per sample; stacking them must give a
    2D array), and Variants_Keys.NAME when fill_chr_pos is set
  fill_chr_pos : bool
    When True, and every variant name splits on '_' into exactly two parts
    (i.e. matches '{chromosome}_{position}'), the chromosome and position
    entries are filled with arrays of those parts. Otherwise both stay None

  Returns
  -------
  dictionary
    The supervariant, with the keys Variants_Keys.ID, NAME, CHROM, POS,
    OMEGA_V (array of 0.5, one entry per sample), VAR_READS and TOTAL_READS
    (rounded per-sample sums as int32), REF_READS (total minus variant reads)
    and VAF (variant reads divided by total reads)

  Raises
  ------
  AssertionError
    If the list of variants is empty
  """
  assert len(variants) > 0, "Cannot make supervariants from an empty list of variants"

  # Stack the per-variant arrays: one row per variant, one column per sample.
  total_reads = np.array([entry[Variants_Keys.TOTAL_READS] for entry in variants])
  var_reads = np.array([entry[Variants_Keys.VAR_READS] for entry in variants])
  var_read_probs = np.array([entry[Variants_Keys.OMEGA_V] for entry in variants])
  _, n_samples = total_reads.shape

  # Rescale so that every variant behaves as if its omega_v were 0.5, and make
  # sure the variant reads never exceed the rescaled totals.
  scaled_total_reads = 2 * total_reads * var_read_probs
  capped_var_reads = np.minimum(var_reads, scaled_total_reads)

  # Chromosome/position are only filled when requested and when every name
  # has exactly two '_'-separated parts.
  chrom, pos = None, None
  if fill_chr_pos:
    name_parts = [entry[Variants_Keys.NAME].split("_") for entry in variants]
    if all(len(parts) == 2 for parts in name_parts):
      chrom = np.array([parts[0] for parts in name_parts])
      pos = np.array([parts[1] for parts in name_parts])

  summed_var_reads = np.round(np.sum(capped_var_reads, axis=0)).astype(np.int32)
  summed_total_reads = np.round(np.sum(scaled_total_reads, axis=0)).astype(np.int32)

  # Samples with zero total reads are masked for the division; the masked
  # result is then converted back into a plain ndarray.
  masked_total_reads = ma.masked_equal(summed_total_reads, 0)

  return {
    Variants_Keys.ID: name,
    Variants_Keys.NAME: name,
    Variants_Keys.CHROM: chrom,
    Variants_Keys.POS: pos,
    Variants_Keys.OMEGA_V: np.full(n_samples, 0.5),
    Variants_Keys.VAR_READS: summed_var_reads,
    Variants_Keys.TOTAL_READS: summed_total_reads,
    Variants_Keys.REF_READS: summed_total_reads - summed_var_reads,
    Variants_Keys.VAF: np.array(summed_var_reads / masked_total_reads),
  }
def clusters_to_supervars(clusters, variants, fill_chr_pos=False):
  """Build one supervariant per cluster of variants

  Parameters
  ----------
  clusters : list
    A list of lists, where each sublist holds the 'id' values of the variants
    belonging to that cluster. Every sublist must be non-empty
  variants : dictionary
    A dictionary mapping each variant 'id' to that variant's dictionary; the
    ids listed in the clusters are looked up here
  fill_chr_pos : bool
    Passed through to make_supervar: a flag to fill the chromosome and
    position fields of each supervariant, which only works if all variant
    names match the pattern '{chromosome}_{position}'

  Returns
  -------
  dictionary
    A dictionary of supervariants keyed by supervariant id. The ids are
    'S1', 'S2', ... in the order the clusters were given

  Raises
  ------
  AssertionError
    If any cluster is empty
  """
  supervars = {}
  # Supervariant ids are numbered from 1 in cluster order.
  for index, members in enumerate(clusters, start=1):
    assert len(members) > 0, "Cannot make a supervariant from an empty list"
    svid = 'S%s' % index
    member_variants = [variants[vid] for vid in members]
    supervars[svid] = make_supervar(svid, member_variants, fill_chr_pos)
  return supervars
def make_superclusters(supervars):
  """Create a clustering that places every supervariant in its own cluster

  Parameters
  ----------
  supervars : dictionary
    A dictionary of supervariants, where the keys are the supervariant 'id'
    values and the values are a dictionary containing the data for the
    supervariant

  Returns
  -------
  list
    A list of single-element lists, one per id returned by extract_vids for
    the supervariants, in the order extract_vids yields them
  """
  superclusters = []
  for svid in extract_vids(supervars):
    superclusters.append([svid])
  return superclusters
def supervars_to_binom_params(supervars):
  """Extract the binomial parameters for each supervariant

  Parameters
  ----------
  supervars : dictionary
    A dictionary of supervariants, where the keys are the supervariant 'id'
    values and the values are a dictionary containing the data for the
    supervariant

  Returns
  -------
  ndarray
    The variant reads: row i holds the values stored under
    Variants_Keys.VAR_READS for the i-th supervariant (one column per sample)
  ndarray
    The reference reads: row i holds the values stored under
    Variants_Keys.REF_READS for the i-th supervariant (one column per sample)
  ndarray
    The variant read probabilities: row i holds the values stored under
    Variants_Keys.OMEGA_V for the i-th supervariant (one column per sample)

  Rows follow the order of the ids returned by extract_vids.

  Raises
  ------
  AssertionError
    If any supervariant has an omega_v different from 0.5
  """
  svids = extract_vids(supervars)

  def stack_field(key):
    # One row per supervariant, in svid order.
    return np.array([supervars[svid][key] for svid in svids])

  var_reads = stack_field(Variants_Keys.VAR_READS)
  ref_reads = stack_field(Variants_Keys.REF_READS)
  var_read_probs = stack_field(Variants_Keys.OMEGA_V)

  # Supervariants are always constructed with an omega_v of 0.5.
  assert np.all(var_read_probs == 0.5), "supervariant omega_v is incorrect"
  return var_reads, ref_reads, var_read_probs