Source code for omicsdata.ssm.parse

##############################################################################
# parse.py
#
# Source file containing functions for reading/writing simple somatic mutation
# files and parameters files.
##############################################################################

import csv
import json
import numpy as np
import numpy.ma as ma
from .constants import Variants_Keys
from .columns import SSM_Columns, PARAMS_Columns
from .common import extract_vids


[docs]def extract_nums(S, dtype): """Extracts and converts values from a comma delimited string""" return np.array(S.split(','), dtype=dtype)
[docs]def load_ssm(ssm_fn, rescale_depth=False): """Loads a ssm file and extracts the read count and copy number data for each variant Parameters ---------- ssm_fn : str The simple somatic mutation file rescale_depth : bool, optional A flag for whether or not to rescale the read depth for each variant using the average across each sample. Returns ------- dictionary A dictionary where the keys are unique variant 'id' values and the value is a dictionary for each variant containing the variant's 'id' (unique identifier), 'name' (string identifier), 'var_reads' (array of variants reads for each sample), 'total_reads' (array of total reads for each sample) 'omega_v' (array of variant read probabilities for each sample) """ variants = {} with open(ssm_fn) as f: reader = csv.DictReader(f, delimiter='\t') for row in reader: variant = { Variants_Keys.ID: row[SSM_Columns.ID], Variants_Keys.NAME: row[SSM_Columns.NAME], Variants_Keys.VAR_READS: extract_nums(row[SSM_Columns.VAR_READS], np.int32), Variants_Keys.TOTAL_READS: extract_nums(row[SSM_Columns.TOTAL_READS], np.int32), Variants_Keys.OMEGA_V: extract_nums(row[SSM_Columns.VAR_READ_PROB], np.float32), } assert np.all(variant[Variants_Keys.TOTAL_READS] >= variant[Variants_Keys.VAR_READS]), \ "var_reads cannot be larger than total_reads" assert np.all(0 <= variant[Variants_Keys.OMEGA_V]) and np.all(variant[Variants_Keys.OMEGA_V] <= 1), \ "omega_v must be some float between 0 and 1" variant[Variants_Keys.REF_READS] = variant[Variants_Keys.TOTAL_READS] - variant[Variants_Keys.VAR_READS] T = ma.masked_equal(variant[Variants_Keys.TOTAL_READS], 0) variant[Variants_Keys.VAF] = np.array(variant[Variants_Keys.VAR_READS] / T) assert variant[Variants_Keys.ID] not in variants, \ "Cannot have variants with the same ID, variant %s occurs more than once" % variant[Variants_Keys.ID] variants[variant[Variants_Keys.ID]] = variant # rescale read depth using average across samples if rescale_depth: vids = extract_vids(variants) V = np.array([variants[vid][Variants_Keys.VAR_READS] for vid in vids]) T = np.array([variants[vid][Variants_Keys.TOTAL_READS] for vid in vids]) VAFs = np.divide(V, T, where=T>0) T = np.maximum(0,np.rint(np.tile(T.mean(axis=0), (T.shape[0],1))).astype(np.int32)) V = np.maximum(0,np.rint(VAFs*T).astype(np.int32)) for idx,vid in enumerate(vids): variants[vid][Variants_Keys.VAR_READS] = V[idx,:] variants[vid][Variants_Keys.TOTAL_READS] = T[idx,:] S = len(next(iter(variants.values()))[Variants_Keys.VAR_READS]) for vid, V in variants.items(): for K in (Variants_Keys.VAR_READS, Variants_Keys.TOTAL_READS, Variants_Keys.OMEGA_V): assert len(V[K]) == S, '%s for %s is of length %s, but %s expected' % (K, vid, len(V[K]), S) return variants
[docs]def remove_garbage(variants, garbage): """Removes garbage variants from dictionary of variants Parameters ---------- variants : dictionary A dictionary where the keys are unique variant 'id' values and the value is a dictionary for each variant containing the variant's 'id' (unique identifier), 'name' (string identifier), 'var_reads' (array of variants reads for each sample), 'total_reads' (array of total reads for each sample) 'omega_v' (array of variant read probabilities for each sample) garbage : list A list of variant 'id' values to be removed from the 'variants' dictionary Returns ------- dictionary A 'variants' dictionary (same as what's returned by load_ssms) with the variants listed in the garbage parameter removed """ assert set(garbage).issubset(set(variants.keys())), \ "Garbage variants must be in the dictionary of variants" cleaned = dict(variants) for vid in garbage: del cleaned[vid] return cleaned
[docs]def load_params(params_fn): """Loads a params file into a dictionary Parameters ---------- params_fn : str The parameters file Returns ------- dictionary A dictionary where each of the key, value pairs are the same as those listed in the params_fn file """ if params_fn is None: return {} with open(params_fn) as P: return json.load(P)
[docs]def load_ssms_and_params(ssm_fn, params_fn, remove_garb=True, rescale_depth=False): """Loads ssm file and params file Parameters ---------- ssm_fn : str The simple somatic mutation file params_fn : str The parameters file remove_garb : bool, optional A flag to remove garbage from the 'variants' dictionary (default is True) """ variants = load_ssm(ssmfn, rescale_depth) params = load_params(paramsfn) if PARAMS_Columns.GARBAGE not in params: params[PARAMS_Columns.GARBAGE] = [] if remove_garb: variants = remove_garbage(variants, params[PARAMS_Columns.GARBAGE]) return (variants, params)
[docs]def write_ssms(variants, ssm_fn): """Writes a set of variants along with their associated read count and copy number information to an ssm file Parameters ---------- variants : dictionary A dictionary where the keys are unique variant 'id' values and the value is a dictionary for each variant containing the variant's 'id' (unique identifier), 'name' (string identifier), 'var_reads' (array of variants reads for each sample), 'total_reads' (array of total reads for each sample) 'omega_v' (array of variant read probabilities for each sample) ssm_fn : str The simple somatic mutation file Returns ------- None """ keys = ( SSM_Columns.ID, SSM_Columns.NAME, SSM_Columns.VAR_READS, SSM_Columns.TOTAL_READS, SSM_Columns.VAR_READ_PROB ) with open(ssm_fn, 'w') as f: print(*keys, sep='\t', file=f) for variant in variants.values(): variant = dict(variant) for k in (Variants_Keys.VAR_READS, Variants_Keys.TOTAL_READS, Variants_Keys.OMEGA_V): variant[k] = ','.join([str(val) for val in variant[k]]) variant[SSM_Columns.VAR_READ_PROB] = variant[Variants_Keys.OMEGA_V] print(*[variant[k] for k in keys], sep='\t', file=f)
[docs]def write_params(params, params_fn): """Writes a set of variants along with their associated read count and copy number information to an ssm file Parameters ---------- params : dictionary A dictionary that contains a set of key/value pairs to write to a file as a JSON string params_fn : str The parameters file (.params.json) to write the params to Returns ------- None """ json_params = json.dumps(params) with open(params_fn, "w") as outfile: outfile.write(json_params)