# Source code for omicsdata.ssm.convert

##############################################################################
# ssm_to_viber.py
#
# Contains code to convert a simple somatic mutation (ssm) file to the
# tab-separated formats that can be used by VIBER and PyClone-VI.
##############################################################################

import pandas as pd 
import json, sys, os
import numpy as np 
from warnings import showwarning

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))

from omicsdata.ssm.columns import SSM_Columns, PARAMS_Columns

# CONSTANTS 
# Column names, in output order, of the tsv that ssm_to_pyclone writes.
PYCLONE_VI_COLUMNS = [
    "mutation_id",
    "sample_id",
    "ref_counts",
    "alt_counts",
    "major_cn",
    "minor_cn",
    "normal_cn",
]

def ssm_to_viber(viber_dir, ssm_fn, params_fn):
    """
    Processes simple somatic mutation (ssm) file to tab separated files (tsv) that can be used
    by VIBER (https://github.com/caravagnalab/VIBER)

    Three files are written into ``viber_dir``: ``id.tsv`` (one mutation id per row),
    ``DP.tsv`` (diploid-adjusted total read counts) and ``NV.tsv`` (variant read counts,
    capped at the adjusted total). ``DP.tsv`` and ``NV.tsv`` have one column per sample and
    one row per mutation, in the same order as ``id.tsv``. Only the first mutation id listed
    in each cluster of the parameters file is converted.

    Parameters
    ----------
    viber_dir : str
        path to a directory to store all VIBER format files
        (created, including missing parent directories, if it does not exist)
    ssm_fn : str
        The simple somatic mutation file
    params_fn : str
        The parameters file

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a mutation id taken from the parameters file has no row in the ssm file.
    """
    # makedirs tolerates an existing directory and creates missing parents
    os.makedirs(viber_dir, exist_ok=True)

    # Force the per-sample columns to be read as text: with a single sample the
    # values contain no comma and pandas would otherwise parse them as numbers,
    # which breaks the ``split(",")`` below.
    per_sample_columns = (SSM_Columns.VAR_READS,
                          SSM_Columns.TOTAL_READS,
                          SSM_Columns.VAR_READ_PROB)
    dataframe = pd.read_csv(ssm_fn, sep="\t", dtype={col: str for col in per_sample_columns})

    with open(params_fn) as params_file:
        params = json.load(params_file)

    samples = params[PARAMS_Columns.SAMPLES]
    clusters = params[PARAMS_Columns.CLUSTERS]

    # the first mutation id of every cluster
    ssms = [c[0] for c in clusters]

    DP = []
    NV = []
    for vid in ssms:
        # first row whose id matches (same id column that ssm_to_pyclone uses)
        row = dataframe[dataframe[SSM_Columns.ID] == vid].iloc[0]

        var_reads_per_sample = [int(cnt) for cnt in row[SSM_Columns.VAR_READS].split(",")]
        total_reads_per_sample = [int(cnt) for cnt in row[SSM_Columns.TOTAL_READS].split(",")]
        var_read_prob_per_sample = [float(vrp) for vrp in row[SSM_Columns.VAR_READ_PROB].split(",")]

        row_DP = []
        row_NV = []
        for var_reads, total_reads, var_read_prob in zip(var_reads_per_sample,
                                                         total_reads_per_sample,
                                                         var_read_prob_per_sample):
            # make everything diploid
            total_reads_diploid = int(2 * total_reads * var_read_prob)
            # the variant reads can never exceed the adjusted total
            var_reads_diploid = min(var_reads, total_reads_diploid)

            row_DP.append(total_reads_diploid)
            row_NV.append(var_reads_diploid)

        DP.append(row_DP)
        NV.append(row_NV)

    pd.DataFrame(ssms, columns=["id"]).to_csv(os.path.join(viber_dir, "id.tsv"), sep="\t", index=False)
    pd.DataFrame(DP, columns=samples).to_csv(os.path.join(viber_dir, "DP.tsv"), sep="\t", index=False)
    pd.DataFrame(NV, columns=samples).to_csv(os.path.join(viber_dir, "NV.tsv"), sep="\t", index=False)
def ssm_to_pyclone(pyclone_fn, ssm_fn, params_fn):
    """
    Processes simple somatic mutation (ssm) file to a tab separated file (tsv) that can be used
    by PyClone-VI (https://github.com/Roth-Lab/pyclone-vi)

    One output row is written per (mutation, sample) pair with the columns listed in
    ``PYCLONE_VI_COLUMNS``. Only the first mutation id listed in each cluster of the
    parameters file is converted, and the mutation's name (not its id) is written as
    ``mutation_id``. Copy numbers are guesses: major and minor copy number are always 1,
    and the normal copy number is 1 when the mutation name starts with ``Y_`` or the
    variant read probability is above 0.90, otherwise 2.

    A warning about these copy number guesses is always written via
    ``warnings.showwarning`` when the function is called.

    Parameters
    ----------
    pyclone_fn : str
        path to a tsv file to output the converted ssm file data
    ssm_fn : str
        The simple somatic mutation file
    params_fn : str
        The parameters file

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a mutation id taken from the parameters file has no row in the ssm file.
    """
    # Each literal ends with a space so the joined message reads correctly.
    showwarning("We do not have a good way to estimate the major and minor copy numbers for PyClone "
                "-- if your data contains mutations from sex chromosomes, we do not know if the organism "
                "is male or female. If complex copy number changes have occurred, we also don't have a clear "
                "way to determine how many copies of each allele there are. You've been warned!",
                Warning, "ssm_to_pyclone.py", 42)

    # Force the per-sample columns to be read as text: with a single sample the
    # values contain no comma and pandas would otherwise parse them as numbers,
    # which breaks the ``split(",")`` below.
    per_sample_columns = (SSM_Columns.VAR_READS,
                          SSM_Columns.TOTAL_READS,
                          SSM_Columns.VAR_READ_PROB)
    dataframe = pd.read_csv(ssm_fn, sep="\t", dtype={col: str for col in per_sample_columns})

    with open(params_fn) as params_file:
        params = json.load(params_file)

    samples = params[PARAMS_Columns.SAMPLES]
    clusters = params[PARAMS_Columns.CLUSTERS]

    # the first mutation id of every cluster
    ssms = [c[0] for c in clusters]

    # Collect all rows first and build the frame once; growing a DataFrame with
    # pd.concat inside the loop copies the whole frame on every iteration.
    pyclone_rows = []
    for vid in ssms:
        row = dataframe[dataframe[SSM_Columns.ID] == vid].iloc[0]
        name = row[SSM_Columns.NAME]

        var_reads_per_sample = [int(cnt) for cnt in row[SSM_Columns.VAR_READS].split(",")]
        total_reads_per_sample = [int(cnt) for cnt in row[SSM_Columns.TOTAL_READS].split(",")]
        var_read_prob_per_sample = [float(vrp) for vrp in row[SSM_Columns.VAR_READ_PROB].split(",")]

        for var_reads, total_reads, var_read_prob, sample in zip(var_reads_per_sample,
                                                                 total_reads_per_sample,
                                                                 var_read_prob_per_sample,
                                                                 samples):
            # assume everything is either diploid or haploid and in a non-CNA affected region
            major_cn, minor_cn = 1, 1

            # this is a guess that the copy number is 1, this is not accurate
            # if your data has complex copy number changes
            if name.startswith("Y_") or var_read_prob > 0.90:
                normal_cn = 1
            else:
                normal_cn = 2

            # NOTE(review): the original code had a branch here that set
            # minor_cn = 1 when normal_cn == 1 and minor_cn >= 1, which never
            # changes anything. It may have been meant to set minor_cn = 0 for
            # haploid regions -- confirm; the written values are left unchanged.

            pyclone_rows.append([name,
                                 sample,
                                 total_reads - var_reads,
                                 var_reads,
                                 major_cn,
                                 minor_cn,
                                 normal_cn])

    pyclone_df = pd.DataFrame(pyclone_rows, columns=PYCLONE_VI_COLUMNS)
    pyclone_df.to_csv(pyclone_fn, sep="\t", index=False)