import random import argparse num_sites_to_sample = 5 fp_snp_sample = num_sites_to_sample fp_indels_sample = num_sites_to_sample fn_snp_sample = num_sites_to_sample fn_indels_sample = num_sites_to_sample chrom_idx = 0 pos_idx = 1 ref_idx = 3 alt_idx = 4 filter_idx = 6 format_idx = 8 truth_gt_idx = 9 query_gt_idx = 10 def write_curation_sites_out(site_list, output_file): for site in site_list: split_site = site.split("\t") chrom = split_site[chrom_idx] pos = split_site[pos_idx] ref = split_site[ref_idx] alt = split_site[alt_idx] format_field = split_site[format_idx] truth_gt = split_site[truth_gt_idx] query_gt = split_site[query_gt_idx] to_write_out = chrom + "\t" + pos + "\t" + ref + "\t" + alt + "\t" + format_field + "\t" + truth_gt + "\t" + query_gt output_file.write(to_write_out) output_file.flush() parser = argparse.ArgumentParser(description="Find FN indels and write to new file") parser.add_argument('--input_vcf_file', metavar="I", type=str, nargs="+", help="input file") parser.add_argument('--output_prefix', metavar="O", type=str, nargs="+", help="output file") args = parser.parse_args() vcf_file_path = args.input_vcf_file[0] fp_snps_output_path = args.output_prefix[0] + "_fp_snps.tsv" fp_indels_output_path = args.output_prefix[0] + "_fp_indels.tsv" fn_snps_output_path = args.output_prefix[0] + "_fn_snps.tsv" fn_indels_output_path = args.output_prefix[0] + "_fn_indels.tsv" vcf_file = open(vcf_file_path, "r") vcf_lines = vcf_file.readlines() fp_snps = [] fn_snps = [] fp_indels = [] fn_indels = [] for line in vcf_lines: if "#" in line: continue split_line = line.split("\t") if ":FP:" in line and ":FN:" not in line and ":TP:" not in line: if split_line[filter_idx] == "PASS" or split_line[filter_idx] == ".": ref = split_line[ref_idx] alt = split_line[alt_idx] if len(ref) > 1 or len(alt) > 1: fp_indels.append(line) else: fp_snps.append(line) elif ":FN:" in line and ":FP:" not in line and ":TP:" not in line: split_line = line.split("\t") ref = split_line[ref_idx] alt = split_line[alt_idx] if len(ref) > 1 or len(alt) > 1: fn_indels.append(line) else: fn_snps.append(line) if len(fp_snps) < num_sites_to_sample: fp_snp_sample = len(fp_snps) print("Only " + str(len(fp_snps)) + " FP_SNP sites \n") if len(fp_indels) < num_sites_to_sample: fp_indels_sample = len(fp_indels) print("Only " + str(len(fp_indels)) + " FP_INDEL sites \n") if len(fn_snps) < num_sites_to_sample: fn_snp_sample = len(fn_snps) print("Only " + str(len(fn_snps)) + " FN_SNP sites \n") if len(fn_indels) < num_sites_to_sample: fn_indels_sample = len(fn_indels) print("Only " + str(len(fn_indels)) + " FN_INDEL sites \n") fp_snp_curation_sites = random.sample(fp_snps, fp_snp_sample) fp_indels_curation_sites = random.sample(fp_indels, fp_indels_sample) fn_snp_curation_sites = random.sample(fn_snps, fn_snp_sample) fn_indels_curation_sites = random.sample(fn_indels, fn_indels_sample) fp_snp_curation_sites_file_out = open(fp_snps_output_path, "w+") fp_indels_curation_sites_file_out = open(fp_indels_output_path, "w+") fn_snp_curation_sites_file_out = open(fn_snps_output_path, "w+") fn_indels_curation_sites_file_out = open(fn_indels_output_path, "w+") write_curation_sites_out(fp_snp_curation_sites, fp_snp_curation_sites_file_out) write_curation_sites_out(fp_indels_curation_sites, fp_indels_curation_sites_file_out) write_curation_sites_out(fn_snp_curation_sites, fn_snp_curation_sites_file_out) write_curation_sites_out(fn_indels_curation_sites, fn_indels_curation_sites_file_out) vcf_file.close() fp_snp_curation_sites_file_out.close() fp_indels_curation_sites_file_out.close() fn_snp_curation_sites_file_out.close() fn_indels_curation_sites_file_out.close()