from random import choice, sample, randint, randrange from __future__ import division def process_data_entry_line (line, ids): '''for a line in the vcf file this will return a list of the information in that line, see the return statement''' fields = line.split('\t') chr_n = int(fields[0]) pos_n = int(fields[1]) ref = fields[3] alt = fields[4] indiv_ids = zip(ids, indiv_snp_variation(line)) return (chr_n, pos_n, ref, alt, indiv_ids) #Creates a list of allelic variation for each SNP. The list itself contains one entry of two numbers for each individual. def indiv_snp_variation(line): '''This function takes a single line from a vcf file and returns a list of the genotypes of the given snp of each individual''' l = line.split('\t') list = l[9:] variation_list = [] for i in list: variation_list.append(map(int,i.split(':')[0].split('|'))) return variation_list def parse_data_file(f): '''This function takes a filepath for a vcf file and returns a list of each snp with the genotype of each individual in a sublist''' result = [] for line in f: if line.startswith('##'): pass elif line.startswith('#CHROM'): ids = line.rstrip().split('\t')[9:] else: result.append(process_data_entry_line(line, ids)) return result def get_gene_stats(vcf, gene_coordinates): '''This function should take a vcf file and a list of gene coordinates i.e. [(89653782, 89653798), ...]. The function should return the total number of snps in the given region, and the snp frequency (snps/base).''' #do some work return total_snps, snp_freqs def create_new_population(pop_size, genotypes): '''create a list of all alleles. The list is the size of the population''' result = [] for i in range(pop_size): genotype = choice(genotypes) result.append(genotype) return result def create_new_generation(population, pop_size, genotypes): '''create a new generation. This will be the same size as the original population''' males = sample(population, (int(len(population)/2))) result = [] for male in males: population.remove(male) while len(result) < pop_size: new_gene = (choice(choice(males)), choice(choice(population))) if new_gene == (genotypes[1][1], genotypes[1][0]): result.append((genotypes[1])) else: result.append((new_gene)) return result def count_generations(pop_size, alleles): '''count the number of generations until one allele is fixed''' genotypes = [(alleles[0], alleles[0]), (alleles[0], alleles[1]), (alleles[1], alleles[1])] population = create_new_population(pop_size, genotypes) c = 0 while genotypes[1] in population: pop1 = create_new_generation(population, pop_size, genotypes) population = pop1 # print pop1 c +=1 return c def count_allele_frequency(line): '''This counts the number of homozygotes and heterozygotes for a given snp''' hetero = 0 ref = 0 alt = 0 for i in line[4]: if i[1] == [0,1] or i[1] == [1, 0]: hetero += 1 elif i[1] == [0, 0]: ref += 1 elif i[1] == [1, 1]: alt += 1 return ref, alt, hetero def calculate_hwe(pop_size, alleles, generations): '''This takes a given population size and the number of generations and returns the predicted number of heterozygotes divided by the actual number present in the last generation. A population in complete equilibrium would return a value of 1''' genotypes = [(alleles[0], alleles[0]), (alleles[0], alleles[1]), (alleles[1], alleles[1])] population = create_new_population(pop_size, genotypes) for i in range(generations): new_pop = create_new_generation(population, pop_size, genotypes) population = new_pop p = (2*population.count(genotypes[0]) + (population.count(genotypes[1])))/(2*len(population)) q = p - 1 pq2 = ((population.count(genotypes[1])))/(len(population)) hetero = (2*p*q) return pq2/hetero def fraction_better_or_equivalent_hwe(vcf, alleles, replications, generations, snp=None): '''This file compares the ratio of predicted to actual heterozygotes in a vcf file for a given snp. It then compares this value to an artificial distribution based on a number of iterations of the calculate_hwe function. The return is a p-value for a null hypothesis that the snp in question is in hardy weinberg equilibrium.''' snps = parse_data_file(vcf) pop_len = len(snps[0][4]) result = [] total_snps = [] # If a snp is identified the function will return only the p-value for that snp # otherwise it will return a p-value for every snp for i in snps: if snp == None: total_snps.append(i) elif snp == int(i[1]): total_snps.append(i) else: pass for i in total_snps: snp = i[1] ref, alt, hetero = count_allele_frequency(i) p = ((2*ref) + hetero)/(2*pop_len) q = 1 - p pq2 = hetero/pop_len predicted_2pq = (2*p*q) if predicted_2pq == 0: pass else: ratio_actual = pq2/predicted_2pq count = 0 for i in range(replications): if (1 - abs(ratio_actual)) >= (1-abs(calculate_hwe(pop_len, alleles, generations))): count += 1 if count == 0: result.append((snp, 1)) else: result.append((snp, round((1 -(count/replications)), 10))) else: pass return result