Commit 2e7332a1 authored by Paul Costea's avatar Paul Costea

FST calculation added

parent 8f69707d
......@@ -338,47 +338,39 @@ def compute_diversity(ind, sample1, sample2):
# Apply the compute_diversity to all positions for a specific pairwise comparison
def genetic_distance(sample1, sample2):
temp_res = [compute_diversity(ind, sample1, sample2) for ind in list(sample1.index.levels[0])]
temp_res = [compute_diversity(ind, sample1, sample2) for ind in set(list(sample1.index))]
return np.nansum(temp_res)
def computeAllDiv(args):
print "Computing diversities"
# Coverage
cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
cov_perc = cov_perc.drop('Unnamed: 0', 1)
# Genome
bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
bedfile_tab = bedfile_tab.set_index(pd.Index(bed_index))
# Loop over species :
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
for f in allFreq:
# Get species name :
species = int(f.split('/')[-1].split('.')[0])
# Get species genome length
genome_length = bedfile_tab.loc[str(species), 2].sum()
# Read the freq table as data.frame
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)]
# Setting index for each position
index1 = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
# Setting index for Non-synonymous vs Synonymous
index2 = [item[4] for item in pre_index]
indexes = [index1, index2]
index = pd.MultiIndex.from_arrays(indexes, names=['position', 'significance'])
data = data.set_index(index)
pos_index = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
data = data.set_index(pd.Index(pos_index))
# Correcting the coverage by the genome length observed in each pairwise comparison
correction_cov = [[min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length / 100 for i in data.columns] for j in data.columns]
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[i][j] for i in range(j + 1)] for j in range(len(data.columns))]
dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
correction_cov = [[(min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
FST = pd.DataFrame(FST, index=data.columns, columns=data.columns)
dist.to_csv(args.projdir + '/distances/' + '%s.diversity' % species, sep='\t')
FST.to_csv(args.projdir + '/distances/' + '%s.FST' % species, sep='\t')
if __name__ == "__main__":
# print("globals: {}".format(globals()))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment