Skip to content
Snippets Groups Projects
Commit 2e7332a1 authored by Paul Costea's avatar Paul Costea
Browse files

FST calculation added

parent 8f69707d
No related branches found
No related tags found
No related merge requests found
......@@ -338,47 +338,39 @@ def compute_diversity(ind, sample1, sample2):
# Apply the compute_diversity to all positions for a specific pairwise comparison
def genetic_distance(sample1, sample2):
temp_res = [compute_diversity(ind, sample1, sample2) for ind in list(sample1.index.levels[0])]
temp_res = [compute_diversity(ind, sample1, sample2) for ind in set(list(sample1.index))]
return np.nansum(temp_res)
def computeAllDiv(args):
print "Computing diversities"
# Coverage
cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
cov_perc = cov_perc.drop('Unnamed: 0', 1)
# Genome
bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
bedfile_tab = bedfile_tab.set_index(pd.Index(bed_index))
# Loop over species :
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
for f in allFreq:
# Get species name :
species = int(f.split('/')[-1].split('.')[0])
# Get species genome length
genome_length = bedfile_tab.loc[str(species), 2].sum()
# Read the freq table as data.frame
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)]
# Setting index for each position
index1 = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
# Setting index for Non-synonymous vs Synonymous
index2 = [item[4] for item in pre_index]
indexes = [index1, index2]
index = pd.MultiIndex.from_arrays(indexes, names=['position', 'significance'])
data = data.set_index(index)
pos_index = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
data = data.set_index(pd.Index(pos_index))
# Correcting the coverage by the genome length observed in each pairwise comparison
correction_cov = [[min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length / 100 for i in data.columns] for j in data.columns]
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[i][j] for i in range(j + 1)] for j in range(len(data.columns))]
dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
correction_cov = [[(min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
FST = pd.DataFrame(FST, index=data.columns, columns=data.columns)
dist.to_csv(args.projdir + '/distances/' + '%s.diversity' % species, sep='\t')
FST.to_csv(args.projdir + '/distances/' + '%s.FST' % species, sep='\t')
if __name__ == "__main__":
# print("globals: {}".format(globals()))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment