FST calculation added

2e7332a1 · Paul Costea · 8f69707d · 2e7332a1
Commit 2e7332a1 authored 7 years ago by Paul Costea
--- a/metaSNV_post.py
+++ b/metaSNV_post.py
@@ -338,47 +338,39 @@ def compute_diversity(ind, sample1, sample2):

 # Apply the compute_diversity to all positions for a specific pairwise comparison
 def genetic_distance(sample1, sample2):
-    temp_res = [compute_diversity(ind, sample1, sample2) for ind in list(sample1.index.levels[0])]
+    temp_res = [compute_diversity(ind, sample1, sample2) for ind in set(list(sample1.index))]
    return np.nansum(temp_res)

 def computeAllDiv(args):
+
    print "Computing diversities"
-    
-    # Coverage
    cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
    cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
    cov_perc = cov_perc.drop('Unnamed: 0', 1)

-    # Genome
    bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
    bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
    bedfile_tab = bedfile_tab.set_index(pd.Index(bed_index))

-    # Loop over species :
    allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
-
    for f in allFreq:
-
-        # Get species name :
        species = int(f.split('/')[-1].split('.')[0])
-        # Get species genome length
        genome_length = bedfile_tab.loc[str(species), 2].sum()
-        # Read the freq table as data.frame
+
        data = pd.read_table(f, index_col=0, na_values=['-1'])
        pre_index = [i.split(':') for i in list(data.index)]
-        # Setting index for each position
-        index1 = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
-        # Setting index for Non-synonymous vs Synonymous
-        index2 = [item[4] for item in pre_index]
-        indexes = [index1, index2]
-        index = pd.MultiIndex.from_arrays(indexes, names=['position', 'significance'])
-        data = data.set_index(index)
-
+        pos_index = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
+        data = data.set_index(pd.Index(pos_index))
        # Correcting the coverage by the genome length observed in each pairwise comparison
-        correction_cov = [[min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length / 100 for i in data.columns] for j in data.columns]
-        dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[i][j] for i in range(j + 1)] for j in range(len(data.columns))]
-	dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
+        correction_cov = [[(min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length) / 100 for i in data.columns] for j in data.columns]
+        dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[j][i] for i in range(j + 1)] for j in range(len(data.columns))]
+        FST = [[(1-(dist[i][i]+dist[j][j])/(2*dist[j][i])) for i in range(j + 1)] for j in range(len(dist))]
+
+        dist = pd.DataFrame(dist, index=data.columns, columns=data.columns)
+        FST = pd.DataFrame(FST, index=data.columns, columns=data.columns)
+
        dist.to_csv(args.projdir + '/distances/' + '%s.diversity' % species, sep='\t')
+        FST.to_csv(args.projdir + '/distances/' + '%s.FST' % species, sep='\t')

 if __name__ == "__main__":
 #	print("globals: {}".format(globals()))