Commit cb4880e3 authored by Paul Costea's avatar Paul Costea

qa print removed and no more bed file needed

parent 8bae1567
...@@ -33,7 +33,6 @@ universe repository before): ...@@ -33,7 +33,6 @@ universe repository before):
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe"
sudo apt-get update sudo apt-get update
sudo apt-get install libhts-dev libboost-dev sudo apt-get install libhts-dev libboost-dev
### Installing dependencies using anaconda ### Installing dependencies using anaconda
...@@ -50,10 +49,8 @@ If you do not have a C++ compiler, anaconda can also install G++: ...@@ -50,10 +49,8 @@ If you do not have a C++ compiler, anaconda can also install G++:
conda create --name metaSNV boost htslib pkg-config conda create --name metaSNV boost htslib pkg-config
source activate metaSNV source activate metaSNV
# Add this command: # Add this command:
conda install gcc conda install gcc
export CFLAGS=-I$CONDA_ENV_PATH/include export CFLAGS=-I$CONDA_ENV_PATH/include
export LD_LIBRARY_PATH=$CONDA_ENV_PATH/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CONDA_ENV_PATH/lib:$LD_LIBRARY_PATH
...@@ -104,7 +101,7 @@ Example Tutorial ...@@ -104,7 +101,7 @@ Example Tutorial
## 4. Run the SNV calling step ## 4. Run the SNV calling step
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --threads 8 --ctg_len db/freeze9.len.def.bed $ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --threads 8
## 5. Run filtering and post processing ## 5. Run filtering and post processing
...@@ -120,13 +117,13 @@ run and you can decide on how to schedule and manage them. ...@@ -120,13 +117,13 @@ run and you can decide on how to schedule and manage them.
## 1. Get the first set of commands ## 1. Get the first set of commands
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --ctg_len db/freeze9.len.def.bed --print-commands $ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --print-commands
Note the addition of the "--print-commands". This will print out one-liners that you need to run. When done, run same again. Note the addition of the "--print-commands". This will print out one-liners that you need to run. When done, run same again.
## 2. Get the second set of commands ## 2. Get the second set of commands
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --ctg_len db/freeze9.len.def.bed --print-commands $ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --print-commands
This will calculate the "load balancing" and give you the commands for running the SNV calling. This will calculate the "load balancing" and give you the commands for running the SNV calling.
......
...@@ -76,6 +76,21 @@ def compute_opt(args): ...@@ -76,6 +76,21 @@ def compute_opt(args):
stderr.write("Failure in sample {}".format(sample)) stderr.write("Failure in sample {}".format(sample))
stderr.write("Call to {} failed.".format(' '.join(cmd))) stderr.write("Call to {} failed.".format(' '.join(cmd)))
exit(1) exit(1)
def get_header(args):
    """Generate a per-contig length BED file from the first sample's BAM header.

    Reads the first BAM path listed in ``args.all_samples``, runs
    ``samtools view -H`` on it, and writes every @SQ record (the only header
    lines with exactly 3 tab-separated columns) as a
    ``name <TAB> 1 <TAB> length`` row to ``<project_dir>/bed_header``.

    Side effect: sets ``args.ctg_len`` to the generated file's path, so
    downstream splitting no longer needs a user-supplied --ctg_len file.
    """
    # Close the sample list promptly instead of leaking the handle.
    with open(args.all_samples) as samples:
        first_bam = samples.readline().rstrip()
    # universal_newlines=True makes check_output return text on both
    # Python 2 and 3 (bytes.split('\n') would fail on 3).
    header = subprocess.check_output(
        ["samtools", "view", "-H", first_bam], universal_newlines=True)
    bed_path = args.project_dir + '/bed_header'
    with open(bed_path, 'w') as out:
        # Skip the first header line (@HD); keep only 3-column (@SQ) records.
        for line in header.splitlines()[1:]:
            fields = line.rstrip().split('\t')
            if len(fields) != 3:
                continue
            name = fields[1].replace('SN:', '')
            length = fields[2].replace('LN:', '')
            out.write(name + '\t1\t' + length + '\n')
    args.ctg_len = bed_path
def split_opt(args): def split_opt(args):
if args.n_splits > 100: if args.n_splits > 100:
...@@ -214,8 +229,7 @@ def main(): ...@@ -214,8 +229,7 @@ def main():
help='Number of jobs to run simmultaneously. Will create same number of splits, unless n_splits set differently.') help='Number of jobs to run simmultaneously. Will create same number of splits, unless n_splits set differently.')
parser.add_argument('--n_splits', metavar='INT', default=1,type=int, parser.add_argument('--n_splits', metavar='INT', default=1,type=int,
help='Number of bins to split ref into') help='Number of bins to split ref into')
parser.add_argument('--ctg_len',metavar='BED_FILE',default='',
help='Per contig lengths, required for splitting.')
args = parser.parse_args() args = parser.parse_args()
if not path.isfile(args.ref_db): if not path.isfile(args.ref_db):
stderr.write(''' stderr.write('''
...@@ -234,13 +248,6 @@ ERROR: No binaries found ...@@ -234,13 +248,6 @@ ERROR: No binaries found
SOLUTION: make\n\n'''.format(basedir)) SOLUTION: make\n\n'''.format(basedir))
exit(1) exit(1)
if (args.n_splits > 1 or args.threads > 1) and args.ctg_len=='':
stderr.write('''
ERROR: No contig information supplied. Cannot compute splits
SOLUTION: Supply contig information through --ctg_len\n\n'''.format(basedir))
exit(1)
if args.threads > 1 and args.n_splits==1: if args.threads > 1 and args.n_splits==1:
args.n_splits=args.threads args.n_splits=args.threads
...@@ -249,6 +256,8 @@ SOLUTION: Supply contig information through --ctg_len\n\n'''.format(basedir)) ...@@ -249,6 +256,8 @@ SOLUTION: Supply contig information through --ctg_len\n\n'''.format(basedir))
exit(1) exit(1)
create_directories(args.project_dir) create_directories(args.project_dir)
compute_opt(args) compute_opt(args)
get_header(args)
if args.n_splits > 1: if args.n_splits > 1:
split_opt(args) split_opt(args)
snp_call(args) snp_call(args)
......
...@@ -341,31 +341,43 @@ def genetic_distance(sample1, sample2): ...@@ -341,31 +341,43 @@ def genetic_distance(sample1, sample2):
temp_res = [compute_diversity(ind, sample1, sample2) for ind in list(sample1.index.levels[0])] temp_res = [compute_diversity(ind, sample1, sample2) for ind in list(sample1.index.levels[0])]
return np.nansum(temp_res) return np.nansum(temp_res)
def computeAllDiv(args): def computeAllDiv(args):
print "Computing diversities" print "Computing diversities"
# Coverage
cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
cov_perc = cov_perc.drop('Unnamed: 0', 1)
# Genome
bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
bedfile_tab = bedfile_tab.set_index(pd.Index(bed_index))
# Loop over species :
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq') allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
for f in allFreq:
# Read the freq table as data.frame
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)] for f in allFreq:
# Setting index for each position
index1 = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
# Setting index for Non-synonymous vs Synonymous
index2 = [item[4] for item in pre_index]
indexes = [index1, index2]
index = pd.MultiIndex.from_arrays(indexes, names=['position', 'significance'])
data = data.set_index(index)
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) for i in range(j+1)] for j in range(3)]
dist.to_csv(args.projdir+'/distances/'+'%s.diversity' % f.split('/')[-1].replace('.freq',''), sep='\t')
# To do : divide by the minimal horizontal coverage of each pairwise comparison.
# dist = dist / min(sample_i.1X_horizontal_coverage, sample_j.1X_horizontal_coverage)
# Get species name :
species = int(f.split('/')[-1].split('.')[0])
# Get species genome length
genome_length = bedfile_tab.loc[str(species), 2].sum()
# Read the freq table as data.frame
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)]
# Setting index for each position
index1 = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
# Setting index for Non-synonymous vs Synonymous
index2 = [item[4] for item in pre_index]
indexes = [index1, index2]
index = pd.MultiIndex.from_arrays(indexes, names=['position', 'significance'])
data = data.set_index(index)
# Correcting the coverage by the genome length observed in each pairwise comparison
correction_cov = [[min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length / 100 for i in data.columns] for j in data.columns]
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[i][j] for i in range(j + 1)] for j in range(len(data.columns))]
dist.to_csv(args.projdir + '/distances/' + '%s.diversity' % species, sep='\t')
if __name__ == "__main__": if __name__ == "__main__":
# print("globals: {}".format(globals())) # print("globals: {}".format(globals()))
......
...@@ -593,23 +593,23 @@ int main(int argc, char *argv[]) ...@@ -593,23 +593,23 @@ int main(int argc, char *argv[])
bam_destroy1(b); bam_destroy1(b);
free(entireChr); free(entireChr);
fprintf(stdout,"\nDuplicates:%u \n", duplicates); //fprintf(stdout,"\nDuplicates:%u \n", duplicates);
//Print header for next table in output file //Print header for next table in output file
fprintf(outputFile,"\nCov*X\tPercentage\tNr. of bases\n"); fprintf(outputFile,"\nCov*X\tPercentage\tNr. of bases\n");
fprintf(stdout,"Total genome lenght %lu \n", totalGenomeLength); //fprintf(stdout,"Total genome lenght %lu \n", totalGenomeLength);
//Compute procentages of genome cover!. //Compute procentages of genome cover!.
int i; int i;
for (i=0; i<=userOpt.maxCoverage; ++i) { for (i=0; i<=userOpt.maxCoverage; ++i) {
if (i == 0) { if (i == 0) {
//Non-covered! //Non-covered!
fprintf(stdout,"%3.2f of genome has not been covered\n", (double)(coverageHist[i])/totalGenomeLength*100); //fprintf(stdout,"%3.2f of genome has not been covered\n", (double)(coverageHist[i])/totalGenomeLength*100);
} else { } else {
uint64_t coverage = 0; uint64_t coverage = 0;
//All that has been covered i, had been covered i+1, i+2 and so on times. Thus, do this addition //All that has been covered i, had been covered i+1, i+2 and so on times. Thus, do this addition
for (int x = i; x<=userOpt.maxCoverage; ++x) coverage += coverageHist[x]; for (int x = i; x<=userOpt.maxCoverage; ++x) coverage += coverageHist[x];
fprintf(stdout,"%3.2f of genome has been covered at least %dX \n", (double)(coverage)/totalGenomeLength*100, i); //fprintf(stdout,"%3.2f of genome has been covered at least %dX \n", (double)(coverage)/totalGenomeLength*100, i);
fprintf(outputFile,"%d\t%3.5f\t%lu\n",i, (double)(coverage)/totalGenomeLength*100, coverage); fprintf(outputFile,"%d\t%3.5f\t%lu\n",i, (double)(coverage)/totalGenomeLength*100, coverage);
} }
...@@ -632,7 +632,7 @@ int main(int argc, char *argv[]) ...@@ -632,7 +632,7 @@ int main(int argc, char *argv[])
fprintf(outputFile, "Number of interchromosomal pairs: %u\n",interChr); fprintf(outputFile, "Number of interchromosomal pairs: %u\n",interChr);
} }
printf("Out of %u reads, you have %3.5f unmapped reads\n and %3.5f sub-par quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality); //printf("Out of %u reads, you have %3.5f unmapped reads\n and %3.5f sub-par quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality);
free(coverageHist); free(coverageHist);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment