Commit cb4880e3 authored by Paul Costea's avatar Paul Costea

qa print removed and no more bed file needed

parent 8bae1567
......@@ -33,7 +33,6 @@ universe repository before):
sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe"
sudo apt-get update
sudo apt-get install libhts-dev libboost-dev
### Installing dependencies using anaconda
......@@ -50,10 +49,8 @@ If you do not have a C++ compiler, anaconda can also install G++:
conda create --name metaSNV boost htslib pkg-config
source activate metaSNV
# Add this command:
conda install gcc
export CFLAGS=-I$CONDA_ENV_PATH/include
export LD_LIBRARY_PATH=$CONDA_ENV_PATH/lib:$LD_LIBRARY_PATH
......@@ -104,7 +101,7 @@ Example Tutorial
## 4. Run the SNV calling step
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --threads 8 --ctg_len db/freeze9.len.def.bed
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --threads 8
## 5. Run filtering and post processing
......@@ -120,13 +117,13 @@ run and you can decide on how to schedule and manage them.
## 1. Get the first set of commands
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --ctg_len db/freeze9.len.def.bed --print-commands
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --print-commands
Note the addition of the "--print-commands" flag. This will print out one-liner commands that you need to run. When they have finished, run the same metaSNV.py command again.
## 2. Get the second set of commands
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --ctg_len db/freeze9.len.def.bed --print-commands
$ python metaSNV.py tutorial sample_list db/freeze9.genomes.RepGenomesv9.fna --n_splits 8 --print-commands
This will calculate the "load balancing" and give you the commands for running the SNV calling.
......
......@@ -76,6 +76,21 @@ def compute_opt(args):
stderr.write("Failure in sample {}".format(sample))
stderr.write("Call to {} failed.".format(' '.join(cmd)))
exit(1)
def get_header(args):
    """Derive a per-contig BED file from the first sample's BAM header.

    Reads the first BAM path listed in ``args.all_samples``, runs
    ``samtools view -H`` on it, and writes every 3-field @SQ header line
    as ``<contig>\t1\t<length>`` to ``<project_dir>/bed_header``.
    Finally points ``args.ctg_len`` at that file so the splitting steps
    no longer need a user-supplied --ctg_len BED file.

    Side effects: creates/overwrites ``<project_dir>/bed_header`` and
    mutates ``args.ctg_len``. Requires ``samtools`` on the PATH.
    """
    # Use a context manager so the sample-list handle is not leaked
    # (the original opened it without ever closing it).
    with open(args.all_samples) as sample_list:
        first_bam = sample_list.readline().rstrip()
    # NOTE(review): on Python 3 check_output returns bytes; this module
    # targets Python 2 (see the print statements elsewhere in the file).
    header = subprocess.check_output(["samtools", "view", "-H", first_bam])
    bed_path = args.project_dir + '/bed_header'
    with open(bed_path, 'w') as bed_file:
        # Skip the first header line (@HD); keep only 3-field lines,
        # i.e. @SQ records shaped like ['@SQ', 'SN:<name>', 'LN:<len>'].
        for raw in header.split('\n')[1:]:
            fields = raw.rstrip().split('\t')
            if len(fields) != 3:
                continue
            name = fields[1].replace('SN:', '')
            length = fields[2].replace('LN:', '')
            bed_file.write(name + '\t1\t' + length + '\n')
    args.ctg_len = bed_path
def split_opt(args):
if args.n_splits > 100:
......@@ -214,8 +229,7 @@ def main():
help='Number of jobs to run simmultaneously. Will create same number of splits, unless n_splits set differently.')
parser.add_argument('--n_splits', metavar='INT', default=1,type=int,
help='Number of bins to split ref into')
parser.add_argument('--ctg_len',metavar='BED_FILE',default='',
help='Per contig lengths, required for splitting.')
args = parser.parse_args()
if not path.isfile(args.ref_db):
stderr.write('''
......@@ -234,13 +248,6 @@ ERROR: No binaries found
SOLUTION: make\n\n'''.format(basedir))
exit(1)
if (args.n_splits > 1 or args.threads > 1) and args.ctg_len=='':
stderr.write('''
ERROR: No contig information supplied. Cannot compute splits
SOLUTION: Supply contig information through --ctg_len\n\n'''.format(basedir))
exit(1)
if args.threads > 1 and args.n_splits==1:
args.n_splits=args.threads
......@@ -249,6 +256,8 @@ SOLUTION: Supply contig information through --ctg_len\n\n'''.format(basedir))
exit(1)
create_directories(args.project_dir)
compute_opt(args)
get_header(args)
if args.n_splits > 1:
split_opt(args)
snp_call(args)
......
......@@ -341,31 +341,43 @@ def genetic_distance(sample1, sample2):
temp_res = [compute_diversity(ind, sample1, sample2) for ind in list(sample1.index.levels[0])]
return np.nansum(temp_res)
# NOTE(review): this span is residue of a unified diff whose +/- markers and
# indentation were stripped by extraction. It appears to contain BOTH the old
# body (first `for f in allFreq:` loop) and the new body (second loop) of
# computeAllDiv interleaved. Code is left byte-identical; comments only.
def computeAllDiv(args):
# Python 2 print statement -- this module predates Python 3.
print "Computing diversities"
# Coverage
# Per-species horizontal-coverage percentages; the unnamed first column
# holds the species identifiers and becomes the index.
cov_perc = pd.read_table(args.percentage_file, skiprows=[1], index_col=False)
cov_perc = cov_perc.set_index(cov_perc.loc[:, 'Unnamed: 0'])
cov_perc = cov_perc.drop('Unnamed: 0', 1)
# Genome
# Contig-length BED written by get_header(); contig names are trimmed at
# the first '.' so the index matches the numeric species id.
bedfile_tab = pd.read_table(args.projdir + '/bed_header', index_col=0, header=None)
bed_index = [i.split('.')[0] for i in list(bedfile_tab.index)]
bedfile_tab = bedfile_tab.set_index(pd.Index(bed_index))
# Loop over species :
allFreq = glob.glob(args.projdir + '/filtered/pop/*.freq')
# NOTE(review): first loop -- presumably the REMOVED pre-commit version
# (fixed range(3), no coverage correction) -- confirm against the repo.
for f in allFreq:
# Read the freq table as data.frame
# Frequency table: rows are positions, columns are samples; -1 marks NA.
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)]
# Setting index for each position
index1 = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
# Setting index for Non-synonymous vs Synonymous
index2 = [item[4] for item in pre_index]
indexes = [index1, index2]
index = pd.MultiIndex.from_arrays(indexes, names=['position', 'significance'])
data = data.set_index(index)
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) for i in range(j+1)] for j in range(3)]
# NOTE(review): `dist` is a plain list of lists, which has no .to_csv --
# this line would raise AttributeError if ever executed.
dist.to_csv(args.projdir+'/distances/'+'%s.diversity' % f.split('/')[-1].replace('.freq',''), sep='\t')
# To do : divide by the minimal horizontal coverage of each pairwise comparison.
# dist = dist / min(sample_i.1X_horizontal_coverage, sample_j.1X_horizontal_coverage)
# NOTE(review): second loop -- presumably the ADDED post-commit version,
# with per-pair coverage correction -- confirm against the repo.
for f in allFreq:
# Get species name :
species = int(f.split('/')[-1].split('.')[0])
# Get species genome length
# Sum of contig lengths (BED column 2) for this species.
genome_length = bedfile_tab.loc[str(species), 2].sum()
# Read the freq table as data.frame
data = pd.read_table(f, index_col=0, na_values=['-1'])
pre_index = [i.split(':') for i in list(data.index)]
# Setting index for each position
index1 = [item[0] + ':' + item[1] + ':' + item[2] for item in pre_index]
# Setting index for Non-synonymous vs Synonymous
index2 = [item[4] for item in pre_index]
indexes = [index1, index2]
index = pd.MultiIndex.from_arrays(indexes, names=['position', 'significance'])
data = data.set_index(index)
# Correcting the coverage by the genome length observed in each pairwise comparison
# For each sample pair, scale by the smaller horizontal-coverage
# percentage converted to covered bases (percent * genome_length / 100).
correction_cov = [[min(cov_perc.loc[species, i], cov_perc.loc[species, j]) * genome_length / 100 for i in data.columns] for j in data.columns]
dist = [[genetic_distance(data.iloc[:, [i]], data.iloc[:, [j]]) / correction_cov[i][j] for i in range(j + 1)] for j in range(len(data.columns))]
# NOTE(review): same issue as above -- `dist` is a list of lists here, so
# .to_csv would raise AttributeError; a pd.DataFrame wrap seems intended.
dist.to_csv(args.projdir + '/distances/' + '%s.diversity' % species, sep='\t')
if __name__ == "__main__":
# print("globals: {}".format(globals()))
......
......@@ -593,23 +593,23 @@ int main(int argc, char *argv[])
bam_destroy1(b);
free(entireChr);
fprintf(stdout,"\nDuplicates:%u \n", duplicates);
//fprintf(stdout,"\nDuplicates:%u \n", duplicates);
//Print header for next table in output file
fprintf(outputFile,"\nCov*X\tPercentage\tNr. of bases\n");
fprintf(stdout,"Total genome lenght %lu \n", totalGenomeLength);
//fprintf(stdout,"Total genome lenght %lu \n", totalGenomeLength);
//Compute procentages of genome cover!.
int i;
for (i=0; i<=userOpt.maxCoverage; ++i) {
if (i == 0) {
//Non-covered!
fprintf(stdout,"%3.2f of genome has not been covered\n", (double)(coverageHist[i])/totalGenomeLength*100);
//fprintf(stdout,"%3.2f of genome has not been covered\n", (double)(coverageHist[i])/totalGenomeLength*100);
} else {
uint64_t coverage = 0;
//All that has been covered i, had been covered i+1, i+2 and so on times. Thus, do this addition
for (int x = i; x<=userOpt.maxCoverage; ++x) coverage += coverageHist[x];
fprintf(stdout,"%3.2f of genome has been covered at least %dX \n", (double)(coverage)/totalGenomeLength*100, i);
//fprintf(stdout,"%3.2f of genome has been covered at least %dX \n", (double)(coverage)/totalGenomeLength*100, i);
fprintf(outputFile,"%d\t%3.5f\t%lu\n",i, (double)(coverage)/totalGenomeLength*100, coverage);
}
......@@ -632,7 +632,7 @@ int main(int argc, char *argv[])
fprintf(outputFile, "Number of interchromosomal pairs: %u\n",interChr);
}
printf("Out of %u reads, you have %3.5f unmapped reads\n and %3.5f sub-par quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality);
//printf("Out of %u reads, you have %3.5f unmapped reads\n and %3.5f sub-par quality mappings\n", totalNumberOfReads ,procentageOfUnmapped, procentageOfZeroQuality);
free(coverageHist);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment