Commit 444b593a authored by Paul Costea's avatar Paul Costea
Browse files

cleaned qaTools random stuff

parent 379c46b9
......@@ -3,48 +3,17 @@ include ../../SETUPFILE
CC=g++
# Flags for the compiles
CFLAGS=-c -Wno-sign-compare
#Samtools path, to samtools 1.3!
#SAMTOOLS=/g/bork3/x86_64/src/samtools-1.3
#Other stuff
HTSLIB=$(SAMTOOLS)/htslib-1.3
INC=-Iinclude/ -I$(SAMTOOLS) -I$(HTSLIB)
LIBS=-L$(SAMTOOLS) -L$(HTSLIB) -lbam -lhts -pthread -lz
INC= -I$(SAMTOOLS) -I$(HTSLIB)
LIBS= -L$(SAMTOOLS) -L$(HTSLIB) -lbam -lhts -pthread -lz
all: removeUnmapped qaCompute computeInsertSizeHistogram doBWAQualTrimming
fixPairedEnd: fixPairedEnd.o
$(CC) fixPairedEnd.o -o fixPairedEnd $(LIBS)
removeUnmapped: removeUnmapped.o
$(CC) removeUnmapped.o -o removeUnmapped $(LIBS)
all: qaCompute
qaCompute: qaCompute.o
$(CC) qaCompute.o -o qaCompute $(LIBS)
computeInsertSizeHistogram: computeInsertSizeHistogram.o
$(CC) computeInsertSizeHistogram.o -o computeInsertSizeHistogram $(LIBS)
doBWAQualTrimming: doBWAQualTrimming.o
$(CC) doBWAQualTrimming.o -o doBWAQualTrimming -lz
fixPairedEnd.o: fixPairedEnd.c
$(CC) $(INC) $(CFLAGS) fixPairedEnd.c
removeUnmapped.o: removeUnmapped.c
$(CC) $(INC) $(CFLAGS) removeUnmapped.c
qaCompute.o: qaCompute.c
$(CC) $(INC) $(CFLAGS) -o qaCompute.o qaCompute.c
computeInsertSizeHistogram.o: computeInsertSizeHistogram.c
$(CC) $(INC) $(CFLAGS) computeInsertSizeHistogram.c
doBWAQualTrimming.o: doBWAQualTrimming.c
$(CC) $(INC) $(CFLAGS) doBWAQualTrimming.c
clean:
rm -rf *o removeUnmapped
rm -rf *o qaCompute
rm -rf *o computeInsertSizeHistogram
rm -rf *o doBWAQualTrimming
rm -rf *o fixPairedEnd
A couple of useful qa tools for sequencing data.
I. Setup:
Change Makefile $SAMTOOLS path to your samtools instalation path.
If you don't have samtools, you can find it here:
http://samtools.sourceforge.net/
Just run make and you should be done!
II. Tools:
1. qaCompute
Computes normal and span coverage from a bam/sam file.
Also counts unmapped and sub-par quality reads.
Parameters:
m - Compute median coverage for each contig/chromosome.
Will make running a bit slower. Off by default.
q [INT] - Quality threshold. Any read with a mapping quality under
INT will be ignored when computing the coverage.
NOTE: bwa outputs mapping quality 0 for reads that map with
equal quality in multiple places. If you want to condier this,
set q to 0.
d - Print coverage histrogram over each individual contig/chromosome.
These details will be printed in file <output>.detail
p [INT] - Print coverage profile to bed file, averaged over given window size.
i - Silent run. Will not print running info to stdout.
s [INT] - Compute span coverage. (Use for mate pair libs)
Instead of actual read coverage, using the options will consider
the entire span of the insert as a read, if insert size is
lower than INT.
For an accurate estimation of span coverage, I recommend
setting an insert size limit INT around 3*std_dev of your lib's
insert size distribution.
c [INT] - Maximum X coverage to consider in histogram.
h [STR] - Use different header.
Because mappers sometimes break the headers or simply don't output them,
this is provieded as a non-kosher way around it. Use with care!
For more info on the parameteres try ./qaCompute
2. removeUnmapped
Remove unmapped and sub-par quality reads from a bam/sam file.
For more info on the parameters try ./removeUnmapped
3. computeInsertSizeHistogram
Compute the insert size distribution from a bam/sam file.
For more info on the parameters try ./computeInsertSizeHistogram
\ No newline at end of file
/*
qaTools - Just more qa tools.
Copyright (C) 2011 P. Costea(paul.igor.costea@embl.de)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <string>
#include "sam.h"
using namespace std;
#define MIN(x,y) \
(x < y) ? (x) : (y)
#define MAX(x,y) \
(x > y) ? (x) : (y)
/**
* Check if read is properly mapped
* @return true if read mapped, false otherwise
*/
static bool is_mapped(const bam1_core_t *core, int minQual)
{
if ((core->flag&BAM_FUNMAP) || (int(core->qual) < minQual)) {
return false;
}
return true;
}
static int print_usage()
{
fprintf(stderr, "\n");
fprintf(stderr, "Program: computeInsertSizeHistogram \n");
fprintf(stderr, "Version: 1.2\n");
fprintf(stderr, "Contact: Paul Costea <paul.igor.costea@embl.de>\n\n");
fprintf(stderr, "Usage: computeInsertSizeHistogram [options] <in.bam/sam> <out.hist>\n\n");
fprintf(stderr, "Options: -q INT minimum quality mapping to consider in counting distribution [60]\n");
fprintf(stderr, " -l INT maximum insert size (size of distribution) [1000]\n");
fprintf(stderr, " -v split histogram by FR,RF,T\n");
fprintf(stderr, " -s assume file is sorted");
fprintf(stderr, "\n");
fprintf(stderr, "Note: Input file must contain paires as subsequent entries, unless -s was specified \n\n");
return 1;
}
/**
* Main of app
*/
int main(int argc, char *argv[])
{
samfile_t *fp;
FILE * out = NULL;
//Minimum quality to consider mapping
int minQual = 60;
int max = 1000;
bool sorted = false;
bool verbose = false;
int arg;
//Get args
while ((arg = getopt(argc, argv, "q:l:vs")) >= 0) {
switch (arg) {
case 'q': minQual = atoi(optarg); break;
case 'l': max = atoi(optarg); break;
case 's': sorted = true; break;
case 'v': verbose = true; break;
default:
fprintf(stderr,"Read wrong arguments! \n");
break;
}
}
if (argc-optind != 2) {
print_usage();
//Give up
return -1;
}
int32_t **hist = NULL;
if (verbose) {
//Need a bit more memory...
hist = new int32_t*[3];
hist[0] = new int32_t[max];
hist[1] = new int32_t[max];
hist[2] = new int32_t[max];
} else {
hist = new int32_t*[1];
hist[0] = new int32_t[max];
for (int i=0; i<max;++i) hist[0][i]=0;
}
string alignFile(argv[optind]);
//Check if this is sam or bam file
string flag = "r";
if (alignFile.substr(alignFile.size()-3).compare("bam") == 0) {
//BAM file!
flag += "b";
}
if ((fp = samopen(alignFile.c_str(), flag.c_str() , 0)) == 0) {
fprintf(stderr, "Fail to open file %s\n", alignFile.c_str());
return 1;
}
if ((out = fopen(argv[optind+1], "w")) == 0) {
fprintf(stderr, "Filed to create output file %s\n", argv[optind+1]);
return 1;
}
bam1_t *b = bam_init1();
bam1_t *c = bam_init1();
//Test for proper "sorting" (we can afford to chuck the first two reads
samread(fp, b);
samread(fp, c);
if (string(bam1_qname(b)).compare(string(bam1_qname(c))) == 0) {
if (sorted) {
fprintf(stderr,"File doesn't seem to be sorted...Don't use -s \n");
}
} else {
if (!sorted) {
fprintf(stderr,"File seems to be sorted...Use -s \n");
}
}
while (samread(fp,b) >= 0) {
if (!sorted) {
samread(fp,c);
}
if (is_mapped(&b->core, minQual)) { //First is mapped...
if (sorted) {//This is a sorted file! Forget the second one, it's not here
if (!(b->core.flag&BAM_FMUNMAP) //Mate is also mapped!
&& (b->core.pos < b->core.mpos)) {//Count pair only once! Thus, do this only for leftmost in pair
int i_size = 0;
if (b->core.tid != b->core.mtid)
i_size = 1;
else
i_size = abs(b->core.pos - b->core.mpos)+b->core.l_qseq;
if (i_size < max) {
if (verbose) {
if (b->core.tid != b->core.mtid) {//Forget this one...
++hist[0][i_size];
continue;
}
//Check FR, RF, T
//Get flag of the leftmost read!
int flag = b->core.flag;
if ( (flag&(BAM_FMREVERSE)) && !(flag&BAM_FREVERSE) ) {//ForwardReverse
++hist[0][i_size];
} else if ( (flag&BAM_FREVERSE) && !(flag&BAM_FMREVERSE) ){//ReverseForward!
++hist[2][i_size];
} else if (((flag&(BAM_FMREVERSE|BAM_FREVERSE))==(BAM_FMREVERSE|BAM_FREVERSE))
|| ((flag&(BAM_FMREVERSE|BAM_FREVERSE))==0)) {//T
++hist[1][i_size];
} else {
fprintf(stderr,"Cannot clasify %s %d\n",bam1_qname(b),b->core.flag);
}
} else {
++hist[0][i_size];
/*if (abs(b->core.isize)==1 && b->core.tid == b->core.mtid) {
fprintf(stderr,"%s\n",bam1_qname(b));
}*/
}
}
}
} else if (is_mapped(&c->core,minQual)) {//This is a consecutive entry list!
//Compute the "actual" insert size. SAM is being funny here.
int32_t i_size = 0;
if (b->core.tid != c->core.tid) //Diff chromosomes!
i_size = 1;
else {
int32_t left = MIN(b->core.pos,c->core.pos);
int32_t right = MAX(((c->core.pos)+(c->core.l_qseq)),((b->core.pos)+(b->core.l_qseq)));
i_size = right - left;
/*if (i_size < 300) {
fprintf(stderr,"%s\t%d\t%d\n",bam1_qname(b),i_size,b->core.isize);
}*/
/* Adding the length of the b sequence is not necessarily the best way to go,
because the other read may actually have a different length. However, this
should not prove to be a problem overall.
*/
}
if (abs(i_size) < max) {
if (verbose) {
if (b->core.tid != c->core.tid) {//Forget this one...
++hist[0][i_size];
continue;
}
//Check FR, RF, T
//Get flag of the leftmost read!
int flag = (b->core.pos < c->core.pos) ? (b->core.flag) : (c->core.flag);
if ( (flag&(BAM_FMREVERSE)) && !(flag&BAM_FREVERSE) ) {//ForwardReverse
++hist[0][i_size];
} else if ( (flag&BAM_FREVERSE) && !(flag&BAM_FMREVERSE) ){//ReverseForward!
//fprintf(stderr,"%s %d\n",bam1_qname(b),b->core.flag);
++hist[2][i_size];
} else if (((b->core.flag&(BAM_FMREVERSE|BAM_FREVERSE))==(BAM_FMREVERSE|BAM_FREVERSE))
|| ((b->core.flag&(BAM_FMREVERSE|BAM_FREVERSE))==0)) {//T
++hist[1][i_size];
} else {
fprintf(stderr,"Cannot clasify %s %d\n",bam1_qname(b),b->core.flag);
}
} else {
++hist[0][i_size];
}
}
}
}
}
bam_destroy1(b);
bam_destroy1(c);
if (verbose) {
//Print header!
fprintf(out,"FR\tT\tRF\n");
for (int i=0; i<max; ++i) {
fprintf(out,"%d\t%d\t%d\n",hist[0][i],hist[1][i],hist[2][i]);
}
} else {
for (int i=0; i<max; ++i)
fprintf(out,"%d\n", hist[0][i]);
}
samclose(fp);
fclose(out);
return 0;
}
/*
qaTools - Just more qa tools.
Copyright (C) 2011 P. Costea(paul.igor.costea@scilifelab.se)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "pol_fastq.h"
static void usage()
{
fprintf(stderr, "\nVersion: 1.4\n");
fprintf(stderr, "Contact: Paul Costea <paul.igor.costea@scilifelab.se>\n\n");
fprintf(stderr, "Usage: ");
fprintf(stderr, "doBWAQualTrimming [Options] <in.fastq> <out.fastq>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -p Input is a paierd library\n");
fprintf(stderr, " Use: doBWAQualTrimming [Options] <in1.fastq> <in2.fastq> <out1.fastq> <out2.fastq>\n");
fprintf(stderr, " -s Use solexa qualitites\n");
fprintf(stderr, " -v Write discarded sequences\n");
fprintf(stderr, " -a Write fasta output\n");
fprintf(stderr, " -f Replace discarded sequences with N's\n");
fprintf(stderr, " -n Only remove N's from sequence. -q will be ignored.\n");
fprintf(stderr, " -q [INT] Quality threshold [20]\n");
fprintf(stderr, " -l [INT] Lenght cutoff [54]\n\n");
}
int main(int argc, char* argv[])
{
char arg;
int qual = 20;
int min_length = 54;
bool onlyN = false;
bool isPairedLib = false;
bool writeDropped = false;
bool isSolexa = false;
bool replaceWithFake = false;
bool writeFasta = false;
//Get args
while ((arg = getopt(argc, argv, "q:l:psafvn")) >= 0) {
switch (arg) {
case 'q': qual = atoi(optarg); break;
case 'l': min_length = atoi(optarg); break;
case 'p': isPairedLib = true; break;
case 'n': onlyN = true; break;
case 'v': writeDropped = true; break;
case 's': isSolexa = true; break;
case 'a': writeFasta = true; break;
case 'f': replaceWithFake = true; break;
default: fprintf(stderr,"Wrong parameter input: %s\n",optarg); break;
}
}
if (argc-optind != ((isPairedLib) ? 4 : 2)) {
usage();
return 1;
}
int p = optind;
FILE* fast_q1 = fopen(argv[p],"r");
if (fast_q1 == NULL) {
printf("Cannot open file %s\n", argv[p]);
return 1;
}
p++;
FILE* fast_q2 = NULL;
if (isPairedLib) {
fast_q2 = fopen(argv[p],"r");
if (fast_q2 == NULL) {
printf("Cannot open file %s\n", argv[p]);
return 1;
}
p++;
}
FILE* fast_q_out1 = fopen(argv[p],"w");
if (fast_q_out1 == NULL) {
printf("Cannot create output file %s\n", argv[p]);
return 1;
}
p++;
FILE* drop1 = NULL;
FILE* drop2 = NULL;
FILE* fast_q_out2 = NULL;
if (isPairedLib) {
fast_q_out2 = fopen(argv[p],"w");
if (fast_q_out2 == NULL) {
printf("Cannot create output file %s\n", argv[p]);
}
}
if (writeDropped) {//Open files for writing dropped reads
if (isPairedLib) {
drop1 = fopen("dropped1.out","w");
drop2 = fopen("dropped2.out","w");
} else {
drop1 = fopen("dropped.out","w");
}
}
long dropped = 0;
long count = 0;
pol_util::FastqEntry* entry1 = NULL;
pol_util::FastqEntry* entry2 = NULL;
bool keepGoing = true;
while (keepGoing) {
++count;
entry1 = pol_util::FastqEntry::readEntry(fast_q1);
keepGoing = (entry1 != NULL);
if (isPairedLib)
entry2 = pol_util::FastqEntry::readEntry(fast_q2);
if (!keepGoing)
break;
if (onlyN == true) {
bool rem1,rem2;
rem1 = entry1->removePoly('N',min_length);
if (isPairedLib){
rem2 = entry2->removePoly('N',min_length);
}
if ((!rem1) || (isPairedLib && !rem2)) {
++dropped;
if (writeDropped) {
entry1->write(drop1);
if (isPairedLib) {
entry2->write(drop2);
}
}
delete entry1;
delete entry2;
continue;
}
}else {
bool trim1,trim2;
trim1 = entry1->trim(qual,min_length,isSolexa);
if (isPairedLib)
trim2 = entry2->trim(qual,min_length,isSolexa);
if (!trim1 || ( isPairedLib && !trim2 )) {
++dropped;
if (writeDropped) {
entry1->write(drop1);
if (isPairedLib) {
//Write second entry
entry2->write(drop2);
}
}
if (replaceWithFake) {//Still print these to the output fastq
if (!trim1)
entry1->writeFake(fast_q_out1);
if (isPairedLib && !trim2)
entry2->writeFake(fast_q_out2);
}
delete entry1;
delete entry2;
continue;
}
}
//Write them to the output files.
if (writeFasta) {
entry1->writeFasta(fast_q_out1);
} else {
entry1->write(fast_q_out1);
}
if (isPairedLib) {
//Write second entry
if (writeFasta) {
entry2->writeFasta(fast_q_out2);
} else {
entry2->write(fast_q_out2);
}
}
delete entry1;
delete entry2;
}
fclose(fast_q1);
fclose(fast_q_out1);
if (isPairedLib) {
fclose(fast_q2);
fclose(fast_q_out2);
}
if (writeDropped) {
fclose(drop1);
if (isPairedLib)
fclose(drop2);
}
printf("\n");
printf("%ld reads have been dropped!\n",dropped);
printf("You just lost about %3.2f procent of your data!\n",(double(dropped)/count)*100);
}
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOS
*/
/*Contact <paul.igor.costea@scilifelab.se>*/
#include <string>
#include "util.h"
namespace pol_util
{
///> Defines
#define MAX_LINE 1000
/**
* @description Fastq entry reading/writing + manipulation class
*/
class FastqEntry{
public:
/**
* @brief Entry reader and generator. The only way to get a new instance of a FastqEntry.