call_vC.cpp 18.9 KB
Newer Older
Robin Erich Muench's avatar
Robin Erich Muench committed
1
2
3
4
5
6
#include <iostream>
#include <unistd.h>
//getopt
//#include <ctype.h>
//#include <unistd.h>
//Other
Robin Erich Muench's avatar
Robin Erich Muench committed
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <sstream>
#include <vector>
#include <list>
#include <map>
#include <gene.h>
//Get the boost interval support
#include <boost/icl/discrete_interval.hpp>
#include <boost/icl/closed_interval.hpp>
#include <boost/icl/split_interval_map.hpp>
#include <boost/lexical_cast.hpp>
using namespace boost::icl;

23
24
25
26
27
28
29
30
31
32
33
struct SNPCallOptions {
    SNPCallOptions()
        :min_coverage(4)
        ,calling_threshold(4)
        ,calling_min_fraction(0.01)
    { }

    int min_coverage;
    int calling_threshold;
    double calling_min_fraction;
};
Robin Erich Muench's avatar
Robin Erich Muench committed
34
35
36

#define READ_BUFFER_SIZE 10000000

Robin Erich Muench's avatar
Robin Erich Muench committed
37

Robin Erich Muench's avatar
Robin Erich Muench committed
38
39
//#define DEBUG 1

Robin Erich Muench's avatar
Robin Erich Muench committed
40
41
42
43
44
//TODO: make nice DEBUG switch (-d flag)
void use_debug() {
//	#define DEBUG 1
}

Robin Erich Muench's avatar
Robin Erich Muench committed
45
46
47
48
49
50
struct filePosition {
  unsigned long start;
  unsigned long length;
  unsigned long lineCount;
};

Robin Erich Muench's avatar
Robin Erich Muench committed
51
52

/*std::string usage() {
Robin Erich Muench's avatar
Robin Erich Muench committed
53
54
  return "Run as:\nsnpCall <reference_file> <geneAnnotation> individual_SNP_output";
}
Robin Erich Muench's avatar
Robin Erich Muench committed
55
56
57
58
59
60
61
62
63
*/
static int print_usage()
{
  fprintf(stderr, "\n");
  fprintf(stderr, "metaSNP --- metagenomic SNP caller\n\n");
  fprintf(stderr, "Version: 1.0\n");
  fprintf(stderr, "Contact: Paul Costea <costea@embl.de>,\n\t Robin Muench <rmuench@embl.de>\n\n");
  fprintf(stderr, "Usage:   snpCall [options] <stdin.mpileup> \n");
  fprintf(stderr, "Options: \n");
Robin Erich Muench's avatar
Robin Erich Muench committed
64
//  fprintf(stderr, "	    -b,		TODO: list of input BAM filenames, one per line (for header) \n ");
Robin Erich Muench's avatar
Robin Erich Muench committed
65
66
67
68
69
70
71
72
  fprintf(stderr, "	    -f,		faidx indexed reference metagenome \n ");
  fprintf(stderr, "	    -g,		gene annotation file [NULL].\n");
  fprintf(stderr, "	    -i,		individual SNPs output file [NULL].\n\n");
//  fprintf(stderr, "	    -d,-a,	for debugging only \n\n");
  fprintf(stderr, "Note: Expecting samtools mpileup as standard input\n\n");
  return 1;
}

Robin Erich Muench's avatar
Robin Erich Muench committed
73
74
75
76
77
78

std::map<std::string,Genome*> mapGenomes;
std::map<std::string,filePosition> mapGenes;
//Here's where we save the genes
split_interval_map<long,GeneDef> geneIntervals;

Robin Erich Muench's avatar
Robin Erich Muench committed
79
/**
Luis Pedro Coelho's avatar
Luis Pedro Coelho committed
80
81
82
83
84
 * @brief Fast thread-safe string tokenizer.
 *
 * It ignores any spaces at the beginning of the string.
 *
 * Returns position after the token separator
Robin Erich Muench's avatar
Robin Erich Muench committed
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
 */
const char *toksplit(const char *src, /* Source of tokens */
		     char tokchar, /* token delimiting char */
		     char *token, /* receiver of parsed token */
		     size_t lgh) /* length token can receive */
/* not including final '\0' */
{
  if (src) {
    while (' ' == *src) src++;
    while (*src && (tokchar != *src)) {
      if (lgh) {
	*token++ = *src;
	--lgh;
      }
      src++;
    }
    if (*src && (tokchar == *src)) src++;
  }
  *token = '\0';
  return src;
}

Robin Erich Muench's avatar
Robin Erich Muench committed
107
108
109
110
111
/**
 * @brief Get a decent index of the genes for fast access and future annotation of SNPs.
 */
bool indexGenomeAndGenes(FILE* refGenome, FILE* refGenes) {
  char line[10000];
Robin Erich Muench's avatar
Robin Erich Muench committed
112
  char *tok = new char[10000];
Robin Erich Muench's avatar
Robin Erich Muench committed
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
  /*===================================
    We have the reference genomes indexed. Let's get the same for the gene definitions. 
    Would be nice if they also has a sort of fai indexing, but that's not in the plan right now.
   ===================================*/

  bool done = false;
  
  //Get the header and remember that length
  unsigned long filePosStart = 0;
  unsigned long fileConsumed = 0;
  unsigned long lineCount = 0;
  std::string name = "";
  fgets(line,10000,refGenes);
  filePosStart = strlen(line);//This includes the \n
  long llen = 0;
  filePosition p1;
  while (fgets(line,10000,refGenes)) {
    llen = strlen(line);
Robin Erich Muench's avatar
Robin Erich Muench committed
131
    const char* l = toksplit(line,'\t',tok,10000);
Robin Erich Muench's avatar
Robin Erich Muench committed
132
    int pos = 0;
Robin Erich Muench's avatar
Robin Erich Muench committed
133
    while (*l) {
Robin Erich Muench's avatar
Robin Erich Muench committed
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
      if (pos == 2) {//Thie is the name!                                                                      
	if (name.compare("") == 0) {//This is the first one
	  name = tok;
	} else if (name.compare(tok) != 0) {//Boom, this is a new one
	  p1.lineCount = lineCount;
	  lineCount = 0;
	  p1.start = filePosStart;
	  filePosStart += fileConsumed;
	  fileConsumed = 0;
	  mapGenes[name] = p1;
	  name = tok;
	  //fprintf(stderr,"%s ",name.c_str());
	}
	break;
      }	      
      ++pos;
Robin Erich Muench's avatar
Robin Erich Muench committed
150
      l = toksplit(l,'\t',tok,10000);
Robin Erich Muench's avatar
Robin Erich Muench committed
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
    }
    fileConsumed += llen;
    lineCount += 1;
  }
  //Add the last one!
  p1.start = filePosStart;
  p1.lineCount = lineCount;
  mapGenes[name] = p1;

  //==================================================================
  //==================================================================

  name = "";
  std::string genome = "";
  bool skip = false;
  //Make the map                                                        
  while (fgets(line,10000,refGenome)) {//Get name, start, length
    line[strlen(line)-1] = '\0';
    if (line[0] == '>') {//New genome
      //Do we already have one?                                                                                                                                                                            
      if (genome.length() > 0 && !skip) {//Save it!
	mapGenomes[name] = new Genome(genome);
	//fprintf(stderr,"Loaded: %s|\n",name.c_str());
	genome = "";
      }
      name = line;
      name = name.erase(0,1);//Remove the >
      if (mapGenes.find(name) == mapGenes.end()){//This genome has no gene definitions, don't load it!
	skip = true;
      } else {
	skip = false;
      }
    } else {
      if (skip)
	continue;
      //Add to genome
      genome += line;
    }
  }
  //Add last genome
  mapGenomes[name] = new Genome(genome);
  //Let the user know we've loaded the genomes. Calling gets started now.
  fprintf(stderr,"Genomes loaded!\n");

Robin Erich Muench's avatar
Robin Erich Muench committed
195
  delete[] tok;
Robin Erich Muench's avatar
Robin Erich Muench committed
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
  return true;
}

/**
 *  @brief Load and encode the genomes to save some space. Probably overkill.
 */

bool loadGenome(std::string gName, FILE* refGenes, bool* hasGenes) {
  //Drop old one
  geneIntervals.clear();

  if (mapGenes.find(gName) == mapGenes.end()) {//We don't have genes in this genome?
    *hasGenes = false;
    //And, just return
    return true;
  }
  *hasGenes = true;

  char line[10000];
Robin Erich Muench's avatar
Robin Erich Muench committed
215
  char *tok = new char[10000];
Robin Erich Muench's avatar
Robin Erich Muench committed
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
  
  if (mapGenomes.find(gName) == mapGenomes.end()) {
    fprintf(stderr,"Weird...%s\n",gName.c_str());
  }

  //Now, most of the read genome is useless
  //Read the genes and discard "junk" dna
  long start,end,len;
  char strand;

  filePosition p = mapGenes[gName];
  //Scroll file to "start" of gene definitions
  if (fseek(refGenes,p.start,SEEK_SET) != 0) {
    fprintf(stderr,"File seek failed %ld \n",p.start);
    return false;
  }
  long lc = 0;
  std::string geneName = "";
  while (lc < p.lineCount) {
    if (!fgets(line,10000,refGenes)) {
      fprintf(stderr,"Read failed. Seeking file to %ld\n",p.start);
    }

    ++lc;
Robin Erich Muench's avatar
Robin Erich Muench committed
240
    const char* l = toksplit(line,'\t',tok,10000);
Robin Erich Muench's avatar
Robin Erich Muench committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
    int pos = 0;
    while (tok) {
      if (pos == 1) {
	geneName = tok;
      }
      if (pos == 2) {//This is the name!
	if (gName.compare(tok) != 0) {
	  fprintf(stderr,"Reading wrong gene defintions");
	  break;
	}
      } else if (pos == 5) {//Length
	len = atol(tok);
	/*if (len % 3 != 0) {
	  fprintf(stderr,"This can't really code for anything!!!: %s\n%s\n",geneName.c_str(),tok);
	  }*/
      } else if (pos == 6) {//Start
	start = atol(tok)-1;//These are 1 based!!!!

      } else if (pos == 7) {//End
	end = atol(tok)-1;//These are 1 based!!!!

      } else if (pos == 8) {//Strand
	strand = tok[0];
	break;
      }
      ++pos;
Robin Erich Muench's avatar
Robin Erich Muench committed
267
      l = toksplit(l,'\t',tok,10000);
Robin Erich Muench's avatar
Robin Erich Muench committed
268
269
270
271
272
273
274
275
276
277
278
279
280
    }

    Gene g(start,end,geneName,strand);
    //Copy the seq into gene and add to genome
    if (start > end) {//goes around!
      fprintf(stderr,"This gene goes around :(.\nPretending we didn't see it.\n");
    } else {
      discrete_interval<long> gene_interval = construct<discrete_interval<long> >(start,end,interval_bounds::closed());
      GeneDef gD(g);
      geneIntervals += make_pair(gene_interval,gD);
    }
  }

Robin Erich Muench's avatar
Robin Erich Muench committed
281
  delete[] tok;
Robin Erich Muench's avatar
Robin Erich Muench committed
282
283
284
285
286
287
288
289
290
291
292
293
  return true;
}

int getSum(std::map<char,long*>& map, std::string c, int sample=0) {
  int sum = 0;
  for (int i=0; i<c.length(); ++i) {
    sum += map[c[i]][sample];
  }
  return sum;
}

/**
Luis Pedro Coelho's avatar
Luis Pedro Coelho committed
294
 * @brief Return a reverse complemented copy of the input
Robin Erich Muench's avatar
Robin Erich Muench committed
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
 */

std::string revComplement(std::string codon) {
  std::string res = "";
  for (int i = codon.length()-1; i>=0; --i) {
    if (codon[i] == 'A') {
      res += 'T';
    } else if (codon[i] == 'T') {
      res += 'A';
    } else if (codon[i] == 'C') {
      res += 'G';
    } else if (codon[i] == 'G') {
      res += 'C';
    }
  }
  return res;
}

std::string getCoverageString(std::map<char,long*>& map, int nrSamples, std::string c) {
  char* res = new char[nrSamples*10];
  int pos = 0;
  //Get sum for each sample
  for (int i=1; i<=nrSamples; ++i) {
    int cov = getSum(map,c,i);
    pos += sprintf(res+pos,"%d|",cov);
    if (pos >= nrSamples*10) {
      fprintf(stderr,"Just corrupted the heap...\n");
    }
  }
  res[pos-1]='\0';
  std::string r = res;
  delete[] res;
  return r;
}

/**
Luis Pedro Coelho's avatar
Luis Pedro Coelho committed
331
 * @brief Main function
Robin Erich Muench's avatar
Robin Erich Muench committed
332
333
334
 */
int main(int argc, char** argv) {

Robin Erich Muench's avatar
Robin Erich Muench committed
335
336
337
  FILE* genomes = NULL;
  FILE* genes = NULL;
  FILE* individualFile = NULL;
Robin Erich Muench's avatar
Robin Erich Muench committed
338

Robin Erich Muench's avatar
Robin Erich Muench committed
339
340
341
342
  int aflag = 0;	// not used
  char *inputbams = NULL;
  int index;
  int c;
Robin Erich Muench's avatar
Robin Erich Muench committed
343

Robin Erich Muench's avatar
Robin Erich Muench committed
344
345
346
  opterr = 0;

  char* line = new char[READ_BUFFER_SIZE];
Robin Erich Muench's avatar
Robin Erich Muench committed
347
348
349

  long count = 0;
  long chr = 0;
Robin Erich Muench's avatar
Robin Erich Muench committed
350

Robin Erich Muench's avatar
Robin Erich Muench committed
351
352
  //Define the char map!
  std::map<char,long*> map;
353
  SNPCallOptions options;
Robin Erich Muench's avatar
Robin Erich Muench committed
354

355
  while ((c = getopt (argc, argv, "hdab:f:g:i:c:p:t:")) != -1)
Robin Erich Muench's avatar
Robin Erich Muench committed
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
    switch (c)
      {
      case 'h':		// help message
        print_usage();
        return -1;
      case 'a':		// unused flag
        aflag = 1;
        break;
      case 'd':         // Debug switch not working!
        if (optopt == 'd')
	  use_debug();
        break;
      case 'b':		// TODO: list of bam files for header generation [optional]
        inputbams = optarg;
        break;           
      case 'f':		//reference fasta file [required]
	genomes = fopen(optarg, "r");
        if (genomes == NULL) {
          fprintf(stderr,"Cannot open %s\n",optarg);
          return -1;
	}
	break;
      case 'g':		//gene annotation file [optional] (TODO format?)
        genes = fopen(optarg, "r");
        if (genes == NULL) {
          fprintf(stderr,"Cannot open %s\n",optarg);
          return -1;
	}
        break;
      case 'i':		// output filename [required]
        individualFile = fopen(optarg, "w");
        if (individualFile == NULL) { 
          fprintf(stderr,"Cannot open %s\n",optarg);
          return -1;
	}
//        printf("Writing individual SNPs to %s \n", optarg);
392
393
394
395
396
397
398
399
400
401
        break;
      case 'c':
        options.min_coverage = atol(optarg);
        break;
      case 'p':
        options.calling_min_fraction = atof(optarg);
        break;
      case 't':
        options.calling_threshold = atol(optarg);
        break;
Robin Erich Muench's avatar
Robin Erich Muench committed
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
      case '?':
        if (( optopt == 'f') || ( optopt == 'g') || ( optopt == 'i') ){
	  if ( optopt == 'f'){
	    fprintf (stderr, "Option -%c requires a reference file.\n", optopt);
	  }
          if ( optopt == 'g'){
            fprintf (stderr, "Option -%c requires an annotation file.\n", optopt);
	  }
          if ( optopt == 'i'){
            fprintf (stderr, "Option -%c requires an output filename.\n", optopt);
	  }
        } else if (isprint (optopt)){
          fprintf (stderr, "Unknown option `-%c'.\n", optopt);
        } else {
          fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); //e.g. umlauts
        return 1;
        } 
      default:
        abort ();
      }

// Fish out non option arguments
  for (index = optind; index < argc; index++){
    printf ("Non-option argument %s\n", argv[index]);
    return 0;
  }



//TODO: do the counting and generate header from bam file
Robin Erich Muench's avatar
Robin Erich Muench committed
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
//fgets(line,READ_BUFFER_SIZE,stdin) //gets the very first line of the pileup (drops it!)

  fgets(line,READ_BUFFER_SIZE,stdin);
  unsigned int number_of_tabs = 0;
    for (int i = 0; i < strlen(line); i++){
      if ('\t' == line[i]){
        ++number_of_tabs;
    	}
    }
  //+1 (last is newline) -3 (Omit first three: geneID \t SEQ \t REF \t START)
  int nrSamples = int(number_of_tabs+1-3)/3; // replacing atoi(argv[3]);


  fprintf(stderr,"Identified %d samples\n",nrSamples);
  map['.'] = new long[nrSamples+1];
  map[','] = new long[nrSamples+1];
  map['a'] = new long[nrSamples+1];
  map['c'] = new long[nrSamples+1];
  map['t'] = new long[nrSamples+1];
  map['g'] = new long[nrSamples+1];
  map['A'] = new long[nrSamples+1];
  map['C'] = new long[nrSamples+1];
  map['T'] = new long[nrSamples+1];
  map['G'] = new long[nrSamples+1];


Robin Erich Muench's avatar
Robin Erich Muench committed
458
459
460
461
462
463
464
465
466
  //Now read and index genomes and genes if reference genome and annotationfile are given
  if ( (genomes != NULL) && (genes != NULL) ) {
	fprintf(stderr,"Found reference genomes and annotation file.\nLoading Genomes...\n");
	indexGenomeAndGenes(genomes,genes);
	fclose(genomes);
  }


std::map<std::string, char> CodonMap(codons, codons + sizeof codons / sizeof codons[0]);
Robin Erich Muench's avatar
Robin Erich Muench committed
467
468
469
470
471
472
473
474
475
476
477
478
479
480


  //Reset file
  //fseek(in,0,SEEK_SET);
  std::map<char,long*>::iterator it;
  std::string name = "";
  int cov = 0;
  std::string num;
  int skip = 0;
  int leftoverSkipping = 0;
  int pos = 0;
  bool genomeLoaded = false;
  while (fgets(line,READ_BUFFER_SIZE,stdin)) {
    #ifdef DEBUG
Robin Erich Muench's avatar
Robin Erich Muench committed
481
    fprintf(stderr,"\nLINE READ:\n%s\n",line);
Robin Erich Muench's avatar
Robin Erich Muench committed
482
483
484
485
486
487
    #endif
    int lLen = strlen(line);
    if (lLen == READ_BUFFER_SIZE-1) {//This line is longer than buffer! 
	fprintf(stderr,"You need a bigger buffer!!!\n%d Is too little\n",lLen);      
    }

Robin Erich Muench's avatar
Robin Erich Muench committed
488
489
490
491
492
493
494
495
496
497
498
499
500
501
    line[--lLen]='\0';
    //Reset map
    for (it = map.begin(); it != map.end(); ++it) {
      for (int j=0; j<nrSamples+1; ++j)
	it->second[j]=0;
    }
    pos = 0;
    
    char *tok = new char[10000];
    const char *l = toksplit(line,'\t',tok,10000);
    #ifdef DEBUG
    fprintf(stderr,"Splitting string\n");
    fprintf(stderr,"Split1: %s\n",tok);
    #endif
Robin Erich Muench's avatar
Robin Erich Muench committed
502
503
    int lP = 0;
    char base;
Robin Erich Muench's avatar
Robin Erich Muench committed
504
    while (*l) {
Robin Erich Muench's avatar
Robin Erich Muench committed
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
      if (pos == 0) {//genome name
	if (name.compare(tok)!=0) {//New genome. Load it!
	  name = tok;
	  genomeLoaded = false;
	}
      } else if (pos==1) {
	//==================================================================
	//====================== IMPORTANT!!! ==============================
	lP = atol(tok)-1;//Because the positions in the pileup are 1 based!!!
	//==================================================================
      } else if (pos==2) {
	base = tok[0];
      } else if ((pos > 3) && (pos % 3 == 1)){//These are the base calls
	#ifdef DEBUG
	fprintf(stderr,"\n====\n%s\n====\n",tok);
	#endif
	int i = 0;
	int len = strlen(tok);
	while (i < len) {
	  switch (tok[i]) {
	  case '^':
	    //Skip next
	    ++i;
	    break;
	  case '+':
	  case '-': 
	    //Skip x number of bases, as given by number after this character
	    num = "";
	  while (isdigit(tok[++i])) num += tok[i];
	  skip = atoi(num.c_str());
	  i += skip-1;
	    break;
	  case '*':
	  case '$':
	  case 'N':
	  case 'n':
	    break;
	  default:
	    ++map[tok[i]][0];
	    ++map[tok[i]][pos/3];
	    break;
	  }
	  ++i;
	}
      }
      ++pos; 
      #ifdef DEBUG
      fprintf(stderr,"Pos now: %d\n",pos);
      #endif
Robin Erich Muench's avatar
Robin Erich Muench committed
554
      l = toksplit(l,'\t',tok,10000); 
Robin Erich Muench's avatar
Robin Erich Muench committed
555
    }
Robin Erich Muench's avatar
Robin Erich Muench committed
556
557

    delete[] tok;
Robin Erich Muench's avatar
Robin Erich Muench committed
558
559
560

    cov = getSum(map,"actgACTG,.");

561
    if (cov < options.min_coverage) {//We don't have enough coverage
Robin Erich Muench's avatar
Robin Erich Muench committed
562
563
      continue;
    }
564
    if (getSum(map,"actgACTG") < options.calling_threshold) {//We don't have enough SNP calls
Robin Erich Muench's avatar
Robin Erich Muench committed
565
566
567
568
      continue;
    }

    bool hasGenes;
Robin Erich Muench's avatar
Robin Erich Muench committed
569
    //Is genome loaded?           
Robin Erich Muench's avatar
Robin Erich Muench committed
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
    if (!genomeLoaded) {
      loadGenome(name,genes,&hasGenes);
      genomeLoaded = true;
    }

    std::string snps = "actg";
    long snpCount = 0;
    std::string s = "";
    std::string indiv = "";
    //Is it on a gene?  
    std::string geneName = "-";
    GeneDef def = geneIntervals(lP);
    Gene g(0,0,"",'*');
    bool isInGene = false;
    if (def.hasGene()) {
      g = def.getGene();
      geneName = g.name;
      isInGene = true;
    }
    std::string oldCodon = "";
    bool write = false;
    for (int i=0; i<snps.length(); ++i) {
592
593
594
595
596
597
598
599
600
601
        
        //Skip same base
        if (snps[i] == base) continue;
        std::ostringstream internal;
        char check[3];
        sprintf(check,"%c%c",snps[i],toupper(snps[i]));
        snpCount = getSum(map,check);//How many times do we see this position as a variant in the entire population?
        bool writeThis = false;
        std::string* sToWrite = NULL;
        
602
        if ((snpCount >= options.calling_threshold) && (snpCount >= cov*options.calling_min_fraction)) { // This is a "common" variant
603
604
605
606
607
            write = true;
            writeThis = true;
            sToWrite = &s;
        } else { //May be a individual variant
            for (int i=1; i<=nrSamples; ++i) {
Robin Erich Muench's avatar
Robin Erich Muench committed
608
                int s = getSum(map,check,i);
609
                if (s >= options.calling_threshold) {//We observe the variant "options.calling_threshold" times in one sample.
610
611
612
613
614
615
616
                    writeThis = true;
    	            sToWrite = &indiv;
    	            break;
                }
            }
        }
      
Robin Erich Muench's avatar
Robin Erich Muench committed
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
      if (writeThis) {
	if ((hasGenes) && (isInGene)) {
	  //TODO: consider multiple genes!

	  //Get codon spanning this base
	  long codonStart;
	  int codonPosition;//i.e., the position of the base within the codon
	  if (g.start < g.end) {//Normal gene
	    codonPosition = (lP - g.start)%3;
	    codonStart = lP - codonPosition;
	    oldCodon = mapGenomes[name]->getSequence(codonStart,codonStart+2);
	  } else {//This gene is "circular"
	    fprintf(stderr,"Will not handle circular genes\n");
	    continue;
	  }
	  std::string newCodon = oldCodon;
	  newCodon[codonPosition] = toupper(snps[i]);
	  //What strand are we on?
	  if (g.strand == '-') {//Reverse complement
	    oldCodon = revComplement(oldCodon);
	    newCodon = revComplement(newCodon);
	  }
	  internal << snpCount << '|' << check[1] << '|';
	  //Check if syonymous
	  if (CodonMap[newCodon] == CodonMap[oldCodon]) {
	    internal << "S";
	  } else {
	    internal << "N";
	  }
	  internal << "[" << oldCodon << "-" << newCodon + "]|" << getCoverageString(map,nrSamples,check);
	  *sToWrite += ',' + internal.str();
Robin Erich Muench's avatar
Robin Erich Muench committed
648
	  } else {//Just get the small string
Robin Erich Muench's avatar
Robin Erich Muench committed
649
	  internal << snpCount << '|' << check[1] << "|.|" << getCoverageString(map,nrSamples,check);
Robin Erich Muench's avatar
Robin Erich Muench committed
650
  	  *sToWrite += ',' + internal.str();
Robin Erich Muench's avatar
Robin Erich Muench committed
651
652
653
	}
      }  
    }
Robin Erich Muench's avatar
Robin Erich Muench committed
654
    //Writing OutputFile
Robin Erich Muench's avatar
Robin Erich Muench committed
655
656
    if (write) {
      s.erase(0,1);//There's an extra comma here
Robin Erich Muench's avatar
Robin Erich Muench committed
657

Robin Erich Muench's avatar
Robin Erich Muench committed
658
659
660
661
662
663
664
    //We now have information about all of the snp's. If multiple, sperate them by ,
    fprintf(stdout,"%s\t%s\t%ld\t%c\t%s\t%s\n",name.c_str(),geneName.c_str(),lP+1,//again, this one gets converted to 1 based representation                                                  
	    base,getCoverageString(map,nrSamples,"actgACTG,.").c_str(),s.c_str());
    } 
    if (indiv.length() != 0) {
      indiv.erase(0,1);
      fprintf(individualFile,"%s\t%s\t%ld\t%c\t%s\t%s\n",name.c_str(),geneName.c_str(),lP+1,//again, this one gets converted to 1 based representation
Robin Erich Muench's avatar
Robin Erich Muench committed
665
	      base,getCoverageString(map,nrSamples,"actgACTG,.").c_str(),indiv.c_str());
Robin Erich Muench's avatar
Robin Erich Muench committed
666
667
668
669
670
671
672
    }
  }

  //Clean map, just so it doesn't leak around
  for (it = map.begin(); it != map.end(); ++it) {
    delete[] it->second;
  }
Robin Erich Muench's avatar
Robin Erich Muench committed
673
  
Robin Erich Muench's avatar
Robin Erich Muench committed
674
675
676
677
678
679
  //Drop all the genomes. We're done.
  for (std::map<std::string,Genome*>::iterator it = mapGenomes.begin(); it!=mapGenomes.end(); ++it) {
    delete it->second;
  }


Robin Erich Muench's avatar
Robin Erich Muench committed
680
681
682
683
684
685
  if (genes != NULL){
    fclose(genes);
  }
  if (individualFile != NULL){
    fclose(individualFile);
  }
Robin Erich Muench's avatar
Robin Erich Muench committed
686
687

  return 0;
Robin Erich Muench's avatar
Robin Erich Muench committed
688

Robin Erich Muench's avatar
Robin Erich Muench committed
689
}