Commit a425ddef authored by Charles Girardot's avatar Charles Girardot

commit before refactoring to introduce the READBAR concept

parent 72f71463
......@@ -4,3 +4,5 @@ bin
target/
.DS_Store
._*
embl.properties
test.properties
......@@ -49,11 +49,21 @@ import htsjdk.samtools.fastq.FastqRecord;
* 2. A second one describing how to write the read name (header) e.g. '<BARCODE1><UMI1><UMI2>' to add the barcode and two extracted UMIs
* in the final read name, in addition to the original read name (ie header up to the space). Here each written slot is separated with ':' by default
*
*
* Note that in case of barcode, one might want to write the barcode or the read sequence corresponding to the looked up sample barcode.
*
* Note that a short layout format can also be used like 'B1', 'U2', 'S1'' instead of '<BARCODE1>' , '<UMI2>' and '<SAMPLE1>' ; respectively.
* The possible keys are :<br/>
* <ul>
* <li>SAMPLEn : refers to the SAMPLE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>UMIn : refers to the UMI slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>BARCODEn : refers to the sample barcode resolved from the read sequence found in the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>READBARn : refers to the read sequence found in the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* </ul>
*
* Note that a short layout format can also be used like 'B1', 'U2', 'S1' or 'R1' instead of '<BARCODE1>' , '<UMI2>' , '<SAMPLE1>' and '<READBAR1>' ; respectively.
* For example, 'B1U1U2' is the same as '<BARCODE1><UMI1><UMI2>'.
*
* Technically speaking, the short layout format is the only one used.
* Technically speaking, the short layout format is the only one used.
*
* @author girardot
*
......@@ -62,8 +72,8 @@ public class FastqWriterLayout {
private static Logger log = LoggerFactory.getLogger(FastqWriterLayout.class);
private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE)\\d+>?)+$";
private static final String SHORT_LAYOUT_REGEX = "^([BUS]\\d+)+$";
private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE|READBAR)\\d+>?)+$";
private static final String SHORT_LAYOUT_REGEX = "^([BUSR]\\d+)+$";
......@@ -143,12 +153,32 @@ public class FastqWriterLayout {
shortLayout = shortLayout.replaceAll("ARCODE", "");
shortLayout = shortLayout.replaceAll("MI", "");
shortLayout = shortLayout.replaceAll("AMPLE", "");
shortLayout = shortLayout.replaceAll("EADBAR", "");
}
log.debug("short layout : "+shortLayout);
return shortLayout;
}
/**
 * Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s).
 * The read sequence, quality header and quality string are taken from the record built by the sequence
 * consumer ; the read name is replaced by the one built by the read name consumer when such a consumer is set.
 * This method also uses the read sequence to write BARCODE in the read name.
 *
 * @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
 *
 * @return a new {@link FastqRecord} carrying the assembled sequence and qualities, named with the read name
 * consumer output or, when no read name consumer is set, with the name of the assembled sequence record
 */
public FastqRecord assembleRecord( FastqRecord[] reads ){
    FastqRecord rec = sequenceConsumer.assembleNewRead(reads);

    // default to the name coming with the assembled sequence ; only overridden when a name layout exists
    String name = rec.getReadName();
    if(readNameConsumer != null){
        name = readNameConsumer.assembleNewReadName(reads);
    }

    FastqRecord ass = new FastqRecord(name, rec.getReadString(), rec.getBaseQualityHeader(), rec.getBaseQualityString());

    // this method is called once per read : do not pay for the message (and the FASTQ rendering)
    // unless debug logging is really enabled
    if(log.isDebugEnabled()){
        log.debug("Assembled read for output using layout [NameLayout="+this.readNameLayout+" ; SequenceLayout="+this.readSequenceLayout+"] => \n"+ass.toFastQString());
    }
    return ass;
}
/**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s)
......
......@@ -34,6 +34,7 @@ import org.embl.gbcs.je.jedropseq.Jedropseq;
import org.embl.gbcs.je.jeduplicates.MarkDuplicatesWithMolecularCode;
import org.embl.gbcs.je.jemultiplexer.Jemultiplexer;
import org.embl.gbcs.je.jemultiplexer.JemultiplexerIllumina;
import org.embl.gbcs.je.retag.TagFromReadName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -50,6 +51,7 @@ public class Je {
private static Logger log = LoggerFactory.getLogger(Je.class);
public static final String COMMAND_RETAG = "retag";
public static final String COMMAND_DEMULTIPLEX = "debarcode";
public static final String COMMAND_DROPSEQ = "dropseq";
public static final String COMMAND_CLIP = "clip";
......@@ -64,8 +66,9 @@ public class Je {
ALLOWED_COMMANDS.add(COMMAND_DUPES);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX_ILLUMINA);
ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
//ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
ALLOWED_COMMANDS.add(COMMAND_DEMULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_RETAG);
}
......@@ -125,6 +128,9 @@ public class Je {
else if(option.equalsIgnoreCase(COMMAND_DROPSEQ)){
new Jedropseq().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_RETAG)){
new TagFromReadName().instanceMainWithExit(argv);
}
else{
System.err.println(
"FATAL : We just reached a supposedly unreachable part of the code. Please report this bug to Je developpers indicating the options you used i.e. : \n "+
......@@ -146,7 +152,8 @@ public class Je {
+"\t "+COMMAND_MULTIPLEX+" \t\t demultiplexes fastq file(s) with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX_ILLUMINA+" \t demultiplexes fastq file(s) using Illumina Index files with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_DUPES+" \t\t removes read duplicates based on molecular barcodes found in read name headers (as produced by clip or plex)\n"
+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
//+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
+"\t "+COMMAND_RETAG+" \t\t extracts barcode and UMI sequence(s) embedded in read names and tag reads with proper BAM tag.\n"
+"\n"
+"Version : "+getVersion()
;
......
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.FastqQualityFormat;
import htsjdk.samtools.util.QualityEncodingDetector;
import htsjdk.samtools.util.SolexaQualityConverter;
import java.io.File;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Scratch program used to look at how the quality string of a FASTQ record is converted to
 * numerical values by the htsjdk helpers.
 * It detects the quality encoding of a FASTQ file, prints one record and then prints the quality
 * bytes before and after each of the two conversions tried below.
 */
public class JeTry {

    private static Logger log = LoggerFactory.getLogger(JeTry.class);

    /** FASTQ file inspected when no path is given on the command line. */
    private static final String DEFAULT_FASTQ = "/g/furlong/incoming/2017-06-28-000000000-B954B/000000000-B954B_precap_allpromV2_17s003049-1-1_Ghavi-helm_lane117s003049_1_sequence.txt.gz";

    /** Maximum number of records handed over to the quality encoding detector. */
    private static final int MAX_RECORDS_FOR_DETECTION = 100000;

    public JeTry() {
        // nothing to initialize
    }

    /**
     * @param args optional ; when present, the first value is the path of the FASTQ file to inspect,
     * else the historical hard-coded file is used
     */
    public static void main(String[] args) {
        String f = (args != null && args.length > 0) ? args[0] : DEFAULT_FASTQ;

        FastqReader reader = new FastqReader(new File(f));
        try {
            FastqQualityFormat qualityFormat = QualityEncodingDetector.detect(MAX_RECORDS_FOR_DETECTION, reader);
            log.info(String.format("Auto-detected quality encoding format as: %s.", qualityFormat));

            // NOTE(review): the detector was handed this very reader, so it has presumably consumed
            // records already ; make sure something is left before asking for the next one
            if (!reader.hasNext()) {
                log.error("No record left to display in " + f + " after quality encoding detection");
                return;
            }

            FastqRecord r = reader.iterator().next();
            System.out.println(r.toFastQString());
            System.out.println(r.getBaseQualityString());

            // getBytes() hands back a fresh array each time : no extra defensive copy is needed
            byte[] bites = r.getBaseQualityString().getBytes();
            System.out.println(Arrays.toString( bites ));
            // the array is printed again right after the call, i.e. the conversion is expected to happen in place
            SAMUtils.fastqToPhred(bites);
            System.out.println(Arrays.toString( bites ));

            // start again from the untouched quality characters for the second conversion
            bites = r.getBaseQualityString().getBytes();
            System.out.println(Arrays.toString( bites ));
            SolexaQualityConverter.getSingleton().convertSolexa_1_3_QualityCharsToPhredBinary(bites);
            System.out.println(Arrays.toString( bites ));
        } finally {
            // always release the underlying file handle, including on failure
            reader.close();
        }
    }
}
......@@ -48,7 +48,7 @@ import org.slf4j.LoggerFactory;
* <li> When no length ('x') is specified, all the sequence till the end is considered ; it is only possible to use the 'x'
* shortcut in the last block of a layout</li>
* <li> When a negative value is given in place of length (e.g. '<BLOCKCODEn:-2>'), all but the last x (2 in the
* '<BLOCKCODEn:-2>' example) bases ; a negative length value is only acceptedin the last block of a layout</li>
* '<BLOCKCODEn:-2>' example) bases ; a negative length value is only accepted in the last block of a layout</li>
* </ul>
*
* <br/>
......@@ -152,6 +152,7 @@ public class ReadLayout {
}
/**
* change
* Process the layout and initialize useful variables
*/
public void parseLayout(){
......
......@@ -27,6 +27,7 @@ import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqRecord;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
......@@ -122,6 +123,24 @@ public class ReadLayoutConsumer {
}
/**
 * Build the new read name by appending the slots of the output layout to the original read name,
 * a readNameDelimitor being inserted between each added slot.
 * This variant delegates to the three-argument overload with every flag set to <code>true</code>
 * and no sample match, i.e. the read sequence is always used in BARCODE slots.
 *
 * NOTE(review): the flag array is sized with the number of reads while the overload looks it up
 * by BARCODE slot idx ; confirm both always agree.
 *
 * @param reads the reads in order matching that of the {@link ReadLayout} array used at construction
 *
 * @return the read name produced by the three-argument overload
 */
public String assembleNewReadName(FastqRecord [] reads){
    // one flag per read, all switched on : always write the read sequence
    final boolean[] alwaysUseReadSequence = new boolean[reads.length];
    for (int i = 0; i < alwaysUseReadSequence.length; i++) {
        alwaysUseReadSequence[i] = true;
    }
    return assembleNewReadName(reads, alwaysUseReadSequence, null);
}
/**
* Assemble a read name by concatenating the output layout to the original read name.
* Concatenation is made by inserting a readNameDelimitor between each added slot
......@@ -150,7 +169,7 @@ public class ReadLayoutConsumer {
*/
String subseq = null;
int bestQual = 0;
if(!useReadSequenceForBarcodes[slotIdx-1] && slotTypeCode == BYTECODE_BARCODE){
if(slotTypeCode == BYTECODE_BARCODE && !useReadSequenceForBarcodes[slotIdx-1] ){
// we init the subseq with the matched barcode directly
subseq = m.getBarcodeMatches().get(slotIdx).barcode;
}else{
......@@ -208,7 +227,7 @@ public class ReadLayoutConsumer {
byte slotTypeCode = slotCodes.get(i);
int slotIdx = this.slotIdx.get(i);
log.debug("gettign info for slot code "+slotTypeCode+" with idx "+slotIdx);
log.debug("getting info for slot code "+slotTypeCode+" with idx "+slotIdx);
/*
* when a slot can be obtained from different reads (e.g. redundant barcode), keep the one with best overall quality
*/
......
......@@ -41,12 +41,19 @@ public class SampleMatch {
*/
protected Map<Integer, BarcodeMatch> barcodeMatches;
protected String diagnosticNote = "";
/**
 * Create a match without diagnostic note (the note is the empty string).
 *
 * @param sample the sample
 * @param barcodeMatches the barcode matches keyed by an Integer
 */
public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches){
    this(sample, barcodeMatches, "");
}

/**
 * Create a match together with a diagnostic note.
 *
 * @param sample the sample
 * @param barcodeMatches the barcode matches keyed by an Integer
 * @param diagnosticNote the diagnostic note to store as given
 */
public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches, String diagnosticNote){
    this.diagnosticNote = diagnosticNote;
    this.barcodeMatches = barcodeMatches;
    this.sample = sample;
}
/**
* @return the sample
......@@ -64,6 +71,22 @@ public class SampleMatch {
}
/**
 * Give access to the diagnostic note stored in this match.
 *
 * @return the diagnosticNote
 */
public String getDiagnosticNote() {
    return this.diagnosticNote;
}
/**
 * Replace the diagnostic note stored in this match.
 *
 * @param diagnosticNote the diagnosticNote to set
 */
public void setDiagnosticNote(final String diagnosticNote) {
    this.diagnosticNote = diagnosticNote;
}
/**
* @param sample the sample to set
*/
......
......@@ -36,6 +36,8 @@ import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
......@@ -45,6 +47,7 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import org.embl.cg.utilitytools.utils.CollectionUtils;
import org.embl.gbcs.je.BarcodeMatch;
import org.embl.gbcs.je.FastqWriterLayout;
import org.embl.gbcs.je.JeUtils;
......@@ -244,10 +247,11 @@ public class Demultiplexer {
* @param min_base_qualities one value for each defined BARCODE slot (in the read layouts)
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout}.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param strict how to handle barcode with redundant slots
* @param asyncWrite whether we should use async FASTQ writers
* @param diagnosticFile if not null Je writes info on sample matching process
*/
public void run(int[] max_mismatches, int[] min_mismatch_deltas, int[] min_base_qualities, boolean[] useReadSequenceForBarcodes, boolean asyncWrite, File diagnosticFile){
public void run(int[] max_mismatches, int[] min_mismatch_deltas, int[] min_base_qualities, boolean[] useReadSequenceForBarcodes, boolean strict, boolean asyncWrite, File diagnosticFile){
/*
* Initialize all barcode maps
......@@ -255,17 +259,22 @@ public class Demultiplexer {
// a map to get the list of all possible barcodes for a given slots
Map<Integer, Set<String>> barcodeSetBySlotId = new LinkedHashMap<Integer, Set<String>>();
//list of barcode length ordered by slot id
List<Integer> orderedBarcodeLengths = new ArrayList<Integer>();
for(Entry<String, List<Set<String>>> e : sample2BarcodeSets.entrySet()){
for (int i = 0; i < e.getValue().size(); i++) {
//set of redundant barcodes for this sample and for the BARCODE slot i
Set<String> _bcs = e.getValue().get(i);
int bcIdx = i+1;
if(!barcodeSetBySlotId.containsKey(bcIdx))
if(!barcodeSetBySlotId.containsKey(bcIdx)){
barcodeSetBySlotId.put(bcIdx, new TreeSet<String>());
orderedBarcodeLengths.add(_bcs.iterator().next().length());
}
barcodeSetBySlotId.get(bcIdx).addAll(_bcs);
}
}
// the same map as above but with byte[][] arrays
Map<Integer, byte[][]> barcodeBytesBySlotId = new HashMap<Integer, byte[][]>();
for (Entry<Integer, Set<String>> e : barcodeSetBySlotId.entrySet()) {
......@@ -373,14 +382,14 @@ public class Demultiplexer {
diagnosticFileWriter.print("ReadCount");
diagnosticFileWriter.print("Name");
for (int i = 0; i <= barcodeBlockUniqueIdNumber; i++) {
for (int i = 1; i <= barcodeBlockUniqueIdNumber; i++) {
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_readseq");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_bestbarcode");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_MM_Best");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_MM_Second");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_passes_cutoffs");
}
diagnosticFileWriter.println("\t"+"assigned_sample");
diagnosticFileWriter.println("\t"+"assigned_sample"+"\t"+"notes");
}
......@@ -397,7 +406,11 @@ public class Demultiplexer {
Map<Integer, List<FastqRecord>> barcodeSubsequenceBySlotIdx = FastqWriterLayout.extractBarcodeSlots(reads, readLayouts);
//identify the sample matching these subsequence
SampleMatch assignedSample = assignToSample(barcodeSubsequenceBySlotIdx, barcodeSetBySlotId, barcodeBytesBySlotId, barcodehash2sample, min_base_qualities, max_mismatches, min_mismatch_deltas);
SampleMatch assignedSample = assignToSample(
barcodeSubsequenceBySlotIdx,
barcodeSetBySlotId, orderedBarcodeLengths,
barcodeBytesBySlotId, barcodehash2sample,
min_base_qualities, max_mismatches, min_mismatch_deltas, strict);
writeDiagnostics(reads, assignedSample, diagnosticFileWriter, cnt);
......@@ -469,7 +482,7 @@ public class Demultiplexer {
/**
* @param reads the original reads
* @param assignedSample the sample match report
* @param diagnosticFileWriter the writer or null if no diagnositics has to be written
* @param diagnosticFileWriter the writer or null if no diagnostics has to be written
* @param readCounter the current read iteration (starts at one)
*/
private void writeDiagnostics(FastqRecord[] reads,
......@@ -493,7 +506,8 @@ public class Demultiplexer {
diagnosticFileWriter.print("\t"+bm.mismatchesToSecondBest);
diagnosticFileWriter.print("\t"+ (bm.matched? "yes": "no") );
}
diagnosticFileWriter.println("\t"+(assignedSample.getSample().equals(Demultiplexer.UNASSIGNED) ? "unassigned" : assignedSample.getSample()));
diagnosticFileWriter.print("\t"+(assignedSample.getSample().equals(Demultiplexer.UNASSIGNED) ? "unassigned" : assignedSample.getSample()));
diagnosticFileWriter.println("\t"+assignedSample.getDiagnosticNote());
if(readCounter % 100 == 0)
diagnosticFileWriter.flush();
......@@ -505,28 +519,31 @@ public class Demultiplexer {
/**
* @param barcodeSubsequenceBySlotIdx associates each BARCODE slot (keyed by its ID) with the list of (redundant)
* barcodes sequences for this BARCODE slot. This is a list as a given BARCODE slot can appear more than once across
* the read layouts i.e. in the case of redundant barcode
* @param barcodeSetBySlotIdx set of possible barcodes for a given BARCODE slot id
* @param orderedBarcodeLengths ordered list of barcode lengths (following the concatenation order used for producing the hashcodes)
* @param barcodeBytesBySlotIdx set of possible barcodes for a given BARCODE slot id in byte format
* @param barcodehash2sample every single possible combination of barcode from all slots (one per slot in each combination) hash
* @param min_base_qualities
* @param max_mismatches
* @param min_mismatch_deltas
* @param strict how to handle barcode with redundant sequence slots
* @return a {@link SampleMatch} in which the sample name is set to Demultiplexer.UNASSIGNED if barcode lookup failed
*/
private SampleMatch assignToSample(
Map<Integer, List<FastqRecord>> barcodeSubsequenceBySlotIdx,
Map<Integer, Set<String>> barcodeSetBySlotIdx,
List<Integer> orderedBarcodeLengths,
Map<Integer, byte[][]> barcodeBytesBySlotIdx,
Map<Integer, String> barcodehash2sample,
int [] min_base_qualities,
int [] max_mismatches,
int [] min_mismatch_deltas
int [] min_mismatch_deltas,
boolean strict
) {
/*
......@@ -563,41 +580,187 @@ public class Demultiplexer {
}
//identify the corresponding sample, if any
String concatenated = "";
Map<Integer, BarcodeMatch> barcodeMatches = new HashMap<Integer, BarcodeMatch>();
boolean hasSample = true;
//set of all concatenated codes and the sum of their mismatch
Map<String, Integer> concatenatedCodes = new HashMap<String, Integer>();
concatenatedCodes.put("", 0); //init with empty string
Map<Integer, Map<String, BarcodeMatch>> barcodeMatches = new HashMap<Integer, Map<String, BarcodeMatch>>();
boolean hasNoSample = false;
for(Entry<Integer, List<BarcodeMatch>> e : barcodeMatchBySlotIdx.entrySet()){
/*
* for those slots with more than one BARCODE MATCH, we only need to only consider one of the matched BarcodeMatch
* for those slots with more than one BARCODE MATCH, we need to consider all possible concatenations
*/
int slotIdx = e.getKey();
BarcodeMatch bm = keepOnlyBestBarcodeMatch(e.getValue());
barcodeMatches.put(slotIdx, bm);
concatenated += bm.barcode;
if(!bm.matched)
hasSample = false; //we can t look up a sample
Map<String, BarcodeMatch> allValidAndNotRedundantBarcodeMatches = keepOnlyBestBarcodeMatches(e.getValue()); // keyed by the barcode sequence
if(allValidAndNotRedundantBarcodeMatches.size() == 0){
hasNoSample = true; //we can t look up a sample
break;
}
//remember for later
barcodeMatches.put(slotIdx, allValidAndNotRedundantBarcodeMatches);
//augment the concatenated codes
Map<String, Integer> augmentedCodes = new HashMap<String, Integer>();
for (Entry<String, Integer> _concat : concatenatedCodes.entrySet()) {
for (Entry<String, BarcodeMatch> toAdd : allValidAndNotRedundantBarcodeMatches.entrySet()) {
augmentedCodes.put(
_concat.getKey() + toAdd.getKey() ,
_concat.getValue() + toAdd.getValue().mismatches
);
}
}
concatenatedCodes = augmentedCodes;
}
log.debug(" concatenated barcode sequence for sample hashing "+concatenated);
String sampleName = (hasSample ? barcodehash2sample.get(concatenated.hashCode()) : Demultiplexer.UNASSIGNED);
log.debug(" sample is ===> "+sampleName);
//do we have concatenated string(s) representing whole the bc slots?
String sampleName = "";
//a note to add to the diagnostic
String diagNote = "";
if( false == hasNoSample){
//do these strings resolve to the same sample ?
Map<String, Integer> sampleNames = new HashMap<String, Integer>();
Map<String, String> sampleName2concatenatedCode = new HashMap<String, String>();
for (Entry<String, Integer> code : concatenatedCodes.entrySet()) {
String sname = barcodehash2sample.get(code.getKey().hashCode());
sampleNames.put( sname , code.getValue());
sampleName2concatenatedCode.put(sname, code.getKey());
}
//if there is a unique sample assignment
if(sampleNames.size() == 1){
Entry<String, Integer> en = sampleNames.entrySet().iterator().next();
sampleName = en.getKey();
// pick a unique match per slot
Map<Integer, BarcodeMatch> uniqueBCMatches = new HashMap<Integer, BarcodeMatch>();
for (Entry<Integer, Map<String, BarcodeMatch>> e : barcodeMatches.entrySet()) {
uniqueBCMatches.put(e.getKey(), e.getValue().values().iterator().next());
}
return new SampleMatch(sampleName, uniqueBCMatches);
}
// OR if NOT strict
else if (!strict) {
// for each possible sample, compute the overall sum of mismatches
Integer lowestMM = null;
Map<Integer, Set<String>> mm2samples = new HashMap<Integer, Set<String>>();
for (Entry<String, Integer> e : sampleNames.entrySet()){
String _sample = e.getKey();
int mm = e.getValue();
if(!mm2samples.containsKey(mm)){
mm2samples.put(mm, new TreeSet<String>());
}
mm2samples.get(mm).add(_sample);
if(lowestMM == null || mm< lowestMM) lowestMM = mm;
}
List<Integer> orderedMMs = new ArrayList<Integer>(mm2samples.keySet());
Collections.sort(orderedMMs);
for (Integer _mm : orderedMMs) {
for (String _sampl : mm2samples.get(_mm)) {
diagNote += (diagNote.isEmpty() ? "" : " ; ");
diagNote += _sampl +"("+_mm + " MMs)";
}
}
// is there a better assignment ie a single sample with lowest overall MM number?
if(mm2samples.get(lowestMM).size() == 1){
//we have a better sample
sampleName = mm2samples.get(lowestMM).iterator().next();
//extract the barcodes of the concatenated barcode and identify back the BarcodeMatch
Map<Integer, BarcodeMatch> uniqueBCMatches = new HashMap<Integer, BarcodeMatch>();
int from = 0;
int _slotIdx = 0;
String concatenatedBC = sampleName2concatenatedCode.get(sampleName);
for(int bcLen : orderedBarcodeLengths){
_slotIdx++;
int end = from + bcLen;
String _bc = concatenatedBC.substring(from, end);
BarcodeMatch bcM = barcodeMatches.get(_slotIdx).get(_bc);
from = end;
uniqueBCMatches.put(_slotIdx, bcM);
}
log.debug( " selecting "+sampleName+" as it has the lowest overall MM count : "+lowestMM);
diagNote = "Selected "+sampleName+" due to lowest overall MM from : " + diagNote;
log.debug( " "+diagNote);
return new SampleMatch(sampleName, uniqueBCMatches, diagNote);
}else{
diagNote = "Cannot select from : " + diagNote;
}
}else{
for (Entry<String, Integer> e : sampleNames.entrySet()) {
diagNote += (diagNote.isEmpty() ? "" : " ; ");
diagNote += e.getKey() +"("+e.getValue() + " MMs)";
}
diagNote = "Cannot select from : " + diagNote;
}
log.debug(" barcodes'matches resolve to multiple samples :" + Arrays.toString(sampleNames.keySet().toArray()));
}
return new SampleMatch(sampleName, barcodeMatches);
//build a fake match set for diag file
Map<Integer, BarcodeMatch> uniqueBCMatches = new HashMap<Integer, BarcodeMatch>();
for(Entry<Integer, List<BarcodeMatch>> e : barcodeMatchBySlotIdx.entrySet()){
uniqueBCMatches.put(e.getKey(), buildFakeBarcodeMatchForNoSampleLookupSituation(e.getValue()));
}
if(diagNote.isEmpty())
diagNote = "indicated mismatches : -1 for no match else lowest mismatch";
return new SampleMatch(Demultiplexer.UNASSIGNED, uniqueBCMatches, diagNote);
}
private BarcodeMatch buildFakeBarcodeMatchForNoSampleLookupSituation(
Collection<BarcodeMatch> values) {
if(values.size() == 1)
return values.iterator().next();
BarcodeMatch fake = new BarcodeMatch();
fake.readSequence = "";
fake.barcode = "";
fake.matched = false;
fake.mismatches = -1;
fake.mismatchesToSecondBest = -1;
for (BarcodeMatch bm : values) {