Commit a425ddef authored by Charles Girardot's avatar Charles Girardot
Browse files

commit before refactoring to introduce the READBAR concept

parent 72f71463
...@@ -4,3 +4,5 @@ bin ...@@ -4,3 +4,5 @@ bin
target/ target/
.DS_Store .DS_Store
._* ._*
embl.properties
test.properties
...@@ -49,11 +49,21 @@ import htsjdk.samtools.fastq.FastqRecord; ...@@ -49,11 +49,21 @@ import htsjdk.samtools.fastq.FastqRecord;
* 2. A second one describing how to write the read name (header) e.g. '<BARCODE1><UMI1><UMI2>' to add the barcode and two extracted UMIs * 2. A second one describing how to write the read name (header) e.g. '<BARCODE1><UMI1><UMI2>' to add the barcode and two extracted UMIs
* in the final read name, in addition to the original read name (ie header up to the space). Here each written slot is separated with ':' by default * in the final read name, in addition to the original read name (ie header up to the space). Here each written slot is separated with ':' by default
* *
*
* Note that in case of barcode, one might want to write the barcode or the read sequence corresponding to the looked up sample barcode.
* *
* Note that a short layout format can also be used like 'B1', 'U2', 'S1'' instead of '<BARCODE1>' , '<UMI2>' and '<SAMPLE1>' ; respectively. * The possible keys are :<br/>
* <ul>
* <li>SAMPLEn : refers to the SAMPLE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>UMIn : refers to the UMI slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>BARCODEn : refers to the sample barcode resolved from the read sequence found in the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>READBARn : refers to the read sequence found in the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* </ul>
*
* Note that a short layout format can also be used like 'B1', 'U2', 'S1' or 'R1' instead of '<BARCODE1>', '<UMI2>', '<SAMPLE1>' and '<READBAR1>', respectively.
* For example, 'B1U1U2' is the same as '<BARCODE1><UMI1><UMI2>'. * For example, 'B1U1U2' is the same as '<BARCODE1><UMI1><UMI2>'.
* *
* Technically speaking, the short layout format is the only one used. * Technically speaking, the short layout format is the only one used.
* *
* @author girardot * @author girardot
* *
...@@ -62,8 +72,8 @@ public class FastqWriterLayout { ...@@ -62,8 +72,8 @@ public class FastqWriterLayout {
private static Logger log = LoggerFactory.getLogger(FastqWriterLayout.class); private static Logger log = LoggerFactory.getLogger(FastqWriterLayout.class);
private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE)\\d+>?)+$"; private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE|READBAR)\\d+>?)+$";
private static final String SHORT_LAYOUT_REGEX = "^([BUS]\\d+)+$"; private static final String SHORT_LAYOUT_REGEX = "^([BUSR]\\d+)+$";
...@@ -143,12 +153,32 @@ public class FastqWriterLayout { ...@@ -143,12 +153,32 @@ public class FastqWriterLayout {
shortLayout = shortLayout.replaceAll("ARCODE", ""); shortLayout = shortLayout.replaceAll("ARCODE", "");
shortLayout = shortLayout.replaceAll("MI", ""); shortLayout = shortLayout.replaceAll("MI", "");
shortLayout = shortLayout.replaceAll("AMPLE", ""); shortLayout = shortLayout.replaceAll("AMPLE", "");
shortLayout = shortLayout.replaceAll("EADBAR", "");
} }
log.debug("short layout : "+shortLayout); log.debug("short layout : "+shortLayout);
return shortLayout; return shortLayout;
} }
/**
 * Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s).
 * <p>
 * The sequence and quality parts are produced by the sequence layout consumer. When a read name
 * layout consumer is available, the read name is rebuilt through its single-argument
 * <code>assembleNewReadName(reads)</code> variant; otherwise the name produced by the sequence
 * consumer is kept as is.
 * NOTE(review): no sample match is handed to the read name consumer here, so BARCODE slots are
 * presumably written from the read sequence - confirm against the read name consumer.
 *
 * @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
 *
 * @return a new {@link FastqRecord} made of the assembled read name and the assembled sequence and qualities
 */
public FastqRecord assembleRecord( FastqRecord[] reads ){
    FastqRecord rec = sequenceConsumer.assembleNewRead(reads);

    // default to the name coming with the assembled sequence ; overridden when a name layout exists
    String name = rec.getReadName();
    if(readNameConsumer != null){
        name = readNameConsumer.assembleNewReadName(reads);
    }

    FastqRecord ass = new FastqRecord(name, rec.getReadString(), rec.getBaseQualityHeader(), rec.getBaseQualityString());

    // called for every output read : only build the message (and serialize the record) when debug is on
    if(log.isDebugEnabled()){
        log.debug("Assembled read for output using layout [NameLayout="+this.readNameLayout+" ; SequenceLayout="+this.readSequenceLayout+"] => \n"+ass.toFastQString());
    }
    return ass;
}
/** /**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s) * Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s)
......
...@@ -34,6 +34,7 @@ import org.embl.gbcs.je.jedropseq.Jedropseq; ...@@ -34,6 +34,7 @@ import org.embl.gbcs.je.jedropseq.Jedropseq;
import org.embl.gbcs.je.jeduplicates.MarkDuplicatesWithMolecularCode; import org.embl.gbcs.je.jeduplicates.MarkDuplicatesWithMolecularCode;
import org.embl.gbcs.je.jemultiplexer.Jemultiplexer; import org.embl.gbcs.je.jemultiplexer.Jemultiplexer;
import org.embl.gbcs.je.jemultiplexer.JemultiplexerIllumina; import org.embl.gbcs.je.jemultiplexer.JemultiplexerIllumina;
import org.embl.gbcs.je.retag.TagFromReadName;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
...@@ -50,6 +51,7 @@ public class Je { ...@@ -50,6 +51,7 @@ public class Je {
private static Logger log = LoggerFactory.getLogger(Je.class); private static Logger log = LoggerFactory.getLogger(Je.class);
public static final String COMMAND_RETAG = "retag";
public static final String COMMAND_DEMULTIPLEX = "debarcode"; public static final String COMMAND_DEMULTIPLEX = "debarcode";
public static final String COMMAND_DROPSEQ = "dropseq"; public static final String COMMAND_DROPSEQ = "dropseq";
public static final String COMMAND_CLIP = "clip"; public static final String COMMAND_CLIP = "clip";
...@@ -64,8 +66,9 @@ public class Je { ...@@ -64,8 +66,9 @@ public class Je {
ALLOWED_COMMANDS.add(COMMAND_DUPES); ALLOWED_COMMANDS.add(COMMAND_DUPES);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX); ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX_ILLUMINA); ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX_ILLUMINA);
ALLOWED_COMMANDS.add(COMMAND_DROPSEQ); //ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
ALLOWED_COMMANDS.add(COMMAND_DEMULTIPLEX); ALLOWED_COMMANDS.add(COMMAND_DEMULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_RETAG);
} }
...@@ -125,6 +128,9 @@ public class Je { ...@@ -125,6 +128,9 @@ public class Je {
else if(option.equalsIgnoreCase(COMMAND_DROPSEQ)){ else if(option.equalsIgnoreCase(COMMAND_DROPSEQ)){
new Jedropseq().instanceMainWithExit(argv); new Jedropseq().instanceMainWithExit(argv);
} }
else if(option.equalsIgnoreCase(COMMAND_RETAG)){
new TagFromReadName().instanceMainWithExit(argv);
}
else{ else{
System.err.println( System.err.println(
"FATAL : We just reached a supposedly unreachable part of the code. Please report this bug to Je developpers indicating the options you used i.e. : \n "+ "FATAL : We just reached a supposedly unreachable part of the code. Please report this bug to Je developpers indicating the options you used i.e. : \n "+
...@@ -146,7 +152,8 @@ public class Je { ...@@ -146,7 +152,8 @@ public class Je {
+"\t "+COMMAND_MULTIPLEX+" \t\t demultiplexes fastq file(s) with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n" +"\t "+COMMAND_MULTIPLEX+" \t\t demultiplexes fastq file(s) with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX_ILLUMINA+" \t demultiplexes fastq file(s) using Illumina Index files with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n" +"\t "+COMMAND_MULTIPLEX_ILLUMINA+" \t demultiplexes fastq file(s) using Illumina Index files with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_DUPES+" \t\t removes read duplicates based on molecular barcodes found in read name headers (as produced by clip or plex)\n" +"\t "+COMMAND_DUPES+" \t\t removes read duplicates based on molecular barcodes found in read name headers (as produced by clip or plex)\n"
+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n" //+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
+"\t "+COMMAND_RETAG+" \t\t extracts barcode and UMI sequence(s) embedded in read names and tag reads with proper BAM tag.\n"
+"\n" +"\n"
+"Version : "+getVersion() +"Version : "+getVersion()
; ;
......
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.FastqQualityFormat;
import htsjdk.samtools.util.QualityEncodingDetector;
import htsjdk.samtools.util.SolexaQualityConverter;
import java.io.File;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Scratch program used to look at how FASTQ quality strings are decoded.
 * <p>
 * It auto-detects the quality encoding of a FASTQ file, prints one record and then prints the
 * raw quality bytes before and after two different conversions
 * ({@link SAMUtils#fastqToPhred(byte[])} and the Solexa 1.3 converter).
 */
public class JeTry {

    private static Logger log = LoggerFactory.getLogger(JeTry.class);

    /** FASTQ file read when no path is given on the command line. */
    private static final String DEFAULT_FASTQ = "/g/furlong/incoming/2017-06-28-000000000-B954B/000000000-B954B_precap_allpromV2_17s003049-1-1_Ghavi-helm_lane117s003049_1_sequence.txt.gz";

    public JeTry() {
        // nothing to initialize ; everything happens in main
    }

    /**
     * @param args optional ; when present, the first argument is the path of the FASTQ file to
     *             inspect, otherwise the built-in default path is used
     */
    public static void main(String[] args) {
        String f = (args != null && args.length > 0) ? args[0] : DEFAULT_FASTQ;

        FastqReader reader = new FastqReader(new File(f));
        try {
            // NOTE(review): the detector reads from this same reader, so the record fetched below
            // is presumably not the first record of the file - confirm if that matters
            FastqQualityFormat QUALITY_FORMAT = QualityEncodingDetector.detect(100000, reader);
            log.info(String.format("Auto-detected quality encoding format as: %s.", QUALITY_FORMAT));

            FastqRecord r = reader.iterator().next();
            String qualities = r.getBaseQualityString();
            System.out.println(r.toFastQString());
            System.out.println(qualities);

            // getBytes() hands back a fresh array each time, so each conversion works on its own copy
            byte[] bites = qualities.getBytes();
            System.out.println(Arrays.toString( bites ));
            // the array is printed again right after the call : the conversion is expected to happen in place
            SAMUtils.fastqToPhred(bites);
            System.out.println(Arrays.toString( bites ));

            bites = qualities.getBytes();
            System.out.println(Arrays.toString( bites ));
            SolexaQualityConverter.getSingleton().convertSolexa_1_3_QualityCharsToPhredBinary(bites);
            System.out.println(Arrays.toString( bites ));
        } finally {
            // release the underlying file handle even when detection or reading fails
            reader.close();
        }
    }
}
...@@ -48,7 +48,7 @@ import org.slf4j.LoggerFactory; ...@@ -48,7 +48,7 @@ import org.slf4j.LoggerFactory;
* <li> When no length ('x') is specified, all the sequence till the end is considered ; it only possible to use the 'x' * <li> When no length ('x') is specified, all the sequence till the end is considered ; it only possible to use the 'x'
* shortcut in the last block of a layout</li> * shortcut in the last block of a layout</li>
* <li> When a negative value is given in place of length (e.g. '<BLOCKCODEn:-2>'), all but the last x (2 in the * <li> When a negative value is given in place of length (e.g. '<BLOCKCODEn:-2>'), all but the last x (2 in the
* '<BLOCKCODEn:-2>' example) bases ; a negative length value is only acceptedin the last block of a layout</li> * '<BLOCKCODEn:-2>' example) bases ; a negative length value is only accepted in the last block of a layout</li>
* </ul> * </ul>
* *
* <br/> * <br/>
...@@ -152,6 +152,7 @@ public class ReadLayout { ...@@ -152,6 +152,7 @@ public class ReadLayout {
} }
/** /**
* change
* Process the layout and initialize useful variables * Process the layout and initialize useful variables
*/ */
public void parseLayout(){ public void parseLayout(){
......
...@@ -27,6 +27,7 @@ import htsjdk.samtools.SAMUtils; ...@@ -27,6 +27,7 @@ import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqRecord; import htsjdk.samtools.fastq.FastqRecord;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
...@@ -122,6 +123,24 @@ public class ReadLayoutConsumer { ...@@ -122,6 +123,24 @@ public class ReadLayoutConsumer {
} }
/**
 * Build the output read name by appending the slots of the output layout to the original read name,
 * each added slot being separated by the readNameDelimitor.
 * This convenience variant flags every entry as "use the read sequence" and passes no sample match
 * to the three-argument variant, i.e. BARCODE slots are written with the sequence found in the read.
 *
 * NOTE(review): the flag array holds one entry per read, while the three-argument variant appears to
 * index it by BARCODE slot index (slotIdx-1) - confirm that the number of reads always covers the
 * highest BARCODE slot index.
 *
 * @param reads the reads in order matching that of the {@link ReadLayout} array used at construction
 *
 * @return the read name assembled by the three-argument variant
 */
public String assembleNewReadName(FastqRecord [] reads){
    // one flag per read, all switched on
    final boolean[] alwaysFromReadSequence = new boolean[reads.length];
    Arrays.fill(alwaysFromReadSequence, true);

    // no sample match is available in this variant, hence the null
    return assembleNewReadName(reads, alwaysFromReadSequence, null);
}
/** /**
* Assemble a read name by concatenating the output layout to the original read name. * Assemble a read name by concatenating the output layout to the original read name.
* Concatenation is made by inserting a readNameDelimitor between each added slot * Concatenation is made by inserting a readNameDelimitor between each added slot
...@@ -150,7 +169,7 @@ public class ReadLayoutConsumer { ...@@ -150,7 +169,7 @@ public class ReadLayoutConsumer {
*/ */
String subseq = null; String subseq = null;
int bestQual = 0; int bestQual = 0;
if(!useReadSequenceForBarcodes[slotIdx-1] && slotTypeCode == BYTECODE_BARCODE){ if(slotTypeCode == BYTECODE_BARCODE && !useReadSequenceForBarcodes[slotIdx-1] ){
// we init the subseq with the matched barcode directly // we init the subseq with the matched barcode directly
subseq = m.getBarcodeMatches().get(slotIdx).barcode; subseq = m.getBarcodeMatches().get(slotIdx).barcode;
}else{ }else{
...@@ -208,7 +227,7 @@ public class ReadLayoutConsumer { ...@@ -208,7 +227,7 @@ public class ReadLayoutConsumer {
byte slotTypeCode = slotCodes.get(i); byte slotTypeCode = slotCodes.get(i);
int slotIdx = this.slotIdx.get(i); int slotIdx = this.slotIdx.get(i);
log.debug("gettign info for slot code "+slotTypeCode+" with idx "+slotIdx); log.debug("getting info for slot code "+slotTypeCode+" with idx "+slotIdx);
/* /*
* when a slot can be obtained from different reads (e.g. redundant barcode), keep the one with best overall quality * when a slot can be obtained from different reads (e.g. redundant barcode), keep the one with best overall quality
*/ */
......
...@@ -41,12 +41,19 @@ public class SampleMatch { ...@@ -41,12 +41,19 @@ public class SampleMatch {
*/ */
protected Map<Integer, BarcodeMatch> barcodeMatches; protected Map<Integer, BarcodeMatch> barcodeMatches;
protected String diagnosticNote = "";
public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches){ public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches){
this.sample= sample; this.sample= sample;
this.barcodeMatches = barcodeMatches; this.barcodeMatches = barcodeMatches;
} }
/**
 * Same as the two-argument constructor, with an additional free-text note kept alongside the match.
 *
 * @param sample the sample this match refers to
 * @param barcodeMatches the {@link BarcodeMatch} objects stored in this match, in a map with Integer keys
 * @param diagnosticNote the note to store ; replaces the default empty note
 */
public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches, String diagnosticNote){
    this(sample, barcodeMatches);
    this.diagnosticNote = diagnosticNote;
}
/** /**
* @return the sample * @return the sample
...@@ -64,6 +71,22 @@ public class SampleMatch { ...@@ -64,6 +71,22 @@ public class SampleMatch {
} }
/**
 * Gives access to the note attached to this match.
 *
 * @return the diagnosticNote as currently stored (an empty string unless one was given at
 *         construction or through the setter)
 */
public String getDiagnosticNote() {
    return this.diagnosticNote;
}
/**
 * Replaces the note attached to this match. The value is stored as given, without any check.
 *
 * @param diagnosticNote the diagnosticNote to set
 */
public void setDiagnosticNote(String diagnosticNote) {
this.diagnosticNote = diagnosticNote;
}
/** /**
* @param sample the sample to set * @param sample the sample to set
*/ */
......
...@@ -36,6 +36,8 @@ import java.io.FileNotFoundException; ...@@ -36,6 +36,8 @@ import java.io.FileNotFoundException;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
...@@ -45,6 +47,7 @@ import java.util.Map.Entry; ...@@ -45,6 +47,7 @@ import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import org.embl.cg.utilitytools.utils.CollectionUtils;
import org.embl.gbcs.je.BarcodeMatch; import org.embl.gbcs.je.BarcodeMatch;
import org.embl.gbcs.je.FastqWriterLayout; import org.embl.gbcs.je.FastqWriterLayout;
import org.embl.gbcs.je.JeUtils; import org.embl.gbcs.je.JeUtils;
...@@ -244,10 +247,11 @@ public class Demultiplexer { ...@@ -244,10 +247,11 @@ public class Demultiplexer {
* @param min_base_qualities one value for each defined BARCODE slot (in the read layouts) * @param min_base_qualities one value for each defined BARCODE slot (in the read layouts)
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout}. * @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout}.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written * When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param strict how to handle barcode with redundant slots
* @param asyncWrite whether we should use async FASTQ writers * @param asyncWrite whether we should use async FASTQ writers
* @param diagnosticFile if not null Je writes info on sample matching process * @param diagnosticFile if not null Je writes info on sample matching process
*/ */
public void run(int[] max_mismatches, int[] min_mismatch_deltas, int[] min_base_qualities, boolean[] useReadSequenceForBarcodes, boolean asyncWrite, File diagnosticFile){ public void run(int[] max_mismatches, int[] min_mismatch_deltas, int[] min_base_qualities, boolean[] useReadSequenceForBarcodes, boolean strict, boolean asyncWrite, File diagnosticFile){
/* /*
* Initialize all barcode maps * Initialize all barcode maps
...@@ -255,17 +259,22 @@ public class Demultiplexer { ...@@ -255,17 +259,22 @@ public class Demultiplexer {
// a map to get the list of all possible barcodes for a given slots // a map to get the list of all possible barcodes for a given slots
Map<Integer, Set<String>> barcodeSetBySlotId = new LinkedHashMap<Integer, Set<String>>(); Map<Integer, Set<String>> barcodeSetBySlotId = new LinkedHashMap<Integer, Set<String>>();
//list of barcode length ordered by slot id
List<Integer> orderedBarcodeLengths = new ArrayList<Integer>();
for(Entry<String, List<Set<String>>> e : sample2BarcodeSets.entrySet()){ for(Entry<String, List<Set<String>>> e : sample2BarcodeSets.entrySet()){
for (int i = 0; i < e.getValue().size(); i++) { for (int i = 0; i < e.getValue().size(); i++) {
//set of redundant barcodes for this sample and for the BARCODE slot i //set of redundant barcodes for this sample and for the BARCODE slot i
Set<String> _bcs = e.getValue().get(i); Set<String> _bcs = e.getValue().get(i);
int bcIdx = i+1; int bcIdx = i+1;
if(!barcodeSetBySlotId.containsKey(bcIdx)) if(!barcodeSetBySlotId.containsKey(bcIdx)){
barcodeSetBySlotId.put(bcIdx, new TreeSet<String>()); barcodeSetBySlotId.put(bcIdx, new TreeSet<String>());
orderedBarcodeLengths.add(_bcs.iterator().next().length());
}
barcodeSetBySlotId.get(bcIdx).addAll(_bcs); barcodeSetBySlotId.get(bcIdx).addAll(_bcs);
} }
} }
// the same map as above but with byte[][] arrays // the same map as above but with byte[][] arrays
Map<Integer, byte[][]> barcodeBytesBySlotId = new HashMap<Integer, byte[][]>(); Map<Integer, byte[][]> barcodeBytesBySlotId = new HashMap<Integer, byte[][]>();
for (Entry<Integer, Set<String>> e : barcodeSetBySlotId.entrySet()) { for (Entry<Integer, Set<String>> e : barcodeSetBySlotId.entrySet()) {
...@@ -373,14 +382,14 @@ public class Demultiplexer { ...@@ -373,14 +382,14 @@ public class Demultiplexer {
diagnosticFileWriter.print("ReadCount"); diagnosticFileWriter.print("ReadCount");
diagnosticFileWriter.print("Name"); diagnosticFileWriter.print("Name");
for (int i = 0; i <= barcodeBlockUniqueIdNumber; i++) { for (int i = 1; i <= barcodeBlockUniqueIdNumber; i++) {
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_readseq"); diagnosticFileWriter.print("\t"+"BARCODE"+i+"_readseq");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_bestbarcode"); diagnosticFileWriter.print("\t"+"BARCODE"+i+"_bestbarcode");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_MM_Best"); diagnosticFileWriter.print("\t"+"BARCODE"+i+"_MM_Best");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_MM_Second"); diagnosticFileWriter.print("\t"+"BARCODE"+i+"_MM_Second");
diagnosticFileWriter.print("\t"+"BARCODE"+i+"_passes_cutoffs"); diagnosticFileWriter.print("\t"+"BARCODE"+i+"_passes_cutoffs");
} }
diagnosticFileWriter.println("\t"+"assigned_sample"); diagnosticFileWriter.println("\t"+"assigned_sample"+"\t"+"notes");
} }
...@@ -397,7 +406,11 @@ public class Demultiplexer { ...@@ -397,7 +406,11 @@ public class Demultiplexer {
Map<Integer, List<FastqRecord>> barcodeSubsequenceBySlotIdx = FastqWriterLayout.extractBarcodeSlots(reads, readLayouts); Map<Integer, List<FastqRecord>> barcodeSubsequenceBySlotIdx = FastqWriterLayout.extractBarcodeSlots(reads, readLayouts);
//identify the sample matching these subsequence //identify the sample matching these subsequence
SampleMatch assignedSample = assignToSample(barcodeSubsequenceBySlotIdx, barcodeSetBySlotId, barcodeBytesBySlotId, barcodehash2sample, min_base_qualities, max_mismatches, min_mismatch_deltas); SampleMatch assignedSample = assignToSample(
barcodeSubsequenceBySlotIdx,
barcodeSetBySlotId, orderedBarcodeLengths,
barcodeBytesBySlotId, barcodehash2sample,
min_base_qualities, max_mismatches, min_mismatch_deltas, strict);
writeDiagnostics(reads, assignedSample, diagnosticFileWriter, cnt); writeDiagnostics(reads, assignedSample, diagnosticFileWriter, cnt);
...@@ -469,7 +482,7 @@ public class Demultiplexer { ...@@ -469,7 +482,7 @@ public class Demultiplexer {
/** /**
* @param reads the original reads * @param reads the original reads
* @param assignedSample the sample match report * @param assignedSample the sample match report
* @param diagnosticFileWriter the writer or null if no diagnositics has to be written * @param diagnosticFileWriter the writer or null if no diagnostics has to be written
* @param readCounter the current read iteration (starts at one) * @param readCounter the current read iteration (starts at one)
*/ */
private void writeDiagnostics(FastqRecord[] reads, private void writeDiagnostics(FastqRecord[] reads,
...@@ -493,7 +506,8 @@ public class Demultiplexer { ...@@ -493,7 +506,8 @@ public class Demultiplexer {
diagnosticFileWriter.print("\t"+bm.mismatchesToSecondBest); diagnosticFileWriter.print("\t"+bm.mismatchesToSecondBest);
diagnosticFileWriter.print("\t"+ (bm.matched? "yes": "no") ); diagnosticFileWriter.print("\t"+ (bm.matched? "yes": "no") );
} }
diagnosticFileWriter.println("\t"+(assignedSample.getSample().equals(Demultiplexer.UNASSIGNED) ? "unassigned" : assignedSample.getSample())); diagnosticFileWriter.print("\t"+(assignedSample.getSample().equals(Demultiplexer.UNASSIGNED) ? "unassigned" : assignedSample.getSample()));
diagnosticFileWriter.println("\t"+assignedSample.getDiagnosticNote());
if(readCounter % 100 == 0) if(readCounter % 100 == 0)
diagnosticFileWriter.flush(); diagnosticFileWriter.flush();
...@@ -505,28 +519,31 @@ public class Demultiplexer { ...@@ -505,28 +519,31 @@ public class Demultiplexer {
/** /**
* @param barcodeSubsequenceBySlotIdx associates each BARCODE slot (keyed by its ID) with the list of (redundant) * @param barcodeSubsequenceBySlotIdx associates each BARCODE slot (keyed by its ID) with the list of (redundant)
* barcodes sequences for this BARCODE slot. This is a list as a given BARCODE slot can appear more than once across * barcodes sequences for this BARCODE slot. This is a list as a given BARCODE slot can appear more than once across
* the read layouts i.e. in the case of redundant barcode * the read layouts i.e. in the case of redundant barcode
* @param barcodeSetBySlotIdx set of possible barcodes for a given BARCODE slot id * @param barcodeSetBySlotIdx set of possible barcodes for a given BARCODE slot id
* @param orderedBarcodeLengths list of barcode lengths ordered by slot id (i.e. following the concatenation order used for producing the hashcodes)
* @param barcodeBytesBySlotIdx set of possible barcodes for a given BARCODE slot id in byte format * @param barcodeBytesBySlotIdx set of possible barcodes for a given BARCODE slot id in byte format
* @param barcodehash2sample every single possible combination of barcode from all slots (one per slot in each combination) hash * @param barcodehash2sample every single possible combination of barcode from all slots (one per slot in each combination) hash
* @param min_base_qualities * @param min_base_qualities
* @param max_mismatches * @param max_mismatches
* @param min_mismatch_deltas * @param min_mismatch_deltas
* @param strict how to handle barcode with redundant sequence slots
* @return a {@link SampleMatch} in which the sample name is set to Demultiplexer.UNASSIGNED if barcode lookup failed * @return a {@link SampleMatch} in which the sample name is set to Demultiplexer.UNASSIGNED if barcode lookup failed
*/ */
private SampleMatch assignToSample( private SampleMatch assignToSample(
Map<Integer, List<FastqRecord>> barcodeSubsequenceBySlotIdx, Map<Integer, List<FastqRecord>> barcodeSubsequenceBySlotIdx,
Map<Integer, Set<String>> barcodeSetBySlotIdx, Map<Integer, Set<String>> barcodeSetBySlotIdx,
List<Integer> orderedBarcodeLengths,
Map<Integer, byte[][]> barcodeBytesBySlotIdx, Map<Integer, byte[][]> barcodeBytesBySlotIdx,
Map<Integer, String> barcodehash2sample, Map<Integer, String> barcodehash2sample,
int [] min_base_qualities, int [] min_base_qualities,
int [] max_mismatches, int [] max_mismatches,
int [] min_mismatch_deltas int [] min_mismatch_deltas,
boolean strict
) { ) {