Commit 542b8ac9 authored by Charles Girardot's avatar Charles Girardot

finalized options. Ready for testing

parent a425ddef
......@@ -27,16 +27,12 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import picard.util.MathUtil;
import htsjdk.samtools.fastq.FastqRecord;
/**
......@@ -57,7 +53,8 @@ import htsjdk.samtools.fastq.FastqRecord;
* <li>SAMPLEn : refers to the SAMPLE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>UMIn : refers to the UMI slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>BARCODEn : refers to the sample barcode resolved from the read sequence found in the of the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>READBARn : refers to the read sequence found in the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>READBARn : refers to the read sequence found in the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects ; this is only valida in read name layout
* i.e. read sequence always contain original sequence</li>
* </ul>
*
* Note that a short layout format can also be used like 'B1', 'U2', 'S1' or 'R1' instead of '<BARCODE1>' , '<UMI2>' , '<SAMPLE1>' and <READBAR>; respectively.
......@@ -72,6 +69,7 @@ public class FastqWriterLayout {
private static Logger log = LoggerFactory.getLogger(FastqWriterLayout.class);
public static final String DEFAULT_READNAME_DELIMITOR = ":";
private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE|READBAR)\\d+>?)+$";
private static final String SHORT_LAYOUT_REGEX = "^([BUSR]\\d+)+$";
......@@ -80,10 +78,14 @@ public class FastqWriterLayout {
/**
* char to use to delineate slots in read name ; if needed
*/
protected static String readNameDelimitor = ":";
protected String readNameDelimitor = DEFAULT_READNAME_DELIMITOR;
/**
* Should the quality string be injected into read name together with READBAR and UMI slots ?
*/
protected boolean withQualityInReadName = false;
/**
* Layout for writing the read sequence ; in short format
*/
......@@ -105,7 +107,6 @@ public class FastqWriterLayout {
protected ReadLayoutConsumer readNameConsumer ;
/**
* All the {@link ReadLayout} defined in the demultiplexing ; used to find how to extract needed information
*/
......@@ -116,21 +117,44 @@ public class FastqWriterLayout {
* @param readNameLayout can be null when the read name should be reused unmodified
* @param readLayouts
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts) {
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts, final boolean withQualityInReadName, final String readNameDelimitor) {
this(readSequenceLayout, readNameLayout, readLayouts, withQualityInReadName, readNameDelimitor, false);
}
/**
* @param readSequenceLayout
* @param readNameLayout
* @param readLayouts
* @param withQualityInReadName
* @param readNameDelimitor
* @param convertBarcodeToReadbar if true all BARCODE slots are converted to READBAR in the readNameLayout (BARCODE == READBAR in readSequenceLayout)
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts, final boolean withQualityInReadName, final String readNameDelimitor, final boolean convertBarcodeToReadbar) {
this.readNameLayout = (StringUtils.isBlank(readNameLayout) ? null : convertToShortLayout(readNameLayout));
this.readSequenceLayout = convertToShortLayout(readSequenceLayout);
this.readLayouts = readLayouts;
this.withQualityInReadName = withQualityInReadName;
this.readNameDelimitor = readNameDelimitor;
if(convertBarcodeToReadbar && readNameLayout!=null) {
this.readNameLayout = this.readNameLayout.replaceAll("B", "R");
}
init(); //build all maps for easy lookup
}
/**
* @param readSequenceLayout
* @param readNameLayout can be null when the read name should be reused unmodified
* @param readLayout
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout readLayout) {
this(readSequenceLayout, readNameLayout, new ReadLayout[]{ readLayout });
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout readLayout, final boolean withQualityInReadName, final String readNameDelimitor) {
this(readSequenceLayout, readNameLayout, new ReadLayout[]{ readLayout }, withQualityInReadName, readNameDelimitor, false);
}
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout readLayout, final boolean withQualityInReadName, final String readNameDelimitor, final boolean convertBarcodeToReadbar) {
this(readSequenceLayout, readNameLayout, new ReadLayout[]{ readLayout }, withQualityInReadName, readNameDelimitor, convertBarcodeToReadbar);
}
......@@ -183,17 +207,15 @@ public class FastqWriterLayout {
/**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s)
* @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout} for each BARCODE.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param m a {@link SampleMatch} holding all the barcode matches
* @return
*/
public FastqRecord assembleRecord( FastqRecord[] reads, boolean[] useReadSequenceForBarcodes, SampleMatch m ){
public FastqRecord assembleRecord( FastqRecord[] reads, SampleMatch m ){
FastqRecord rec = sequenceConsumer.assembleNewRead(reads);
String name = rec.getReadName();
if(readNameConsumer != null)
name = readNameConsumer.assembleNewReadName(reads, useReadSequenceForBarcodes, m);
name = readNameConsumer.assembleNewReadName(reads, m);
FastqRecord ass = new FastqRecord(name, rec.getReadString(), rec.getBaseQualityHeader(), rec.getBaseQualityString());
log.debug("Assembled read for output using layout [NameLayout="+this.readNameLayout+" ; SequenceLayout="+this.readSequenceLayout+"] => \n"+ass.toFastQString());
......@@ -203,21 +225,17 @@ public class FastqWriterLayout {
/**
* Convenient wrapper for single end configuration
* @param read the {@link FastqRecord} from the input fastq file
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout} for each BARCODE.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param m a {@link SampleMatch} holdign all the barcode matches
* @param m a {@link SampleMatch} holding all the barcode matches
* @return
*/
public FastqRecord assembleRecord( FastqRecord read, boolean[] useReadSequenceForBarcodes, SampleMatch m ){
return assembleRecord(new FastqRecord[]{read}, useReadSequenceForBarcodes, m);
public FastqRecord assembleRecord( FastqRecord read, SampleMatch m ){
return assembleRecord(new FastqRecord[]{read}, m);
}
/**
* Checks that this layout can be produced from a list of {@link ReadLayout}
* @param readLayouts
* @return
*
*/
protected void init(){
......@@ -237,7 +255,7 @@ public class FastqWriterLayout {
if(!Pattern.matches(SHORT_LAYOUT_REGEX, this.readNameLayout)){
throw new LayoutMalformedException("FASTQ Output Layout for read name does not match expected short format (regex is :"+SHORT_LAYOUT_REGEX+")", this.readNameLayout);
}
readNameConsumer = new ReadLayoutConsumer(this.readNameLayout, this.readLayouts);
readNameConsumer = new ReadLayoutConsumer(this.readNameLayout, this.readLayouts, this.withQualityInReadName, this.readNameDelimitor);
}
}
......@@ -295,6 +313,21 @@ public class FastqWriterLayout {
this.readNameDelimitor = readNameDelimitor;
}
/**
* @return the withQualityInReadName
*/
public boolean isWithQualityInReadName() {
return withQualityInReadName;
}
/**
* @param withQualityInReadName the withQualityInReadName to set
*/
public void setWithQualityInReadName(boolean withQualityInReadName) {
this.withQualityInReadName = withQualityInReadName;
}
/**
* @return the readSequenceLayout in short format
*/
......
......@@ -28,7 +28,7 @@ import java.util.TreeSet;
import org.embl.cg.utilitytools.utils.ExceptionUtil;
import org.embl.cg.utilitytools.utils.StringUtil;
import org.embl.gbcs.je.demultiplexer.Jedemultiplex;
import org.embl.gbcs.je.demultiplexer.DemultiplexCLI;
import org.embl.gbcs.je.jeclipper.Jeclipper;
import org.embl.gbcs.je.jedropseq.Jedropseq;
import org.embl.gbcs.je.jeduplicates.MarkDuplicatesWithMolecularCode;
......@@ -66,7 +66,7 @@ public class Je {
ALLOWED_COMMANDS.add(COMMAND_DUPES);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX_ILLUMINA);
//ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
ALLOWED_COMMANDS.add(COMMAND_DEMULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_RETAG);
......@@ -114,7 +114,7 @@ public class Je {
new Jeclipper().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_DEMULTIPLEX)){
new Jedemultiplex().instanceMainWithExit(argv);
new DemultiplexCLI().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_MULTIPLEX)){
new Jemultiplexer().instanceMainWithExit(argv);
......@@ -147,12 +147,12 @@ public class Je {
protected static String getUsage(){
return "Usage: je <command> [options] \n\n"+
"with command in : \n"
+"\t "+COMMAND_CLIP+" \t\t clips molecular barcodes from fastq sequence and places them in read name headers for further use in 'dupes' module\n"
+"\t "+COMMAND_DEMULTIPLEX+" \t\t demultiplexes fastq file(s), with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_CLIP+" \t\t clips barcodes/UMIs from fastq sequence and places them in read name headers \n"
+"\t "+COMMAND_DEMULTIPLEX+" \t\t demultiplexes fastq file(s) into user-defined output files, with optional handling of molecular barcodes\n"
+"\t "+COMMAND_MULTIPLEX+" \t\t demultiplexes fastq file(s) with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX_ILLUMINA+" \t demultiplexes fastq file(s) using Illumina Index files with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_DUPES+" \t\t removes read duplicates based on molecular barcodes found in read name headers (as produced by clip or plex)\n"
//+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
+"\t "+COMMAND_RETAG+" \t\t extracts barcode and UMI sequence(s) embedded in read names and tag reads with proper BAM tag.\n"
+"\n"
+"Version : "+getVersion()
......
......@@ -23,14 +23,6 @@
*/
package org.embl.gbcs.je;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.FastqQualityFormat;
import htsjdk.samtools.util.QualityEncodingDetector;
import htsjdk.samtools.util.SolexaQualityConverter;
import java.io.File;
import java.util.Arrays;
import org.slf4j.Logger;
......@@ -44,25 +36,9 @@ public class JeTry {
}
public static void main(String[] args) {
String f = "/g/furlong/incoming/2017-06-28-000000000-B954B/000000000-B954B_precap_allpromV2_17s003049-1-1_Ghavi-helm_lane117s003049_1_sequence.txt.gz";
FastqReader reader = new FastqReader(new File(f));
FastqQualityFormat QUALITY_FORMAT = QualityEncodingDetector.detect(100000, reader);
log.info(String.format("Auto-detected quality encoding format as: %s.", QUALITY_FORMAT));
FastqRecord r =reader.iterator().next();
System.out.println(r.toFastQString());
System.out.println(r.getBaseQualityString());
byte [] bites = Arrays.copyOf( r.getBaseQualityString().getBytes() , r.getBaseQualityString().getBytes().length);
System.out.println(Arrays.toString( bites ));
SAMUtils.fastqToPhred(bites);
System.out.println(Arrays.toString( bites ));
bites = Arrays.copyOf( r.getBaseQualityString().getBytes() , r.getBaseQualityString().getBytes().length);
System.out.println(Arrays.toString( bites ));
SolexaQualityConverter.getSingleton().convertSolexa_1_3_QualityCharsToPhredBinary(bites);
System.out.println(Arrays.toString( bites ));
System.out.println(JeUtils.toBytesThenPhred(
"26242516303031"
));
}
......
......@@ -28,8 +28,62 @@ import java.util.TreeSet;
import com.developpez.adiguba.shell.ProcessConsumer;
import htsjdk.samtools.SAMUtils;
public class JeUtils {
/*
* BC : for sample barcode, raw or corrected, with QT to store its quality string
*/
public static final String SAMTAG_BC = "BC";
/*
* QT : Phred quality of the sample-barcode sequence in the BC (or RT) tag
*/
public static final String SAMTAG_QT = "QT";
/*
* RX : Sequence bases of the (possibly corrected) unique molecular identifier
*/
public static final String SAMTAG_RX = "RX";
/*
* QX : Quality score of the unique molecular identifier in the RX tag
*/
public static final String SAMTAG_QX = "QX";
/*
* OX : Original unique molecular barcode bases
*/
public static final String SAMTAG_OX = "OX";
/*
* BZ : Phred quality of the unique molecular barcode bases in the OX tag
*/
public static final String SAMTAG_BZ = "BZ";
/*
* MI : Molecular identifier; a string that uniquely identifies the molecule from which the record was derived
*/
public static final String SAMTAG_MI = "MI";
/**convert a string of quality numbers (each quality has 2 char) to the Phred String
* ie
* @param s
* @return
*/
public static String toBytesThenPhred(String s) {
byte [] arr = new byte [s.length()/2];
int i =0;
for(String t : s.split("(?<=\\G.{2})")) {
arr[i] = Byte.parseByte(t);
i++;
}
return SAMUtils.phredToFastq(arr);
}
/**
* @return the result of executing whoami on the underlying OS
......
......@@ -23,11 +23,8 @@
*/
package org.embl.gbcs.je;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqRecord;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
......@@ -36,6 +33,9 @@ import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqRecord;
public class ReadLayoutConsumer {
private static Logger log = LoggerFactory.getLogger(ReadLayoutConsumer.class);
......@@ -43,6 +43,7 @@ public class ReadLayoutConsumer {
private static final byte BYTECODE_SAMPLE = 0;
private static final byte BYTECODE_BARCODE = 1;
private static final byte BYTECODE_UMI = 2;
private static final byte BYTECODE_READBAR = 3;
ArrayList<Byte> slotCodes = new ArrayList<Byte>();
......@@ -50,19 +51,33 @@ public class ReadLayoutConsumer {
ArrayList<Set<Integer>> layoutIndicesToUseForSlots = new ArrayList<Set<Integer>>();
ReadLayout [] readLayouts;
String outPutLayout;
boolean withQualityInReadName;
String readNameDelimitor = ":";
/**
* @param outPutLayout in short format
* @param readLayouts all ordered layout (order is as the reads are read from files)
*/
public ReadLayoutConsumer(String outPutLayout, ReadLayout [] readLayouts){
this(outPutLayout, readLayouts, false, ":");
}
/**
* @param outPutLayout in short format
* @param readLayouts all ordered layout (order is as the reads are read from files)
*/
public ReadLayoutConsumer(String outPutLayout, ReadLayout [] readLayouts, boolean withQualityInReadName, String readNameDelimitor){
this.outPutLayout = outPutLayout;
this.readLayouts = readLayouts;
Pattern sub = Pattern.compile("([BUS])(\\d+)");
this.withQualityInReadName = withQualityInReadName;
this.readNameDelimitor = readNameDelimitor;
Pattern sub = Pattern.compile("([BUSR])(\\d+)");
Matcher subMatcher = sub.matcher("");
Pattern p = Pattern.compile("([BUS]\\d+)");
Pattern p = Pattern.compile("([BUSR]\\d+)");
Matcher m = p.matcher(outPutLayout);
log.debug("ReadLayoutConsumer created with "+outPutLayout);
while(m.find()){
......@@ -77,6 +92,8 @@ public class ReadLayoutConsumer {
byte bCode ;
if(slotType.equalsIgnoreCase("B"))
bCode = BYTECODE_BARCODE;
else if(slotType.equalsIgnoreCase("R"))
bCode = BYTECODE_READBAR;
else if(slotType.equalsIgnoreCase("U"))
bCode = BYTECODE_UMI;
else
......@@ -98,7 +115,8 @@ public class ReadLayoutConsumer {
boolean canBeUsed = false;
switch (bCode) {
case BYTECODE_BARCODE:
canBeUsed = rl.containsBarcode() && rl.getBarcodeBlockUniqueIds().contains(slotId);
case BYTECODE_READBAR:
canBeUsed = rl.containsBarcode() && rl.getBarcodeBlockUniqueIds().contains(slotId);
break;
case BYTECODE_UMI:
canBeUsed = rl.containsUMI() && rl.getUMIBlockUniqueIds().contains(slotId);
......@@ -134,10 +152,7 @@ public class ReadLayoutConsumer {
* @return
*/
public String assembleNewReadName(FastqRecord [] reads){
boolean[] useReadSequenceForBarcodes = new boolean[reads.length];
Arrays.fill(useReadSequenceForBarcodes, true);
return assembleNewReadName(reads, useReadSequenceForBarcodes, null);
return assembleNewReadName(reads, null);
}
......@@ -147,16 +162,14 @@ public class ReadLayoutConsumer {
*
*
* @param reads the reads in order matching that of the {@link ReadLayout} array used at construction
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout} for each BARCODE.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param m a {@link SampleMatch} holding all the barcode matches
*
* @return
*/
public String assembleNewReadName(FastqRecord [] reads, boolean[] useReadSequenceForBarcodes, SampleMatch m){
public String assembleNewReadName(FastqRecord [] reads, SampleMatch m){
String newname = reads[0].getReadName().split("\\s")[0];
if(newname.endsWith(FastqWriterLayout.readNameDelimitor))
if(newname.endsWith(readNameDelimitor))
newname = newname.substring(0, newname.length()-1);
log.debug("assembling read name with pattern "+this.outPutLayout);
......@@ -168,8 +181,9 @@ public class ReadLayoutConsumer {
* when a slot can be obtained from different reads (e.g. redundant barcode), keep the one with best overall quality
*/
String subseq = null;
byte[] qualB = null;
int bestQual = 0;
if(slotTypeCode == BYTECODE_BARCODE && !useReadSequenceForBarcodes[slotIdx-1] ){
if(slotTypeCode == BYTECODE_BARCODE ){
// we init the subseq with the matched barcode directly
subseq = m.getBarcodeMatches().get(slotIdx).barcode;
}else{
......@@ -181,7 +195,7 @@ public class ReadLayoutConsumer {
String _subseq = null;
String _subqual = null;
switch (slotTypeCode) {
case BYTECODE_BARCODE:
case BYTECODE_READBAR:
_subseq = rl.extractBarcode(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractBarcode(readForLayout.getBaseQualityString(), slotIdx);
break;
......@@ -194,21 +208,38 @@ public class ReadLayoutConsumer {
_subqual = rl.extractSample(readForLayout.getBaseQualityString(), slotIdx);
break;
}
int _qualsum = overallQuality( SAMUtils.fastqToPhred(_subqual) );
byte[] _qualB = SAMUtils.fastqToPhred(_subqual);
int _qualsum = overallQuality( _qualB );
if(subseq == null || _qualsum > bestQual){
subseq = _subseq;
qualB = _qualB;
bestQual = _qualsum;
}
}
}
//concatenate to the growing name
newname += FastqWriterLayout.readNameDelimitor + subseq;
newname += this.readNameDelimitor + subseq;
if(withQualityInReadName)
newname += qualityToNumberString(qualB);
log.debug("header is now : "+newname);
}
return newname;
}
/*
* returns a string made of 2-digits quality scores for injection in the read name
*/
public synchronized static String qualityToNumberString(byte[] qualbytes) {
NumberFormat nf = NumberFormat.getIntegerInstance();
nf.setMinimumIntegerDigits(2);
StringBuffer sb = new StringBuffer(qualbytes.length*2);
for (byte b : qualbytes) {
sb.append(nf.format(b));
}
return sb.toString();
}
/**
* Assemble a FastqRecord using original read name and quality header.
* The read sequence (and associated quality string) are assembled according to the
......@@ -241,6 +272,7 @@ public class ReadLayoutConsumer {
String _subseq, _subqual = null;
switch (slotTypeCode) {
case BYTECODE_BARCODE:
case BYTECODE_READBAR:
_subseq = rl.extractBarcode(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractBarcode(readForLayout.getBaseQualityString(), slotIdx);
break;
......
......@@ -23,17 +23,14 @@
*/
package org.embl.gbcs.je.demultiplexer;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.fastq.FastqWriter;
import htsjdk.samtools.util.FastqQualityFormat;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SolexaQualityConverter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
......@@ -46,8 +43,9 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import org.embl.cg.utilitytools.utils.CollectionUtils;
import org.embl.cg.utilitytools.utils.ExceptionUtil;
import org.embl.gbcs.je.BarcodeMatch;
import org.embl.gbcs.je.FastqWriterLayout;
import org.embl.gbcs.je.JeUtils;
......@@ -58,6 +56,14 @@ import org.embl.gbcs.je.SampleMatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.fastq.FastqWriter;
import htsjdk.samtools.util.FastqQualityFormat;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.SolexaQualityConverter;
public class Demultiplexer {
private static Logger log = LoggerFactory.getLogger(Demultiplexer.class);
......@@ -72,6 +78,14 @@ public class Demultiplexer {
*/
File [] fastqInFiles;
/**
* Je normally infers if input files are compressed or not by checking for a 'gz' extension.
* You can overwrite this behavior but setting this explicitly (useless in context where files are named by hash or UUID)
*/
Boolean compressedInputFastqFiles = null;
/*
* ordered list of the ReadLayout ie matching fastqInFiles order
*/
......@@ -127,7 +141,7 @@ public class Demultiplexer {
/**
* The command line caller
*/
protected Jedemultiplex jedemultiplexer;
protected DemultiplexCLI jedemultiplexer;
/**
......@@ -136,6 +150,8 @@ public class Demultiplexer {
*/
protected HashMap<String, Integer> sampleCountMap = null;
//turned to true if a single sample named Jedemultiplex.UNIQUE_MULTIPLEXED_SAMPLE_NAME is found in the sample2outputFileList map
protected boolean WRITE_ALL_READS_IN_SAME_OUTPUT = false;
/**
*
......@@ -158,7 +174,7 @@ public class Demultiplexer {
* @param createMD5Sum
*/
public Demultiplexer(
Jedemultiplex jedemultiplexer,
DemultiplexCLI jedemultiplexer,
File [] fastqInFiles,
ReadLayout [] readLayouts,
Map<String, List<Set<String>>> sample2BarcodeSets,
......@@ -194,7 +210,12 @@ public class Demultiplexer {
if(sample2outputFileList == null || sample2outputFileList.size() == 0)
throw new Jexception("no sample2outputFileList provided (null or empty)");
if(sample2BarcodeSets.size() != sample2outputFileList.size())
if( sample2outputFileList.size() == 1 && !sample2outputFileList.containsKey(DemultiplexCLI.UNIQUE_MULTIPLEXED_SAMPLE_NAME)){
throw new Jexception("sample2outputFileList must have a unique entry when using sampel name '"+DemultiplexCLI.UNIQUE_MULTIPLEXED_SAMPLE_NAME+"' indicating that all reads must be written in the same output");
}
WRITE_ALL_READS_IN_SAME_OUTPUT = (sample2outputFileList.size() == 1 && sample2outputFileList.containsKey(DemultiplexCLI.UNIQUE_MULTIPLEXED_SAMPLE_NAME) );
if(!WRITE_ALL_READS_IN_SAME_OUTPUT && sample2BarcodeSets.size() != sample2outputFileList.size())
throw new Jexception("The number samples described in sample2BarcodeSets and sample2outputFileList maps is not the same : "+sample2BarcodeSets.size() + "and "+ sample2outputFileList.size()+" (respectively)");
//check all entries in sample2BarcodeSets have the same number of barcode set ; which must match the overall number of barcode slots across the read layouts
......@@ -237,7 +258,19 @@ public class Demultiplexer {
/**
* @return the compressedInputFastqFiles
*/
public Boolean isCompressedInputFastqFiles() {
return compressedInputFastqFiles;
}
/**
* @param compressedInputFastqFiles the compressedInputFastqFiles to set
*/
public void setCompressedInputFastqFiles(Boolean compressedInputFastqFiles) {
this.compressedInputFastqFiles = compressedInputFastqFiles;
}
......@@ -245,13 +278,11 @@ public class Demultiplexer {
* @param max_mismatches one value for each defined BARCODE slot (in the read layouts)
* @param min_mismatch_deltas one value for each defined BARCODE slot (in the read layouts)
* @param min_base_qualities one value for each defined BARCODE slot (in the read layouts)
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout}.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param strict how to handle barcode with redundant slots
* @param asyncWrite whether we should use async FASTQ writers
* @param diagnosticFile if not null Je writes info on sample matching process
*/
public void run(int[] max_mismatches, int[] min_mismatch_deltas, int[] min_base_qualities, boolean[] useReadSequenceForBarcodes, boolean strict, boolean asyncWrite, File diagnosticFile){
public void run(int[] max_mismatches, int[] min_mismatch_deltas, int[] min_base_qualities, boolean strict, boolean asyncWrite, File diagnosticFile){
/*
* Initialize all barcode maps
......@@ -315,7 +346,6 @@ public class Demultiplexer {
Map<String, List<FastqWriter>> sampleFastqWriters = new HashMap<String, List<FastqWriter>>();
for (Entry<String, List<File>> e : sample2outputFileList.entrySet()) {
sampleCountMap.put(e.getKey(), 0);
List<FastqWriter> _writers = new ArrayList<FastqWriter>();
for (File _f : e.getValue()) {
_writers.add(fastqFactory.newWriter(_f, gzipOutput, createMD5Sum));
......@@ -324,6 +354,11 @@ public class Demultiplexer {
}
//init count map
for (String _smpl : sample2BarcodeSets.keySet()) {
sampleCountMap.put(_smpl, 0);
}
/*
* Open writers for unassigned FASTQ reads ; if requested
*/
......@@ -345,7 +380,28 @@ public class Demultiplexer {
List<FastqReader> fastqReaders = new ArrayList<FastqReader>(); //we need to store them to close them at the end
List<Iterator<FastqRecord>> fastqFileIterators = new ArrayList<Iterator<FastqRecord>>();
for (File fqFile : fastqInFiles) {
FastqReader r = new FastqReader(fqFile);
FastqReader r = null;
if(isCompressedInputFastqFiles() == null) {
//we let Je check extensions to decide what reader to init on the files (gz or not)
r = new FastqReader(fqFile);
}
else {
//we follow the settings
InputStream is = null;
try {
if (isCompressedInputFastqFiles() ) {
is = new GZIPInputStream( Files.newInputStream(fqFile.toPath() ) );
}
else {
is = Files.newInputStream( fqFile.toPath() );
}
}catch(IOException ioe) {
throw new Jexception("Failed to open "+fqFile.getAbsolutePath()+" as a "+(isCompressedInputFastqFiles()?"gzipped":"non-compressed") +" file.\n"
+ExceptionUtil.getStackTrace(ioe));
}
r = new FastqReader(fqFile, new BufferedReader(new InputStreamReader( is )));
}
fastqReaders.add(r);
fastqFileIterators.add( r.iterator() );
}
......@@ -422,11 +478,15 @@ public class Demultiplexer {
sampleCountMap.put(assignedSample.getSample(), c);
assigned++;
List<FastqWriter> writers = sampleFastqWriters.get(assignedSample.getSample());
List<FastqWriter> writers = (
WRITE_ALL_READS_IN_SAME_OUTPUT ?
sampleFastqWriters.get(DemultiplexCLI.UNIQUE_MULTIPLEXED_SAMPLE_NAME) :
sampleFastqWriters.get(assignedSample.getSample())
);
for (int i = 0; i < writers.size(); i++) {