Commit 9373e9bb authored by Charles Girardot's avatar Charles Girardot

pushing new release 2.0.1

parent d46acaec
......@@ -27,10 +27,6 @@
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.launching.macosx.MacOSXType/Java 1.7.0">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="output" path="target/classes"/>
</classpath>
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
org.eclipse.jdt.core.compiler.compliance=1.5
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
org.eclipse.jdt.core.compiler.compliance=1.7
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.5
org.eclipse.jdt.core.compiler.source=1.7
......@@ -5,8 +5,7 @@
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# echo $DIR
# path to jar file to execute, this jar is supposed to be in the same dir as this script
JAR_FILE=$DIR"/je_2.0.RC_bundle.jar"
JAR_FILE=$DIR"/je_2.0.1.RC_bundle.jar"
# set default _JAVA_OPTIONS
_JAVA_OPTIONS=${_JAVA_OPTIONS:-'-Xmx4G -Xms256m'}
......
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>Je</groupId>
<artifactId>Je</artifactId>
<version>2.0.RC</version>
<version>2.0.1.RC</version>
<name>Je</name>
<description>Je provides command line utilities to deal with barcoded FASTQ files with or without Unique Molecular Index (UMI)</description>
......@@ -235,6 +235,7 @@
<artifactId>ut_utils</artifactId>
<version>1.0.1</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
......
......@@ -77,16 +77,33 @@ public class FastqWriterLayout {
/**
* char to use to delineate slots in read name ; if needed
* char to use to delineate slots of the barcode/UMI bloc in the read name ; if needed ; cannot be null
*/
protected String readNameDelimitor = DEFAULT_READNAME_DELIMITOR;
/**
* the delimitor to use to separate the original read name and the extracted barcode/UMI block.
*/
protected String headerBlocksDelimiter = DEFAULT_READNAME_DELIMITOR;
/**
* Should the quality string be injected into read name together with READBAR and UMI slots ?
* Should the quality string be injected as numeric representation into read name together with READBAR and UMI slots ?
* inject the code quality as 2 digits number directly next to the code ie :ACGG12342435:....
*/
protected boolean withQualityInReadName = false;
protected boolean withNumericQualityInReadName = false;
/**
* Should the raw quality string be injected into read name together with READBAR and UMI slots as a separate slot?
* inject the code quality as a separate field directly next to the code ie :ACGG:#$##:....
*/
protected boolean withRawQualityInReadName = false;
/**
* to have separate named blocks like '@ILLUMINA_HEADER BARCODE:BBBB:>A>A UMI:UUUU:#$AB' or '@ILLUMINA_HEADER BARCODE:BBBB12341234 UMI:UUUU34414232'
*/
protected boolean withNamedBlocks;
/**
* Layout for writing the read sequence ; in short format
*/
......@@ -126,8 +143,13 @@ public class FastqWriterLayout {
* @param readSequenceLayout the string representation of the output layout to use for the read sequence e.g. "S1"
* @param readNameLayout the string representation of the output layout to use for the read name e.g. "B1U1S1"
* @param readLayouts the ordered {@link ReadLayout} objects defining how input fastq files are formatted
* @param withQualityInReadName indicates if the Barcode/UMI quality should be injected in the read name together with their sequence
* @param readNameDelimitor the character to use to split up the read name (':' is the default)
* @param withNumericQualityInReadName indicates if the Barcode/UMI quality should be injected as a 2-digit representation in the read name together with their sequence
* inject the code quality as 2 digits number directly next to the code ie "...:ACGG12342435:..."
* @param withRawQualityInReadName indicates if the Barcode/UMI raw quality string should be injected in its own slot in the read name together with their sequence
* inject the code quality as a separate field directly next to the code ie "...:ACGG:#$##:..."
* @param withNamedBlocks true to have separate named blocks like '@ILLUMINA_HEADER BARCODE:BBBB:>A>A UMI:UUUU:#$AB' or '@ILLUMINA_HEADER BARCODE:BBBB12341234 UMI:UUUU34414232'
* @param readNameDelimitor the character to use to split up the code block of the read name (':' is the default)
* @param readHeaderDelimitor the delimitor to use to separate the original read name and the extracted barcode/UMI block.
* @param convertBarcodeToReadbar if true all BARCODE slots are converted to READBAR in the readNameLayout (BARCODE == READBAR in readSequenceLayout)
* @param fastqQualityFormat the {@link FastqQualityFormat} of the input fastq files
*/
......@@ -135,20 +157,31 @@ public class FastqWriterLayout {
final String readSequenceLayout,
final String readNameLayout,
final ReadLayout [] readLayouts,
final boolean withQualityInReadName,
final boolean withNumericQualityInReadName,
final boolean withRawQualityInReadName,
final boolean withNamedBlocks,
final String readNameDelimitor,
final String readHeaderDelimitor,
final boolean convertBarcodeToReadbar,
final FastqQualityFormat fastqQualityFormat) {
this.readNameLayout = (StringUtils.isBlank(readNameLayout) ? null : convertToShortLayout(readNameLayout));
this.readSequenceLayout = convertToShortLayout(readSequenceLayout);
this.readLayouts = readLayouts;
this.withQualityInReadName = withQualityInReadName;
this.withNumericQualityInReadName = withNumericQualityInReadName;
this.withRawQualityInReadName = withRawQualityInReadName;
this.withNamedBlocks = withNamedBlocks;
this.readNameDelimitor = readNameDelimitor;
this.headerBlocksDelimiter = readHeaderDelimitor;
if(convertBarcodeToReadbar && readNameLayout!=null) {
this.readNameLayout = this.readNameLayout.replaceAll("B", "R");
}
this.fastqQualityFormat = fastqQualityFormat;
if(withNumericQualityInReadName && withRawQualityInReadName ) {
throw new RuntimeException("One cannot export both barcode/UMI quality in header in both the raw and the numeric format !");
}
init(); //build all maps for easy lookup
}
......@@ -157,12 +190,22 @@ public class FastqWriterLayout {
* @param readSequenceLayout the string representation of the output layout to use for the read sequence e.g. "S1"
* @param readNameLayout the string representation of the output layout to use for the read name e.g. "B1U1S1"
* @param readLayouts the ordered {@link ReadLayout} objects defining how input fastq files are formatted
* @param withQualityInReadName indicates if the Barcode/UMI quality should be injected in the read name together with their sequence
* @param withNumericQualityInReadName indicates if the Barcode/UMI quality should be injected as a 2-digit representation in the read name together with their sequence
* inject the code quality as 2 digits number directly next to the code ie "...:ACGG12342435:..."
* @param withRawQualityInReadName indicates if the Barcode/UMI raw quality string should be injected in its own slot in the read name together with their sequence
* inject the code quality as a separate field directly next to the code ie "...:ACGG:#$##:..."
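* @param withNamedBlocks true to have separate named blocks like '@ILLUMINA_HEADER BARCODE:BBBB:>A>A UMI:UUUU:#$AB' or '@ILLUMINA_HEADER BARCODE:BBBB12341234 UMI:UUUU34414232'
* @param readNameDelimitor the character to use to split up the read name (':' is the default)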
* @param readHeaderDelimitor the delimitor to use to separate the original read name and the extracted barcode/UMI block.
* @param fastqQualityFormat the {@link FastqQualityFormat} of the input fastq files
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts, final boolean withQualityInReadName, final String readNameDelimitor, final FastqQualityFormat fastqQualityFormat) {
this(readSequenceLayout, readNameLayout, readLayouts, withQualityInReadName, readNameDelimitor, false, fastqQualityFormat);
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts,
final boolean withNumericQualityInReadName,
final boolean withRawQualityInReadName,
final boolean withNamedBlocks,
final String readNameDelimitor,
final String readHeaderDelimitor,
final FastqQualityFormat fastqQualityFormat) {
this(readSequenceLayout, readNameLayout, readLayouts, withNumericQualityInReadName, withRawQualityInReadName, withNamedBlocks, readNameDelimitor, readHeaderDelimitor, false, fastqQualityFormat);
}
......@@ -245,7 +288,11 @@ public class FastqWriterLayout {
if(!Pattern.matches(SHORT_LAYOUT_REGEX, this.readNameLayout)){
throw new LayoutMalformedException("FASTQ Output Layout for read name does not match expected short format (regex is :"+SHORT_LAYOUT_REGEX+")", this.readNameLayout);
}
readNameConsumer = new ReadLayoutConsumer(this.readNameLayout, this.readLayouts, this.withQualityInReadName , this.readNameDelimitor, this.fastqQualityFormat);
readNameConsumer = new ReadLayoutConsumer(
this.readNameLayout, this.readLayouts,
this.withNumericQualityInReadName , this.withRawQualityInReadName, this.withNamedBlocks,
this.readNameDelimitor, this.headerBlocksDelimiter,
this.fastqQualityFormat);
}
}
......@@ -307,17 +354,48 @@ public class FastqWriterLayout {
/**
* @return the withQualityInReadName
*/
public boolean isWithQualityInReadName() {
return withQualityInReadName;
public boolean isWithNumericQualityInReadName() {
return withNumericQualityInReadName;
}
/**
 * Tells whether the raw barcode/UMI quality string is injected into the read name
 * as a separate slot next to the code.
 * @return the withRawQualityInReadName
 */
public boolean isWithRawQualityInReadName() {
return withRawQualityInReadName;
}
/**
 * Tells whether barcode/UMI slots are written as separate named blocks in the read name.
 * @return the withNamedBlocks
 */
public boolean isWithNamedBlocks() {
return withNamedBlocks;
}
/**
 * Sets whether barcode/UMI slots are written as separate named blocks in the read name.
 * NOTE(review): only the field is reassigned here; confirm whether a read name consumer
 * that was already built from this layout picks up the new value.
 * @param withNamedBlocks the withNamedBlocks to set
 */
public void setWithNamedBlocks(boolean withNamedBlocks) {
this.withNamedBlocks = withNamedBlocks;
}
/**
* @param withQualityInReadName the withQualityInReadName to set
* @param withNumericQualityInReadName the withQualityInReadName to set
*/
public void setWithQualityInReadName(boolean withQualityInReadName) {
this.withQualityInReadName = withQualityInReadName;
public void setWithNumericQualityInReadName(boolean withNumericQualityInReadName) {
this.withNumericQualityInReadName = withNumericQualityInReadName;
}
/**
 * Sets whether the raw barcode/UMI quality string is injected into the read name as a separate slot.
 * NOTE(review): the constructor rejects withNumericQualityInReadName and withRawQualityInReadName
 * both being true, but this setter only reassigns the field and does not repeat that check.
 * @param withRawQualityInReadName the withRawQualityInReadName to set
 */
public void setWithRawQualityInReadName(boolean withRawQualityInReadName) {
this.withRawQualityInReadName = withRawQualityInReadName;
}
/**
* @return the readSequenceLayout in short format
*/
......
......@@ -35,6 +35,7 @@ import org.embl.gbcs.je.jeduplicates.MarkDuplicatesWithMolecularCode;
import org.embl.gbcs.je.jemultiplexer.Jemultiplexer;
import org.embl.gbcs.je.jemultiplexer.JemultiplexerIllumina;
import org.embl.gbcs.je.retag.TagFromReadName;
import org.embl.gbcs.je.shift.ShiftReadByStrand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -51,6 +52,7 @@ public class Je {
private static Logger log = LoggerFactory.getLogger(Je.class);
public static final String COMMAND_SHIFTREAD = "shift";
public static final String COMMAND_RETAG = "retag";
public static final String COMMAND_DEMULTIPLEX = "debarcode";
public static final String COMMAND_DROPSEQ = "dropseq";
......@@ -69,12 +71,14 @@ public class Je {
ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
ALLOWED_COMMANDS.add(COMMAND_DEMULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_RETAG);
ALLOWED_COMMANDS.add(COMMAND_SHIFTREAD);
}
protected String command = null;
public static void main(String[] args) {
System.out.println("cwd="+System.getProperty("user.dir"));
//we need at least one option to proceed
if(args.length == 0 ){
System.err.println(getUsage());
......@@ -131,6 +135,9 @@ public class Je {
else if(option.equalsIgnoreCase(COMMAND_RETAG)){
new TagFromReadName().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_SHIFTREAD)){
new ShiftReadByStrand().instanceMainWithExit(argv);
}
else{
System.err.println(
"FATAL : We just reached a supposedly unreachable part of the code. Please report this bug to Je developpers indicating the options you used i.e. : \n "+
......@@ -154,6 +161,7 @@ public class Je {
+"\t "+COMMAND_DUPES+" \t\t removes read duplicates based on molecular barcodes found in read name headers (as produced by clip or plex)\n"
+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
+"\t "+COMMAND_RETAG+" \t\t extracts barcode and UMI sequence(s) embedded in read names and tag reads with proper BAM tag.\n"
+"\t "+COMMAND_SHIFTREAD+" \t\tshifts read mapping starts by a specified strand-specific offset (can be negative).\n"
+"\n"
+"Version : "+getVersion()
;
......
......@@ -151,6 +151,65 @@ public class ReadLayout {
parseLayout();
}
/**
 * Generates the default output layout for the sequence line from this read layout.
 * Blocks are visited in the order in which they appear in this read layout. SAMPLE blocks are
 * always kept; BARCODE and UMI blocks are kept only when {@code clip} is false.
 *
 * @param clip true to leave BARCODE and UMI blocks out of the generated layout
 * @return a short format layout; empty when no block is retained
 * @throws Jexception if the read layout contains a block of unknown type
 */
public String generateDefaultSequenceSlotOutputLayout(boolean clip) {
    // Chained append() keeps the slot letter and its number as text, whatever type the
    // CHAR_FOR_* constants are declared with, and avoids re-allocating a String per block.
    StringBuilder ol = new StringBuilder();
    Matcher layoutMatcher = blockP.matcher(this.layout);
    while (layoutMatcher.find()) {
        String blockType = layoutMatcher.group(1);
        // resolved for every block, including the ones that end up clipped
        int blockNumber = getBlockNumber(blockType, layoutMatcher.group(2));
        if (blockType.equals(BLOCKCODE_BARCODE)) {
            if (!clip) {
                ol.append(CHAR_FOR_BARCODE).append(blockNumber);
            }
        } else if (blockType.equals(BLOCKCODE_UMI)) {
            if (!clip) {
                ol.append(CHAR_FOR_UMI).append(blockNumber);
            }
        } else if (blockType.equals(BLOCKCODE_SAMPLE)) {
            ol.append(CHAR_FOR_SAMPLE).append(blockNumber);
        } else {
            throw new Jexception("Unknown block type in read layout : "+blockType);
        }
    }
    return ol.toString();
}
/**
 * Generates the default output layout for the read header from this read layout.
 * All BARCODE slots (if any) come first, followed by all UMI slots (if any), each group in the
 * order returned by the corresponding getOrdered...BlockUniqueIds() method,
 * i.e. BARCODE1:...:BARCODEn:UMI1:UMI2:...:UMIn
 *
 * @return a short format layout; empty when this read layout has neither BARCODE nor UMI block
 */
public String generateDefaultHeaderSlotOutputLayout() {
    // Chained append() keeps the slot letter and its id as text, whatever type the
    // CHAR_FOR_* constants are declared with, and avoids re-allocating a String per slot.
    StringBuilder ol = new StringBuilder();
    if (hasBarcodeBlock) {
        for (int blockId : this.getOrderedBarcodeBlockUniqueIds()) {
            ol.append(CHAR_FOR_BARCODE).append(blockId);
        }
    }
    if (hasUMIBlock) {
        for (int blockId : this.getOrderedUMIBlockUniqueIds()) {
            ol.append(CHAR_FOR_UMI).append(blockId);
        }
    }
    return ol.toString();
}
/**
* change
* Process the layout and initialize useful variables
......@@ -288,6 +347,21 @@ public class ReadLayout {
}
/**
 * Returns the layout string currently held by this read layout.
 * @return the layout
 */
public String getLayout() {
return layout;
}
/**
 * Replaces the layout string held by this read layout.
 * NOTE(review): only the field is reassigned; parseLayout() is not called from here, so
 * confirm whether state derived from the previous layout needs to be refreshed by the caller.
 * @param layout the layout to set
 */
public void setLayout(String layout) {
this.layout = layout;
}
private int nextStartInLayout(String flatlayout, String blockChar,
Integer len, int from) {
int l = (len!=null && len > 0 ? len : 1);
......
......@@ -53,8 +53,21 @@ public class ReadLayoutConsumer {
ArrayList<Set<Integer>> layoutIndicesToUseForSlots = new ArrayList<Set<Integer>>();
ReadLayout [] readLayouts;
String outPutLayout;
boolean withQualityInReadName;
String readNameDelimitor = ":";
//inject the code quality as 2 digits number directly next to the code ie :ACGG12342435:....
boolean withNumericQualityInReadName;
//inject the code quality as a separate field directly next to the code ie :ACGG:#$##:....
boolean withRawQualityInReadName;
//to have separate named blocks like '@ILLUMINA_HEADER BARCODE:BBBB:>A>A UMI:UUUU:#$AB' or '@ILLUMINA_HEADER BARCODE:BBBB12341234 UMI:UUUU34414232'
boolean withNamedBlocks;
// the delimiter to use to separate the different code/quality fields
String codeBlockDelimiter = ":";
//the delimitor to use to separate the original read name and the extracted barcode/UMI block. Set to NULL to use a space
String headerBlocksDelimiter = ":";
FastqQualityFormat fastqQualityFormat = null;
/**
......@@ -66,7 +79,7 @@ public class ReadLayoutConsumer {
*
*/
public ReadLayoutConsumer(String outPutLayout, ReadLayout [] readLayouts){
this(outPutLayout, readLayouts, false, ":", FastqQualityFormat.Standard);
this(outPutLayout, readLayouts, false, false, false, ":", ":", FastqQualityFormat.Standard);
}
......@@ -75,17 +88,35 @@ public class ReadLayoutConsumer {
*
* @param outPutLayout the string representation of the output layout e.g. "B1U1S1"
* @param readLayouts the ordered {@link ReadLayout} objects defining how input fastq files are formatted
* @param withQualityInReadName indicates if the Barcode/UMI quality should be injected in the read name together with their sequence
* @param withNumericQualityInReadName indicates if the Barcode/UMI quality should be injected as a
* 2-digits representation in the read name together with their sequence i.e "...:ACGG12342435:...."
* mutually exclusive with withRawQualityInReadName
* @param withRawQualityInReadName indicates if the Barcode/UMI quality string should be injected as-is
* in the read name as a separate field i.e. "...:ACGG:#$##:...." ;
* mutually exclusive with withNumericQualityInReadName
* @param withNamedBlocks true to have separate named blocks like '@ILLUMINA_HEADER BARCODE:BBBB:>A>A UMI:UUUU:#$AB' or '@ILLUMINA_HEADER BARCODE:BBBB12341234 UMI:UUUU34414232'
* @param readNameDelimitor the character to use to split up the read name (':' is the default)
* @param readHeaderDelimitor the delimitor to use to separate the original read name and the extracted barcode/UMI block.
* @param fastqQualityFormat the {@link FastqQualityFormat} of the input fastq files
*/
public ReadLayoutConsumer(String outPutLayout, ReadLayout [] readLayouts, boolean withQualityInReadName, String readNameDelimitor, final FastqQualityFormat fastqQualityFormat){
public ReadLayoutConsumer(
String outPutLayout, ReadLayout [] readLayouts,
boolean withNumericQualityInReadName, boolean withRawQualityInReadName, boolean withNamedBlocks,
String readNameDelimitor, String readHeaderDelimitor,
final FastqQualityFormat fastqQualityFormat){
this.outPutLayout = outPutLayout;
this.readLayouts = readLayouts;
this.withQualityInReadName = withQualityInReadName;
this.readNameDelimitor = readNameDelimitor;
this.withNumericQualityInReadName = withNumericQualityInReadName;
this.withRawQualityInReadName = withRawQualityInReadName;
this.withNamedBlocks = withNamedBlocks;
this.codeBlockDelimiter = readNameDelimitor;
this.headerBlocksDelimiter = readHeaderDelimitor;
this.fastqQualityFormat = fastqQualityFormat;
if(withNumericQualityInReadName && withRawQualityInReadName ) {
throw new RuntimeException("One cannot export both barcode/UMI quality in header in both the raw and the numeric format !");
}
Pattern sub = Pattern.compile("([BUSR])(\\d+)");
Matcher subMatcher = sub.matcher("");
......@@ -168,7 +199,7 @@ public class ReadLayoutConsumer {
public String assembleNewReadName(FastqRecord [] reads, SampleMatch m){
String newname = reads[0].getReadName().split("\\s")[0];
if(newname.endsWith(readNameDelimitor))
if(newname.endsWith(codeBlockDelimiter))
newname = newname.substring(0, newname.length()-1);
log.debug("assembling read name with pattern "+this.outPutLayout);
......@@ -180,9 +211,10 @@ public class ReadLayoutConsumer {
* when a slot can be obtained from different reads (e.g. redundant barcode), keep the one with best overall quality
*/
String subseq = null;
String subqual = null;
byte[] qualB = null;
int bestQual = 0;
String _blockName = null;
for(int rlIdx : layoutIndicesToUseForSlots.get(i)){
ReadLayout rl = readLayouts[rlIdx];
......@@ -190,23 +222,28 @@ public class ReadLayoutConsumer {
String _subseq = null;
String _subqual = null;
switch (slotTypeCode) {
case BYTECODE_BARCODE:
// we init the subseq with the matched barcode directly
_subseq = m.getBarcodeMatches().get(slotIdx).barcode;
_subqual = rl.extractBarcode(readForLayout.getBaseQualityString(), slotIdx);
_blockName = "BARCODE";
break;
case BYTECODE_READBAR:
_subseq = rl.extractBarcode(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractBarcode(readForLayout.getBaseQualityString(), slotIdx);
_blockName = "READBAR";
break;
case BYTECODE_UMI:
_subseq = rl.extractUMI(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractUMI(readForLayout.getBaseQualityString(), slotIdx);
_blockName = "UMI";
break;
default:
_subseq = rl.extractSample(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractSample(readForLayout.getBaseQualityString(), slotIdx);
_blockName = "UNKNOWN";
break;
}
byte[] _qualB = _subqual.getBytes();
......@@ -215,13 +252,23 @@ public class ReadLayoutConsumer {
subseq = _subseq;
qualB = _qualB;
bestQual = _qualsum;
subqual = _subqual;
}
}
//concatenate to the growing name
newname += this.readNameDelimitor + subseq;
if(withQualityInReadName) {
// if i == 0 or withNamedBlocks is true, we need to use the header delimiter
//add delimiter
newname += (i == 0 || withNamedBlocks ? this.headerBlocksDelimiter : this.codeBlockDelimiter) ;
//add block name ?
if(withNamedBlocks)
newname += _blockName + codeBlockDelimiter;
//add sequence
newname += subseq;
if(withNumericQualityInReadName) {
newname += qualityToNumberString(qualB, this.fastqQualityFormat);
} else if(withRawQualityInReadName) {
newname += this.codeBlockDelimiter + subqual;
}
log.debug("header is now : "+newname);
}
......
......@@ -74,7 +74,7 @@ public class BarcodeFileGeneralParser {
/**
* regex of the header line
*/
public static String headerLineRegex = "^" + HEADER_SAMPLE + "(\\s"+HEADER_BARCODE+"\\d+)+(\\s"+HEADER_OUT+"\\d+)*$";
public static String headerLineRegex = "^" + HEADER_SAMPLE + "(\\s+"+HEADER_BARCODE+"\\d+)+(\\s"+HEADER_OUT+"\\d+)*$";
/**
* The barcode file to validate
......
......@@ -53,6 +53,7 @@ import org.embl.gbcs.je.JemultiplexerFastqWriterFactory;
import org.embl.gbcs.je.Jexception;
import org.embl.gbcs.je.ReadLayout;
import org.embl.gbcs.je.SampleMatch;
import org.embl.gbcs.je.jedropseq.Jedropseq;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -963,4 +964,6 @@ public class Demultiplexer {
return c;
}
}
......@@ -66,6 +66,7 @@ public class Jeclipper extends CommandLineProgram {
protected static final Boolean DEFAULT_GZIP_OUTPUTS = true;
protected static final Boolean DEFAULT_WRITER_FACTORY_USE_ASYNC_IO = true;
protected static final String DEFAULT_READ_NAME_SEPARATOR_CHAR = ":";
protected static final String DEFAULT_HEADER_SEPARATOR_CHAR = ":";
protected static final boolean DEFAULT_ADD_SEQUENCE_LAYOUT_IN_OUTPUT_FILENAME = false;
......@@ -110,7 +111,7 @@ public class Jeclipper extends CommandLineProgram {
//output layouts
@Option(shortName="OL", optional = false,
printOrder=40,
printOrder=35,
doc="Describes the output file layout(s) using the slots defined in read layouts and ':' to delimitate three parts e.g. 'OL=1:<BARCODE1><UMI1><UMI2>:<SAMPLE1>' : \n" +
"\t"+"1.The number in the first part (i.e. from '1:' above) is the output file index and it must be unique across all 'OL' inputs. "+
"Inferred from order in comamnd line when not given\n"+
......@@ -120,15 +121,44 @@ public class Jeclipper extends CommandLineProgram {
)
public List<String> OUTPUT_LAYOUT;
@Option(shortName="WQ", optional = true,
printOrder=41,
doc="Should quality string also be injected in read names. Only applies to READBAR and UMI described in the read name slot of output layout \n"+
"If turned on, the quality string is translated into 2 digits number and a e.g. UMI will look like\n"+
"\t"+" '...:ATGCAT333423212322:...' instead of '...:ATGCAT:...'\n"+
@Option(shortName="WNQ", optional = true, mutex= {"WITH_RAW_QUALITY"},
printOrder=38,
doc="Set to True to keep Phred sequence qualities in output read names. \n"+
"This option only applies to BARCODE, READBAR and UMI described in the read name slot of output layout. "+
"For BARCODE, the equivalent READBAR quality is used. In case of redundant slots, the best found quality is used.\n"+
"The quality string is translated into 2 digits number representing the quality scores on the Phred scale and a e.g. UMI will look like\n"+
"\t"+" '...:ATGCAT333023212322:...' instead of '...:ATGCAT:...'\n"+
"This option is particularly useful with the retag module that knows how to extract quality numbers into BAM tags."
)
public boolean WITH_QUALITY_IN_READNAME = false;
public boolean WITH_NUMERIC_QUALITY = false;
@Option(shortName="WRQ", optional = true, mutex= {"WITH_NUMERIC_QUALITY"},
printOrder=40,
doc="Set to True to keep raw sequence qualities in output read names. \n"+
"This option only applies to BARCODE, READBAR and UMI described in the read name slot of output layout. "+
"For BARCODE, the equivalent READBAR quality is used. In case of redundant slots, the best found quality is used.\n"+
"The raw quality string of a e.g. UMI is injected directly following the UMI using the READ_NAME_SEPARATOR_CHAR e.g. :\n"+
"\t"+" '...:ATGCAT:>>A>A:...' instead of '...:ATGCAT:...'\n"+
"This option might be difficult to handle when extracting more than one barcode/UMI as the field separator, ':' by default), is also used in quality encoding"
)
public boolean WITH_RAW_QUALITY = false;
// Command line switch : when true, each extracted barcode/UMI is written to the read header as its own named block.
// The help text points to HSEP (not SEP) for the space-separated example : with named blocks, the delimiter
// placed in front of each block is the header delimiter, while SEP sits between a block name and its code.
@Option(shortName="WNB", optional = true,
        printOrder=41,
        doc="Name barcode/umi blocks with type\n"+
                "By default the different extracted UMI and/or barcodes are concatenated as a single block in the read name so the resulting header looks like : \n "+
                "\t @ILLUMINA_HEADER:BBBB:UUUU \n" +
                "\t where ILLUMINA_HEADER is the original read header and BBBB and UUUU are the extracted Barcode and UMI (order follows that of the output read layout)\n" +
                "\t here both the SEP and HSEP are the default i.e. ':'\n"+
                "When WITH_NAMED_BLOCKS=true, the extracted barcode/UMIs are injected as separated named blocks. Here is the same example as above but with HSEP=NONE (to use spaces between the blocks) for clearer display\n"+
                "\t"+" @ILLUMINA_HEADER BARCODE:BBBB UMI:UUUU\n"+
                "Note that if WITH_RAW_QUALITY or WITH_NUMERIC_QUALITY are used, the output will look like :\n " +
                "\t"+" - WITH_NUMERIC_QUALITY => @ILLUMINA_HEADER BARCODE:BBBB12341234 UMI:UUUU34414232 \n"+
                "\t"+" - WITH_RAW_QUALITY (extra field) => @ILLUMINA_HEADER BARCODE:BBBB:>A>A UMI:UUUU:#$AB\n"
        )
public boolean WITH_NAMED_BLOCKS = false;
@Option(shortName="OWID",
optional = true,
......@@ -183,11 +213,18 @@ public class Jeclipper extends CommandLineProgram {
public boolean GZIP_OUTPUTS = DEFAULT_GZIP_OUTPUTS;
@Option(shortName = "SEP", optional = true,
printOrder=170,
printOrder=180,
doc="Separator character used to concatenate barcodes and umis in read header\n"
)
public String READ_NAME_SEPARATOR_CHAR = DEFAULT_READ_NAME_SEPARATOR_CHAR;
@Option(shortName = "HSEP", optional = true,
printOrder=182,
doc="Separator character used to concatenate the read header bloc (from Illumina) and the barcode/umi bloc (extracted from fastq files) in read header. Use HSEP=NULL to use a space. \n"
)
public String HEADER_SEPARATOR_CHAR = DEFAULT_HEADER_SEPARATOR_CHAR;
@Option(shortName="V", optional = true,
printOrder=190,
......@@ -266,6 +303,19 @@ public class Jeclipper extends CommandLineProgram {
}
}
/*
* Check separator
*/
if(this.READ_NAME_SEPARATOR_CHAR == null || this.READ_NAME_SEPARATOR_CHAR.equalsIgnoreCase("NULL") || this.READ_NAME_SEPARATOR_CHAR.equalsIgnoreCase("NONE")) {
log.info("Setting separator for the barcode/UMI block to : space");
this.READ_NAME_SEPARATOR_CHAR = " ";
}
if(this.HEADER_SEPARATOR_CHAR == null || this.HEADER_SEPARATOR_CHAR.equalsIgnoreCase("NULL") || this.HEADER_SEPARATOR_CHAR.equalsIgnoreCase("NONE")) {
log.info("Setting separator between original read name blcok and the barcode/UMI block to : space");
this.HEADER_SEPARATOR_CHAR = " ";
}
/*
* Check quality format
......@@ -347,7 +397,7 @@ public class Jeclipper extends CommandLineProgram {
* here BARCODE (or B) always mean READBAR (or R). We need to convert BARCODE to READBAR to
* make sure the FastqWriterLayout bahaves properly
*/
outLayouts[j] = new FastqWriterLayout(seqLayout, headerLayout, readLayouts, WITH_QUALITY_IN_READNAME, READ_NAME_SEPARATOR_CHAR, true, this.QUALITY_FORMAT);
outLayouts[j] = new FastqWriterLayout(seqLayout, headerLayout, readLayouts, WITH_NUMERIC_QUALITY, WITH_RAW_QUALITY, WITH_NAMED_BLOCKS, READ_NAME_SEPARATOR_CHAR, HEADER_SEPARATOR_CHAR, true, this.QUALITY_FORMAT);
}catch(Exception e){
log.error(ExceptionUtil.getStackTrace(e));
return new String[]{e.getMessage()};
......
......@@ -60,7 +60,7 @@ public class FastqWriterLayoutTest {
};
for (String layout : layouts) {
try {
FastqWriterLayout l = new FastqWriterLayout(layout, null, rls, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
FastqWriterLayout l = new FastqWriterLayout(layout, null, rls, false, false, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
Assert.fail("Should have thrown an Jexception for layout "+layout);
} catch (Jexception e) {
log.debug(e.getMessage());
......@@ -90,7 +90,7 @@ public class FastqWriterLayoutTest {
qualityheader,
bcQual+samplQual);
try {
FastqWriterLayout fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
FastqWriterLayout fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, false, false, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
FastqRecord r = fwl.assembleRecord(ori, null);
Assert.assertEquals(readname+":"+bcSeq, r.getReadName());
......@@ -99,7 +99,7 @@ public class FastqWriterLayoutTest {
Assert.assertEquals(samplQual, r.getBaseQualityString());
seqlayout = "<READBAR1><SAMPLE1>" ; //write the real barcode sequence
fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, false, false, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
r = fwl.assembleRecord(ori, null);
Assert.assertEquals(bcSeq+samplSeq, r.getReadString());
Assert.assertEquals(bcQual+samplQual, r.getBaseQualityString());
......@@ -136,7 +136,7 @@ public class FastqWriterLayoutTest {
qualityheader,
bcQual+samplQual);
try {
FastqWriterLayout fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, true, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
FastqWriterLayout fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, true, false, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR,FastqQualityFormat.Standard);
FastqRecord r = fwl.assembleRecord(ori, null);
Assert.assertEquals(readname+":"+bcSeq+bcQualNum, r.getReadName());
......@@ -145,7 +145,7 @@ public class FastqWriterLayoutTest {
Assert.assertEquals(samplQual, r.getBaseQualityString());
seqlayout = "<READBAR1><SAMPLE1>" ; //write the real barcode sequence
fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, true, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);
fwl = new FastqWriterLayout(seqlayout, headlayout, new ReadLayout[] {new ReadLayout("<BARCODE:6><SAMPLE:x>")}, true, false, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR,FastqQualityFormat.Standard);
r = fwl.assembleRecord(ori, null);
Assert.assertEquals(readname+":"+bcSeq+bcQualNum, r.getReadName());
Assert.assertEquals(bcSeq+samplSeq, r.getReadString());
......@@ -183,8 +183,8 @@ public class FastqWriterLayoutTest {
try {
FastqWriterLayout fwdl = new FastqWriterLayout("<SAMPLE1>", "<READBAR1>", new ReadLayout[]{rl1, rl2}, false, FastqWriterLayout.DEFAULT_READNAME_DELIMITOR, FastqQualityFormat.Standard);