Commit e6dcce0a authored by Charles Girardot's avatar Charles Girardot

2.0 RC version. je markdupes does not deal with UMI from BAM TAG (still

expected in read names)
parent 542b8ac9
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>Je</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding//src/test/resources=UTF-8
encoding/<project>=UTF-8
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
org.eclipse.jdt.core.compiler.compliance=1.5
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.5
eclipse.preferences.version=1
org.eclipse.jdt.ui.javadoc=false
org.eclipse.jdt.ui.text.custom_code_templates=<?xml version\="1.0" encoding\="UTF-8" standalone\="no"?><templates><template autoinsert\="true" context\="gettercomment_context" deleted\="false" description\="Comment for getter method" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.gettercomment" name\="gettercomment">/**\n * @return the ${bare_field_name}\n */</template><template autoinsert\="true" context\="settercomment_context" deleted\="false" description\="Comment for setter method" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.settercomment" name\="settercomment">/**\n * @param ${param} the ${bare_field_name} to set\n */</template><template autoinsert\="true" context\="constructorcomment_context" deleted\="false" description\="Comment for created constructors" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.constructorcomment" name\="constructorcomment">/**\n * ${tags}\n */</template><template autoinsert\="true" context\="filecomment_context" deleted\="false" description\="Comment for created Java files" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.filecomment" name\="filecomment">/**\n * \n */</template><template autoinsert\="true" context\="typecomment_context" deleted\="false" description\="Comment for created types" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.typecomment" name\="typecomment">/**\n * @author ${user}\n *\n * ${tags}\n */</template><template autoinsert\="true" context\="fieldcomment_context" deleted\="false" description\="Comment for fields" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.fieldcomment" name\="fieldcomment">/**\n * \n */</template><template autoinsert\="true" context\="methodcomment_context" deleted\="false" description\="Comment for non-overriding methods" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.methodcomment" name\="methodcomment">/**\n * ${tags}\n */</template><template autoinsert\="true" context\="overridecomment_context" 
deleted\="false" description\="Comment for overriding methods" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.overridecomment" name\="overridecomment">/* (non-Javadoc)\n * ${see_to_overridden}\n */</template><template autoinsert\="true" context\="delegatecomment_context" deleted\="false" description\="Comment for delegate methods" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.delegatecomment" name\="delegatecomment">/**\n * ${tags}\n * ${see_to_target}\n */</template><template autoinsert\="false" context\="newtype_context" deleted\="false" description\="Newly created files" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.newtype" name\="newtype">/*\n * The MIT License\n *\n * Copyright (c) 2009 The Broad Institute\n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the "Software"), to deal\n * in the Software without restriction, including without limitation the rights\n * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n * copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions\:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n * THE SOFTWARE.\n */\n${filecomment}\n${package_declaration}\n\n${typecomment}\n${type_declaration}</template><template autoinsert\="true" context\="classbody_context" deleted\="false" description\="Code in new class type bodies" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.classbody" name\="classbody">\n</template><template autoinsert\="true" context\="interfacebody_context" deleted\="false" description\="Code in new interface type bodies" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.interfacebody" name\="interfacebody">\n</template><template autoinsert\="true" context\="enumbody_context" deleted\="false" description\="Code in new enum type bodies" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.enumbody" name\="enumbody">\n</template><template autoinsert\="true" context\="annotationbody_context" deleted\="false" description\="Code in new annotation type bodies" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.annotationbody" name\="annotationbody">\n</template><template autoinsert\="true" context\="catchblock_context" deleted\="false" description\="Code in new catch blocks" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.catchblock" name\="catchblock">// ${todo} Auto-generated catch block\n${exception_var}.printStackTrace();</template><template autoinsert\="true" context\="methodbody_context" deleted\="false" description\="Code in created method stubs" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.methodbody" name\="methodbody">// ${todo} Auto-generated method stub\n${body_statement}</template><template autoinsert\="true" context\="constructorbody_context" deleted\="false" description\="Code in created constructor stubs" enabled\="true" 
id\="org.eclipse.jdt.ui.text.codetemplates.constructorbody" name\="constructorbody">${body_statement}\n// ${todo} Auto-generated constructor stub</template><template autoinsert\="true" context\="getterbody_context" deleted\="false" description\="Code in created getters" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.getterbody" name\="getterbody">return ${field};</template><template autoinsert\="true" context\="setterbody_context" deleted\="false" description\="Code in created setters" enabled\="true" id\="org.eclipse.jdt.ui.text.codetemplates.setterbody" name\="setterbody">${field} \= ${param};</template></templates>
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
1. Create a dir with :
* the je wrapper
* the log4j.xml
* the Je bundle jar
2. Make sure the je wrapper calls the correct jar
3. tar czf <tarname>.tar.gz <dirname>/*
e.g.
`tar czf je_2.0.RC.tar.gz je_2.0.RC/*`
\ No newline at end of file
#!/bin/sh
# Wrapper around je_1.1_bundle.jar
# Wrapper around je_*_bundle.jar
# where are we stored ?
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# echo $DIR
# path to jar file to execute, this jar is supposed to be in the same dir as this script
JAR_FILE=$DIR"/je_1.1_bundle.jar"
JAR_FILE=$DIR"/je_2.0.RC_bundle.jar"
# set default _JAVA_OPTIONS
_JAVA_OPTIONS=${_JAVA_OPTIONS:-'-Xmx4G -Xms256m'}
......
#!/bin/sh
# Wrapper around je_*_bundle.jar: locates the directory holding this script and
# runs the bundle jar stored next to it, forwarding all command line arguments.

# where are we stored ?
# "$0" is used instead of "${BASH_SOURCE[0]}": the latter is a bash-only array
# expansion and fails ("Bad substitution") when /bin/sh is a strict POSIX shell
# such as dash.
DIR="$( cd "$( dirname "$0" )" && pwd )"
# echo $DIR

# path to jar file to execute, this jar is supposed to be in the same dir as this script
JAR_FILE="$DIR/je_2.0.RC_bundle.jar"

# set default _JAVA_OPTIONS (only when the caller did not provide any)
_JAVA_OPTIONS=${_JAVA_OPTIONS:-'-Xmx4G -Xms256m'}
export _JAVA_OPTIONS

# uncomment to change logging level using your own log4j.xml found in $DIR file
# OPTS="-Dlog4j.configuration=file:$DIR/log4j.xml"

# $OPTS is deliberately left unquoted: it expands to nothing when unset and may
# carry several space-separated JVM options. The jar path is quoted so that an
# installation directory containing spaces still works.
java $OPTS -jar "$JAR_FILE" "$@"
exit $?
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{dd-MM-yy HH:mm:ss} %-5p [%t] %c{1}.%M(%L) | %m%n" />
</layout>
</appender>
<root>
<level value="INFO" />
<appender-ref ref="CONSOLE" />
</root>
</log4j:configuration>
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>Je</groupId>
<artifactId>Je</artifactId>
<version>2.0.beta</version>
<version>2.0.RC</version>
<name>Je</name>
<description>Je provides command line utilities to deal with barcoded FASTQ files with or without Unique Molecular Index (UMI)</description>
......
......@@ -34,6 +34,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.FastqQualityFormat;
/**
*
......@@ -112,24 +113,32 @@ public class FastqWriterLayout {
*/
protected ReadLayout [] readLayouts;
/**
* @param readSequenceLayout
* @param readNameLayout can be null when the read name should be reused unmodified
* @param readLayouts
* the {@link FastqQualityFormat} of the input fastq files
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts, final boolean withQualityInReadName, final String readNameDelimitor) {
this(readSequenceLayout, readNameLayout, readLayouts, withQualityInReadName, readNameDelimitor, false);
}
protected FastqQualityFormat fastqQualityFormat = null;
/**
* @param readSequenceLayout
* @param readNameLayout
* @param readLayouts
* @param withQualityInReadName
* @param readNameDelimitor
* @param readSequenceLayout the string representation of the output layout to use for the read sequence e.g. "S1"
* @param readNameLayout the string representation of the output layout to use for the read name e.g. "B1U1S1"
* @param readLayouts the ordered {@link ReadLayout} objects defining how input fastq files are formatted
* @param withQualityInReadName indicates if the Barcode/UMI quality should be injected in the read name together with their sequence
* @param readNameDelimitor the character to use to split up the read name (':' is the default)
* @param convertBarcodeToReadbar if true all BARCODE slots are converted to READBAR in the readNameLayout (BARCODE == READBAR in readSequenceLayout)
* @param fastqQualityFormat the {@link FastqQualityFormat} of the input fastq files
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts, final boolean withQualityInReadName, final String readNameDelimitor, final boolean convertBarcodeToReadbar) {
public FastqWriterLayout(
final String readSequenceLayout,
final String readNameLayout,
final ReadLayout [] readLayouts,
final boolean withQualityInReadName,
final String readNameDelimitor,
final boolean convertBarcodeToReadbar,
final FastqQualityFormat fastqQualityFormat) {
this.readNameLayout = (StringUtils.isBlank(readNameLayout) ? null : convertToShortLayout(readNameLayout));
this.readSequenceLayout = convertToShortLayout(readSequenceLayout);
......@@ -139,24 +148,23 @@ public class FastqWriterLayout {
if(convertBarcodeToReadbar && readNameLayout!=null) {
this.readNameLayout = this.readNameLayout.replaceAll("B", "R");
}
this.fastqQualityFormat = fastqQualityFormat;
init(); //build all maps for easy lookup
}
/**
* @param readSequenceLayout
* @param readNameLayout can be null when the read name should be reused unmodified
* @param readLayout
* @param readSequenceLayout the string representation of the output layout to use for the read sequence e.g. "S1"
* @param readNameLayout the string representation of the output layout to use for the read name e.g. "B1U1S1"
* @param readLayouts the ordered {@link ReadLayout} objects defining how input fastq files are formatted
* @param withQualityInReadName indicates if the Barcode/UMI quality should be injected in the read name together with their sequence
* @param readNameDelimitor the character to use to split up the read name (':' is the default)
* @param fastqQualityFormat the {@link FastqQualityFormat} of the input fastq files
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout readLayout, final boolean withQualityInReadName, final String readNameDelimitor) {
this(readSequenceLayout, readNameLayout, new ReadLayout[]{ readLayout }, withQualityInReadName, readNameDelimitor, false);
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts, final boolean withQualityInReadName, final String readNameDelimitor, final FastqQualityFormat fastqQualityFormat) {
this(readSequenceLayout, readNameLayout, readLayouts, withQualityInReadName, readNameDelimitor, false, fastqQualityFormat);
}
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout readLayout, final boolean withQualityInReadName, final String readNameDelimitor, final boolean convertBarcodeToReadbar) {
this(readSequenceLayout, readNameLayout, new ReadLayout[]{ readLayout }, withQualityInReadName, readNameDelimitor, convertBarcodeToReadbar);
}
/**
* @param layout
......@@ -186,36 +194,18 @@ public class FastqWriterLayout {
/**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s).
* This method also use the read sequence to write BARCODE in read name
* @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s) ; this method should be used when a barcode has been matched
*
* @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
* @param sampleMatch a {@link SampleMatch} holding all the barcode matches
* @return
*/
public FastqRecord assembleRecord( FastqRecord[] reads ){
FastqRecord rec = sequenceConsumer.assembleNewRead(reads);
String name = rec.getReadName();
if(readNameConsumer != null)
name = readNameConsumer.assembleNewReadName(reads);
FastqRecord ass = new FastqRecord(name, rec.getReadString(), rec.getBaseQualityHeader(), rec.getBaseQualityString());
log.debug("Assembled read for output using layout [NameLayout="+this.readNameLayout+" ; SequenceLayout="+this.readSequenceLayout+"] => \n"+ass.toFastQString());
return ass;
}
/**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s)
* @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
* @param m a {@link SampleMatch} holding all the barcode matches
* @return
*/
public FastqRecord assembleRecord( FastqRecord[] reads, SampleMatch m ){
public FastqRecord assembleRecord( FastqRecord[] reads, SampleMatch sampleMatch ){
FastqRecord rec = sequenceConsumer.assembleNewRead(reads);
String name = rec.getReadName();
if(readNameConsumer != null)
name = readNameConsumer.assembleNewReadName(reads, m);
name = readNameConsumer.assembleNewReadName(reads, sampleMatch);
FastqRecord ass = new FastqRecord(name, rec.getReadString(), rec.getBaseQualityHeader(), rec.getBaseQualityString());
log.debug("Assembled read for output using layout [NameLayout="+this.readNameLayout+" ; SequenceLayout="+this.readSequenceLayout+"] => \n"+ass.toFastQString());
......@@ -224,7 +214,7 @@ public class FastqWriterLayout {
/**
* Convenient wrapper for single end configuration
* @param read the {@link FastqRecord} from the input fastq file
* @param read the {@link FastqRecord} from the input fastq file
* @param m a {@link SampleMatch} holding all the barcode matches
* @return
*/
......@@ -255,7 +245,7 @@ public class FastqWriterLayout {
if(!Pattern.matches(SHORT_LAYOUT_REGEX, this.readNameLayout)){
throw new LayoutMalformedException("FASTQ Output Layout for read name does not match expected short format (regex is :"+SHORT_LAYOUT_REGEX+")", this.readNameLayout);
}
readNameConsumer = new ReadLayoutConsumer(this.readNameLayout, this.readLayouts, this.withQualityInReadName, this.readNameDelimitor);
readNameConsumer = new ReadLayoutConsumer(this.readNameLayout, this.readLayouts, this.withQualityInReadName , this.readNameDelimitor, this.fastqQualityFormat);
}
}
......
......@@ -28,6 +28,8 @@ import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import htsjdk.samtools.SAMUtils;
public class JeTry {
private static Logger log = LoggerFactory.getLogger(JeTry.class);
......@@ -36,9 +38,7 @@ public class JeTry {
}
public static void main(String[] args) {
System.out.println(JeUtils.toBytesThenPhred(
"26242516303031"
));
}
......
......@@ -26,12 +26,23 @@ package org.embl.gbcs.je;
import java.util.Set;
import java.util.TreeSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.developpez.adiguba.shell.ProcessConsumer;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.util.FastqQualityFormat;
import htsjdk.samtools.util.QualityEncodingDetector;
import htsjdk.samtools.util.SolexaQualityConverter;
public class JeUtils {
private static Logger log = LoggerFactory.getLogger(JeUtils.class);
/*
* BC : for sample barcode, raw or corrected, with QT to store its quality string
*/
......@@ -69,7 +80,7 @@ public class JeUtils {
public static final String SAMTAG_MI = "MI";
/**convert a string of quality numbers (each quality has 2 char) to the Phred String
/**convert a string of quality numbers in Phred Scale (each quality has 2 char) to the Standard Phred + 33 encoding
* ie
* @param s
* @return
......@@ -85,6 +96,54 @@ public class JeUtils {
}
/**
 * Converts, in place, the fastq quality characters held in <code>quals</code> into numeric
 * phred-scaled scores, using the conversion that matches the given quality encoding.
 *
 * @param quals the quality bytes to convert ; the content of this array is overwritten
 * @param version the {@link FastqQualityFormat} in which <code>quals</code> is currently encoded
 */
public static void convertQualityToPhred(byte[] quals, final FastqQualityFormat version) {
    switch (version) {
        case Illumina:
            // pipeline 1.3+ style encoding
            SolexaQualityConverter.getSingleton().convertSolexa_1_3_QualityCharsToPhredBinary(quals);
            return;
        case Solexa:
            // pre-1.3 solexa style encoding
            SolexaQualityConverter.getSingleton().convertSolexaQualityCharsToPhredBinary(quals);
            return;
        case Standard:
            SAMUtils.fastqToPhred(quals);
            return;
        default:
            // any other value : the array is left untouched
            return;
    }
}
/**
 * Looks at fastq input(s) and attempts to determine the proper quality format.
 *
 * Closes the reader(s) by side effect ; the readers are closed even when reading
 * records from them fails.
 *
 * @param readers readers on the input fastq files
 * @param expectedQuality If provided, will be used for sanity checking. If left null, autodetection will occur
 * @return the {@link FastqQualityFormat} reported by the {@link QualityEncodingDetector}
 */
public static FastqQualityFormat determineQualityFormat(final FastqReader [] readers, final FastqQualityFormat expectedQuality) {
    final QualityEncodingDetector detector = new QualityEncodingDetector();

    try {
        // feed the detector with records from all fastq readers
        detector.add(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, readers);
    } finally {
        // close all readers, whether or not the detector could consume them
        for (final FastqReader reader : readers) {
            reader.close();
        }
    }

    final FastqQualityFormat qualityFormat = detector.generateBestGuess(QualityEncodingDetector.FileContext.FASTQ, expectedQuality);

    // in case there is no expected quality and different options were possible, warn user
    if (detector.isDeterminationAmbiguous()) {
        log.warn("Making ambiguous determination about fastq's quality encoding; more than one format possible based on observed qualities.");
    }
    log.info(String.format("Auto-detected quality format as: %s.", qualityFormat));

    return qualityFormat;
}
/**
* @return the result of executing whoami on the underlying OS
*/
......
......@@ -30,11 +30,13 @@ import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.embl.gbcs.je.demultiplexer.Demultiplexer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.FastqQualityFormat;
public class ReadLayoutConsumer {
private static Logger log = LoggerFactory.getLogger(ReadLayoutConsumer.class);
......@@ -53,26 +55,36 @@ public class ReadLayoutConsumer {
String outPutLayout;
boolean withQualityInReadName;
String readNameDelimitor = ":";
FastqQualityFormat fastqQualityFormat = null;
/**
* @param outPutLayout in short format
* @param readLayouts all ordered layout (order is as the reads are read from files)
* Creates a simple ReadLayoutConsumer with default read name delimitor (':') and standard
* fastq quality format {@link FastqQualityFormat#Standard}.
*
* @param outPutLayout the string representation of the output layout e.g. "B1U1S1"
* @param readLayouts the ordered {@link ReadLayout} objects defining how input fastq files are formatted
*
*/
public ReadLayoutConsumer(String outPutLayout, ReadLayout [] readLayouts){
this(outPutLayout, readLayouts, false, ":");
this(outPutLayout, readLayouts, false, ":", FastqQualityFormat.Standard);
}
/**
* @param outPutLayout in short format
* @param readLayouts all ordered layout (order is as the reads are read from files)
* Creates a ReadLayoutConsumer
*
* @param outPutLayout the string representation of the output layout e.g. "B1U1S1"
* @param readLayouts the ordered {@link ReadLayout} objects defining how input fastq files are formatted
* @param withQualityInReadName indicates if the Barcode/UMI quality should be injected in the read name together with their sequence
* @param readNameDelimitor the character to use to split up the read name (':' is the default)
* @param fastqQualityFormat the {@link FastqQualityFormat} of the input fastq files
*/
public ReadLayoutConsumer(String outPutLayout, ReadLayout [] readLayouts, boolean withQualityInReadName, String readNameDelimitor){
public ReadLayoutConsumer(String outPutLayout, ReadLayout [] readLayouts, boolean withQualityInReadName, String readNameDelimitor, final FastqQualityFormat fastqQualityFormat){
this.outPutLayout = outPutLayout;
this.readLayouts = readLayouts;
this.withQualityInReadName = withQualityInReadName;
this.readNameDelimitor = readNameDelimitor;
this.fastqQualityFormat = fastqQualityFormat;
Pattern sub = Pattern.compile("([BUSR])(\\d+)");
Matcher subMatcher = sub.matcher("");
......@@ -142,19 +154,6 @@ public class ReadLayoutConsumer {
/**
* Assemble a read name by concatenating the output layout to the original read name.
* Concatenation is made by inserting a readNameDelimitor between each added slot
* In this method, the read sequence is always used in BARCODE slots
*
* @param reads the reads in order matching that of the {@link ReadLayout} array used at construction
*
* @return
*/
public String assembleNewReadName(FastqRecord [] reads){
return assembleNewReadName(reads, null);
}
/**
* Assemble a read name by concatenating the output layout to the original read name.
......@@ -183,54 +182,61 @@ public class ReadLayoutConsumer {
String subseq = null;
byte[] qualB = null;
int bestQual = 0;
if(slotTypeCode == BYTECODE_BARCODE ){
// we init the subseq with the matched barcode directly
subseq = m.getBarcodeMatches().get(slotIdx).barcode;
}else{
for(int rlIdx : layoutIndicesToUseForSlots.get(i)){
ReadLayout rl = readLayouts[rlIdx];
FastqRecord readForLayout = reads[rlIdx];
String _subseq = null;
String _subqual = null;
switch (slotTypeCode) {
case BYTECODE_READBAR:
_subseq = rl.extractBarcode(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractBarcode(readForLayout.getBaseQualityString(), slotIdx);
break;
case BYTECODE_UMI:
_subseq = rl.extractUMI(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractUMI(readForLayout.getBaseQualityString(), slotIdx);
break;
default:
_subseq = rl.extractSample(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractSample(readForLayout.getBaseQualityString(), slotIdx);
break;
}
byte[] _qualB = SAMUtils.fastqToPhred(_subqual);
int _qualsum = overallQuality( _qualB );
if(subseq == null || _qualsum > bestQual){
subseq = _subseq;
qualB = _qualB;
bestQual = _qualsum;
}
for(int rlIdx : layoutIndicesToUseForSlots.get(i)){
ReadLayout rl = readLayouts[rlIdx];
FastqRecord readForLayout = reads[rlIdx];
String _subseq = null;
String _subqual = null;
switch (slotTypeCode) {
case BYTECODE_BARCODE:
// we init the subseq with the matched barcode directly
_subseq = m.getBarcodeMatches().get(slotIdx).barcode;
_subqual = rl.extractBarcode(readForLayout.getBaseQualityString(), slotIdx);
break;
case BYTECODE_READBAR:
_subseq = rl.extractBarcode(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractBarcode(readForLayout.getBaseQualityString(), slotIdx);
break;
case BYTECODE_UMI:
_subseq = rl.extractUMI(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractUMI(readForLayout.getBaseQualityString(), slotIdx);
break;
default:
_subseq = rl.extractSample(readForLayout.getReadString(), slotIdx);
_subqual = rl.extractSample(readForLayout.getBaseQualityString(), slotIdx);
break;
}
byte[] _qualB = _subqual.getBytes();
int _qualsum = overallQuality( _qualB );
if(subseq == null || _qualsum > bestQual){
subseq = _subseq;
qualB = _qualB;
bestQual = _qualsum;
}
}
//concatenate to the growing name
newname += this.readNameDelimitor + subseq;
if(withQualityInReadName)
newname += qualityToNumberString(qualB);
if(withQualityInReadName) {
newname += qualityToNumberString(qualB, this.fastqQualityFormat);
}
log.debug("header is now : "+newname);
}
return newname;
}
/*
* returns a string made of 2-digits quality scores for injection in the read name
/**
* @param qualbytes byte representation of initial quality string
* @param fastqQualityFormat the encoding of these bytes
* @return
*/
public synchronized static String qualityToNumberString(byte[] qualbytes) {
public synchronized static String qualityToNumberString(byte[] qualbytes, FastqQualityFormat fastqQualityFormat) {
JeUtils.convertQualityToPhred(qualbytes, fastqQualityFormat);
NumberFormat nf = NumberFormat.getIntegerInstance();
nf.setMinimumIntegerDigits(2);
StringBuffer sb = new StringBuffer(qualbytes.length*2);
......
......@@ -41,6 +41,9 @@ public class SampleMatch {
*/
protected Map<Integer, BarcodeMatch> barcodeMatches;
/**
* A note to propagate for diagnostic file
*/
protected String diagnosticNote = "";
public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches){
......
......@@ -836,7 +836,7 @@ public class Demultiplexer {
byte [] subseqBytes = readSlot.getReadBases();
byte [] qualBytes = readSlot.getBaseQualityString().getBytes();
log.debug(" Q bytes => "+Arrays.toString(qualBytes));
convertQuality(qualBytes, this.fastqQualityFormat);
JeUtils.convertQualityToPhred(qualBytes, this.fastqQualityFormat);
log.debug(" converted Q bytes => "+Arrays.toString(qualBytes));
int numMismatchesInBestBarcode = readSlot.getReadLength() + 1; //init with max mismatch num + 1
......@@ -920,22 +920,6 @@ public class Demultiplexer {
}
/**
 * Converts, in place, the fastq quality characters held in <code>quals</code> into numeric
 * phred-scaled scores, using the conversion that matches the given quality encoding.
 *
 * @param quals the quality bytes to convert ; the content of this array is overwritten
 * @param version the {@link FastqQualityFormat} in which <code>quals</code> is currently encoded
 */
protected void convertQuality(byte[] quals, final FastqQualityFormat version) {
    switch (version) {
        case Illumina:
            // pipeline 1.3+ style encoding
            SolexaQualityConverter.getSingleton().convertSolexa_1_3_QualityCharsToPhredBinary(quals);
            return;
        case Solexa:
            // pre-1.3 solexa style encoding
            SolexaQualityConverter.getSingleton().convertSolexaQualityCharsToPhredBinary(quals);
            return;
        case Standard:
            SAMUtils.fastqToPhred(quals);
            return;
        default:
            // any other value : the array is left untouched
            return;
    }
}
private FastqRecord[] nextReads(
......
......@@ -35,6 +35,7 @@ import org.embl.cg.utilitytools.utils.ExceptionUtil;
import org.embl.cg.utilitytools.utils.FileUtil;
import org.embl.cg.utilitytools.utils.StringUtil;
import org.embl.gbcs.je.FastqWriterLayout;
import org.embl.gbcs.je.JeUtils;
import org.embl.gbcs.je.JemultiplexerFastqWriterFactory;
import org.embl.gbcs.je.Jexception;
import org.embl.gbcs.je.ReadLayout;
......@@ -44,6 +45,7 @@ import org.slf4j.LoggerFactory;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.fastq.FastqWriter;
import htsjdk.samtools.util.FastqQualityFormat;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
......@@ -187,6 +189,14 @@ public class Jeclipper extends CommandLineProgram {
)
public String READ_NAME_SEPARATOR_CHAR = DEFAULT_READ_NAME_SEPARATOR_CHAR;
/**
 * Quality encoding of the input fastq files ; when left to null, validation falls back
 * to 'Standard' as the expected format.
 */
@Option(shortName="V", optional = true,
        printOrder=190,
        doc="A value describing how the quality values are encoded in the fastq files. Either 'Solexa' for pre-pipeline 1.3 " +
        "style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and above (phred scaling + 64) or 'Standard' for phred scaled " +
        "scores with a character shift of 33. If this value is not specified (or 'null' is given), the quality format is assumed to be 'Standard' (and is sanity-checked against the input fastq files).\n"
        )
public FastqQualityFormat QUALITY_FORMAT = null;
@Option(shortName="TEST", optional = true,
printOrder=210,
......@@ -237,6 +247,7 @@ public class Jeclipper extends CommandLineProgram {
_names.add(f.getAbsolutePath());
}
/*
* Validate O
* if not given, init to current dir else ensure the dir exists
......@@ -256,6 +267,22 @@ public class Jeclipper extends CommandLineProgram {
}
/*
* Check quality format
*/
if (QUALITY_FORMAT == null) { // we assume it s Standard
FastqReader [] readers = new FastqReader[FASTQ.size()];
int i = 0;
for(File f : FASTQ){
readers[i++] = new FastqReader(f);
}
QUALITY_FORMAT = JeUtils.determineQualityFormat(readers, FastqQualityFormat.Standard);
log.info( String.format("Auto-detected quality encoding format as '%s'. Please set V option explicitely if not correct.", QUALITY_FORMAT) );
} else {
log.info(String.format("Quality encoding format set to %s by user.", QUALITY_FORMAT));
}
/*
* parse read layout, we must have one per input FASTQ
*
......@@ -320,7 +347,7 @@ public class Jeclipper extends CommandLineProgram {
* here BARCODE (or B) always mean READBAR (or R). We need to convert BARCODE to READBAR to
* make sure the FastqWriterLayout behaves properly
*/
outLayouts[j] = new FastqWriterLayout(seqLayout, headerLayout, readLayouts, WITH_QUALITY_IN_READNAME, READ_NAME_SEPARATOR_CHAR, true);
outLayouts[j] = new FastqWriterLayout(seqLayout, headerLayout, readLayouts, WITH_QUALITY_IN_READNAME, READ_NAME_SEPARATOR_CHAR, true, this.QUALITY_FORMAT);
}catch(Exception e){
log.error(ExceptionUtil.getStackTrace(e));
return new String[]{e.getMessage()};
......@@ -480,7 +507,7 @@ public class Jeclipper extends CommandLineProgram {
for (int i = 0; i < fastqWriters.size(); i++) {
//prepare the output according to output layout
log.debug("Writing in output idx "+(i+1));
FastqRecord rec = outLayouts[i].assembleRecord( reads );
FastqRecord rec = outLayouts[i].assembleRecord( reads , null);
fastqWriters.get(i).write(rec);
}
}
......
......@@ -38,6 +38,7 @@ import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.fastq.FastqWriter;
import htsjdk.samtools.util.FastqQualityFormat;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
......@@ -90,13 +91,13 @@ public class Jedropseq extends CommandLineProgram {
@Option(shortName="F1", optional = false,
printOrder=10,
doc="Input fastq file (optionally gzipped) for first read. This read contains the cell barcode followed by the UMI"
doc="Input fastq file (optionally gzipped) for first read. This read contains the cell barcode followed by the UMI. Quality encoding must be Phred+33 (Standard)."
)
public File FASTQ_FILE1;
@Option(shortName="F2", optional = false,
printOrder=20,
doc="Input fastq file (optionally gzipped) for the second read."
doc="Input fastq file (optionally gzipped) for the second read. Quality encoding must be Phred+33 (Standard)."
)
public File FASTQ_FILE2 = null;
......@@ -312,8 +313,8 @@ public class Jedropseq extends CommandLineProgram {
if(WITH_QUALITY_IN_READNAME) {
//add the converted quality
cellBarcodeSeq += ReadLayoutConsumer.qualityToNumberString( SAMUtils.fastqToPhred(cellBarcodeQual) );