Commit 72f71463 authored by Charles Girardot's avatar Charles Girardot

added all functionality, needs in depth tsting now

parent 087183a0
#jars found in this folder are artifact that are not found in maven central, you can then puch them in your local maven repo with the following commands:
#ADAPT fpath to YOUR Je/lib
LIBPATH="/Users/girardot/Work/eclipse_ws/Je/lib/"
#ADAPT path to YOUR Je/lib
LIBPATH="/Users/girardot/git/Je/lib/"
cd ~/.m2
mvn install:install-file -DgroupId=net.sf -DartifactId=htsjdk -Dversion=1.140custom -Dfile=$LIBPATH/custom-picard/htsjdk-1.140.jar -Dpackaging=jar -DgeneratePom=true
mvn install:install-file -DgroupId=net.sf -DartifactId=picard -Dversion=1.140custom -Dfile=$LIBPATH/custom-picard/picard.jar -Dpackaging=jar -DgeneratePom=true
mvn install:install-file -DgroupId=org.broadinstitute -DartifactId=picard -Dversion=2.9.4 -Dfile=$LIBPATH/picard_2.9.4.jar -Dpackaging=jar -DgeneratePom=true
# Uncomment to ADD GBCS artifacts if needed (ie if you don t have access to these repos)
# IF you are at embl, you rather want to checkout the relevant projects and build them locally
......
......@@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>Je</groupId>
<artifactId>Je</artifactId>
<version>1.1</version>
<version>2.0.beta</version>
<name>Je</name>
<description>Je provides command line utilities to deal with barcoded FASTQ files with or without Unique Molecular Index (UMI)</description>
......@@ -233,34 +233,9 @@
<dependency>
<groupId>org.embl.cg.utilitytools</groupId>
<artifactId>ut_utils</artifactId>
<version>1.0</version>
</dependency>
<!-- <dependency> -->
<!-- <groupId>net.sf</groupId> -->
<!-- <artifactId>picard</artifactId> -->
<!-- <version>1.140</version> -->
<!-- </dependency> -->
<!-- <dependency> -->
<!-- <groupId>net.sf</groupId> -->
<!-- <artifactId>htsjdk</artifactId> -->
<!-- <version>1.140</version> -->
<!-- </dependency> -->
<dependency>
<groupId>net.sf</groupId>
<artifactId>picard</artifactId>
<version>1.140custom</version>
<version>1.0.1</version>
</dependency>
<dependency>
<groupId>net.sf</groupId>
<artifactId>htsjdk</artifactId>
<version>1.140custom</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
......@@ -291,6 +266,11 @@
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.broadinstitute</groupId>
<artifactId>picard</artifactId>
<version>2.9.4</version>
</dependency>
</dependencies>
......
......@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je.jemultiplexer;
package org.embl.gbcs.je;
import java.io.IOException;
import java.io.InputStream;
......
......@@ -21,13 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je.jemultiplexer;
package org.embl.gbcs.je;
/** Utility class to hang onto data about the best match for a given barcode */
public class BarcodeMatch {
/**
* indicates if a barcode match has been found, in which case 'barcode' is not null
* indicates if this barcode match fullfils the thresholds for barcode matching
*/
public boolean matched;
......@@ -35,6 +37,12 @@ public class BarcodeMatch {
* sequence of the matched barcode
*/
public String barcode;
/**
* sequence extracted from read
*/
public String readSequence;
/**
* number of mismatches with 'barcode'
......@@ -48,8 +56,9 @@ public class BarcodeMatch {
public String toString(){
if(matched)
return "matched :"+ barcode+" [MM="+mismatches+", MMD="+mismatchesToSecondBest+"]";
return "no match";
return "Match for "+readSequence+ " read sequence : barcode "+ barcode+" identified with [MM="+mismatches+", MMD="+mismatchesToSecondBest+"]";
else
return "No Match for "+readSequence+ " read sequence (best barcode is "+ barcode+" identified with [MM="+mismatches+", MMD="+mismatchesToSecondBest+"])";
}
}
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import picard.util.MathUtil;
import htsjdk.samtools.fastq.FastqRecord;
/**
*
* Describes how and what to write in FASTQ output files when demultiplexing input FASTQ files
* More precisely, it describes the output file layout(s) using the slots defined in read layouts e.g. '<BARCODE1><UMI1><UMI2>' or '<SAMPLE1>'
* Each FastqWriterLayout needs two of these descriptors :
* 1. One describing how to write the read sequence e.g. '<SAMPLE1>' (only writes the sample sequence) or '<BARCODE1><SAMPLE1>' to also
* keep the barcode in the output read sequence (and qualities)
* 2. A second one describing how to write the read name (header) e.g. '<BARCODE1><UMI1><UMI2>' to add the barcode and two extracted UMIs
* in the final read name, in addition to the original read name (ie header up to the space). Here each written slot is separated with ':' by default
*
*
* Note that a short layout format can also be used like 'B1', 'U2', 'S1'' instead of '<BARCODE1>' , '<UMI2>' and '<SAMPLE1>' ; respectively.
* For example, 'B1U1U2' is the same as '<BARCODE1><UMI1><UMI2>'.
*
* Technically speaking, the short layout format is the only one used.
*
* @author girardot
*
*/
public class FastqWriterLayout {
private static Logger log = LoggerFactory.getLogger(FastqWriterLayout.class);
private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE)\\d+>?)+$";
private static final String SHORT_LAYOUT_REGEX = "^([BUS]\\d+)+$";
/**
* char to use to delineate slots in read name ; if needed
*/
protected static String readNameDelimitor = ":";
/**
* Layout for writing the read sequence ; in short format
*/
protected String readSequenceLayout;
/**
* Consumer associated with the readSequenceLayout
*/
protected ReadLayoutConsumer sequenceConsumer;
/**
* Layout for writing the read name ; in short format
*/
protected String readNameLayout;
/**
* Consumer associated with the readNameLayout
*/
protected ReadLayoutConsumer readNameConsumer ;
/**
* All the {@link ReadLayout} defined in the demultiplexing ; used to find how to extract needed information
*/
protected ReadLayout [] readLayouts;
/**
* @param readSequenceLayout
* @param readNameLayout can be null when the read name should be reused unmodified
* @param readLayouts
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout [] readLayouts) {
this.readNameLayout = (StringUtils.isBlank(readNameLayout) ? null : convertToShortLayout(readNameLayout));
this.readSequenceLayout = convertToShortLayout(readSequenceLayout);
this.readLayouts = readLayouts;
init(); //build all maps for easy lookup
}
/**
* @param readSequenceLayout
* @param readNameLayout can be null when the read name should be reused unmodified
* @param readLayout
*/
public FastqWriterLayout(final String readSequenceLayout, final String readNameLayout, final ReadLayout readLayout) {
this(readSequenceLayout, readNameLayout, new ReadLayout[]{ readLayout });
}
/**
* @param layout
* @return
*/
private String convertToShortLayout(final String layout) {
log.debug("given layout : "+layout);
if(StringUtils.isBlank(layout))
return layout;
String shortLayout = layout;
if(!Pattern.matches(SHORT_LAYOUT_REGEX, layout)){
if(!Pattern.matches(LONG_LAYOUT_REGEX, layout))
throw new LayoutMalformedException("FASTQ Output Layout does not match expected short ("+SHORT_LAYOUT_REGEX+") nor long ("+LONG_LAYOUT_REGEX+") formats", layout);
//convert to short
shortLayout = shortLayout.replaceAll("<", "");
shortLayout = shortLayout.replaceAll(">", "");
shortLayout = shortLayout.replaceAll("ARCODE", "");
shortLayout = shortLayout.replaceAll("MI", "");
shortLayout = shortLayout.replaceAll("AMPLE", "");
}
log.debug("short layout : "+shortLayout);
return shortLayout;
}
/**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s)
* @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout} for each BARCODE.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param m a {@link SampleMatch} holding all the barcode matches
* @return
*/
public FastqRecord assembleRecord( FastqRecord[] reads, boolean[] useReadSequenceForBarcodes, SampleMatch m ){
FastqRecord rec = sequenceConsumer.assembleNewRead(reads);
String name = rec.getReadName();
if(readNameConsumer != null)
name = readNameConsumer.assembleNewReadName(reads, useReadSequenceForBarcodes, m);
FastqRecord ass = new FastqRecord(name, rec.getReadString(), rec.getBaseQualityHeader(), rec.getBaseQualityString());
log.debug("Assembled read for output using layout [NameLayout="+this.readNameLayout+" ; SequenceLayout="+this.readSequenceLayout+"] => \n"+ass.toFastQString());
return ass;
}
/**
* Convenient wrapper for single end configuration
* @param read the {@link FastqRecord} from the input fastq file
* @param useReadSequenceForBarcodes dictates what to write in the read header layouts of the {@link FastqWriterLayout} for each BARCODE.
* When false, the matched barcode is used. When true, the exact read sequence extracted from the barcode slot is written
* @param m a {@link SampleMatch} holdign all the barcode matches
* @return
*/
public FastqRecord assembleRecord( FastqRecord read, boolean[] useReadSequenceForBarcodes, SampleMatch m ){
return assembleRecord(new FastqRecord[]{read}, useReadSequenceForBarcodes, m);
}
/**
* Checks that this layout can be produced from a list of {@link ReadLayout}
* @param readLayouts
* @return
*/
protected void init(){
/*
* Process (short format) layout for easy output assembly at FASTQ writing time
*/
// do for read seq
if(!Pattern.matches(SHORT_LAYOUT_REGEX, this.readSequenceLayout)){
throw new LayoutMalformedException("FASTQ Output Layout for read sequence does not match expected short format (regex is :"+SHORT_LAYOUT_REGEX+")", this.readSequenceLayout);
}
sequenceConsumer = new ReadLayoutConsumer(this.readSequenceLayout, this.readLayouts);
// do for read name if not null
if( this.readNameLayout != null){
if(!Pattern.matches(SHORT_LAYOUT_REGEX, this.readNameLayout)){
throw new LayoutMalformedException("FASTQ Output Layout for read name does not match expected short format (regex is :"+SHORT_LAYOUT_REGEX+")", this.readNameLayout);
}
readNameConsumer = new ReadLayoutConsumer(this.readNameLayout, this.readLayouts);
}
}
/**
* @param reads
* @param readLayouts
* @return
*/
public static Map<Integer, List<FastqRecord>> extractBarcodeSlots(FastqRecord[] reads, ReadLayout[] readLayouts) {
Map<Integer, List<FastqRecord>> m = new HashMap<Integer, List<FastqRecord>>();
/*
* for each read layout
*/
for (int i = 0; i < readLayouts.length; i++) {
//find those with BARCODE slot(s)
if(readLayouts[i].containsBarcode()){
//extract the BARCODE subsequence
String [] sequences = readLayouts[i].extractBarcodes(reads[i].getReadString());
//and the corresponding quality strings
String [] qualities = readLayouts[i].extractBarcodes(reads[i].getBaseQualityString());
//and the BARCODE idx corresponding to them
List<Integer> barcodeBlockIds = readLayouts[i].getOrderedBarcodeBlockUniqueIds();
//save each subsequence/quality as a FastqRecord in the list corresponding to the BARCODE idx
for (int j = 0; j < barcodeBlockIds.size(); j++) {
int blockId = barcodeBlockIds.get(j);
if(!m.containsKey(blockId))
m.put(blockId, new ArrayList<FastqRecord>());
FastqRecord fr = new FastqRecord(null, sequences[j], null, qualities[j]);
log.debug("Extracted Barcode : "+fr.toFastQString());
m.get(blockId).add(
fr
);
}
}
}
return m;
}
/**
* @return the readNameDelimitor
*/
public String getReadNameDelimitor() {
return readNameDelimitor;
}
/**
* @param readNameDelimitor the readNameDelimitor to set
*/
public void setReadNameDelimitor(String readNameDelimitor) {
this.readNameDelimitor = readNameDelimitor;
}
/**
* @return the readSequenceLayout in short format
*/
public String getReadSequenceLayout() {
return readSequenceLayout;
}
/**
* @return the readNameLayout in short format
*/
public String getReadNameLayout() {
return readNameLayout;
}
}
......@@ -26,7 +26,9 @@ package org.embl.gbcs.je;
import java.util.Set;
import java.util.TreeSet;
import org.embl.cg.utilitytools.utils.ExceptionUtil;
import org.embl.cg.utilitytools.utils.StringUtil;
import org.embl.gbcs.je.demultiplexer.Jedemultiplex;
import org.embl.gbcs.je.jeclipper.Jeclipper;
import org.embl.gbcs.je.jedropseq.Jedropseq;
import org.embl.gbcs.je.jeduplicates.MarkDuplicatesWithMolecularCode;
......@@ -48,6 +50,7 @@ public class Je {
private static Logger log = LoggerFactory.getLogger(Je.class);
public static final String COMMAND_DEMULTIPLEX = "debarcode";
public static final String COMMAND_DROPSEQ = "dropseq";
public static final String COMMAND_CLIP = "clip";
public static final String COMMAND_DUPES = "markdupes";
......@@ -62,6 +65,7 @@ public class Je {
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX_ILLUMINA);
ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
ALLOWED_COMMANDS.add(COMMAND_DEMULTIPLEX);
}
......@@ -90,49 +94,57 @@ public class Je {
System.exit(0);
}
else if(!ALLOWED_COMMANDS.contains(option.toLowerCase())){
System.err.println("Unknown command name : "+option);
System.err.println(getUsage());
System.err.println("Unknown command name : "+option+" ; please check help with -h");
//System.err.println(getUsage());
System.exit(1); //error
}
/*
* looks good , we delegate to proper implementation
*/
String [] argv = {"-h"}; // init to get help
if(args.length > 1){
argv = StringUtil.subArray(args, 1, args.length-1);
}
if(option.equalsIgnoreCase(COMMAND_CLIP)){
new Jeclipper().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_MULTIPLEX)){
new Jemultiplexer().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_MULTIPLEX_ILLUMINA)){
new JemultiplexerIllumina().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_DUPES)){
new MarkDuplicatesWithMolecularCode().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_DROPSEQ)){
new Jedropseq().instanceMainWithExit(argv);
}
else{
System.err.println(
"FATAL : We just reached a supposedly unreachable part of the code. Please report this bug to Je developpers indicating the options you used i.e. : \n "+
StringUtil.mergeArray(args, " ")
);
try{
String [] argv = {"-h"}; // init to get help
if(args.length > 1){
argv = StringUtil.subArray(args, 1, args.length-1);
}
if(option.equalsIgnoreCase(COMMAND_CLIP)){
new Jeclipper().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_DEMULTIPLEX)){
new Jedemultiplex().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_MULTIPLEX)){
new Jemultiplexer().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_MULTIPLEX_ILLUMINA)){
new JemultiplexerIllumina().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_DUPES)){
new MarkDuplicatesWithMolecularCode().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_DROPSEQ)){
new Jedropseq().instanceMainWithExit(argv);
}
else{
System.err.println(
"FATAL : We just reached a supposedly unreachable part of the code. Please report this bug to Je developpers indicating the options you used i.e. : \n "+
StringUtil.mergeArray(args, " ")
);
System.exit(1); //error
}
} catch(Exception e){
log.error(ExceptionUtil.getStackTrace(e));
System.exit(1); //error
}
}
protected static String getUsage(){
return "Usage: je <command> [options] \n\n"+
"with command in : \n"
+"\t "+COMMAND_CLIP+" \t\t clips molecular barcodes from fastq sequence and places them in read name headers for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX+" \t\t demultiplex fastq file(s), with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX_ILLUMINA+" \t demultiplex fastq file(s) using Illumina Index files, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_DEMULTIPLEX+" \t\t demultiplexes fastq file(s), with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX+" \t\t demultiplexes fastq file(s) with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX_ILLUMINA+" \t demultiplexes fastq file(s) using Illumina Index files with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_DUPES+" \t\t removes read duplicates based on molecular barcodes found in read name headers (as produced by clip or plex)\n"
+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
+"\n"
......
......@@ -23,6 +23,9 @@
*/
package org.embl.gbcs.je;
import java.util.Set;
import java.util.TreeSet;
import com.developpez.adiguba.shell.ProcessConsumer;
public class JeUtils {
......@@ -61,6 +64,19 @@ public class JeUtils {
}
public static int barcodeSlotCount(ReadLayout[] readLayouts) {
return barcodeBlockUniqueIdSet(readLayouts).size();
}
public static Set<Integer> barcodeBlockUniqueIdSet(ReadLayout[] readLayouts) {
Set<Integer> allIds = new TreeSet<Integer>();
for (ReadLayout rl : readLayouts) {
allIds.addAll(rl.getBarcodeBlockUniqueIds());
}
return allIds;
}
......
......@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je.jemultiplexer;
package org.embl.gbcs.je;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.fastq.AsyncFastqWriter;
......@@ -37,7 +37,7 @@ import java.util.zip.GZIPOutputStream;
public class JemultiplexerFastqWriterFactory {
boolean useAsyncIo = Defaults.USE_ASYNC_IO;
boolean useAsyncIo = Defaults.USE_ASYNC_IO_WRITE_FOR_SAMTOOLS;
/** Sets whether or not to use async io (i.e. a dedicated thread per writer. */
......
......@@ -23,14 +23,14 @@
*/
package org.embl.gbcs.je;
public class ReadLayoutMalformedException extends Jexception {
public class LayoutMalformedException extends Jexception {
/**
*
*/
private static final long serialVersionUID = -4024288961353288534L;
public ReadLayoutMalformedException(String message, String layout) {
public LayoutMalformedException(String message, String layout) {
super(message+"\n Layout was : "+layout);
}
......
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute