Commit a425ddef authored by Charles Girardot's avatar Charles Girardot

commit before refactoring to introduce the READBAR concept

parent 72f71463
......@@ -4,3 +4,5 @@ bin
target/
.DS_Store
._*
embl.properties
test.properties
......@@ -49,11 +49,21 @@ import htsjdk.samtools.fastq.FastqRecord;
* 2. A second one describing how to write the read name (header) e.g. '<BARCODE1><UMI1><UMI2>' to add the barcode and two extracted UMIs
* in the final read name, in addition to the original read name (ie header up to the space). Here each written slot is separated with ':' by default
*
*
* Note that in case of barcode, one might want to write the barcode or the read sequence corresponding to the looked up sample barcode.
*
* Note that a short layout format can also be used like 'B1', 'U2', 'S1'' instead of '<BARCODE1>' , '<UMI2>' and '<SAMPLE1>' ; respectively.
* The possible keys are :<br/>
* <ul>
* <li>SAMPLEn : refers to the SAMPLE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>UMIn : refers to the UMI slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>BARCODEn : refers to the sample barcode resolved from the read sequence found in the of the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* <li>READBARn : refers to the read sequence found in the BARCODE slot with idx 'n' defined in the {@link ReadLayout} objects</li>
* </ul>
*
* Note that a short layout format can also be used like 'B1', 'U2', 'S1' or 'R1' instead of '<BARCODE1>' , '<UMI2>' , '<SAMPLE1>' and <READBAR>; respectively.
* For example, 'B1U1U2' is the same as '<BARCODE1><UMI1><UMI2>'.
*
* Technically speaking, the short layout format is the only one used.
* Technically speaking, the short layout format is the only one used.
*
* @author girardot
*
......@@ -62,8 +72,8 @@ public class FastqWriterLayout {
private static Logger log = LoggerFactory.getLogger(FastqWriterLayout.class);
private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE)\\d+>?)+$";
private static final String SHORT_LAYOUT_REGEX = "^([BUS]\\d+)+$";
private static final String LONG_LAYOUT_REGEX = "^(<?(BARCODE|UMI|SAMPLE|READBAR)\\d+>?)+$";
private static final String SHORT_LAYOUT_REGEX = "^([BUSR]\\d+)+$";
......@@ -143,12 +153,32 @@ public class FastqWriterLayout {
shortLayout = shortLayout.replaceAll("ARCODE", "");
shortLayout = shortLayout.replaceAll("MI", "");
shortLayout = shortLayout.replaceAll("AMPLE", "");
shortLayout = shortLayout.replaceAll("EADBAR", "");
}
log.debug("short layout : "+shortLayout);
return shortLayout;
}
/**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s).
* This method also use the read sequence to write BARCODE in read name
* @param reads the {@link FastqRecord} from the input fastq files in the order matching the {@link ReadLayout} given at construction
*
* @return
*/
public FastqRecord assembleRecord( FastqRecord[] reads ){
FastqRecord rec = sequenceConsumer.assembleNewRead(reads);
String name = rec.getReadName();
if(readNameConsumer != null)
name = readNameConsumer.assembleNewReadName(reads);
FastqRecord ass = new FastqRecord(name, rec.getReadString(), rec.getBaseQualityHeader(), rec.getBaseQualityString());
log.debug("Assembled read for output using layout [NameLayout="+this.readNameLayout+" ; SequenceLayout="+this.readSequenceLayout+"] => \n"+ass.toFastQString());
return ass;
}
/**
* Assemble the {@link FastqRecord} that should be written in the output file according to the layout(s)
......
......@@ -34,6 +34,7 @@ import org.embl.gbcs.je.jedropseq.Jedropseq;
import org.embl.gbcs.je.jeduplicates.MarkDuplicatesWithMolecularCode;
import org.embl.gbcs.je.jemultiplexer.Jemultiplexer;
import org.embl.gbcs.je.jemultiplexer.JemultiplexerIllumina;
import org.embl.gbcs.je.retag.TagFromReadName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -50,6 +51,7 @@ public class Je {
private static Logger log = LoggerFactory.getLogger(Je.class);
public static final String COMMAND_RETAG = "retag";
public static final String COMMAND_DEMULTIPLEX = "debarcode";
public static final String COMMAND_DROPSEQ = "dropseq";
public static final String COMMAND_CLIP = "clip";
......@@ -64,8 +66,9 @@ public class Je {
ALLOWED_COMMANDS.add(COMMAND_DUPES);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_MULTIPLEX_ILLUMINA);
ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
//ALLOWED_COMMANDS.add(COMMAND_DROPSEQ);
ALLOWED_COMMANDS.add(COMMAND_DEMULTIPLEX);
ALLOWED_COMMANDS.add(COMMAND_RETAG);
}
......@@ -125,6 +128,9 @@ public class Je {
else if(option.equalsIgnoreCase(COMMAND_DROPSEQ)){
new Jedropseq().instanceMainWithExit(argv);
}
else if(option.equalsIgnoreCase(COMMAND_RETAG)){
new TagFromReadName().instanceMainWithExit(argv);
}
else{
System.err.println(
"FATAL : We just reached a supposedly unreachable part of the code. Please report this bug to Je developpers indicating the options you used i.e. : \n "+
......@@ -146,7 +152,8 @@ public class Je {
+"\t "+COMMAND_MULTIPLEX+" \t\t demultiplexes fastq file(s) with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_MULTIPLEX_ILLUMINA+" \t demultiplexes fastq file(s) using Illumina Index files with Je 1.x implementation, with optional handling of molecular barcodes for further use in 'dupes' module\n"
+"\t "+COMMAND_DUPES+" \t\t removes read duplicates based on molecular barcodes found in read name headers (as produced by clip or plex)\n"
+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
//+"\t "+COMMAND_DROPSEQ+" \t\t clips cell barcode and UMI from read 1 and adds them to header of read 2. This command is for processing drop-seq results.\n"
+"\t "+COMMAND_RETAG+" \t\t extracts barcode and UMI sequence(s) embedded in read names and tag reads with proper BAM tag.\n"
+"\n"
+"Version : "+getVersion()
;
......
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import htsjdk.samtools.util.FastqQualityFormat;
import htsjdk.samtools.util.QualityEncodingDetector;
import htsjdk.samtools.util.SolexaQualityConverter;
import java.io.File;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class JeTry {
private static Logger log = LoggerFactory.getLogger(JeTry.class);
public JeTry() {
// TODO Auto-generated constructor stub
}
public static void main(String[] args) {
String f = "/g/furlong/incoming/2017-06-28-000000000-B954B/000000000-B954B_precap_allpromV2_17s003049-1-1_Ghavi-helm_lane117s003049_1_sequence.txt.gz";
FastqReader reader = new FastqReader(new File(f));
FastqQualityFormat QUALITY_FORMAT = QualityEncodingDetector.detect(100000, reader);
log.info(String.format("Auto-detected quality encoding format as: %s.", QUALITY_FORMAT));
FastqRecord r =reader.iterator().next();
System.out.println(r.toFastQString());
System.out.println(r.getBaseQualityString());
byte [] bites = Arrays.copyOf( r.getBaseQualityString().getBytes() , r.getBaseQualityString().getBytes().length);
System.out.println(Arrays.toString( bites ));
SAMUtils.fastqToPhred(bites);
System.out.println(Arrays.toString( bites ));
bites = Arrays.copyOf( r.getBaseQualityString().getBytes() , r.getBaseQualityString().getBytes().length);
System.out.println(Arrays.toString( bites ));
SolexaQualityConverter.getSingleton().convertSolexa_1_3_QualityCharsToPhredBinary(bites);
System.out.println(Arrays.toString( bites ));
}
}
......@@ -48,7 +48,7 @@ import org.slf4j.LoggerFactory;
* <li> When no length ('x') is specified, all the sequence till the end is considered ; it only possible to use the 'x'
* shortcut in the last block of a layout</li>
* <li> When a negative value is given in place of length (e.g. '<BLOCKCODEn:-2>'), all but the last x (2 in the
* '<BLOCKCODEn:-2>' example) bases ; a negative length value is only acceptedin the last block of a layout</li>
* '<BLOCKCODEn:-2>' example) bases ; a negative length value is only accepted in the last block of a layout</li>
* </ul>
*
* <br/>
......@@ -152,6 +152,7 @@ public class ReadLayout {
}
/**
* change
* Process the layout and initialize useful variables
*/
public void parseLayout(){
......
......@@ -27,6 +27,7 @@ import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.fastq.FastqRecord;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
......@@ -122,6 +123,24 @@ public class ReadLayoutConsumer {
}
/**
* Assemble a read name by concatenating the output layout to the original read name.
* Concatenation is made by inserting a readNameDelimitor between each added slot
* In this method, the read sequence is always used in BARCODE slots
*
* @param reads the reads in order matching that of the {@link ReadLayout} array used at construction
*
* @return
*/
public String assembleNewReadName(FastqRecord [] reads){
boolean[] useReadSequenceForBarcodes = new boolean[reads.length];
Arrays.fill(useReadSequenceForBarcodes, true);
return assembleNewReadName(reads, useReadSequenceForBarcodes, null);
}
/**
* Assemble a read name by concatenating the output layout to the original read name.
* Concatenation is made by inserting a readNameDelimitor between each added slot
......@@ -150,7 +169,7 @@ public class ReadLayoutConsumer {
*/
String subseq = null;
int bestQual = 0;
if(!useReadSequenceForBarcodes[slotIdx-1] && slotTypeCode == BYTECODE_BARCODE){
if(slotTypeCode == BYTECODE_BARCODE && !useReadSequenceForBarcodes[slotIdx-1] ){
// we init the subseq with the matched barcode directly
subseq = m.getBarcodeMatches().get(slotIdx).barcode;
}else{
......@@ -208,7 +227,7 @@ public class ReadLayoutConsumer {
byte slotTypeCode = slotCodes.get(i);
int slotIdx = this.slotIdx.get(i);
log.debug("gettign info for slot code "+slotTypeCode+" with idx "+slotIdx);
log.debug("getting info for slot code "+slotTypeCode+" with idx "+slotIdx);
/*
* when a slot can be obtained from different reads (e.g. redundant barcode), keep the one with best overall quality
*/
......
......@@ -41,12 +41,19 @@ public class SampleMatch {
*/
protected Map<Integer, BarcodeMatch> barcodeMatches;
protected String diagnosticNote = "";
public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches){
this.sample= sample;
this.barcodeMatches = barcodeMatches;
}
public SampleMatch(String sample, Map<Integer, BarcodeMatch> barcodeMatches, String diagnosticNote){
this.sample= sample;
this.barcodeMatches = barcodeMatches;
this.diagnosticNote = diagnosticNote;
}
/**
* @return the sample
......@@ -64,6 +71,22 @@ public class SampleMatch {
}
/**
* @return the diagnosticNote
*/
public String getDiagnosticNote() {
return diagnosticNote;
}
/**
* @param diagnosticNote the diagnosticNote to set
*/
public void setDiagnosticNote(String diagnosticNote) {
this.diagnosticNote = diagnosticNote;
}
/**
* @param sample the sample to set
*/
......
......@@ -30,6 +30,7 @@ import htsjdk.samtools.util.QualityEncodingDetector;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
......@@ -87,7 +88,8 @@ public class Jedemultiplex extends CommandLineProgram {
protected static final String DEFAULT_METRICS_FILE_NAME = "jemultiplexer_out_stats.txt";
protected static final String DEFAULT_READ_NAME_SEPARATOR_CHAR = ":";
protected static final String DEFAULT_USE_ORIGINAL_BARCODE_SEQUENCE = "0";
protected static final boolean DEFAULT_STRICT = false;
protected static final boolean DEFAULT_ADD_SEQUENCE_LAYOUT_IN_OUTPUT_FILENAME = false;
protected static final boolean DEFAULT_ADD_HEADER_LAYOUT_IN_OUTPUT_FILENAME = false;
......@@ -304,6 +306,14 @@ public class Jedemultiplex extends CommandLineProgram {
// array associated with MIN_BASE_QUALITY (after parsing) ; with a length that of the different barcode number found in the barcode file
int [] min_base_qualities;
@Option(shortName="S", optional = true,
printOrder=85,
doc="When reads have redundant BARCODE slots, this option tells how to handle situation when the read sequence do not resolve to the same sample.\n"
+" When true, the read pair is always 'unassigned'.\n"
+" When false, the read pair is assigned to the sample with the lowest overall mismatch sum\n"
)
public boolean STRICT = false;
@Option(shortName = "O", optional = true,
printOrder=90,
doc="Output directory. By default, output files are written in running directory.\n")
......@@ -951,8 +961,7 @@ public class Jedemultiplex extends CommandLineProgram {
private List<String> customEmbaseCommandLineValidation() {
//error messages to be accumulated
List<String> messages = new ArrayList<String>();
OUTPUT_DIR = null;
GZIP_OUTPUTS=true;
CREATE_MD5_FILE = true;
......@@ -964,12 +973,18 @@ public class Jedemultiplex extends CommandLineProgram {
String ufname= FileUtil.removeExtension( f.getName() ) + "_unassigned-reads"+".txt.gz" ;
unassignedFilePathes.add(ufname);
}
/*
* Get information from emBASE
*/
//try load properties
try {
ApplicationConfiguration.init();
} catch (IOException e) {
log.error("Failed to read properties from property file.");
throw new RuntimeException(e);
}
//Determine the caller
String username = System.getProperty("user.name");
//check user is not trying to trick me with a -Duser.name=another user!
......@@ -1099,7 +1114,7 @@ public class Jedemultiplex extends CommandLineProgram {
"Are you sure this lane has been successfully uploaded to emBASE ?\nIf so, please contact emBASE admins.";
log.debug(mess);
if(TEST_MODE_STOP_AFTER_PARSING){
log.debug("TEST MODE : Setting fake dir for "+lib.getName());
log.info("TEST MODE : Setting fake dir for "+lib.getName());
lib.setSampleDir ( EmBaseDatabaseFacade.getFakeSampleDir(lib.getName(), lib.getId()) );//does not exist, just fake for test
}
else{
......@@ -1110,17 +1125,22 @@ public class Jedemultiplex extends CommandLineProgram {
//update lib.getSampleDir with fastq sub-dir
File fastqDir = new File(lib.getSampleDir(), "fastq");
if(!fastqDir.exists() || !fastqDir.canWrite()){
String mess = "The 'fastq' dir under the library dir "+lib.getSampleDir().getAbsolutePath()
+" is either not existing OR not writable.\n";
if(!fastqDir.exists()){
mess+="The storage architecture is not ready. Are you sure this lane has been successfully uploaded to emBASE ?\nIf so, please contact emBASE admins.";
if(!TEST_MODE_STOP_AFTER_PARSING){
String mess = "The 'fastq' dir under the library dir "+lib.getSampleDir().getAbsolutePath()
+" is either not existing OR not writable.\n";
if(!fastqDir.exists()){
mess+="The storage architecture is not ready. Are you sure this lane has been successfully uploaded to emBASE ?\nIf so, please contact emBASE admins.";
}
else{
mess+="The fastq storage file has been already locked to prevent further modifications.\n" +
"Please contact emBASE admins if you think this should not be the case.";
}
log.debug(mess);
messages.add(mess);
} else {
lib.setSampleDir( EmBaseDatabaseFacade.TMP_DIR );
log.info("TEST MODE : Setting fake sample dir to "+EmBaseDatabaseFacade.TMP_DIR);
}
else{
mess+="The fastq storage file has been already locked to prevent further modifications.\n" +
"Please contact emBASE admins if you think this should not be the case.";
}
log.debug(mess);
messages.add(mess);
}else{
lib.setSampleDir( fastqDir );
}
......@@ -1142,7 +1162,11 @@ public class Jedemultiplex extends CommandLineProgram {
*/
PrintWriter pw = null;
try{
BARCODE_FILE = getBarcodeFileTmpLocation(FASTQ.get(0), username, runStatDir);
BARCODE_FILE = getBarcodeFileTmpLocation(
FASTQ.get(0),
username,
(TEST_MODE_STOP_AFTER_PARSING ? EmBaseDatabaseFacade.TMP_DIR : runStatDir)
);
if(BARCODE_FILE.exists()){
log.warn("The barcode file already exists and will be overwritten : "+BARCODE_FILE.getAbsolutePath());
}
......@@ -1152,7 +1176,6 @@ public class Jedemultiplex extends CommandLineProgram {
File fastq1 = getDemultiplexedFile(lib, 1 , GZIP_OUTPUTS);
File fastq2 = getDemultiplexedFile(lib, 2 , GZIP_OUTPUTS);
/*
* check if file already exists , the test on the length is meant to avoid considering a 'fake' file as a real file
* we indeed create such files to prepare the architecture
......@@ -1215,7 +1238,7 @@ public class Jedemultiplex extends CommandLineProgram {
outLayouts,
this.unassignedFiles,
this.metricsFile,
this.QUALITY_FORMAT,
this.QUALITY_FORMAT,
this.GZIP_OUTPUTS,
this.CREATE_MD5_FILE);
......@@ -1226,6 +1249,7 @@ public class Jedemultiplex extends CommandLineProgram {
min_mismatch_deltas,
min_base_qualities,
useOriginalSequenceForBarcode,
STRICT,
WRITER_FACTORY_USE_ASYNC_IO,
bcDiagFile);
......
......@@ -54,6 +54,7 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.embl.cg.utilitytools.utils.FileUtil;
......@@ -80,8 +81,8 @@ import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesMap;
+ " account molecular barcodes (Unique Molecular Identifiers or UMIs) found in read header. "
+ "All records are then either written to the output file with the duplicate records flagged or trashed.\n"
+"Example :\n"
+"\t je markdupes INPUT=file_with_dupes.bam OUPUT=result.bam MM=1",
usageShort = "je markdupes INPUT=file_with_dupes.bam OUPUT=result.bam MM=1",
+"\t je markdupes INPUT=file_with_dupes.bam OUTPUT=result.bam MM=1",
usageShort = "je markdupes INPUT=file_with_dupes.bam OUTPUT=result.bam MM=1",
programGroup = SamOrBam.class
)
public class MarkDuplicatesWithMolecularCode extends AbstractMarkDuplicatesCommandLineProgram {
......@@ -210,8 +211,6 @@ public class MarkDuplicatesWithMolecularCode extends AbstractMarkDuplicatesComma
private int numDuplicateIndices = 0;
private LibraryIdGenerator libraryIdGenerator = null; // this is initialized in buildSortedReadEndLists
public MarkDuplicatesWithMolecularCode() {
super();
......@@ -223,6 +222,42 @@ public class MarkDuplicatesWithMolecularCode extends AbstractMarkDuplicatesComma
new MarkDuplicatesWithMolecularCode().instanceMainWithExit(args);
}
private void initReadNameRegexFromRead() {
//teh base regex to which will had as many ":[ATGCUNatgcun]+" slots as needed
String baseRegex = "(?:.*:)?([0-9]+)[^:]*:([0-9]+)[^:]*:([0-9]+)[^:]*";
final SamHeaderAndIterator headerAndIterator = openInputs();
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
SAMRecord rec = null;
//peek a record
while (iterator.hasNext()) {
rec = iterator.next();
break;
}
iterator.close();
//get read name
String name = rec.getReadName();
String [] tokens = name.split(":");
//start from end : how many slots match a barcode ?
int n=0;
for(int p = tokens.length -1 ; p>0 ;p-- ){
if(Pattern.matches("[ATGCUNatgcun]+", tokens[p]))
n++;
}
for(int i = 0 ; i< n; i++){
baseRegex += ":[ATGCUNatgcun]+";
}
log.debug("READ_NAME_REGEX initialized to "+baseRegex);
this.READ_NAME_REGEX = baseRegex;
}
/**
* Main work method. Reads the BAM file once and collects sorted information about
* the 5' ends of both ends of each read (or just one end in the case of pairs).
......@@ -234,6 +269,23 @@ public class MarkDuplicatesWithMolecularCode extends AbstractMarkDuplicatesComma
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(METRICS_FILE);
/*
* SET READ_NAME_REGEX to a proper value IF it was not given in the command line
* => Reasoning on command line for READ_NAME_REGEX option
*/
if(!this.getCommandLine().contains("READ_NAME_REGEX")){
/*
* we reset the default READ_NAME_REGEX to accommodate the additional barcode slots added to the read name
* but only it the separator used was ':' and SLOTS or TSLOTS are not null
*/
if(this.SPLIT_CHAR.equals(':')){
initReadNameRegexFromRead();
}
//else the default regex should still work
}
/*
* Molecular Barcode Support
* Initialize the MolecularBarcodeFinder
......@@ -361,8 +413,7 @@ public class MarkDuplicatesWithMolecularCode extends AbstractMarkDuplicatesComma
++metrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
}
if (recordInFileIndex == nextDuplicateIndex) {
if (recordInFileIndex == nextDuplicateIndex) {
rec.setDuplicateReadFlag(true);
// Update the duplication metrics
......@@ -419,6 +470,7 @@ public class MarkDuplicatesWithMolecularCode extends AbstractMarkDuplicatesComma
}
if (!this.REMOVE_DUPLICATES || !rec.getDuplicateReadFlag()) {
if (PROGRAM_RECORD_ID != null) {
rec.setAttribute(SAMTag.PG.name(), chainedPgIds.get(rec.getStringAttribute(SAMTag.PG.name())));
......@@ -844,6 +896,7 @@ public class MarkDuplicatesWithMolecularCode extends AbstractMarkDuplicatesComma
best = end;
}
}
}
//flag all reads duplicates but best
......
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.embl.gbcs.je.jeduplicates;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import htsjdk.samtools.util.StringUtil;
import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates;
public class ReadEndsForMarkDuplicatesWithMolecularCode2 extends
ReadEndsForMarkDuplicates implements MolecularCodedReadEnds{
/*
What do we need to store you ask? Well, we need to store:
- byte: orientation
- short: libraryId, readGroup, tile, x, y, score
- int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate, code length
- long: read1IndexInFile, read2IndexInFile
- byte array for code : fix it to 20, should be a max here
*/
public static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 5) + (8 * 2) + 1
+ 8 + // last 8 == reference overhead
+ 20 + //arbitrary byte array for code
13; // This is determined experimentally with JProfiler
protected byte[] molecularCodeArr = null;
public void setMolecularCode(String code){
molecularCodeArr = StringUtil.stringToBytes(code.toUpperCase());
}
/**
* This is only meant to be used by the Codec. We don t want people to use this ie
* we want to make sure they use the string version so that we ensure codes are set upper case
* @param molecularCodeArr
*/
protected void setMolecularCode(byte[] molecularCodeArr){
this.molecularCodeArr = molecularCodeArr;
}
/**
* @return the molecularCode
*/
public String getMolecularCode() {
return StringUtil.bytesToString(molecularCodeArr);
}
/**
* @param codelen length of an individual code. Each code is supposed to have the same length
* @return the molecularCodes
*/
public List<String> getMolecularCodes(int codelen) {
int codenumber = molecularCodeArr.length / codelen;
List<String> codes = new ArrayList<String>(codenumber);
for(int i = 0 ; i < codenumber; i++){
codes.add(
StringUtil.bytesToString(molecularCodeArr, i*codelen, codelen)
);
}
return codes;
}
/**
* @return the molecularCodeArr
*/
public byte[] getMolecularCodeArr() {
return molecularCodeArr;
}
public String toString(){
String s = "RG="+readGroup+" ; "+tile+":"+x+":"+y+" ; orientation="+orientation+" ; paired="+(isPaired()?"1":"0")+" \n "
+" R1-ref="+read1ReferenceIndex + ", R1-coor="+read1Coordinate +", idx="+read1IndexInFile+" \n "
+" R2-ref="+read2ReferenceIndex + ", R2-coor="+read2Coordinate +", idx="+read2IndexInFile+" \n ";
return s;
}
}
This diff is collapsed.
......@@ -40,14 +40,14 @@ public class JeclipperTest {
private static Logger log = Logger.getLogger(JeclipperTest.class);
//@After
@After
public void cleanUpResultFiles(){
try {
File f1 = new File(JeclipperTest.class.getResource("file_1_sequence.txt").toURI());
File outdir = f1.getParentFile();
String [] fnames = new String []{
"file_1_sequence_out.txt", "file_2_sequence_out.txt"
"out_1.txt", "out_2.txt"
};
for (String fname : fnames) {
......@@ -80,14 +80,12 @@ public class JeclipperTest {
File outdir = f1.getParentFile();
String[] argv = new String[] {
"F1="+f1.getAbsolutePath(),
"F2="+f2.getAbsolutePath(),