* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
packageorg.embl.gbcs.je;
importjava.util.ArrayList;
importjava.util.HashMap;
importjava.util.List;
importjava.util.Map;
importjava.util.regex.Matcher;
importjava.util.regex.Pattern;
importorg.embl.cg.utilitytools.utils.StringUtil;
importorg.slf4j.Logger;
importorg.slf4j.LoggerFactory;
/**
* Describe the read layout i.e. indicates where barcode, UMI, fixed bases and sample sequences are located.
* Barcodes, UMIs and read sequence are specified using a "<BLOCKCODE:length>" while fixed bases are depicted using ATCGN directly.
* The following BLOCKCODE are supported :
* <ul>
* <li>BARCODE, expects a fixed length e.g. <BARCODEn:6> ; where 'n' is an optional number to uniquely identify this barcode slot </li>
* <li>UMI, expects a fixed length e.g. <UMIn:8> ; where 'n' is an optional number to uniquely identify this umi slot</li>
* <li>SAMPLE, expects a fixed length or "x" e.g. <SAMPLEn:30> or <SAMPLE:x> ; where 'n' is an optional number to uniquely identify this sample sequence slot
* When no length ('x') is specified, all the sequence till the end is considered.
* When a length is provided, the exact sequence length is considered (any extra bases are discarded).
* Negative values are also supported to indicate all but the last x bases ; this is only accepted when the SAMPLE block is the last one
* </li>
* </ul>
*
* <br/>
* <h2>Note on the optional 'n' in blocks.</h2>
* When multiple Barcode slots are described (potentially across different read layout), Je needs to understand how to use these barcodes.
*
* Let's talk examples : <br />
*
* In standard PE protocol, the barcode is found at the beginning of both reads and is identical between read1 and read2 (REDUNDANT_BARCODES=True).
* In such a situation, you can simply give READ_LAYOUT=1:<BARCODE:6><SAMPLE:x> and READ_LAYOUT=2:<BARCODE:6><SAMPLE:x> which is
* equivalent to READ_LAYOUT=1:<BARCODE1:6><SAMPLE:x> and READ_LAYOUT=2:<BARCODE1:6><SAMPLE:x> or simply READ_LAYOUT=<BARCODE:6><SAMPLE:x>.
* <br/>
* Note the "1:", "2:", ... "i:" number preceding the read layout. This is used to match the read layout to the correct FASTQ file (passed like FASTQ=1:fastq_1.txt.gz, FASTQ=2:fastq_2.txt.gz, ...)
* An alternative is to provide the read layout with the FASTQ i.e. FASTQ=1:<BARCODE1:6><SAMPLE:x>:fastq_1.txt.gz
* <br/>
* When this is not the case and the barcodes are different, all barcodes are then used to look up sample name and the barcode file must
* have BARCODEn headers i.e. matching the number of defined BARCODE ; for example if 2 barcode blocks were defined like
* <br/>
* READ_LAYOUT=1:<BARCODE1:6><SAMPLE:x> and READ_LAYOUT=2:<BARCODE2:6><SAMPLE:x>
* <br/>
* The barcode file format is expected to be like :
* <br />
* SAMPLE BARCODE1 BARCODE2
* <br />
* sample1 ATGCGC TTCGAA
* sample2 GCGCTA AACTGA
* ...
*
* <br/>
* Important:
* <ul>
* <li> The sample sequence must always come after UMI and/or Barcode blocks.</li>
* <li>The Layout accepts multiple blocks for Barcode and UMI but not for the sample sequence </li>
* </ul>
* <br/>
*
* Example of read layout :
* <ul>
* <li>
* NN<BARCODE:6>N<UMI:8>A<SAMPLE:x> would instruct Je to ignore the first two bases, use the next 6 bases to match the sample barcode,
* ignore a base (N), use the next 8 bases as a UMI, ignore the next A and use the remainder of the sequence as the sample sequence
* </li>
* </ul>
*
* Note that in a paired-end setup, two read layouts should be assembled when the read structures are different ; otherwise a unique layout is sufficient
log.debug(LABEL_FOR_SAMPLE+" of length "+sampleSequenceLength+" starts at "+sampleSequenceStart);
}
//final check : the sample block must be the last one for layout with a sample block
if(hasSampleBlock){
for(ints:umiStart){
if(s>sampleSequenceStart){
thrownewReadLayoutMalformedException("The "+BLOCKCODE_SAMPLE+" block is not the last 3' block of your layout while it must : at least one "+BLOCKCODE_UMI+" block is found after",layout);
}
}
for(ints:bcStart){
if(s>sampleSequenceStart){
thrownewReadLayoutMalformedException("The "+BLOCKCODE_SAMPLE+" block is not the last 3' block of your layout while it must : at least one "+BLOCKCODE_BARCODE+" block is found after",layout);
thrownewReadLayoutMalformedException("Malformed read layout : block for "+blockName+" could not be found ! ",layout);
}
Integerl=null;
Stringtoken=m.group(2);
try{
l=Integer.parseInt(token);
}catch(NumberFormatExceptione){
// if it is 'x' it is fine
if(!token.equalsIgnoreCase("x")){
//then it is a format error
thrownewReadLayoutMalformedException("Malformed read block : length in block for "+blockName+" should be specified with a valid number or 'x'! ",layout);
}
}
returnl;
}
/**
* @param pat the pattern
* @param blockName for error reporting only, use a user meaningful name here
* @return the number found next to the block name i.e. 2 from <UMI2:6>; or 1 if block has no number
* @throws ReadLayoutMalformedException if the pattern cannot be matched or is invalid
thrownewReadLayoutMalformedException("Malformed read layout : block for "+blockName+" could not be found ! ",layout);
}
Integerl=null;
Stringtoken=m.group(1);
if(token==null||token.isEmpty())
return1;
try{
l=Integer.parseInt(token);
}catch(NumberFormatExceptione){
//then it is a format error
thrownewReadLayoutMalformedException("Malformed read block : block number for "+blockName+" should be specified with a valid number or absent ",layout);
}
returnl;
}
/**
* @return true if this layout has a UMI block
*/
publicbooleancontainsUMI(){
returnhasUMIBlock;
}
/**
* @return true if this layout has a Barcode block
*/
publicbooleancontainsBarcode(){
returnhasBarcodeBlock;
}
/**
* @return true if this layout has a Sample block
*/
publicbooleancontainsSampleSequence(){
returnhasSampleBlock;
}
/**
* @return the number of UMI block number found in this layout
*/
publicintumiBlockNumber(){
if(!hasUMIBlock)
return0;
returnumiStart.size();
}
/**
* @return the number of BARCODE block number found in this layout
*/
publicintbarcodeBlockNumber(){
if(!hasBarcodeBlock)
return0;
returnbcStart.size();
}
/**
* Extract the subsequence(s) corresponding to the UMI blocks in the read layout
*
* @param read the whole read or quality string
* @return the sequences corresponding to the UMI or null if this layout has no UMI block
*/
publicString[]extractUMIs(Stringread){
if(!hasUMIBlock)
returnnull;
String[]umis=newString[umiStart.size()];
for(inti=0;i<umiStart.size();i++){
intstart=umiStart.get(i);
intlen=umiLength.get(i);
umis[i]=read.substring(start,start+len);
}
returnumis;
}
/**
* Extract the subsequence(s) corresponding to the UMI blocks in the read layout
* and merge them in a unique String (following the 5' to 3' order on the layout)
*
* @param read the whole read or quality string
* @return the sequences corresponding to the UMI or null if this layout has no UMI block