Commit c099f62d authored by Charles Girardot's avatar Charles Girardot

initial recommit in gbcs repos

parents
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse build path: four source folders, two classpath containers and the default output folder. -->
<classpath>
  <!-- Main Java sources, compiled into target/classes. -->
  <classpathentry kind="src" output="target/classes" path="src/main/java">
    <attributes>
      <attribute name="optional" value="true"/>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Main resources; the "**" exclusion pattern keeps every file of this folder out of the Eclipse build. -->
  <classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Test Java sources, compiled into target/test-classes. -->
  <classpathentry kind="src" output="target/test-classes" path="src/test/java">
    <attributes>
      <attribute name="optional" value="true"/>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Test resources, excluded from the Eclipse build in the same way as the main resources. -->
  <classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Classpath container supplied by m2e (Maven dependencies). -->
  <classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- JRE container of the workspace. -->
  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Default output folder for entries that do not declare their own. -->
  <classpathentry kind="output" path="target/classes"/>
</classpath>
*~
.DS_Store
bin
.metadata
target/
._*
.settings/
.project
Je
--
The Je tool suite
=================
Contains
++++++++
Je currently offers 4 tools :
**je clip**
to remove UMIs contained in reads of fastq files that do not need sample demultiplexing
**je demultiplex**
to demultiplex multi-sample fastq files whose reads contain barcodes and UMIs (or not)
**je demultiplex-illu**
to demultiplex fastq files according to associated index files (contain the sample encoding barcodes).
Reads can additionally contain UMIs (inline)
**je markdupes**
to filter BAM files for read duplicates taking UMIs into account
Source
++++++
src/shell/je
is the wrapper script to call ``java -jar je_1.0_bundle.jar``
src/galaxy/
contains the Je wrappers for Galaxy
src/test/
holds the different test data
# props loaded when building embl profile
log.level=INFO
db.url=jdbc:mysql://gbcs.embl.de/base
db.user=baseread
db.pwd=PWD_GOES_HERE
admin.email=gbcs@embl.de
\ No newline at end of file
# props loaded when building public profile
log.level=DEBUG
db.url=jdbc:mysql://localhost/base
db.user=baseread
db.pwd=
admin.email=je@embl.de
\ No newline at end of file
# props loaded when building test profile
log.level=DEBUG
db.url=jdbc:mysql://gbcs-dev.embl.de/base
db.user=baseread
db.pwd=PWD_GOES_HERE
admin.email=YOUR_EMAIL_HERE
\ No newline at end of file
# The jars found in this folder are artifacts that are not available from Maven Central.
# You can push them into your local Maven repository with the commands below.
# ADAPT LIBPATH so that it points to the custom-picard folder of your own Je/lib checkout.
LIBPATH="/Users/girardot/Work/eclipse_ws/Je/lib/custom-picard"
cd ~/.m2
# The -Dfile values are quoted so that a LIBPATH containing spaces is passed to mvn as a single argument.
mvn install:install-file -DgroupId=net.sf -DartifactId=htsjdk -Dversion=1.140custom -Dfile="$LIBPATH/htsjdk-1.140.jar" -Dpackaging=jar -DgeneratePom=true
mvn install:install-file -DgroupId=net.sf -DartifactId=picard -Dversion=1.140custom -Dfile="$LIBPATH/picard.jar" -Dpackaging=jar -DgeneratePom=true
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>Je</groupId>
<artifactId>Je</artifactId>
<version>1.0</version>
<name>Je</name>
<description>Je provides command line utilities to deal with barcoded FASTQ files with or without Unique Molecular Index (UMI)</description>
<licenses>
<license>
<name>GPLv3</name>
<url><![CDATA[http://www.gnu.org/copyleft/gpl.html]]></url>
</license>
</licenses>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<log.level>INFO</log.level>
</properties>
<!-- Building is a little tricky because of the custom picard. Each time you
modify picard (meaning you have a git clone of both picard and htsjdk), you
need to rebuild picard with ant, i.e. 'ant build'. This creates jars in the
picard/dist folder; the 2 important ones are: (1) picard.jar and (2) htsjdk_lib_dir/htsjdk-1.140.jar.
You then need to add these to your local mvn repo so that they are found
by maven; this is performed by src/shell/install-picard-deps-in-mvn-repos_from-git.sh
(you need to adapt the path in there first!). The script also takes care of replacing
the jars in Je/lib to make sure we always distribute with the right custom
picard jars. You can now build Je! -->
<build>
  <resources>
    <!-- Every .java file under src/main/java is declared as a resource. -->
    <resource>
      <directory>${basedir}/src/main/java</directory>
      <includes>
        <include>**/*.java</include>
      </includes>
    </resource>
    <!-- Filtered resources: needed so that properties can be overwritten
         dynamically at build time, e.g. in log4j.xml. -->
    <resource>
      <directory>src/main/resources</directory>
      <includes>
        <include>**/*.*xml</include>
        <include>**/*.*properties</include>
      </includes>
      <filtering>true</filtering>
    </resource>
  </resources>
  <plugins>
    <!-- Assembly plugin: embeds all dependencies (jars) in the final jar. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-assembly-plugin</artifactId>
      <configuration>
        <archive>
          <manifest>
            <mainClass>org.embl.gbcs.je.Je</mainClass>
          </manifest>
        </archive>
        <descriptorRefs>
          <descriptorRef>jar-with-dependencies</descriptorRef>
        </descriptorRefs>
      </configuration>
      <!-- The 'single' goal is bound to the package phase, which therefore
           produces 2 jars: the 'usual' one and one with all dependencies bundled. -->
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>
<profiles>
  <!-- Each profile below does two things: it reads one properties file from config/
       during the initialize phase, and it sets the name of the bundled jar. -->
  <profile>
    <!-- 'test': test jar for testing against the embase dev db. -->
    <id>test</id>
    <build>
      <plugins>
        <plugin>
          <groupId>org.codehaus.mojo</groupId>
          <artifactId>properties-maven-plugin</artifactId>
          <version>1.0.0</version>
          <executions>
            <execution>
              <phase>initialize</phase>
              <goals>
                <goal>read-project-properties</goal>
              </goals>
              <configuration>
                <files>
                  <file>config/test.properties</file>
                </files>
              </configuration>
            </execution>
          </executions>
        </plugin>
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-assembly-plugin</artifactId>
          <configuration>
            <finalName>je_bundle-TEST</finalName>
            <appendAssemblyId>false</appendAssemblyId>
            <archive>
              <manifestEntries>
                <Implementation-Version>${project.version}</Implementation-Version>
              </manifestEntries>
            </archive>
          </configuration>
        </plugin>
      </plugins>
    </build>
  </profile>
  <profile>
    <!-- 'embl': production jar for EMBL. -->
    <id>embl</id>
    <build>
      <plugins>
        <plugin>
          <groupId>org.codehaus.mojo</groupId>
          <artifactId>properties-maven-plugin</artifactId>
          <version>1.0.0</version>
          <executions>
            <execution>
              <phase>initialize</phase>
              <goals>
                <goal>read-project-properties</goal>
              </goals>
              <configuration>
                <files>
                  <file>config/embl.properties</file>
                </files>
              </configuration>
            </execution>
          </executions>
        </plugin>
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-assembly-plugin</artifactId>
          <configuration>
            <finalName>je_embl_${project.version}_bundle</finalName>
            <appendAssemblyId>false</appendAssemblyId>
            <archive>
              <manifestEntries>
                <Implementation-Version>${project.version}</Implementation-Version>
              </manifestEntries>
            </archive>
          </configuration>
        </plugin>
      </plugins>
    </build>
  </profile>
  <profile>
    <!-- 'public': production jar for public use. -->
    <id>public</id>
    <build>
      <plugins>
        <plugin>
          <groupId>org.codehaus.mojo</groupId>
          <artifactId>properties-maven-plugin</artifactId>
          <version>1.0.0</version>
          <executions>
            <execution>
              <phase>initialize</phase>
              <goals>
                <goal>read-project-properties</goal>
              </goals>
              <configuration>
                <files>
                  <file>config/public.properties</file>
                </files>
              </configuration>
            </execution>
          </executions>
        </plugin>
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-assembly-plugin</artifactId>
          <configuration>
            <finalName>je_${project.version}_bundle</finalName>
            <appendAssemblyId>false</appendAssemblyId>
            <archive>
              <manifestEntries>
                <Implementation-Version>${project.version}</Implementation-Version>
              </manifestEntries>
            </archive>
          </configuration>
        </plugin>
      </plugins>
    </build>
  </profile>
</profiles>
<dependencies>
  <!-- picard and htsjdk are declared with the '1.140custom' version; the
       plain net.sf '1.140' coordinates for both artifacts are deliberately
       left out (they used to be listed here, commented out). -->
  <dependency>
    <groupId>net.sf</groupId>
    <artifactId>picard</artifactId>
    <version>1.140custom</version>
  </dependency>
  <dependency>
    <groupId>net.sf</groupId>
    <artifactId>htsjdk</artifactId>
    <version>1.140custom</version>
  </dependency>
  <!-- Logging: slf4j API with its log4j 1.2 binding. -->
  <dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.7</version>
  </dependency>
  <dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.7</version>
  </dependency>
  <!-- MySQL JDBC driver. -->
  <dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.25</version>
  </dependency>
  <!-- org.embl artifacts. -->
  <dependency>
    <groupId>org.embl.gbcs.embase</groupId>
    <artifactId>jembase_api</artifactId>
    <version>1.0</version>
  </dependency>
  <dependency>
    <groupId>org.embl.cg.utilitytools</groupId>
    <artifactId>ut_utils</artifactId>
    <version>0.1-SNAPSHOT</version>
  </dependency>
  <dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.3.2</version>
  </dependency>
  <!-- Test scope only. -->
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.11</version>
    <scope>test</scope>
  </dependency>
</dependencies>
</project>
#!/bin/bash
# Wrapper around je_1.0_bundle.jar: runs the jar located in the same directory
# as this script and forwards all command-line arguments to it.
#
# bash is required: ${BASH_SOURCE[0]} used below is a bash-only construct and
# is rejected ("Bad substitution") by a strict POSIX sh.

# Absolute path of the directory holding this script.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
#echo $DIR
# Path to the jar file to execute.
JAR_FILE="$DIR/je_1.0_bundle.jar"
# JVM options: maximum heap size of 8 GB.
OPTS="-Xmx8g"
# $OPTS stays unquoted so that several space-separated options are passed as
# separate arguments; the jar path is quoted so that an install directory
# containing spaces still works.
java $OPTS -jar "$JAR_FILE" "$@"
# Propagate the exit status of java to the caller.
exit $?
<tool id="je-clip" name="Je-Clip" version="1.0">
<description>clips Unique Molecular Identifiers (UMIs) from fastq files</description>
<macros>
<import>macros.xml</import>
</macros>
<stdio>
<exit_code range="1:" level="fatal" description="Tool exception" />
</stdio>
<version_command>echo '1.0'</version_command>
<!-- Command template for 'je clip'.
BPOS and OF2 are emitted only when library.type is not "single";
BARCODE_RESULT_FILENAME is emitted only when ADD is "false"; FORCE=true is always passed.
The @...@ tokens are presumably expanded from macros.xml (TODO confirm).
This comment is kept outside the command element on purpose, so the command text itself is untouched. -->
<command interpreter="bash">
<![CDATA[
je clip
## Fastq inputs
@single_or_paired_cmd@
#if str( $library.type ) != "single":
BPOS=${library.BPOS}
#end if
@common_options_cmd@
@barcode_len_cmd@
ADD=${ADD}
#if str($ADD) == "false":
BARCODE_RESULT_FILENAME=$BARCODE_RESULT_FILENAME
#end if
OF1=${OF1}
#if str( $library.type ) != "single":
OF2=${OF2}
#end if
FORCE=true
]]>
</command>
<inputs>
    <!-- Single/paired input selection comes from a macro; the BPOS select is passed to it as nested content. -->
    <expand macro="single_or_paired_general">
        <param name="BPOS" type="select" label="Barcode read position (BPOS)" help="where are the barcodes.">
            <option value="READ_1" selected="true">READ_1 (beginning of read from the first fastq file)</option>
            <option value="READ_2">READ_2 (beginning of read from the second fastq file)</option>
            <option value="BOTH">BOTH (beginning of both reads)</option>
        </param>
    </expand>
    <expand macro="barcode_len_option"/>
    <!-- Checked by default; rendered as the literal strings "true" / "false". -->
    <param name="ADD"
           type="boolean"
           label="Add matched barcode at the end of the read header (ADD)"
           truevalue="true"
           falsevalue="false"
           checked="true"/>
    <expand macro="common_options"/>
</inputs>
<outputs>
    <!-- Tabular file of clipped barcodes.
         NOTE(review): the command only passes BARCODE_RESULT_FILENAME when ADD is "false",
         so this dataset looks like it is left empty otherwise; consider adding a filter. -->
    <data name="BARCODE_RESULT_FILENAME" format="tabular" label="Je-Clipped Barcodes"/>
    <!-- Labels use ${on_string} with the leading '$' so that Galaxy substitutes the
         input description instead of printing the placeholder text literally. -->
    <data name="OF1" format_source="input_1" label="Je-Clipped ${on_string}"/>
    <!-- Output for the second read; filtered out when the library type is "single". -->
    <data name="OF2" format_source="input_1" label="Je-Clipped ${on_string}">
        <filter>(type != "single")</filter>
    </data>
</outputs>
<tests>
    <!-- Test 1: single-end data, ADD=false; checks the barcode file and the clipped reads. -->
    <test>
        <param name="type" value="single"/>
        <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
        <param name="LEN" value="6"/>
        <param name="ADD" value="false"/>
        <output name="BARCODE_RESULT_FILENAME" file="clip_barcode_result_file.txt"/>
        <output name="OF1" file="clip_dataset1_SE.fastq"/>
    </test>
    <!-- Test 2: paired-end data with BPOS=BOTH, i.e. a barcode on both the forward and the reverse read. -->
    <test>
        <param name="type" value="paired"/>
        <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
        <param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/>
        <param name="LEN" value="6"/>
        <param name="BPOS" value="BOTH"/>
        <output name="OF1" file="clip_dataset1_PE.fastq"/>
        <output name="OF2" file="clip_dataset2_PE.fastq"/>
    </test>
</tests>
<help>
<![CDATA[
**What it does**
Je clip: Clips barcodes or Unique Molecular Identifiers (UMIs) from the input fastq files
Input files are fastq files, and can be in gzip compressed format (end in .gz).
Author: Charles Girardot (charles.girardot@embl.de).
Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
------
**Know what you are doing**
.. class:: warningmark
You will want to read the `documentation`__.
.. __: http://gbcs.embl.de/portal/Je
------
**Parameter list**
This is an exhaustive list of options::
FASTQ_FILE1=File
F1=File
Input fastq file (optionally gzipped) for single end data, or first read in paired end data.
Required.
FASTQ_FILE2=File
F2=File
Input fastq file (optionally gzipped) for the second read of paired end data.
Default value: null.
BCLEN=String
LEN=String
Length of the barcode sequences. When BARCODE_READ_POS == BOTH, two distinct lengths can
be provided using the syntax LEN=X:Z where X and Z are 2 integers representing the
barcode length for read_1 and read_2 respectively.
Required.
BARCODE_READ_POS=BarcodePosition
BPOS=BarcodePosition
Reads containing the sequence (i.e. UMIs) to clip:
READ_1 (beginning of read from FASTQ_FILE_1),
READ_2 (beginning of read from FASTQ_FILE_2),
BOTH (beginning of both reads).
Automatically set to READ_1 in single end mode and BOTH in paired end mode. Actually not
relevant for single end data
Default value: BOTH. This option can be set to 'null' to clear the default value.
Possible values: {READ_1, READ_2, BOTH, NONE}
ADD_BARCODE_TO_HEADER=Boolean
ADD=Boolean
Should clipped UMIs be added to the read header (at the end); apply to both barcodes when
BPOS=BOTH.
If ADD=true, the string ':barcode' is added at the end of the read header with a ':'
added only if current read header does not end with ':'.
If both reads of the pair contains a UMI (i.e. BARCODE_READ_POS == BOTH), the UMI from
the second read is also added to the read header.
Else, the header of the read without UMI receives the UMI from the other read.
For example:
@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
becomes
@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:BARCODE
Default value: true. This option can be set to 'null' to clear the default value.
Possible values: {true, false}
ENSURE_IDENTICAL_HEADER_NAMES=Boolean
SAME_HEADERS=Boolean
Makes sure headers of both reads of a pair are identical.
Read name (or headers) will follow the pattern (for both reads of a pair):
@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2
This option only makes sense in paired end mode and ADD=true. Some (if not all) mappers
will indeed complain when read headers of a read pair are not identical.
When SAME_HEADERS=FALSE and the RCHAR is used, read headers look like this:
HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TGGAGTAG
HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:CGTTGTAT
SAME_HEADERS=true will instead generate the following identical header for both reads:
HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TGGAGTAG:CGTTGTAT
Note that we also clipped the useless '1:N:0' and '3:N:0' as they also result in
different headers
Important : this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which case
a space will be preserved i.e.:
HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT
Default value: true.
This option can be set to 'null' to clear the default value. Possible values: {true,
false}
READ_NAME_REPLACE_CHAR=String
RCHAR=String
Replace spaces in read name/header using provided character.
This is needed when you need to retain ADDed barcode in read name/header during mapping
as everything after space in read name is usually clipped in BAM files.
For example, with RCHAR=':':
@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 1:N:0:
becomes
@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:1:N:0:BARCODE
Default value: ':'. This option can be set to 'null' to clear the default value.
XTRIMLEN=String
XT=String
Optional extra number of base(s) to be trimmed right after the barcode. These extra bases
are not added to read headers.
When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
BPOS=READ_1 or BPOS=READ_2, a X:Y syntax can be given to trim the read w/o barcode to
end up with reads of identical length (note that this can also be operated using ZT). If
a unique value is given, e.g. XT=1, while running paired-end the following rule applies :
(1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
(2) BPOS=BOTH, the value is used for both reads.
Note that XT=null is like XT=0.
Default value: 0. This option can be set to 'null' to clear the default value.
ZTRIMLEN=String
ZT=String
Optional extra number of bases to be trimmed from the read end i.e. 3' end. These extra
bases are not added to read headers.
When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
when BPOS=READ_1 or BPOS=READ_2, a X:Y syntax can be given to trim the read w/o barcode
as to end up with reads of the same length (note that this can also be operated using
XT). Note that if a single value is passed, the value always applies to both reads in
paired-end mode without further consideration.
Default value: 0. This option can be set to 'null' to clear the default value.
BARCODE_RESULT_FILENAME=String
BF=String
Optional file name where to write clipped barcodes, default name is clipped_barcodes.txt.
This file is automatically created if ADD=FALSE i.e. even if this option is not provided
by user (and always created if this option is given).
File format is tab delimited with:
``read header (col 1) barcode from read_1 (col 2) barcode quality from read_1 (col 3)``
+ barcode + quality from read_2 (col 4 and 5 respectively) when relevant.
Can either be a name (in which case the file will be created in the output dir) or a full path.
Default value: null.
]]>
</help>
</tool>
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
../../bin/je_1.0_bundle.jar
\ No newline at end of file
This diff is collapsed.
sample1 CACTGT:GTATAG sample1_1_PE.txt sample1_2_PE.txt
sample2 ATTCCG:TCCGTC sample2_1_PE.txt sample2_2_PE.txt
sample3 GCTACC:TGGTCA sample3_1_PE.txt sample3_2_PE.txt
sample4 CGAAAC:CACTGT sample4_1_PE.txt sample4_2_PE.txt
\ No newline at end of file
sample1 CACTGT sample1_SE.txt
sample2 ATTCCG sample2_SE.txt
sample3 GCTACC sample3_SE.txt
sample4 CGAAAC sample4_SE.txt
\ No newline at end of file
READ_HEADER BC_SEQ_READ1 BC_QUAL_READ1
1:N:0: CACTGT CCCFFF
1:N:0: CACTGT BBCFFF
1:N:0: CACTGT CCCFFF
1:N:0: CACTGT =?@D;D
1:N:0: CACTGT BB=DDD