Commit feaeedd1 authored by Robin Erich Muench's avatar Robin Erich Muench
Browse files

Merge branch 'master' of git.embl.de:rmuench/metaSNP

parents 470245a0 2e7c741e
#!/bin/bash
#########################################
# metaSNP v1.1 - Initiate New Project #
#########################################
#
# Helper script to initiate a new project file structure with a 'uniq_project_name'
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Code is (c) Copyright ???
# This code is released under LICENSE
# Variables
# directory=results.$1
PROJECTPATH=$1
directory=$PROJECTPATH
#printf '%s\n' "${PROJECTPATH##*/}"
# Usage Message
display_usage(){
echo -e "\nUsage: $0 /path/2/project_dir\n"
}
# print error message if no PROJECTNAME argument is supplied
if [ $# -eq 0 ]
then
display_usage
exit 1
fi
# Check if the Project Directory already exists.
if [ ! -d "$directory" ]; then
# If $DIRECTORY doesn't exist.
echo -e "\nProject name is available. New project directory will be created here: $PROJECTPATH"
mkdir -p $directory/{cov,bestsplits,snpCaller,filtered/{pop,ind},distances}
else
echo -e "\nCAUTION: The directory '$PROJECTPATH' already exists.\n\t Use a uniq project name or delet the existing directory.\n"
fi
......@@ -18,14 +18,10 @@ or [download](https://git.embl.de/rmuench/metaSNP/repository/archive.zip?ref=mas
Dependencies
============
* g++/c++
* Boost-1.53.0 or above
* samtools-1.19 or above
* htslib-1.0 or above (check samtools compatibility)
* Python-2.7 or above
......@@ -170,7 +166,7 @@ Caution: Perform this step seperately for individual SNPs and population SNPs.
within samples_of_interest (default: 0.5)
### b) Analysis:
> TODO: include R scripts for pairwise distance and subspecies computation
> TODO: include R scripts for computing pairwise distances and visualization
Example Tutorial
......@@ -217,6 +213,7 @@ Example Tutorial
TODO!
Basic usage (tools and scripts)
===============================
If you are interested in using the pipeline in a more manual way (for example the metaSNP caller stand alone) feel free to explore the src/ directory.
......
#! /bin/bash
############################################
# metaSNP Step III: `Post-Processing #
############################################
#
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
#
# Helper script for metaSNP distance computation
display_usage() {
echo >&2 ""
echo >&2 " Usage: $0 input_dir/ output/"
echo >&2 ""
echo >&2 "Parameters:"
echo >&2 " Required:"
echo >&2 " input_dir/ DIR directory with the filtered frequency tables (metaSNP_filtering.py)."
echo >&2 " output_dir/ DIR write output to DIR."
echo >&2 ""
echo >&2 "Note: Expecting 'metaSNP_filtering.py' to be completed."
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required parameter"
display_usage
exit 1
}
dir_missing() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
IN_DIR="$1"
OUT_DIR="$2"
## Test
[ -n "$IN_DIR" ] || required_parameter "input_dir/"
[ -n "$OUT_DIR" ] || required_parameter "output_dir/"
[ -d "$IN_DIR" ] || dir_missing "$IN_DIR"
[ -d "$OUT_DIR" ] || dir_missing "$OUT_DIR"
## Commandlines:
for i in $(ls $IN_DIR"/"*.freq); do echo "/g/bork3/home/costea/R-3.0.2/bin/R --no-save --file=$DIR/src/computeDistances.R --args $i $OUT_DIR"; done
exit
\ No newline at end of file
###############################################
# metaSNP : Step III `Post-processing` #
###############################################
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
#
# This script computes SNP profile distances between sample pairs.
# - distances are computed over the SNP space.
# - default distance measure: manhattan distance.
# - PCoA visualization to gain a quick overview of the SNP space.
## INVOCATION (manually):
#
# Example:
# $ R --no-save --file=/path/2/src/computeDistances.R --args infile.freq outfolder/
#
# Loop (all):
# for i in $(ls project_dir/filtered/pop/*.freq); \
# do echo R --no-save --file=../src/subspeciesClustering.R \
# --args $i project_dir/distances/ \
# ;done
#############
## Functions
majorAllel = function(d) {
apply(d,1,function(x) {
x <- x[x!=-1]
x[x <50] <- 0
x[x >=50] <- 1
return(median(x))
})
}
rowMedians = function(d) {
apply(d,1,function(x) median(x[x!=-1]))
}
rMeans = function(d) {
apply(d,1,function(x) mean(x[x!=-1]))
}
getPnPs = function(s,d) {
s <- as.character(s$V2)
s <- substr(s,1,1)
m <- apply(d,2,function(x) {
a <- s[which(x > 50)]
return(sum(a=='N')/sum(a=='S'))
})
return(m)
}
#############
## libraries
library(ape) #pcoa
library(ggplot2)
library(gridExtra)
#############
#############
## Arguments
#argv <- c("/g/bork3/home/costea/R-3.0.2/bin/R", "--no-save", "--file=src/computeDistances.R", "--args", "tutorial/filtered/pop/420247.filtered.freq", "tutorial/distances/")
argv <- commandArgs(trailingOnly = F) # All arguments including --file
scriptPath <- dirname(sub("--file=","",argv[grep("--file",argv)])) ##sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
dist_location <- paste(scriptPath,'/distances/',sep='')
####
# Source required packages
source(paste(dist_location,'Strain_functions.R',sep=''))
args <- commandArgs(trailingOnly = TRUE) #--args
print(args)
file <- args[1]
outf <- args[2]
print(file)
print(outf)
data <- read.table(file,header=T,row.names=1,check.names=F)
species <- strsplit(sapply(strsplit(file,"\\/"),tail,1),'\\.')[[1]][1]
mantel <- NULL
print(dim(data))
colnames(data)
#### MetaSNP First (data)
#Remove mostly constant positions
freq_means <- apply(data,1,function(x) {
a <- which(x==-1)
if (length(a) > 0) {
return(mean(x[-a]))
} else {
return(mean(x))
}
})
#Remove mostly constant positions
r <- c(which( freq_means > 0.95),which( freq_means < 0.05))
if (length(r) > 0) {
data <- data[-r,]
}
## Remove low resolution Samples:
lowRes <- which(colSums(data==-1)>(nrow(data)*0.5))
if (length(lowRes) > 0) {
#Remove samples with more than half -1's
print('Removing samples because they do not have enough resolution')
print(length(lowRes))
#data <- data[,-lowRes]
}
# Compute allele frequency distributional properties
freq_data_sample_20_80 <- apply(data,2,function(x) {
x <- x[x!=-1]
sum(x < 0.20 | x > 0.80)/length(x)
})
freq_data_sample_10_90 <- apply(data,2,function(x) {
x <- x[x!=-1]
sum(x < 0.10 | x > 0.90)/length(x)
})
freq_data_sample_5_95 <- apply(data,2,function(x) {
x <- x[x!=-1]
sum(x < 0.5 | x > 0.95)/length(x)
})
freq_data_sample <- data.frame(freq_data_sample_20_80,freq_data_sample_10_90,freq_data_sample_5_95)
write.table(freq_data_sample,paste(outf,species,'_freq_composition.tab',sep=''),sep='\t',quote=F, col.names = NA)
# Requirement:
# !! <freq_euc.so> !! #
#------COMPILATION--------#
# The compiling is trivial:
# R CMD COMPILE freq_euc.c
# R CMD SHLIB freq_euc.o
#------COMPILATION--------#
mann_dist <- mann.dist(data*100)
write.table(as.matrix(mann_dist),paste(outf,species,'_mann_distance_matrix.tab',sep=''),sep='\t',quote=F, col.names = NA)
allele_dist <- allel.dist(data*100)
write.table(as.matrix(allele_dist),paste(outf,species,'_allele_distance_matrix.tab',sep=''),sep='\t',quote=F, col.names = NA)
############################################################################
#### Vizualize the SNP space #####
## PCoA Plots for each Species
#Get PCOA projection
pca <- pcoa(mann_dist)
eig <- pca[["values"]][["Eigenvalues"]]
eig[eig<0] <- 0
eig_m <- eig/sum(eig)*100
pcoa_df <- data.frame(pca[["vectors"]][,1:3])
pcoa_df$f <- freq_data_sample[rownames(pcoa_df),]$freq_data_sample_10_90
#pcoa_df$samples <- df[rownames(pcoa_df),]
write.table(pcoa_df,paste(outf,species,'_mann_pcoa_proj.tab',sep=''),sep='\t',quote=F, col.names=NA)
## Vizualize data as 2D PCoA
pdf(paste(outf,species,'_mann_PCoA.pdf',sep=''))
grid.arrange(ggplot(pcoa_df,aes(x=Axis.1,y=Axis.2)) + geom_point() + ggtitle(paste("Species:",species))
+ xlab(sprintf("PCoA1: %3.2f%%",eig_m[1])) + ylab(sprintf("PCoA2: %3.2f%%",eig_m[2])) )
dev.off()
\ No newline at end of file
#!/bin/bash
download_9() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f9.tar.bz2
echo "This might take up to 10 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f9.tar.bz2
exit
}
download_11() {
[ -d "db" ] || mkdir -p db
cd db
echo "Downloading the archive."
wget vm-lux.embl.de/~rmuench/files/db_f11.tar.bz2
echo "This might take up to 20 minutes."
echo "Extracting files!"
echo "Patience please..."
tar xjvf db_f11.tar.bz2
exit
}
echo "You are about to download the metaSNP database"
echo ""
echo "Requirements:"
echo " - Freeze9: 12G available disc space (1753 genomes)"
echo " - Freeze11: 33G available disc space (5278 genomes)"
echo ""
echo "Please make a selection by entering the right number:"
select yn in "f9" "f11" "cancel"; do
case $yn in
f9 ) download_9 ; break;;
f11 ) download_11; break;;
cancel ) exit;;
esac
done
#!/bin/bash
display_usage() {
echo >&2 ""
echo >&2 " Usage: $(basename $0) project_dir/ all_samples"
echo >&2 ""
echo >&2 "Parameters:"
echo >&2 " Required:"
echo >&2 " project_dir/ DIR A metaSNP initialized project directory (metaSNP_NEW)"
echo >&2 " all_samples FILE Input list of bam files, one file per line."
echo >&2 ""
}
missing() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "ERROR: '$1' is not a metaSNP project. Subdirectory '$2' is missing."
echo >&2 ""
echo >&2 "Please initialize a new metaSNP project first!"
echo >&2 ""
echo >&2 " RUN: metaSNP_NEW projectName"
echo >&2 "or"
echo >&2 " RUN: mkdir $2"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# print error message if no argument is supplied
if [ $# -ne 2 ]
then
display_usage
exit 1
fi
# assign arguments to variables
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
all_samples="$2"
PROJECT_DIR="$1"
OUT_DIR="$PROJECT_DIR/cov"
# getopt to use -h flag
ARGS=$(getopt -o h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-h)
shift
display_usage
exit 1
;;
--)
shift
break
;;
esac
done
## Control
[ -d "$PROJECT_DIR" ] || missing "$PROJECT_DIR"
[ -f "$all_samples" ] || missing "$all_samples"
# Check required output directories
[ -d "$OUT_DIR" ] || make_dir "$PROJECT_DIR" "$OUT_DIR"
## Generate command lines
while read file
do
# name=$(echo $file | awk -F"/" '{split($NF,a,"."); print a[1]}')
# name=$(echo $file | awk -F"/" '{split($NF,a,"/"); print a[1]}')
name=$(basename $file)
echo $DIR/src/qaTools/qaCompute -c 10 -d -i $file $OUT_DIR/$name.cov
done < $all_samples
exit
#!/bin/bash
#########################################
# metaSNP v1.1 - Initiate New Project #
#########################################
#
# Helper script to initiate a new project file structure.
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Variables
PROJECT_NAME="$1"
# Usage Message
display_usage(){
echo >&2 ""
echo >&2 " Usage: $(basename $0) project_name"
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required parameter"
display_usage
exit 1
}
make_dir(){
echo >&2 ""
echo >&2 " Project name is available. New project directory created here: $PROJECT_NAME"
echo >&2 ""
mkdir -p $PROJECT_NAME/{cov,bestsplits,snpCaller,filtered/{pop,ind},distances}
}
file_exists(){
echo >&2 ""
echo >&2 " ERROR: The directory '$PROJECT_NAME' exists. Please choose a unique project name."
echo >&2 ""
}
# Check required parameters
[ -n "$PROJECT_NAME" ] || required_parameter "project_name"
# Make
if [ ! -d "$PROJECT_NAME" ]
then
make_dir
else
file_exists
fi
exit
#!/bin/bash
#########################################
# metaSNP Step II: `Genome Splitting` #
#########################################
#
# Helper script
# This code is part of the metagenomic SNP calling pipeline (metaSNP)
# Copyright (c) 2016 Robin Munch
# Licenced under the GNU General Public License (see LICENSE)
# Abort on any errors
set -e
# Variables
wd=`pwd`
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
arg1="$1"
arg2="$2"
arg3="$3"
PROJECT_DIR=$arg1
GENOME_DEF=$arg2
NUM_SPLITS=10
SPLIT_LIMIT=100
COV_FILES=$PROJECT_DIR"/cov"
date=$(date +%Y-%m-%d)
#Usage Messages
display_usage() {
echo >&2 ""
echo >&2 ""
echo >&2 "Usage: $(basename $0) project_dir/ genome_def nr_splits[int]"
echo >&2 ""
echo >&2 "Parameters"
echo >&2 " Required:"
echo >&2 " project_dir/ = the metaSNP project directory"
echo >&2 " genome_def FILE = Contig ranges in BED format. (Fields: Contig_id, contigStart, contigEnd)"
echo >&2 ""
echo >&2 " Optional:"
echo >&2 " nr_splits INT = INT for job parallelization (range: 1-100) [10]"
echo >&2 ""
echo >&2 "Note: Expecting 'metaSNP_COV' to be completed!"
echo >&2 ""
}
required_parameter() {
echo >&2 ""
echo >&2 "ERROR: '$1' is a required argument"
display_usage
exit 1
}
no_such_file() {
echo >&2 ""
echo >&2 "ERROR: '$1' no such file or directory"
display_usage
exit 1
}
split_limit() {
echo >&2 ""
echo >&2 "ERROR: nr_splits[$1] exceeds the limit of 100"
echo >&2 ""
display_usage
exit 1
}
missing_input_dir() {
echo >&2 ""
echo >&2 "ERROR: '$1' missing in the project directory '$2'"
echo >&2 "ERROR: '$1' contains the input files (metaSNP_COV output)"
echo >&2 "SUGGESTION: Double check if '$2' is the intended project directory, or"
# echo >&2 "SUGGESTION: At this point the directory should have the coverage files in '$1'"
echo >&2 "SOLUTION: Initialize a new project (metaSNP_NEW) and rerun the coverage estimation (metaSNP_COV)."
echo >&2 ""
exit 1
}
rerun_cov() {
echo >&2 ""
echo >&2 "ERROR: Requires coverage estimations. Run 'metaSNP_COV' first"
echo >&2 ""
exit 1
}
make_dir() {
echo >&2 ""
echo >&2 "WARNING '$1' is not an initialized metaSNP project. Subdirectory '$(basename $2)' (output DIR) is missing."
echo >&2 "WARNING: If you used '$1' for the coverage estimation (metaSNP_COV) RUN:"
echo >&2 ""
echo >&2 " mkdir $2"
echo >&2 ""
echo >&2 "SUGGESTION: Double check if '$1' has the correct cov/ folder!"
echo >&2 ""
# display_usage
# mkdir $OUT_DIR
exit 1
}
# getopt to use -h flag
ARGS=$(getopt -o h -n "$0" -- "$@")
# reorganize arguments as returned by getopt
eval set -- "$ARGS"
while true; do
case "$1" in
# Shift before to throw away option
# Shift after if option has a required positional argument
-h)
shift
display_usage
exit 1
;;
--)
shift
break
;;
esac
done
## Test
# Required parameters:
[ -z "$arg1" ] && required_parameter "project_dir"