Skip to content
Snippets Groups Projects
Commit ce6a741e authored by Nikolaos Papadopoulos's avatar Nikolaos Papadopoulos
Browse files

installed colabfold dependencies, trying to run

parent 5044660b
No related branches found
No related tags found
No related merge requests found
......@@ -3,66 +3,15 @@
#SBATCH -t 10:00:00
#SBATCH -c 128
#SBATCH --mem=200G
#SBATCH -o /g/arendt/npapadop/cluster/align_%j.out
#SBATCH -e /g/arendt/npapadop/cluster/align_%j.err
#SBATCH -o /g/arendt/npapadop/cluster/align.out
#SBATCH -e /g/arendt/npapadop/cluster/align.err
# a heavily modified version of the colabfold_search.sh script from https://github.com/sokrypton/ColabFold/
# briefly:
# - we replace the MMseqs2 binary with the version in the module system
# - replace variables with static files and locations
# - replace DB2; initially BFD/Mgnify, now PDB70
# - remove DB3; initially ColabfoldDB
# - keep query database (search against UniRef30), since we are interested in the sequence alignments
# NOTE(review): this region is taken from a commit diff view, so removed and
# added lines may be interleaved here — confirm against the checked-out file
# which of these statements belong to the current version of the script.
# Environment: load cluster modules and activate the conda environment.
module load CUDA
conda activate /g/arendt/npapadop/repos/condas/maf
module load MMseqs2
export OMP_NUM_THREADS=128
# MMSEQS="$1"
# NOTE(review): MMSEQS is later handed to colabfold_search via --mmseqs; the
# value looks like a directory rather than a path to a binary — confirm.
MMSEQS="/g/arendt/npapadop/"
# Hard-coded input file, database directory and working/result directory.
QUERY="/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep"
DBBASE="/scratch/npapadop/database"
BASE="/scratch/npapadop/result"
# Database names, resolved relative to DBBASE at the call sites.
DB1="uniref30_2103_db"
DB2="PDB"
# DB3="$7"
# Default tuning values; the FILTER=1 branch below overrides some of them.
FILTER="1"
THREADS="128"
SENSITIVITY=8
EXPAND_EVAL=inf
ALIGN_EVAL=10
DIFF=3000
QSC=-20.0
MAX_ACCEPT=1000000
# With filtering enabled, ALIGN_EVAL, QSC and MAX_ACCEPT are reassigned;
# EXPAND_EVAL keeps its default because its override is commented out.
if [ "${FILTER}" = "1" ]; then
# 0.1 was not used in benchmarks due to POSIX shell bug in line above
# EXPAND_EVAL=0.1
ALIGN_EVAL=10
QSC=0.8
MAX_ACCEPT=100000
fi
export MMSEQS_CALL_DEPTH=1
# Option strings assembled once; they are expanded unquoted where they are
# used so that each option becomes a separate argument.
SEARCH_PARAM="--num-iterations 3 --db-load-mode 2 -a -s ${SENSITIVITY} -e 0.1 --max-seqs 10000"
FILTER_PARAM="--filter-msa ${FILTER} --filter-min-enable 1000 --diff ${DIFF} --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
EXPAND_PARAM="--expansion-mode 0 -e ${EXPAND_EVAL} --expand-filter-clusters ${FILTER} --max-seq-id 0.95"
# Make sure the working directory exists before any database is written to it.
mkdir -p "${BASE}"
# Build the query database from QUERY and search it against DB1.
# $SEARCH_PARAM is deliberately unquoted so it splits into individual options.
mmseqs createdb "${QUERY}" "${BASE}/qdb"
mmseqs search "${BASE}/qdb" "${DBBASE}/${DB1}" "${BASE}/res" "${BASE}/tmp" $SEARCH_PARAM
# Expand the search result against the DB1 index into res_exp.
mmseqs expandaln "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp" --db-load-mode 2 --threads ${THREADS} ${EXPAND_PARAM}
# Take the profile database found under tmp/latest as prof_res and link the
# query header database to it (presumably left there by the iterative search
# above — confirm).
mmseqs mvdb "${BASE}/tmp/latest/profile_1" "${BASE}/prof_res"
mmseqs lndb "${BASE}/qdb_h" "${BASE}/prof_res_h"
# Realign the expanded hits with the profile, filter them, and write the
# result to ${BASE}/uniref.a3m.
mmseqs align "${BASE}/prof_res" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp" "${BASE}/res_exp_realign" --db-load-mode 2 --threads ${THREADS} -e ${ALIGN_EVAL} --max-accept ${MAX_ACCEPT} --alt-ali 10 -a
mmseqs filterresult "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp_realign" "${BASE}/res_exp_realign_filter" --db-load-mode 2 --threads ${THREADS} --qid 0 --qsc $QSC --diff 0 --max-seq-id 1.0 --filter-min-enable 100
mmseqs result2msa "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp_realign_filter" "${BASE}/uniref.a3m" --msa-format-mode 6 --db-load-mode 2 --threads ${THREADS} ${FILTER_PARAM}
# Drop the intermediate result databases created above.
mmseqs rmdb "${BASE}/res_exp_realign"
mmseqs rmdb "${BASE}/res_exp"
mmseqs rmdb "${BASE}/res"
mmseqs rmdb "${BASE}/res_exp_realign_filter"
# align profile database against PDB
mmseqs search "${BASE}/prof_res" "${DBBASE}/${DB2}" "${BASE}/res_pdb" "${BASE}/tmp" --db-load-mode 2 --threads ${THREADS} -s 7.5 -a -e 0.1
# Export the DB2 hits as a tab-separated table (${BASE}/${DB2}.m8) with the
# columns listed in --format-output, then remove the raw result database.
mmseqs convertalis "${BASE}/prof_res" "${DBBASE}/${DB2}.idx" "${BASE}/res_pdb" "${BASE}/${DB2}.m8" --threads ${THREADS} --format-output query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,cigar --db-load-mode 2
mmseqs rmdb "${BASE}/res_pdb"
# NOTE(review): BASE is re-pointed to /scratch/npapadop/ here, so the clean-up
# commands below no longer target the result directory used earlier in the
# script. This text comes from a commit diff, so some of these lines may be
# removed lines rather than current code — confirm against the actual file.
BASE="/scratch/npapadop/"
# Remove the query database, its header database and leftover results.
mmseqs rmdb "${BASE}/qdb"
mmseqs rmdb "${BASE}/qdb_h"
mmseqs rmdb "${BASE}/res"
# Delete every file whose name starts with prof_res, and the tmp directory.
rm -f -- "${BASE}/prof_res"*
rm -rf -- "${BASE}/tmp"
# Run colabfold_search from inside BASE so that the relative arguments
# database/ and ./ resolve against it.
cd "${BASE}"
colabfold_search "${QUERY}" database/ ./ --mmseqs "${MMSEQS}" --threads 128
\ No newline at end of file
#!/bin/bash -ex
#SBATCH -J databases
#SBATCH -t 04:00:00
#SBATCH -c 5
#SBATCH --mem=50G
#SBATCH -o /g/arendt/npapadop/cluster/databases2.out
#SBATCH -e /g/arendt/npapadop/cluster/databases2.err
#SBATCH -t 10:00:00
#SBATCH -c 10
#SBATCH --mem=400G
#SBATCH -o /g/arendt/npapadop/cluster/databases.out
#SBATCH -e /g/arendt/npapadop/cluster/databases.err
# Positional arguments:
#   $1 - database identifier handed to `mmseqs databases`
#   $2 - name of the database to create (also used as a directory name)
#   $3 - directory in which the database directory is created
DATABASE="$1"
DATABASE_NAME="$2"
OUTPUT_DIR="$3"
module load aria2
module load bzip2
# Expansions are quoted so that paths containing spaces or glob characters
# reach cd/mkdir as a single, unmodified argument.
cd "${OUTPUT_DIR}"
mkdir "${DATABASE_NAME}"
cd "${DATABASE_NAME}"
# Setup everything for using mmseqs locally
ARIA_NUM_CONN=8
WORKDIR="/scratch/npapadop/database/"
MMSEQS="/g/arendt/npapadop/repos/MMseqs2/build/bin/mmseqs"
module load MMseqs2
cd "${WORKDIR}"
# Create the requested database under the given name, using ./tmp as the
# scratch directory argument.
mmseqs databases "${DATABASE}" "${DATABASE_NAME}" tmp
# Report whether "$1" can be resolved by `command -v`.
# Arguments: $1 - command name to look up
# Returns:   0 if the lookup succeeds, 1 otherwise; nothing is printed.
hasCommand() {
  local tool="$1"
  if command -v "${tool}" > /dev/null 2>&1; then
    return 0
  fi
  return 1
}
# Unload the MMseqs2 module that was loaded earlier in this script.
module unload MMseqs2
\ No newline at end of file
# Collect the available download back-ends, in order of preference, as a
# space-separated list (each entry is prefixed with a space).
STRATEGY=""
hasCommand aria2c && STRATEGY="$STRATEGY ARIA"
hasCommand curl && STRATEGY="$STRATEGY CURL"
hasCommand wget && STRATEGY="$STRATEGY WGET"
# Without any back-end there is nothing to download with, so bail out.
[ -n "$STRATEGY" ] || fail "No download tool found in PATH. Please install aria2c, curl or wget."
#######################################
# Download a URL to a local file, trying each tool named in STRATEGY in order
# until one succeeds.
# Globals:
#   STRATEGY       (read) whitespace-separated list of ARIA, CURL and/or WGET
#   ARIA_NUM_CONN  (read) connections per server passed to aria2c
# Arguments:
#   $1 - URL to fetch
#   $2 - path of the output file
# Returns:
#   0 as soon as one tool succeeds; otherwise `fail` is called with an error
#   message and its status is returned.
# Notes:
#   errexit is switched off while the tools are tried and is switched on again
#   on every way out of the function. All working variables are local, so the
#   caller's URL/OUTPUT/FILENAME/DIR/i variables are never overwritten.
#######################################
downloadFile() {
  local url="$1"
  local output="$2"
  local tool filename dir
  set +e
  # Intentionally unquoted: STRATEGY is a whitespace-separated list.
  for tool in $STRATEGY; do
    case "$tool" in
      ARIA)
        # aria2c takes the file name and the target directory separately.
        filename=$(basename "${output}")
        dir=$(dirname "${output}")
        if aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$filename" -d "$dir" "$url"; then
          set -e
          return 0
        fi
        ;;
      CURL)
        if curl -L -o "$output" "$url"; then
          set -e
          return 0
        fi
        ;;
      WGET)
        if wget -O "$output" "$url"; then
          set -e
          return 0
        fi
        ;;
    esac
  done
  set -e
  fail "Could not download $url to $output"
}
# Fetch, unpack, convert and index one ColabFold database archive unless its
# marker file already exists in the current directory.
# Globals:   MMSEQS (read) command used to run mmseqs
# Arguments: $1 - archive base name (file is <name>.tar.gz, database <name>_db)
#            $2 - marker file touched once all steps have completed
#            $3 - temporary directory name passed to createindex
prepareColabfoldDb() {
  local name="$1" marker="$2" tmpdir="$3"
  if [ -f "${marker}" ]; then
    return 0
  fi
  downloadFile "http://wwwuser.gwdg.de/~compbiol/colabfold/${name}.tar.gz" "${name}.tar.gz"
  tar xzvf "${name}.tar.gz"
  ${MMSEQS} tsv2exprofiledb "${name}" "${name}_db"
  ${MMSEQS} createindex "${name}_db" "${tmpdir}" --remove-tmp-files 1 --threads 10
  touch "${marker}"
}

prepareColabfoldDb "uniref30_2103" UNIREF30_READY tmp1
# TODO: split memory value for createindex?
prepareColabfoldDb "colabfold_envdb_202108" COLABDB_READY tmp2
module unload aria2
module unload bzip2
\ No newline at end of file
#!/bin/bash
#SBATCH --time=2-00:00:00
#SBATCH -e AF_%x_err.txt
#SBATCH -o AF_%x_out.txt
#SBATCH --qos=normal
#SBATCH -p gpu
#SBATCH -N 1
#SBATCH --ntasks=32
#SBATCH --mem=512000
# Load the software this job relies on through environment modules.
module load AlphaFold
module load GCC/10.2.0
module load tqdm
module load matplotlib
# Placeholder: replace <your dir> with the directory that contains the
# ColabFold checkout before submitting; it is used for PYTHONPATH and for the
# colabfold_batch path below.
SOFTWARE_DIR=<your dir>
export PYTHONPATH=$SOFTWARE_DIR/ColabFold:$PYTHONPATH
# If you use --cpus-per-task=X and --ntasks=1 your script should contain:
# export ALPHAFOLD_JACKHMMER_N_CPU=$SLURM_CPUS_PER_TASK
# export ALPHAFOLD_HHBLITS_N_CPU=$SLURM_CPUS_PER_TASK
# TF_FORCE_UNIFIED_MEMORY='1' XLA_PYTHON_CLIENT_MEM_FRACTION='4.0' are optional but may be necessary for bigger sequences.
# If you read this after 2050-01-01, probably you want to adjust the date
# Add "--model-type AlphaFold2-ptm" option to run the old ColabFold for complexes,
# equivalent to the original https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/beta/AlphaFold2_advanced.ipynb
# Run colabfold_batch on dimer.fasta with ./ as the second positional argument
# (presumably the output directory — confirm) and --templates; the two memory
# variables apply to this command only, and `time` reports its duration.
TF_FORCE_UNIFIED_MEMORY='1' XLA_PYTHON_CLIENT_MEM_FRACTION='4.0' time $SOFTWARE_DIR/ColabFold/bin/colabfold_batch dimer.fasta ./ --templates
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment