Skip to content
Snippets Groups Projects
Commit c63c7b63 authored by Nikolaos Papadopoulos's avatar Nikolaos Papadopoulos
Browse files

sequence alignments vs UniRef30, PDB

parent 26bcef93
No related branches found
No related tags found
No related merge requests found
align.sh 0 → 100755
#!/bin/bash -ex
#SBATCH -J align
#SBATCH -t 10:00:00
#SBATCH -c 128
#SBATCH --mem=200G
#SBATCH -o /g/arendt/npapadop/cluster/align_%j.out
#SBATCH -e /g/arendt/npapadop/cluster/align_%j.err
# a heavily modified version of the colabfold_search.sh script from https://github.com/sokrypton/ColabFold/
# briefly:
# - we replace the MMseqs2 binary with the version in the module system
# - replace variables with static files and locations
# - replace DB2; initially BFD/Mgnify, now PDB70
# - remove DB3; initially ColabfoldDB
# - keep query database (search against UniRef30), since we are interested in the sequence alignments
module load MMseqs2
export OMP_NUM_THREADS=128
# MMSEQS="$1"
QUERY="/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep"
DBBASE="/scratch/npapadop/database"
BASE="/scratch/npapadop/result"
DB1="uniref30_2103_db"
DB2="PDB"
# DB3="$7"
FILTER="1"
THREADS="128"
SENSITIVITY=8
EXPAND_EVAL=inf
ALIGN_EVAL=10
DIFF=3000
QSC=-20.0
MAX_ACCEPT=1000000
if [ "${FILTER}" = "1" ]; then
# 0.1 was not used in benchmarks due to POSIX shell bug in line above
# EXPAND_EVAL=0.1
ALIGN_EVAL=10
QSC=0.8
MAX_ACCEPT=100000
fi
export MMSEQS_CALL_DEPTH=1
SEARCH_PARAM="--num-iterations 3 --db-load-mode 2 -a -s ${SENSITIVITY} -e 0.1 --max-seqs 10000"
FILTER_PARAM="--filter-msa ${FILTER} --filter-min-enable 1000 --diff ${DIFF} --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
EXPAND_PARAM="--expansion-mode 0 -e ${EXPAND_EVAL} --expand-filter-clusters ${FILTER} --max-seq-id 0.95"
mkdir -p "${BASE}"
mmseqs createdb "${QUERY}" "${BASE}/qdb"
mmseqs search "${BASE}/qdb" "${DBBASE}/${DB1}" "${BASE}/res" "${BASE}/tmp" $SEARCH_PARAM
mmseqs expandaln "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp" --db-load-mode 2 --threads ${THREADS} ${EXPAND_PARAM}
mmseqs mvdb "${BASE}/tmp/latest/profile_1" "${BASE}/prof_res"
mmseqs lndb "${BASE}/qdb_h" "${BASE}/prof_res_h"
mmseqs align "${BASE}/prof_res" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp" "${BASE}/res_exp_realign" --db-load-mode 2 --threads ${THREADS} -e ${ALIGN_EVAL} --max-accept ${MAX_ACCEPT} --alt-ali 10 -a
mmseqs filterresult "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp_realign" "${BASE}/res_exp_realign_filter" --db-load-mode 2 --threads ${THREADS} --qid 0 --qsc $QSC --diff 0 --max-seq-id 1.0 --filter-min-enable 100
mmseqs result2msa "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp_realign_filter" "${BASE}/uniref.a3m" --msa-format-mode 6 --db-load-mode 2 --threads ${THREADS} ${FILTER_PARAM}
mmseqs rmdb "${BASE}/res_exp_realign"
mmseqs rmdb "${BASE}/res_exp"
mmseqs rmdb "${BASE}/res"
mmseqs rmdb "${BASE}/res_exp_realign_filter"
# align profile database against PDB
mmseqs search "${BASE}/prof_res" "${DBBASE}/${DB2}" "${BASE}/res_pdb" "${BASE}/tmp" --db-load-mode 2 --threads ${THREADS} -s 7.5 -a -e 0.1
mmseqs convertalis "${BASE}/prof_res" "${DBBASE}/${DB2}.idx" "${BASE}/res_pdb" "${BASE}/${DB2}.m8" --threads ${THREADS} --format-output query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,cigar --db-load-mode 2
mmseqs rmdb "${BASE}/res_pdb"
mmseqs rmdb "${BASE}/qdb"
mmseqs rmdb "${BASE}/qdb_h"
mmseqs rmdb "${BASE}/res"
rm -f -- "${BASE}/prof_res"*
rm -rf -- "${BASE}/tmp"
#!/bin/bash -ex
#SBATCH -J create_index
#SBATCH -t 02:00:00
#SBATCH -c 12
#SBATCH --mem=50G
#SBATCH -t 08:00:00
#SBATCH -c 64
#SBATCH --mem=200G
#SBATCH -o /g/arendt/npapadop/cluster/create_index.out
#SBATCH -e /g/arendt/npapadop/cluster/create_index.err
OUTPUT_DIR="$1"
DATABASE="$2"
cd ${OUTPUT_DIR}
cd /scratch/npapadop/database
module load MMseqs2
mmseqs createindex ${DATABASE} tmp --threads 12 --remove-tmp-files 1
# mmseqs tsv2exprofiledb "uniref30_2103" "uniref30_2103_db"
# mmseqs createindex "uniref30_2103_db" tmp1 --remove-tmp-files 1 --threads 64
mmseqs databases PDB PDB tmp
mmseqs createindex PDB tmp3 --remove-tmp-files 1 --threads 64
module unload MMseqs2
\ No newline at end of file
#!/bin/bash -ex
SCRIPT="/g/arendt/npapadop/repos/spongfold/align.sh"
QUERYDB="/scratch/npapadop/Spongilla_lacustris_70AAcutoffTransDecoder/spongilla"
TARGETDB="/scratch/npapadop/PDB70/PDB70"
RESULTDB="/scratch/npapadop/spongilla_pdb70/result"
TMP="/scratch/npapadop/spongilla_pdb70/tmp"
sbatch ${SCRIPT} ${QUERYDB} ${TARGETDB} ${RESULTDB} ${TMP}
\ No newline at end of file
#!/bin/bash -ex
SCRIPT="/g/arendt/npapadop/repos/spongfold/align.sh"
QUERY="/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep"
DBBASE="/scratch/npapadop"
BASE="/scratch/npapadop"
DB1="Uniref100/Uniref100"
FILTER="1"
sbatch ${SCRIPT} ${QUERY} ${DBBASE} ${BASE} ${DB1} ${FILTER}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment