Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MorF
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Arendt Group
MorF
Commits
ce6a741e
Commit
ce6a741e
authored
3 years ago
by
Nikolaos Papadopoulos
Browse files
Options
Downloads
Patches
Plain Diff
installed colabfold dependencies, trying to run
parent
5044660b
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
align.sh
+8
-59
8 additions, 59 deletions
align.sh
databases.sh
+65
-14
65 additions, 14 deletions
databases.sh
predict_protein.sh
+28
-0
28 additions, 0 deletions
predict_protein.sh
with
101 additions
and
73 deletions
align.sh
+
8
−
59
View file @
ce6a741e
...
...
@@ -3,66 +3,15 @@
#SBATCH -t 10:00:00
#SBATCH -c 128
#SBATCH --mem=200G
#SBATCH -o /g/arendt/npapadop/cluster/align
_%j
.out
#SBATCH -e /g/arendt/npapadop/cluster/align
_%j
.err
#SBATCH -o /g/arendt/npapadop/cluster/align.out
#SBATCH -e /g/arendt/npapadop/cluster/align.err
# a heavily modified version of the colabfold_search.sh script from https://github.com/sokrypton/ColabFold/
# briefly:
# - we replace the MMseqs2 binary with the version in the module system
# - replace variables with static files and locations
# - replace DB2; initially BFD/Mgnify, now PDB70
# - remove DB3; initially ColabfoldDB
# - keep query database (search against UniRef30), since we are interested in the sequence alignments
module load CUDA
conda activate /g/arendt/npapadop/repos/condas/maf
module load MMseqs2
export
OMP_NUM_THREADS
=
128
# MMSEQS="$1"
MMSEQS
=
"/g/arendt/npapadop/"
QUERY
=
"/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep"
DBBASE
=
"/scratch/npapadop/database"
BASE
=
"/scratch/npapadop/result"
DB1
=
"uniref30_2103_db"
DB2
=
"PDB"
# DB3="$7"
FILTER
=
"1"
THREADS
=
"128"
SENSITIVITY
=
8
EXPAND_EVAL
=
inf
ALIGN_EVAL
=
10
DIFF
=
3000
QSC
=
-20
.0
MAX_ACCEPT
=
1000000
if
[
"
${
FILTER
}
"
=
"1"
]
;
then
# 0.1 was not used in benchmarks due to POSIX shell bug in line above
# EXPAND_EVAL=0.1
ALIGN_EVAL
=
10
QSC
=
0.8
MAX_ACCEPT
=
100000
fi
export
MMSEQS_CALL_DEPTH
=
1
SEARCH_PARAM
=
"--num-iterations 3 --db-load-mode 2 -a -s
${
SENSITIVITY
}
-e 0.1 --max-seqs 10000"
FILTER_PARAM
=
"--filter-msa
${
FILTER
}
--filter-min-enable 1000 --diff
${
DIFF
}
--qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
EXPAND_PARAM
=
"--expansion-mode 0 -e
${
EXPAND_EVAL
}
--expand-filter-clusters
${
FILTER
}
--max-seq-id 0.95"
mkdir
-p
"
${
BASE
}
"
mmseqs createdb
"
${
QUERY
}
"
"
${
BASE
}
/qdb"
mmseqs search
"
${
BASE
}
/qdb"
"
${
DBBASE
}
/
${
DB1
}
"
"
${
BASE
}
/res"
"
${
BASE
}
/tmp"
$SEARCH_PARAM
mmseqs expandaln
"
${
BASE
}
/qdb"
"
${
DBBASE
}
/
${
DB1
}
.idx"
"
${
BASE
}
/res"
"
${
DBBASE
}
/
${
DB1
}
.idx"
"
${
BASE
}
/res_exp"
--db-load-mode
2
--threads
${
THREADS
}
${
EXPAND_PARAM
}
mmseqs mvdb
"
${
BASE
}
/tmp/latest/profile_1"
"
${
BASE
}
/prof_res"
mmseqs lndb
"
${
BASE
}
/qdb_h"
"
${
BASE
}
/prof_res_h"
mmseqs align
"
${
BASE
}
/prof_res"
"
${
DBBASE
}
/
${
DB1
}
.idx"
"
${
BASE
}
/res_exp"
"
${
BASE
}
/res_exp_realign"
--db-load-mode
2
--threads
${
THREADS
}
-e
${
ALIGN_EVAL
}
--max-accept
${
MAX_ACCEPT
}
--alt-ali
10
-a
mmseqs filterresult
"
${
BASE
}
/qdb"
"
${
DBBASE
}
/
${
DB1
}
.idx"
"
${
BASE
}
/res_exp_realign"
"
${
BASE
}
/res_exp_realign_filter"
--db-load-mode
2
--threads
${
THREADS
}
--qid
0
--qsc
$QSC
--diff
0
--max-seq-id
1.0
--filter-min-enable
100
mmseqs result2msa
"
${
BASE
}
/qdb"
"
${
DBBASE
}
/
${
DB1
}
.idx"
"
${
BASE
}
/res_exp_realign_filter"
"
${
BASE
}
/uniref.a3m"
--msa-format-mode
6
--db-load-mode
2
--threads
${
THREADS
}
${
FILTER_PARAM
}
mmseqs rmdb
"
${
BASE
}
/res_exp_realign"
mmseqs rmdb
"
${
BASE
}
/res_exp"
mmseqs rmdb
"
${
BASE
}
/res"
mmseqs rmdb
"
${
BASE
}
/res_exp_realign_filter"
# align profile database against PDB
mmseqs search
"
${
BASE
}
/prof_res"
"
${
DBBASE
}
/
${
DB2
}
"
"
${
BASE
}
/res_pdb"
"
${
BASE
}
/tmp"
--db-load-mode
2
--threads
${
THREADS
}
-s
7.5
-a
-e
0.1
mmseqs convertalis
"
${
BASE
}
/prof_res"
"
${
DBBASE
}
/
${
DB2
}
.idx"
"
${
BASE
}
/res_pdb"
"
${
BASE
}
/
${
DB2
}
.m8"
--threads
${
THREADS
}
--format-output
query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,cigar
--db-load-mode
2
mmseqs rmdb
"
${
BASE
}
/res_pdb"
BASE
=
"/scratch/npapadop/"
mmseqs rmdb
"
${
BASE
}
/qdb"
mmseqs rmdb
"
${
BASE
}
/qdb_h"
mmseqs rmdb
"
${
BASE
}
/res"
rm
-f
--
"
${
BASE
}
/prof_res"
*
rm
-rf
--
"
${
BASE
}
/tmp"
cd
"
${
BASE
}
"
colabfold_search
"
${
QUERY
}
"
database/ ./
--mmseqs
"
${
MMSEQS
}
"
--threads
128
\ No newline at end of file
This diff is collapsed.
Click to expand it.
databases.sh
+
65
−
14
View file @
ce6a741e
#!/bin/bash -ex
#SBATCH -J databases
#SBATCH -t 0
4
:00:00
#SBATCH -c
5
#SBATCH --mem=
5
0G
#SBATCH -o /g/arendt/npapadop/cluster/databases
2
.out
#SBATCH -e /g/arendt/npapadop/cluster/databases
2
.err
#SBATCH -t
1
0:00:00
#SBATCH -c
10
#SBATCH --mem=
40
0G
#SBATCH -o /g/arendt/npapadop/cluster/databases.out
#SBATCH -e /g/arendt/npapadop/cluster/databases.err
DATABASE
=
"
$1
"
DATABASE_NAME
=
"
$2
"
OUTPUT_DIR
=
"
$3
"
module load aria2
module load bzip2
cd
${
OUTPUT_DIR
}
mkdir
${
DATABASE_NAME
}
cd
${
DATABASE_NAME
}
# Setup everything for using mmseqs locally
ARIA_NUM_CONN
=
8
WORKDIR
=
"/scratch/npapadop/database/"
MMSEQS
=
"/g/arendt/npapadop/repos/MMseqs2/build/bin/mmseqs"
module load MMseqs2
cd
"
${
WORKDIR
}
"
mmseqs databases
${
DATABASE
}
${
DATABASE_NAME
}
tmp
# Report whether a command can be resolved by the current shell.
# Arguments: $1 - name of the command to look up
# Outputs:   nothing; stdout and stderr of the lookup are discarded
# Returns:   0 if `command -v` resolves the name, non-zero otherwise
hasCommand() {
  command -v "$1" >/dev/null 2>&1
}
module unload MMseqs2
\ No newline at end of file
# Build the space-separated list of download tools that are available in PATH.
# The order of the entries (ARIA, CURL, WGET) is the order in which
# downloadFile iterates over them.
STRATEGY=""
if hasCommand aria2c; then STRATEGY="$STRATEGY ARIA"; fi
if hasCommand curl; then STRATEGY="$STRATEGY CURL"; fi
if hasCommand wget; then STRATEGY="$STRATEGY WGET"; fi
if [ "$STRATEGY" = "" ]; then
  # The script defines no `fail` helper, so report on stderr and abort
  # explicitly instead of relying on a "command not found" error.
  printf '%s\n' "No download tool found in PATH. Please install aria2c, curl or wget." >&2
  exit 1
fi
# Download a URL to a local path with the first tool in STRATEGY that succeeds.
# Globals:   STRATEGY (read)      - space-separated list of ARIA, CURL, WGET
#            ARIA_NUM_CONN (read) - connections per server passed to aria2c
# Arguments: $1 - URL to fetch
#            $2 - destination file path
# Outputs:   an error message on stderr when every tool failed
# Returns:   0 after the first successful download, 1 if no tool succeeded
# Side effects: errexit (`set -e`) is enabled when the function returns.
downloadFile() {
  local url="$1"
  local output="$2"
  local tool filename dir

  # errexit is switched off while the tools are tried so that a failing tool
  # falls through to the next candidate.
  set +e
  # STRATEGY is deliberately unquoted: it is a whitespace-separated word list.
  for tool in $STRATEGY; do
    case "$tool" in
      ARIA)
        # aria2c takes the output file name and target directory separately.
        filename=$(basename "${output}")
        dir=$(dirname "${output}")
        aria2c --max-connection-per-server="$ARIA_NUM_CONN" \
          --allow-overwrite=true -o "$filename" -d "$dir" "$url" \
          && set -e && return 0
        ;;
      CURL)
        curl -L -o "$output" "$url" && set -e && return 0
        ;;
      WGET)
        wget -O "$output" "$url" && set -e && return 0
        ;;
    esac
  done
  set -e

  # The script defines no `fail` helper; report the failure directly and let
  # the caller (or errexit) act on the non-zero status.
  printf '%s\n' "Could not download $url to $output" >&2
  return 1
}
# Download, unpack and index UniRef30 unless the UNIREF30_READY marker file
# already exists in the current directory; the marker is created after the
# last step so a later run skips this block.
if [ ! -f UNIREF30_READY ]; then
  downloadFile "http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz" "uniref30_2103.tar.gz"
  tar xzvf "uniref30_2103.tar.gz"
  # MMSEQS holds the path of the mmseqs binary; quoted so the path is passed
  # as a single command word.
  "${MMSEQS}" tsv2exprofiledb "uniref30_2103" "uniref30_2103_db"
  "${MMSEQS}" createindex "uniref30_2103_db" tmp1 --remove-tmp-files 1 --threads 10
  touch UNIREF30_READY
fi
# Download, unpack and index the ColabFold environmental database unless the
# COLABDB_READY marker file already exists in the current directory; the marker
# is created after the last step so a later run skips this block.
if [ ! -f COLABDB_READY ]; then
  downloadFile "http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz" "colabfold_envdb_202108.tar.gz"
  tar xzvf "colabfold_envdb_202108.tar.gz"
  # MMSEQS holds the path of the mmseqs binary; quoted so the path is passed
  # as a single command word.
  "${MMSEQS}" tsv2exprofiledb "colabfold_envdb_202108" "colabfold_envdb_202108_db"
  # TODO: split memory value for createindex?
  "${MMSEQS}" createindex "colabfold_envdb_202108_db" tmp2 --remove-tmp-files 1 --threads 10
  touch COLABDB_READY
fi
module unload aria2
module unload bzip2
\ No newline at end of file
This diff is collapsed.
Click to expand it.
predict_protein.sh
0 → 100755
+
28
−
0
View file @
ce6a741e
#!/bin/bash
#SBATCH --time=2-00:00:00
#SBATCH -e AF_%x_err.txt
#SBATCH -o AF_%x_out.txt
#SBATCH --qos=normal
#SBATCH -p gpu
#SBATCH -N 1
#SBATCH --ntasks=32
#SBATCH --mem=512000

# Slurm batch job that runs colabfold_batch on dimer.fasta and writes the
# results to the submission directory.
#
# Required environment:
#   SOFTWARE_DIR - directory that contains the ColabFold checkout
#                  (the script uses $SOFTWARE_DIR/ColabFold).

module load AlphaFold
module load GCC/10.2.0
module load tqdm
module load matplotlib

# The former `<your dir>` placeholder was parsed by bash as redirections and
# stopped the script with a syntax error; require the value explicitly instead.
SOFTWARE_DIR="${SOFTWARE_DIR:?set SOFTWARE_DIR to the directory that contains the ColabFold checkout}"
export PYTHONPATH="${SOFTWARE_DIR}/ColabFold:$PYTHONPATH"

# If you use --cpus-per-task=X and --ntasks=1 your script should contain:
# export ALPHAFOLD_JACKHMMER_N_CPU=$SLURM_CPUS_PER_TASK
# export ALPHAFOLD_HHBLITS_N_CPU=$SLURM_CPUS_PER_TASK

# TF_FORCE_UNIFIED_MEMORY='1' XLA_PYTHON_CLIENT_MEM_FRACTION='4.0' are optional but may be necessary for bigger sequences.
# If you read this after 2050-01-01, probably you want to adjust the date
# Add "--model-type AlphaFold2-ptm" option to run the old ColabFold for complexes,
# equivalent to the original https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/beta/AlphaFold2_advanced.ipynb

# Exported so that both settings are part of the environment of the
# colabfold_batch process started below.
export TF_FORCE_UNIFIED_MEMORY='1'
export XLA_PYTHON_CLIENT_MEM_FRACTION='4.0'

time "${SOFTWARE_DIR}/ColabFold/bin/colabfold_batch" dimer.fasta ./ --templates
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment