Skip to content
Snippets Groups Projects
Commit 62a76855 authored by Milot Mirdita's avatar Milot Mirdita Committed by Nikolaos Papadopoulos
Browse files

Add scripts to reproduce the sequence and profile search based analysis

parent e619cd50
Branches main
Tags v1.2
No related merge requests found
# sequence and profile based annotation scripts
The directory contains all scripts to rerun the sequence and profile search based annotations.
## MMseqs2
We perform a profile-to-sequence search: the profiles are based on the generated ColabFold MSAs, and the target sequences are UniRef100 (part of ColabFold's UniRef30 2021_03).
### `setup_mmseqs.sh`
This script will download the same MMseqs2 version that was used to generate the presented results.
It assumes that an archive with all ColabFold MSAs (`msa.tar.gz`) is present in the same folder.
### `run_mmseqs.sh`
Executes MMseqs2. Please adjust the `COLABFOLD_DB_PATH` to point to a folder containing the UniRef30 2021_03 from:
http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz
## HHblits
We perform a profile (based on the generated ColabFold MSAs) to profile (UniRef30_2021_03) search.
### `setup_hhblits.sh`
This script will download the same HHblits version that was used to generate the presented results and create an HHblits-readable version of the previously generated MSA database. It assumes that `setup_mmseqs.sh` was already executed.
### `run_hhblits.sh`
Executes HHblits. Please adjust the `HHSUITE_DB_PATH` variable to point to a folder containing the UniRef30 2021_03 from:
http://wwwuser.gwdg.de/~compbiol/uniclust/2021_03/UniRef30_2021_03.tar.gz
### `hhblits_lock.sh`
Helper script that works around a performance issue with `hhblits_omp` on many CPU cores.
#!/bin/bash -e
# hhblits_lock.sh - run one single-threaded hhblits process per entry of an
# ffindex database, keeping at most N of them running at the same time.
#
# Usage: hhblits_lock.sh INPUT OUTPUT N [hhblits options...]
#   INPUT   database prefix; INPUT.ffdata and INPUT.ffindex are read
#   OUTPUT  output prefix; per-entry results are written below
#           OUTPUT/a3m, OUTPUT/hhr and OUTPUT/m8
#   N       maximum number of concurrent jobs (positive integer)
# Every argument after N is forwarded unchanged to hhblits.

# The -e on the shebang line is lost when the script is started as
# "bash hhblits_lock.sh", so enable it explicitly as well.
set -e

if [ "$#" -lt 3 ]; then
  printf 'Usage: %s INPUT OUTPUT N [hhblits options...]\n' "${0##*/}" >&2
  exit 2
fi

INPUT="$1"
# Stored as an absolute path because OUTPUT is still used after the script
# has changed into the result directories.
OUTPUT="$(readlink -f -- "$2")"
N="$3"
shift 3

# N is the number of semaphore tokens. With zero tokens the first job would
# wait forever, so reject anything that is not a positive integer.
case "${N}" in
  '' | *[!0-9]*)
    printf '%s: N must be a positive integer, got "%s"\n' "${0##*/}" "${N}" >&2
    exit 2
    ;;
esac
if [ "${N}" -lt 1 ]; then
  printf '%s: N must be a positive integer, got "%s"\n' "${0##*/}" "${N}" >&2
  exit 2
fi

mkdir -p "${OUTPUT}/a3m"
mkdir -p "${OUTPUT}/hhr"
mkdir -p "${OUTPUT}/m8"
#######################################
# Create the job semaphore on file descriptor 3 and fill it with tokens.
# The semaphore is a FIFO opened read/write on fd 3; every token is the three
# characters "000".
# Arguments: $1 - number of tokens to put into the semaphore
# Returns:   non-zero if the FIFO could not be created
#######################################
open_sem() {
  local count=$1
  local fifo_dir
  local i

  # Create the FIFO in a private temporary directory instead of under a
  # predictable name in the current directory, which may be read-only.
  fifo_dir=$(mktemp -d) || return
  if ! mkfifo "${fifo_dir}/sem"; then
    rm -rf -- "${fifo_dir}"
    return 1
  fi
  exec 3<>"${fifo_dir}/sem"
  # The open descriptor keeps the FIFO alive; the directory entry is no
  # longer needed.
  rm -rf -- "${fifo_dir}"

  for ((i = count; i > 0; i--)); do
    printf %s 000 >&3
  done
}
#######################################
# Run a command in the background as soon as a semaphore token is free.
# A token is three digits read from fd 3. It carries the exit status of the
# job that returned it ("000" for the initial tokens), so a failed job makes
# the next call exit the script with that status.
# Arguments: "$@" - command to run, followed by its arguments
#######################################
run_with_lock() {
  local token
  local status=0

  # This read blocks until a token is available.
  read -r -u 3 -n 3 token || exit 1
  # 10# forces base 10: a token such as "008" is not a valid octal number.
  if ((10#${token} != 0)); then
    exit "$((10#${token}))"
  fi

  (
    # Capture the status with "||" instead of letting errexit kill this
    # subshell. Otherwise a failing command would never give its token back:
    # the failure would go unreported and the semaphore would lose a slot.
    ("$@") || status=$?
    # Push the return code of the command to the semaphore.
    printf '%.3d' "${status}" >&3
  ) &
}
#######################################
# Run hhblits on one database entry, unless a result for it already exists.
# The entry is cut out of INPUT.ffdata by byte offset and length and piped to
# hhblits on stdin.
# Globals:   INPUT (read), OUTPUT (read)
# Arguments: $1 - entry name, used as the output file name
#            $2 - byte offset of the entry in INPUT.ffdata
#            $3 - length of the entry in bytes
#            remaining arguments are forwarded to hhblits
# Returns:   status of hhblits, or 0 if the entry was skipped
#######################################
task() {
  local key=$1
  local offset=$2
  local length=$3
  shift 3

  # Skip the entry as soon as any of its three result files exists.
  if [ ! -e "${OUTPUT}/a3m/${key}" ] && [ ! -e "${OUTPUT}/hhr/${key}" ] && [ ! -e "${OUTPUT}/m8/${key}" ]; then
    # skip_bytes/count_bytes keep skip= and count= in bytes while dd reads in
    # 64K blocks instead of issuing one read per byte.
    dd if="${INPUT}.ffdata" bs=64K iflag=skip_bytes,count_bytes \
      skip="${offset}" count="${length}" status=none \
      | ./hhsuite/bin/hhblits -i stdin -oa3m "${OUTPUT}/a3m/${key}" -o "${OUTPUT}/hhr/${key}" -blasttab "${OUTPUT}/m8/${key}" "$@" -cpu 1
  fi
}
open_sem "${N}"

# Start one background job per index line (name, offset, length).
# run_with_lock blocks while N jobs are already running.
while read -r KEY OFF LEN; do
  run_with_lock task "${KEY}" "${OFF}" "${LEN}" "$@"
done < "${INPUT}.ffindex"
wait

# ffindex_build is started from inside each result directory, where the
# relative ./hhsuite path no longer resolves, so make it absolute first.
FFINDEX_BUILD="$(readlink -f -- ./hhsuite/bin/ffindex_build)"
for RESULT_KIND in a3m hhr m8; do
  (cd "${OUTPUT}/${RESULT_KIND}" && "${FFINDEX_BUILD}" -s "${OUTPUT}_${RESULT_KIND}.ffdata" "${OUTPUT}_${RESULT_KIND}.ffindex" .)
done
#!/bin/sh -e
# run_hhblits.sh - search the cfmsa_db database against UniRef30_2021_03 by
# handing every entry to ./hhblits_lock.sh.
#
# Environment (optional; the defaults are the previously hard-coded values):
#   HHSUITE_DB_PATH  folder that contains the UniRef30_2021_03 database
#   NCORES           number of hhblits processes to run concurrently

# The -e on the shebang line is lost when the script is started as
# "sh run_hhblits.sh", so enable it explicitly as well.
set -e

HHSUITE_DB_PATH="${HHSUITE_DB_PATH:-/storage/databases/uniref30}"
NCORES="${NCORES:-128}"

# Arguments after NCORES are forwarded to hhblits by hhblits_lock.sh.
./hhblits_lock.sh cfmsa_db cfmsa_hhblits "${NCORES}" -d "${HHSUITE_DB_PATH}/UniRef30_2021_03" -n 1 -cpu 1 -E 100
#!/bin/bash -e
# run_mmseqs.sh - search the cfprof profiles against the UniRef30 2021_03
# sequence database and write the hits, with the query column rewritten,
# to cfress75_fixed_queries.m8.
#
# Environment (optional; the default is the previously hard-coded value):
#   COLABFOLD_DB_PATH  folder that contains uniref30_2103_db_seq

# The -e on the shebang line is lost when the script is started as
# "bash run_mmseqs.sh", so enable it explicitly as well.
set -e

COLABFOLD_DB_PATH="${COLABFOLD_DB_PATH:-/storage/databases/colabfold_db_all}"
MMSEQS=./mmseqs/bin/mmseqs
TARGET_DB="${COLABFOLD_DB_PATH}/uniref30_2103_db_seq"

"${MMSEQS}" search cfprof "${TARGET_DB}" cfres tmp -s 7.5 -a -e 1
"${MMSEQS}" convertalis cfprof "${TARGET_DB}" cfres cfress75.m8

# Extract the first line of every MSA and dump it as "<key><TAB><line>".
# The TSV name must match the file that join reads below.
"${MMSEQS}" filterdb cfmsa_db cf_msa_head --extract-lines 1
"${MMSEQS}" prefixid cf_msa_head cfmsa_headers.tsv --tsv

# join needs both inputs in the same lexical order on the join field; a
# numeric sort (sort -n) does not guarantee that. Fix the locale so that sort
# and join agree on the collation.
LC_ALL=C join \
  <(LC_ALL=C sort -k1,1 cfmsa_db.lookup) \
  <(LC_ALL=C sort -k1,1 cfmsa_headers.tsv) \
  > cfmsa_db_headers.lookup

# First file: remember field 4 under the key in field 2. Second file: replace
# the first column by the remembered value and print the row tab-separated.
# Rows whose first column has no mapping are dropped.
awk 'BEGIN { OFS="\t" } NR == FNR { f[$2] = $4; next; } $1 in f { $1 = f[$1]; print; }' cfmsa_db_headers.lookup cfress75.m8 > cfress75_fixed_queries.m8
#!/bin/sh -e
# setup_hhblits.sh - download the pinned HHblits build and expose the existing
# cfmsa_db database under the ffindex file names (cfmsa_db.ffdata and
# cfmsa_db.ffindex). Expects cfmsa_db and cfmsa_db.index in the current
# directory.

# The -e on the shebang line is lost when the script is started as
# "sh setup_hhblits.sh", so enable it explicitly as well.
set -e

# -O always writes to this exact name. Without it a second run stores the
# download as "hhsuite-linux-avx2.tar.gz.1" and tar keeps extracting the old,
# possibly incomplete, archive.
wget -O hhsuite-linux-avx2.tar.gz https://mmseqs.com/archive/4c0cd66434ce0b83ccd247053f57989fdd53d82b/hhsuite-linux-avx2.tar.gz
tar xzvf hhsuite-linux-avx2.tar.gz

# -f replaces a link left over from an earlier run instead of failing.
ln -sf cfmsa_db cfmsa_db.ffdata
# Write the index sorted in byte order, independent of the user's locale.
LC_ALL=C sort cfmsa_db.index > cfmsa_db.ffindex
#!/bin/sh -e
# setup_mmseqs.sh - download the pinned MMseqs2 build, convert msa.tar.gz into
# the cfmsa_db database and build the cfprof profile database from it.
# Expects msa.tar.gz in the current directory.

# The -e on the shebang line is lost when the script is started as
# "sh setup_mmseqs.sh", so enable it explicitly as well.
set -e

# -O always writes to this exact name. Without it a second run stores the
# download as "mmseqs-linux-avx2.tar.gz.1" and tar keeps extracting the old,
# possibly incomplete, archive.
wget -O mmseqs-linux-avx2.tar.gz https://mmseqs.com/archive/7ebd2e0441e5c3bdec585317c2b1c3cdbf943568/mmseqs-linux-avx2.tar.gz
tar xzvf mmseqs-linux-avx2.tar.gz

MMSEQS_FORCE_MERGE=1 ./mmseqs/bin/mmseqs tar2db msa.tar.gz cfmsa_db --output-dbtype 11 --threads 8
./mmseqs/bin/mmseqs msa2profile cfmsa_db cfprof --threads 64
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment