From 62a76855f392da1080edaf2e793f4777e1ebffd6 Mon Sep 17 00:00:00 2001 From: Milot Mirdita <milot@mirdita.de> Date: Thu, 28 Jul 2022 13:26:23 +0900 Subject: [PATCH] Add scripts to reproduce the sequence and profile search based analysis --- scripts/sequence_annotation/README.md | 34 ++++++++++++ scripts/sequence_annotation/hhblits_lock.sh | 54 ++++++++++++++++++++ scripts/sequence_annotation/run_hhblits.sh | 4 ++ scripts/sequence_annotation/run_mmseqs.sh | 9 ++++ scripts/sequence_annotation/setup_hhblits.sh | 6 +++ scripts/sequence_annotation/setup_mmseqs.sh | 5 ++ 6 files changed, 112 insertions(+) create mode 100644 scripts/sequence_annotation/README.md create mode 100755 scripts/sequence_annotation/hhblits_lock.sh create mode 100755 scripts/sequence_annotation/run_hhblits.sh create mode 100755 scripts/sequence_annotation/run_mmseqs.sh create mode 100755 scripts/sequence_annotation/setup_hhblits.sh create mode 100755 scripts/sequence_annotation/setup_mmseqs.sh diff --git a/scripts/sequence_annotation/README.md b/scripts/sequence_annotation/README.md new file mode 100644 index 0000000..140acbc --- /dev/null +++ b/scripts/sequence_annotation/README.md @@ -0,0 +1,34 @@ +# sequence and profile based annotation scripts + +The directory contains all scripts to rerun the sequence and profile search based annotations. + +## MMseqs2 + +We perform a profile (based on the generated ColabFold MSAs) to sequence (uniref100, part of ColabFold's UniRef30 2021_03) search. + +### `setup_mmseqs.sh` + +This script will download the same MMseqs2 version that was used to generate the presented results. +It assumes that an archive with all ColabFold MSAs (`msa.tar.gz`) is present in the same folder. + +### `run_mmseqs.sh` + +Executes MMseqs2. Please adjust the `COLABFOLD_DB_PATH` to point to a folder containing the UniRef30 2021_03 from: +http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz + +## HHblits + +We perform a profile (based on the generated ColabFold MSAs) to profile (UniRef30_2021_03) search. + +### `setup_hhblits.sh` + +This script will download the same HHblits version that was used to generate the presented results and creates a HHblits readable version of the previously generated MSA database. Assumes that `setup_mmseqs.sh` was already executed. + +### `run_hhblits.sh` + +Executes HHblits. Please adjust the `HHSUITE_DB_PATH` variable to to point to a folder containing the Uniref30 2021_03 from: +http://wwwuser.gwdg.de/~compbiol/uniclust/2021_03/UniRef30_2021_03.tar.gz + +### `hhblits_lock.sh` + +Workaround helper scripts to get around a performance issue with `hhblits_omp` on many CPU-cores. diff --git a/scripts/sequence_annotation/hhblits_lock.sh b/scripts/sequence_annotation/hhblits_lock.sh new file mode 100755 index 0000000..1ccf227 --- /dev/null +++ b/scripts/sequence_annotation/hhblits_lock.sh @@ -0,0 +1,54 @@ +#!/bin/bash -e +INPUT="$1" +OUTPUT="$(readlink -f $2)" +N="$3" +shift +shift +shift +mkdir -p "${OUTPUT}/a3m" +mkdir -p "${OUTPUT}/hhr" +mkdir -p "${OUTPUT}/m8" + +open_sem() { + mkfifo pipe-$$ + exec 3<>pipe-$$ + rm pipe-$$ + local i=$1 + for ((;i>0;i--)); do + printf %s 000 >&3 + done +} + +# run the given command asynchronously and pop/push tokens +run_with_lock() { + local x + # this read waits until there is something to read + read -u 3 -n 3 x && ((0==x)) || exit $x + ( + ( "$@"; ) + # push the return code of the command to the semaphore + printf '%.3d' $? >&3 + )& +} + +task() { + shift + shift + shift + if [ ! -e "${OUTPUT}/a3m/${KEY}" ] && [ ! -e "${OUTPUT}/hhr/${KEY}" ] && [ ! -e "${OUTPUT}/m8/${KEY}" ]; then + dd if=${INPUT}.ffdata ibs=1 skip="${OFF}" count="${LEN}" status=none | \ + ./hhsuite/bin/hhblits -i stdin -oa3m "${OUTPUT}/a3m/${KEY}" -o "${OUTPUT}/hhr/${KEY}" -blasttab "${OUTPUT}/m8/${KEY}" "${@}" -cpu 1 + fi +} + +open_sem $N +while read -r KEY OFF LEN; do + run_with_lock task "${KEY}" "${OFF}" "${LEN}" "${@}" +done < ${INPUT}.ffindex + +wait + +(cd "${OUTPUT}/a3m" && ./hhsuite/bin/ffindex_build -s "${OUTPUT}_a3m.ffdata" "${OUTPUT}_a3m.ffindex" .) +(cd "${OUTPUT}/hhr" && ./hhsuite/bin/ffindex_build -s "${OUTPUT}_hhr.ffdata" "${OUTPUT}_hhr.ffindex" .) +(cd "${OUTPUT}/m8" && ./hhsuite/bin/ffindex_build -s "${OUTPUT}_m8.ffdata" "${OUTPUT}_m8.ffindex" .) + diff --git a/scripts/sequence_annotation/run_hhblits.sh b/scripts/sequence_annotation/run_hhblits.sh new file mode 100755 index 0000000..c2ac091 --- /dev/null +++ b/scripts/sequence_annotation/run_hhblits.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +HHSUITE_DB_PATH=/storage/databases/uniref30 +NCORES=128 +./hhblits_lock.sh cfmsa_db cfmsa_hhblits ${NCORES} -d ${HHSUITE_DB_PATH}/UniRef30_2021_03 -n 1 -cpu 1 -E 100 diff --git a/scripts/sequence_annotation/run_mmseqs.sh b/scripts/sequence_annotation/run_mmseqs.sh new file mode 100755 index 0000000..5bdd8c6 --- /dev/null +++ b/scripts/sequence_annotation/run_mmseqs.sh @@ -0,0 +1,9 @@ +#!/bin/bash -e +COLABFOLD_DB_PATH=/storage/databases/colabfold_db_all +./mmseqs/bin/mmseqs search cfprof ${COLABFOLD_DB_PATH}/uniref30_2103_db_seq cfres tmp -s 7.5 -a -e 1 +./mmseqs/bin/mmseqs convertalis cfprof ${COLABFOLD_DB_PATH}/uniref30_2103_db_seq cfres cfress75.m8 +./mmseqs/bin/mmseqs filterdb cfmsa_db cf_msa_head --extract-lines 1 +./mmseqs/bin/mmseqs prefixid cf_msa_head cf_msa.tsv --tsv +join <(sort -n cfmsa_db.lookup) <(sort -n cfmsa_headers.tsv) > cfmsa_db_headers.lookup +awk 'BEGIN { OFS="\t" } NR == FNR { f[$2] = $4; next; } $1 in f { $1 = f[$1]; print; }' cfmsa_db_headers.lookup cfress75.m8 > cfress75_fixed_queries.m8 + diff --git a/scripts/sequence_annotation/setup_hhblits.sh b/scripts/sequence_annotation/setup_hhblits.sh new file mode 100755 index 0000000..cc81d56 --- /dev/null +++ b/scripts/sequence_annotation/setup_hhblits.sh @@ -0,0 +1,6 @@ +#!/bin/sh -e +wget https://mmseqs.com/archive/4c0cd66434ce0b83ccd247053f57989fdd53d82b/hhsuite-linux-avx2.tar.gz +tar xzvf hhsuite-linux-avx2.tar.gz + +ln -s cfmsa_db cfmsa_db.ffdata +LC_ALL=C sort cfmsa_db.index > cfmsa_db.ffindex diff --git a/scripts/sequence_annotation/setup_mmseqs.sh b/scripts/sequence_annotation/setup_mmseqs.sh new file mode 100755 index 0000000..88add94 --- /dev/null +++ b/scripts/sequence_annotation/setup_mmseqs.sh @@ -0,0 +1,5 @@ +#!/bin/sh -e +wget https://mmseqs.com/archive/7ebd2e0441e5c3bdec585317c2b1c3cdbf943568/mmseqs-linux-avx2.tar.gz +tar xzvf mmseqs-linux-avx2.tar.gz +MMSEQS_FORCE_MERGE=1 ./mmseqs/bin/mmseqs tar2db msa.tar.gz cfmsa_db --output-dbtype 11 --threads 8 +./mmseqs/bin/mmseqs msa2profile cfmsa_db cfprof --threads 64 -- GitLab