From 62a76855f392da1080edaf2e793f4777e1ebffd6 Mon Sep 17 00:00:00 2001
From: Milot Mirdita <milot@mirdita.de>
Date: Thu, 28 Jul 2022 13:26:23 +0900
Subject: [PATCH] Add scripts to reproduce the sequence and profile search
 based analysis

---
 scripts/sequence_annotation/README.md        | 34 ++++++++++++
 scripts/sequence_annotation/hhblits_lock.sh  | 54 ++++++++++++++++++++
 scripts/sequence_annotation/run_hhblits.sh   |  4 ++
 scripts/sequence_annotation/run_mmseqs.sh    |  9 ++++
 scripts/sequence_annotation/setup_hhblits.sh |  6 +++
 scripts/sequence_annotation/setup_mmseqs.sh  |  5 ++
 6 files changed, 112 insertions(+)
 create mode 100644 scripts/sequence_annotation/README.md
 create mode 100755 scripts/sequence_annotation/hhblits_lock.sh
 create mode 100755 scripts/sequence_annotation/run_hhblits.sh
 create mode 100755 scripts/sequence_annotation/run_mmseqs.sh
 create mode 100755 scripts/sequence_annotation/setup_hhblits.sh
 create mode 100755 scripts/sequence_annotation/setup_mmseqs.sh

diff --git a/scripts/sequence_annotation/README.md b/scripts/sequence_annotation/README.md
new file mode 100644
index 0000000..140acbc
--- /dev/null
+++ b/scripts/sequence_annotation/README.md
@@ -0,0 +1,34 @@
+# sequence and profile based annotation scripts
+
+The directory contains all scripts to rerun the sequence and profile search based annotations.
+
+## MMseqs2
+
+We perform a profile (based on the generated ColabFold MSAs) to sequence (uniref100, part of ColabFold's UniRef30 2021_03) search.
+
+### `setup_mmseqs.sh`
+
+This script will download the same MMseqs2 version that was used to generate the presented results.
+It assumes that an archive with all ColabFold MSAs (`msa.tar.gz`) is present in the same folder.
+
+### `run_mmseqs.sh`
+
+Executes MMseqs2. Please adjust the `COLABFOLD_DB_PATH` to point to a folder containing the UniRef30 2021_03 from:
+http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz
+
+## HHblits
+
+We perform a profile (based on the generated ColabFold MSAs) to profile (UniRef30_2021_03) search.
+
+### `setup_hhblits.sh`
+
+This script will download the same HHblits version that was used to generate the presented results and creates a HHblits readable version of the previously generated MSA database. Assumes that `setup_mmseqs.sh` was already executed.
+
+### `run_hhblits.sh`
+
+Executes HHblits. Please adjust the `HHSUITE_DB_PATH` variable to  to point to a folder containing the Uniref30 2021_03 from:
+http://wwwuser.gwdg.de/~compbiol/uniclust/2021_03/UniRef30_2021_03.tar.gz
+
+### `hhblits_lock.sh`
+
+Workaround helper scripts to get around a performance issue with `hhblits_omp` on many CPU-cores.
diff --git a/scripts/sequence_annotation/hhblits_lock.sh b/scripts/sequence_annotation/hhblits_lock.sh
new file mode 100755
index 0000000..1ccf227
--- /dev/null
+++ b/scripts/sequence_annotation/hhblits_lock.sh
@@ -0,0 +1,54 @@
+#!/bin/bash -e
+INPUT="$1"
+OUTPUT="$(readlink -f $2)"
+N="$3"
+shift
+shift
+shift
+mkdir -p "${OUTPUT}/a3m"
+mkdir -p "${OUTPUT}/hhr"
+mkdir -p "${OUTPUT}/m8"
+
+open_sem() {
+    mkfifo pipe-$$
+    exec 3<>pipe-$$
+    rm pipe-$$
+    local i=$1
+    for ((;i>0;i--)); do
+        printf %s 000 >&3
+    done
+}
+
+# run the given command asynchronously and pop/push tokens
+run_with_lock() {
+    local x
+    # this read waits until there is something to read
+    read -u 3 -n 3 x && ((0==x)) || exit $x
+    (
+        ( "$@"; )
+        # push the return code of the command to the semaphore
+        printf '%.3d' $? >&3
+    )&
+}
+
+task() {
+    shift
+    shift
+    shift
+    if [ ! -e "${OUTPUT}/a3m/${KEY}" ] && [ ! -e "${OUTPUT}/hhr/${KEY}" ] && [ ! -e "${OUTPUT}/m8/${KEY}" ]; then
+        dd if=${INPUT}.ffdata ibs=1 skip="${OFF}" count="${LEN}" status=none | \
+	    ./hhsuite/bin/hhblits -i stdin -oa3m "${OUTPUT}/a3m/${KEY}" -o "${OUTPUT}/hhr/${KEY}" -blasttab "${OUTPUT}/m8/${KEY}" "${@}" -cpu 1
+    fi
+}
+
+open_sem $N
+while read -r KEY OFF LEN; do
+    run_with_lock task "${KEY}" "${OFF}" "${LEN}" "${@}"
+done < ${INPUT}.ffindex
+
+wait
+
+(cd "${OUTPUT}/a3m" && ./hhsuite/bin/ffindex_build -s "${OUTPUT}_a3m.ffdata" "${OUTPUT}_a3m.ffindex" .)
+(cd "${OUTPUT}/hhr" && ./hhsuite/bin/ffindex_build -s "${OUTPUT}_hhr.ffdata" "${OUTPUT}_hhr.ffindex" .)
+(cd "${OUTPUT}/m8" && ./hhsuite/bin/ffindex_build -s "${OUTPUT}_m8.ffdata" "${OUTPUT}_m8.ffindex" .)
+
diff --git a/scripts/sequence_annotation/run_hhblits.sh b/scripts/sequence_annotation/run_hhblits.sh
new file mode 100755
index 0000000..c2ac091
--- /dev/null
+++ b/scripts/sequence_annotation/run_hhblits.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+HHSUITE_DB_PATH=/storage/databases/uniref30
+NCORES=128
+./hhblits_lock.sh cfmsa_db cfmsa_hhblits ${NCORES} -d ${HHSUITE_DB_PATH}/UniRef30_2021_03 -n 1 -cpu 1 -E 100
diff --git a/scripts/sequence_annotation/run_mmseqs.sh b/scripts/sequence_annotation/run_mmseqs.sh
new file mode 100755
index 0000000..5bdd8c6
--- /dev/null
+++ b/scripts/sequence_annotation/run_mmseqs.sh
@@ -0,0 +1,9 @@
+#!/bin/bash -e
+COLABFOLD_DB_PATH=/storage/databases/colabfold_db_all
+./mmseqs/bin/mmseqs search cfprof ${COLABFOLD_DB_PATH}/uniref30_2103_db_seq cfres tmp -s 7.5 -a -e 1
+./mmseqs/bin/mmseqs convertalis cfprof ${COLABFOLD_DB_PATH}/uniref30_2103_db_seq cfres cfress75.m8
+./mmseqs/bin/mmseqs filterdb cfmsa_db cf_msa_head --extract-lines 1
+./mmseqs/bin/mmseqs prefixid cf_msa_head cf_msa.tsv --tsv
+join <(sort -n cfmsa_db.lookup) <(sort -n cfmsa_headers.tsv) > cfmsa_db_headers.lookup
+awk 'BEGIN { OFS="\t" } NR == FNR { f[$2] = $4; next; } $1 in f { $1 = f[$1]; print; }' cfmsa_db_headers.lookup cfress75.m8 > cfress75_fixed_queries.m8
+
diff --git a/scripts/sequence_annotation/setup_hhblits.sh b/scripts/sequence_annotation/setup_hhblits.sh
new file mode 100755
index 0000000..cc81d56
--- /dev/null
+++ b/scripts/sequence_annotation/setup_hhblits.sh
@@ -0,0 +1,6 @@
+#!/bin/sh -e
+wget https://mmseqs.com/archive/4c0cd66434ce0b83ccd247053f57989fdd53d82b/hhsuite-linux-avx2.tar.gz
+tar xzvf hhsuite-linux-avx2.tar.gz
+
+ln -s cfmsa_db cfmsa_db.ffdata
+LC_ALL=C sort cfmsa_db.index > cfmsa_db.ffindex
diff --git a/scripts/sequence_annotation/setup_mmseqs.sh b/scripts/sequence_annotation/setup_mmseqs.sh
new file mode 100755
index 0000000..88add94
--- /dev/null
+++ b/scripts/sequence_annotation/setup_mmseqs.sh
@@ -0,0 +1,5 @@
+#!/bin/sh -e
+wget https://mmseqs.com/archive/7ebd2e0441e5c3bdec585317c2b1c3cdbf943568/mmseqs-linux-avx2.tar.gz
+tar xzvf mmseqs-linux-avx2.tar.gz
+MMSEQS_FORCE_MERGE=1 ./mmseqs/bin/mmseqs tar2db msa.tar.gz cfmsa_db --output-dbtype 11 --threads 8
+./mmseqs/bin/mmseqs msa2profile cfmsa_db cfprof --threads 64
-- 
GitLab