From 8299e04deab777ac0c2e193a0a20d81e1a8e239a Mon Sep 17 00:00:00 2001
From: Nikolaos Papadopoulos <nikolaos.papadopoulos@embl.de>
Date: Wed, 15 Jun 2022 07:54:39 +0200
Subject: [PATCH] script cleanup - moved to scripts folder - wrote README -
 removed obsolete - renamed properly

---
 README.md                                     | 51 +++++++++++++++----
 batch_predict.sh                              | 29 -----------
 gpu_test.sh                                   | 25 ---------
 mkdb.sh                                       | 20 --------
 predict_protein.sh                            | 28 ----------
 predict_structure.sh                          | 24 ---------
 scripts/README.md                             | 26 ++++++++++
 align.sh => scripts/align.sh                  |  0
 databases.sh => scripts/databases.sh          |  0
 create_index.sh => scripts/databases_pdb.sh   |  5 +-
 fs_afdb.sh => scripts/fs_afdb.sh              |  0
 fs_pdb.sh => scripts/fs_pdb.sh                |  0
 fs_query.sh => scripts/fs_query.sh            |  0
 .../fs_search_afdb.sh                         |  0
 fs_search_pdb.sh => scripts/fs_search_pdb.sh  |  0
 .../fs_search_swissprot.sh                    |  0
 fs_sp.sh => scripts/fs_sp.sh                  |  0
 milot.sh => scripts/predict_structures.sh     |  0
 submit_colab.sh => scripts/submit_colab.sh    |  0
 spongilla_create_index.sh                     |  7 ---
 spongilla_mkdb.sh                             |  8 ---
 spongilla_pdb70.sh                            |  9 ----
 spongilla_uniref100.sh                        | 10 ----
 23 files changed, 70 insertions(+), 172 deletions(-)
 delete mode 100755 batch_predict.sh
 delete mode 100755 gpu_test.sh
 delete mode 100755 mkdb.sh
 delete mode 100755 predict_protein.sh
 delete mode 100755 predict_structure.sh
 create mode 100644 scripts/README.md
 rename align.sh => scripts/align.sh (100%)
 rename databases.sh => scripts/databases.sh (100%)
 rename create_index.sh => scripts/databases_pdb.sh (73%)
 rename fs_afdb.sh => scripts/fs_afdb.sh (100%)
 rename fs_pdb.sh => scripts/fs_pdb.sh (100%)
 rename fs_query.sh => scripts/fs_query.sh (100%)
 rename fs_search_afdb.sh => scripts/fs_search_afdb.sh (100%)
 rename fs_search_pdb.sh => scripts/fs_search_pdb.sh (100%)
 rename fs_search_swissprot.sh => scripts/fs_search_swissprot.sh (100%)
 rename fs_sp.sh => scripts/fs_sp.sh (100%)
 rename milot.sh => scripts/predict_structures.sh (100%)
 rename submit_colab.sh => scripts/submit_colab.sh (100%)
 delete mode 100755 spongilla_create_index.sh
 delete mode 100755 spongilla_mkdb.sh
 delete mode 100755 spongilla_pdb70.sh
 delete mode 100755 spongilla_uniref100.sh

diff --git a/README.md b/README.md
index 62a7752..a1964f5 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,63 @@
 # spongfold
 
-This repository documents the idea of Fabi and Niko to use predicted protein structures and structure similarity to annotate proteomes.
+This repository documents the idea of Fabi and Niko to use predicted protein structures and
+structure similarity to annotate proteomes.
 
 ## Background
-[AlphaFold](https://www.nature.com/articles/s41586-021-03819-2) changed how we think about protein structures. By leveraging deep learning, multiple sequence alignments, and the ever-expanding library of solved protein structures, AlphaFold is able to predict three-dimensional protein structures at resolutions that rival solved crystal structures, and has immediately found use in large parts of biological research.
+[AlphaFold](https://www.nature.com/articles/s41586-021-03819-2) changed how we think about protein
+structures. By leveraging deep learning, multiple sequence alignments, and the ever-expanding
+library of solved protein structures, AlphaFold is able to predict three-dimensional protein
+structures at resolutions that rival solved crystal structures, and has immediately found use in
+large parts of biological research.
 
 One of the older dogmata in molecular biology concerns proteins, and holds that
 
 > Sequence defines structure defines function.
 
-In the past this was taken to mean that sequence similarity, above a certain threshold, correlated with sequence homology or even orthology. Sufficient sequence similarity was then inferred to mean high structural similarity (this was mostly proven by the success of homology modelling in CASP) as well as functional equivalence. In fact, "bidirectional best BLAST hits" is the _de facto_ baseline for functional annotation.
+In the past this was taken to mean that sequence similarity, above a certain threshold, correlated
+with sequence homology or even orthology. Sufficient sequence similarity was then inferred to mean
+high structural similarity (this was mostly proven by the success of homology modelling in CASP) as
+well as functional equivalence. In fact, "bidirectional best BLAST hits" is the _de facto_ baseline
+for functional annotation.
 
-The question of the threshold turns out to be an important one. Burkhardt Rost famously outlined it as the ["twilight zone"](https://pubmed.ncbi.nlm.nih.gov/10195279/) of protein similarity; briefly, while we can confidently assert structural similarity (at least to the fold level) for sequences with >30% sequence identity, this deteriorates impressively fast at 25% sequence identity. In the years since this pronouncement, sequence search algorithms with increased sensitivity exploited the mountains of new sequencing data to dive into the twilight zone and detect remote homology; however, at low sequence identity, the sequence information alone is still not enough to guarantee homology.
+The question of the threshold turns out to be an important one. Burkhardt Rost famously outlined it
+as the ["twilight zone"](https://pubmed.ncbi.nlm.nih.gov/10195279/) of protein similarity; briefly,
+while we can confidently assert structural similarity (at least to the fold level) for sequences
+with >30% sequence identity, this deteriorates impressively fast at 25% sequence identity. In the
+years since this pronouncement, sequence search algorithms with increased sensitivity exploited the
+mountains of new sequencing data to dive into the twilight zone and detect remote homology; however,
+at low sequence identity, the sequence information alone is still not enough to guarantee homology.
 
-However, structure is more conserved than sequence. In theory, predicted structures can be compared against known structures that are otherwise annotated, allowing for the transfer of functional annotations (albeit less specific than sequence-based ones, since we will be detecting very remote homology at best). This is of particular interest for non-model organisms, especially ones outside the well-studied taxonomic groups (e.g. vertebrates or ecdysozoans).
+However, structure is more conserved than sequence. In theory, predicted structures can be compared
+against known structures that are otherwise annotated, allowing for the transfer of functional
+annotations (albeit less specific than sequence-based ones, since we will be detecting very remote
+homology at best). This is of particular interest for non-model organisms, especially ones outside
+the well-studied taxonomic groups (e.g. vertebrates or ecdysozoans).
 
 ## Follow-up ideas
-Having a phylome, scRNAseq/cell type annotation, functional proteomic data and the prediction of protein structures for a non-bilaterian Metazoan presents a unique combination that would allow to ask many fundamental questions. Potential follow-up analysis include:
+Having a phylome, scRNAseq/cell type annotation, functional proteomic data and the prediction of
+protein structures for a non-bilaterian Metazoan presents a unique combination that would allow to
+ask many fundamental questions. Potential follow-up analysis include:
 
-- Correltation between AF prediction accuracy (overall, domain specific, etc.) and sequence identity/similarity or bitscore of best FoldSeek hit. I.e.: "Does higher sequence identity mean better prediction accuracy?"
-- Relationship between identified homologs through sequence search (orthofinder, eggnog-mapper, blast, phylome) and best hits in FoldSeek for single sponge proteins. I.e.: "Do the best AF hits also include proteins identified as homologs in the phylome? Is there a sequence identity threshold to that?"
-- Is there biological meaning to best FoldSeek hits of un-annotated, highly expressed genes in the scRNAseq dataset or differentially regulated hits in the functional proteomic datasets?. I.e.: "Can we transfer function / functionally annotate previously un-annotated hits in scRNAseq and protomics data and most importantly, do these hits make sense when taking prior knowledge into account?"
+- Correlation between AF prediction accuracy (overall, domain specific, etc.) and sequence
+  identity/similarity or bitscore of best FoldSeek hit. I.e.: "Does higher sequence identity mean
+  better prediction accuracy?"
+- Relationship between identified homologs through sequence search (orthofinder, eggnog-mapper,
+  blast, phylome) and best hits in FoldSeek for single sponge proteins. I.e.: "Do the best AF hits
+  also include proteins identified as homologs in the phylome? Is there a sequence identity
+  threshold to that?"
+- Is there biological meaning to best FoldSeek hits of un-annotated, highly expressed genes in the
+  scRNAseq dataset or differentially regulated hits in the functional proteomic datasets?. I.e.:
+  "Can we transfer function / functionally annotate previously un-annotated hits in scRNAseq and
+  protomics data and most importantly, do these hits make sense when taking prior knowledge into
+  account?"
 
 ## Usage
 (eventually a tutorial on what order to use the scripts in, if we don't have a master script).
 
 ## Roadmap
-We will write this in as general a way as possible so that it can be reused by others/for other species. Platynereis is an obvious candidate after we proof _Spongilla_.
+We will write this in as general a way as possible so that it can be reused by others/for other
+species. Platynereis is an obvious candidate after we proof _Spongilla_.
 
 ## Authors and contributions
 
diff --git a/batch_predict.sh b/batch_predict.sh
deleted file mode 100755
index 63d6a76..0000000
--- a/batch_predict.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash -ex
-#SBATCH -t 03-00
-#SBATCH -o /g/arendt/npapadop/cluster/alphafold-%j.out
-#SBATCH -e /g/arendt/npapadop/cluster/alphafold-%j.err
-#SBATCH -p gpu
-#SBATCH -C gpu=A100
-#SBATCH --gres=gpu:1
-#SBATCH -N 1
-#SBATCH --ntasks=32
-#SBATCH --mem=512000
-
-module load Anaconda3
-module load GCC
-module load bzip2
-module load CUDA
-source ~/.bash_profile
-conda activate /g/arendt/npapadop/repos/condas/maf
-
-BASE="/scratch/npapadop/"
-BATCH=$1
-
-cd "${BASE}"
-
-echo "$BATCH"
-
-colabfold_batch msas/"$BATCH"/ spongilla_structures/ --stop-at-score 85
-
-module unload
-
diff --git a/gpu_test.sh b/gpu_test.sh
deleted file mode 100755
index c1f7b35..0000000
--- a/gpu_test.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash -ex
-#SBATCH -J predict
-#SBATCH -t 30:00
-#SBATCH -o /g/arendt/npapadop/cluster/2080alphafold.out
-#SBATCH -e /g/arendt/npapadop/cluster/2080alphafold.err
-#SBATCH -p gpu
-#SBATCH -C gpu=2080Ti
-#SBATCH --gres=gpu:1
-#SBATCH -N 1
-#SBATCH --ntasks=32
-#SBATCH --mem=256000
-
-module load Anaconda3
-module load GCC
-module load bzip2
-module load CUDA
-source ~/.bash_profile
-conda activate /g/arendt/npapadop/repos/condas/maf
-
-BASE="/scratch/npapadop/results/"
-
-cd "${BASE}"
-colabfold_batch test/ predictions/ --stop-at-score 85
-
-module unload
\ No newline at end of file
diff --git a/mkdb.sh b/mkdb.sh
deleted file mode 100755
index 568d741..0000000
--- a/mkdb.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash -ex
-#SBATCH -J mkdb
-#SBATCH -t 10:00
-#SBATCH -c 1
-#SBATCH --mem=1G
-#SBATCH -o /g/arendt/npapadop/cluster/mkdb.out
-#SBATCH -e /g/arendt/npapadop/cluster/mkdb.err
-
-INPUT_PEP="$1"
-OUTPUT_DIR="$2"
-NAME="$3"
-
-mkdir ${OUTPUT_DIR}
-cd ${OUTPUT_DIR}
-
-module load MMseqs2
-
-mmseqs createdb ${INPUT_PEP} ${NAME}
-
-module unload MMseqs2
\ No newline at end of file
diff --git a/predict_protein.sh b/predict_protein.sh
deleted file mode 100755
index 123c1ee..0000000
--- a/predict_protein.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-#SBATCH --time=2-00:00:00
-#SBATCH -e AF_%x_err.txt
-#SBATCH -o AF_%x_out.txt
-#SBATCH --qos=normal
-#SBATCH -p gpu
-#SBATCH -N 1
-#SBATCH --ntasks=32
-#SBATCH --mem=512000
-
-module load AlphaFold
-module load GCC/10.2.0
-module load tqdm
-module load matplotlib
-
-SOFTWARE_DIR=<your dir>
-export PYTHONPATH=$SOFTWARE_DIR/ColabFold:$PYTHONPATH
-
-# If you use --cpus-per-task=X and --ntasks=1 your script should contain:
-# export ALPHAFOLD_JACKHMMER_N_CPU=$SLURM_CPUS_PER_TASK
-# export ALPHAFOLD_HHBLITS_N_CPU=$SLURM_CPUS_PER_TASK
-
-# TF_FORCE_UNIFIED_MEMORY='1' XLA_PYTHON_CLIENT_MEM_FRACTION='4.0' are optional but may be necessary for bigger sequences.
-# If you read this after 2050-01-01, probably you want to adjust the date
-# Add "--model-type AlphaFold2-ptm" option to run the old ColabFold for complexes,
-# equivalent to the original https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/beta/AlphaFold2_advanced.ipynb
-TF_FORCE_UNIFIED_MEMORY='1' XLA_PYTHON_CLIENT_MEM_FRACTION='4.0' time $SOFTWARE_DIR/ColabFold/bin/colabfold_batch dimer.fasta ./ --templates
\ No newline at end of file
diff --git a/predict_structure.sh b/predict_structure.sh
deleted file mode 100755
index 193de40..0000000
--- a/predict_structure.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash -ex
-#SBATCH -J test_colab
-#SBATCH -t 30:00
-#SBATCH -o /g/arendt/npapadop/cluster/alphafold_test.out
-#SBATCH -e /g/arendt/npapadop/cluster/alphafold_test.err
-#SBATCH -p gpu
-#SBATCH -C gpu=A100
-#SBATCH --gres=gpu:1
-#SBATCH -N 1
-#SBATCH --ntasks=32
-#SBATCH --mem=512000
-
-module load Anaconda3
-module load GCC
-module load bzip2
-module load CUDA
-source ~/.bash_profile
-conda activate /g/arendt/npapadop/repos/condas/colabfold
-
-BASE="/scratch/npapadop/"
-
-colabfold_batch "$BASE"/msas/test/ "$BASE"/test_res/ --stop-at-score 85
-
-module unload
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..c388a3c
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,26 @@
+# scripts usage
+
+Here we list the scripts in their logical use order:
+
+### Get sequence databases and obtain multiple sequence alignments for _Spongilla_.
+
+* `databases.sh`: download and build indices for the sequence databases UniRef30 and ColabFold
+  EnvDB. Modified from ColabFold.
+* `databases_pdb.sh`: download (sequence) PDB and build index
+* `align.sh`: build multiple sequence alignments for the _Spongilla_ proteome. A wrapper around
+  `colabfold_search`.
+
+### Predict structures for _Spongilla_
+
+* `fasta-splitter.pl`: written by Kirill Kryukov. A utility to partition FASTA files into pieces.
+  Used with `--n-parts 32` to split the _Spongilla_ proteome in batches.
+* `predict_structures.sh`: a wrapper around `colabfold_batch` to submit jobs to the EMBL cluster.
+* `submit_colab.sh`: a simple for loop to handle all 32 batches.
+
+### Use FoldSeek to search against available structures
+
+* `fs_query.sh`: build structural database for _Spongilla_ predicted structures.
+* `fs_afdb.sh, fs_pdb.sh, fs_sp.sh`: download the precomputed FoldSeek databases for AFDB, PDB, and
+  SwissProt, respectively. Split in three so we could run them in parallel.
+* `fs_search_afdb.sh, fs_search_pdb.sh, fs_search_swissprot.sh`: search with the _Spongilla_
+  structure database against the three target databases, AFDB, PDB, and SwissProt. 
\ No newline at end of file
diff --git a/align.sh b/scripts/align.sh
similarity index 100%
rename from align.sh
rename to scripts/align.sh
diff --git a/databases.sh b/scripts/databases.sh
similarity index 100%
rename from databases.sh
rename to scripts/databases.sh
diff --git a/create_index.sh b/scripts/databases_pdb.sh
similarity index 73%
rename from create_index.sh
rename to scripts/databases_pdb.sh
index 166150a..5fafbd4 100755
--- a/create_index.sh
+++ b/scripts/databases_pdb.sh
@@ -6,12 +6,13 @@
 #SBATCH -o /g/arendt/npapadop/cluster/create_index.out
 #SBATCH -e /g/arendt/npapadop/cluster/create_index.err
 
+# build the indices for the sequence databases uniref30, PDB. This is later used for the
+# MSAs of the Spongilla sequences.
+
 cd /scratch/npapadop/database
 
 module load MMseqs2
 
-mmseqs tsv2exprofiledb "uniref30_2103" "uniref30_2103_db"
-mmseqs createindex "uniref30_2103_db" tmp1 --remove-tmp-files 1 --threads 64
 mmseqs databases PDB PDB tmp
 mmseqs createindex PDB tmp3 --remove-tmp-files 1 --threads 64
 
diff --git a/fs_afdb.sh b/scripts/fs_afdb.sh
similarity index 100%
rename from fs_afdb.sh
rename to scripts/fs_afdb.sh
diff --git a/fs_pdb.sh b/scripts/fs_pdb.sh
similarity index 100%
rename from fs_pdb.sh
rename to scripts/fs_pdb.sh
diff --git a/fs_query.sh b/scripts/fs_query.sh
similarity index 100%
rename from fs_query.sh
rename to scripts/fs_query.sh
diff --git a/fs_search_afdb.sh b/scripts/fs_search_afdb.sh
similarity index 100%
rename from fs_search_afdb.sh
rename to scripts/fs_search_afdb.sh
diff --git a/fs_search_pdb.sh b/scripts/fs_search_pdb.sh
similarity index 100%
rename from fs_search_pdb.sh
rename to scripts/fs_search_pdb.sh
diff --git a/fs_search_swissprot.sh b/scripts/fs_search_swissprot.sh
similarity index 100%
rename from fs_search_swissprot.sh
rename to scripts/fs_search_swissprot.sh
diff --git a/fs_sp.sh b/scripts/fs_sp.sh
similarity index 100%
rename from fs_sp.sh
rename to scripts/fs_sp.sh
diff --git a/milot.sh b/scripts/predict_structures.sh
similarity index 100%
rename from milot.sh
rename to scripts/predict_structures.sh
diff --git a/submit_colab.sh b/scripts/submit_colab.sh
similarity index 100%
rename from submit_colab.sh
rename to scripts/submit_colab.sh
diff --git a/spongilla_create_index.sh b/spongilla_create_index.sh
deleted file mode 100755
index e0e6061..0000000
--- a/spongilla_create_index.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash -ex
-
-SCRIPT="/g/arendt/npapadop/repos/spongfold/create_index.sh"
-OUTPUT_DIR="/scratch/npapadop/Spongilla_lacustris_70AAcutoffTransDecoder/"
-DB_NAME="spongilla"
-
-sbatch ${SCRIPT} ${OUTPUT_DIR} ${DB_NAME}
\ No newline at end of file
diff --git a/spongilla_mkdb.sh b/spongilla_mkdb.sh
deleted file mode 100755
index f441ea3..0000000
--- a/spongilla_mkdb.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash -ex
-
-SCRIPT="/g/arendt/npapadop/repos/spongfold/mkdb.sh"
-INPUT="/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep"
-OUTPUT_DIR="/scratch/npapadop/Spongilla_lacustris_70AAcutoffTransDecoder/"
-DB_NAME="spongilla"
-
-sbatch ${SCRIPT} ${INPUT} ${OUTPUT_DIR} ${DB_NAME}
\ No newline at end of file
diff --git a/spongilla_pdb70.sh b/spongilla_pdb70.sh
deleted file mode 100755
index 754ec40..0000000
--- a/spongilla_pdb70.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash -ex
-
-SCRIPT="/g/arendt/npapadop/repos/spongfold/align.sh"
-QUERYDB="/scratch/npapadop/Spongilla_lacustris_70AAcutoffTransDecoder/spongilla"
-TARGETDB="/scratch/npapadop/PDB70/PDB70"
-RESULTDB="/scratch/npapadop/spongilla_pdb70/result"
-TMP="/scratch/npapadop/spongilla_pdb70/tmp"
-
-sbatch ${SCRIPT} ${QUERYDB} ${TARGETDB} ${RESULTDB} ${TMP}
\ No newline at end of file
diff --git a/spongilla_uniref100.sh b/spongilla_uniref100.sh
deleted file mode 100755
index b18cd4a..0000000
--- a/spongilla_uniref100.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash -ex
-
-SCRIPT="/g/arendt/npapadop/repos/spongfold/align.sh"
-QUERY="/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep"
-DBBASE="/scratch/npapadop"
-BASE="/scratch/npapadop"
-DB1="Uniref100/Uniref100"
-FILTER="1"
-
-sbatch ${SCRIPT} ${QUERY} ${DBBASE} ${BASE} ${DB1} ${FILTER}
\ No newline at end of file
-- 
GitLab