diff --git a/align.sh b/align.sh new file mode 100755 index 0000000000000000000000000000000000000000..b6c8228396449336bff35740b17825da12af6082 --- /dev/null +++ b/align.sh @@ -0,0 +1,68 @@ +#!/bin/bash -ex +#SBATCH -J align +#SBATCH -t 10:00:00 +#SBATCH -c 128 +#SBATCH --mem=200G +#SBATCH -o /g/arendt/npapadop/cluster/align_%j.out +#SBATCH -e /g/arendt/npapadop/cluster/align_%j.err + +# a heavily modified version of the colabfold_search.sh script from https://github.com/sokrypton/ColabFold/ +# briefly: +# - we replace the MMseqs2 binary with the version in the module system +# - replace variables with static files and locations +# - replace DB2; initially BFD/Mgnify, now PDB70 +# - remove DB3; initially ColabfoldDB +# - keep query database (search against UniRef30), since we are interested in the sequence alignments + +module load MMseqs2 + +export OMP_NUM_THREADS=128 +# MMSEQS="$1" +QUERY="/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep" +DBBASE="/scratch/npapadop/database" +BASE="/scratch/npapadop/result" +DB1="uniref30_2103_db" +DB2="PDB" +# DB3="$7" +FILTER="1" +THREADS="128" +SENSITIVITY=8 +EXPAND_EVAL=inf +ALIGN_EVAL=10 +DIFF=3000 +QSC=-20.0 +MAX_ACCEPT=1000000 +if [ "${FILTER}" = "1" ]; then +# 0.1 was not used in benchmarks due to POSIX shell bug in line above +# EXPAND_EVAL=0.1 + ALIGN_EVAL=10 + QSC=0.8 + MAX_ACCEPT=100000 +fi +export MMSEQS_CALL_DEPTH=1 +SEARCH_PARAM="--num-iterations 3 --db-load-mode 2 -a -s ${SENSITIVITY} -e 0.1 --max-seqs 10000" +FILTER_PARAM="--filter-msa ${FILTER} --filter-min-enable 1000 --diff ${DIFF} --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95" +EXPAND_PARAM="--expansion-mode 0 -e ${EXPAND_EVAL} --expand-filter-clusters ${FILTER} --max-seq-id 0.95" +mkdir -p "${BASE}" +mmseqs createdb "${QUERY}" "${BASE}/qdb" +mmseqs search "${BASE}/qdb" "${DBBASE}/${DB1}" "${BASE}/res" "${BASE}/tmp" $SEARCH_PARAM +mmseqs expandaln "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp" --db-load-mode 2 --threads ${THREADS} ${EXPAND_PARAM} +mmseqs mvdb "${BASE}/tmp/latest/profile_1" "${BASE}/prof_res" +mmseqs lndb "${BASE}/qdb_h" "${BASE}/prof_res_h" +mmseqs align "${BASE}/prof_res" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp" "${BASE}/res_exp_realign" --db-load-mode 2 --threads ${THREADS} -e ${ALIGN_EVAL} --max-accept ${MAX_ACCEPT} --alt-ali 10 -a +mmseqs filterresult "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp_realign" "${BASE}/res_exp_realign_filter" --db-load-mode 2 --threads ${THREADS} --qid 0 --qsc $QSC --diff 0 --max-seq-id 1.0 --filter-min-enable 100 +mmseqs result2msa "${BASE}/qdb" "${DBBASE}/${DB1}.idx" "${BASE}/res_exp_realign_filter" "${BASE}/uniref.a3m" --msa-format-mode 6 --db-load-mode 2 --threads ${THREADS} ${FILTER_PARAM} +mmseqs rmdb "${BASE}/res_exp_realign" +mmseqs rmdb "${BASE}/res_exp" +mmseqs rmdb "${BASE}/res" +mmseqs rmdb "${BASE}/res_exp_realign_filter" +# align profile database against PDB +mmseqs search "${BASE}/prof_res" "${DBBASE}/${DB2}" "${BASE}/res_pdb" "${BASE}/tmp" --db-load-mode 2 --threads ${THREADS} -s 7.5 -a -e 0.1 +mmseqs convertalis "${BASE}/prof_res" "${DBBASE}/${DB2}.idx" "${BASE}/res_pdb" "${BASE}/${DB2}.m8" --threads ${THREADS} --format-output query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,cigar --db-load-mode 2 +mmseqs rmdb "${BASE}/res_pdb" + +mmseqs rmdb "${BASE}/qdb" +mmseqs rmdb "${BASE}/qdb_h" +mmseqs rmdb "${BASE}/res" +rm -f -- "${BASE}/prof_res"* +rm -rf -- "${BASE}/tmp" diff --git a/create_index.sh b/create_index.sh index 572849a52ed360bc86ffdef430e420f621900c3a..f5f81b4d9f51aa6f74ed7d18463cfd39cfd75b77 100755 --- a/create_index.sh +++ b/create_index.sh @@ -1,18 +1,18 @@ #!/bin/bash -ex #SBATCH -J create_index -#SBATCH -t 02:00:00 -#SBATCH -c 12 -#SBATCH --mem=50G +#SBATCH -t 08:00:00 +#SBATCH -c 64 +#SBATCH --mem=200G #SBATCH -o /g/arendt/npapadop/cluster/create_index.out #SBATCH -e /g/arendt/npapadop/cluster/create_index.err -OUTPUT_DIR="$1" -DATABASE="$2" - -cd ${OUTPUT_DIR} +cd /scratch/npapadop/database module load MMseqs2 -mmseqs createindex ${DATABASE} tmp --threads 12 --remove-tmp-files 1 +# mmseqs tsv2exprofiledb "uniref30_2103" "uniref30_2103_db" +# mmseqs createindex "uniref30_2103_db" tmp1 --remove-tmp-files 1 --threads 64 +mmseqs databases PDB PDB tmp +mmseqs createindex PDB tmp3 --remove-tmp-files 1 --threads 64 module unload MMseqs2 \ No newline at end of file diff --git a/spongilla_pdb70.sh b/spongilla_pdb70.sh new file mode 100755 index 0000000000000000000000000000000000000000..754ec40fdd3d2d3bdfa31ed6dea7ec43bba2ee0d --- /dev/null +++ b/spongilla_pdb70.sh @@ -0,0 +1,9 @@ +#!/bin/bash -ex + +SCRIPT="/g/arendt/npapadop/repos/spongfold/align.sh" +QUERYDB="/scratch/npapadop/Spongilla_lacustris_70AAcutoffTransDecoder/spongilla" +TARGETDB="/scratch/npapadop/PDB70/PDB70" +RESULTDB="/scratch/npapadop/spongilla_pdb70/result" +TMP="/scratch/npapadop/spongilla_pdb70/tmp" + +sbatch ${SCRIPT} ${QUERYDB} ${TARGETDB} ${RESULTDB} ${TMP} \ No newline at end of file diff --git a/spongilla_uniref100.sh b/spongilla_uniref100.sh new file mode 100755 index 0000000000000000000000000000000000000000..b18cd4ad20808674288cd9fd57d3f90993a42edb --- /dev/null +++ b/spongilla_uniref100.sh @@ -0,0 +1,10 @@ +#!/bin/bash -ex + +SCRIPT="/g/arendt/npapadop/repos/spongfold/align.sh" +QUERY="/g/arendt/data/spongilla_singlecell_dataset/spongilla_lacustris_Trinity.fasta.transdecoder_70AA_mediumheader.pep" +DBBASE="/scratch/npapadop" +BASE="/scratch/npapadop" +DB1="Uniref100/Uniref100" +FILTER="1" + +sbatch ${SCRIPT} ${QUERY} ${DBBASE} ${BASE} ${DB1} ${FILTER} \ No newline at end of file