fixed ModelArchive bugs (protein names/models, protein query length); added...

fixed ModelArchive bugs (protein names/models, protein query length); added AFDB self comparison as per editors' request

fixed ModelArchive bugs (protein names/models, protein query length); added...
fixed ModelArchive bugs (protein names/models, protein query length); added AFDB self comparison as per editors' request
9d321e6a · Nikolaos Papadopoulos · e0fada09 · 9d321e6a · 9d321e6a · 9d321e6a
Commit 9d321e6a authored 2 years ago by Nikolaos Papadopoulos
--- a/analysis/suppl-ModelArchive_metadata.ipynb
+++ b/analysis/suppl-ModelArchive_metadata.ipynb
--- a/analysis/suppl-ModelArchive_preparation.ipynb
+++ b/analysis/suppl-ModelArchive_preparation.ipynb
@@ -10,7 +10,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2022-06-08 14:02\n"
+      "2022-08-16 07:52\n"
     ]
    }
   ],

 %% Cell type:code id:e951dd81-8a76-45fb-a00d-8de54697bf0f tags:

 ``` python
 from datetime import datetime, timezone
 import pytz

 utc_dt = datetime.now(timezone.utc) # UTC time
 dt = utc_dt.astimezone()
 tz = pytz.timezone('Europe/Berlin')
 berlin_now = datetime.now(tz)

 print(f'{berlin_now:%Y-%m-%d %H:%M}')
 ```

 %% Output

-    2022-06-08 14:02
+    2022-08-16 07:52

 %% Cell type:markdown id:84d83807-5bdb-4fb7-8bcc-069c17f984a4 tags:

 In this script I will prepare the data for deposition in [ModelArchive](https://www.modelarchive.org). I will rename the PDB files from the best structures and the corresponding metadata (.json) files so that the isoform ID is in the file name where the MMseqs2 query ID would be instead. I will then archive them in tarballs and upload using

 %% Cell type:code id:3bebc329-4921-4890-baac-5f3718a687d9 tags:

 ``` python
 import os
 import shutil
 from tqdm import tqdm

 import numpy as np
 import pandas as pd
 ```

 %% Cell type:code id:a8ccaf2f-10dc-4ee6-9b80-60621401866b tags:

 ``` python
 best_models = '/g/arendt/npapadop/data/spongfold_publish/best_models/'
 metadata = '/g/arendt/npapadop/data/spongfold_publish/best_model_metadata/'
 ```

 %% Cell type:code id:baf77327-f8b6-4d69-bde3-6cae33764844 tags:

 ``` python
 named_models = '/g/arendt/npapadop/data/spongfold_publish/named_models/'
 named_metadata = '/g/arendt/npapadop/data/spongfold_publish/named_metadata/'
 ```

 %% Cell type:code id:0eb411cd-c982-4b74-a67c-30ed9fc724d0 tags:

 ``` python
 if not os.path.exists(named_models):
    os.mkdir(named_models)
 if not os.path.exists(named_metadata):
    os.mkdir(named_metadata)
 ```

 %% Cell type:code id:b8e1583a-c4f0-49f0-b170-fdbd421b40ff tags:

 ``` python
 structure_annotation = pd.read_parquet('../data/structure_annotation.parquet')
 structure_annotation.reset_index(inplace=True)
 ```

 %% Cell type:code id:18fb42d0-780e-4a6d-919c-b9913d9d174d tags:

 ``` python
 rename = {}
 for query, isoform in zip(structure_annotation['query'], structure_annotation['isoform']):
    rename[query] = isoform

 input_model = {}
 for path in os.listdir(best_models): # dir only contains PDB files
    query = path.split('_')[0]
    input_model[query] = best_models + path

 input_metadata = {}
 for path in os.listdir(metadata): # contains a3m and json files, so only keep the jsons:
    if path.split('.')[1] == 'json':
        query = path.split('_')[0]
        input_metadata[query] = metadata + path
 ```

 %% Cell type:markdown id:353e9492-9dbd-42e3-ba89-8abe6a23f24f tags:

 Test that metadata and models match:

 %% Cell type:code id:b072fafc-fa20-4b0a-8deb-0efc669498ed tags:

 ``` python
 np.intersect1d(list(input_metadata.keys()), list(input_model.keys())).shape, len(input_metadata)
 ```

 %% Output

    ((41932,), 41932)

 %% Cell type:markdown id:2735d460-68e7-4626-ac78-dd9b1dbdb01a tags:

 Test that all the predicted structures are in the list of annotated peptides:

 %% Cell type:code id:d29b226d-4e8a-4c01-84e0-00beac861338 tags:

 ``` python
 np.intersect1d(list(rename.keys()), list(input_model.keys())).shape
 ```

 %% Output

    (41932,)

 %% Cell type:markdown id:3c1c09bf-cce7-46f5-9b52-d707b397ca1a tags:

 This number is smaller than the total peptides in the proteome because eleven peptides were too long to get predicted structures:

 %% Cell type:code id:709832d8-76e2-4b7e-bbfe-0c3ac55c8520 tags:

 ``` python
 structure_annotation[structure_annotation['plddt'].isnull()]
 ```

 %% Output

           query        isoform  MSA size  query length              gene name  \
    1548   15002  c104658_g3_i5    3238.0          3658  c104658_g3_i5_m.80315
    2948   16435  c104956_g1_i1    5824.0          3548  c104956_g1_i1_m.84869
    4265   16441  c104958_g1_i1    7300.0          3094  c104958_g1_i1_m.84896
    8310   16304  c104922_g1_i2    3960.0          3866  c104922_g1_i2_m.84358
    16426  16491  c104973_g1_i2    4396.0          3868  c104973_g1_i2_m.85101
    17787  16186  c104894_g1_i3   20764.0          4624  c104894_g1_i3_m.83996
    20465  15933  c104839_g1_i3    2958.0          3028  c104839_g1_i3_m.83198
    25849  16094  c104872_g1_i1   13498.0          3716  c104872_g1_i1_m.83696
    31264  15931  c104839_g1_i1    3259.0          3032  c104839_g1_i1_m.83180
    38095  16431  c104954_g1_i2    8070.0          4832  c104954_g1_i2_m.84837
    39412  15226  c104704_g1_i1    2004.0          2923  c104704_g1_i1_m.81036
    
           protein_id     gene_id  has_start  has_end target  ...  Preferred_name  \
    1548        80315  c104658_g3       True     True   None  ...            None
    2948        84869  c104956_g1       True     True   None  ...            None
    4265        84896  c104958_g1      False     True   None  ...            None
    8310        84358  c104922_g1      False     True   None  ...            None
    16426       85101  c104973_g1      False     True   None  ...            None
    17787       83996  c104894_g1      False     True   None  ...            None
    20465       83198  c104839_g1      False     True   None  ...            None
    25849       83696  c104872_g1      False     True   None  ...            None
    31264       83180  c104839_g1      False     True   None  ...            None
    38095       84837  c104954_g1      False    False   None  ...            None
    39412       81036  c104704_g1       True     True   None  ...            None
    
            GOs  PFAMs  Entry name Gene names  Function [CC]  \
    1548   None   None        None       None           None
    2948   None   None        None       None           None
    4265   None   None        None       None           None
    8310   None   None        None       None           None
    16426  None   None        None       None           None
    17787  None   None        None       None           None
    20465  None   None        None       None           None
    25849  None   None        None       None           None
    31264  None   None        None       None           None
    38095  None   None        None       None           None
    39412  None   None        None       None           None
    
           Taxonomic lineage (PHYLUM) origin plddt complete_protein
    1548                         None   None   NaN             True
    2948                         None   None   NaN             True
    4265                         None   None   NaN            False
    8310                         None   None   NaN            False
    16426                        None   None   NaN            False
    17787                        None   None   NaN            False
    20465                        None   None   NaN            False
    25849                        None   None   NaN            False
    31264                        None   None   NaN            False
    38095                        None   None   NaN            False
    39412                        None   None   NaN             True
    
    [11 rows x 31 columns]

 %% Cell type:markdown id:028d7534-ee18-4bed-995c-3f4d708753f3 tags:

 Now copy each query protein and each metadata json to the adjacent folder while also renaming them to the isoform ID:

 %% Cell type:code id:b77909f1-0d02-495d-9777-3da71d54694e tags:

 ``` python
 for query in tqdm(input_model.keys()):
    isoform = rename[int(query)]
    output_model = input_model[str(query)].replace(query, isoform).replace(best_models, named_models)
    output_metadata = input_metadata[str(query)].replace(query, isoform).replace(metadata, named_metadata)
    shutil.copy(input_model[query], output_model)
    shutil.copy(input_metadata[query], output_metadata)
 ```

 %% Output

    100%|██████████| 41932/41932 [10:10<00:00, 68.66it/s]

 %% Cell type:code id:0a79e5f0-0767-4e91-a706-b8afe46947ec tags:

 ``` python
 %%bash
 cd /g/arendt/npapadop/data/spongfold_publish/
 tar cvzf 2022-06-08_Spongilla_lacustris_PDB.tar.gz named_models/
 tar cvzf 2022-06-08_Spongilla_lacustris_json.tar.gz named_metadata/
 ```

 %% Output

    IOPub data rate exceeded.
    The notebook server will temporarily stop sending output
    to the client in order to avoid crashing it.
    To change this limit, set the config variable
    `--NotebookApp.iopub_data_rate_limit`.
    
    Current values:
    NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
    NotebookApp.rate_limit_window=3.0 (secs)
    

--- a/analysis/suppl-compare_seq_profile.ipynb
+++ b/analysis/suppl-compare_seq_profile.ipynb
--- a/scripts/fs_self_search_afdb.sh
+++ b/scripts/fs_self_search_afdb.sh
+#!/bin/bash -ex
+#SBATCH -J fs_self_afdb
+#SBATCH -t 10:00:00
+#SBATCH -c 32
+#SBATCH -N 1
+#SBATCH --mem=128G
+#SBATCH -o /g/arendt/npapadop/cluster/fs_self.out
+#SBATCH -e /g/arendt/npapadop/cluster/fs_self.err
+
+module load GCC
+module load bzip2
+
+FOLDSEEK="/g/arendt/npapadop/repos/foldseek/build/bin/foldseek"
+QUERYDB="/scratch/npapadop/foldseek_target/afdb"
+TARGETDB="/scratch/npapadop/foldseek_target/afdb"
+ALIGNMENT="/scratch/npapadop/foldseek_results/self/self"
+
+"$FOLDSEEK" search "$QUERYDB" "$TARGETDB" "$ALIGNMENT" tmpFolder -a --threads 16
+"$FOLDSEEK" aln2tmscore "$QUERYDB" "$TARGETDB" "$ALIGNMENT" "$ALIGNMENT"_aln_tmscore
+"$FOLDSEEK" createtsv "$QUERYDB" "$TARGETDB" "$ALIGNMENT"_aln_tmscore "$ALIGNMENT"_aln_tmscore.tsv
+"$FOLDSEEK" convertalis "$QUERYDB" "$TARGETDB" "$ALIGNMENT" "$ALIGNMENT"_score.tsv
+
+module unload