migrating read/write to scripts and out of notebook

f14f1db2 · Nikolaos Papadopoulos · 62a76855 · f14f1db2 · f14f1db2 · f14f1db2
Commit f14f1db2 authored 2 years ago by Nikolaos Papadopoulos
--- a/analysis/Untitled.ipynb
+++ b/analysis/Untitled.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "814d8de9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-07-27 13:53\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "import pytz\n",
+    "\n",
+    "utc_dt = datetime.now(timezone.utc) # UTC time\n",
+    "dt = utc_dt.astimezone()\n",
+    "tz = pytz.timezone('Europe/Berlin')\n",
+    "berlin_now = datetime.now(tz)\n",
+    "print(f'{berlin_now:%Y-%m-%d %H:%M}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3808ea96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "from os.path import exists\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ad6f9837",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "structural_annotation = pd.read_parquet('../old_data/structure_annotation.parquet')\n",
+    "sequence_annotation = pd.read_csv('../old_data/Slacustris_eggnog.tsv', sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1a0df0c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "just_mmseqs = pd.read_csv('../data/cfres_mmseqs_s75_e1.m8', sep=\"\\s+\", header=None)\n",
+    "just_mmseqs.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
+    "                       \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
+    "                       \"e value\", \"bit score\"]\n",
+    "\n",
+    "just_mmseqs['gene_id'] = just_mmseqs['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))\n",
+    "just_mmseqs['normalized bit score'] = just_mmseqs['bit score'] / just_mmseqs['alignment length']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b09c96b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def best_bit_score(df, sort_by='normalized bit score', tiebreak='alignment length'):\n",
+    "    have_max = df[sort_by] == np.max(df[sort_by])\n",
+    "    max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])\n",
+    "    return df[have_max][max_ali].index.values[0]\n",
+    "\n",
+    "def keep_best(df, groupby='gene_id'):\n",
+    "    df.reset_index(inplace=True)\n",
+    "    idx = df.groupby(groupby).apply(best_bit_score)\n",
+    "    res = df.loc[idx].copy()\n",
+    "    return res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ff6f8bf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "just_mmseqs_filtered = keep_best(just_mmseqs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "af68258b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hhblits = pd.read_csv('../data/cfres_hhblits_e1.m8', sep=\"\\s+\", header=None)\n",
+    "hhblits.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
+    "                       \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
+    "                       \"e value\", \"bit score\"]\n",
+    "\n",
+    "hhblits['gene_id'] = hhblits['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))\n",
+    "hhblits['normalized bit score'] = hhblits['bit score'] / hhblits['alignment length']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "5380700c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hhblits_filtered = keep_best(hhblits)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "2d829347",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',\n",
+       "       'Description', 'Preferred_name', 'GOs', 'PFAMs', 'gene_id',\n",
+       "       'protein_id'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sequence_annotation.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b72335fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "structural_annotation['normalized bit score'] = structural_annotation['bit score'] / structural_annotation['alignment length']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "e7595d52",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<matplotlib.legend.Legend at 0x7fdd93502580>"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAZdUlEQVR4nO3df3BV5b3v8feXgAUUxJLcthJs0jOIAgkBg3oLF+GgGNCRudoZRUFOW0qZi0due0XwtMcWOu30jJxbD1WhDKWWuVSoSCn3CtpSsIK1hx+6G34oGmlKQlqNoShyoPzwe//ITs4m7JCdZO2svdf+vGYyZK317LW/e0/45Mmzn/Usc3dERCT7dQu7ABERCYYCXUQkIhToIiIRoUAXEYkIBbqISER0D+uJ8/PzvaioKKynFxHJSnv27Hnf3QuSHQst0IuKiti9e3dYTy8ikpXM7E+tHdOQi4hIRCjQRUQiQoEuIhIRoY2hi0juOHPmDLW1tZw6dSrsUrJGz549KSwspEePHik/RoEuImlXW1tLnz59KCoqwszCLifjuTsNDQ3U1tZSXFyc8uM05CIiaXfq1Cn69++vME+RmdG/f/92/0WjQBeRLqEwb5+OvF8KdBGRiNAYuoh0uS0H3g30fDcP+VSg58tW6qFfzMHNYVcgIpIyBbqI5ITq6mquueYaZs6cybBhw7jvvvvYsmULo0ePZtCgQezcuZNvf/vbzJgxg4kTJ1JUVMT69et5+OGHKSkpoaKigjNnzgCwYMEChgwZQmlpKQ899BAA9fX13HXXXYwaNYpRo0bxyiuvANDQ0MDEiRMZMWIEX/3qV/nsZz/L+++/z4kTJ7jtttsYPnw4w4YNY+3atZ1+jQp0EckZVVVVzJ07l8rKSt58801+9rOfsWPHDhYvXsz3vvc9AN555x2ef/55fvnLXzJt2jTGjx/P3r176dWrF88//zxHjx7lF7/4Bfv376eyspJvfvObAMydO5evfe1r7Nq1i+eee46ZM2cCsHDhQsaMGcPrr7/OHXfcweHDhwF44YUXuPLKK/nDH/7Avn37qKio6PTrU6CLSM4oLi6mpKSEbt26MXToUCZMmICZUVJSQnV1NQCTJk2iR48elJSUcO7cueagbWrTt29fevbsycyZM1m/fj29e/cGYMuWLTzwwAOUlZVxxx138OGHH3L8+HFefvllpk2bBsBtt93GFVdc0Xy+LVu2MH/+fLZv387ll1/e6denQL+IWM2xsEsQkQB94hOfaP6+W7duzdvdunXj7Nmz57Xp1q0bPXr0aJ4+2NSme/fu7Ny5k7vuuosNGzY0B/7HH3/Mq6++SiwWIxaLceTIEfr06QMkn4J49dVXs2fPHkpKSnjkkUdYtGhRp1+fAr01+kBURJL46KOP+OCDD5g8eTKPP/44sVgMgIkTJ/LEE080t2vaP3bsWFavXg3A5s2b+etf/wpAXV0dvXv3Ztq0aTz00EO89tprna5N0xZFpMtl8zTD48ePM2XKFE6dOoW784Mf/ACAJUuWMGfOHEpLSzl79ixjx45l2bJlfOtb32Lq1KmMHDmSm266iauuugqAvXv3Mm/evOa/BJYuXdrp2szdO32SjigvL/eMvsHFwc3Eao5RdvPUsCsRyXpvvPEG1157bdhlZISmm/vk5+e32TbZ+2Zme9y9PFn7NodczGylmb1nZvtaOX6fmVXGv35nZsPbrFJERAKXyhj608DF5tP8EbjJ3UuB7wDLA6hLRCSSqqurU+qdd0SbY+ju/rKZFV3k+O8SNn8PFAZQl4iItFPQs1y+DLQ6PcTMZpnZbjPbXV9fH/BTB0tTFkUk2wQW6GY2nsZAn99aG3df7u7l7l5eUFAQ1FN3maAXFBIRCVIg0xbNrBRYAUxy94YgzikiIu3T6UA3s6uA9cB0d3+r8yVlrvy6rTBE0xhFOi3oC/cGT2qzSXV1Nbfffjv79p0/YW/cuHEsXryY8vLzZwI+/fTT7N69+7yLhZpcdtllfPTRR9TV1fHggw+ybt06YrEYdXV1TJ48uXOvpRNSmbb4DPAqMNjMas3sy2Y228xmx5s8CvQHnjKzmJll8OTyTtCVoyLSwpVXXsm6deuAxitDN23aFGo9bQa6u09198+4ew93L3T3H7v7MndfFj8+092vcPey+FfSCe/Z5LyxcgW5SGScO3eOr3zlKwwdOpSJEydy8uRJAJ599lmuv/56rr76arZv397cvqamhoqKCgYPHszChQsvOF91dTXDhg3j9OnTPProo6xdu5aysjLWrl3Lb3/7W8rKyigrK2PEiBEcP3487a9Pa7k0aQrug5sbh1biNNtFJDrefvtt5syZw/79++nXrx/PPfccAGfPnmXnzp08/vjj5wX3zp07Wb16NbFYjGeffZbWrm6/5JJLWLRoEXfffTexWIy7776bxYsX8+STTxKLxdi+fTu9evVK++tToItIziguLqasrAyA6667rnnJ3DvvvPOCfQC33HIL/fv3p1evXtx5553s2LEj5ecaPXo0X//611myZAnHjh2je/f0L52lQBeRnJG4fG5eXt4FS+Ym7oMLl71NtgxuaxYsWMCKFSs4efIkN954I2+++WZnSk+JAl1EpBW//vWvOXr0KCdPnmTDhg2MHj261bZ9+vQ5b5z8nXfeoaSkhPnz51NeXt4lga7lc0Wk66UwzTATjBkzhunTp1NVVcW99957wdTGROPHj+f73/8+ZWVlPPLII+zYsYNt27aRl5fHkCFDmDQp/a9Zy+c2Obi58YcsvmxuorKbp2o5XZFO0PK5HRP48rnSSLNdRCTTKdBFRCJCgZ5IFxGJSBZToIuIRIQCXUQkIhToIiIRoXno7bTlwLvcPORTYZchktVeqnkp0PONGzgusHPV19dz++23c/r0aZYsWcJf/vIXHn30UT796U+zbdu2wJ4nHRToCWI1xygb2C/sMkQkRL/5zW+45ppr+OlPfwpARUUFTz31FOPHjw+5srZpyEVEcsKqVasoLS1l+PDhTJ8+nT/96U9MmDCB0tJSJkyYwOHDh4nFYjz88MNs2rSJsrIyFi5cyI4dO5g9ezbz5s3j3LlzzJs3j1GjRlFaWsqPfvSjsF/WedRD74imq0pFJCvs37+f7373u7zyyivk5+dz9OhRZsyYwf3338+MGTNYuXIlDz74IBs2bGDRokXn3alo27ZtzXc0Wr58OZdffjm7du3ib3/7G6NHj2bixIkUFxeH/AobqYcepytBRaJr69atfOELXyA/Px+AT37yk7z66qvce++9AEyfPj2lpXF/9atfsWrVKsrKyrjhhhtoaGjg7bffTmvt7aEeejvl120FjbOLZBV3b3Pp21SWxnV3fvjDH3LrrbcGVVqg1EMXkcibMGECP//5z2loaADg6NGjfP7zn2fNmjUArF69mjFjxrR5nltvvZWlS5dy5swZAN566y1OnDiRvsLbST30FMS2PBN2CSKREuQ0w1QMHTqUb3zjG9x0003k5eUxYsQIlixZwpe+9CUee+wxCgoK+MlPftLmeWbOnEl1dTUjR47E3SkoKGDDhg3pfwEp0vK5cU2hXTawX5vj6WUD++lDUZF20PK5HaPlc0VEcpQCXUQkIhToItIlwhrezVYdeb/aDHQzW2lm75nZvlaOm5ktMbMqM6s0s5HtriKLbTnwbtgliGS8nj170tDQoFBPkbvT0NBAz5492/W4VGa5PA08Aaxq5fgkYFD86wZgafxfEREACgsLqa2tpb6+PuxSskbPnj0pLCxs12PaDHR3f9nMii7SZAqwyht/9f7ezPqZ2Wfc/c/tqkREIqtHjx4Zc3l8lAUxhj4AqEnYro3vu4CZzTKz3Wa2O1N/U2sJABHJVkEEerLrZZMOlLn7cncvd/fygoKCAJ5aRESaBBHotcDAhO1CoC6A83aZznywmV+3NcBKREQ6LohA3wjcH5/tciPwQc6Mnx/cHHYFIiLN2vxQ1MyeAcYB+WZWC3wL6AHg7suATcBkoAr4D+CL6So2LQ5uJr/uWNhViIh0WiqzXKa2cdyBOYFVJCIiHaIrRUVEIkKB3gma4igimUSB3gEKchHJRAp0EZGIUKCLiESEAl1EJCJyPtA1Hi4iUZHzgS4iEhUKdBGRiFCgB0FruohIBlCgd5BuPScimUaBLiISEQp0EZGIUKCLiESEAl1EJCIU6CIiEaFAD4CuNhWRTKBAFxGJCAW6iEhE5Hagd+IKz/y6rQEWIiLSebkd6AHSlaMiEjYFuohIROR0oGt2iohESUqBbmYVZnbQzKrMbEGS45eb2f81sz+Y2X4z+2LwpYqIyMW0Gehmlgc8CUwChgBTzWxIi2ZzgAPuPhwYB/yrmV0ScK0iInIRqfTQrweq3P2Qu58G1gBTWrRxoI+ZGXAZcBQ4G2ilIiJyUakE+gCgJmG7Nr4v0RPAtUAdsBeY6+4fB1JhmmhWiohETSqBbkn2eYvtW4EYcCVQBjxhZn0vOJHZLDPbbWa76+vr21lqsDSPXESiJpVArwUGJmwX0tgTT/RFYL03qgL+CFzT8kTuvtzdy929vKCgoKM1Zy7dik5EQpRKoO8CBplZcfyDznuAjS3aHAYmAJjZp4DBwKEgCxURkYvr3lYDdz9rZg8ALwJ5wEp3329ms+PHlwHfAZ42s700DtHMd/f301h3xsmv2woD+4VdhojksDYDHcDdNwGbWuxblvB9HTAx2NJERKQ9cvpK0aDpylMRCZMCXUQkIhToIiIRoUBPB01fFJEQ5GSg6ypREYminAx0EZEoUqAHTL1/EQmLAl1EJCIU6CIiEaFAFxGJCAW6iEhEKNBFRCJCgS4iEhEKdBGRiFCgi4hEhAJdRCQiFOgiIhGRc4G+5cC7jbeLSyPd6EJEwpBzgS4iElUKdBGRiFCgi4hEhAI9YOkenxcRaY0CXUQkIhToIiIRkVKgm1mFmR00syozW9BKm3FmFjOz/Wb222DLDI6GREQkqrq31cDM8oAngVuAWmCXmW109wMJbfoBTwEV7n7YzP5LmuoVEZFWpNJDvx6ocvdD7n4aWANMadHmXmC9ux8GcPf3gi0z++jeoiLS1VIJ9AFATcJ2bXxfoquBK8zsJTPbY2b3JzuRmc0ys91mtru+vr5jFYuISFKpBLol2ecttrsD1wG3AbcC/2xmV1/wIPfl7l7u7uUFBQXtLrbTDm7usqfSWL2IdLU2x9Bp7JEPTNguBOqStHnf3U8AJ8zsZWA48FYgVYqISJtS6aHvAgaZWbGZXQLcA2xs0eaXwH8zs+5m1hu4AXgj2FJFRORi2uyhu/tZM3sAeBHIA1a6+34zmx0/vszd3zCzF4BK4GNghbvvS2fhIiJyvlSGXHD3TcCmFvuWtdh+DHgsuNKCp2VtRSTKdKWoiEhEKNBFRCJCgZ5uXThVUkRymwI9nQ5u1ri9iHQZBbqISEQo0NNIvXMR6UoKdBGRiEhpHnqUvXby7aT7R/Ya1MWViIh0jnroIiIRoUAXEYmInBxyaW2YRUQkm6mHLiISEQr0rqCrRUWkCyjQRUQiQoEuIhIRCvQuoCtGRaQrKNBFRCIiZ6YtbjnwLvntaJ84tVFXjYpINlAPXUQkIhToXUnTF0UkjXIm0PPrtoZdgohIWuVMoIuIRJ0CvatouEVE0ixnZrlkwoJcsZpjlA0OuwoRiaqUeuhmVmFmB82syswWXKTdKDM7Z2ZfCK7EAKh3LCI5oM0eupnlAU8CtwC1wC4z2+juB5K0+xfgxXQUGqYg5qTralERSbdUeujXA1XufsjdTwNrgClJ2v0j8BzwXoD1BUJhKiK5IJVAHwDUJGzXxvc1M7MBwH8Hll3sRGY2y8x2m9nu+vr69tYqIiIXkUqgW5J93mL7cWC+u5+72Incfbm7l7t7eUFBQYoliohIKlKZ5VILDEzYLgTqWrQpB9aYGUA+MNnMzrr7hiCKFBGRtqXSQ98FDDKzYjO7BLgH2JjYwN2L3b3I3YuAdcD/UJgnt+XAu2GXICIR1WYP3d3PmtkDNM5eyQNWuvt+M5sdP37RcXMREekaKV1Y5O6bgE0t9iUNcnf/h86XFXEHN8PgSWFXISIRo0v/Q6BplCKSDpEP9NiWZ8IuQUSkS0Q+0EVEcoUCXUQkIhToIiIRkTPL5walswt16c5JIpIu6qGHRUv6ikjAFOgiIhER6SGXl2pe4lAG3KlIRKQrqIcuIhIRCnQRkYiI9JBLrOYYfdN4/iBuTSciEhT10EVEIiLSgd63oTLsElqlBbpEJGiRDnQRkVyiQBcRiYjIBno23OotG2oUkewR2UDPFgp1EQmKAl1EJCIU6AF57eTbzV/tkbj6onrrItIZkb2wKL9uK4fDLiJVzSsvjgy1DBHJbuqhZwDNSReRICjQM4RCXUQ6K6VAN7MKMztoZlVmtiDJ8fvMrDL+9TszGx58qdkj1fF03b1IRILUZqCbWR7wJDAJGAJMNbMhLZr9EbjJ3UuB7wDLgy40FyjgRaQzUumhXw9Uufshdz8NrAGmJDZw99+5+1/jm78HCoMtU0RE2pLKLJcBQE3Cdi1ww0XafxlIesNMM5sFzAK46qqrUiyxfV6qeQkgY+5U1KEldg9uhsGT0lSRiERVKj10S7LPkzY0G09joM9Pdtzdl7t7ubuXFxQUpF5lDtFcdBHpqFQCvRYYmLBdCNS1bGRmpcAKYIq7NwRTXm7SjBcR6YhUhlx2AYPMrBg4AtwD3JvYwMyuAtYD0939rcCrjIjWZr3obkciEoQ2A93dz5rZA8CLQB6w0t33m9ns+PFlwKNAf+ApMwM46+7l6StbRERaSunSf3ffBGxqsW9ZwvczgZnBlpY7zvvgNGEwq+kDXoBxA8d1WT0ikp0iu5ZLtmoK92M1L523P5VwT2zTUuJjWmunXxoi2S2SgR6rOUbfsItIo4703C8W9kG0b+0XRnt/+eiXikjHRTLQc0l7gzdd50rlsUHWKiIXilyga8pfZlGIi3SdyK222LehMuwSRERCEbkeOkQn1A/FXuZzZWPDLqNLpTK2rtk/IslFMtAj4ciesCvIKK0FvcJd5D8p0DPUofoTYZeQ1RT0kosU6BJ5qc7PF8l2WR/omkUhTfSzILku6wM98o7sgQHXhV1FTtAwjWS7aAV6BD9IPFR/gs+hUO9qCnfJRtEK9Ig6VH+CDz8+RtnAfmGXEjmducJVQS+ZRoGeJfo2VEK3S9VTz1Dq0UsmiFSgR32q36H6E3xuQNhVSBPNjZdME6lAzwWxGg29ZBOFu3SlyAT6odjLYZfQJfo2VMLA3FoOIIo0Li/pEJlAzyXqpWcnfQAr6aZAF8kyGsaR1ijQs1CyGS/qtUebPoCVVEQj0CN4QVG7HNkD/F3YVUjIFO6S9YEeqzlG34ZoT1dMpuliI4CyyN2mRDorlfF6hX70ZH2gR+VmFh3Rt6GSD/uXNn/PwLEaepGUXaxHrw9ns1PWB7okiA+95OKdjqRzUl2pUkGf2VIKdDOrAP4NyANWuPv3Wxy3+PHJwH8A/+DurwVcazMtk/qf+jZUQsGlQPxK2f6N+5tulq3eunSF1nr77d0vndNmoJtZHvAkcAtQC+wys43ufiCh2SRgUPzrBmBp/N+0idUco6zbO+l8iqyRbMmD5qGobpcS+/jvGoP9yJ7Gsff4MI3CXtIhlRk5qexX0LdfKj3064Eqdz8EYGZrgClAYqBPAVa5uwO/N7N+ZvYZd/9z4BXH9W2o5FC6Tp7FWn6mcKj+BH2p5FDDhW0ONcCH/UsV7JKRgvpLvLW/DFprk81SCfQBQE3Cdi0X9r6TtRkAnBfoZjYLmBXf/MjMDrarWsgH3m/nY3KB3pfk9L5cSO9Jctn0vny2tQOpBLol2ecdaIO7LweWp/CcyQsx2+3u5R19fFTpfUlO78uF9J4kF5X3JZUZzLXAwITtQqCuA21ERCSNUgn0XcAgMys2s0uAe4CNLdpsBO63RjcCH6Rz/FxERC7U5pCLu581sweAF2mctrjS3feb2ez48WXAJhqnLFbROG3xi2mqt8PDNRGn9yU5vS8X0nuSXCTeF2ucmCIiItlOq4CIiESEAl1EJCKyJtDNrMLMDppZlZktCLuesJnZQDPbZmZvmNl+M5sbdk2ZxMzyzOx1M/t/YdeSKeIX/K0zszfjPzf/NeyawmZmX4v//9lnZs+YWc+wa+qMrAj0hOUHJgFDgKlmNiTcqkJ3Fvhf7n4tcCMwR+/JeeYCb4RdRIb5N+AFd78GGE6Ovz9mNgB4ECh392E0Tvq4J9yqOicrAp2E5Qfc/TTQtPxAznL3PzctgObux2n8zzkg3Koyg5kVArcBK8KuJVOYWV9gLPBjAHc/7e7HQi0qM3QHeplZd6A3WX79TLYEemtLCwhgZkXACODfQy4lUzwOPAx8HHIdmeRzQD3wk/hQ1AozuzTsosLk7keAxcBhGpcp+cDdfxVuVZ2TLYGe0tICucjMLgOeA/6nu38Ydj1hM7PbgffcPcfvS3iB7sBIYKm7jwBOADn9WZSZXUHjX/rFwJXApWY2LdyqOidbAl1LCyRhZj1oDPPV7r4+7HoyxGjgDjOrpnFo7u/N7P+EW1JGqAVq3b3pr7h1NAZ8LrsZ+KO717v7GWA98PmQa+qUbAn0VJYfyCnxm4r8GHjD3f932PVkCnd/xN0L3b2Ixp+Tre6e1b2uILj7X4AaMxsc3zWB85fAzkWHgRvNrHf8/9MEsvyD4qy4BV1ryw+EXFbYRgPTgb1mFovv+yd33xReSZLh/hFYHe8UHSJ9S3RkBXf/dzNbB7xG46yx18nyJQB06b+ISERky5CLiIi0QYEuIhIRCnQRkYhQoIuIRIQCXUQkIhToIiIRoUAXEYmI/w9iflt0knsXdQAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 432x288 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, ax = plt.subplots()\n",
+    "ax.hist(just_mmseqs_filtered['normalized bit score'], bins=100, label='mmseqs', alpha=0.3, density=True);\n",
+    "ax.hist(hhblits_filtered['normalized bit score'], bins=100, label='hhblits', alpha=0.3, density=True);\n",
+    "ax.hist(structural_annotation['normalized bit score'], bins=100, label='coffe', alpha=0.3, density=True);\n",
+    "ax.legend()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "efdcc8b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unique_up_id = pd.concat([hhblits_filtered['target'].drop_duplicates(),\n",
+    "                          just_mmseqs_filtered['target'].drop_duplicates()])\n",
+    "unique_up_id.drop_duplicates(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "e1fcdc85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(f'../data/milot_unique_ids.txt', 'w') as tmp:\n",
+    "    ids_as_str = unique_up_id.astype(str).values\n",
+    "    tmp.write(','.join(ids_as_str))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ec480a86",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-block alert-warning\"><b>DISCLAIMER:</b> To make this cell execute, you may need to copy the contents of the UPIMAPI/resource folder into the main repository directory.</div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "87d494b2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-07-27 11:48:46: ID mapping has begun.\n",
+      "Auto determined \"full id\" as: False\n",
+      "../data/uniprotinfo.fasta not found. Will perform mapping for all IDs.\n",
+      "Checking which IDs are missing information.\n",
+      "Checking which IDs are missing information.: 100%|█| 24540/24540 [00:00<00:00, 3\n",
+      "Information already gathered for 0 ids. Still missing for 24540.\n",
+      "Building FASTA from 24540 IDs.\n",
+      "UniProt ID mapping:   0%|                                 | 0/3 [00:00<?, ?it/s]\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/g/arendt/npapadop/repos/UPIMAPI/upimapi.py\", line 1130, in <module>\n",
+      "    upimapi()\n",
+      "  File \"/g/arendt/npapadop/repos/UPIMAPI/upimapi.py\", line 1125, in upimapi\n",
+      "    uniprot_fasta_workflow(ids, f'{args.output}/uniprotinfo.fasta', step=args.step, sleep_time=args.sleep)\n",
+      "  File \"/g/arendt/npapadop/repos/UPIMAPI/upimapi.py\", line 309, in uniprot_fasta_workflow\n",
+      "    uniprotinfo = get_uniprot_fasta(ids_missing, step=step, sleep_time=sleep_time)\n",
+      "  File \"/g/arendt/npapadop/repos/UPIMAPI/upimapi.py\", line 285, in get_uniprot_fasta\n",
+      "    data = uniprot_request(ids[i:j], original_database='ACC+ID', database_destination='', output_format='fasta')\n",
+      "  File \"/g/arendt/npapadop/repos/UPIMAPI/upimapi.py\", line 219, in uniprot_request\n",
+      "    'columns': string4mapping(columns=columns, databases=databases)\n",
+      "  File \"/g/arendt/npapadop/repos/UPIMAPI/upimapi.py\", line 176, in string4mapping\n",
+      "    result = ','.join([columns_dict[column] for column in columns])\n",
+      "  File \"/g/arendt/npapadop/repos/UPIMAPI/upimapi.py\", line 176, in <listcomp>\n",
+      "    result = ','.join([columns_dict[column] for column in columns])\n",
+      "KeyError: 'Entry'\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/milot_unique_ids.txt -o ../data/ --fasta --no-annotation"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:code id:814d8de9 tags:
+
+``` python
+from datetime import datetime, timezone
+import pytz
+
+utc_dt = datetime.now(timezone.utc) # UTC time
+dt = utc_dt.astimezone()
+tz = pytz.timezone('Europe/Berlin')
+berlin_now = datetime.now(tz)
+print(f'{berlin_now:%Y-%m-%d %H:%M}')
+```
+
+%% Output
+
+    2022-07-27 13:53
+
+%% Cell type:code id:3808ea96 tags:
+
+``` python
+import glob
+from os.path import exists
+
+import pandas as pd
+import numpy as np
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+```
+
+%% Cell type:code id:ad6f9837 tags:
+
+``` python
+structural_annotation = pd.read_parquet('../old_data/structure_annotation.parquet')
+sequence_annotation = pd.read_csv('../old_data/Slacustris_eggnog.tsv', sep='\t')
+```
+
+%% Cell type:code id:1a0df0c0 tags:
+
+``` python
+just_mmseqs = pd.read_csv('../data/cfres_mmseqs_s75_e1.m8', sep="\s+", header=None)
+just_mmseqs.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
+                       "no. gap open", "query start", "query end", "target start", "target end",
+                       "e value", "bit score"]
+
+just_mmseqs['gene_id'] = just_mmseqs['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))
+just_mmseqs['normalized bit score'] = just_mmseqs['bit score'] / just_mmseqs['alignment length']
+```
+
+%% Cell type:code id:b09c96b5 tags:
+
+``` python
+def best_bit_score(df, sort_by='normalized bit score', tiebreak='alignment length'):
+    have_max = df[sort_by] == np.max(df[sort_by])
+    max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])
+    return df[have_max][max_ali].index.values[0]
+
+def keep_best(df, groupby='gene_id'):
+    df.reset_index(inplace=True)
+    idx = df.groupby(groupby).apply(best_bit_score)
+    res = df.loc[idx].copy()
+    return res
+```
+
+%% Cell type:code id:ff6f8bf7 tags:
+
+``` python
+just_mmseqs_filtered = keep_best(just_mmseqs)
+```
+
+%% Cell type:code id:af68258b tags:
+
+``` python
+hhblits = pd.read_csv('../data/cfres_hhblits_e1.m8', sep="\s+", header=None)
+hhblits.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
+                       "no. gap open", "query start", "query end", "target start", "target end",
+                       "e value", "bit score"]
+
+hhblits['gene_id'] = hhblits['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))
+hhblits['normalized bit score'] = hhblits['bit score'] / hhblits['alignment length']
+```
+
+%% Cell type:code id:5380700c tags:
+
+``` python
+hhblits_filtered = keep_best(hhblits)
+```
+
+%% Cell type:code id:2d829347 tags:
+
+``` python
+sequence_annotation.columns
+```
+
+%% Output
+
+    Index(['evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',
+           'Description', 'Preferred_name', 'GOs', 'PFAMs', 'gene_id',
+           'protein_id'],
+          dtype='object')
+
+%% Cell type:code id:b72335fe tags:
+
+``` python
+structural_annotation['normalized bit score'] = structural_annotation['bit score'] / structural_annotation['alignment length']
+```
+
+%% Cell type:code id:e7595d52 tags:
+
+``` python
+fig, ax = plt.subplots()
+ax.hist(just_mmseqs_filtered['normalized bit score'], bins=100, label='mmseqs', alpha=0.3, density=True);
+ax.hist(hhblits_filtered['normalized bit score'], bins=100, label='hhblits', alpha=0.3, density=True);
+ax.hist(structural_annotation['normalized bit score'], bins=100, label='coffe', alpha=0.3, density=True);
+ax.legend()
+```
+
+%% Output
+
+    <matplotlib.legend.Legend at 0x7fdd93502580>
+
+
+
+%% Cell type:code id:efdcc8b4 tags:
+
+``` python
+unique_up_id = pd.concat([hhblits_filtered['target'].drop_duplicates(),
+                          just_mmseqs_filtered['target'].drop_duplicates()])
+unique_up_id.drop_duplicates(inplace=True)
+```
+
+%% Cell type:code id:e1fcdc85 tags:
+
+``` python
+with open(f'../data/milot_unique_ids.txt', 'w') as tmp:
+    ids_as_str = unique_up_id.astype(str).values
+    tmp.write(','.join(ids_as_str))
+```
+
+%% Cell type:markdown id:ec480a86 tags:
+
+<div class="alert alert-block alert-warning"><b>DISCLAIMER:</b> To make this cell execute, you may need to copy the contents of the UPIMAPI/resource folder into the main repository directory.</div>
+
+%% Cell type:code id:87d494b2 tags:
+
+``` python
+!python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/milot_unique_ids.txt -o ../data/ --fasta --no-annotation
+```
+
+%% Output
+
+    2022-07-27 11:48:46: ID mapping has begun.
+    Auto determined "full id" as: False
+    ../data/uniprotinfo.fasta not found. Will perform mapping for all IDs.
+    Checking which IDs are missing information.
+    Checking which IDs are missing information.: 100%|█| 24540/24540 [00:00<00:00, 3
+    Information already gathered for 0 ids. Still missing for 24540.
+    Building FASTA from 24540 IDs.
+    UniProt ID mapping:   0%|                                 | 0/3 [00:00<?, ?it/s]
+    Traceback (most recent call last):
+      File "/g/arendt/npapadop/repos/UPIMAPI/upimapi.py", line 1130, in <module>
+        upimapi()
+      File "/g/arendt/npapadop/repos/UPIMAPI/upimapi.py", line 1125, in upimapi
+        uniprot_fasta_workflow(ids, f'{args.output}/uniprotinfo.fasta', step=args.step, sleep_time=args.sleep)
+      File "/g/arendt/npapadop/repos/UPIMAPI/upimapi.py", line 309, in uniprot_fasta_workflow
+        uniprotinfo = get_uniprot_fasta(ids_missing, step=step, sleep_time=sleep_time)
+      File "/g/arendt/npapadop/repos/UPIMAPI/upimapi.py", line 285, in get_uniprot_fasta
+        data = uniprot_request(ids[i:j], original_database='ACC+ID', database_destination='', output_format='fasta')
+      File "/g/arendt/npapadop/repos/UPIMAPI/upimapi.py", line 219, in uniprot_request
+        'columns': string4mapping(columns=columns, databases=databases)
+      File "/g/arendt/npapadop/repos/UPIMAPI/upimapi.py", line 176, in string4mapping
+        result = ','.join([columns_dict[column] for column in columns])
+      File "/g/arendt/npapadop/repos/UPIMAPI/upimapi.py", line 176, in <listcomp>
+        result = ','.join([columns_dict[column] for column in columns])
+    KeyError: 'Entry'
--- a/analysis/dotplots.ipynb
+++ b/analysis/dotplots.ipynb
--- a/analysis/read-write.ipynb
+++ b/analysis/read-write.ipynb
@@ -10,7 +10,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2022-07-13 14:13\n"
+      "2022-07-15 10:36\n"
     ]
    }
   ],
@@ -475,63 +475,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
-   "id": "17e219d3-604d-4538-b2be-2768f5aa4df1",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 41932/41932 [28:14<00:00, 24.75it/s]  \n"
-     ]
-    }
-   ],
-   "source": [
-    "N = len(os.listdir(structure_list))\n",
-    "proteins = [\"\"] * N\n",
-    "plddt = {}\n",
-    "avg_plddt = [0.] * N\n",
-    "\n",
-    "for i, protein in enumerate(tqdm(os.listdir(structure_list))):\n",
-    "    full_name = protein.split(\".\")[0]\n",
-    "    metadata_loc = metadata + full_name + \"_scores.json\"\n",
-    "    with open(metadata_loc, \"r\") as f:\n",
-    "        score = json.load(f)\n",
-    "    name = \"_\".join(full_name.split(\"_\")[:3])\n",
-    "    proteins[i] = name\n",
-    "    avg_plddt[i] = np.mean(score[\"plddt\"])\n",
-    "    plddt[name] = score['plddt']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "10171271-cae7-4db0-a561-b0091c7bc2ba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "np.savez('../data/spongilla_plddt.npz', **plddt)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "def42203-df23-4f4b-ad7b-a01dde971452",
+   "execution_count": 14,
+   "id": "3831f7ff-07b6-42ae-8276-ab899ab6ffa0",
   "metadata": {},
   "outputs": [],
   "source": [
-    "alphafold = pd.DataFrame({\"isoform\": proteins, \"plddt\": avg_plddt})"
+    "alphafold.to_csv(\"../data/structure_predictions.csv\")"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "3831f7ff-07b6-42ae-8276-ab899ab6ffa0",
+   "cell_type": "markdown",
+   "id": "86732d06-f95e-4583-9997-86bb9c2e0a52",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "alphafold.to_csv(\"../data/structure_predictions.csv\")"
+    "Should take about 30-45mins to complete"
   ]
  },
  {
@@ -550,58 +507,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
-   "id": "f9a0fd51-3e71-4c32-95d6-6f0dbeb4cb3c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def read_foldseek(result_path):\n",
-    "    df = pd.read_csv(result_path, sep=\"\\s+\", header=None)\n",
-    "    df.columns = [\"query\", \"target\", \"seq. id.\", \"alignment length\", \"no. mismatches\",\n",
-    "                   \"no. gap open\", \"query start\", \"query end\", \"target start\", \"target end\",\n",
-    "                   \"e value\", \"bit score\"]\n",
-    "    df[\"query\"] = df[\"query\"].str.split(\"_\").str[0]\n",
-    "    df[\"corrected bit score\"] = df[\"bit score\"] / df[\"alignment length\"]\n",
-    "    if \"pdb\" in result_path:\n",
-    "        df[\"target\"] = df[\"target\"].str.split(\".\").str[0]\n",
-    "    else:\n",
-    "        df[\"target\"] = df[\"target\"].str.split(\"-\").str[:3].str.join(\"-\")\n",
-    "    return df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "4e8efa67-66a8-4f4b-87c3-e0ae57991860",
+   "execution_count": null,
+   "id": "f79aa33d-50b9-4cf0-876a-001a63db761a",
   "metadata": {},
   "outputs": [],
   "source": [
-    "pdb = read_foldseek(fs_pdb)\n",
-    "pdb[\"query\"] = pdb[\"query\"].values.astype(int)\n",
-    "afdb = read_foldseek(fs_afdb)\n",
-    "afdb[\"query\"] = afdb[\"query\"].values.astype(int)\n",
-    "swp = read_foldseek(fs_swp)\n",
-    "swp[\"query\"] = swp[\"query\"].values.astype(int)"
+    "!sbatch ../scripts/io_read_pdb.sh\n",
+    "!sbatch ../scripts/io_read_afdb.sh\n",
+    "!sbatch ../scripts/io_read_swp.sh"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "b4532df6-fdca-40fa-aac4-b7519569ab75",
+   "cell_type": "markdown",
+   "id": "1c3f8ded-6be0-4749-ad85-da5d65be9be6",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "def get_aligned_plddt(df, plddt, name_dict):\n",
-    "    aligned_plddt = [0.] * len(df)\n",
-    "    for e in tqdm(df.iterrows()):\n",
-    "        index, row = e\n",
-    "        query = row['query']\n",
-    "        isoform = name_dict[query]\n",
-    "        protein = plddt[isoform]\n",
-    "        start = row['query start'] - 1\n",
-    "        end = row['query end'] - 1\n",
-    "        aligned_plddt[index] = np.mean(protein[start:end])\n",
-    "    return aligned_plddt"
+    "should take about 3-5h to complete."
   ]
  },
  {
@@ -619,7 +540,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "id": "9496151e-6ac4-40ed-a5c1-e3fd1e84fe05",
   "metadata": {},
   "outputs": [
@@ -627,34 +548,23 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "1470483it [23:47, 1030.14it/s]\n",
-      "1223371it [20:26, 883.72it/s] "
+      "1470483it [27:00, 907.18it/s] \n"
     ]
    }
   ],
   "source": [
    "pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)\n",
-    "afdb['aligned_plddt'] = get_aligned_plddt(afdb, plddt, query_to_isoform)\n",
-    "swp['aligned_plddt'] = get_aligned_plddt(swp, plddt, query_to_isoform)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4a12ca36-748d-4a73-a95c-ed0c8ef4c772",
-   "metadata": {},
-   "source": [
-    "We can already obtain the UniProt ID for the AFDB and SwissProt targets, since it is helpfully included in the name."
+    "pdb.to_parquet('../data/pdb_tmp.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "122b7f1c-adab-4a0d-8486-41bebcd9dfd3",
+   "id": "85f5b721-880c-400a-a330-37a35503a740",
   "metadata": {},
   "outputs": [],
   "source": [
-    "afdb[\"uniprot\"] = afdb[\"target\"].str.split(\"-\").str[1]\n",
-    "swp[\"uniprot\"] = swp[\"target\"].str.split(\"-\").str[1]"
+    "print(\"done\")"
   ]
  },
  {
@@ -665,50 +575,6 @@
    "For PDB the situation is more complicated. We will take the PDB IDs and translate them to UniProt accession numbers using the UniProt API. This will return an inflated list of IDs since some PDB entries contain complexes."
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9b85e1a6-4df6-4975-9007-d91b8e25b724",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_from_uniprot(df, column, uniprot_from=\"ACC\", uniprot_to=\"EGGNOG_ID\", request_size=40000):\n",
-    "    ids = df[column][~df[column].isnull()]\n",
-    "    url = 'https://www.uniprot.org/uploadlists/'\n",
-    "\n",
-    "    params = {\n",
-    "    'from': uniprot_from,\n",
-    "    'to': uniprot_to,\n",
-    "    'format': 'tab',\n",
-    "    'query': \" \".join(ids.unique())\n",
-    "    }\n",
-    "\n",
-    "    data = urllib.parse.urlencode(params)\n",
-    "    data = data.encode('utf-8')\n",
-    "    req = urllib.request.Request(url, data)\n",
-    "    with urllib.request.urlopen(req) as f:\n",
-    "        response = f.read()\n",
-    "    return response\n",
-    "\n",
-    "def create_id_map(response, id_in, id_out):\n",
-    "    map_from = []\n",
-    "    map_to = []\n",
-    "    for row in tqdm(response.decode(\"utf-8\").split(\"\\n\")[:-1]):\n",
-    "        col1, col2 = row.split(\"\\t\")\n",
-    "        map_from.append(col1)\n",
-    "        map_to.append(col2)\n",
-    "\n",
-    "    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})\n",
-    "    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(\",\".join))\n",
-    "#     res.set_index(id_in, inplace=True)\n",
-    "    return res\n",
-    "\n",
-    "def enrich_from_uniprot(df, column_from, column_to, uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\"):\n",
-    "    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)\n",
-    "    df_map = create_id_map(response, column_from, column_to)\n",
-    "    return df.join(df_map, on=column_from)"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -721,18 +587,6 @@
    "pdb = enrich_from_uniprot(pdb, \"target\", \"uniprot\", uniprot_from=\"PDB_ID\", uniprot_to=\"ACC\")"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "id": "e51f2299-2aaa-4a80-836e-bf6fa09df562",
-   "metadata": {},
-   "source": [
-    "### 3.2 Obtaining sequences for the UniProt IDs\n",
-    "\n",
-    "Now that we have a UniProt ID for each FoldSeek hit we need to get the annotation that is associated with them. We will use two sources; EggNOG mapper (Emapper) and UniProt itself. For Emapper we will use UPIMAPI to query the API and get the FASTA-formatted sequence that corresponds to each ID. This will then be (manually) submitted to EggNOG, and the results collected and collated. UPIMAPI will also get us the UniProt annotation.\n",
-    "\n",
-    "First let's find all the unique IDs present in our different target databases:"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -746,14 +600,6 @@
    "unique_up_id.drop_duplicates(inplace=True)"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "id": "f8fbc904-aaf5-4f2a-823c-803b35162765",
-   "metadata": {},
-   "source": [
-    "We will write the IDs in a text file without a header and an index, since we plan on passing this on to `upimapi.py`. This should also make it easier to reproduce our work."
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -764,14 +610,6 @@
    "unique_up_id.to_csv('../data/unique_uniprot_ids.csv', header=False, index=False)"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "id": "812f9ac5-4ef4-495a-bf60-b8b9eea0b9b6",
-   "metadata": {},
-   "source": [
-    "UPIMAPI, a bit surprisingly, does not accept a file of newline-separated IDs, but requires an input stream of comma-separated IDs. Not much we can do about it ¯\\\\\\_(ツ)_/¯"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -784,14 +622,6 @@
    "    tmp.write(','.join(ids_as_str))"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "id": "1da0e8ce-564e-42a5-a3b5-34c777dbc018",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-block alert-warning\"><b>DISCLAIMER:</b> To make this cell execute, you may need to copy the contents of the UPIMAPI/resource folder into the main repository directory.</div>"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -802,16 +632,6 @@
    "!python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --fasta --no-annotation"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "id": "57a1ea9a-b33b-4eeb-bdfc-d92c9ef145fd",
-   "metadata": {},
-   "source": [
-    "### 3.3 Annotating UniProt sequences - EggNOG mapper\n",
-    "\n",
-    "To submit to Emapper we first need to split in chunks of size $\\leq 100.000$. We will use `fasta-splitter.pl` [by Kirill Kryukov](http://kirill-kryukov.com/study/tools/fasta-splitter/)."
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,

 %% Cell type:code id:a5e450d0-6b47-405d-b6f4-fdbb3edfb641 tags:

 ``` python
 from datetime import datetime, timezone
 import pytz

 utc_dt = datetime.now(timezone.utc) # UTC time
 dt = utc_dt.astimezone()
 tz = pytz.timezone('Europe/Berlin')
 berlin_now = datetime.now(tz)

 print(f'{berlin_now:%Y-%m-%d %H:%M}')
 ```

 %% Output

-    2022-07-13 14:13
+    2022-07-15 10:36

 %% Cell type:markdown id:3d79e6d8-8b76-4d9b-8621-20543ce83f54 tags:

 # Expanding functional annotation of a non-model species using predicted protein structure similarity

 ***WARNING:*** The starting point of this analysis is a **predicted proteome** produced by the Trinity <i>de-novo</i> assembler. We produced multiple sequence alignments for each **isoform** with MMSeqs2 against the ColabFoldDB and UniRef30 (see the [ColabFold website](https://colabfold.mmseqs.com)); the MSAs were used as input for AlphaFold; the predicted protein structures were queried by FoldSeek against PDB, SwissProt, and AlphaFoldDB. The same sequences were queried by EggNOG. However, ultimately the analysis we conduct here is going to be on the **gene level**; we will therefore summarize results on the isoform level, either by concatenating them or by keeping the best-scoring isoform.

 This notebook gathers the preparatory steps for the analysis. This is the following:

 1. **Parse MSA files** and extract MSA size, query length, _Spongilla_ transcript name
 2. **Parse AlphaFold predictions**
    2a. extract average pLDDT score per prediction
    2b. keep best-scoring isoform per gene
 3. **Read and filter FoldSeek predictions** for each target database (PDB, SwissProt, AlphaFoldDB). Obtain annotation via EggNOG mapper, UniProt.
 4. **Read and filter sequence-based annotation** for the _Spongilla_ proteome.
 5. **Format structural-based annotation** Summarize in single table, similar to how sequence annotation would look like.




 We are going to use
 * `numpy` and `pandas` for I/O and data wrangling;
 * `tqdm` for progress bars;
 * `glob`, `os` for file navigation;
 * `json` to parse some AlphaFold output;
 * `urllib` to query the UniProt API;
 * `UPIMAPI`, a tool to programmatically access UniProt;
 * `fasta-splitter.pl`, a Perl script to parse FASTA files.

 %% Cell type:code id:1eae5786-023a-45ab-ac4a-ccb3a1db5760 tags:

 ``` python
 import json
 import glob
 import os

 import urllib.parse
 import urllib.request

 from tqdm import tqdm

 import numpy as np
 import pandas as pd
 ```

 %% Cell type:markdown id:ebbcdb49-737d-4ed9-b081-56056ecd6cf6 tags:

 ## 0. Setup

 We are going to gather the input here; we need

 * input MSAs for _Spongilla lacustris_ proteins (actually Transdecoder-predicted peptides). These were the input to AlphaFold.
 * `foldseek` search results for the _Spongilla_ predicted structures against three target databases: AlphaFoldDB, PDB, and SwissProt.
 * AlphaFold predicted structures; we obtain a list of structures (input to FoldSeek) as well as auxiliary files with e.g. prediction quality scores.
 * input peptides

 %% Cell type:code id:c58419bf-f9d3-40f6-ae23-a4bf978f2b35 tags:

 ``` python
 # MSA location
 msas = '../data/msa/'
 # FOLDSEEK scores
 fs_pdb = '../data/pdb_score.tsv'
 fs_afdb = '../data/afdb_score.tsv'
 fs_swp = '../data/swissprot_score.tsv'
 # AlphaFold predictions
 structure_list = '../data/named_models/'
 metadata = '../data/named_metadata/'
 # input fasta
 all_peptides = '../data/Spongilla_lacustris_translated_proteome.fasta'
 ```

 %% Cell type:markdown id:9a44be9c-3fdf-4c62-9e16-bea75e383d00 tags:

 ## 1. Sequence information


 ### 1.1 Input peptides

 The input for the sequence annotation pipeline was predicted peptides, as generated from the assembled genome. Read the peptides and keep track of which sequences start with a methionine and end with a star (i.e. complete transcripts assembled by Trinity/TransDecoder).

 Note that there will be multiple (predicted) isoforms per (predicted) gene. We produced one MSA per isoform.

 %% Cell type:code id:40fd2798-4dba-48f6-bbd1-fc5758c84835 tags:

 ``` python
 isoforms = []
 proteins = []
 sequences = []
 with open(all_peptides, "r") as fasta_file:
    current_protein = None
    current_id = ""
    for line in tqdm(fasta_file):
        if line.startswith(">"):
            current_id = line.strip()[1:]
            isoform, protein = current_id.split(".")
            isoforms.append(isoform[:-2])
            proteins.append(protein)
            if current_protein is not None:
                sequences.append(current_protein)
            current_protein = ""
        else:
            current_protein += line.strip()
    sequences.append(current_protein)
 ```

 %% Output

    223492it [00:00, 1285673.28it/s]

 %% Cell type:code id:2e2fafde-8653-4286-aeb4-cecb55804e4f tags:

 ``` python
 fasta = pd.DataFrame({"isoform": isoforms, "protein": proteins, "seq": sequences})

 fasta["has_start"] = fasta["seq"].str.startswith("M")
 fasta["has_end"] = fasta["seq"].str[-1].str.startswith("*")
 fasta.set_index("isoform", inplace=True)
 fasta = fasta[["has_start", "has_end"]]
 ```

 %% Cell type:markdown id:9ef9eeba-0082-4109-ad9b-33df3db9b129 tags:

 ### 1.2 Multiple sequence alignments

 The peptides were aligned against UniRef30 and ColabFoldDB using MMSeqs2. Read the MSAs and extract the number of sequences in each as well as the sequence length and the _Spongilla_ transcript name.

 %% Cell type:code id:cc2f89f4-1b24-4b1d-b1ef-b1c40ea99ff4 tags:

 ``` python
 N = len(glob.glob(msas+"*.a3m"))
 seq_id = [""] * N
 no_seqs = [0] * N
 seq_length = [0] * N
 gene_name = [""] * N

 for i, alignment in enumerate(tqdm(glob.glob(msas+"*.a3m"))):
    try:
        with open(alignment, "r") as f:
            seq_id[i] = alignment.split("/")[-1].split(".")[0]
            lines = f.readlines()
            no_seqs[i] = (len(lines) - 2) // 2
            seq_length[i] = len(lines[1].rstrip()[1:])
            gene_name[i] = lines[0].rstrip()[1:]
            # print(no_seqs, seq_length, gene_name)
    except FileNotFoundError:
        continue
 ```

 %% Output

    100%|██████████| 41945/41945 [01:22<00:00, 509.76it/s]

 %% Cell type:code id:64a7ca10-f99c-402e-a9bf-42826933ae9e tags:

 ``` python
 sequence_info = pd.DataFrame({"query": seq_id, "MSA size": no_seqs, "query length": seq_length, "gene name": gene_name})
 sequence_info["protein_id"] = sequence_info["gene name"].str.split(".").str[1]
 sequence_info["isoform"] = sequence_info["gene name"].str.split(".").str[0].str[:-2]
 sequence_info["gene_id"] = sequence_info["gene name"].str.split(".").str[0].str.split("_").str[:2].str.join("_")
 sequence_info.set_index("isoform", inplace=True)
 ```

 %% Cell type:markdown id:46c7c3ce-0f76-4728-a188-5c04247d1eef tags:

 include the start codon/end codon information from the raw sequences we parsed:

 %% Cell type:code id:d57aa70c-9646-4dfb-80b8-2eb3791c32ba tags:

 ``` python
 sequence_info
 ```

 %% Output

                   query  MSA size  query length              gene name  \
    isoform
    c100000_g1_i1      0      2765           433  c100000_g1_i1_m.41809
    c103531_g2_i2  10000      2082           152  c103531_g2_i2_m.66483
    c103531_g3_i1  10001      1924           201  c103531_g3_i1_m.66482
    c103531_g4_i1  10002      1959           215  c103531_g4_i1_m.66484
    c103532_g1_i1  10003       203           288  c103532_g1_i1_m.66485
    ...              ...       ...           ...                    ...
    c103530_g1_i3   9998      9311           992  c103530_g1_i3_m.66470
    c103531_g2_i1   9999      2103           221  c103531_g2_i1_m.66481
    c100434_g1_i2    999     15780           124  c100434_g1_i2_m.44014
    c100036_g1_i3     99         3           290  c100036_g1_i3_m.42037
    c100002_g1_i1      9         1           112  c100002_g1_i1_m.41844
    
                  protein_id     gene_id
    isoform
    c100000_g1_i1      41809  c100000_g1
    c103531_g2_i2      66483  c103531_g2
    c103531_g3_i1      66482  c103531_g3
    c103531_g4_i1      66484  c103531_g4
    c103532_g1_i1      66485  c103532_g1
    ...                  ...         ...
    c103530_g1_i3      66470  c103530_g1
    c103531_g2_i1      66481  c103531_g2
    c100434_g1_i2      44014  c100434_g1
    c100036_g1_i3      42037  c100036_g1
    c100002_g1_i1      41844  c100002_g1
    
    [41945 rows x 6 columns]

 %% Cell type:code id:5ae5dfb2-84a6-420f-8ac2-2c79d4f95e16 tags:

 ``` python
 sequence_info = sequence_info.join(fasta)
 # this column will hold NaNs later so convert it to Int64, which can deal with nulls.
 sequence_info["protein_id"] = sequence_info["protein_id"].astype(int).astype("Int64")
 ```

 %% Cell type:code id:c61347d2-f602-4f76-a656-70528242a4be tags:

 ``` python
 sequence_info.to_csv("../data/sequence_info.csv")
 # sequence_info = pd.read_csv("../data/sequence_info.csv")
 ```

 %% Cell type:markdown id:1b716527-0af8-4945-b70f-a11302c11685 tags:

 ## 2. AlphaFold-predicted structures

 We used the MSAs to predict protein structures using ColabFold. Here, we read the per-residue pLDDT score from the best predicted structure per peptide and average it over all residues; then we merge _Spongilla_ isoforms by keeping the best score per gene ID.

-%% Cell type:code id:17e219d3-604d-4538-b2be-2768f5aa4df1 tags:
-
-``` python
-N = len(os.listdir(structure_list))
-proteins = [""] * N
-plddt = {}
-avg_plddt = [0.] * N
-
-for i, protein in enumerate(tqdm(os.listdir(structure_list))):
-    full_name = protein.split(".")[0]
-    metadata_loc = metadata + full_name + "_scores.json"
-    with open(metadata_loc, "r") as f:
-        score = json.load(f)
-    name = "_".join(full_name.split("_")[:3])
-    proteins[i] = name
-    avg_plddt[i] = np.mean(score["plddt"])
-    plddt[name] = score['plddt']
-```
-
-%% Output
-
-    100%|██████████| 41932/41932 [28:14<00:00, 24.75it/s]
-
-%% Cell type:code id:10171271-cae7-4db0-a561-b0091c7bc2ba tags:
-
-``` python
-np.savez('../data/spongilla_plddt.npz', **plddt)
-```
-
-%% Cell type:code id:def42203-df23-4f4b-ad7b-a01dde971452 tags:
-
-``` python
-alphafold = pd.DataFrame({"isoform": proteins, "plddt": avg_plddt})
-```
-
 %% Cell type:code id:3831f7ff-07b6-42ae-8276-ab899ab6ffa0 tags:

 ``` python
 alphafold.to_csv("../data/structure_predictions.csv")
 ```

+%% Cell type:markdown id:86732d06-f95e-4583-9997-86bb9c2e0a52 tags:
+
+Should take about 30-45mins to complete
+
 %% Cell type:markdown id:9c6410a0-999a-41c5-af8c-3c30abbb1654 tags:

 ## 3. FoldSeek predictions

 We searched against AlphaFoldDB (predicted), PDB (crystal), and SwissProt (predicted) protein structures to detect structural similarity to our predicted _Spongilla_ peptide structures using FoldSeek. Here we read the FoldSeek output. In order to obtain useful phylogenetic information as well as functional annotation, we will translate all structural hits to their UniProt IDs and then query the EggNOG database and UniProt for annotation.

 ### 3.1 Reading results

 The FoldSeek results come in BLAST format. We will read them and convert the query (FoldSeek's internal unique ID for each input structure) from string to integer.

-%% Cell type:code id:f9a0fd51-3e71-4c32-95d6-6f0dbeb4cb3c tags:
+%% Cell type:code id:f79aa33d-50b9-4cf0-876a-001a63db761a tags:

 ``` python
-def read_foldseek(result_path):
-    df = pd.read_csv(result_path, sep="\s+", header=None)
-    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
-                   "no. gap open", "query start", "query end", "target start", "target end",
-                   "e value", "bit score"]
-    df["query"] = df["query"].str.split("_").str[0]
-    df["corrected bit score"] = df["bit score"] / df["alignment length"]
-    if "pdb" in result_path:
-        df["target"] = df["target"].str.split(".").str[0]
-    else:
-        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
-    return df
+!sbatch ../scripts/io_read_pdb.sh
+!sbatch ../scripts/io_read_afdb.sh
+!sbatch ../scripts/io_read_swp.sh
 ```

-%% Cell type:code id:4e8efa67-66a8-4f4b-87c3-e0ae57991860 tags:
+%% Cell type:markdown id:1c3f8ded-6be0-4749-ad85-da5d65be9be6 tags:

-``` python
-pdb = read_foldseek(fs_pdb)
-pdb["query"] = pdb["query"].values.astype(int)
-afdb = read_foldseek(fs_afdb)
-afdb["query"] = afdb["query"].values.astype(int)
-swp = read_foldseek(fs_swp)
-swp["query"] = swp["query"].values.astype(int)
-```
-
-%% Cell type:code id:b4532df6-fdca-40fa-aac4-b7519569ab75 tags:
-
-``` python
-def get_aligned_plddt(df, plddt, name_dict):
-    aligned_plddt = [0.] * len(df)
-    for e in tqdm(df.iterrows()):
-        index, row = e
-        query = row['query']
-        isoform = name_dict[query]
-        protein = plddt[isoform]
-        start = row['query start'] - 1
-        end = row['query end'] - 1
-        aligned_plddt[index] = np.mean(protein[start:end])
-    return aligned_plddt
-```
+should take about 3-5h to complete.

 %% Cell type:code id:563af56b-7a1b-4c3a-be9c-8c79fe684c01 tags:

 ``` python
 plddt = np.load('../data/spongilla_plddt.npz')

 sequence_info = pd.read_csv("../data/sequence_info.csv")
 query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
 ```

 %% Cell type:code id:9496151e-6ac4-40ed-a5c1-e3fd1e84fe05 tags:

 ``` python
 pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)
-afdb['aligned_plddt'] = get_aligned_plddt(afdb, plddt, query_to_isoform)
-swp['aligned_plddt'] = get_aligned_plddt(swp, plddt, query_to_isoform)
+pdb.to_parquet('../data/pdb_tmp.parquet')
 ```

 %% Output

-    1470483it [23:47, 1030.14it/s]
-    1223371it [20:26, 883.72it/s]
-
-%% Cell type:markdown id:4a12ca36-748d-4a73-a95c-ed0c8ef4c772 tags:
+    1470483it [27:00, 907.18it/s]

-We can already obtain the UniProt ID for the AFDB and SwissProt targets, since it is helpfully included in the name.
-
-%% Cell type:code id:122b7f1c-adab-4a0d-8486-41bebcd9dfd3 tags:
+%% Cell type:code id:85f5b721-880c-400a-a330-37a35503a740 tags:

 ``` python
-afdb["uniprot"] = afdb["target"].str.split("-").str[1]
-swp["uniprot"] = swp["target"].str.split("-").str[1]
+print("done")
 ```

 %% Cell type:markdown id:c2343e57-f527-4634-87a9-cbd2bc8aee5f tags:

 For PDB the situation is more complicated. We will take the PDB IDs and translate them to UniProt accession numbers using the UniProt API. This will return an inflated list of IDs since some PDB entries contain complexes.

-%% Cell type:code id:9b85e1a6-4df6-4975-9007-d91b8e25b724 tags:
-
-``` python
-def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
-    ids = df[column][~df[column].isnull()]
-    url = 'https://www.uniprot.org/uploadlists/'
-
-    params = {
-    'from': uniprot_from,
-    'to': uniprot_to,
-    'format': 'tab',
-    'query': " ".join(ids.unique())
-    }
-
-    data = urllib.parse.urlencode(params)
-    data = data.encode('utf-8')
-    req = urllib.request.Request(url, data)
-    with urllib.request.urlopen(req) as f:
-        response = f.read()
-    return response
-
-def create_id_map(response, id_in, id_out):
-    map_from = []
-    map_to = []
-    for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
-        col1, col2 = row.split("\t")
-        map_from.append(col1)
-        map_to.append(col2)
-
-    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
-    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
-#     res.set_index(id_in, inplace=True)
-    return res
-
-def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
-    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
-    df_map = create_id_map(response, column_from, column_to)
-    return df.join(df_map, on=column_from)
-```
-
 %% Cell type:code id:76ff0420-e724-4152-abbc-29d25bfd3f59 tags:

 ``` python
 pdb = enrich_from_uniprot(pdb, "target", "uniprot", uniprot_from="PDB_ID", uniprot_to="ACC")
 ```

-%% Cell type:markdown id:e51f2299-2aaa-4a80-836e-bf6fa09df562 tags:
-
-### 3.2 Obtaining sequences for the UniProt IDs
-
-Now that we have a UniProt ID for each FoldSeek hit we need to get the annotation that is associated with them. We will use two sources; EggNOG mapper (Emapper) and UniProt itself. For Emapper we will use UPIMAPI to query the API and get the FASTA-formatted sequence that corresponds to each ID. This will then be (manually) submitted to EggNOG, and the results collected and collated. UPIMAPI will also get us the UniProt annotation.
-
-First let's find all the unique IDs present in our different target databases:
-
 %% Cell type:code id:7262d562-3c2d-4a7a-9361-1bd197d30a7c tags:

 ``` python
 unique_up_id = pd.concat([pdb['uniprot'].drop_duplicates(),
                          afdb['uniprot'].drop_duplicates(),
                          swp['uniprot'].drop_duplicates()])
 unique_up_id.drop_duplicates(inplace=True)
 ```

-%% Cell type:markdown id:f8fbc904-aaf5-4f2a-823c-803b35162765 tags:
-
-We will write the IDs in a text file without a header and an index, since we plan on passing this on to `upimapi.py`. This should also make it easier to reproduce our work.
-
 %% Cell type:code id:df0fdae3-52e6-4e7a-87e8-c81401278b09 tags:

 ``` python
 unique_up_id.to_csv('../data/unique_uniprot_ids.csv', header=False, index=False)
 ```

-%% Cell type:markdown id:812f9ac5-4ef4-495a-bf60-b8b9eea0b9b6 tags:
-
-UPIMAPI, a bit surprisingly, does not accept a file of newline-separated IDs, but requires an input stream of comma-separated IDs. Not much we can do about it ¯\\\_(ツ)_/¯
-
 %% Cell type:code id:ec3015c3-316e-48fe-a08a-4fd4323cb9ea tags:

 ``` python
 with open(f'../data/foldseek_unique_ids.txt', 'w') as tmp:
    ids_as_str = unique_up_id.astype(str)[0]
    tmp.write(','.join(ids_as_str))
 ```

-%% Cell type:markdown id:1da0e8ce-564e-42a5-a3b5-34c777dbc018 tags:
-
-<div class="alert alert-block alert-warning"><b>DISCLAIMER:</b> To make this cell execute, you may need to copy the contents of the UPIMAPI/resource folder into the main repository directory.</div>
-
 %% Cell type:code id:4650ac1f-4d11-4590-b39f-2a16468d808e tags:

 ``` python
 !python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --fasta --no-annotation
 ```

-%% Cell type:markdown id:57a1ea9a-b33b-4eeb-bdfc-d92c9ef145fd tags:
-
-### 3.3 Annotating UniProt sequences - EggNOG mapper
-
-To submit to Emapper we first need to split in chunks of size $\leq 100.000$. We will use `fasta-splitter.pl` [by Kirill Kryukov](http://kirill-kryukov.com/study/tools/fasta-splitter/).
-
 %% Cell type:code id:5b403921-8514-431a-a34d-f92fbf034ab8 tags:

 ``` python
 %%bash
 mkdir ../data/uniprot_fastas/ -p
 perl ../scripts/fasta-splitter.pl --part-size 100000 ../data/uniprotinfo.fasta --nopad --measure count --out-dir ../data/uniprot_fastas/
 ```

 %% Cell type:markdown id:cc1e0086-8c3f-4881-938e-0270a9f9e53e tags:

 These were manually submitted to emapper, and the results (the `out.emapper.annotations` tsv file of each run) downloaded and saved with an `eggnog` ending.

 Let's process the retrieved emapper results:

 %% Cell type:code id:22630619-3006-425f-bace-980499b61afd tags:

 ``` python
 uniprot_eggnog = []
 for file in glob.glob('../data/uniprot_fastas/*.eggnog'):
    tmp = pd.read_csv(file, sep='\t', skiprows=4, skipfooter=3, engine='python')
    uniprot_eggnog.append(tmp)
 ```

 %% Cell type:markdown id:5b7287bb-7467-436b-8e9b-ff59e2d721b7 tags:

 We can concatenate the files, remove unnecessary columns and convert string columns to categorical to save even more space, before saving as a parquet file.

 %% Cell type:code id:65cd9d5e-fdac-4a14-94f7-787fdba690fa tags:

 ``` python
 uniprot_annotation = pd.concat(uniprot_eggnog)

 uniprot_annotation['uniprot'] = uniprot_annotation['#query'].str.split('|').str[1]
 uniprot_annotation.reset_index(drop=True, inplace=True)
 # remove unnecessary columns
 dead_weight = ['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',
               'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',
               'KEGG_TC', 'CAZy', 'BiGG_Reaction']
 uniprot_annotation.drop(dead_weight, axis=1, inplace=True)
 # convert rest to categorical to save space
 to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',
                  'Description', 'Preferred_name', 'GOs', 'PFAMs']
 uniprot_annotation[to_categorical] = uniprot_annotation[to_categorical].astype("category")
 # finally save in parquet format
 uniprot_annotation.to_parquet('../data/uniprot_annotation.parquet')
 ```

 %% Cell type:code id:500fd12f-5e1e-48ef-99c1-3db0424d7cb6 tags:

 ``` python
 uniprot_annotation = pd.read_parquet('../data/uniprot_annotation.parquet')
 ```

 %% Cell type:markdown id:8a2910c3-dbd5-4268-8eed-eef8c9796ca5 tags:

 ### 3.4 Annotating UniProt sequences - UniProt

 We will also query UniProt to get useful gene names, protein names, functional annotation, and taxonomic information. This info is also included in Emapper, but here we get it in a more accessible form.

 %% Cell type:code id:c7d97917-53c6-4b51-98a5-97a55100aa37 tags:

 ``` python
 !python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --no-annotation
 ```

 %% Cell type:markdown id:abcfc14c-c236-47c1-ad7a-1bba7244e46b tags:

 The `csv` file really is huge though so we'll read it and convert to parquet format:

 %% Cell type:code id:87957304-19a6-451d-aa82-9ece85cc90de tags:

 ``` python
 foldseek_full = pd.read_csv('../data/uniprotinfo.tsv', sep='\t')
 foldseek_full = foldseek_full[['Entry', 'Entry name', 'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']]
 foldseek_full.columns = ['uniprot', 'Entry name', 'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']
 foldseek_full.to_parquet('../data/fs_targets.parquet')
 ```

 %% Cell type:code id:09465de2-c112-4de3-b487-6a27f8cf0504 tags:

 ``` python
 foldseek_full = pd.read_parquet('../data/fs_targets.parquet')
 ```

 %% Cell type:markdown id:86ebcf92-6b4f-4b4c-89dc-f3efc0dfb6b0 tags:

 Add the UniProt and EggNOG annotation to the FoldSeek results:

 %% Cell type:code id:019667f8-47b2-4959-abe8-38e5529b4e86 tags:

 ``` python
 pdb = pdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
 afdb = afdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
 swp = swp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
 ```

 %% Cell type:markdown id:bf6217eb-a982-425e-b68f-2ec57972e864 tags:

 Convert non-numeric columns to categorical; many annotations will be repeated (explore by using `.value_counts()` on any of these columns), therefore we will gain a lot of space by converting away from `object`.

 %% Cell type:code id:85f48253-2176-4b3b-ae78-e18dc8156d6f tags:

 ``` python
 to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category', 'target',
                  'Description', 'Preferred_name', 'GOs', 'PFAMs', 'Entry name',
                  'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']
 pdb[to_categorical] = pdb[to_categorical].astype("category")
 afdb[to_categorical] = afdb[to_categorical].astype("category")
 swp[to_categorical] = swp[to_categorical].astype("category")
 ```

 %% Cell type:markdown id:6769b2e4-6f33-4c88-9519-3426690c5339 tags:

 ## 4. Structural annotation summary

 The structural annotation pipeline can be summarised by keeping the best-scoring annotation for each _Spongilla_ peptide. We will do this by first dropping unannotated entries from the target DB tables, and then keeping the best bit score per query peptide. Finally, we will concatenate the target DB tables and keep the best-scoring annotation for each query peptide. These tables are too big and unwieldy to save anyway.

 %% Cell type:code id:fe4b9101-4312-440b-a437-1dd701e02129 tags:

 ``` python
 def remove_unannotated(df):
    no_desc = df['Description'] == "-"
    no_func = df['Function [CC]'].isnull()
    no_annot = no_desc & no_func
    return df[~no_annot]

 def best_bit_score(df, sort_by='bit score', tiebreak='alignment length'):
    have_max = df[sort_by] == np.max(df[sort_by])
    max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])
    return df[have_max][max_ali].index.values[0]

 def keep_best_annotated(df):
    df = remove_unannotated(df)
    idx = df.groupby('query').apply(best_bit_score)
    res = df.loc[idx].copy()
    return res

 def tag_origin(df, prefix):
    df.columns = prefix + " " + df.columns
    return df
 ```

 %% Cell type:code id:9bbd5e11-77c9-4657-b0e8-d6d3e7055d3a tags:

 ``` python
 pdb_best = keep_best_annotated(pdb)
 pdb_best['origin'] = 'PDB'
 afdb_best = keep_best_annotated(afdb)
 afdb_best['origin'] = "AFDB"
 swp_best = keep_best_annotated(swp)
 swp_best['origin'] = 'SwissProt'
 ```

 %% Cell type:markdown id:5c879807-6e82-405c-9bb5-33bcff5bdae6 tags:

 Prune matrices to make them smaller:

 %% Cell type:code id:542ca93a-9f73-45ca-8c76-48152dae9a58 tags:

 ``` python
 dead_weight = ['no. mismatches', 'no. gap open', 'query start', 'query end',
               'target start', 'target end', 'target']
 pdb_best.drop(dead_weight, axis=1, inplace=True)
 afdb_best.drop(dead_weight, axis=1, inplace=True)
 swp_best.drop(dead_weight, axis=1, inplace=True)
 ```

 %% Cell type:code id:7bb9638e-06d3-42a4-bb74-9220c0cfe886 tags:

 ``` python
 pdb_best.to_parquet('../data/fs_best_pdb.parquet')
 afdb_best.to_parquet('../data/fs_best_afdb.parquet')
 swp_best.to_parquet('../data/fs_best_swp.parquet')
 ```

 %% Cell type:code id:b52b7268-3247-4884-84cb-5fa3d452878a tags:

 ``` python
 best = pd.concat([pdb_best, afdb_best, swp_best])
 best = best.sort_values('corrected bit score', ascending=False).drop_duplicates(['query'])
 best.set_index('query', inplace=True)
 ```

 %% Cell type:code id:d85514be-1d56-4837-801d-a3bc24a8e889 tags:

 ``` python
 sequence_info = pd.read_csv("../data/sequence_info.csv")
 alphafold = pd.read_csv("../data/structure_predictions.csv", index_col=0)
 ```

 %% Cell type:markdown id:454754cc-d44b-4f5d-bc0b-5c29a2ef4e44 tags:

 Add the sequence information and the AlphaFold plddt score to complete the table, then save in parquet format.

 %% Cell type:code id:c9e7e73d-1670-4cf4-b6b6-f4f9cebc928a tags:

 ``` python
 structural_annotation = sequence_info.join(best).join(alphafold)
 ```

 %% Cell type:code id:21d1d18c-5de1-43e8-83f3-04d4bf07eaa2 tags:

 ``` python
 structural_annotation.to_parquet('../data/structure_annotation.parquet')
 ```

 %% Cell type:markdown id:2a395280-bf0f-446d-9f48-2d870d022586 tags:

 ## 5. Sequence-based annotation

 We used EggNOG mapper to annotate the _Spongilla_ predicted proteome. The resulting `out.emapper.annotations` tsv was downloaded from the website and is processed here to a more usable state.

 %% Cell type:code id:c1bd80a1-f155-4330-8894-fcca687f48a5 tags:

 ``` python
 eggnog = pd.read_csv('../data/MM_ipw26ffh.emapper.annotations.tsv', sep='\t', skiprows=4, skipfooter=3, engine='python')
 eggnog["gene_id"] = eggnog["#query"].str.split("_").str[:2].str.join("_")
 eggnog["protein_id"] = eggnog["#query"].str.split(".").str[1]
 eggnog.drop(['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',
             'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC',
             'CAZy', 'BiGG_Reaction'], axis=1, inplace=True)
 ```

 %% Cell type:code id:6c7e43fe-2951-4104-b15b-129286601a4d tags:

 ``` python
 eggnog.to_csv('../data/Slacustris_eggnog.tsv', sep='\t', index=False)
 ```

 %% Cell type:code id:f46b7ad3-99c8-487b-a3c1-29aee10b9775 tags:

 ``` python
 ```

--- a/analysis/suppl-horizontal_gene_transfer.ipynb
+++ b/analysis/suppl-horizontal_gene_transfer.ipynb
@@ -677,7 +677,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.9.6"
  }
 },
 "nbformat": 4,

--- a/scripts/io_parse_structures.py
+++ b/scripts/io_parse_structures.py
+import os
+import json
+from tqdm import tqdm
+
+import numpy as np
+import pandas as pd
+
+structure_list = '/g/arendt/npapadop/repos/coffe/data/named_models/'
+metadata = '/g/arendt/npapadop/repos/coffe/data/named_metadata/'
+
+N = len(os.listdir(structure_list))
+proteins = [""] * N
+plddt = {}
+avg_plddt = [0.] * N
+
+for i, protein in enumerate(tqdm(os.listdir(structure_list))):
+    full_name = protein.split(".")[0]
+    metadata_loc = metadata + full_name + "_scores.json"
+    with open(metadata_loc, "r") as f:
+        score = json.load(f)
+    name = "_".join(full_name.split("_")[:3])
+    proteins[i] = name
+    avg_plddt[i] = np.mean(score["plddt"])
+    plddt[name] = score['plddt']
+
+# save outputs
+np.savez('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz', **plddt)
+
+alphafold = pd.DataFrame({"isoform": proteins, "plddt": avg_plddt})
+alphafold.to_csv("/g/arendt/npapadop/repos/coffe/data/structure_predictions.csv")
\ No newline at end of file
--- a/scripts/io_parse_structures.sh
+++ b/scripts/io_parse_structures.sh
+#!/bin/bash -ex
+#SBATCH -J coffe_parse_structures
+#SBATCH -t 1:00:00
+#SBATCH -c 1
+#SBATCH --mem=8G
+#SBATCH -o /g/arendt/npapadop/cluster/io_parse_structures.out
+#SBATCH -e /g/arendt/npapadop/cluster/io_parse_structures.err
+
+module load Anaconda3
+source ~/.bash_profile
+conda activate /g/arendt/npapadop/repos/condas/milot
+
+python /g/arendt/npapadop/repos/coffe/scripts/io_parse_structures.py
\ No newline at end of file
--- a/scripts/io_read_afdb.py
+++ b/scripts/io_read_afdb.py
+import os
+import json
+from tqdm import tqdm
+
+import numpy as np
+import pandas as pd
+
+def read_foldseek(result_path):
+    df = pd.read_csv(result_path, sep="\s+", header=None)
+    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
+                   "no. gap open", "query start", "query end", "target start", "target end",
+                   "e value", "bit score"]
+    df["query"] = df["query"].str.split("_").str[0]
+    df["corrected bit score"] = df["bit score"] / df["alignment length"]
+    if "pdb" in result_path:
+        df["target"] = df["target"].str.split(".").str[0]
+    else:
+        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
+    return df
+
+def get_aligned_plddt(df, plddt, name_dict):
+    aligned_plddt = [0.] * len(df)
+    for e in tqdm(df.iterrows()):
+        index, row = e
+        query = row['query']
+        isoform = name_dict[query]
+        protein = plddt[isoform]
+        start = row['query start'] - 1
+        end = row['query end'] - 1
+        aligned_plddt[index] = np.mean(protein[start:end])
+    return aligned_plddt
+
+def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
+    ids = df[column][~df[column].isnull()]
+    url = 'https://www.uniprot.org/uploadlists/'
+
+    params = {
+    'from': uniprot_from,
+    'to': uniprot_to,
+    'format': 'tab',
+    'query': " ".join(ids.unique())
+    }
+
+    data = urllib.parse.urlencode(params)
+    data = data.encode('utf-8')
+    req = urllib.request.Request(url, data)
+    with urllib.request.urlopen(req) as f:
+        response = f.read()
+    return response
+
+def create_id_map(response, id_in, id_out):
+    map_from = []
+    map_to = []
+    for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
+        col1, col2 = row.split("\t")
+        map_from.append(col1)
+        map_to.append(col2)
+
+    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
+    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
+#     res.set_index(id_in, inplace=True)
+    return res
+
+def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
+    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
+    df_map = create_id_map(response, column_from, column_to)
+    return df.join(df_map, on=column_from)
+
+# MSA location
+msas = '/g/arendt/npapadop/repos/coffe/data/msa/'
+# FOLDSEEK scores
+fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
+fs_afdb = '/g/arendt/npapadop/repos/coffe/data/afdb_score.tsv'
+fs_swp = '/g/arendt/npapadop/repos/coffe/data/swissprot_score.tsv'
+# AlphaFold predictions
+structure_list = '/g/arendt/npapadop/repos/coffe/data/named_models/'
+metadata = '/g/arendt/npapadop/repos/coffe/data/named_metadata/'
+# input fasta
+all_peptides = '/g/arendt/npapadop/repos/coffe/data/Spongilla_lacustris_translated_proteome.fasta'
+
+plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
+
+sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
+query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
+
+afdb = read_foldseek(fs_afdb)
+afdb["query"] = afdb["query"].values.astype(int)
+
+afdb['aligned_plddt'] = get_aligned_plddt(afdb, plddt, query_to_isoform)
+afdb["uniprot"] = afdb["target"].str.split("-").str[1]
+afdb_best.to_parquet('/g/arendt/npapadop/repos/coffe/data/afdb_tmp.parquet')
\ No newline at end of file
--- a/scripts/io_read_afdb.sh
+++ b/scripts/io_read_afdb.sh
+#!/bin/bash -ex
+#SBATCH -J coffe_read_afdb
+#SBATCH -t 5:00:00
+#SBATCH -c 1
+#SBATCH --mem=16G
+#SBATCH -o /g/arendt/npapadop/cluster/io_read_afdb.out
+#SBATCH -e /g/arendt/npapadop/cluster/io_read_afdb.err
+
+module load Anaconda3
+source ~/.bash_profile
+conda activate /g/arendt/npapadop/repos/condas/milot
+
+python /g/arendt/npapadop/repos/coffe/scripts/io_read_afdb.py
\ No newline at end of file
--- a/scripts/io_read_pdb.py
+++ b/scripts/io_read_pdb.py
+import os
+import json
+from tqdm import tqdm
+
+import numpy as np
+import pandas as pd
+
+def read_foldseek(result_path):
+    df = pd.read_csv(result_path, sep="\s+", header=None)
+    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
+                   "no. gap open", "query start", "query end", "target start", "target end",
+                   "e value", "bit score"]
+    df["query"] = df["query"].str.split("_").str[0]
+    df["corrected bit score"] = df["bit score"] / df["alignment length"]
+    if "pdb" in result_path:
+        df["target"] = df["target"].str.split(".").str[0]
+    else:
+        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
+    return df
+
+def get_aligned_plddt(df, plddt, name_dict):
+    aligned_plddt = [0.] * len(df)
+    for e in tqdm(df.iterrows()):
+        index, row = e
+        query = row['query']
+        isoform = name_dict[query]
+        protein = plddt[isoform]
+        start = row['query start'] - 1
+        end = row['query end'] - 1
+        aligned_plddt[index] = np.mean(protein[start:end])
+    return aligned_plddt
+
+def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
+    ids = df[column][~df[column].isnull()]
+    url = 'https://www.uniprot.org/uploadlists/'
+
+    params = {
+    'from': uniprot_from,
+    'to': uniprot_to,
+    'format': 'tab',
+    'query': " ".join(ids.unique())
+    }
+
+    data = urllib.parse.urlencode(params)
+    data = data.encode('utf-8')
+    req = urllib.request.Request(url, data)
+    with urllib.request.urlopen(req) as f:
+        response = f.read()
+    return response
+
+def create_id_map(response, id_in, id_out):
+    map_from = []
+    map_to = []
+    for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
+        col1, col2 = row.split("\t")
+        map_from.append(col1)
+        map_to.append(col2)
+
+    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
+    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
+#     res.set_index(id_in, inplace=True)
+    return res
+
+def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
+    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
+    df_map = create_id_map(response, column_from, column_to)
+    return df.join(df_map, on=column_from)
+
+# MSA location
+msas = '/g/arendt/npapadop/repos/coffe/data/msa/'
+# FOLDSEEK scores
+fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
+fs_afdb = '/g/arendt/npapadop/repos/coffe/data/afdb_score.tsv'
+fs_swp = '/g/arendt/npapadop/repos/coffe/data/swissprot_score.tsv'
+# AlphaFold predictions
+structure_list = '/g/arendt/npapadop/repos/coffe/data/named_models/'
+metadata = '/g/arendt/npapadop/repos/coffe/data/named_metadata/'
+# input fasta
+all_peptides = '/g/arendt/npapadop/repos/coffe/data/Spongilla_lacustris_translated_proteome.fasta'
+
+plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
+
+sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
+query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
+
+pdb = read_foldseek(fs_pdb)
+pdb["query"] = pdb["query"].values.astype(int)
+
+pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)
+pdb = enrich_from_uniprot(pdb, "target", "uniprot", uniprot_from="PDB_ID", uniprot_to="ACC")
+pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')
\ No newline at end of file
--- a/scripts/io_read_pdb.sh
+++ b/scripts/io_read_pdb.sh
+#!/bin/bash -ex
+#SBATCH -J coffe_read_pdb
+#SBATCH -t 5:00:00
+#SBATCH -c 1
+#SBATCH --mem=16G
+#SBATCH -o /g/arendt/npapadop/cluster/io_read_pdb.out
+#SBATCH -e /g/arendt/npapadop/cluster/io_read_pdb.err
+
+module load Anaconda3
+source ~/.bash_profile
+conda activate /g/arendt/npapadop/repos/condas/milot
+
+python /g/arendt/npapadop/repos/coffe/scripts/io_read_pdb.py
\ No newline at end of file
--- a/scripts/io_read_swp.py
+++ b/scripts/io_read_swp.py
+import os
+import json
+from tqdm import tqdm
+
+import numpy as np
+import pandas as pd
+
+def read_foldseek(result_path):
+    df = pd.read_csv(result_path, sep="\s+", header=None)
+    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
+                   "no. gap open", "query start", "query end", "target start", "target end",
+                   "e value", "bit score"]
+    df["query"] = df["query"].str.split("_").str[0]
+    df["corrected bit score"] = df["bit score"] / df["alignment length"]
+    if "pdb" in result_path:
+        df["target"] = df["target"].str.split(".").str[0]
+    else:
+        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
+    return df
+
+def get_aligned_plddt(df, plddt, name_dict):
+    aligned_plddt = [0.] * len(df)
+    for e in tqdm(df.iterrows()):
+        index, row = e
+        query = row['query']
+        isoform = name_dict[query]
+        protein = plddt[isoform]
+        start = row['query start'] - 1
+        end = row['query end'] - 1
+        aligned_plddt[index] = np.mean(protein[start:end])
+    return aligned_plddt
+
+def get_from_uniprot(df, column, uniprot_from="ACC", uniprot_to="EGGNOG_ID", request_size=40000):
+    ids = df[column][~df[column].isnull()]
+    url = 'https://www.uniprot.org/uploadlists/'
+
+    params = {
+    'from': uniprot_from,
+    'to': uniprot_to,
+    'format': 'tab',
+    'query': " ".join(ids.unique())
+    }
+
+    data = urllib.parse.urlencode(params)
+    data = data.encode('utf-8')
+    req = urllib.request.Request(url, data)
+    with urllib.request.urlopen(req) as f:
+        response = f.read()
+    return response
+
+def create_id_map(response, id_in, id_out):
+    map_from = []
+    map_to = []
+    for row in tqdm(response.decode("utf-8").split("\n")[:-1]):
+        col1, col2 = row.split("\t")
+        map_from.append(col1)
+        map_to.append(col2)
+
+    res = pd.DataFrame({id_in: map_from[1:], id_out: map_to[1:]})
+    res = pd.DataFrame(res.groupby(id_in)[id_out].apply(",".join))
+#     res.set_index(id_in, inplace=True)
+    return res
+
+def enrich_from_uniprot(df, column_from, column_to, uniprot_from="PDB_ID", uniprot_to="ACC"):
+    response = get_from_uniprot(df, column_from, uniprot_from=uniprot_from, uniprot_to=uniprot_to)
+    df_map = create_id_map(response, column_from, column_to)
+    return df.join(df_map, on=column_from)
+
+# MSA location
+msas = '/g/arendt/npapadop/repos/coffe/data/msa/'
+# FOLDSEEK scores
+fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
+fs_afdb = '/g/arendt/npapadop/repos/coffe/data/afdb_score.tsv'
+fs_swp = '/g/arendt/npapadop/repos/coffe/data/swissprot_score.tsv'
+# AlphaFold predictions
+structure_list = '/g/arendt/npapadop/repos/coffe/data/named_models/'
+metadata = '/g/arendt/npapadop/repos/coffe/data/named_metadata/'
+# input fasta
+all_peptides = '/g/arendt/npapadop/repos/coffe/data/Spongilla_lacustris_translated_proteome.fasta'
+
+plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')
+
+sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
+query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
+
+swp = read_foldseek(fs_swp)
+swp["query"] = swp["query"].values.astype(int)
+
+swp['aligned_plddt'] = get_aligned_plddt(swp, plddt, query_to_isoform)
+swp["uniprot"] = swp["target"].str.split("-").str[1]
+swp_best.to_parquet('/g/arendt/npapadop/repos/coffe/data/swp_tmp.parquet')
\ No newline at end of file
--- a/scripts/io_read_swp.sh
+++ b/scripts/io_read_swp.sh
+#!/bin/bash -ex
+#SBATCH -J coffe_read_swp
+#SBATCH -t 5:00:00
+#SBATCH -c 1
+#SBATCH --mem=32G
+#SBATCH -o /g/arendt/npapadop/cluster/io_read_swp.out
+#SBATCH -e /g/arendt/npapadop/cluster/io_read_swp.err
+
+module load Anaconda3
+source ~/.bash_profile
+conda activate /g/arendt/npapadop/repos/condas/milot
+
+python /g/arendt/npapadop/repos/coffe/scripts/io_read_swp.py
\ No newline at end of file
--- a/scripts/pdum_seq_alignments.sh
+++ b/scripts/pdum_seq_alignments.sh
+#!/bin/bash -ex
+#SBATCH -J align
+#SBATCH -t 10-00
+#SBATCH -c 32
+#SBATCH --mem=256G
+#SBATCH -o /g/arendt/npapadop/cluster/align_platy.out
+#SBATCH -e /g/arendt/npapadop/cluster/align_platy.err
+
+module load Anaconda3
+module load GCC
+module load bzip2
+module load CUDA
+source ~/.bash_profile
+conda activate /g/arendt/npapadop/repos/condas/milot
+
+MMSEQS="/g/arendt/npapadop/repos/MMseqs2/build/bin/mmseqs"
+QUERY="/g/arendt/npapadop/data/sequences/pdumv021_proteome.fasta"
+BASE="/scratch/npapadop/"
+
+cd "${BASE}"
+colabfold_search "${QUERY}" sequence_dbs/ msas --mmseqs "${MMSEQS}" --threads 32
+
+module unload