reworking analysis to accommodate new UniProt RESTful API

e0fada09 · Nikolaos Papadopoulos · 5276fe26 · e0fada09 · e0fada09 · e0fada09
Commit e0fada09 authored 2 years ago by Nikolaos Papadopoulos
--- a/analysis/Untitled.ipynb
+++ b/analysis/Untitled.ipynb
@@ -10,7 +10,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2022-07-29 14:01\n"
+      "2022-07-29 15:06\n"
     ]
    }
   ],
@@ -128,6 +128,17 @@
    "del hhblits"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "da58c74f-5cd2-435a-8281-6195ed7f15af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mmseqs_filtered.drop(columns=['index'], inplace=True)\n",
+    "hhblits_filtered.drop(columns=['index'], inplace=True)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 9,
@@ -171,7 +182,7 @@
    {
     "data": {
      "text/plain": [
-       "<matplotlib.legend.Legend at 0x7f58ab679250>"
+       "<matplotlib.legend.Legend at 0x7f7975465070>"
      ]
     },
     "execution_count": 11,
@@ -180,7 +191,7 @@
    },
    {
     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZdUlEQVR4nO3df3BV5b3v8feXgAUUxJLcthJs0jOIAgkBg3oLF+GgGNCRudoZRUFOW0qZi0due0XwtMcWOu30jJxbD1WhDKWWuVSoSCn3CtpSsIK1hx+6G34oGmlKQlqNoShyoPzwe//ITs4m7JCdZO2svdf+vGYyZK317LW/e0/45Mmzn/Usc3dERCT7dQu7ABERCYYCXUQkIhToIiIRoUAXEYkIBbqISER0D+uJ8/PzvaioKKynFxHJSnv27Hnf3QuSHQst0IuKiti9e3dYTy8ikpXM7E+tHdOQi4hIRCjQRUQiQoEuIhIRoY2hi0juOHPmDLW1tZw6dSrsUrJGz549KSwspEePHik/RoEuImlXW1tLnz59KCoqwszCLifjuTsNDQ3U1tZSXFyc8uM05CIiaXfq1Cn69++vME+RmdG/f/92/0WjQBeRLqEwb5+OvF8KdBGRiNAYuoh0uS0H3g30fDcP+VSg58tW6qFfzMHNYVcgIpIyBbqI5ITq6mquueYaZs6cybBhw7jvvvvYsmULo0ePZtCgQezcuZNvf/vbzJgxg4kTJ1JUVMT69et5+OGHKSkpoaKigjNnzgCwYMEChgwZQmlpKQ899BAA9fX13HXXXYwaNYpRo0bxyiuvANDQ0MDEiRMZMWIEX/3qV/nsZz/L+++/z4kTJ7jtttsYPnw4w4YNY+3atZ1+jQp0EckZVVVVzJ07l8rKSt58801+9rOfsWPHDhYvXsz3vvc9AN555x2ef/55fvnLXzJt2jTGjx/P3r176dWrF88//zxHjx7lF7/4Bfv376eyspJvfvObAMydO5evfe1r7Nq1i+eee46ZM2cCsHDhQsaMGcPrr7/OHXfcweHDhwF44YUXuPLKK/nDH/7Avn37qKio6PTrU6CLSM4oLi6mpKSEbt26MXToUCZMmICZUVJSQnV1NQCTJk2iR48elJSUcO7cueagbWrTt29fevbsycyZM1m/fj29e/cGYMuWLTzwwAOUlZVxxx138OGHH3L8+HFefvllpk2bBsBtt93GFVdc0Xy+LVu2MH/+fLZv387ll1/e6denQL+IWM2xsEsQkQB94hOfaP6+W7duzdvdunXj7Nmz57Xp1q0bPXr0aJ4+2NSme/fu7Ny5k7vuuosNGzY0B/7HH3/Mq6++SiwWIxaLceTIEfr06QMkn4J49dVXs2fPHkpKSnjkkUdYtGhRp1+fAr01+kBURJL46KOP+OCDD5g8eTKPP/44sVgMgIkTJ/LEE080t2vaP3bsWFavXg3A5s2b+etf/wpAXV0dvXv3Ztq0aTz00EO89tprna5N0xZFpMtl8zTD48ePM2XKFE6dOoW784Mf/ACAJUuWMGfOHEpLSzl79ixjx45l2bJlfOtb32Lq1KmMHDmSm266iauuugqAvXv3Mm/evOa/BJYuXdrp2szdO32SjigvL/eMvsHFwc3Eao5RdvPUsCsRyXpvvPEG1157bdhlZISmm/vk5+e32TbZ+2Zme9y9PFn7NodczGylmb1nZvtaOX6fmVXGv35nZsPbrFJERAKXyhj608DF5tP8EbjJ3UuB7wDLA6hLRCSSqqurU+qdd0SbY+ju/rKZFV3k+O8SNn8PFAZQl4iItFPQs1y+DLQ6PcTMZpnZbjPbXV9fH/BTB0tTFkUk2wQW6GY2nsZAn99aG3df7u7l7l5eUFAQ1FN3maAXFBIRCVIg0xbNrBRYAUxy94YgzikiIu3T6UA3s6uA9cB0d3+r8yVlrvy6rTBE0xhFOi3oC/cGT2qzSXV1Nbfffjv79p0/YW/cuHEsXryY8vLzZwI+/fTT7N69+7yLhZpcdtllfPTRR9TV1fHggw+ybt06YrEYdXV1TJ48uXOvpRNSmbb4DPAqMNjMas3sy2Y228xmx5s8CvQHnjKzmJll8OTyTtCVoyLSwpVXXsm6deuAxitDN23aFGo9bQa6u09198+4ew93L3T3H7v7MndfFj8+092vcPey+FfSCe/Z5LyxcgW5SGScO3eOr3zlKwwdOpSJEydy8uRJAJ599lmuv/56rr76arZv397cvqamhoqKCgYPHszChQsvOF91dTXDhg3j9OnTPProo6xdu5aysjLWrl3Lb3/7W8rKyigrK2PEiBEcP3487a9Pa7k0aQrug5sbh1biNNtFJDrefvtt5syZw/79++nXrx/PPfccAGfPnmXnzp08/vjj5wX3zp07Wb16NbFYjGeffZbWrm6/5JJLWLRoEXfffTexWIy7776bxYsX8+STTxKLxdi+fTu9evVK++tToItIziguLqasrAyA6667rnnJ3DvvvPOCfQC33HIL/fv3p1evXtx5553s2LEj5ecaPXo0X//611myZAnHjh2je/f0L52lQBeRnJG4fG5eXt4FS+Ym7oMLl71NtgxuaxYsWMCKFSs4efIkN954I2+++WZnSk+JAl1EpBW//vWvOXr0KCdPnmTDhg2MHj261bZ9+vQ5b5z8nXfeoaSkhPnz51NeXt4lga7lc0Wk66UwzTATjBkzhunTp1NVVcW99957wdTGROPHj+f73/8+ZWVlPPLII+zYsYNt27aRl5fHkCFDmDQp/a9Zy+c2Obi58YcsvmxuorKbp2o5XZFO0PK5HRP48rnSSLNdRCTTKdBFRCJCgZ5IFxGJSBZToIuIRIQCXUQkIhToIiIRoXno7bTlwLvcPORTYZchktVeqnkp0PONGzgusHPV19dz++23c/r0aZYsWcJf/vIXHn30UT796U+zbdu2wJ4nHRToCWI1xygb2C/sMkQkRL/5zW+45ppr+OlPfwpARUUFTz31FOPHjw+5srZpyEVEcsKqVasoLS1l+PDhTJ8+nT/96U9MmDCB0tJSJkyYwOHDh4nFYjz88MNs2rSJsrIyFi5cyI4dO5g9ezbz5s3j3LlzzJs3j1GjRlFaWsqPfvSjsF/WedRD74imq0pFJCvs37+f7373u7zyyivk5+dz9OhRZsyYwf3338+MGTNYuXIlDz74IBs2bGDRokXn3alo27ZtzXc0Wr58OZdffjm7du3ib3/7G6NHj2bixIkUFxeH/AobqYcepytBRaJr69atfOELXyA/Px+AT37yk7z66qvce++9AEyfPj2lpXF/9atfsWrVKsrKyrjhhhtoaGjg7bffTmvt7aEeejvl120FjbOLZBV3b3Pp21SWxnV3fvjDH3LrrbcGVVqg1EMXkcibMGECP//5z2loaADg6NGjfP7zn2fNmjUArF69mjFjxrR5nltvvZWlS5dy5swZAN566y1OnDiRvsLbST30FMS2PBN2CSKREuQ0w1QMHTqUb3zjG9x0003k5eUxYsQIlixZwpe+9CUee+wxCgoK+MlPftLmeWbOnEl1dTUjR47E3SkoKGDDhg3pfwEp0vK5cU2hXTawX5vj6WUD++lDUZF20PK5HaPlc0VEcpQCXUQkIhToItIlwhrezVYdeb/aDHQzW2lm75nZvlaOm5ktMbMqM6s0s5HtriKLbTnwbtgliGS8nj170tDQoFBPkbvT0NBAz5492/W4VGa5PA08Aaxq5fgkYFD86wZgafxfEREACgsLqa2tpb6+PuxSskbPnj0pLCxs12PaDHR3f9nMii7SZAqwyht/9f7ezPqZ2Wfc/c/tqkREIqtHjx4Zc3l8lAUxhj4AqEnYro3vu4CZzTKz3Wa2O1N/U2sJABHJVkEEerLrZZMOlLn7cncvd/fygoKCAJ5aRESaBBHotcDAhO1CoC6A83aZznywmV+3NcBKREQ6LohA3wjcH5/tciPwQc6Mnx/cHHYFIiLN2vxQ1MyeAcYB+WZWC3wL6AHg7suATcBkoAr4D+CL6So2LQ5uJr/uWNhViIh0WiqzXKa2cdyBOYFVJCIiHaIrRUVEIkKB3gma4igimUSB3gEKchHJRAp0EZGIUKCLiESEAl1EJCJyPtA1Hi4iUZHzgS4iEhUKdBGRiFCgB0FruohIBlCgd5BuPScimUaBLiISEQp0EZGIUKCLiESEAl1EJCIU6CIiEaFAD4CuNhWRTKBAFxGJCAW6iEhE5Hagd+IKz/y6rQEWIiLSebkd6AHSlaMiEjYFuohIROR0oGt2iohESUqBbmYVZnbQzKrMbEGS45eb2f81sz+Y2X4z+2LwpYqIyMW0Gehmlgc8CUwChgBTzWxIi2ZzgAPuPhwYB/yrmV0ScK0iInIRqfTQrweq3P2Qu58G1gBTWrRxoI+ZGXAZcBQ4G2ilIiJyUakE+gCgJmG7Nr4v0RPAtUAdsBeY6+4fB1JhmmhWiohETSqBbkn2eYvtW4EYcCVQBjxhZn0vOJHZLDPbbWa76+vr21lqsDSPXESiJpVArwUGJmwX0tgTT/RFYL03qgL+CFzT8kTuvtzdy929vKCgoKM1Zy7dik5EQpRKoO8CBplZcfyDznuAjS3aHAYmAJjZp4DBwKEgCxURkYvr3lYDdz9rZg8ALwJ5wEp3329ms+PHlwHfAZ42s700DtHMd/f301h3xsmv2woD+4VdhojksDYDHcDdNwGbWuxblvB9HTAx2NJERKQ9cvpK0aDpylMRCZMCXUQkIhToIiIRoUBPB01fFJEQ5GSg6ypREYminAx0EZEoUqAHTL1/EQmLAl1EJCIU6CIiEaFAFxGJCAW6iEhEKNBFRCJCgS4iEhEKdBGRiFCgi4hEhAJdRCQiFOgiIhGRc4G+5cC7jbeLSyPd6EJEwpBzgS4iElUKdBGRiFCgi4hEhAI9YOkenxcRaY0CXUQkIhToIiIRkVKgm1mFmR00syozW9BKm3FmFjOz/Wb222DLDI6GREQkqrq31cDM8oAngVuAWmCXmW109wMJbfoBTwEV7n7YzP5LmuoVEZFWpNJDvx6ocvdD7n4aWANMadHmXmC9ux8GcPf3gi0z++jeoiLS1VIJ9AFATcJ2bXxfoquBK8zsJTPbY2b3JzuRmc0ys91mtru+vr5jFYuISFKpBLol2ecttrsD1wG3AbcC/2xmV1/wIPfl7l7u7uUFBQXtLrbTDm7usqfSWL2IdLU2x9Bp7JEPTNguBOqStHnf3U8AJ8zsZWA48FYgVYqISJtS6aHvAgaZWbGZXQLcA2xs0eaXwH8zs+5m1hu4AXgj2FJFRORi2uyhu/tZM3sAeBHIA1a6+34zmx0/vszd3zCzF4BK4GNghbvvS2fhIiJyvlSGXHD3TcCmFvuWtdh+DHgsuNKCp2VtRSTKdKWoiEhEKNBFRCJCgZ5uXThVUkRymwI9nQ5u1ri9iHQZBbqISEQo0NNIvXMR6UoKdBGRiEhpHnqUvXby7aT7R/Ya1MWViIh0jnroIiIRoUAXEYmInBxyaW2YRUQkm6mHLiISEQr0rqCrRUWkCyjQRUQiQoEuIhIRCvQuoCtGRaQrKNBFRCIiZ6YtbjnwLvntaJ84tVFXjYpINlAPXUQkIhToXUnTF0UkjXIm0PPrtoZdgohIWuVMoIuIRJ0CvatouEVE0ixnZrlkwoJcsZpjlA0OuwoRiaqUeuhmVmFmB82syswWXKTdKDM7Z2ZfCK7EAKh3LCI5oM0eupnlAU8CtwC1wC4z2+juB5K0+xfgxXQUGqYg5qTralERSbdUeujXA1XufsjdTwNrgClJ2v0j8BzwXoD1BUJhKiK5IJVAHwDUJGzXxvc1M7MBwH8Hll3sRGY2y8x2m9nu+vr69tYqIiIXkUqgW5J93mL7cWC+u5+72Incfbm7l7t7eUFBQYoliohIKlKZ5VILDEzYLgTqWrQpB9aYGUA+MNnMzrr7hiCKFBGRtqXSQ98FDDKzYjO7BLgH2JjYwN2L3b3I3YuAdcD/UJgnt+XAu2GXICIR1WYP3d3PmtkDNM5eyQNWuvt+M5sdP37RcXMREekaKV1Y5O6bgE0t9iUNcnf/h86XFXEHN8PgSWFXISIRo0v/Q6BplCKSDpEP9NiWZ8IuQUSkS0Q+0EVEcoUCXUQkIhToIiIRkTPL5walswt16c5JIpIu6qGHRUv6ikjAFOgiIhER6SGXl2pe4lAG3KlIRKQrqIcuIhIRCnQRkYiI9JBLrOYYfdN4/iBuTSciEhT10EVEIiLSgd63oTLsElqlBbpEJGiRDnQRkVyiQBcRiYjIBno23OotG2oUkewR2UDPFgp1EQmKAl1EJCIU6AF57eTbzV/tkbj6onrrItIZkb2wKL9uK4fDLiJVzSsvjgy1DBHJbuqhZwDNSReRICjQM4RCXUQ6K6VAN7MKMztoZlVmtiDJ8fvMrDL+9TszGx58qdkj1fF03b1IRILUZqCbWR7wJDAJGAJMNbMhLZr9EbjJ3UuB7wDLgy40FyjgRaQzUumhXw9Uufshdz8NrAGmJDZw99+5+1/jm78HCoMtU0RE2pLKLJcBQE3Cdi1ww0XafxlIesNMM5sFzAK46qqrUiyxfV6qeQkgY+5U1KEldg9uhsGT0lSRiERVKj10S7LPkzY0G09joM9Pdtzdl7t7ubuXFxQUpF5lDtFcdBHpqFQCvRYYmLBdCNS1bGRmpcAKYIq7NwRTXm7SjBcR6YhUhlx2AYPMrBg4AtwD3JvYwMyuAtYD0939rcCrjIjWZr3obkciEoQ2A93dz5rZA8CLQB6w0t33m9ns+PFlwKNAf+ApMwM46+7l6StbRERaSunSf3ffBGxqsW9ZwvczgZnBlpY7zvvgNGEwq+kDXoBxA8d1WT0ikp0iu5ZLtmoK92M1L523P5VwT2zTUuJjWmunXxoi2S2SgR6rOUbfsItIo4703C8W9kG0b+0XRnt/+eiXikjHRTLQc0l7gzdd50rlsUHWKiIXilyga8pfZlGIi3SdyK222LehMuwSRERCEbkeOkQn1A/FXuZzZWPDLqNLpTK2rtk/IslFMtAj4ciesCvIKK0FvcJd5D8p0DPUofoTYZeQ1RT0kosU6BJ5qc7PF8l2WR/omkUhTfSzILku6wM98o7sgQHXhV1FTtAwjWS7aAV6BD9IPFR/gs+hUO9qCnfJRtEK9Ig6VH+CDz8+RtnAfmGXEjmducJVQS+ZRoGeJfo2VEK3S9VTz1Dq0UsmiFSgR32q36H6E3xuQNhVSBPNjZdME6lAzwWxGg29ZBOFu3SlyAT6odjLYZfQJfo2VMLA3FoOIIo0Li/pEJlAzyXqpWcnfQAr6aZAF8kyGsaR1ijQs1CyGS/qtUebPoCVVEQj0CN4QVG7HNkD/F3YVUjIFO6S9YEeqzlG34ZoT1dMpuliI4CyyN2mRDorlfF6hX70ZH2gR+VmFh3Rt6GSD/uXNn/PwLEaepGUXaxHrw9ns1PWB7okiA+95OKdjqRzUl2pUkGf2VIKdDOrAP4NyANWuPv3Wxy3+PHJwH8A/+DurwVcazMtk/qf+jZUQsGlQPxK2f6N+5tulq3eunSF1nr77d0vndNmoJtZHvAkcAtQC+wys43ufiCh2SRgUPzrBmBp/N+0idUco6zbO+l8iqyRbMmD5qGobpcS+/jvGoP9yJ7Gsff4MI3CXtIhlRk5qexX0LdfKj3064Eqdz8EYGZrgClAYqBPAVa5uwO/N7N+ZvYZd/9z4BXH9W2o5FC6Tp7FWn6mcKj+BH2p5FDDhW0ONcCH/UsV7JKRgvpLvLW/DFprk81SCfQBQE3Cdi0X9r6TtRkAnBfoZjYLmBXf/MjMDrarWsgH3m/nY3KB3pfk9L5cSO9Jctn0vny2tQOpBLol2ecdaIO7LweWp/CcyQsx2+3u5R19fFTpfUlO78uF9J4kF5X3JZUZzLXAwITtQqCuA21ERCSNUgn0XcAgMys2s0uAe4CNLdpsBO63RjcCH6Rz/FxERC7U5pCLu581sweAF2mctrjS3feb2ez48WXAJhqnLFbROG3xi2mqt8PDNRGn9yU5vS8X0nuSXCTeF2ucmCIiItlOq4CIiESEAl1EJCKyJtDNrMLMDppZlZktCLuesJnZQDPbZmZvmNl+M5sbdk2ZxMzyzOx1M/t/YdeSKeIX/K0zszfjPzf/NeyawmZmX4v//9lnZs+YWc+wa+qMrAj0hOUHJgFDgKlmNiTcqkJ3Fvhf7n4tcCMwR+/JeeYCb4RdRIb5N+AFd78GGE6Ovz9mNgB4ECh392E0Tvq4J9yqOicrAp2E5Qfc/TTQtPxAznL3PzctgObux2n8zzkg3Koyg5kVArcBK8KuJVOYWV9gLPBjAHc/7e7HQi0qM3QHeplZd6A3WX79TLYEemtLCwhgZkXACODfQy4lUzwOPAx8HHIdmeRzQD3wk/hQ1AozuzTsosLk7keAxcBhGpcp+cDdfxVuVZ2TLYGe0tICucjMLgOeA/6nu38Ydj1hM7PbgffcPcfvS3iB7sBIYKm7jwBOADn9WZSZXUHjX/rFwJXApWY2LdyqOidbAl1LCyRhZj1oDPPV7r4+7HoyxGjgDjOrpnFo7u/N7P+EW1JGqAVq3b3pr7h1NAZ8LrsZ+KO717v7GWA98PmQa+qUbAn0VJYfyCnxm4r8GHjD3f932PVkCnd/xN0L3b2Ixp+Tre6e1b2uILj7X4AaMxsc3zWB85fAzkWHgRvNrHf8/9MEsvyD4qy4BV1ryw+EXFbYRgPTgb1mFovv+yd33xReSZLh/hFYHe8UHSJ9S3RkBXf/dzNbB7xG46yx18nyJQB06b+ISERky5CLiIi0QYEuIhIRCnQRkYhQoIuIRIQCXUQkIhToIiIRoUAXEYmI/w9iflt0knsXdQAAAABJRU5ErkJggg==\n",
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZdUlEQVR4nO3df3BV5b3v8feXgAUUxJLcthJs0jOIAgkBg3oLF+GgGNCRudoZRUFOW0qZi0due0XwtMcWOu30jJxbD1WhDKWWuVSoSCn3CtpSsIK1hx+6G34oGmlKQlqNoShyoPzwe//ITs4m7JCdZO2svdf+vGYyZK317LW/e0/45Mmzn/Usc3dERCT7dQu7ABERCYYCXUQkIhToIiIRoUAXEYkIBbqISER0D+uJ8/PzvaioKKynFxHJSnv27Hnf3QuSHQst0IuKiti9e3dYTy8ikpXM7E+tHdOQi4hIRCjQRUQiQoEuIhIRoY2hi0juOHPmDLW1tZw6dSrsUrJGz549KSwspEePHik/RoEuImlXW1tLnz59KCoqwszCLifjuTsNDQ3U1tZSXFyc8uM05CIiaXfq1Cn69++vME+RmdG/f/92/0WjQBeRLqEwb5+OvF8KdBGRiNAYuoh0uS0H3g30fDcP+VSg58tW6qFfzMHNYVcgIpIyBbqI5ITq6mquueYaZs6cybBhw7jvvvvYsmULo0ePZtCgQezcuZNvf/vbzJgxg4kTJ1JUVMT69et5+OGHKSkpoaKigjNnzgCwYMEChgwZQmlpKQ899BAA9fX13HXXXYwaNYpRo0bxyiuvANDQ0MDEiRMZMWIEX/3qV/nsZz/L+++/z4kTJ7jtttsYPnw4w4YNY+3atZ1+jQp0EckZVVVVzJ07l8rKSt58801+9rOfsWPHDhYvXsz3vvc9AN555x2ef/55fvnLXzJt2jTGjx/P3r176dWrF88//zxHjx7lF7/4Bfv376eyspJvfvObAMydO5evfe1r7Nq1i+eee46ZM2cCsHDhQsaMGcPrr7/OHXfcweHDhwF44YUXuPLKK/nDH/7Avn37qKio6PTrU6CLSM4oLi6mpKSEbt26MXToUCZMmICZUVJSQnV1NQCTJk2iR48elJSUcO7cueagbWrTt29fevbsycyZM1m/fj29e/cGYMuWLTzwwAOUlZVxxx138OGHH3L8+HFefvllpk2bBsBtt93GFVdc0Xy+LVu2MH/+fLZv387ll1/e6denQL+IWM2xsEsQkQB94hOfaP6+W7duzdvdunXj7Nmz57Xp1q0bPXr0aJ4+2NSme/fu7Ny5k7vuuosNGzY0B/7HH3/Mq6++SiwWIxaLceTIEfr06QMkn4J49dVXs2fPHkpKSnjkkUdYtGhRp1+fAr01+kBURJL46KOP+OCDD5g8eTKPP/44sVgMgIkTJ/LEE080t2vaP3bsWFavXg3A5s2b+etf/wpAXV0dvXv3Ztq0aTz00EO89tprna5N0xZFpMtl8zTD48ePM2XKFE6dOoW784Mf/ACAJUuWMGfOHEpLSzl79ixjx45l2bJlfOtb32Lq1KmMHDmSm266iauuugqAvXv3Mm/evOa/BJYuXdrp2szdO32SjigvL/eMvsHFwc3Eao5RdvPUsCsRyXpvvPEG1157bdhlZISmm/vk5+e32TbZ+2Zme9y9PFn7NodczGylmb1nZvtaOX6fmVXGv35nZsPbrFJERAKXyhj608DF5tP8EbjJ3UuB7wDLA6hLRCSSqqurU+qdd0SbY+ju/rKZFV3k+O8SNn8PFAZQl4iItFPQs1y+DLQ6PcTMZpnZbjPbXV9fH/BTB0tTFkUk2wQW6GY2nsZAn99aG3df7u7l7l5eUFAQ1FN3maAXFBIRCVIg0xbNrBRYAUxy94YgzikiIu3T6UA3s6uA9cB0d3+r8yVlrvy6rTBE0xhFOi3oC/cGT2qzSXV1Nbfffjv79p0/YW/cuHEsXryY8vLzZwI+/fTT7N69+7yLhZpcdtllfPTRR9TV1fHggw+ybt06YrEYdXV1TJ48uXOvpRNSmbb4DPAqMNjMas3sy2Y228xmx5s8CvQHnjKzmJll8OTyTtCVoyLSwpVXXsm6deuAxitDN23aFGo9bQa6u09198+4ew93L3T3H7v7MndfFj8+092vcPey+FfSCe/Z5LyxcgW5SGScO3eOr3zlKwwdOpSJEydy8uRJAJ599lmuv/56rr76arZv397cvqamhoqKCgYPHszChQsvOF91dTXDhg3j9OnTPProo6xdu5aysjLWrl3Lb3/7W8rKyigrK2PEiBEcP3487a9Pa7k0aQrug5sbh1biNNtFJDrefvtt5syZw/79++nXrx/PPfccAGfPnmXnzp08/vjj5wX3zp07Wb16NbFYjGeffZbWrm6/5JJLWLRoEXfffTexWIy7776bxYsX8+STTxKLxdi+fTu9evVK++tToItIziguLqasrAyA6667rnnJ3DvvvPOCfQC33HIL/fv3p1evXtx5553s2LEj5ecaPXo0X//611myZAnHjh2je/f0L52lQBeRnJG4fG5eXt4FS+Ym7oMLl71NtgxuaxYsWMCKFSs4efIkN954I2+++WZnSk+JAl1EpBW//vWvOXr0KCdPnmTDhg2MHj261bZ9+vQ5b5z8nXfeoaSkhPnz51NeXt4lga7lc0Wk66UwzTATjBkzhunTp1NVVcW99957wdTGROPHj+f73/8+ZWVlPPLII+zYsYNt27aRl5fHkCFDmDQp/a9Zy+c2Obi58YcsvmxuorKbp2o5XZFO0PK5HRP48rnSSLNdRCTTKdBFRCJCgZ5IFxGJSBZToIuIRIQCXUQkIhToIiIRoXno7bTlwLvcPORTYZchktVeqnkp0PONGzgusHPV19dz++23c/r0aZYsWcJf/vIXHn30UT796U+zbdu2wJ4nHRToCWI1xygb2C/sMkQkRL/5zW+45ppr+OlPfwpARUUFTz31FOPHjw+5srZpyEVEcsKqVasoLS1l+PDhTJ8+nT/96U9MmDCB0tJSJkyYwOHDh4nFYjz88MNs2rSJsrIyFi5cyI4dO5g9ezbz5s3j3LlzzJs3j1GjRlFaWsqPfvSjsF/WedRD74imq0pFJCvs37+f7373u7zyyivk5+dz9OhRZsyYwf3338+MGTNYuXIlDz74IBs2bGDRokXn3alo27ZtzXc0Wr58OZdffjm7du3ib3/7G6NHj2bixIkUFxeH/AobqYcepytBRaJr69atfOELXyA/Px+AT37yk7z66qvce++9AEyfPj2lpXF/9atfsWrVKsrKyrjhhhtoaGjg7bffTmvt7aEeejvl120FjbOLZBV3b3Pp21SWxnV3fvjDH3LrrbcGVVqg1EMXkcibMGECP//5z2loaADg6NGjfP7zn2fNmjUArF69mjFjxrR5nltvvZWlS5dy5swZAN566y1OnDiRvsLbST30FMS2PBN2CSKREuQ0w1QMHTqUb3zjG9x0003k5eUxYsQIlixZwpe+9CUee+wxCgoK+MlPftLmeWbOnEl1dTUjR47E3SkoKGDDhg3pfwEp0vK5cU2hXTawX5vj6WUD++lDUZF20PK5HaPlc0VEcpQCXUQkIhToItIlwhrezVYdeb/aDHQzW2lm75nZvlaOm5ktMbMqM6s0s5HtriKLbTnwbtgliGS8nj170tDQoFBPkbvT0NBAz5492/W4VGa5PA08Aaxq5fgkYFD86wZgafxfEREACgsLqa2tpb6+PuxSskbPnj0pLCxs12PaDHR3f9nMii7SZAqwyht/9f7ezPqZ2Wfc/c/tqkREIqtHjx4Zc3l8lAUxhj4AqEnYro3vu4CZzTKz3Wa2O1N/U2sJABHJVkEEerLrZZMOlLn7cncvd/fygoKCAJ5aRESaBBHotcDAhO1CoC6A83aZznywmV+3NcBKREQ6LohA3wjcH5/tciPwQc6Mnx/cHHYFIiLN2vxQ1MyeAcYB+WZWC3wL6AHg7suATcBkoAr4D+CL6So2LQ5uJr/uWNhViIh0WiqzXKa2cdyBOYFVJCIiHaIrRUVEIkKB3gma4igimUSB3gEKchHJRAp0EZGIUKCLiESEAl1EJCJyPtA1Hi4iUZHzgS4iEhUKdBGRiFCgB0FruohIBlCgd5BuPScimUaBLiISEQp0EZGIUKCLiESEAl1EJCIU6CIiEaFAD4CuNhWRTKBAFxGJCAW6iEhE5Hagd+IKz/y6rQEWIiLSebkd6AHSlaMiEjYFuohIROR0oGt2iohESUqBbmYVZnbQzKrMbEGS45eb2f81sz+Y2X4z+2LwpYqIyMW0Gehmlgc8CUwChgBTzWxIi2ZzgAPuPhwYB/yrmV0ScK0iInIRqfTQrweq3P2Qu58G1gBTWrRxoI+ZGXAZcBQ4G2ilIiJyUakE+gCgJmG7Nr4v0RPAtUAdsBeY6+4fB1JhmmhWiohETSqBbkn2eYvtW4EYcCVQBjxhZn0vOJHZLDPbbWa76+vr21lqsDSPXESiJpVArwUGJmwX0tgTT/RFYL03qgL+CFzT8kTuvtzdy929vKCgoKM1Zy7dik5EQpRKoO8CBplZcfyDznuAjS3aHAYmAJjZp4DBwKEgCxURkYvr3lYDdz9rZg8ALwJ5wEp3329ms+PHlwHfAZ42s700DtHMd/f301h3xsmv2woD+4VdhojksDYDHcDdNwGbWuxblvB9HTAx2NJERKQ9cvpK0aDpylMRCZMCXUQkIhToIiIRoUBPB01fFJEQ5GSg6ypREYminAx0EZEoUqAHTL1/EQmLAl1EJCIU6CIiEaFAFxGJCAW6iEhEKNBFRCJCgS4iEhEKdBGRiFCgi4hEhAJdRCQiFOgiIhGRc4G+5cC7jbeLSyPd6EJEwpBzgS4iElUKdBGRiFCgi4hEhAI9YOkenxcRaY0CXUQkIhToIiIRkVKgm1mFmR00syozW9BKm3FmFjOz/Wb222DLDI6GREQkqrq31cDM8oAngVuAWmCXmW109wMJbfoBTwEV7n7YzP5LmuoVEZFWpNJDvx6ocvdD7n4aWANMadHmXmC9ux8GcPf3gi0z++jeoiLS1VIJ9AFATcJ2bXxfoquBK8zsJTPbY2b3JzuRmc0ys91mtru+vr5jFYuISFKpBLol2ecttrsD1wG3AbcC/2xmV1/wIPfl7l7u7uUFBQXtLrbTDm7usqfSWL2IdLU2x9Bp7JEPTNguBOqStHnf3U8AJ8zsZWA48FYgVYqISJtS6aHvAgaZWbGZXQLcA2xs0eaXwH8zs+5m1hu4AXgj2FJFRORi2uyhu/tZM3sAeBHIA1a6+34zmx0/vszd3zCzF4BK4GNghbvvS2fhIiJyvlSGXHD3TcCmFvuWtdh+DHgsuNKCp2VtRSTKdKWoiEhEKNBFRCJCgZ5uXThVUkRymwI9nQ5u1ri9iHQZBbqISEQo0NNIvXMR6UoKdBGRiEhpHnqUvXby7aT7R/Ya1MWViIh0jnroIiIRoUAXEYmInBxyaW2YRUQkm6mHLiISEQr0rqCrRUWkCyjQRUQiQoEuIhIRCvQuoCtGRaQrKNBFRCIiZ6YtbjnwLvntaJ84tVFXjYpINlAPXUQkIhToXUnTF0UkjXIm0PPrtoZdgohIWuVMoIuIRJ0CvatouEVE0ixnZrlkwoJcsZpjlA0OuwoRiaqUeuhmVmFmB82syswWXKTdKDM7Z2ZfCK7EAKh3LCI5oM0eupnlAU8CtwC1wC4z2+juB5K0+xfgxXQUGqYg5qTralERSbdUeujXA1XufsjdTwNrgClJ2v0j8BzwXoD1BUJhKiK5IJVAHwDUJGzXxvc1M7MBwH8Hll3sRGY2y8x2m9nu+vr69tYqIiIXkUqgW5J93mL7cWC+u5+72Incfbm7l7t7eUFBQYoliohIKlKZ5VILDEzYLgTqWrQpB9aYGUA+MNnMzrr7hiCKFBGRtqXSQ98FDDKzYjO7BLgH2JjYwN2L3b3I3YuAdcD/UJgnt+XAu2GXICIR1WYP3d3PmtkDNM5eyQNWuvt+M5sdP37RcXMREekaKV1Y5O6bgE0t9iUNcnf/h86XFXEHN8PgSWFXISIRo0v/Q6BplCKSDpEP9NiWZ8IuQUSkS0Q+0EVEcoUCXUQkIhToIiIRkTPL5walswt16c5JIpIu6qGHRUv6ikjAFOgiIhER6SGXl2pe4lAG3KlIRKQrqIcuIhIRCnQRkYiI9JBLrOYYfdN4/iBuTSciEhT10EVEIiLSgd63oTLsElqlBbpEJGiRDnQRkVyiQBcRiYjIBno23OotG2oUkewR2UDPFgp1EQmKAl1EJCIU6AF57eTbzV/tkbj6onrrItIZkb2wKL9uK4fDLiJVzSsvjgy1DBHJbuqhZwDNSReRICjQM4RCXUQ6K6VAN7MKMztoZlVmtiDJ8fvMrDL+9TszGx58qdkj1fF03b1IRILUZqCbWR7wJDAJGAJMNbMhLZr9EbjJ3UuB7wDLgy40FyjgRaQzUumhXw9Uufshdz8NrAGmJDZw99+5+1/jm78HCoMtU0RE2pLKLJcBQE3Cdi1ww0XafxlIesNMM5sFzAK46qqrUiyxfV6qeQkgY+5U1KEldg9uhsGT0lSRiERVKj10S7LPkzY0G09joM9Pdtzdl7t7ubuXFxQUpF5lDtFcdBHpqFQCvRYYmLBdCNS1bGRmpcAKYIq7NwRTXm7SjBcR6YhUhlx2AYPMrBg4AtwD3JvYwMyuAtYD0939rcCrjIjWZr3obkciEoQ2A93dz5rZA8CLQB6w0t33m9ns+PFlwKNAf+ApMwM46+7l6StbRERaSunSf3ffBGxqsW9ZwvczgZnBlpY7zvvgNGEwq+kDXoBxA8d1WT0ikp0iu5ZLtmoK92M1L523P5VwT2zTUuJjWmunXxoi2S2SgR6rOUbfsItIo4703C8W9kG0b+0XRnt/+eiXikjHRTLQc0l7gzdd50rlsUHWKiIXilyga8pfZlGIi3SdyK222LehMuwSRERCEbkeOkQn1A/FXuZzZWPDLqNLpTK2rtk/IslFMtAj4ciesCvIKK0FvcJd5D8p0DPUofoTYZeQ1RT0kosU6BJ5qc7PF8l2WR/omkUhTfSzILku6wM98o7sgQHXhV1FTtAwjWS7aAV6BD9IPFR/gs+hUO9qCnfJRtEK9Ig6VH+CDz8+RtnAfmGXEjmducJVQS+ZRoGeJfo2VEK3S9VTz1Dq0UsmiFSgR32q36H6E3xuQNhVSBPNjZdME6lAzwWxGg29ZBOFu3SlyAT6odjLYZfQJfo2VMLA3FoOIIo0Li/pEJlAzyXqpWcnfQAr6aZAF8kyGsaR1ijQs1CyGS/qtUebPoCVVEQj0CN4QVG7HNkD/F3YVUjIFO6S9YEeqzlG34ZoT1dMpuliI4CyyN2mRDorlfF6hX70ZH2gR+VmFh3Rt6GSD/uXNn/PwLEaepGUXaxHrw9ns1PWB7okiA+95OKdjqRzUl2pUkGf2VIKdDOrAP4NyANWuPv3Wxy3+PHJwH8A/+DurwVcazMtk/qf+jZUQsGlQPxK2f6N+5tulq3eunSF1nr77d0vndNmoJtZHvAkcAtQC+wys43ufiCh2SRgUPzrBmBp/N+0idUco6zbO+l8iqyRbMmD5qGobpcS+/jvGoP9yJ7Gsff4MI3CXtIhlRk5qexX0LdfKj3064Eqdz8EYGZrgClAYqBPAVa5uwO/N7N+ZvYZd/9z4BXH9W2o5FC6Tp7FWn6mcKj+BH2p5FDDhW0ONcCH/UsV7JKRgvpLvLW/DFprk81SCfQBQE3Cdi0X9r6TtRkAnBfoZjYLmBXf/MjMDrarWsgH3m/nY3KB3pfk9L5cSO9Jctn0vny2tQOpBLol2ecdaIO7LweWp/CcyQsx2+3u5R19fFTpfUlO78uF9J4kF5X3JZUZzLXAwITtQqCuA21ERCSNUgn0XcAgMys2s0uAe4CNLdpsBO63RjcCH6Rz/FxERC7U5pCLu581sweAF2mctrjS3feb2ez48WXAJhqnLFbROG3xi2mqt8PDNRGn9yU5vS8X0nuSXCTeF2ucmCIiItlOq4CIiESEAl1EJCKyJtDNrMLMDppZlZktCLuesJnZQDPbZmZvmNl+M5sbdk2ZxMzyzOx1M/t/YdeSKeIX/K0zszfjPzf/NeyawmZmX4v//9lnZs+YWc+wa+qMrAj0hOUHJgFDgKlmNiTcqkJ3Fvhf7n4tcCMwR+/JeeYCb4RdRIb5N+AFd78GGE6Ovz9mNgB4ECh392E0Tvq4J9yqOicrAp2E5Qfc/TTQtPxAznL3PzctgObux2n8zzkg3Koyg5kVArcBK8KuJVOYWV9gLPBjAHc/7e7HQi0qM3QHeplZd6A3WX79TLYEemtLCwhgZkXACODfQy4lUzwOPAx8HHIdmeRzQD3wk/hQ1AozuzTsosLk7keAxcBhGpcp+cDdfxVuVZ2TLYGe0tICucjMLgOeA/6nu38Ydj1hM7PbgffcPcfvS3iB7sBIYKm7jwBOADn9WZSZXUHjX/rFwJXApWY2LdyqOidbAl1LCyRhZj1oDPPV7r4+7HoyxGjgDjOrpnFo7u/N7P+EW1JGqAVq3b3pr7h1NAZ8LrsZ+KO717v7GWA98PmQa+qUbAn0VJYfyCnxm4r8GHjD3f932PVkCnd/xN0L3b2Ixp+Tre6e1b2uILj7X4AaMxsc3zWB85fAzkWHgRvNrHf8/9MEsvyD4qy4BV1ryw+EXFbYRgPTgb1mFovv+yd33xReSZLh/hFYHe8UHSJ9S3RkBXf/dzNbB7xG46yx18nyJQB06b+ISERky5CLiIi0QYEuIhIRCnQRkYhQoIuIRIQCXUQkIhToIiIRoUAXEYmI/w9iflt0knsXdQAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
@@ -213,7 +224,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 13,
   "id": "e8d0cfb7-c86d-41a6-99e7-d514bc1f3873",
   "metadata": {},
   "outputs": [
@@ -223,7 +234,7 @@
       "148"
      ]
     },
-     "execution_count": 22,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -236,7 +247,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 14,
   "id": "85231f72-12ad-4cc7-897a-544e0be27228",
   "metadata": {
    "tags": []
@@ -246,7 +257,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "100%|██████████| 148/148 [01:19<00:00,  1.86it/s]\n"
+      "100%|██████████| 148/148 [01:45<00:00,  1.41it/s]\n"
     ]
    }
   ],
@@ -269,35 +280,194 @@
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "da58c74f-5cd2-435a-8281-6195ed7f15af",
+   "cell_type": "markdown",
+   "id": "c3d17ee4-6176-443a-b320-a54aaa7ff8cd",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "mmseqs_filtered.drop(columns=['index'], inplace=True)\n",
-    "hhblits_filtered.drop(columns=['index'], inplace=True)"
+    "(wait for EggNOG-mapper to annotate the sequences)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 36,
   "id": "eb0c48d8-ef71-440f-8ea0-6e7f3118f957",
   "metadata": {},
   "outputs": [],
   "source": [
-    "file = '/g/arendt/npapadop/data/spongfold_publish/'\n",
+    "file = '/g/arendt/npapadop/data/spongfold_publish/MM_ffr33uy6.emapper.annotations.tsv'\n",
    "sensitive = pd.read_csv(file, sep='\\t', skiprows=4, skipfooter=3, engine='python')"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "0a97dfb7-4783-44fa-a330-f07b0585fcf8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = sensitive['#query'].str.split('_').str[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "942d4a2d-2a8b-44ff-89b4-8525f3ad926e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "A0A6A6LEY7    3\n",
+       "A0A1X7U3S9    3\n",
+       "A0A3N5U8U0    3\n",
+       "A0A6J0B743    3\n",
+       "A0A482W475    3\n",
+       "             ..\n",
+       "A0A6I8SCD7    1\n",
+       "A0A1A8MN43    1\n",
+       "A0A0C2FDV2    1\n",
+       "A0A2J8INC7    1\n",
+       "A0A0K0D6S2    1\n",
+       "Name: #query, Length: 20360, dtype: int64"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "test.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "909804dd-4006-4dcc-a4ba-308a5b3975d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>#query</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>seed_ortholog</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>evalue</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>eggNOG_OGs</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max_annot_lvl</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>COG_category</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Description</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Preferred_name</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>GOs</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>EC</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_ko</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_Pathway</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_Module</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_Reaction</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_rclass</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>BRITE</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>KEGG_TC</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CAZy</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>BiGG_Reaction</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>PFAMs</th>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: []\n",
+       "Index: [#query, seed_ortholog, evalue, score, eggNOG_OGs, max_annot_lvl, COG_category, Description, Preferred_name, GOs, EC, KEGG_ko, KEGG_Pathway, KEGG_Module, KEGG_Reaction, KEGG_rclass, BRITE, KEGG_TC, CAZy, BiGG_Reaction, PFAMs]"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sensitive[sensitive['#query'].str.contains('UPI00005B2EF3')].T"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "id": "e6d1eb54-d942-465e-a868-517d442fe1dd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
   "id": "1c156c27-d2ec-47d6-b764-2b54816922a0",
   "metadata": {},
   "outputs": [],
   "source": [
-    "sensitive['uniprot'] = sensitive['#query'].str.split('|').str[1]\n",
+    "sensitive['uniprot'] = sensitive['#query'].str.split('_').str[1]\n",
    "sensitive.reset_index(drop=True, inplace=True)\n",
    "# remove unnecessary columns\n",
    "dead_weight = ['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',\n",
@@ -309,27 +479,349 @@
    "                  'Description', 'Preferred_name', 'GOs', 'PFAMs']\n",
    "sensitive[to_categorical] = sensitive[to_categorical].astype(\"category\")\n",
    "# finally save in parquet format\n",
+    "sensitive.drop_duplicates(inplace=True)\n",
    "sensitive.to_parquet('../data/uniprot_profiles.parquet')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "id": "821c79ed-2090-45c8-bc5c-736ba1096989",
+   "execution_count": 30,
+   "id": "d68690ea-060a-480f-802c-aac6baae570d",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(20360, 10)"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "response"
+    "sensitive.shape"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 33,
+   "id": "f60ac7bf-f031-4bd6-99f3-eb3117b50cf1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7417224       A0A409VGI9\n",
+       "7453017       A0A6A6LEY7\n",
+       "3711344       A0A1A8QDB6\n",
+       "5432158       A0A1X7UJF6\n",
+       "1658532           S9X2M4\n",
+       "               ...      \n",
+       "144364        A0A2F0BPH8\n",
+       "144949            L1J317\n",
+       "4535244       A0A0K0D6S2\n",
+       "2883206    UPI0003F0A19A\n",
+       "2883636       A0A7S2RZY7\n",
+       "Name: target, Length: 24540, dtype: object"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "unique_up_id"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
   "id": "fd08dceb-c307-4ad6-bd23-91b70aaf7ef4",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>query</th>\n",
+       "      <th>target</th>\n",
+       "      <th>seq. id.</th>\n",
+       "      <th>alignment length</th>\n",
+       "      <th>no. mismatches</th>\n",
+       "      <th>no. gap open</th>\n",
+       "      <th>query start</th>\n",
+       "      <th>query end</th>\n",
+       "      <th>target start</th>\n",
+       "      <th>target end</th>\n",
+       "      <th>e value</th>\n",
+       "      <th>bit score</th>\n",
+       "      <th>gene_id</th>\n",
+       "      <th>normalized bit score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>3537669</th>\n",
+       "      <td>c100005_g1_i4_m.41851</td>\n",
+       "      <td>UPI000C6D6B72</td>\n",
+       "      <td>0.262</td>\n",
+       "      <td>138</td>\n",
+       "      <td>100</td>\n",
+       "      <td>0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>150</td>\n",
+       "      <td>7</td>\n",
+       "      <td>142</td>\n",
+       "      <td>1.464000e-04</td>\n",
+       "      <td>53</td>\n",
+       "      <td>c100005_g1</td>\n",
+       "      <td>0.384058</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3711020</th>\n",
+       "      <td>c100012_g4_i1_m.41921</td>\n",
+       "      <td>A0A4V1IVK5</td>\n",
+       "      <td>0.681</td>\n",
+       "      <td>133</td>\n",
+       "      <td>41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>32</td>\n",
+       "      <td>164</td>\n",
+       "      <td>73</td>\n",
+       "      <td>203</td>\n",
+       "      <td>3.106000e-49</td>\n",
+       "      <td>183</td>\n",
+       "      <td>c100012_g4</td>\n",
+       "      <td>1.375940</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3108519</th>\n",
+       "      <td>c100014_g2_i1_m.41927</td>\n",
+       "      <td>A0A1D1W000</td>\n",
+       "      <td>0.456</td>\n",
+       "      <td>59</td>\n",
+       "      <td>32</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>62</td>\n",
+       "      <td>118</td>\n",
+       "      <td>176</td>\n",
+       "      <td>4.196000e-04</td>\n",
+       "      <td>52</td>\n",
+       "      <td>c100014_g2</td>\n",
+       "      <td>0.881356</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7530038</th>\n",
+       "      <td>c100023_g1_i1_m.41965</td>\n",
+       "      <td>UPI0009E61F51</td>\n",
+       "      <td>0.342</td>\n",
+       "      <td>90</td>\n",
+       "      <td>57</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>92</td>\n",
+       "      <td>2743</td>\n",
+       "      <td>2829</td>\n",
+       "      <td>4.556000e-04</td>\n",
+       "      <td>53</td>\n",
+       "      <td>c100023_g1</td>\n",
+       "      <td>0.588889</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5277692</th>\n",
+       "      <td>c100036_g1_i4_m.42040</td>\n",
+       "      <td>A0A7S3BMQ4</td>\n",
+       "      <td>0.523</td>\n",
+       "      <td>99</td>\n",
+       "      <td>46</td>\n",
+       "      <td>0</td>\n",
+       "      <td>71</td>\n",
+       "      <td>169</td>\n",
+       "      <td>1</td>\n",
+       "      <td>98</td>\n",
+       "      <td>2.166000e-19</td>\n",
+       "      <td>100</td>\n",
+       "      <td>c100036_g1</td>\n",
+       "      <td>1.010101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8219227</th>\n",
+       "      <td>c99972_g1_i2_m.41694</td>\n",
+       "      <td>UPI0009E28D88</td>\n",
+       "      <td>0.466</td>\n",
+       "      <td>63</td>\n",
+       "      <td>33</td>\n",
+       "      <td>0</td>\n",
+       "      <td>8</td>\n",
+       "      <td>70</td>\n",
+       "      <td>392</td>\n",
+       "      <td>453</td>\n",
+       "      <td>1.130000e-05</td>\n",
+       "      <td>57</td>\n",
+       "      <td>c99972_g1</td>\n",
+       "      <td>0.904762</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6087296</th>\n",
+       "      <td>c99980_g3_i1_m.41746</td>\n",
+       "      <td>UPI00005B2EF3</td>\n",
+       "      <td>0.434</td>\n",
+       "      <td>62</td>\n",
+       "      <td>35</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>63</td>\n",
+       "      <td>7</td>\n",
+       "      <td>68</td>\n",
+       "      <td>4.330000e-05</td>\n",
+       "      <td>51</td>\n",
+       "      <td>c99980_g3</td>\n",
+       "      <td>0.822581</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3238079</th>\n",
+       "      <td>c99987_g1_i2_m.41772</td>\n",
+       "      <td>A0A1X7URN4</td>\n",
+       "      <td>0.529</td>\n",
+       "      <td>220</td>\n",
+       "      <td>97</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>221</td>\n",
+       "      <td>37</td>\n",
+       "      <td>243</td>\n",
+       "      <td>1.868000e-61</td>\n",
+       "      <td>221</td>\n",
+       "      <td>c99987_g1</td>\n",
+       "      <td>1.004545</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>574005</th>\n",
+       "      <td>c99993_g2_i1_m.41786</td>\n",
+       "      <td>UPI00177B4952</td>\n",
+       "      <td>0.663</td>\n",
+       "      <td>137</td>\n",
+       "      <td>41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>137</td>\n",
+       "      <td>23</td>\n",
+       "      <td>145</td>\n",
+       "      <td>1.384000e-49</td>\n",
+       "      <td>183</td>\n",
+       "      <td>c99993_g2</td>\n",
+       "      <td>1.335766</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>574028</th>\n",
+       "      <td>c99993_g3_i1_m.41790</td>\n",
+       "      <td>UPI00106CF215</td>\n",
+       "      <td>0.265</td>\n",
+       "      <td>133</td>\n",
+       "      <td>95</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>135</td>\n",
+       "      <td>93</td>\n",
+       "      <td>222</td>\n",
+       "      <td>5.938000e-04</td>\n",
+       "      <td>52</td>\n",
+       "      <td>c99993_g3</td>\n",
+       "      <td>0.390977</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3347 rows × 14 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         query         target  seq. id.  alignment length  \\\n",
+       "3537669  c100005_g1_i4_m.41851  UPI000C6D6B72     0.262               138   \n",
+       "3711020  c100012_g4_i1_m.41921     A0A4V1IVK5     0.681               133   \n",
+       "3108519  c100014_g2_i1_m.41927     A0A1D1W000     0.456                59   \n",
+       "7530038  c100023_g1_i1_m.41965  UPI0009E61F51     0.342                90   \n",
+       "5277692  c100036_g1_i4_m.42040     A0A7S3BMQ4     0.523                99   \n",
+       "...                        ...            ...       ...               ...   \n",
+       "8219227   c99972_g1_i2_m.41694  UPI0009E28D88     0.466                63   \n",
+       "6087296   c99980_g3_i1_m.41746  UPI00005B2EF3     0.434                62   \n",
+       "3238079   c99987_g1_i2_m.41772     A0A1X7URN4     0.529               220   \n",
+       "574005    c99993_g2_i1_m.41786  UPI00177B4952     0.663               137   \n",
+       "574028    c99993_g3_i1_m.41790  UPI00106CF215     0.265               133   \n",
+       "\n",
+       "         no. mismatches  no. gap open  query start  query end  target start  \\\n",
+       "3537669             100             0           13        150             7   \n",
+       "3711020              41             0           32        164            73   \n",
+       "3108519              32             0            4         62           118   \n",
+       "7530038              57             0            3         92          2743   \n",
+       "5277692              46             0           71        169             1   \n",
+       "...                 ...           ...          ...        ...           ...   \n",
+       "8219227              33             0            8         70           392   \n",
+       "6087296              35             0            2         63             7   \n",
+       "3238079              97             0            2        221            37   \n",
+       "574005               41             0            1        137            23   \n",
+       "574028               95             0            3        135            93   \n",
+       "\n",
+       "         target end       e value  bit score     gene_id  normalized bit score  \n",
+       "3537669         142  1.464000e-04         53  c100005_g1              0.384058  \n",
+       "3711020         203  3.106000e-49        183  c100012_g4              1.375940  \n",
+       "3108519         176  4.196000e-04         52  c100014_g2              0.881356  \n",
+       "7530038        2829  4.556000e-04         53  c100023_g1              0.588889  \n",
+       "5277692          98  2.166000e-19        100  c100036_g1              1.010101  \n",
+       "...             ...           ...        ...         ...                   ...  \n",
+       "8219227         453  1.130000e-05         57   c99972_g1              0.904762  \n",
+       "6087296          68  4.330000e-05         51   c99980_g3              0.822581  \n",
+       "3238079         243  1.868000e-61        221   c99987_g1              1.004545  \n",
+       "574005          145  1.384000e-49        183   c99993_g2              1.335766  \n",
+       "574028          222  5.938000e-04         52   c99993_g3              0.390977  \n",
+       "\n",
+       "[3347 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "mmseqs_filtered[~mmseqs_filtered['target'].isin(sensitive['uniprot'])]"
+    "hhblits_filtered[~hhblits_filtered['target'].isin(sensitive['uniprot'])]"
   ]
  },
  {

 %% Cell type:code id:273c6a8d tags:

 ``` python
 from datetime import datetime, timezone
 import pytz

 utc_dt = datetime.now(timezone.utc) # UTC time
 dt = utc_dt.astimezone()
 tz = pytz.timezone('Europe/Berlin')
 berlin_now = datetime.now(tz)
 print(f'{berlin_now:%Y-%m-%d %H:%M}')
 ```

 %% Output

-    2022-07-29 14:01
+    2022-07-29 15:06

 %% Cell type:code id:09defaef-09b9-4d8e-84a1-dc3c7a56b80a tags:

 ``` python
 import glob
 from os.path import exists
 from tqdm import tqdm
 import requests

 import pandas as pd
 import numpy as np

 import matplotlib.pyplot as plt
 import seaborn as sns
 ```

 %% Cell type:code id:8aaa0695 tags:

 ``` python
 structural_annotation = pd.read_parquet('../old_data/structure_annotation.parquet')
 sequence_annotation = pd.read_csv('../old_data/Slacustris_eggnog.tsv', sep='\t')
 ```

 %% Cell type:code id:95457088 tags:

 ``` python
 mmseqs = pd.read_csv('../data/cfres_mmseqs_s75_e1.m8', sep="\s+", header=None)
 mmseqs.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
                       "no. gap open", "query start", "query end", "target start", "target end",
                       "e value", "bit score"]

 mmseqs['gene_id'] = mmseqs['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))
 mmseqs['normalized bit score'] = mmseqs['bit score'] / mmseqs['alignment length']
 ```

 %% Cell type:code id:093bff38 tags:

 ``` python
 def best_bit_score(df, sort_by='normalized bit score', tiebreak='alignment length'):
    have_max = df[sort_by] == np.max(df[sort_by])
    max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])
    return df[have_max][max_ali].index.values[0]

 def keep_best(df, groupby='gene_id'):
    df.reset_index(inplace=True)
    idx = df.groupby(groupby).apply(best_bit_score)
    res = df.loc[idx].copy()
    return res
 ```

 %% Cell type:code id:042ff08e tags:

 ``` python
 mmseqs_filtered = keep_best(mmseqs)
 del mmseqs
 ```

 %% Cell type:code id:9645fec8 tags:

 ``` python
 hhblits = pd.read_csv('../data/cfres_hhblits_e1.m8', sep="\s+", header=None)
 hhblits.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
                       "no. gap open", "query start", "query end", "target start", "target end",
                       "e value", "bit score"]

 hhblits['gene_id'] = hhblits['query'].str.split('_').str[:2].apply(lambda x: '_'.join(x))
 hhblits['normalized bit score'] = hhblits['bit score'] / hhblits['alignment length']
 ```

 %% Cell type:code id:7e9d0601 tags:

 ``` python
 hhblits_filtered = keep_best(hhblits)
 del hhblits
 ```

+%% Cell type:code id:da58c74f-5cd2-435a-8281-6195ed7f15af tags:
+
+``` python
+mmseqs_filtered.drop(columns=['index'], inplace=True)
+hhblits_filtered.drop(columns=['index'], inplace=True)
+```
+
 %% Cell type:code id:260c26ec tags:

 ``` python
 sequence_annotation.columns
 ```

 %% Output

    Index(['evalue', 'score', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',
           'Description', 'Preferred_name', 'GOs', 'PFAMs', 'gene_id',
           'protein_id'],
          dtype='object')

 %% Cell type:code id:c92b4a01 tags:

 ``` python
 structural_annotation['normalized bit score'] = structural_annotation['bit score'] / structural_annotation['alignment length']
 ```

 %% Cell type:code id:5ceaaff2 tags:

 ``` python
 fig, ax = plt.subplots()
 ax.hist(mmseqs_filtered['normalized bit score'], bins=100, label='mmseqs', alpha=0.3, density=True);
 ax.hist(hhblits_filtered['normalized bit score'], bins=100, label='hhblits', alpha=0.3, density=True);
 ax.hist(structural_annotation['normalized bit score'], bins=100, label='coffe', alpha=0.3, density=True);
 ax.legend()
 ```

 %% Output

-    <matplotlib.legend.Legend at 0x7f58ab679250>
+    <matplotlib.legend.Legend at 0x7f7975465070>

-
+

 %% Cell type:code id:7649fb1a tags:

 ``` python
 unique_up_id = pd.concat([hhblits_filtered['target'].drop_duplicates(),
                          mmseqs_filtered['target'].drop_duplicates()])
 unique_up_id.drop_duplicates(inplace=True)
 ```

 %% Cell type:code id:e8d0cfb7-c86d-41a6-99e7-d514bc1f3873 tags:

 ``` python
 request_size = 500
 no_chunks = np.ceil((len(unique_up_id) / request_size) * 3).astype(int)
 no_chunks
 ```

 %% Output

    148

 %% Cell type:code id:85231f72-12ad-4cc7-897a-544e0be27228 tags:

 ``` python
 request_size = 500
 no_chunks = np.ceil((len(unique_up_id) / request_size) * 3).astype(int)

 with open('../data/profile_sequences.fasta', 'w') as result:
    for i in tqdm(range(no_chunks)):
        a = (i * request_size) // 3
        b = ((i+1) * request_size) // 3
        chunk = unique_up_id[a:b]
        bait50 = ['UniRef50_' + c for c in chunk]
        bait90 = ['UniRef90_' + c for c in chunk]
        bait100 = ['UniRef100_' + c for c in chunk]
        expanded_chunk = bait50 + bait90 + bait100
        url = f"https://rest.uniprot.org/uniref/ids?ids={','.join(expanded_chunk)}&format=fasta"
        response = requests.get(url)
        result.write(response.text)
 ```

 %% Output

-    100%|██████████| 148/148 [01:19<00:00,  1.86it/s]
+    100%|██████████| 148/148 [01:45<00:00,  1.41it/s]

-%% Cell type:code id:da58c74f-5cd2-435a-8281-6195ed7f15af tags:
+%% Cell type:markdown id:c3d17ee4-6176-443a-b320-a54aaa7ff8cd tags:

-``` python
-mmseqs_filtered.drop(columns=['index'], inplace=True)
-hhblits_filtered.drop(columns=['index'], inplace=True)
-```
+(wait for EggNOG-mapper to annotate the sequences)

 %% Cell type:code id:eb0c48d8-ef71-440f-8ea0-6e7f3118f957 tags:

 ``` python
-file = '/g/arendt/npapadop/data/spongfold_publish/'
+file = '/g/arendt/npapadop/data/spongfold_publish/MM_ffr33uy6.emapper.annotations.tsv'
 sensitive = pd.read_csv(file, sep='\t', skiprows=4, skipfooter=3, engine='python')
 ```

+%% Cell type:code id:0a97dfb7-4783-44fa-a330-f07b0585fcf8 tags:
+
+``` python
+test = sensitive['#query'].str.split('_').str[1]
+```
+
+%% Cell type:code id:942d4a2d-2a8b-44ff-89b4-8525f3ad926e tags:
+
+``` python
+test.value_counts()
+```
+
+%% Output
+
+    A0A6A6LEY7    3
+    A0A1X7U3S9    3
+    A0A3N5U8U0    3
+    A0A6J0B743    3
+    A0A482W475    3
+                 ..
+    A0A6I8SCD7    1
+    A0A1A8MN43    1
+    A0A0C2FDV2    1
+    A0A2J8INC7    1
+    A0A0K0D6S2    1
+    Name: #query, Length: 20360, dtype: int64
+
+%% Cell type:code id:909804dd-4006-4dcc-a4ba-308a5b3975d0 tags:
+
+``` python
+sensitive[sensitive['#query'].str.contains('UPI00005B2EF3')].T
+```
+
+%% Output
+
+    Empty DataFrame
+    Columns: []
+    Index: [#query, seed_ortholog, evalue, score, eggNOG_OGs, max_annot_lvl, COG_category, Description, Preferred_name, GOs, EC, KEGG_ko, KEGG_Pathway, KEGG_Module, KEGG_Reaction, KEGG_rclass, BRITE, KEGG_TC, CAZy, BiGG_Reaction, PFAMs]
+
+%% Cell type:code id:e6d1eb54-d942-465e-a868-517d442fe1dd tags:
+
+``` python
+```
+
 %% Cell type:code id:1c156c27-d2ec-47d6-b764-2b54816922a0 tags:

 ``` python
-sensitive['uniprot'] = sensitive['#query'].str.split('|').str[1]
+sensitive['uniprot'] = sensitive['#query'].str.split('_').str[1]
 sensitive.reset_index(drop=True, inplace=True)
 # remove unnecessary columns
 dead_weight = ['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',
               'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',
               'KEGG_TC', 'CAZy', 'BiGG_Reaction']
 sensitive.drop(dead_weight, axis=1, inplace=True)
 # convert rest to categorical to save space
 to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',
                  'Description', 'Preferred_name', 'GOs', 'PFAMs']
 sensitive[to_categorical] = sensitive[to_categorical].astype("category")
 # finally save in parquet format
+sensitive.drop_duplicates(inplace=True)
 sensitive.to_parquet('../data/uniprot_profiles.parquet')
 ```

-%% Cell type:code id:821c79ed-2090-45c8-bc5c-736ba1096989 tags:
+%% Cell type:code id:d68690ea-060a-480f-802c-aac6baae570d tags:

 ``` python
-response
+sensitive.shape
 ```

+%% Output
+
+    (20360, 10)
+
+%% Cell type:code id:f60ac7bf-f031-4bd6-99f3-eb3117b50cf1 tags:
+
+``` python
+unique_up_id
+```
+
+%% Output
+
+    7417224       A0A409VGI9
+    7453017       A0A6A6LEY7
+    3711344       A0A1A8QDB6
+    5432158       A0A1X7UJF6
+    1658532           S9X2M4
+                   ...
+    144364        A0A2F0BPH8
+    144949            L1J317
+    4535244       A0A0K0D6S2
+    2883206    UPI0003F0A19A
+    2883636       A0A7S2RZY7
+    Name: target, Length: 24540, dtype: object
+
 %% Cell type:code id:fd08dceb-c307-4ad6-bd23-91b70aaf7ef4 tags:

 ``` python
-mmseqs_filtered[~mmseqs_filtered['target'].isin(sensitive['uniprot'])]
+hhblits_filtered[~hhblits_filtered['target'].isin(sensitive['uniprot'])]
 ```

+%% Output
+
+                             query         target  seq. id.  alignment length  \
+    3537669  c100005_g1_i4_m.41851  UPI000C6D6B72     0.262               138
+    3711020  c100012_g4_i1_m.41921     A0A4V1IVK5     0.681               133
+    3108519  c100014_g2_i1_m.41927     A0A1D1W000     0.456                59
+    7530038  c100023_g1_i1_m.41965  UPI0009E61F51     0.342                90
+    5277692  c100036_g1_i4_m.42040     A0A7S3BMQ4     0.523                99
+    ...                        ...            ...       ...               ...
+    8219227   c99972_g1_i2_m.41694  UPI0009E28D88     0.466                63
+    6087296   c99980_g3_i1_m.41746  UPI00005B2EF3     0.434                62
+    3238079   c99987_g1_i2_m.41772     A0A1X7URN4     0.529               220
+    574005    c99993_g2_i1_m.41786  UPI00177B4952     0.663               137
+    574028    c99993_g3_i1_m.41790  UPI00106CF215     0.265               133
+    
+             no. mismatches  no. gap open  query start  query end  target start  \
+    3537669             100             0           13        150             7
+    3711020              41             0           32        164            73
+    3108519              32             0            4         62           118
+    7530038              57             0            3         92          2743
+    5277692              46             0           71        169             1
+    ...                 ...           ...          ...        ...           ...
+    8219227              33             0            8         70           392
+    6087296              35             0            2         63             7
+    3238079              97             0            2        221            37
+    574005               41             0            1        137            23
+    574028               95             0            3        135            93
+    
+             target end       e value  bit score     gene_id  normalized bit score
+    3537669         142  1.464000e-04         53  c100005_g1              0.384058
+    3711020         203  3.106000e-49        183  c100012_g4              1.375940
+    3108519         176  4.196000e-04         52  c100014_g2              0.881356
+    7530038        2829  4.556000e-04         53  c100023_g1              0.588889
+    5277692          98  2.166000e-19        100  c100036_g1              1.010101
+    ...             ...           ...        ...         ...                   ...
+    8219227         453  1.130000e-05         57   c99972_g1              0.904762
+    6087296          68  4.330000e-05         51   c99980_g3              0.822581
+    3238079         243  1.868000e-61        221   c99987_g1              1.004545
+    574005          145  1.384000e-49        183   c99993_g2              1.335766
+    574028          222  5.938000e-04         52   c99993_g3              0.390977
+    
+    [3347 rows x 14 columns]
+
 %% Cell type:code id:c0f35e75-1a3f-4bab-8afe-160d3347738f tags:

 ``` python
 mmseqs_filtered
 ```

 %% Cell type:code id:f99bd75f-ae5a-4d00-8a01-5588c711f45c tags:

 ``` python
 foldseek_full = pd.read_parquet('../old_data/fs_targets.parquet')
 ```

 %% Cell type:code id:a1359934-a10c-4301-9ffa-000acb4159e4 tags:

 ``` python
 mmseqs_filtered = mmseqs_filtered.merge(sensitive, on='uniprot', how='left')
 ```

 %% Cell type:code id:72eed180-8cb0-4f76-b316-f5717f56b026 tags:

 ``` python

 hhblits = hhblits.merge(sensitive, on='uniprot', how='left')
 ```

 %% Cell type:code id:e5b356e4-10f9-4782-a39c-4f19a221ebdb tags:

 ``` python
 ```

--- a/analysis/analysis.ipynb
+++ b/analysis/analysis.ipynb
--- a/analysis/io-get_uniprot_sequences.ipynb
+++ b/analysis/io-get_uniprot_sequences.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a9b30dc7-46a5-47f9-9a33-d74c135683eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2022-08-01 11:33\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "import pytz\n",
+    "\n",
+    "utc_dt = datetime.now(timezone.utc) # UTC time\n",
+    "dt = utc_dt.astimezone()\n",
+    "tz = pytz.timezone('Europe/Berlin')\n",
+    "berlin_now = datetime.now(tz)\n",
+    "print(f'{berlin_now:%Y-%m-%d %H:%M}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "715dbc9f-7bdb-4dfd-a345-2b93d78ce54a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "from os.path import exists\n",
+    "from tqdm import tqdm\n",
+    "import requests\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5757388f-4df8-4f11-81de-eeb41a0741a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdb = pd.read_parquet('../data/pdb_tmp.parquet')\n",
+    "swp = pd.read_parquet('../data/swp_tmp.parquet')\n",
+    "afdb = pd.read_parquet('../data/afdb_tmp.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5a74880c-91e0-4415-8326-44bf37eabd9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unique_up_id = pd.concat([pdb['uniprot'].drop_duplicates(),\n",
+    "                          swp['uniprot'].drop_duplicates(),\n",
+    "                          afdb['uniprot'].drop_duplicates()])\n",
+    "unique_up_id.drop_duplicates(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "101dbd46-ca76-4143-a877-0a9418cabcd4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "request_size = 500"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b7c254f6-6b23-4f1c-8cb7-c945aafb9f7c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 90%|█████████ | 8429/9330 [52:48<04:45,  3.16it/s]  IOPub message rate exceeded.\n",
+      "The notebook server will temporarily stop sending output\n",
+      "to the client in order to avoid crashing it.\n",
+      "To change this limit, set the config variable\n",
+      "`--NotebookApp.iopub_msg_rate_limit`.\n",
+      "\n",
+      "Current values:\n",
+      "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
+      "NotebookApp.rate_limit_window=3.0 (secs)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "request_size = 100\n",
+    "no_chunks = np.ceil(len(unique_up_id) / request_size).astype(int)\n",
+    "\n",
+    "with open('../data/uniprotinfo.fasta', 'w') as result:\n",
+    "    for i in tqdm(range(no_chunks)):\n",
+    "        a = i * request_size\n",
+    "        b = (i+1) * request_size\n",
+    "        chunk = [str(c) for c in unique_up_id[a:b]]\n",
+    "        url = f\"https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession={','.join(chunk)}&format=fasta\"\n",
+    "        response = requests.get(url)\n",
+    "        result.write(response.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "0a4ca4a1-f008-4c41-95e5-7226f8f11389",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "../data/uniprot_sequences.fasta: 928687 sequences, 368383841 bp => dividing into 10 parts of <= 100000 sequences\n",
+      "All done, 20 seconds elapsed\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%bash\n",
+    "mkdir ../data/uniprot_fastas/ -p\n",
+    "perl ../scripts/fasta-splitter.pl --part-size 100000 ../data/uniprotinfo.fasta --nopad --measure count --out-dir ../data/uniprot_fastas/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd368a7f-a2b5-489e-9f55-0a9854e457c0",
+   "metadata": {},
+   "source": [
+    "(submit to EggNOG-mapper and wait for it to annotate the sequences, then download files and rename)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:code id:a9b30dc7-46a5-47f9-9a33-d74c135683eb tags:
+
+``` python
+from datetime import datetime, timezone
+import pytz
+
+utc_dt = datetime.now(timezone.utc) # UTC time
+dt = utc_dt.astimezone()
+tz = pytz.timezone('Europe/Berlin')
+berlin_now = datetime.now(tz)
+print(f'{berlin_now:%Y-%m-%d %H:%M}')
+```
+
+%% Output
+
+    2022-08-01 11:33
+
+%% Cell type:code id:715dbc9f-7bdb-4dfd-a345-2b93d78ce54a tags:
+
+``` python
+import glob
+from os.path import exists
+from tqdm import tqdm
+import requests
+
+import pandas as pd
+import numpy as np
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+```
+
+%% Cell type:code id:5757388f-4df8-4f11-81de-eeb41a0741a3 tags:
+
+``` python
+pdb = pd.read_parquet('../data/pdb_tmp.parquet')
+swp = pd.read_parquet('../data/swp_tmp.parquet')
+afdb = pd.read_parquet('../data/afdb_tmp.parquet')
+```
+
+%% Cell type:code id:5a74880c-91e0-4415-8326-44bf37eabd9d tags:
+
+``` python
+unique_up_id = pd.concat([pdb['uniprot'].drop_duplicates(),
+                          swp['uniprot'].drop_duplicates(),
+                          afdb['uniprot'].drop_duplicates()])
+unique_up_id.drop_duplicates(inplace=True)
+```
+
+%% Cell type:code id:101dbd46-ca76-4143-a877-0a9418cabcd4 tags:
+
+``` python
+request_size = 500
+```
+
+%% Cell type:code id:b7c254f6-6b23-4f1c-8cb7-c945aafb9f7c tags:
+
+``` python
+request_size = 100
+no_chunks = np.ceil(len(unique_up_id) / request_size).astype(int)
+
+with open('../data/uniprotinfo.fasta', 'w') as result:
+    for i in tqdm(range(no_chunks)):
+        a = i * request_size
+        b = (i+1) * request_size
+        chunk = [str(c) for c in unique_up_id[a:b]]
+        url = f"https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession={','.join(chunk)}&format=fasta"
+        response = requests.get(url)
+        result.write(response.text)
+```
+
+%% Output
+
+     90%|█████████ | 8429/9330 [52:48<04:45,  3.16it/s]  IOPub message rate exceeded.
+    The notebook server will temporarily stop sending output
+    to the client in order to avoid crashing it.
+    To change this limit, set the config variable
+    `--NotebookApp.iopub_msg_rate_limit`.
+    
+    Current values:
+    NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
+    NotebookApp.rate_limit_window=3.0 (secs)
+    
+
+%% Cell type:code id:0a4ca4a1-f008-4c41-95e5-7226f8f11389 tags:
+
+``` python
+%%bash
+mkdir ../data/uniprot_fastas/ -p
+perl ../scripts/fasta-splitter.pl --part-size 100000 ../data/uniprotinfo.fasta --nopad --measure count --out-dir ../data/uniprot_fastas/
+```
+
+%% Output
+
+    ../data/uniprot_sequences.fasta: 928687 sequences, 368383841 bp => dividing into 10 parts of <= 100000 sequences
+    All done, 20 seconds elapsed
+
+%% Cell type:markdown id:cd368a7f-a2b5-489e-9f55-0a9854e457c0 tags:
+
+(submit to EggNOG-mapper and wait for it to annotate the sequences, then download files and rename)
--- a/analysis/process_pdb.ipynb
+++ b/analysis/process_pdb.ipynb
@@ -631,7 +631,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')"
+    "pdb.to_parquet('../data/pdb_tmp.parquet')"
   ]
  }
 ],

 %% Cell type:code id:5b92f263-711c-4cdf-8b11-5d1defa4a0d7 tags:

 ``` python
 from tqdm import tqdm
 import re
 import time
 import json
 import zlib
 from xml.etree import ElementTree
 from urllib.parse import urlparse, parse_qs, urlencode
 import requests
 from requests.adapters import HTTPAdapter, Retry

 import numpy as np
 import pandas as pd
 ```

 %% Cell type:code id:a4e75e96-4f37-46a9-af3e-44914e6114d3 tags:

 ``` python
 def read_foldseek(result_path):
    df = pd.read_csv(result_path, sep="\s+", header=None)
    df.columns = ["query", "target", "seq. id.", "alignment length", "no. mismatches",
                   "no. gap open", "query start", "query end", "target start", "target end",
                   "e value", "bit score"]
    df["query"] = df["query"].str.split("_").str[0]
    df["corrected bit score"] = df["bit score"] / df["alignment length"]
    if "pdb" in result_path:
        df["target"] = df["target"].str.split(".").str[0]
    else:
        df["target"] = df["target"].str.split("-").str[:3].str.join("-")
    return df

 def get_aligned_plddt(df, plddt, name_dict):
    aligned_plddt = [0.] * len(df)
    for e in tqdm(df.iterrows()):
        index, row = e
        query = row['query']
        isoform = name_dict[query]
        protein = plddt[isoform]
        start = row['query start'] - 1
        end = row['query end'] - 1
        aligned_plddt[index] = np.mean(protein[start:end])
    return aligned_plddt
 ```

 %% Cell type:code id:d8fda205-90f8-4941-b77e-39f3e10d6ab2 tags:

 ``` python
 # FOLDSEEK scores
 fs_pdb = '/g/arendt/npapadop/repos/coffe/data/pdb_score.tsv'
 # per-residue pLDDT score
 plddt = np.load('/g/arendt/npapadop/repos/coffe/data/spongilla_plddt.npz')

 sequence_info = pd.read_csv("/g/arendt/npapadop/repos/coffe/data/sequence_info.csv")
 query_to_isoform = sequence_info[['query', 'isoform']].set_index('query').to_dict()['isoform']
 ```

 %% Cell type:code id:a8f277d7-94f6-4316-859b-6162c639ee73 tags:

 ``` python
 pdb = read_foldseek(fs_pdb)
 pdb["query"] = pdb["query"].values.astype(int)

 pdb['aligned_plddt'] = get_aligned_plddt(pdb, plddt, query_to_isoform)
 ```

 %% Output

    1470483it [25:34, 958.34it/s]

 %% Cell type:markdown id:0ddf8b37-4f85-4076-be68-32be64e02df3 tags:

 Write out the unique PDB IDs in a file and submit it to the UniProt ID mapper:

 %% Cell type:code id:0b6e404b-a754-4779-a7fa-a6ea683c7abb tags:

 ``` python
 pd.Series(pdb['target'].unique()).to_csv('../data/pdb_ids.csv', header=False, index=False)
 ```

 %% Cell type:markdown id:49ed1f9c-8225-46a8-9abe-d6fb568cb175 tags:

 (retrieve result link)

 %% Cell type:code id:a23cc67a-1d9b-4c78-bcec-be7bd965318f tags:

 ``` python
 url = "https://rest.uniprot.org/idmapping/results/8f35e821d58602a990b260ae64cf77fb1b1f23d4?compressed=true&fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength&format=tsv&size=500"
 ```

 %% Cell type:markdown id:3e9c9787-5021-4e3e-96e7-8cc58d6820e1 tags:

 Use the UniProt API functions to retrieve the mapping results, since the download link seems to be broken?

 %% Cell type:code id:2048ee0c-7761-4635-9d0b-c5ce69f53384 tags:

 ``` python
 POLLING_INTERVAL = 3

 API_URL = "https://rest.uniprot.org"


 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
 session = requests.Session()
 session.mount("https://", HTTPAdapter(max_retries=retries))


 def get_next_link(headers):
    re_next_link = re.compile(r'<(.+)>; rel="next"')
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)


 def get_batch(batch_response, file_format, compressed):
    batch_url = get_next_link(batch_response.headers)
    while batch_url:
        batch_response = session.get(batch_url)
        batch_response.raise_for_status()
        yield decode_results(batch_response, file_format, compressed)
        batch_url = get_next_link(batch_response.headers)


 def combine_batches(all_results, batch_results, file_format):
    if file_format == "json":
        for key in ("results", "failedIds"):
            if key in batch_results and batch_results[key]:
                all_results[key] += batch_results[key]
    elif file_format == "tsv":
        return all_results + batch_results[1:]
    else:
        return all_results + batch_results
    return all_results


 def decode_results(response, file_format, compressed):
    if compressed:
        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "json":
            j = json.loads(decompressed.decode("utf-8"))
            return j
        elif file_format == "tsv":
            return [line for line in decompressed.decode("utf-8").split("\n") if line]
        elif file_format == "xlsx":
            return [decompressed]
        elif file_format == "xml":
            return [decompressed.decode("utf-8")]
        else:
            return decompressed.decode("utf-8")
    elif file_format == "json":
        return response.json()
    elif file_format == "tsv":
        return [line for line in response.text.split("\n") if line]
    elif file_format == "xlsx":
        return [response.content]
    elif file_format == "xml":
        return [response.text]
    return response.text


 def get_xml_namespace(element):
    m = re.match(r"\{(.*)\}", element.tag)
    return m.groups()[0] if m else ""


 def merge_xml_results(xml_results):
    merged_root = ElementTree.fromstring(xml_results[0])
    for result in xml_results[1:]:
        root = ElementTree.fromstring(result)
        for child in root.findall("{http://uniprot.org/uniprot}entry"):
            merged_root.insert(-1, child)
    ElementTree.register_namespace("", get_xml_namespace(merged_root[0]))
    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)


 def print_progress_batches(batch_index, size, total):
    n_fetched = min((batch_index + 1) * size, total)
    print(f"Fetched: {n_fetched} / {total}")


 def get_id_mapping_results_search(url):
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    file_format = query["format"][0] if "format" in query else "json"
    if "size" in query:
        size = int(query["size"][0])
    else:
        size = 500
        query["size"] = size
    compressed = (
        query["compressed"][0].lower() == "true" if "compressed" in query else False
    )
    parsed = parsed._replace(query=urlencode(query, doseq=True))
    url = parsed.geturl()
    request = session.get(url)
    request.raise_for_status()
    results = decode_results(request, file_format, compressed)
    total = int(request.headers["x-total-results"])
    print_progress_batches(0, size, total)
    for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
        results = combine_batches(results, batch, file_format)
        print_progress_batches(i, size, total)
    if file_format == "xml":
        return merge_xml_results(results)
    return results
 ```

 %% Cell type:code id:9d46cd87-8f41-4d2e-a36c-1f06e86315b6 tags:

 ``` python
 results = get_id_mapping_results_search(url)
 ```

 %% Output

    Fetched: 500 / 144380
    Fetched: 1000 / 144380
    Fetched: 1500 / 144380
    Fetched: 2000 / 144380
    Fetched: 2500 / 144380
    Fetched: 3000 / 144380
    Fetched: 3500 / 144380
    Fetched: 4000 / 144380
    Fetched: 4500 / 144380
    Fetched: 5000 / 144380
    Fetched: 5500 / 144380
    Fetched: 6000 / 144380
    Fetched: 6500 / 144380
    Fetched: 7000 / 144380
    Fetched: 7500 / 144380
    Fetched: 8000 / 144380
    Fetched: 8500 / 144380
    Fetched: 9000 / 144380
    Fetched: 9500 / 144380
    Fetched: 10000 / 144380
    Fetched: 10500 / 144380
    Fetched: 11000 / 144380
    Fetched: 11500 / 144380
    Fetched: 12000 / 144380
    Fetched: 12500 / 144380
    Fetched: 13000 / 144380
    Fetched: 13500 / 144380
    Fetched: 14000 / 144380
    Fetched: 14500 / 144380
    Fetched: 15000 / 144380
    Fetched: 15500 / 144380
    Fetched: 16000 / 144380
    Fetched: 16500 / 144380
    Fetched: 17000 / 144380
    Fetched: 17500 / 144380
    Fetched: 18000 / 144380
    Fetched: 18500 / 144380
    Fetched: 19000 / 144380
    Fetched: 19500 / 144380
    Fetched: 20000 / 144380
    Fetched: 20500 / 144380
    Fetched: 21000 / 144380
    Fetched: 21500 / 144380
    Fetched: 22000 / 144380
    Fetched: 22500 / 144380
    Fetched: 23000 / 144380
    Fetched: 23500 / 144380
    Fetched: 24000 / 144380
    Fetched: 24500 / 144380
    Fetched: 25000 / 144380
    Fetched: 25500 / 144380
    Fetched: 26000 / 144380
    Fetched: 26500 / 144380
    Fetched: 27000 / 144380
    Fetched: 27500 / 144380
    Fetched: 28000 / 144380
    Fetched: 28500 / 144380
    Fetched: 29000 / 144380
    Fetched: 29500 / 144380
    Fetched: 30000 / 144380
    Fetched: 30500 / 144380
    Fetched: 31000 / 144380
    Fetched: 31500 / 144380
    Fetched: 32000 / 144380
    Fetched: 32500 / 144380
    Fetched: 33000 / 144380
    Fetched: 33500 / 144380
    Fetched: 34000 / 144380
    Fetched: 34500 / 144380
    Fetched: 35000 / 144380
    Fetched: 35500 / 144380
    Fetched: 36000 / 144380
    Fetched: 36500 / 144380
    Fetched: 37000 / 144380
    Fetched: 37500 / 144380
    Fetched: 38000 / 144380
    Fetched: 38500 / 144380
    Fetched: 39000 / 144380
    Fetched: 39500 / 144380
    Fetched: 40000 / 144380
    Fetched: 40500 / 144380
    Fetched: 41000 / 144380
    Fetched: 41500 / 144380
    Fetched: 42000 / 144380
    Fetched: 42500 / 144380
    Fetched: 43000 / 144380
    Fetched: 43500 / 144380
    Fetched: 44000 / 144380
    Fetched: 44500 / 144380
    Fetched: 45000 / 144380
    Fetched: 45500 / 144380
    Fetched: 46000 / 144380
    Fetched: 46500 / 144380
    Fetched: 47000 / 144380
    Fetched: 47500 / 144380
    Fetched: 48000 / 144380
    Fetched: 48500 / 144380
    Fetched: 49000 / 144380
    Fetched: 49500 / 144380
    Fetched: 50000 / 144380
    Fetched: 50500 / 144380
    Fetched: 51000 / 144380
    Fetched: 51500 / 144380
    Fetched: 52000 / 144380
    Fetched: 52500 / 144380
    Fetched: 53000 / 144380
    Fetched: 53500 / 144380
    Fetched: 54000 / 144380
    Fetched: 54500 / 144380
    Fetched: 55000 / 144380
    Fetched: 55500 / 144380
    Fetched: 56000 / 144380
    Fetched: 56500 / 144380
    Fetched: 57000 / 144380
    Fetched: 57500 / 144380
    Fetched: 58000 / 144380
    Fetched: 58500 / 144380
    Fetched: 59000 / 144380
    Fetched: 59500 / 144380
    Fetched: 60000 / 144380
    Fetched: 60500 / 144380
    Fetched: 61000 / 144380
    Fetched: 61500 / 144380
    Fetched: 62000 / 144380
    Fetched: 62500 / 144380
    Fetched: 63000 / 144380
    Fetched: 63500 / 144380
    Fetched: 64000 / 144380
    Fetched: 64500 / 144380
    Fetched: 65000 / 144380
    Fetched: 65500 / 144380
    Fetched: 66000 / 144380
    Fetched: 66500 / 144380
    Fetched: 67000 / 144380
    Fetched: 67500 / 144380
    Fetched: 68000 / 144380
    Fetched: 68500 / 144380
    Fetched: 69000 / 144380
    Fetched: 69500 / 144380
    Fetched: 70000 / 144380
    Fetched: 70500 / 144380
    Fetched: 71000 / 144380
    Fetched: 71500 / 144380
    Fetched: 72000 / 144380
    Fetched: 72500 / 144380
    Fetched: 73000 / 144380
    Fetched: 73500 / 144380
    Fetched: 74000 / 144380
    Fetched: 74500 / 144380
    Fetched: 75000 / 144380
    Fetched: 75500 / 144380
    Fetched: 76000 / 144380
    Fetched: 76500 / 144380
    Fetched: 77000 / 144380
    Fetched: 77500 / 144380
    Fetched: 78000 / 144380
    Fetched: 78500 / 144380
    Fetched: 79000 / 144380
    Fetched: 79500 / 144380
    Fetched: 80000 / 144380
    Fetched: 80500 / 144380
    Fetched: 81000 / 144380
    Fetched: 81500 / 144380
    Fetched: 82000 / 144380
    Fetched: 82500 / 144380
    Fetched: 83000 / 144380
    Fetched: 83500 / 144380
    Fetched: 84000 / 144380
    Fetched: 84500 / 144380
    Fetched: 85000 / 144380
    Fetched: 85500 / 144380
    Fetched: 86000 / 144380
    Fetched: 86500 / 144380
    Fetched: 87000 / 144380
    Fetched: 87500 / 144380
    Fetched: 88000 / 144380
    Fetched: 88500 / 144380
    Fetched: 89000 / 144380
    Fetched: 89500 / 144380
    Fetched: 90000 / 144380
    Fetched: 90500 / 144380
    Fetched: 91000 / 144380
    Fetched: 91500 / 144380
    Fetched: 92000 / 144380
    Fetched: 92500 / 144380
    Fetched: 93000 / 144380
    Fetched: 93500 / 144380
    Fetched: 94000 / 144380
    Fetched: 94500 / 144380
    Fetched: 95000 / 144380
    Fetched: 95500 / 144380
    Fetched: 96000 / 144380
    Fetched: 96500 / 144380
    Fetched: 97000 / 144380
    Fetched: 97500 / 144380
    Fetched: 98000 / 144380
    Fetched: 98500 / 144380
    Fetched: 99000 / 144380
    Fetched: 99500 / 144380
    Fetched: 100000 / 144380
    Fetched: 100500 / 144380
    Fetched: 101000 / 144380
    Fetched: 101500 / 144380
    Fetched: 102000 / 144380
    Fetched: 102500 / 144380
    Fetched: 103000 / 144380
    Fetched: 103500 / 144380
    Fetched: 104000 / 144380
    Fetched: 104500 / 144380
    Fetched: 105000 / 144380
    Fetched: 105500 / 144380
    Fetched: 106000 / 144380
    Fetched: 106500 / 144380
    Fetched: 107000 / 144380
    Fetched: 107500 / 144380
    Fetched: 108000 / 144380
    Fetched: 108500 / 144380
    Fetched: 109000 / 144380
    Fetched: 109500 / 144380
    Fetched: 110000 / 144380
    Fetched: 110500 / 144380
    Fetched: 111000 / 144380
    Fetched: 111500 / 144380
    Fetched: 112000 / 144380
    Fetched: 112500 / 144380
    Fetched: 113000 / 144380
    Fetched: 113500 / 144380
    Fetched: 114000 / 144380
    Fetched: 114500 / 144380
    Fetched: 115000 / 144380
    Fetched: 115500 / 144380
    Fetched: 116000 / 144380
    Fetched: 116500 / 144380
    Fetched: 117000 / 144380
    Fetched: 117500 / 144380
    Fetched: 118000 / 144380
    Fetched: 118500 / 144380
    Fetched: 119000 / 144380
    Fetched: 119500 / 144380
    Fetched: 120000 / 144380
    Fetched: 120500 / 144380
    Fetched: 121000 / 144380
    Fetched: 121500 / 144380
    Fetched: 122000 / 144380
    Fetched: 122500 / 144380
    Fetched: 123000 / 144380
    Fetched: 123500 / 144380
    Fetched: 124000 / 144380
    Fetched: 124500 / 144380
    Fetched: 125000 / 144380
    Fetched: 125500 / 144380
    Fetched: 126000 / 144380
    Fetched: 126500 / 144380
    Fetched: 127000 / 144380
    Fetched: 127500 / 144380
    Fetched: 128000 / 144380
    Fetched: 128500 / 144380
    Fetched: 129000 / 144380
    Fetched: 129500 / 144380
    Fetched: 130000 / 144380
    Fetched: 130500 / 144380
    Fetched: 131000 / 144380
    Fetched: 131500 / 144380
    Fetched: 132000 / 144380
    Fetched: 132500 / 144380
    Fetched: 133000 / 144380
    Fetched: 133500 / 144380
    Fetched: 134000 / 144380
    Fetched: 134500 / 144380
    Fetched: 135000 / 144380
    Fetched: 135500 / 144380
    Fetched: 136000 / 144380
    Fetched: 136500 / 144380
    Fetched: 137000 / 144380
    Fetched: 137500 / 144380
    Fetched: 138000 / 144380
    Fetched: 138500 / 144380
    Fetched: 139000 / 144380
    Fetched: 139500 / 144380
    Fetched: 140000 / 144380
    Fetched: 140500 / 144380
    Fetched: 141000 / 144380
    Fetched: 141500 / 144380
    Fetched: 142000 / 144380
    Fetched: 142500 / 144380
    Fetched: 143000 / 144380
    Fetched: 143500 / 144380
    Fetched: 144000 / 144380
    Fetched: 144380 / 144380

 %% Cell type:markdown id:7ae2cf71-0cde-4595-9c27-3be502c7de2b tags:

 Parse result json into a dict and that into a data frame

 %% Cell type:code id:bcbc223f-8615-41f8-a2cc-409ec1999d32 tags:

 ``` python
 pdb_to_uniprot = {}
 for r in tqdm(results[1:]):
    pdb_id, uniprot_id = r.split('\t')
    pdb_to_uniprot[pdb_id] = uniprot_id
 ```

 %% Output

    100%|██████████| 144380/144380 [00:00<00:00, 948289.08it/s]

 %% Cell type:code id:4232b8fd-228c-4d9b-9945-86511bbec8b1 tags:

 ``` python
 pdb_to_uniprot = pd.DataFrame.from_dict(pdb_to_uniprot, orient='index')
 ```

 %% Cell type:code id:906c2884-13a5-44b6-948b-3dc52590c271 tags:

 ``` python
 pdb_to_uniprot.columns = ['uniprot']
 ```

 %% Cell type:code id:1d10a20d-b3b4-4179-9499-8238a1c62236 tags:

 ``` python
 pdb = pdb.join(pdb_to_uniprot, on='target')
 ```

 %% Cell type:code id:d180bb5b-ba34-441d-a0c1-f259609c978e tags:

 ``` python
-pdb.to_parquet('/g/arendt/npapadop/repos/coffe/data/pdb_tmp.parquet')
+pdb.to_parquet('../data/pdb_tmp.parquet')
 ```

--- a/analysis/read-write.ipynb
+++ b/analysis/read-write.ipynb
@@ -10,7 +10,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2022-07-15 10:36\n"
+      "2022-08-01 13:18\n"
     ]
    }
   ],
@@ -140,7 +140,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "223492it [00:00, 1285673.28it/s]\n"
+      "223492it [00:00, 688176.47it/s]\n"
     ]
    }
   ],
@@ -200,7 +200,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "100%|██████████| 41945/41945 [01:22<00:00, 509.76it/s] \n"
+      "100%|██████████| 41945/41945 [14:03<00:00, 49.70it/s] \n"
     ]
    }
   ],
@@ -249,196 +249,6 @@
  {
   "cell_type": "code",
   "execution_count": 8,
-   "id": "d57aa70c-9646-4dfb-80b8-2eb3791c32ba",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>query</th>\n",
-       "      <th>MSA size</th>\n",
-       "      <th>query length</th>\n",
-       "      <th>gene name</th>\n",
-       "      <th>protein_id</th>\n",
-       "      <th>gene_id</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>isoform</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>c100000_g1_i1</th>\n",
-       "      <td>0</td>\n",
-       "      <td>2765</td>\n",
-       "      <td>433</td>\n",
-       "      <td>c100000_g1_i1_m.41809</td>\n",
-       "      <td>41809</td>\n",
-       "      <td>c100000_g1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c103531_g2_i2</th>\n",
-       "      <td>10000</td>\n",
-       "      <td>2082</td>\n",
-       "      <td>152</td>\n",
-       "      <td>c103531_g2_i2_m.66483</td>\n",
-       "      <td>66483</td>\n",
-       "      <td>c103531_g2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c103531_g3_i1</th>\n",
-       "      <td>10001</td>\n",
-       "      <td>1924</td>\n",
-       "      <td>201</td>\n",
-       "      <td>c103531_g3_i1_m.66482</td>\n",
-       "      <td>66482</td>\n",
-       "      <td>c103531_g3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c103531_g4_i1</th>\n",
-       "      <td>10002</td>\n",
-       "      <td>1959</td>\n",
-       "      <td>215</td>\n",
-       "      <td>c103531_g4_i1_m.66484</td>\n",
-       "      <td>66484</td>\n",
-       "      <td>c103531_g4</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c103532_g1_i1</th>\n",
-       "      <td>10003</td>\n",
-       "      <td>203</td>\n",
-       "      <td>288</td>\n",
-       "      <td>c103532_g1_i1_m.66485</td>\n",
-       "      <td>66485</td>\n",
-       "      <td>c103532_g1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c103530_g1_i3</th>\n",
-       "      <td>9998</td>\n",
-       "      <td>9311</td>\n",
-       "      <td>992</td>\n",
-       "      <td>c103530_g1_i3_m.66470</td>\n",
-       "      <td>66470</td>\n",
-       "      <td>c103530_g1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c103531_g2_i1</th>\n",
-       "      <td>9999</td>\n",
-       "      <td>2103</td>\n",
-       "      <td>221</td>\n",
-       "      <td>c103531_g2_i1_m.66481</td>\n",
-       "      <td>66481</td>\n",
-       "      <td>c103531_g2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c100434_g1_i2</th>\n",
-       "      <td>999</td>\n",
-       "      <td>15780</td>\n",
-       "      <td>124</td>\n",
-       "      <td>c100434_g1_i2_m.44014</td>\n",
-       "      <td>44014</td>\n",
-       "      <td>c100434_g1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c100036_g1_i3</th>\n",
-       "      <td>99</td>\n",
-       "      <td>3</td>\n",
-       "      <td>290</td>\n",
-       "      <td>c100036_g1_i3_m.42037</td>\n",
-       "      <td>42037</td>\n",
-       "      <td>c100036_g1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>c100002_g1_i1</th>\n",
-       "      <td>9</td>\n",
-       "      <td>1</td>\n",
-       "      <td>112</td>\n",
-       "      <td>c100002_g1_i1_m.41844</td>\n",
-       "      <td>41844</td>\n",
-       "      <td>c100002_g1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>41945 rows × 6 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "               query  MSA size  query length              gene name  \\\n",
-       "isoform                                                               \n",
-       "c100000_g1_i1      0      2765           433  c100000_g1_i1_m.41809   \n",
-       "c103531_g2_i2  10000      2082           152  c103531_g2_i2_m.66483   \n",
-       "c103531_g3_i1  10001      1924           201  c103531_g3_i1_m.66482   \n",
-       "c103531_g4_i1  10002      1959           215  c103531_g4_i1_m.66484   \n",
-       "c103532_g1_i1  10003       203           288  c103532_g1_i1_m.66485   \n",
-       "...              ...       ...           ...                    ...   \n",
-       "c103530_g1_i3   9998      9311           992  c103530_g1_i3_m.66470   \n",
-       "c103531_g2_i1   9999      2103           221  c103531_g2_i1_m.66481   \n",
-       "c100434_g1_i2    999     15780           124  c100434_g1_i2_m.44014   \n",
-       "c100036_g1_i3     99         3           290  c100036_g1_i3_m.42037   \n",
-       "c100002_g1_i1      9         1           112  c100002_g1_i1_m.41844   \n",
-       "\n",
-       "              protein_id     gene_id  \n",
-       "isoform                               \n",
-       "c100000_g1_i1      41809  c100000_g1  \n",
-       "c103531_g2_i2      66483  c103531_g2  \n",
-       "c103531_g3_i1      66482  c103531_g3  \n",
-       "c103531_g4_i1      66484  c103531_g4  \n",
-       "c103532_g1_i1      66485  c103532_g1  \n",
-       "...                  ...         ...  \n",
-       "c103530_g1_i3      66470  c103530_g1  \n",
-       "c103531_g2_i1      66481  c103531_g2  \n",
-       "c100434_g1_i2      44014  c100434_g1  \n",
-       "c100036_g1_i3      42037  c100036_g1  \n",
-       "c100002_g1_i1      41844  c100002_g1  \n",
-       "\n",
-       "[41945 rows x 6 columns]"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sequence_info"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
   "id": "5ae5dfb2-84a6-420f-8ac2-2c79d4f95e16",
   "metadata": {},
   "outputs": [],
@@ -450,7 +260,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
   "id": "c61347d2-f602-4f76-a656-70528242a4be",
   "metadata": {
    "tags": []
@@ -470,17 +280,18 @@
   "source": [
    "## 2. AlphaFold-predicted structures\n",
    "\n",
-    "We used the MSAs to predict protein structures using ColabFold. Here, we read the per-residue pLDDT score from the best predicted structure per peptide and average it over all residues; then we merge _Spongilla_ isoforms by keeping the best score per gene ID."
+    "We used the MSAs to predict protein structures using ColabFold. Here, we read the per-residue pLDDT score from the best predicted structure per peptide and average it over all residues; then we merge _Spongilla_ isoforms by keeping the best score per gene ID. We will write this in `../data/structure_predictions.csv`."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
   "id": "3831f7ff-07b6-42ae-8276-ab899ab6ffa0",
   "metadata": {},
   "outputs": [],
   "source": [
-    "alphafold.to_csv(\"../data/structure_predictions.csv\")"
+    "# not executed\n",
+    "# !sbatch ../scripts/io_parse_structures.sh"
   ]
  },
  {
@@ -498,11 +309,13 @@
   "source": [
    "## 3. FoldSeek predictions\n",
    "\n",
-    "We searched against AlphaFoldDB (predicted), PDB (crystal), and SwissProt (predicted) protein structures to detect structural similarity to our predicted _Spongilla_ peptide structures using FoldSeek. Here we read the FoldSeek output. In order to obtain useful phylogenetic information as well as functional annotation, we will translate all structural hits to their UniProt IDs and then query the EggNOG database and UniProt for annotation.\n",
+    "We searched against AlphaFoldDB (Release 3, January 2021, containing predictions for SwissProt), and PDB (crystal) protein structures to detect structural similarity to our predicted _Spongilla_ peptide structures using FoldSeek. Here we read the FoldSeek output. In order to obtain useful phylogenetic information as well as functional annotation, we will translate all structural hits to their UniProt IDs and then query the EggNOG database and UniProt for annotation.\n",
    "\n",
    "### 3.1 Reading results\n",
    "\n",
-    "The FoldSeek results come in BLAST format. We will read them and convert the query (FoldSeek's internal unique ID for each input structure) from string to integer."
+    "The FoldSeek results come in BLAST format. We will read them and convert the query (FoldSeek's internal unique ID for each input structure) from string to integer.\n",
+    "\n",
+    "We'll save those in `parquet` format to make reading them much faster."
   ]
  },
  {
@@ -512,9 +325,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!sbatch ../scripts/io_read_pdb.sh\n",
-    "!sbatch ../scripts/io_read_afdb.sh\n",
-    "!sbatch ../scripts/io_read_swp.sh"
+    "# NOT EXECUTED\n",
+    "# !sbatch ../scripts/io_read_pdb.sh\n",
+    "# !sbatch ../scripts/io_read_afdb.sh\n",
+    "# !sbatch ../scripts/io_read_swp.sh"
   ]
  },
  {
@@ -522,7 +336,7 @@
   "id": "1c3f8ded-6be0-4749-ad85-da5d65be9be6",
   "metadata": {},
   "source": [
-    "should take about 3-5h to complete."
+    "this step should take about 3-5h to complete on a single core."
   ]
  },
  {
@@ -530,43 +344,19 @@
   "id": "c2343e57-f527-4634-87a9-cbd2bc8aee5f",
   "metadata": {},
   "source": [
-    "For PDB the situation is more complicated. We will take the PDB IDs and translate them to UniProt accession numbers using the UniProt API. This will return an inflated list of IDs since some PDB entries contain complexes."
+    "For PDB the situation is more complicated. We will take the PDB IDs and translate them to UniProt accession numbers using the UniProt API. This will return an inflated list of IDs since some PDB entries contain complexes. Unfortunately, there is no easy way around this. Refer to `io-process_pdb.ipynb`."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
-   "id": "5b403921-8514-431a-a34d-f92fbf034ab8",
+   "execution_count": 10,
+   "id": "09f34772-e59f-431f-8bdd-24923f7132d5",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "perl: warning: Setting locale failed.\n",
-      "perl: warning: Please check that your locale settings:\n",
-      "\tLANGUAGE = (unset),\n",
-      "\tLC_ALL = (unset),\n",
-      "\tLC_CTYPE = \"UTF-8\",\n",
-      "\tLC_TERMINAL = \"iTerm2\",\n",
-      "\tLANG = \"en_US.UTF-8\"\n",
-      "    are supported and installed on your system.\n",
-      "perl: warning: Falling back to a fallback locale (\"en_US.UTF-8\").\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "../data/uniprotinfo.fasta: 7206 sequences, 4638410 bp => dividing into 1 part of <= 100000 sequences\n",
-      "All done, 0 seconds elapsed\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "%%bash\n",
-    "mkdir ../data/uniprot_fastas/ -p\n",
-    "perl ../scripts/fasta-splitter.pl --part-size 100000 ../data/uniprotinfo.fasta --nopad --measure count --out-dir ../data/uniprot_fastas/"
+    "pdb_tmp = pd.read_parquet('../data/pdb_tmp.parquet')\n",
+    "swp_tmp = pd.read_parquet('../data/swp_tmp.parquet')\n",
+    "afdb_tmp = pd.read_parquet('../data/afdb_tmp.parquet')"
   ]
  },
  {
@@ -574,14 +364,14 @@
   "id": "cc1e0086-8c3f-4881-938e-0270a9f9e53e",
   "metadata": {},
   "source": [
-    "These were manually submitted to emapper, and the results (the `out.emapper.annotations` tsv file of each run) downloaded and saved with an `eggnog` ending.\n",
+    "We extract the UniProt IDs from the results of all databases and retrieve their sequences from UniProt, using the [EBI 'proteins' API](https://www.ebi.ac.uk/proteins/api/doc/). Refer to `io-get_uniprot_sequences.ipynb`.\n",
    "\n",
-    "Let's process the retrieved emapper results:"
+    "The resulting fasta files are (manually) submitted to EggNOG-mapper and downloaded, then renamed according to their respective input file. We will read and concatenate them here:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "id": "22630619-3006-425f-bace-980499b61afd",
   "metadata": {},
   "outputs": [],
@@ -602,7 +392,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "id": "65cd9d5e-fdac-4a14-94f7-787fdba690fa",
   "metadata": {},
   "outputs": [],
@@ -621,17 +411,8 @@
    "                  'Description', 'Preferred_name', 'GOs', 'PFAMs']\n",
    "uniprot_annotation[to_categorical] = uniprot_annotation[to_categorical].astype(\"category\")\n",
    "# finally save in parquet format\n",
-    "uniprot_annotation.to_parquet('../data/uniprot_annotation.parquet')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "500fd12f-5e1e-48ef-99c1-3db0424d7cb6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "uniprot_annotation = pd.read_parquet('../data/uniprot_annotation.parquet')"
+    "uniprot_annotation.to_parquet('../data/uniprot_annotation.parquet')\n",
+    "# uniprot_annotation = pd.read_parquet('../data/uniprot_annotation.parquet')"
   ]
  },
  {
@@ -651,7 +432,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --no-annotation"
+    "# not executed\n",
+    "# !python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --no-annotation"
   ]
  },
  {
@@ -664,7 +446,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
   "id": "87957304-19a6-451d-aa82-9ece85cc90de",
   "metadata": {},
   "outputs": [],
@@ -677,7 +459,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
   "id": "09465de2-c112-4de3-b487-6a27f8cf0504",
   "metadata": {},
   "outputs": [],
@@ -695,14 +477,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
   "id": "019667f8-47b2-4959-abe8-38e5529b4e86",
   "metadata": {},
   "outputs": [],
   "source": [
-    "pdb = pdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')\n",
-    "afdb = afdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')\n",
-    "swp = swp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')"
+    "pdb = pdb_tmp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')\n",
+    "afdb = afdb_tmp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')\n",
+    "swp = swp_tmp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')"
   ]
  },
  {
@@ -715,7 +497,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
   "id": "85f48253-2176-4b3b-ae78-e18dc8156d6f",
   "metadata": {},
   "outputs": [],
@@ -740,7 +522,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
   "id": "fe4b9101-4312-440b-a437-1dd701e02129",
   "metadata": {},
   "outputs": [],
@@ -769,7 +551,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
   "id": "9bbd5e11-77c9-4657-b0e8-d6d3e7055d3a",
   "metadata": {},
   "outputs": [],
@@ -792,7 +574,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "id": "542ca93a-9f73-45ca-8c76-48152dae9a58",
   "metadata": {
    "tags": []
@@ -808,7 +590,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
   "id": "7bb9638e-06d3-42a4-bb74-9220c0cfe886",
   "metadata": {},
   "outputs": [],
@@ -820,7 +602,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
   "id": "b52b7268-3247-4884-84cb-5fa3d452878a",
   "metadata": {},
   "outputs": [],
@@ -832,7 +614,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
   "id": "d85514be-1d56-4837-801d-a3bc24a8e889",
   "metadata": {},
   "outputs": [],
@@ -851,17 +633,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 75,
   "id": "c9e7e73d-1670-4cf4-b6b6-f4f9cebc928a",
   "metadata": {},
   "outputs": [],
   "source": [
-    "structural_annotation = sequence_info.join(best).join(alphafold)"
+    "structural_annotation = sequence_info.join(best)\n",
+    "structural_annotation = structural_annotation.set_index('isoform').join(alphafold.set_index('isoform'))\n",
+    "structural_annotation = structural_annotation.reset_index().set_index('query')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 76,
   "id": "21d1d18c-5de1-43e8-83f3-04d4bf07eaa2",
   "metadata": {},
   "outputs": [],
@@ -881,7 +665,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 61,
   "id": "c1bd80a1-f155-4330-8894-fcca687f48a5",
   "metadata": {},
   "outputs": [],
@@ -896,21 +680,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 62,
   "id": "6c7e43fe-2951-4104-b15b-129286601a4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "eggnog.to_csv('../data/Slacustris_eggnog.tsv', sep='\\t', index=False)"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f46b7ad3-99c8-487b-a3c1-29aee10b9775",
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {

 %% Cell type:code id:a5e450d0-6b47-405d-b6f4-fdbb3edfb641 tags:

 ``` python
 from datetime import datetime, timezone
 import pytz

 utc_dt = datetime.now(timezone.utc) # UTC time
 dt = utc_dt.astimezone()
 tz = pytz.timezone('Europe/Berlin')
 berlin_now = datetime.now(tz)

 print(f'{berlin_now:%Y-%m-%d %H:%M}')
 ```

 %% Output

-    2022-07-15 10:36
+    2022-08-01 13:18

 %% Cell type:markdown id:3d79e6d8-8b76-4d9b-8621-20543ce83f54 tags:

 # Expanding functional annotation of a non-model species using predicted protein structure similarity

 ***WARNING:*** The starting point of this analysis is a **predicted proteome** produced by the Trinity <i>de-novo</i> assembler. We produced multiple sequence alignments for each **isoform** with MMSeqs2 against the ColabFoldDB and UniRef30 (see the [ColabFold website](https://colabfold.mmseqs.com)); the MSAs were used as input for AlphaFold; the predicted protein structures were queried by FoldSeek against PDB, SwissProt, and AlphaFoldDB. The same sequences were queried by EggNOG. However, ultimately the analysis we conduct here is going to be on the **gene level**; we will therefore summarize results on the isoform level, either by concatenating them or by keeping the best-scoring isoform.

 This notebook gathers the preparatory steps for the analysis. This is the following:

 1. **Parse MSA files** and extract MSA size, query length, _Spongilla_ transcript name
 2. **Parse AlphaFold predictions**
    2a. extract average pLDDT score per prediction
    2b. keep best-scoring isoform per gene
 3. **Read and filter FoldSeek predictions** for each target database (PDB, SwissProt, AlphaFoldDB). Obtain annotation via EggNOG mapper, UniProt.
 4. **Read and filter sequence-based annotation** for the _Spongilla_ proteome.
 5. **Format structural-based annotation** Summarize in single table, similar to how sequence annotation would look like.




 We are going to use
 * `numpy` and `pandas` for I/O and data wrangling;
 * `tqdm` for progress bars;
 * `glob`, `os` for file navigation;
 * `json` to parse some AlphaFold output;
 * `urllib` to query the UniProt API;
 * `UPIMAPI`, a tool to programmatically access UniProt;
 * `fasta-splitter.pl`, a Perl script to parse FASTA files.

 %% Cell type:code id:1eae5786-023a-45ab-ac4a-ccb3a1db5760 tags:

 ``` python
 import json
 import glob
 import os

 import urllib.parse
 import urllib.request

 from tqdm import tqdm

 import numpy as np
 import pandas as pd
 ```

 %% Cell type:markdown id:ebbcdb49-737d-4ed9-b081-56056ecd6cf6 tags:

 ## 0. Setup

 We are going to gather the input here; we need

 * input MSAs for _Spongilla lacustris_ proteins (actually Transdecoder-predicted peptides). These were the input to AlphaFold.
 * `foldseek` search results for the _Spongilla_ predicted structures against three target databases: AlphaFoldDB, PDB, and SwissProt.
 * AlphaFold predicted structures; we obtain a list of structures (input to FoldSeek) as well as auxiliary files with e.g. prediction quality scores.
 * input peptides

 %% Cell type:code id:c58419bf-f9d3-40f6-ae23-a4bf978f2b35 tags:

 ``` python
 # MSA location
 msas = '../data/msa/'
 # FOLDSEEK scores
 fs_pdb = '../data/pdb_score.tsv'
 fs_afdb = '../data/afdb_score.tsv'
 fs_swp = '../data/swissprot_score.tsv'
 # AlphaFold predictions
 structure_list = '../data/named_models/'
 metadata = '../data/named_metadata/'
 # input fasta
 all_peptides = '../data/Spongilla_lacustris_translated_proteome.fasta'
 ```

 %% Cell type:markdown id:9a44be9c-3fdf-4c62-9e16-bea75e383d00 tags:

 ## 1. Sequence information


 ### 1.1 Input peptides

 The input for the sequence annotation pipeline was predicted peptides, as generated from the assembled genome. Read the peptides and keep track of which sequences start with a methionine and end with a star (i.e. complete transcripts assembled by Trinity/TransDecoder).

 Note that there will be multiple (predicted) isoforms per (predicted) gene. We produced one MSA per isoform.

 %% Cell type:code id:40fd2798-4dba-48f6-bbd1-fc5758c84835 tags:

 ``` python
 isoforms = []
 proteins = []
 sequences = []
 with open(all_peptides, "r") as fasta_file:
    current_protein = None
    current_id = ""
    for line in tqdm(fasta_file):
        if line.startswith(">"):
            current_id = line.strip()[1:]
            isoform, protein = current_id.split(".")
            isoforms.append(isoform[:-2])
            proteins.append(protein)
            if current_protein is not None:
                sequences.append(current_protein)
            current_protein = ""
        else:
            current_protein += line.strip()
    sequences.append(current_protein)
 ```

 %% Output

-    223492it [00:00, 1285673.28it/s]
+    223492it [00:00, 688176.47it/s]

 %% Cell type:code id:2e2fafde-8653-4286-aeb4-cecb55804e4f tags:

 ``` python
 fasta = pd.DataFrame({"isoform": isoforms, "protein": proteins, "seq": sequences})

 fasta["has_start"] = fasta["seq"].str.startswith("M")
 fasta["has_end"] = fasta["seq"].str[-1].str.startswith("*")
 fasta.set_index("isoform", inplace=True)
 fasta = fasta[["has_start", "has_end"]]
 ```

 %% Cell type:markdown id:9ef9eeba-0082-4109-ad9b-33df3db9b129 tags:

 ### 1.2 Multiple sequence alignments

 The peptides were aligned against UniRef30 and ColabFoldDB using MMSeqs2. Read the MSAs and extract the number of sequences in each as well as the sequence length and the _Spongilla_ transcript name.

 %% Cell type:code id:cc2f89f4-1b24-4b1d-b1ef-b1c40ea99ff4 tags:

 ``` python
 N = len(glob.glob(msas+"*.a3m"))
 seq_id = [""] * N
 no_seqs = [0] * N
 seq_length = [0] * N
 gene_name = [""] * N

 for i, alignment in enumerate(tqdm(glob.glob(msas+"*.a3m"))):
    try:
        with open(alignment, "r") as f:
            seq_id[i] = alignment.split("/")[-1].split(".")[0]
            lines = f.readlines()
            no_seqs[i] = (len(lines) - 2) // 2
            seq_length[i] = len(lines[1].rstrip()[1:])
            gene_name[i] = lines[0].rstrip()[1:]
            # print(no_seqs, seq_length, gene_name)
    except FileNotFoundError:
        continue
 ```

 %% Output

-    100%|██████████| 41945/41945 [01:22<00:00, 509.76it/s]
+    100%|██████████| 41945/41945 [14:03<00:00, 49.70it/s]

 %% Cell type:code id:64a7ca10-f99c-402e-a9bf-42826933ae9e tags:

 ``` python
 sequence_info = pd.DataFrame({"query": seq_id, "MSA size": no_seqs, "query length": seq_length, "gene name": gene_name})
 sequence_info["protein_id"] = sequence_info["gene name"].str.split(".").str[1]
 sequence_info["isoform"] = sequence_info["gene name"].str.split(".").str[0].str[:-2]
 sequence_info["gene_id"] = sequence_info["gene name"].str.split(".").str[0].str.split("_").str[:2].str.join("_")
 sequence_info.set_index("isoform", inplace=True)
 ```

 %% Cell type:markdown id:46c7c3ce-0f76-4728-a188-5c04247d1eef tags:

 include the start codon/end codon information from the raw sequences we parsed:

-%% Cell type:code id:d57aa70c-9646-4dfb-80b8-2eb3791c32ba tags:
-
-``` python
-sequence_info
-```
-
-%% Output
-
-                   query  MSA size  query length              gene name  \
-    isoform
-    c100000_g1_i1      0      2765           433  c100000_g1_i1_m.41809
-    c103531_g2_i2  10000      2082           152  c103531_g2_i2_m.66483
-    c103531_g3_i1  10001      1924           201  c103531_g3_i1_m.66482
-    c103531_g4_i1  10002      1959           215  c103531_g4_i1_m.66484
-    c103532_g1_i1  10003       203           288  c103532_g1_i1_m.66485
-    ...              ...       ...           ...                    ...
-    c103530_g1_i3   9998      9311           992  c103530_g1_i3_m.66470
-    c103531_g2_i1   9999      2103           221  c103531_g2_i1_m.66481
-    c100434_g1_i2    999     15780           124  c100434_g1_i2_m.44014
-    c100036_g1_i3     99         3           290  c100036_g1_i3_m.42037
-    c100002_g1_i1      9         1           112  c100002_g1_i1_m.41844
-    
-                  protein_id     gene_id
-    isoform
-    c100000_g1_i1      41809  c100000_g1
-    c103531_g2_i2      66483  c103531_g2
-    c103531_g3_i1      66482  c103531_g3
-    c103531_g4_i1      66484  c103531_g4
-    c103532_g1_i1      66485  c103532_g1
-    ...                  ...         ...
-    c103530_g1_i3      66470  c103530_g1
-    c103531_g2_i1      66481  c103531_g2
-    c100434_g1_i2      44014  c100434_g1
-    c100036_g1_i3      42037  c100036_g1
-    c100002_g1_i1      41844  c100002_g1
-    
-    [41945 rows x 6 columns]
-
 %% Cell type:code id:5ae5dfb2-84a6-420f-8ac2-2c79d4f95e16 tags:

 ``` python
 sequence_info = sequence_info.join(fasta)
 # this column will hold NaNs later so convert it to Int64, which can deal with nulls.
 sequence_info["protein_id"] = sequence_info["protein_id"].astype(int).astype("Int64")
 ```

 %% Cell type:code id:c61347d2-f602-4f76-a656-70528242a4be tags:

 ``` python
 sequence_info.to_csv("../data/sequence_info.csv")
 # sequence_info = pd.read_csv("../data/sequence_info.csv")
 ```

 %% Cell type:markdown id:1b716527-0af8-4945-b70f-a11302c11685 tags:

 ## 2. AlphaFold-predicted structures

-We used the MSAs to predict protein structures using ColabFold. Here, we read the per-residue pLDDT score from the best predicted structure per peptide and average it over all residues; then we merge _Spongilla_ isoforms by keeping the best score per gene ID.
+We used the MSAs to predict protein structures using ColabFold. Here, we read the per-residue pLDDT score from the best predicted structure per peptide and average it over all residues; then we merge _Spongilla_ isoforms by keeping the best score per gene ID. We will write this in `../data/structure_predictions.csv`.

 %% Cell type:code id:3831f7ff-07b6-42ae-8276-ab899ab6ffa0 tags:

 ``` python
-alphafold.to_csv("../data/structure_predictions.csv")
+# not executed
+# !sbatch ../scripts/io_parse_structures.sh
 ```

 %% Cell type:markdown id:86732d06-f95e-4583-9997-86bb9c2e0a52 tags:

 Should take about 30-45mins to complete

 %% Cell type:markdown id:9c6410a0-999a-41c5-af8c-3c30abbb1654 tags:

 ## 3. FoldSeek predictions

-We searched against AlphaFoldDB (predicted), PDB (crystal), and SwissProt (predicted) protein structures to detect structural similarity to our predicted _Spongilla_ peptide structures using FoldSeek. Here we read the FoldSeek output. In order to obtain useful phylogenetic information as well as functional annotation, we will translate all structural hits to their UniProt IDs and then query the EggNOG database and UniProt for annotation.
+We searched against AlphaFoldDB (Release 3, January 2021, containing predictions for SwissProt), and PDB (crystal) protein structures to detect structural similarity to our predicted _Spongilla_ peptide structures using FoldSeek. Here we read the FoldSeek output. In order to obtain useful phylogenetic information as well as functional annotation, we will translate all structural hits to their UniProt IDs and then query the EggNOG database and UniProt for annotation.

 ### 3.1 Reading results

 The FoldSeek results come in BLAST format. We will read them and convert the query (FoldSeek's internal unique ID for each input structure) from string to integer.

+We'll save those in `parquet` format to make reading them much faster.
+
 %% Cell type:code id:f79aa33d-50b9-4cf0-876a-001a63db761a tags:

 ``` python
-!sbatch ../scripts/io_read_pdb.sh
-!sbatch ../scripts/io_read_afdb.sh
-!sbatch ../scripts/io_read_swp.sh
+# NOT EXECUTED
+# !sbatch ../scripts/io_read_pdb.sh
+# !sbatch ../scripts/io_read_afdb.sh
+# !sbatch ../scripts/io_read_swp.sh
 ```

 %% Cell type:markdown id:1c3f8ded-6be0-4749-ad85-da5d65be9be6 tags:

-should take about 3-5h to complete.
+this step should take about 3-5h to complete on a single core.

 %% Cell type:markdown id:c2343e57-f527-4634-87a9-cbd2bc8aee5f tags:

-For PDB the situation is more complicated. We will take the PDB IDs and translate them to UniProt accession numbers using the UniProt API. This will return an inflated list of IDs since some PDB entries contain complexes.
+For PDB the situation is more complicated. We will take the PDB IDs and translate them to UniProt accession numbers using the UniProt API. This will return an inflated list of IDs since some PDB entries contain complexes. Unfortunately, there is no easy way around this. Refer to `io-process_pdb.ipynb`.

-%% Cell type:code id:5b403921-8514-431a-a34d-f92fbf034ab8 tags:
+%% Cell type:code id:09f34772-e59f-431f-8bdd-24923f7132d5 tags:

 ``` python
-%%bash
-mkdir ../data/uniprot_fastas/ -p
-perl ../scripts/fasta-splitter.pl --part-size 100000 ../data/uniprotinfo.fasta --nopad --measure count --out-dir ../data/uniprot_fastas/
+pdb_tmp = pd.read_parquet('../data/pdb_tmp.parquet')
+swp_tmp = pd.read_parquet('../data/swp_tmp.parquet')
+afdb_tmp = pd.read_parquet('../data/afdb_tmp.parquet')
 ```

-%% Output
-
-    perl: warning: Setting locale failed.
-    perl: warning: Please check that your locale settings:
-    	LANGUAGE = (unset),
-    	LC_ALL = (unset),
-    	LC_CTYPE = "UTF-8",
-    	LC_TERMINAL = "iTerm2",
-    	LANG = "en_US.UTF-8"
-        are supported and installed on your system.
-    perl: warning: Falling back to a fallback locale ("en_US.UTF-8").
-
-    ../data/uniprotinfo.fasta: 7206 sequences, 4638410 bp => dividing into 1 part of <= 100000 sequences
-    All done, 0 seconds elapsed
-
 %% Cell type:markdown id:cc1e0086-8c3f-4881-938e-0270a9f9e53e tags:

-These were manually submitted to emapper, and the results (the `out.emapper.annotations` tsv file of each run) downloaded and saved with an `eggnog` ending.
+We extract the UniProt IDs from the results of all databases and retrieve their sequences from UniProt, using the [EBI 'proteins' API](https://www.ebi.ac.uk/proteins/api/doc/). Refer to `io-get_uniprot_sequences.ipynb`.

-Let's process the retrieved emapper results:
+The resulting fasta files are (manually) submitted to EggNOG-mapper and downloaded, then renamed according to their respective input file. We will read and concatenate them here:

 %% Cell type:code id:22630619-3006-425f-bace-980499b61afd tags:

 ``` python
 uniprot_eggnog = []
 for file in glob.glob('../data/uniprot_fastas/*.eggnog'):
    tmp = pd.read_csv(file, sep='\t', skiprows=4, skipfooter=3, engine='python')
    uniprot_eggnog.append(tmp)
 ```

 %% Cell type:markdown id:5b7287bb-7467-436b-8e9b-ff59e2d721b7 tags:

 We can concatenate the files, remove unnecessary columns and convert string columns to categorical to save even more space, before saving as a parquet file.

 %% Cell type:code id:65cd9d5e-fdac-4a14-94f7-787fdba690fa tags:

 ``` python
 uniprot_annotation = pd.concat(uniprot_eggnog)

 uniprot_annotation['uniprot'] = uniprot_annotation['#query'].str.split('|').str[1]
 uniprot_annotation.reset_index(drop=True, inplace=True)
 # remove unnecessary columns
 dead_weight = ['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',
               'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE',
               'KEGG_TC', 'CAZy', 'BiGG_Reaction']
 uniprot_annotation.drop(dead_weight, axis=1, inplace=True)
 # convert rest to categorical to save space
 to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category',
                  'Description', 'Preferred_name', 'GOs', 'PFAMs']
 uniprot_annotation[to_categorical] = uniprot_annotation[to_categorical].astype("category")
 # finally save in parquet format
 uniprot_annotation.to_parquet('../data/uniprot_annotation.parquet')
-```
-
-%% Cell type:code id:500fd12f-5e1e-48ef-99c1-3db0424d7cb6 tags:
-
-``` python
-uniprot_annotation = pd.read_parquet('../data/uniprot_annotation.parquet')
+# uniprot_annotation = pd.read_parquet('../data/uniprot_annotation.parquet')
 ```

 %% Cell type:markdown id:8a2910c3-dbd5-4268-8eed-eef8c9796ca5 tags:

 ### 3.4 Annotating UniProt sequences - UniProt

 We will also query UniProt to get useful gene names, protein names, functional annotation, and taxonomic information. This info is also included in Emapper, but here we get it in a more accessible form.

 %% Cell type:code id:c7d97917-53c6-4b51-98a5-97a55100aa37 tags:

 ``` python
-!python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --no-annotation
+# not executed
+# !python /g/arendt/npapadop/repos/UPIMAPI/upimapi.py -i ../data/foldseek_unique_ids.txt -o ../data/ --no-annotation
 ```

 %% Cell type:markdown id:abcfc14c-c236-47c1-ad7a-1bba7244e46b tags:

 The `csv` file really is huge though so we'll read it and convert to parquet format:

 %% Cell type:code id:87957304-19a6-451d-aa82-9ece85cc90de tags:

 ``` python
 foldseek_full = pd.read_csv('../data/uniprotinfo.tsv', sep='\t')
 foldseek_full = foldseek_full[['Entry', 'Entry name', 'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']]
 foldseek_full.columns = ['uniprot', 'Entry name', 'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']
 foldseek_full.to_parquet('../data/fs_targets.parquet')
 ```

 %% Cell type:code id:09465de2-c112-4de3-b487-6a27f8cf0504 tags:

 ``` python
 foldseek_full = pd.read_parquet('../data/fs_targets.parquet')
 ```

 %% Cell type:markdown id:86ebcf92-6b4f-4b4c-89dc-f3efc0dfb6b0 tags:

 Add the UniProt and EggNOG annotation to the FoldSeek results:

 %% Cell type:code id:019667f8-47b2-4959-abe8-38e5529b4e86 tags:

 ``` python
-pdb = pdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
-afdb = afdb.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
-swp = swp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
+pdb = pdb_tmp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
+afdb = afdb_tmp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
+swp = swp_tmp.merge(foldseek_full, on='uniprot', how='left').merge(uniprot_annotation, on='uniprot', how='left')
 ```

 %% Cell type:markdown id:bf6217eb-a982-425e-b68f-2ec57972e864 tags:

 Convert non-numeric columns to categorical; many annotations will be repeated (explore by using `.value_counts()` on any of these columns), therefore we will gain a lot of space by converting away from `object`.

 %% Cell type:code id:85f48253-2176-4b3b-ae78-e18dc8156d6f tags:

 ``` python
 to_categorical = ['uniprot', 'eggNOG_OGs', 'max_annot_lvl', 'COG_category', 'target',
                  'Description', 'Preferred_name', 'GOs', 'PFAMs', 'Entry name',
                  'Gene names', 'Function [CC]', 'Taxonomic lineage (PHYLUM)']
 pdb[to_categorical] = pdb[to_categorical].astype("category")
 afdb[to_categorical] = afdb[to_categorical].astype("category")
 swp[to_categorical] = swp[to_categorical].astype("category")
 ```

 %% Cell type:markdown id:6769b2e4-6f33-4c88-9519-3426690c5339 tags:

 ## 4. Structural annotation summary

 The structural annotation pipeline can be summarised by keeping the best-scoring annotation for each _Spongilla_ peptide. We will do this by first dropping unannotated entries from the target DB tables, and then keeping the best bit score per query peptide. Finally, we will concatenate the target DB tables and keep the best-scoring annotation for each query peptide. These tables are too big and unwieldy to save anyway.

 %% Cell type:code id:fe4b9101-4312-440b-a437-1dd701e02129 tags:

 ``` python
 def remove_unannotated(df):
    no_desc = df['Description'] == "-"
    no_func = df['Function [CC]'].isnull()
    no_annot = no_desc & no_func
    return df[~no_annot]

 def best_bit_score(df, sort_by='bit score', tiebreak='alignment length'):
    have_max = df[sort_by] == np.max(df[sort_by])
    max_ali = df[have_max][tiebreak] == np.max(df[have_max][tiebreak])
    return df[have_max][max_ali].index.values[0]

 def keep_best_annotated(df):
    df = remove_unannotated(df)
    idx = df.groupby('query').apply(best_bit_score)
    res = df.loc[idx].copy()
    return res

 def tag_origin(df, prefix):
    df.columns = prefix + " " + df.columns
    return df
 ```

 %% Cell type:code id:9bbd5e11-77c9-4657-b0e8-d6d3e7055d3a tags:

 ``` python
 pdb_best = keep_best_annotated(pdb)
 pdb_best['origin'] = 'PDB'
 afdb_best = keep_best_annotated(afdb)
 afdb_best['origin'] = "AFDB"
 swp_best = keep_best_annotated(swp)
 swp_best['origin'] = 'SwissProt'
 ```

 %% Cell type:markdown id:5c879807-6e82-405c-9bb5-33bcff5bdae6 tags:

 Prune matrices to make them smaller:

 %% Cell type:code id:542ca93a-9f73-45ca-8c76-48152dae9a58 tags:

 ``` python
 dead_weight = ['no. mismatches', 'no. gap open', 'query start', 'query end',
               'target start', 'target end', 'target']
 pdb_best.drop(dead_weight, axis=1, inplace=True)
 afdb_best.drop(dead_weight, axis=1, inplace=True)
 swp_best.drop(dead_weight, axis=1, inplace=True)
 ```

 %% Cell type:code id:7bb9638e-06d3-42a4-bb74-9220c0cfe886 tags:

 ``` python
 pdb_best.to_parquet('../data/fs_best_pdb.parquet')
 afdb_best.to_parquet('../data/fs_best_afdb.parquet')
 swp_best.to_parquet('../data/fs_best_swp.parquet')
 ```

 %% Cell type:code id:b52b7268-3247-4884-84cb-5fa3d452878a tags:

 ``` python
 best = pd.concat([pdb_best, afdb_best, swp_best])
 best = best.sort_values('corrected bit score', ascending=False).drop_duplicates(['query'])
 best.set_index('query', inplace=True)
 ```

 %% Cell type:code id:d85514be-1d56-4837-801d-a3bc24a8e889 tags:

 ``` python
 sequence_info = pd.read_csv("../data/sequence_info.csv")
 alphafold = pd.read_csv("../data/structure_predictions.csv", index_col=0)
 ```

 %% Cell type:markdown id:454754cc-d44b-4f5d-bc0b-5c29a2ef4e44 tags:

 Add the sequence information and the AlphaFold plddt score to complete the table, then save in parquet format.

 %% Cell type:code id:c9e7e73d-1670-4cf4-b6b6-f4f9cebc928a tags:

 ``` python
-structural_annotation = sequence_info.join(best).join(alphafold)
+structural_annotation = sequence_info.join(best)
+structural_annotation = structural_annotation.set_index('isoform').join(alphafold.set_index('isoform'))
+structural_annotation = structural_annotation.reset_index().set_index('query')
 ```

 %% Cell type:code id:21d1d18c-5de1-43e8-83f3-04d4bf07eaa2 tags:

 ``` python
 structural_annotation.to_parquet('../data/structure_annotation.parquet')
 ```

 %% Cell type:markdown id:2a395280-bf0f-446d-9f48-2d870d022586 tags:

 ## 5. Sequence-based annotation

 We used EggNOG mapper to annotate the _Spongilla_ predicted proteome. The resulting `out.emapper.annotations` tsv was downloaded from the website and is processed here to a more usable state.

 %% Cell type:code id:c1bd80a1-f155-4330-8894-fcca687f48a5 tags:

 ``` python
 eggnog = pd.read_csv('../data/MM_ipw26ffh.emapper.annotations.tsv', sep='\t', skiprows=4, skipfooter=3, engine='python')
 eggnog["gene_id"] = eggnog["#query"].str.split("_").str[:2].str.join("_")
 eggnog["protein_id"] = eggnog["#query"].str.split(".").str[1]
 eggnog.drop(['#query', 'seed_ortholog', 'EC', 'KEGG_ko', 'KEGG_Pathway',
             'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass', 'BRITE', 'KEGG_TC',
             'CAZy', 'BiGG_Reaction'], axis=1, inplace=True)
 ```

 %% Cell type:code id:6c7e43fe-2951-4104-b15b-129286601a4d tags:

 ``` python
 eggnog.to_csv('../data/Slacustris_eggnog.tsv', sep='\t', index=False)
 ```
-
-%% Cell type:code id:f46b7ad3-99c8-487b-a3c1-29aee10b9775 tags:
-
-``` python
-```

--- a/analysis/suppl-annotation_categories.ipynb
+++ b/analysis/suppl-annotation_categories.ipynb
@@ -1426,7 +1426,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.9.6"
  }
 },
 "nbformat": 4,

--- a/analysis/suppl-struct_seq_agreement.ipynb
+++ b/analysis/suppl-struct_seq_agreement.ipynb
@@ -10,7 +10,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2022-06-10 13:42\n"
+      "2022-08-01 14:41\n"
     ]
    }
   ],

 %% Cell type:code id:f741d1a4-672b-405c-a5d0-3c1c1ba430cf tags:

 ``` python
 from datetime import datetime, timezone
 import pytz

 utc_dt = datetime.now(timezone.utc) # UTC time
 dt = utc_dt.astimezone()
 tz = pytz.timezone('Europe/Berlin')
 berlin_now = datetime.now(tz)
 print(f'{berlin_now:%Y-%m-%d %H:%M}')
 ```

 %% Output

-    2022-06-10 13:42
+    2022-08-01 14:41

 %% Cell type:markdown id:00e53e1b-0a8a-4f71-9bde-d11ce1e21019 tags:

 # Agreement between sequence and structure performance

 We will compare the annotation produced by AlphaFold+FoldSeek against that produced by EggNOG-mapper. If our idea worked well, we should see that in the overwhelming majority of cases the structural pipeline identifies the same annotation for each _Spongilla_ protein.

 %% Cell type:code id:e5ab0788-bb61-42d7-8674-5f68f20c61f1 tags:

 ``` python
 import glob
 from os.path import exists

 import pandas as pd
 import numpy as np

 import matplotlib.pyplot as plt
 import seaborn as sns
 ```

 %% Cell type:markdown id:ac018feb-1310-48b3-8676-2f8b17386461 tags:

 Read the two tables:

 %% Cell type:code id:f0d82db4-58f6-4cbf-8c72-c7382c1f5787 tags:

 ``` python
 structural_annotation = pd.read_parquet('../data/structure_annotation.parquet')
 sequence_annotation = pd.read_csv('../data/Slacustris_eggnog.tsv', sep='\t')
 ```

 %% Cell type:markdown id:03e19839-65d0-4cd8-95fe-03732ba98b85 tags:

 Identify the bit score cut off and threshold the structure annotation:

 %% Cell type:code id:c2807a32-30cd-48f3-97c3-7828b35a6906 tags:

 ``` python
 keep = np.intersect1d(sequence_annotation['protein_id'], structural_annotation['protein_id'])
 # bitscore_cut_off = structural_annotation.set_index('protein_id').loc[keep]['bit score'].min()
 bitscore_cut_off = np.exp(5)

 keep = structural_annotation['bit score'] >= bitscore_cut_off
 structural_annotation = structural_annotation[keep].copy()
 structural_annotation.shape, sequence_annotation.shape
 ```

 %% Output

    ((25232, 30), (17990, 11))

 %% Cell type:markdown id:aeb40739-ad83-4b30-9ee6-e11304154d1c tags:

 ## EggNOG most specific OrthoGroup level

 this is the ideal scenario: for how many cases do the two annotation pipelines put a gene in the same group of orthologous genes? Extract the EggNOG OG information and compare. Look both directions; if the most specific sequence orthogroup is in the structure OGs or if the most specific structure OG is in the sequence OGs.

 %% Cell type:code id:697a91f7-a4d6-45d0-be0e-ffecafd76144 tags:

 ``` python
 orthogroups_prot = sequence_annotation[['protein_id', 'eggNOG_OGs']].merge(structural_annotation[['protein_id', 'eggNOG_OGs']],
                                                                           on='protein_id', suffixes=['_seq', '_struct'])
 ```

 %% Cell type:code id:a6df186f-96e1-482c-80db-e12c42b2a133 tags:

 ``` python
 orthogroups_prot.shape
 ```

 %% Output

    (16589, 3)

 %% Cell type:markdown id:13e367c7-a87e-47a8-8b9c-9b432dd39d9a tags:

 We lose ~1400 peptides - our cut-off might have been too strict. We'd like to be sure about our conclusions though, so let's keep going like this. Isolate the most specific orthogroup for each peptide.

 %% Cell type:code id:aa81c544-a9fd-4567-8f0e-15f69b769dbc tags:

 ``` python
 def keep_last(row):
    """
    A function to isolate the root EggNOG orthogroup.

    Expects a comma-separated string where the root orthogroup
    contains the word 'root'.
    """
    x = np.array(row.split(','))
    return x[-1]
 ```

 %% Cell type:code id:d42fc12d-3930-4b18-854f-fa3ebbfb14aa tags:

 ``` python
 orthogroups_prot['most_specific_struct'] = orthogroups_prot['eggNOG_OGs_struct'].apply(keep_last)
 orthogroups_prot['most_specific_seq'] = orthogroups_prot['eggNOG_OGs_seq'].apply(keep_last)
 ```

 %% Cell type:markdown id:3a69da1c-4a38-484c-a588-4c05b4bab311 tags:

 Compare the root orthogroup lists of each gene between tables. Hitting at least one is considered a success.

 %% Cell type:code id:00273b01-9177-44ac-9bb7-033357c27ebe tags:

 ``` python
 def contains_OG(row):
    seq_contains_struct = row['most_specific_struct'] in row['eggNOG_OGs_seq']
    struct_contains_seq = row['most_specific_seq'] in row['eggNOG_OGs_struct']
    return seq_contains_struct | struct_contains_seq
 ```

 %% Cell type:code id:57aaa3cb-00a1-40a9-ac2b-c44906520e43 tags:

 ``` python
 orthogroups_prot['most_specific_OG'] = orthogroups_prot.apply(contains_OG, axis=1)
 ```

 %% Cell type:code id:42ddcbfe-ba25-47d0-b0b6-681a5cc98c1d tags:

 ``` python
 specific_overlap = np.sum(orthogroups_prot['most_specific_OG'])
 total = len(orthogroups_prot)
 print(f'A total of {specific_overlap} out of {total} annotated peptides overlap \
 in the most specific OG ({specific_overlap / total * 100:.2f}%).')
 ```

 %% Output

    A total of 9402 out of 16589 annotated peptides overlap in the most specific OG (56.68%).

 %% Cell type:markdown id:69678cfd-807b-4fdd-8efa-8c66a6597c61 tags:

 ## EggNOG root OrthoGroup level

 this is the most pertinent level: for how many cases do the two annotation pipelines put a gene in the same group of orthologous genes? Extract the EggNOG OG information and compare. To facilitate the comparison we will only look at the root level, but we could extend this as much as we want (e.g. pick the most specific orthogroup for structure and ask if it is found in the sequence annotation).

 %% Cell type:markdown id:1a771c03-4489-41ae-8b21-7c8a54b50635 tags:

 Isolate the root orthogroup for each peptide. If multiple root orthogroups are present, keep all of them.

 %% Cell type:code id:905aba0d-1979-4ac1-905f-801dd60eb9d6 tags:

 ``` python
 def keep_root(row):
    """
    A function to isolate the root EggNOG orthogroup.

    Expects a comma-separated string where the root orthogroup
    contains the word 'root'.
    """
    x = np.array(row.split(','))
    keep = np.zeros(len(x), dtype=bool)
    for i, og in enumerate(x):
        keep[i] = 'root' in og
 #     print(x, keep)
    return x[keep]
 ```

 %% Cell type:code id:16cc347b-6b0b-4b2f-9935-bd726fd990d1 tags:

 ``` python
 orthogroups_prot['eggNOG_OGs_struct'] = orthogroups_prot['eggNOG_OGs_struct'].apply(keep_root)
 orthogroups_prot['eggNOG_OGs_seq'] = orthogroups_prot['eggNOG_OGs_seq'].apply(keep_root)
 ```

 %% Cell type:markdown id:dca27bca-0128-40a9-a50c-f6149851dc2e tags:

 Compare the root orthogroup lists of each gene between tables. Hitting at least one is considered a success.

 %% Cell type:code id:2a548a58-9b35-4eee-b9de-d3e5a396d84e tags:

 ``` python
 def is_contained_in(row, x='eggNOG_OGs_struct', y='eggNOG_OGs_seq'):
    overlap = np.intersect1d(row[x], row[y])
    return len(overlap) > 0
 ```

 %% Cell type:code id:2db97083-d626-48d2-9cf6-c958c2cbcb33 tags:

 ``` python
 orthogroups_prot['root_OG_agree'] = orthogroups_prot.apply(is_contained_in, axis=1)
 ```

 %% Cell type:code id:008582f8-603c-4c37-b614-b9064da558fb tags:

 ``` python
 og_overlap = np.sum(orthogroups_prot['root_OG_agree'])
 total = len(orthogroups_prot)
 print(f'A total of {og_overlap} out of {total} annotated peptides have \
 the same structure and sequence root ortholog ({og_overlap / total * 100:.2f}%).')
 ```

 %% Output

    A total of 15024 out of 16589 annotated peptides have the same structure and sequence root ortholog (90.57%).

 %% Cell type:markdown id:376c4f8a-cfb3-4808-8ae6-39dab184c5ba tags:

 ## PFAM domain overlap

 Sometimes EggNOG classifies very similar proteins in different orthogroups. A way to rescue some proteins from this is to compare the distribution of PFAM domains.

 Extract the peptides with disagreeing OGs:

 %% Cell type:code id:b5205f80-f2a9-4c63-8bae-34cbfb805bc7 tags:

 ``` python
 unannotated_proteins = orthogroups_prot['protein_id'][~orthogroups_prot['root_OG_agree']]
 seq_unannanotated = sequence_annotation.set_index('protein_id').loc[unannotated_proteins]
 struct_unannanotated = structural_annotation.set_index('protein_id').loc[unannotated_proteins]
 ```

 %% Cell type:markdown id:9dd9817f-5297-41dc-b2ca-c8632e7d268f tags:

 Merge the unannotated peptides into one table and again convert the comma-separated PFAM domains into lists of domains:

 %% Cell type:code id:778f6962-8c4f-49c2-9a0a-c9e92934ffd4 tags:

 ``` python
 keep = ['Preferred_name', 'Description', 'PFAMs']
 unannotated = seq_unannanotated[keep].join(struct_unannanotated[keep], lsuffix='_seq', rsuffix='_struct')
 unannotated = unannotated.replace('-', np.NaN)

 unannotated['PFAMs_struct'] = unannotated['PFAMs_struct'].str.split(',')
 unannotated['PFAMs_seq'] = unannotated['PFAMs_seq'].str.split(',')
 ```

 %% Cell type:code id:38f6a92d-37c1-49e5-95d0-3aea536f400a tags:

 ``` python
 def term_overlap(row, x='PFAMs_struct', y='PFAMs_seq'):
    """
    Count how many strings overlap between two list-shaped DataFrame
    columns. If one of the columns does not contain a list the overlap
    is considered to be zero.
    """
    if type(row[x]) is not list or type(row[y]) is not list:
        return 0
    overlap = np.intersect1d(row[x], row[y])
    return len(overlap)
 ```

 %% Cell type:markdown id:5402bc2c-7819-4097-85b6-0dc9db0e2024 tags:

 Find overlap and convert it to percentage of sequence domains present in the structure annotation:

 %% Cell type:code id:097e4fa3-a732-4af9-bbe7-0cb7e1126a12 tags:

 ``` python
 length = unannotated['PFAMs_seq'].apply(lambda x: 1 if type(x) is not list else len(x))
 unannotated['%overlap'] = unannotated.apply(lambda row: term_overlap(row), axis=1) / length
 ```

 %% Cell type:code id:aafea02a-90b7-4eca-9429-094c9c100d6f tags:

 ``` python
 fig, ax = plt.subplots()
 ax.hist(unannotated['%overlap'], bins=50);
 ```

 %% Output



 %% Cell type:markdown id:cf411228-dee4-46d7-8f71-2825e3e61061 tags:

 A nicely bimodal distribution. We will cut off at 50%, since this will catch proteins with two domains where one overlaps.

 %% Cell type:code id:45e95b35-4949-4637-a7ef-651e94a613dd tags:

 ``` python
 pfam_overlap = np.sum(unannotated['%overlap'] >= 0.5)
 total = len(unannotated)
 print(f'A total of {pfam_overlap} out of {total} peptides with EggNOG orthogroup disagreement share \
 at least 50% of their PFAM domains ({pfam_overlap / total * 100:.2f}%).')
 ```

 %% Output

    A total of 845 out of 1565 peptides with EggNOG orthogroup disagreement share at least 50% of their PFAM domains (53.99%).

 %% Cell type:markdown id:2c7c6180-df0b-4e4f-b693-399c24ecc29e tags:

 Let's make a table to hold that information:

 Since we're doing this we might as well check how many get the exact same preferred name, as another level of validation. This is going to be very inexact, since many proteins have different names, but hopefully since CoFFE also went through EggNOG this works well enough:

 %% Cell type:code id:65c6bc29-0b60-4f6a-ae82-618ac4dcd681 tags:

 ``` python
 names = sequence_annotation[['protein_id', 'Preferred_name']].merge(structural_annotation[['protein_id', 'Preferred_name']],
                                                                    on='protein_id', suffixes=['_seq', '_struct'])
 ```

 %% Cell type:code id:78f3646e-6131-4104-abbf-f6bd1b90cc4c tags:

 ``` python
 named_same = names['Preferred_name_seq'].str.lower() == names['Preferred_name_struct'].str.lower()
 named = orthogroups_prot['protein_id'][named_same]
 ```

 %% Cell type:code id:7aa99784-f021-4d11-a4b9-55f14f0cdf5a tags:

 ``` python
 def split_by_membership(query, target):
    query_in_target = np.intersect1d(query, target)
    query_not_target = np.setdiff1d(query, target)
    return query_in_target, query_not_target
 ```

 %% Cell type:code id:ea46bb8a-5abe-4e60-97c1-e756ee2e9288 tags:

 ``` python
 keep = orthogroups_prot['most_specific_OG']
 specific_og = orthogroups_prot[keep]['protein_id'].values
 same_name_spec, diff_name_spec = split_by_membership(specific_og, named)

 keep = orthogroups_prot['root_OG_agree'] & ~orthogroups_prot['most_specific_OG']
 root_og = orthogroups_prot[keep]['protein_id'].values
 same_name_root, diff_name_root = split_by_membership(root_og, named)

 keep = unannotated['%overlap'] >= 0.5
 by_pfam = unannotated[keep].index.values
 same_name_pfam, diff_name_pfam = split_by_membership(by_pfam, named)
 ```

 %% Cell type:code id:b4101553-1c97-483d-ba12-fbdca61001d6 tags:

 ``` python
 len(same_name_spec), len(same_name_root), len(same_name_pfam)
 ```

 %% Output

    (7059, 2958, 290)

 %% Cell type:code id:09cff5b0-2bb5-43f1-8f9a-bcc45f114c4d tags:

 ``` python
 (len(same_name_spec) + len(same_name_root) + len(same_name_pfam)) / np.sum(named_same)
 ```

 %% Output

    0.9775227617602428

 %% Cell type:markdown id:cb6793bb-93f9-4eaa-a3d7-ce0b2fb3d866 tags:

 wow

 %% Cell type:code id:5ed5a103-075f-4b2e-a8ab-58eea34a0303 tags:

 ``` python
 annotated = np.concatenate((same_name_spec, diff_name_spec,
                            same_name_root, diff_name_root,
                            same_name_pfam, diff_name_pfam))
 not_annotated = np.setdiff1d(orthogroups_prot['protein_id'], annotated)
 prt_id = np.concatenate((annotated, not_annotated))
 source = ['Specific OG\nsame name'] * len(same_name_spec) + \
        ['Specific OG'] * len(diff_name_spec) + \
        ['Root OG\nsame name'] * len(same_name_root) + \
        ['Root OG'] * len(diff_name_root) + \
        ['50% PFAM\nsame name'] * len(same_name_pfam) + \
        ['50% PFAM'] * len(diff_name_pfam) + \
        ['no agreement'] * len(not_annotated)

 annotation_status = pd.DataFrame({'protein_id': prt_id, 'annotation status': source})
 ```

 %% Cell type:code id:9e5f4090-f247-4ae2-bc2c-f940cd08e7bc tags:

 ``` python
 annotation_status.to_csv('../data/annotation_status.tsv', sep='\t', index=False)
 ```

 %% Cell type:code id:de4fc20f-1168-4216-9b5d-4d2082c00c5d tags:

 ``` python
 total_overlap = len(annotated)
 total = len(orthogroups_prot)
 print(f'A total of {total_overlap} out of {total} annotated peptides agree \
 in annotation between sequence and structure ({total_overlap / total * 100:.2f}%).')
 ```

 %% Output

    A total of 15870 out of 16589 annotated peptides agree in annotation between sequence and structure (95.67%).