{
 "metadata": {
  "name": "",
  "signature": "sha256:b92512874c91aad166c4ddaa7e5ab025f3b507daa94800a828d27019a8d7c6e8"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import HTSeq\n",
      "import itertools\n",
      "import numpy as np\n",
      "from numpy import random\n",
      "import matplotlib  \n",
      "import os\n",
      "import pandas as pd\n",
      "from pandas import Series, DataFrame\n",
      "\n",
      "import re\n",
      "\n",
      "\n",
      "data_path = \"/g/scb/patil/klaus/AlignedDataElli/\"\n",
      "gtf=\"/g/huber/users/klaus/Data/Genomes/Mus_musculus/GRCm38_ENSEMBL78/Mus_musculus.GRCm38.78.gtf\"\n",
      "\n",
      "halfwinwidth = 3000\n",
      "fragmentsize = 150"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%load_ext rpy2.ipython\n",
      "import rpy2\n",
      "import pandas.rpy.common as com"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "The rpy2.ipython extension is already loaded. To reload it, use:\n",
        "  %reload_ext rpy2.ipython\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "We follow the instructions here:\n",
      "\n",
      "http://www-huber.embl.de/users/anders/HTSeq/doc/tss.html#streaming-through-all-reads\n",
      "\n",
      "closely and use HTSeq to genereate coverage vectors for TSS\n",
      "\n",
      "We first load the bam files and the gtf file\n",
      "\n",
      "> itertools.islice(alignment_file, 5,6).next()\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "bams = os.listdir(data_path)\n",
      "#print re.search('s.bam$',bams[2])\n",
      "#print \n",
      "\n",
      "\n",
      "# filter to select only the SORTED bams\n",
      "bams = filter(lambda x:  re.search('s.bam$',x), bams)\n",
      "bams = [data_path + x for x in bams] \n",
      "print bams"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['/g/scb/patil/klaus/AlignedDataElli/p63ChIP_1h_s.bam', '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_wt_s.bam', '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_4h_s.bam']\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "os.listdir(data_path)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 47,
       "text": [
        "['p63ChIP_4h.bam',\n",
        " 'p63ChIP_1h_s.bam',\n",
        " 'p63ChIP_wt_s.bam',\n",
        " 'p63ChIP_wt.sam',\n",
        " 'p63ChIP_wt_s.bam.bai',\n",
        " 'p63ChIP_1h.bam',\n",
        " 'p63ChIP_4h_s.bam',\n",
        " 'p63ChIP_1h_s.bam.bai',\n",
        " 'p63ChIP_1h.sam',\n",
        " 'p63ChIP_wt.bam',\n",
        " 'p63ChIP_4h.sam',\n",
        " 'p63ChIP_4h_s.bam.bai']"
       ]
      }
     ],
     "prompt_number": 47
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "We now can read in the bam files and the gtf file"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gtffile = HTSeq.GFF_Reader(gtf)\n",
      "bamsHT = [HTSeq.BAM_Reader(x) for x in bams]\n",
      "bamsHT "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "[<HTSeq.BAM_Reader at 0x7fca6e183c50>,\n",
        " <HTSeq.BAM_Reader at 0x7fca6cc20c50>,\n",
        " <HTSeq.BAM_Reader at 0x7fca6cc20c90>]"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# load ENSEMBL IDs and get their genomic coordinates\n",
      "\n",
      "#os.listdir(\".\")\n",
      "genes = pd.read_csv(\"ElliGenes.csv\")\n",
      "del genes[\"Unnamed: 0\"]\n",
      "genes = genes.rename(columns={'x':'geneID'})\n",
      "genes.ix[1:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>geneID</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>1 </th>\n",
        "      <td> ENSMUSG00000023067</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2 </th>\n",
        "      <td> ENSMUSG00000024521</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3 </th>\n",
        "      <td> ENSMUSG00000044551</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4 </th>\n",
        "      <td> ENSMUSG00000046070</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5 </th>\n",
        "      <td> ENSMUSG00000034457</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6 </th>\n",
        "      <td> ENSMUSG00000027356</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7 </th>\n",
        "      <td> ENSMUSG00000020184</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8 </th>\n",
        "      <td> ENSMUSG00000048458</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9 </th>\n",
        "      <td> ENSMUSG00000020326</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
        "      <td> ENSMUSG00000024965</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 46,
       "text": [
        "                geneID\n",
        "1   ENSMUSG00000023067\n",
        "2   ENSMUSG00000024521\n",
        "3   ENSMUSG00000044551\n",
        "4   ENSMUSG00000046070\n",
        "5   ENSMUSG00000034457\n",
        "6   ENSMUSG00000027356\n",
        "7   ENSMUSG00000020184\n",
        "8   ENSMUSG00000048458\n",
        "9   ENSMUSG00000020326\n",
        "10  ENSMUSG00000024965"
       ]
      }
     ],
     "prompt_number": 46
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "?genes # genes is a DataFrame now\n",
      "\n",
      "genes[\"geneID\"].at[5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "'ENSMUSG00000034457'"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# create gene set to retain unique IDs\n",
      "geneSet = set(genes[\"geneID\"])\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# test membership\n",
      "assert 'ENSMUSG0000034457' in geneSet"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "ename": "AssertionError",
       "evalue": "",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
        "\u001b[1;32m<ipython-input-49-17f4e481f9f6>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# test membership\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32massert\u001b[0m \u001b[1;34m'ENSMUSG0000034457'\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mgeneSet\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
        "\u001b[1;31mAssertionError\u001b[0m: "
       ]
      }
     ],
     "prompt_number": 49
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "windows = Series(index = geneSet, dtype=\"object\")\n",
      "poss = Series(index = geneSet, dtype=\"object\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get TSS positions for candidate genes and create windows\n",
      "tsspos = HTSeq.GenomicArrayOfSets( \"auto\", stranded=False )\n",
      "for feature in gtffile:\n",
      "   if feature.type == \"exon\" and feature.attr[\"exon_number\"] == \"1\" and feature.attr[\"gene_id\"] in geneSet:\n",
      "      # print feature.attr[\"gene_id\"]\n",
      "      p = feature.iv.start_d_as_pos\n",
      "      window = HTSeq.GenomicInterval( p.chrom, max(p.pos - halfwinwidth,0), p.pos + halfwinwidth, \".\" )\n",
      "      tsspos[ window ] += p\n",
      "      # save TSS window in Series\n",
      "      windows[feature.attr[\"gene_id\"]] = window\n",
      "      # save TSS position in Series\n",
      "      poss[feature.attr[\"gene_id\"]] = p"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# join windwos and poss in a data frame\n",
      "#anno = DataFrame(dict(windows=windows, positions=poss), columns=[\"windows\",\"positions\"])\n",
      "anno = DataFrame(dict(windows=windows, positions=poss))\n",
      "\n",
      "print anno.columns.values\n",
      "print anno\n",
      "#print anno"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['positions' 'windows']\n",
        "                         positions                     windows\n",
        "ENSMUSG00000074063   8:119437188/+   8:[119434188,119440188)/.\n",
        "ENSMUSG00000003873    7:45466897/-     7:[45463897,45469897)/.\n",
        "ENSMUSG00000072845    5:86468989/-     5:[86465989,86471989)/.\n",
        "ENSMUSG00000070583   4:147868978/+   4:[147865978,147871978)/.\n",
        "ENSMUSG00000021668   13:96519159/-    13:[96516159,96522159)/.\n",
        "ENSMUSG00000021185  12:100885879/+  12:[100882879,100888879)/.\n",
        "ENSMUSG00000029055   4:155019427/-   4:[155016427,155022427)/.\n",
        "ENSMUSG00000045730   18:62179958/-    18:[62176958,62182958)/.\n",
        "ENSMUSG00000019577     6:5496274/-       6:[5493274,5499274)/.\n",
        "ENSMUSG00000027663    3:32366013/-     3:[32363013,32369013)/.\n",
        "ENSMUSG00000031298   X:160427291/+   X:[160424291,160430291)/.\n",
        "ENSMUSG00000025507   7:141443993/-   7:[141440993,141446993)/.\n",
        "ENSMUSG00000051910   7:115846104/-   7:[115843104,115849104)/.\n",
        "ENSMUSG00000070644   1:133367286/+   1:[133364286,133370286)/.\n",
        "ENSMUSG00000036960   3:145099103/-   3:[145096103,145102103)/.\n",
        "ENSMUSG00000015647   2:180178402/-   2:[180175402,180181402)/.\n",
        "ENSMUSG00000032135    9:44136860/+     9:[44133860,44139860)/.\n",
        "ENSMUSG00000022074   14:69767471/+    14:[69764471,69770471)/.\n",
        "ENSMUSG00000021798   14:34588680/-    14:[34585680,34591680)/.\n",
        "ENSMUSG00000020423   1:134079119/-   1:[134076119,134082119)/.\n",
        "ENSMUSG00000020184  10:117710713/-  10:[117707713,117713713)/.\n",
        "ENSMUSG00000037316    8:25785208/-     8:[25782208,25788208)/.\n",
        "ENSMUSG00000047854   16:62814675/+    16:[62811675,62817675)/.\n",
        "ENSMUSG00000032092    9:45042424/+     9:[45039424,45045424)/.\n",
        "ENSMUSG00000074968   2:110761563/-   2:[110758563,110764563)/.\n",
        "ENSMUSG00000007379   3:103149163/+   3:[103146163,103152163)/.\n",
        "ENSMUSG00000024965    19:7019468/-      19:[7016468,7022468)/.\n",
        "ENSMUSG00000023067   17:29098206/+    17:[29095206,29101206)/.\n",
        "ENSMUSG00000056076   5:140419304/+   5:[140416304,140422304)/.\n",
        "ENSMUSG00000037725    8:22185818/-     8:[22182818,22188818)/.\n",
        "...                            ...                         ...\n",
        "ENSMUSG00000017756   13:73763696/+    13:[73760696,73766696)/.\n",
        "ENSMUSG00000030641    7:92874212/-     7:[92871212,92877212)/.\n",
        "ENSMUSG00000062939    1:52105448/+     1:[52102448,52108448)/.\n",
        "ENSMUSG00000037509    1:34801740/+     1:[34798740,34804740)/.\n",
        "ENSMUSG00000037759   14:44988194/+    14:[44985194,44991194)/.\n",
        "ENSMUSG00000052087   13:55379168/+    13:[55376168,55382168)/.\n",
        "ENSMUSG00000023341   16:97558411/+    16:[97555411,97561411)/.\n",
        "ENSMUSG00000042106   9:107985631/-   9:[107982631,107988631)/.\n",
        "ENSMUSG00000044816    1:65123113/-     1:[65120113,65126113)/.\n",
        "ENSMUSG00000070645   1:133351908/+   1:[133348908,133354908)/.\n",
        "ENSMUSG00000027356   2:132941962/-   2:[132938962,132944962)/.\n",
        "ENSMUSG00000019558    X:73678892/+     X:[73675892,73681892)/.\n",
        "ENSMUSG00000026104    1:52152252/+     1:[52149252,52155252)/.\n",
        "ENSMUSG00000028992   4:149485162/-   4:[149482162,149488162)/.\n",
        "ENSMUSG00000032358    9:76545803/-     9:[76542803,76548803)/.\n",
        "ENSMUSG00000020326   11:40753937/-    11:[40750937,40756937)/.\n",
        "ENSMUSG00000028211    4:11164458/+     4:[11161458,11167458)/.\n",
        "ENSMUSG00000017386   11:78165515/-    11:[78162515,78168515)/.\n",
        "ENSMUSG00000084128   8:106136684/-   8:[106133684,106139684)/.\n",
        "ENSMUSG00000021277  12:111248482/+  12:[111245482,111251482)/.\n",
        "ENSMUSG00000068744   3:108386064/+   3:[108383064,108389064)/.\n",
        "ENSMUSG00000030107   6:121245905/+   6:[121242905,121248905)/.\n",
        "ENSMUSG00000040918   1:164262352/+   1:[164259352,164265352)/.\n",
        "ENSMUSG00000048458   3:105705457/+   3:[105702457,105708457)/.\n",
        "ENSMUSG00000022372   15:66803748/-    15:[66800748,66806748)/.\n",
        "ENSMUSG00000036596    5:35525570/-     5:[35522570,35528570)/.\n",
        "ENSMUSG00000086321    4:83390667/-     4:[83387667,83393667)/.\n",
        "ENSMUSG00000042179   19:58728886/+    19:[58725886,58731886)/.\n",
        "ENSMUSG00000027208   2:126034972/+   2:[126031972,126037972)/.\n",
        "ENSMUSG00000044813    4:45532469/-     4:[45529469,45535469)/.\n",
        "\n",
        "[148 rows x 2 columns]\n"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "dates = pd.date_range('1/1/2000', periods=8)\n",
      "\n",
      "df = DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])\n",
      "\n",
      "df[\"A\"]\n",
      "df.columns.values"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 264,
       "text": [
        "array(['A', 'B', 'C', 'D'], dtype=object)"
       ]
      }
     ],
     "prompt_number": 264
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# remove NAs by usage of dropna\n",
      "test = windows.copy()\n",
      "test.dropna()\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 19,
       "text": [
        "ENSMUSG00000074063     8:[119434188,119440188)/.\n",
        "ENSMUSG00000003873       7:[45463897,45469897)/.\n",
        "ENSMUSG00000072845       5:[86465989,86471989)/.\n",
        "ENSMUSG00000070583     4:[147865978,147871978)/.\n",
        "ENSMUSG00000021668      13:[96516159,96522159)/.\n",
        "ENSMUSG00000021185    12:[100882879,100888879)/.\n",
        "ENSMUSG00000029055     4:[155016427,155022427)/.\n",
        "ENSMUSG00000045730      18:[62176958,62182958)/.\n",
        "ENSMUSG00000019577         6:[5493274,5499274)/.\n",
        "ENSMUSG00000027663       3:[32363013,32369013)/.\n",
        "ENSMUSG00000031298     X:[160424291,160430291)/.\n",
        "ENSMUSG00000025507     7:[141440993,141446993)/.\n",
        "ENSMUSG00000051910     7:[115843104,115849104)/.\n",
        "ENSMUSG00000070644     1:[133364286,133370286)/.\n",
        "ENSMUSG00000036960     3:[145096103,145102103)/.\n",
        "...\n",
        "ENSMUSG00000020326      11:[40750937,40756937)/.\n",
        "ENSMUSG00000028211       4:[11161458,11167458)/.\n",
        "ENSMUSG00000017386      11:[78162515,78168515)/.\n",
        "ENSMUSG00000084128     8:[106133684,106139684)/.\n",
        "ENSMUSG00000021277    12:[111245482,111251482)/.\n",
        "ENSMUSG00000068744     3:[108383064,108389064)/.\n",
        "ENSMUSG00000030107     6:[121242905,121248905)/.\n",
        "ENSMUSG00000040918     1:[164259352,164265352)/.\n",
        "ENSMUSG00000048458     3:[105702457,105708457)/.\n",
        "ENSMUSG00000022372      15:[66800748,66806748)/.\n",
        "ENSMUSG00000036596       5:[35522570,35528570)/.\n",
        "ENSMUSG00000086321       4:[83387667,83393667)/.\n",
        "ENSMUSG00000042179      19:[58725886,58731886)/.\n",
        "ENSMUSG00000027208     2:[126031972,126037972)/.\n",
        "ENSMUSG00000044813       4:[45529469,45535469)/.\n",
        "Length: 147, dtype: object"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get the windows only\n",
      "anno = anno.dropna()\n",
      "test = anno.columns.values\n",
      "print test\n",
      "anno.loc[:,\"windows\"] "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "['positions' 'windows']\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 18,
       "text": [
        "ENSMUSG00000074063     8:[119434188,119440188)/.\n",
        "ENSMUSG00000003873       7:[45463897,45469897)/.\n",
        "ENSMUSG00000072845       5:[86465989,86471989)/.\n",
        "ENSMUSG00000070583     4:[147865978,147871978)/.\n",
        "ENSMUSG00000021668      13:[96516159,96522159)/.\n",
        "ENSMUSG00000021185    12:[100882879,100888879)/.\n",
        "ENSMUSG00000029055     4:[155016427,155022427)/.\n",
        "ENSMUSG00000045730      18:[62176958,62182958)/.\n",
        "ENSMUSG00000019577         6:[5493274,5499274)/.\n",
        "ENSMUSG00000027663       3:[32363013,32369013)/.\n",
        "ENSMUSG00000031298     X:[160424291,160430291)/.\n",
        "ENSMUSG00000025507     7:[141440993,141446993)/.\n",
        "ENSMUSG00000051910     7:[115843104,115849104)/.\n",
        "ENSMUSG00000070644     1:[133364286,133370286)/.\n",
        "ENSMUSG00000036960     3:[145096103,145102103)/.\n",
        "...\n",
        "ENSMUSG00000020326      11:[40750937,40756937)/.\n",
        "ENSMUSG00000028211       4:[11161458,11167458)/.\n",
        "ENSMUSG00000017386      11:[78162515,78168515)/.\n",
        "ENSMUSG00000084128     8:[106133684,106139684)/.\n",
        "ENSMUSG00000021277    12:[111245482,111251482)/.\n",
        "ENSMUSG00000068744     3:[108383064,108389064)/.\n",
        "ENSMUSG00000030107     6:[121242905,121248905)/.\n",
        "ENSMUSG00000040918     1:[164259352,164265352)/.\n",
        "ENSMUSG00000048458     3:[105702457,105708457)/.\n",
        "ENSMUSG00000022372      15:[66800748,66806748)/.\n",
        "ENSMUSG00000036596       5:[35522570,35528570)/.\n",
        "ENSMUSG00000086321       4:[83387667,83393667)/.\n",
        "ENSMUSG00000042179      19:[58725886,58731886)/.\n",
        "ENSMUSG00000027208     2:[126031972,126037972)/.\n",
        "ENSMUSG00000044813       4:[45529469,45535469)/.\n",
        "Name: windows, Length: 147, dtype: object"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get test reads from unsorted BAM file\n",
      "atest= list(itertools.islice(HTSeq.BAM_Reader('/g/scb/patil/klaus/AlignedDataElli/p63ChIP_1h.bam'),1,1e4))\n",
      "\n",
      "for r in atest:\n",
      "    #print r\n",
      "    if r.iv is not None: \n",
      "        r.iv.length = fragmentsize\n",
      "\n",
      "atest[1:5]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "[<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:1592:1019#0/1' aligned to 13:[107227837,107227987)/+>,\n",
        " <SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:2803:1020#0/1' aligned to 19:[31615405,31615555)/->,\n",
        " <SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:3715:1021#0/1' aligned to 5:[124572706,124572856)/->,\n",
        " <SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:2263:1020#0/1' aligned to 15:[80638244,80638394)/+>]"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# test directionality of the positions, downstream and upstream are flipped for - strand FEATURES!\n",
      "start = 31615300\n",
      "end = start + 6000\n",
      "print \"window start end\", start, end\n",
      "print \"read start end\", atest[2].iv.start, atest[2].iv.end\n",
      "print \"position in profile for Window\", end-atest[2].iv.end,  end-atest[2].iv.start\n",
      "print \"position in profile for Window  method2\", end-atest[2].iv.start_d,  end-atest[2].iv.end_d"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "window start end 31615300 31621300\n",
        "read start end 31615405 31615555\n",
        "position in profile for Window 5745 5895\n",
        "position in profile for Window  method2 5746 5896\n"
       ]
      }
     ],
     "prompt_number": 184
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# test iterating through the windows\n",
      "for g, ann in  anno.iterrows():\n",
      "    #print g, ann['windows']\n",
      "    for r in atest:\n",
      "        if r.iv is not None:\n",
      "            if r.iv.overlaps(ann[\"windows\"]):  \n",
      "                print r ,\"is in\", ann[\"windows\"]\n",
      "                if ann[\"positions\"].strand == \"+\":\n",
      "                    startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n",
      "                    endInWindow   = min(r.iv.end   - ann[\"positions\"].pos + halfwinwidth,5999)\n",
      "                else:\n",
      "                    startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n",
      "                    endInWindow   = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n",
      "                print \"start in Window\", startInWindow, \"end in Window\", endInWindow\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:12120:1540#0/1' aligned to 15:[100720083,100720233)/+> is in 15:[100717476,100723476)/.\n",
        "start in Window 3243 end in Window 3393\n",
        "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:1991:1333#0/1' aligned to X:[97376439,97376589)/->"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " is in X:[97374171,97380171)/.\n",
        "start in Window 3582 end in Window 3732\n",
        "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:11153:1052#0/1' aligned to 13:[73762336,73762486)/+>"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " is in 13:[73760696,73766696)/.\n",
        "start in Window 1640 end in Window 1790\n",
        "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:13112:1524#0/1' aligned to 2:[126033348,126033498)/+>"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " is in 2:[126031972,126037972)/.\n",
        "start in Window 1376 end in Window 1526\n"
       ]
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get a coverage vector for all windows using all aligned reads\n",
      "#\n",
      "coverages4h =  DataFrame(data=np.zeros(shape = (len(genes), 2*halfwinwidth)), index=genes[\"geneID\"].values)\n",
      "coverages1h =  DataFrame(data=np.zeros(shape = (len(genes), 2*halfwinwidth)), index=genes[\"geneID\"].values)\n",
      "coveragesWT =  DataFrame(data=np.zeros(shape = (len(genes), 2*halfwinwidth)), index=genes[\"geneID\"].values)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# test adding numbers\n",
      "coveragesWT.ix[\"ENSMUSG00000020326\",] += 1\n",
      "coveragesWT.ix[ [\"ENSMUSG00000020326\",\"ENSMUSG00000026989\"],]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>0</th>\n",
        "      <th>1</th>\n",
        "      <th>2</th>\n",
        "      <th>3</th>\n",
        "      <th>4</th>\n",
        "      <th>5</th>\n",
        "      <th>6</th>\n",
        "      <th>7</th>\n",
        "      <th>8</th>\n",
        "      <th>9</th>\n",
        "      <th>...</th>\n",
        "      <th>5990</th>\n",
        "      <th>5991</th>\n",
        "      <th>5992</th>\n",
        "      <th>5993</th>\n",
        "      <th>5994</th>\n",
        "      <th>5995</th>\n",
        "      <th>5996</th>\n",
        "      <th>5997</th>\n",
        "      <th>5998</th>\n",
        "      <th>5999</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>ENSMUSG00000020326</th>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td>...</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>ENSMUSG00000026989</th>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>2 rows \u00d7 6000 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 132,
       "text": [
        "                    0     1     2     3     4     5     6     7     8     \\\n",
        "ENSMUSG00000020326     4     4     4     4     4     4     4     4     4   \n",
        "ENSMUSG00000026989     0     0     0     0     0     0     0     0     0   \n",
        "\n",
        "                    9     ...   5990  5991  5992  5993  5994  5995  5996  \\\n",
        "ENSMUSG00000020326     4  ...      4     4     4     4     4     4     4   \n",
        "ENSMUSG00000026989     0  ...      0     0     0     0     0     0     0   \n",
        "\n",
        "                    5997  5998  5999  \n",
        "ENSMUSG00000020326     4     4     4  \n",
        "ENSMUSG00000026989     0     0     0  \n",
        "\n",
        "[2 rows x 6000 columns]"
       ]
      }
     ],
     "prompt_number": 132
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "bams"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 24,
       "text": [
        "['/g/scb/patil/klaus/AlignedDataElli/p63ChIP_1h_s.bam',\n",
        " '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_wt_s.bam',\n",
        " '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_4h_s.bam']"
       ]
      }
     ],
     "prompt_number": 24
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get coverage for 1hr\n",
      "for g, ann in  anno.iterrows():\n",
      "    for r in bamsHT[0][ann[\"windows\"]]:\n",
      "        if r.iv is not None and r.aligned and r.flag <> 256:\n",
      "            r.iv.length = fragmentsize # extend to fragment size\n",
      "            if ann[\"positions\"].strand == \"+\":\n",
      "                startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n",
      "                endInWindow   = min(r.iv.end   - ann[\"positions\"].pos + halfwinwidth,5999)\n",
      "            else:\n",
      "                startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n",
      "                endInWindow   = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n",
      "            coverages1h.ix[g,startInWindow : endInWindow ] += 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get coverage for WT\n",
      "for g, ann in  anno.iterrows():\n",
      "    for r in bamsHT[1][ann[\"windows\"]]:\n",
      "        if r.iv is not None and r.aligned and r.flag <> 256:\n",
      "            r.iv.length = fragmentsize # extend to fragment size\n",
      "            if ann[\"positions\"].strand == \"+\":\n",
      "                startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n",
      "                endInWindow   = min(r.iv.end   - ann[\"positions\"].pos + halfwinwidth,5999)\n",
      "            else:\n",
      "                startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n",
      "                endInWindow   = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n",
      "            coveragesWT.ix[g,startInWindow : endInWindow ] += 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# get coverage for 4hr\n",
      "for g, ann in  anno.iterrows():\n",
      "    for r in bamsHT[2][ann[\"windows\"]]:\n",
      "        if r.iv is not None and r.aligned and r.flag <> 256:\n",
      "            r.iv.length = fragmentsize # extend to fragment size\n",
      "            if ann[\"positions\"].strand == \"+\":\n",
      "                startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n",
      "                endInWindow   = min(r.iv.end   - ann[\"positions\"].pos + halfwinwidth,5999)\n",
      "            else:\n",
      "                startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n",
      "                endInWindow   = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n",
      "            coverages4h.ix[g,startInWindow : endInWindow ] += 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# check mean max coverage\n",
      "coverages4h.apply(max, axis = 0).mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 33,
       "text": [
        "17.569333333333333"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# turn coverage frames into R data.frames\n",
      "covWT = com.convert_to_r_dataframe(coveragesWT)\n",
      "covWT"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 32,
       "text": [
        "<DataFrame - Python:0x7fca67e10710 / R:0x8e00380>\n",
        "[Float..., Float..., Float..., ..., Float..., Float..., Float...]\n",
        "  X0: <class 'rpy2.robjects.vectors.FloatVector'>\n",
        "  <FloatVector - Python:0x7fca679585f0 / R:0x50cf1e0>\n",
        "[0.000000, 1.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]\n",
        "  X1: <class 'rpy2.robjects.vectors.FloatVector'>\n",
        "  <FloatVector - Python:0x7fca68d9ecf8 / R:0x5ea7a50>\n",
        "[0.000000, 1.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]\n",
        "  X2: <class 'rpy2.robjects.vectors.FloatVector'>\n",
        "  <FloatVector - Python:0x7fca68d9efc8 / R:0x503a750>\n",
        "[0.000000, 1.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]\n",
        "  ...\n",
        "  X0: <class 'rpy2.robjects.vectors.FloatVector'>\n",
        "  <FloatVector - Python:0x7fca6bd024d0 / R:0x87bf810>\n",
        "[1.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 8.000000]\n",
        "  X1: <class 'rpy2.robjects.vectors.FloatVector'>\n",
        "  <FloatVector - Python:0x7fca6bd02248 / R:0x87013f0>\n",
        "[1.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 8.000000]\n",
        "  X2: <class 'rpy2.robjects.vectors.FloatVector'>\n",
        "  <FloatVector - Python:0x7fca6bd02998 / R:0x8b7bf70>\n",
        "[1.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 8.000000]"
       ]
      }
     ],
     "prompt_number": 32
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# write coverages to csv files\n",
      "coveragesWT.to_csv(\"covWT.csv\")\n",
      "coverages1h.to_csv(\"cov1h.csv\")\n",
      "coverages4h.to_csv(\"cov4h.csv\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# check whether this has worked\n",
      "os.listdir(\".\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 31,
       "text": [
        "['ChipSeqElli_cache',\n",
        " 'ChipSeqElli.Rmd',\n",
        " 'ChipSeqElli.R',\n",
        " 'ChIPSeqTrimming.sh',\n",
        " 'ChipSeqElli.html',\n",
        " 'createBT2Index.sh',\n",
        " 'ChIPSeqTrimming.sh~',\n",
        " 'cluster-calls-Elli-ChIP-Seq.sh',\n",
        " 'cluster-calls-Elli-ChIP-Seq.sh~',\n",
        " 'bowtie2A.sh',\n",
        " 'ChIPQC-vignette.pdf',\n",
        " 'ChIPQC.pdf',\n",
        " 'ChIPQCSampleReport.pdf',\n",
        " 'sam2bam.sh',\n",
        " 'scriptsAleks',\n",
        " 'CoverageChIPSeq.py',\n",
        " 'ChiPSeqElli.ipynb',\n",
        " 'ElliGenes.csv',\n",
        " '.ipynb_checkpoints',\n",
        " 'bamSort.sh',\n",
        " 'bamSortIndex.sh',\n",
        " 'covWT.csv',\n",
        " 'cov1h.csv',\n",
        " 'cov4h.csv',\n",
        " 'ChipSeqElli_files',\n",
        " 'backGElli.csv',\n",
        " 'ChiPSeqElliControlSet.ipynb',\n",
        " 'covBackWT.csv',\n",
        " 'covBack1h.csv',\n",
        " 'covBack4h.csv',\n",
        " 'ElliGenes.txt',\n",
        " 'backgroundGenesTSS.pdf']"
       ]
      }
     ],
     "prompt_number": 31
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%R\n",
      "rnorm(10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "text": [
        " [1]  0.4145233  1.3614705  0.3975463  0.2241936 -1.4913814 -0.2301498\n",
        " [7]  0.1769197  0.4039700 -0.5776179 -0.5291591\n"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%R\n",
      "\n",
      "library(plyr)\n",
      "library(dplyr)\n",
      "library(magrittr)\n",
      "library(ggplot2)\n",
      "library(tidyr)\n",
      "\n",
      "covWT <- read.csv(\"covWT.csv\")\n",
      "covWT$time <- rep(\"WT\", dim(covWT)[1] )\n",
      "cov1h <- read.csv(\"cov1h.csv\")\n",
      "cov1h$time <- rep(\"1h\", dim(cov1h)[1] )\n",
      "cov4h <- read.csv(\"cov4h.csv\")\n",
      "cov4h$time <- rep(\"4h\", dim(cov4h)[1] )\n",
      "covs <- rbind.fill(covWT, cov1h, cov4h)\n",
      "names(covs) <- c(\"geneID\", setdiff(seq(-3000,3000),0), \"time\")\n",
      "print(sample_n(covs, 10)[, sample(5900,10)])\n",
      "\n",
      "dataGG <- covs %>%\n",
      "    gather(key = \"posRelToTSS\", value = \"coverage\", -geneID, -time)\n",
      "    \n",
      "dataGG$posRelToTSS  %<>% as.integer()\n",
      "\n",
      "covs_collapsed <- dataGG %>%\n",
      "        group_by(time, posRelToTSS) %>%\n",
      "                  summarize(coverageC = mean(coverage))\n",
      "print(covs_collapsed)\n",
      "\n",
      "#qplot(posRelToTSS, coverageC,  color = time, \n",
      "     # data=covs_collapsed, geom=\"smooth\") "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "text": [
        "    2744 -2958 1004 1422 281 784 -1505 1017 -2615 -2580\n",
        "3      3     0    0    1   2   1     3    0     3     2\n",
        "58     0     3    0    3   1   3     1    4     0     2\n",
        "352    4     0    5    7   5   1     1    3     0     0\n",
        "374    8     5    0    0   1   0     2    0     1     1\n",
        "300    2     0    4    0   7   1     0    4     0     0\n",
        "57     2     0    4    3   2   0     1    1     0     0\n",
        "138    4     0    0    7   2   0     3    1     5    10\n",
        "343    0     0    9    4   1   0     0   10     2     0\n",
        "70     0     1    0    2   2   3     0    0     0     0\n",
        "363    2     0    5    1   1   0     1    5     0     0\n",
        "  coverageC\n",
        "1  1.713629\n"
       ]
      }
     ],
     "prompt_number": 45
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "covs"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "ename": "NameError",
       "evalue": "name 'covs' is not defined",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
        "\u001b[1;32m<ipython-input-39-669ca51bd5a9>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcovs\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
        "\u001b[1;31mNameError\u001b[0m: name 'covs' is not defined"
       ]
      }
     ],
     "prompt_number": 39
    }
   ],
   "metadata": {}
  }
 ]
}