{ "metadata": { "name": "", "signature": "sha256:b92512874c91aad166c4ddaa7e5ab025f3b507daa94800a828d27019a8d7c6e8" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import HTSeq\n", "import itertools\n", "import numpy as np\n", "from numpy import random\n", "import matplotlib \n", "import os\n", "import pandas as pd\n", "from pandas import Series, DataFrame\n", "\n", "import re\n", "\n", "\n", "data_path = \"/g/scb/patil/klaus/AlignedDataElli/\"\n", "gtf=\"/g/huber/users/klaus/Data/Genomes/Mus_musculus/GRCm38_ENSEMBL78/Mus_musculus.GRCm38.78.gtf\"\n", "\n", "halfwinwidth = 3000\n", "fragmentsize = 150" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "%load_ext rpy2.ipython\n", "import rpy2\n", "import pandas.rpy.common as com" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "The rpy2.ipython extension is already loaded. To reload it, use:\n", " %reload_ext rpy2.ipython\n" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "We follow the instructions here:\n", "\n", "http://www-huber.embl.de/users/anders/HTSeq/doc/tss.html#streaming-through-all-reads\n", "\n", "closely and use HTSeq to genereate coverage vectors for TSS\n", "\n", "We first load the bam files and the gtf file\n", "\n", "> itertools.islice(alignment_file, 5,6).next()\n" ] }, { "cell_type": "code", "collapsed": false, "input": [ "bams = os.listdir(data_path)\n", "#print re.search('s.bam$',bams[2])\n", "#print \n", "\n", "\n", "# filter to select only the SORTED bams\n", "bams = filter(lambda x: re.search('s.bam$',x), bams)\n", "bams = [data_path + x for x in bams] \n", "print bams" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['/g/scb/patil/klaus/AlignedDataElli/p63ChIP_1h_s.bam', '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_wt_s.bam', '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_4h_s.bam']\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "os.listdir(data_path)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 47, "text": [ "['p63ChIP_4h.bam',\n", " 'p63ChIP_1h_s.bam',\n", " 'p63ChIP_wt_s.bam',\n", " 'p63ChIP_wt.sam',\n", " 'p63ChIP_wt_s.bam.bai',\n", " 'p63ChIP_1h.bam',\n", " 'p63ChIP_4h_s.bam',\n", " 'p63ChIP_1h_s.bam.bai',\n", " 'p63ChIP_1h.sam',\n", " 'p63ChIP_wt.bam',\n", " 'p63ChIP_4h.sam',\n", " 'p63ChIP_4h_s.bam.bai']" ] } ], "prompt_number": 47 }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now can read in the bam files and the gtf file" ] }, { "cell_type": "code", "collapsed": false, "input": [ "gtffile = HTSeq.GFF_Reader(gtf)\n", "bamsHT = [HTSeq.BAM_Reader(x) for x in bams]\n", "bamsHT " ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ "[<HTSeq.BAM_Reader at 0x7fca6e183c50>,\n", " <HTSeq.BAM_Reader at 0x7fca6cc20c50>,\n", " <HTSeq.BAM_Reader at 0x7fca6cc20c90>]" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# load ENSEMBL IDs and get their genomic coordinates\n", "\n", "#os.listdir(\".\")\n", "genes = pd.read_csv(\"ElliGenes.csv\")\n", "del genes[\"Unnamed: 0\"]\n", "genes = genes.rename(columns={'x':'geneID'})\n", "genes.ix[1:10]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>geneID</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1 </th>\n", " <td> ENSMUSG00000023067</td>\n", " </tr>\n", " <tr>\n", " <th>2 </th>\n", " <td> ENSMUSG00000024521</td>\n", " </tr>\n", " <tr>\n", " <th>3 </th>\n", " <td> ENSMUSG00000044551</td>\n", " </tr>\n", " <tr>\n", " <th>4 </th>\n", " <td> ENSMUSG00000046070</td>\n", " </tr>\n", " <tr>\n", " <th>5 </th>\n", " <td> ENSMUSG00000034457</td>\n", " </tr>\n", " <tr>\n", " <th>6 </th>\n", " <td> ENSMUSG00000027356</td>\n", " </tr>\n", " <tr>\n", " <th>7 </th>\n", " <td> ENSMUSG00000020184</td>\n", " </tr>\n", " <tr>\n", " <th>8 </th>\n", " <td> ENSMUSG00000048458</td>\n", " </tr>\n", " <tr>\n", " <th>9 </th>\n", " <td> ENSMUSG00000020326</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td> ENSMUSG00000024965</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "metadata": {}, "output_type": "pyout", "prompt_number": 46, "text": [ " geneID\n", "1 ENSMUSG00000023067\n", "2 ENSMUSG00000024521\n", "3 ENSMUSG00000044551\n", "4 ENSMUSG00000046070\n", "5 ENSMUSG00000034457\n", "6 ENSMUSG00000027356\n", "7 ENSMUSG00000020184\n", "8 ENSMUSG00000048458\n", "9 ENSMUSG00000020326\n", "10 ENSMUSG00000024965" ] } ], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "?genes # genes is a DataFrame now\n", "\n", "genes[\"geneID\"].at[5]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "'ENSMUSG00000034457'" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "# create gene set to retain unique IDs\n", "geneSet = set(genes[\"geneID\"])\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# test membership\n", "assert 'ENSMUSG0000034457' in geneSet" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "AssertionError", "evalue": "", "output_type": "pyerr", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m<ipython-input-49-17f4e481f9f6>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# test membership\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32massert\u001b[0m \u001b[1;34m'ENSMUSG0000034457'\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mgeneSet\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAssertionError\u001b[0m: " ] } ], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "windows = Series(index = geneSet, dtype=\"object\")\n", "poss = Series(index = geneSet, dtype=\"object\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "# get TSS positions for candidate genes and create windows\n", "tsspos = HTSeq.GenomicArrayOfSets( \"auto\", stranded=False )\n", "for feature in gtffile:\n", " if feature.type == \"exon\" and feature.attr[\"exon_number\"] == \"1\" and feature.attr[\"gene_id\"] in geneSet:\n", " # print feature.attr[\"gene_id\"]\n", " p = feature.iv.start_d_as_pos\n", " window = HTSeq.GenomicInterval( p.chrom, max(p.pos - halfwinwidth,0), p.pos + halfwinwidth, \".\" )\n", " tsspos[ window ] += p\n", " # save TSS window in Series\n", " windows[feature.attr[\"gene_id\"]] = window\n", " # save TSS position in Series\n", " poss[feature.attr[\"gene_id\"]] = p" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "# join windwos and poss in a data frame\n", "#anno = DataFrame(dict(windows=windows, positions=poss), columns=[\"windows\",\"positions\"])\n", "anno = DataFrame(dict(windows=windows, positions=poss))\n", "\n", "print anno.columns.values\n", "print anno\n", "#print anno" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['positions' 'windows']\n", " positions windows\n", "ENSMUSG00000074063 8:119437188/+ 8:[119434188,119440188)/.\n", "ENSMUSG00000003873 7:45466897/- 7:[45463897,45469897)/.\n", "ENSMUSG00000072845 5:86468989/- 5:[86465989,86471989)/.\n", "ENSMUSG00000070583 4:147868978/+ 4:[147865978,147871978)/.\n", "ENSMUSG00000021668 13:96519159/- 13:[96516159,96522159)/.\n", "ENSMUSG00000021185 12:100885879/+ 12:[100882879,100888879)/.\n", "ENSMUSG00000029055 4:155019427/- 4:[155016427,155022427)/.\n", "ENSMUSG00000045730 18:62179958/- 18:[62176958,62182958)/.\n", "ENSMUSG00000019577 6:5496274/- 6:[5493274,5499274)/.\n", "ENSMUSG00000027663 3:32366013/- 3:[32363013,32369013)/.\n", "ENSMUSG00000031298 X:160427291/+ X:[160424291,160430291)/.\n", "ENSMUSG00000025507 7:141443993/- 7:[141440993,141446993)/.\n", "ENSMUSG00000051910 7:115846104/- 7:[115843104,115849104)/.\n", "ENSMUSG00000070644 1:133367286/+ 1:[133364286,133370286)/.\n", "ENSMUSG00000036960 3:145099103/- 3:[145096103,145102103)/.\n", "ENSMUSG00000015647 2:180178402/- 2:[180175402,180181402)/.\n", "ENSMUSG00000032135 9:44136860/+ 9:[44133860,44139860)/.\n", "ENSMUSG00000022074 14:69767471/+ 14:[69764471,69770471)/.\n", "ENSMUSG00000021798 14:34588680/- 14:[34585680,34591680)/.\n", "ENSMUSG00000020423 1:134079119/- 1:[134076119,134082119)/.\n", "ENSMUSG00000020184 10:117710713/- 10:[117707713,117713713)/.\n", "ENSMUSG00000037316 8:25785208/- 8:[25782208,25788208)/.\n", "ENSMUSG00000047854 16:62814675/+ 16:[62811675,62817675)/.\n", "ENSMUSG00000032092 9:45042424/+ 9:[45039424,45045424)/.\n", "ENSMUSG00000074968 2:110761563/- 2:[110758563,110764563)/.\n", "ENSMUSG00000007379 3:103149163/+ 3:[103146163,103152163)/.\n", "ENSMUSG00000024965 19:7019468/- 19:[7016468,7022468)/.\n", "ENSMUSG00000023067 17:29098206/+ 17:[29095206,29101206)/.\n", "ENSMUSG00000056076 5:140419304/+ 5:[140416304,140422304)/.\n", "ENSMUSG00000037725 8:22185818/- 8:[22182818,22188818)/.\n", "... ... ...\n", "ENSMUSG00000017756 13:73763696/+ 13:[73760696,73766696)/.\n", "ENSMUSG00000030641 7:92874212/- 7:[92871212,92877212)/.\n", "ENSMUSG00000062939 1:52105448/+ 1:[52102448,52108448)/.\n", "ENSMUSG00000037509 1:34801740/+ 1:[34798740,34804740)/.\n", "ENSMUSG00000037759 14:44988194/+ 14:[44985194,44991194)/.\n", "ENSMUSG00000052087 13:55379168/+ 13:[55376168,55382168)/.\n", "ENSMUSG00000023341 16:97558411/+ 16:[97555411,97561411)/.\n", "ENSMUSG00000042106 9:107985631/- 9:[107982631,107988631)/.\n", "ENSMUSG00000044816 1:65123113/- 1:[65120113,65126113)/.\n", "ENSMUSG00000070645 1:133351908/+ 1:[133348908,133354908)/.\n", "ENSMUSG00000027356 2:132941962/- 2:[132938962,132944962)/.\n", "ENSMUSG00000019558 X:73678892/+ X:[73675892,73681892)/.\n", "ENSMUSG00000026104 1:52152252/+ 1:[52149252,52155252)/.\n", "ENSMUSG00000028992 4:149485162/- 4:[149482162,149488162)/.\n", "ENSMUSG00000032358 9:76545803/- 9:[76542803,76548803)/.\n", "ENSMUSG00000020326 11:40753937/- 11:[40750937,40756937)/.\n", "ENSMUSG00000028211 4:11164458/+ 4:[11161458,11167458)/.\n", "ENSMUSG00000017386 11:78165515/- 11:[78162515,78168515)/.\n", "ENSMUSG00000084128 8:106136684/- 8:[106133684,106139684)/.\n", "ENSMUSG00000021277 12:111248482/+ 12:[111245482,111251482)/.\n", "ENSMUSG00000068744 3:108386064/+ 3:[108383064,108389064)/.\n", "ENSMUSG00000030107 6:121245905/+ 6:[121242905,121248905)/.\n", "ENSMUSG00000040918 1:164262352/+ 1:[164259352,164265352)/.\n", "ENSMUSG00000048458 3:105705457/+ 3:[105702457,105708457)/.\n", "ENSMUSG00000022372 15:66803748/- 15:[66800748,66806748)/.\n", "ENSMUSG00000036596 5:35525570/- 5:[35522570,35528570)/.\n", "ENSMUSG00000086321 4:83390667/- 4:[83387667,83393667)/.\n", "ENSMUSG00000042179 19:58728886/+ 19:[58725886,58731886)/.\n", "ENSMUSG00000027208 2:126034972/+ 2:[126031972,126037972)/.\n", "ENSMUSG00000044813 4:45532469/- 4:[45529469,45535469)/.\n", "\n", "[148 rows x 2 columns]\n" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "dates = pd.date_range('1/1/2000', periods=8)\n", "\n", "df = DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])\n", "\n", "df[\"A\"]\n", "df.columns.values" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 264, "text": [ "array(['A', 'B', 'C', 'D'], dtype=object)" ] } ], "prompt_number": 264 }, { "cell_type": "code", "collapsed": false, "input": [ "# remove NAs by usage of dropna\n", "test = windows.copy()\n", "test.dropna()\n" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ "ENSMUSG00000074063 8:[119434188,119440188)/.\n", "ENSMUSG00000003873 7:[45463897,45469897)/.\n", "ENSMUSG00000072845 5:[86465989,86471989)/.\n", "ENSMUSG00000070583 4:[147865978,147871978)/.\n", "ENSMUSG00000021668 13:[96516159,96522159)/.\n", "ENSMUSG00000021185 12:[100882879,100888879)/.\n", "ENSMUSG00000029055 4:[155016427,155022427)/.\n", "ENSMUSG00000045730 18:[62176958,62182958)/.\n", "ENSMUSG00000019577 6:[5493274,5499274)/.\n", "ENSMUSG00000027663 3:[32363013,32369013)/.\n", "ENSMUSG00000031298 X:[160424291,160430291)/.\n", "ENSMUSG00000025507 7:[141440993,141446993)/.\n", "ENSMUSG00000051910 7:[115843104,115849104)/.\n", "ENSMUSG00000070644 1:[133364286,133370286)/.\n", "ENSMUSG00000036960 3:[145096103,145102103)/.\n", "...\n", "ENSMUSG00000020326 11:[40750937,40756937)/.\n", "ENSMUSG00000028211 4:[11161458,11167458)/.\n", "ENSMUSG00000017386 11:[78162515,78168515)/.\n", "ENSMUSG00000084128 8:[106133684,106139684)/.\n", "ENSMUSG00000021277 12:[111245482,111251482)/.\n", "ENSMUSG00000068744 3:[108383064,108389064)/.\n", "ENSMUSG00000030107 6:[121242905,121248905)/.\n", "ENSMUSG00000040918 1:[164259352,164265352)/.\n", "ENSMUSG00000048458 3:[105702457,105708457)/.\n", "ENSMUSG00000022372 15:[66800748,66806748)/.\n", "ENSMUSG00000036596 5:[35522570,35528570)/.\n", "ENSMUSG00000086321 4:[83387667,83393667)/.\n", "ENSMUSG00000042179 19:[58725886,58731886)/.\n", "ENSMUSG00000027208 2:[126031972,126037972)/.\n", "ENSMUSG00000044813 4:[45529469,45535469)/.\n", "Length: 147, dtype: object" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "# get the windows only\n", "anno = anno.dropna()\n", "test = anno.columns.values\n", "print test\n", "anno.loc[:,\"windows\"] " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['positions' 'windows']\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 18, "text": [ "ENSMUSG00000074063 8:[119434188,119440188)/.\n", "ENSMUSG00000003873 7:[45463897,45469897)/.\n", "ENSMUSG00000072845 5:[86465989,86471989)/.\n", "ENSMUSG00000070583 4:[147865978,147871978)/.\n", "ENSMUSG00000021668 13:[96516159,96522159)/.\n", "ENSMUSG00000021185 12:[100882879,100888879)/.\n", "ENSMUSG00000029055 4:[155016427,155022427)/.\n", "ENSMUSG00000045730 18:[62176958,62182958)/.\n", "ENSMUSG00000019577 6:[5493274,5499274)/.\n", "ENSMUSG00000027663 3:[32363013,32369013)/.\n", "ENSMUSG00000031298 X:[160424291,160430291)/.\n", "ENSMUSG00000025507 7:[141440993,141446993)/.\n", "ENSMUSG00000051910 7:[115843104,115849104)/.\n", "ENSMUSG00000070644 1:[133364286,133370286)/.\n", "ENSMUSG00000036960 3:[145096103,145102103)/.\n", "...\n", "ENSMUSG00000020326 11:[40750937,40756937)/.\n", "ENSMUSG00000028211 4:[11161458,11167458)/.\n", "ENSMUSG00000017386 11:[78162515,78168515)/.\n", "ENSMUSG00000084128 8:[106133684,106139684)/.\n", "ENSMUSG00000021277 12:[111245482,111251482)/.\n", "ENSMUSG00000068744 3:[108383064,108389064)/.\n", "ENSMUSG00000030107 6:[121242905,121248905)/.\n", "ENSMUSG00000040918 1:[164259352,164265352)/.\n", "ENSMUSG00000048458 3:[105702457,105708457)/.\n", "ENSMUSG00000022372 15:[66800748,66806748)/.\n", "ENSMUSG00000036596 5:[35522570,35528570)/.\n", "ENSMUSG00000086321 4:[83387667,83393667)/.\n", "ENSMUSG00000042179 19:[58725886,58731886)/.\n", "ENSMUSG00000027208 2:[126031972,126037972)/.\n", "ENSMUSG00000044813 4:[45529469,45535469)/.\n", "Name: windows, Length: 147, dtype: object" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "# get test reads from unsorted BAM file\n", "atest= list(itertools.islice(HTSeq.BAM_Reader('/g/scb/patil/klaus/AlignedDataElli/p63ChIP_1h.bam'),1,1e4))\n", "\n", "for r in atest:\n", " #print r\n", " if r.iv is not None: \n", " r.iv.length = fragmentsize\n", "\n", "atest[1:5]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 20, "text": [ "[<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:1592:1019#0/1' aligned to 13:[107227837,107227987)/+>,\n", " <SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:2803:1020#0/1' aligned to 19:[31615405,31615555)/->,\n", " <SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:3715:1021#0/1' aligned to 5:[124572706,124572856)/->,\n", " <SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:2263:1020#0/1' aligned to 15:[80638244,80638394)/+>]" ] } ], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "# test directionality of the positions, downstream and upstream are flipped for - strand FEATURES!\n", "start = 31615300\n", "end = start + 6000\n", "print \"window start end\", start, end\n", "print \"read start end\", atest[2].iv.start, atest[2].iv.end\n", "print \"position in profile for Window\", end-atest[2].iv.end, end-atest[2].iv.start\n", "print \"position in profile for Window method2\", end-atest[2].iv.start_d, end-atest[2].iv.end_d" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "window start end 31615300 31621300\n", "read start end 31615405 31615555\n", "position in profile for Window 5745 5895\n", "position in profile for Window method2 5746 5896\n" ] } ], "prompt_number": 184 }, { "cell_type": "code", "collapsed": false, "input": [ "# test iterating through the windows\n", "for g, ann in anno.iterrows():\n", " #print g, ann['windows']\n", " for r in atest:\n", " if r.iv is not None:\n", " if r.iv.overlaps(ann[\"windows\"]): \n", " print r ,\"is in\", ann[\"windows\"]\n", " if ann[\"positions\"].strand == \"+\":\n", " startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n", " endInWindow = min(r.iv.end - ann[\"positions\"].pos + halfwinwidth,5999)\n", " else:\n", " startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n", " endInWindow = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n", " print \"start in Window\", startInWindow, \"end in Window\", endInWindow\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:12120:1540#0/1' aligned to 15:[100720083,100720233)/+> is in 15:[100717476,100723476)/.\n", "start in Window 3243 end in Window 3393\n", "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:1991:1333#0/1' aligned to X:[97376439,97376589)/->" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " is in X:[97374171,97380171)/.\n", "start in Window 3582 end in Window 3732\n", "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:11153:1052#0/1' aligned to 13:[73762336,73762486)/+>" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " is in 13:[73760696,73766696)/.\n", "start in Window 1640 end in Window 1790\n", "<SAM_Alignment object: Read 'SOLEXAWS1_0001:5:1:13112:1524#0/1' aligned to 2:[126033348,126033498)/+>" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " is in 2:[126031972,126037972)/.\n", "start in Window 1376 end in Window 1526\n" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "# get a coverage vector for all windows using all aligned reads\n", "#\n", "coverages4h = DataFrame(data=np.zeros(shape = (len(genes), 2*halfwinwidth)), index=genes[\"geneID\"].values)\n", "coverages1h = DataFrame(data=np.zeros(shape = (len(genes), 2*halfwinwidth)), index=genes[\"geneID\"].values)\n", "coveragesWT = DataFrame(data=np.zeros(shape = (len(genes), 2*halfwinwidth)), index=genes[\"geneID\"].values)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "# test adding numbers\n", "coveragesWT.ix[\"ENSMUSG00000020326\",] += 1\n", "coveragesWT.ix[ [\"ENSMUSG00000020326\",\"ENSMUSG00000026989\"],]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " <th>5</th>\n", " <th>6</th>\n", " <th>7</th>\n", " <th>8</th>\n", " <th>9</th>\n", " <th>...</th>\n", " <th>5990</th>\n", " <th>5991</th>\n", " <th>5992</th>\n", " <th>5993</th>\n", " <th>5994</th>\n", " <th>5995</th>\n", " <th>5996</th>\n", " <th>5997</th>\n", " <th>5998</th>\n", " <th>5999</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>ENSMUSG00000020326</th>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td>...</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " <td> 4</td>\n", " </tr>\n", " <tr>\n", " <th>ENSMUSG00000026989</th>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td>...</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " <td> 0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>2 rows \u00d7 6000 columns</p>\n", "</div>" ], "metadata": {}, "output_type": "pyout", "prompt_number": 132, "text": [ " 0 1 2 3 4 5 6 7 8 \\\n", "ENSMUSG00000020326 4 4 4 4 4 4 4 4 4 \n", "ENSMUSG00000026989 0 0 0 0 0 0 0 0 0 \n", "\n", " 9 ... 5990 5991 5992 5993 5994 5995 5996 \\\n", "ENSMUSG00000020326 4 ... 4 4 4 4 4 4 4 \n", "ENSMUSG00000026989 0 ... 0 0 0 0 0 0 0 \n", "\n", " 5997 5998 5999 \n", "ENSMUSG00000020326 4 4 4 \n", "ENSMUSG00000026989 0 0 0 \n", "\n", "[2 rows x 6000 columns]" ] } ], "prompt_number": 132 }, { "cell_type": "code", "collapsed": false, "input": [ "bams" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": [ "['/g/scb/patil/klaus/AlignedDataElli/p63ChIP_1h_s.bam',\n", " '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_wt_s.bam',\n", " '/g/scb/patil/klaus/AlignedDataElli/p63ChIP_4h_s.bam']" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "# get coverage for 1hr\n", "for g, ann in anno.iterrows():\n", " for r in bamsHT[0][ann[\"windows\"]]:\n", " if r.iv is not None and r.aligned and r.flag <> 256:\n", " r.iv.length = fragmentsize # extend to fragment size\n", " if ann[\"positions\"].strand == \"+\":\n", " startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n", " endInWindow = min(r.iv.end - ann[\"positions\"].pos + halfwinwidth,5999)\n", " else:\n", " startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n", " endInWindow = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n", " coverages1h.ix[g,startInWindow : endInWindow ] += 1" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "# get coverage for WT\n", "for g, ann in anno.iterrows():\n", " for r in bamsHT[1][ann[\"windows\"]]:\n", " if r.iv is not None and r.aligned and r.flag <> 256:\n", " r.iv.length = fragmentsize # extend to fragment size\n", " if ann[\"positions\"].strand == \"+\":\n", " startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n", " endInWindow = min(r.iv.end - ann[\"positions\"].pos + halfwinwidth,5999)\n", " else:\n", " startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n", " endInWindow = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n", " coveragesWT.ix[g,startInWindow : endInWindow ] += 1" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "# get coverage for 4hr\n", "for g, ann in anno.iterrows():\n", " for r in bamsHT[2][ann[\"windows\"]]:\n", " if r.iv is not None and r.aligned and r.flag <> 256:\n", " r.iv.length = fragmentsize # extend to fragment size\n", " if ann[\"positions\"].strand == \"+\":\n", " startInWindow = max(r.iv.start - ann[\"positions\"].pos + halfwinwidth,0)\n", " endInWindow = min(r.iv.end - ann[\"positions\"].pos + halfwinwidth,5999)\n", " else:\n", " startInWindow = max(ann[\"positions\"].pos + halfwinwidth - r.iv.end,0)\n", " endInWindow = min(ann[\"positions\"].pos + halfwinwidth - r.iv.start,5999)\n", " coverages4h.ix[g,startInWindow : endInWindow ] += 1" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "# check mean max coverage\n", "coverages4h.apply(max, axis = 0).mean()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 33, "text": [ "17.569333333333333" ] } ], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "# turn coverage frames into R data.frames\n", "covWT = com.convert_to_r_dataframe(coveragesWT)\n", "covWT" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 32, "text": [ "<DataFrame - Python:0x7fca67e10710 / R:0x8e00380>\n", "[Float..., Float..., Float..., ..., Float..., Float..., Float...]\n", " X0: <class 'rpy2.robjects.vectors.FloatVector'>\n", " <FloatVector - Python:0x7fca679585f0 / R:0x50cf1e0>\n", "[0.000000, 1.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]\n", " X1: <class 'rpy2.robjects.vectors.FloatVector'>\n", " <FloatVector - Python:0x7fca68d9ecf8 / R:0x5ea7a50>\n", "[0.000000, 1.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]\n", " X2: <class 'rpy2.robjects.vectors.FloatVector'>\n", " <FloatVector - Python:0x7fca68d9efc8 / R:0x503a750>\n", "[0.000000, 1.000000, 0.000000, ..., 0.000000, 0.000000, 0.000000]\n", " ...\n", " X0: <class 'rpy2.robjects.vectors.FloatVector'>\n", " <FloatVector - Python:0x7fca6bd024d0 / R:0x87bf810>\n", "[1.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 8.000000]\n", " X1: <class 'rpy2.robjects.vectors.FloatVector'>\n", " <FloatVector - Python:0x7fca6bd02248 / R:0x87013f0>\n", "[1.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 8.000000]\n", " X2: <class 'rpy2.robjects.vectors.FloatVector'>\n", " <FloatVector - Python:0x7fca6bd02998 / R:0x8b7bf70>\n", "[1.000000, 0.000000, 0.000000, ..., 0.000000, 0.000000, 8.000000]" ] } ], "prompt_number": 32 }, { "cell_type": "code", "collapsed": false, "input": [ "# write coverages to csv files\n", "coveragesWT.to_csv(\"covWT.csv\")\n", "coverages1h.to_csv(\"cov1h.csv\")\n", "coverages4h.to_csv(\"cov4h.csv\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "# check whether this has worked\n", "os.listdir(\".\")" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 31, "text": [ "['ChipSeqElli_cache',\n", " 'ChipSeqElli.Rmd',\n", " 'ChipSeqElli.R',\n", " 'ChIPSeqTrimming.sh',\n", " 'ChipSeqElli.html',\n", " 'createBT2Index.sh',\n", " 'ChIPSeqTrimming.sh~',\n", " 'cluster-calls-Elli-ChIP-Seq.sh',\n", " 'cluster-calls-Elli-ChIP-Seq.sh~',\n", " 'bowtie2A.sh',\n", " 'ChIPQC-vignette.pdf',\n", " 'ChIPQC.pdf',\n", " 'ChIPQCSampleReport.pdf',\n", " 'sam2bam.sh',\n", " 'scriptsAleks',\n", " 'CoverageChIPSeq.py',\n", " 'ChiPSeqElli.ipynb',\n", " 'ElliGenes.csv',\n", " '.ipynb_checkpoints',\n", " 'bamSort.sh',\n", " 'bamSortIndex.sh',\n", " 'covWT.csv',\n", " 'cov1h.csv',\n", " 'cov4h.csv',\n", " 'ChipSeqElli_files',\n", " 'backGElli.csv',\n", " 'ChiPSeqElliControlSet.ipynb',\n", " 'covBackWT.csv',\n", " 'covBack1h.csv',\n", " 'covBack4h.csv',\n", " 'ElliGenes.txt',\n", " 'backgroundGenesTSS.pdf']" ] } ], "prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [ "%%R\n", "rnorm(10)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "display_data", "text": [ " [1] 0.4145233 1.3614705 0.3975463 0.2241936 -1.4913814 -0.2301498\n", " [7] 0.1769197 0.4039700 -0.5776179 -0.5291591\n" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "%%R\n", "\n", "library(plyr)\n", "library(dplyr)\n", "library(magrittr)\n", "library(ggplot2)\n", "library(tidyr)\n", "\n", "covWT <- read.csv(\"covWT.csv\")\n", "covWT$time <- rep(\"WT\", dim(covWT)[1] )\n", "cov1h <- read.csv(\"cov1h.csv\")\n", "cov1h$time <- rep(\"1h\", dim(cov1h)[1] )\n", "cov4h <- read.csv(\"cov4h.csv\")\n", "cov4h$time <- rep(\"4h\", dim(cov4h)[1] )\n", "covs <- rbind.fill(covWT, cov1h, cov4h)\n", "names(covs) <- c(\"geneID\", setdiff(seq(-3000,3000),0), \"time\")\n", "print(sample_n(covs, 10)[, sample(5900,10)])\n", "\n", "dataGG <- covs %>%\n", " gather(key = \"posRelToTSS\", value = \"coverage\", -geneID, -time)\n", " \n", "dataGG$posRelToTSS %<>% as.integer()\n", "\n", "covs_collapsed <- dataGG %>%\n", " group_by(time, posRelToTSS) %>%\n", " summarize(coverageC = mean(coverage))\n", "print(covs_collapsed)\n", "\n", "#qplot(posRelToTSS, coverageC, color = time, \n", " # data=covs_collapsed, geom=\"smooth\") " ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "display_data", "text": [ " 2744 -2958 1004 1422 281 784 -1505 1017 -2615 -2580\n", "3 3 0 0 1 2 1 3 0 3 2\n", "58 0 3 0 3 1 3 1 4 0 2\n", "352 4 0 5 7 5 1 1 3 0 0\n", "374 8 5 0 0 1 0 2 0 1 1\n", "300 2 0 4 0 7 1 0 4 0 0\n", "57 2 0 4 3 2 0 1 1 0 0\n", "138 4 0 0 7 2 0 3 1 5 10\n", "343 0 0 9 4 1 0 0 10 2 0\n", "70 0 1 0 2 2 3 0 0 0 0\n", "363 2 0 5 1 1 0 1 5 0 0\n", " coverageC\n", "1 1.713629\n" ] } ], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "covs" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'covs' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m<ipython-input-39-669ca51bd5a9>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcovs\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mNameError\u001b[0m: name 'covs' is not defined" ] } ], "prompt_number": 39 } ], "metadata": {} } ] }