Commit affc3a7f authored by VladaMilch's avatar VladaMilch
Browse files

first commit

parents
Package: pdProbeRemap
Type: Package
Title: What the package does (short line)
Version: 0.0.0
Date: 2016-03-23
Author: Vladislava Milchevskaya
Maintainer: Vladislava Milchevskaya <milchv@gmail.com>
Description: Given original Affymetrix probe grouping and a used-provided
reference genome annotation, the package builds new probe groupings in order to
meet the original requirements (as unique mappings of probes and existence of
enough probes at each probe set). The package provides a chipname.db annotation
package, compatible with oligo for further analysis.
License: Artistic-2.0
Collate:
'00_data.R'
'01_FROM_schema.R'
'02_FROM_pdInfoBuilder_utils.R'
'03_FROM_pdInfoBuilder_main_parcing_functions.R'
'04_FROM_pdInfoBuilder_V2NgsExpression.R'
'05_FROM_pdBuilder_V2ExonTranscription.R'
'05_modified_pdInfoBuilder_parsing_functions.R'
'0_DataClasses.R'
'10_0_makeOriginalPackageObject.R'
'10_1_annotation2fasta.R'
'10_2_star_command.R'
'1_SAM_read.R'
'2_processAlignments.R'
'3_annotate.T.alignment.R'
'4_filter.T.alignment.R'
'5_annotate.G.alignment.R'
'6_1_cigar2length.R'
'6_2_cigar2_right.R'
'6_3_cigar2_left.R'
'6_4_annotate.non25M.genomic.transcript_wise.R'
'6_getExonExonBorder.R'
'7_alignments2probesetsRaw.R'
'8_1_addControlProbes.R'
'8_cleanProbeSets.R'
'90_make_objects_for_reannotation.R'
'91_new.R'
'92_alignments2parsedData.R'
'93_make_package_from_parsedData.R'
'94_makeNewAnnotationPackage.R'
'utils.R'
Imports:
reshape2,
Rsamtools,
stringr,
seqinr,
dplyr,
BiocGenerics,
IRanges,
GenomicRanges,
oligo
Depends:
pdInfoBuilder,
Biobase,
Suggests:
oligo,
knitr,
rmarkdown,
refGenome,
methods,
testthat
VignetteBuilder: knitr
biocViews: Annotation, Infrastructure
RoxygenNote: 6.0.1
# Generated by roxygen2: do not edit by hand
export(annotation2fasta)
export(cigar2length)
export(makeNewAnnotationPackage)
export(makeOriginalPackageObject)
export(processAlignments)
export(read.sam)
export(star.command.make)
exportClasses(Alignments)
import(BiocGenerics)
import(GenomicRanges)
import(IRanges)
importFrom(Rsamtools,asSam)
importFrom(dplyr,arrange)
importFrom(oligo,cleanPlatformName)
importFrom(reshape2,melt)
importFrom(seqinr,write.fasta)
importFrom(stringr,str_extract_all)
#' Example of an object of class \code{Alignments}
#'
#' Example of an object of class \code{Alignments},
#' with slots \code{samGenome} and \code{samTranscriptome}
#' subsetted for the sake of space
#'
#' @format \code{Alignments} class example object
#' @return object of class \code{Alignments}
"Alignments_class_example"
#' Example of an object used for annotation package building
#'
#' Example of an object used for annotation package building,
#' a lisf with slot
#' \code{object}, that contains information about tha package
#' \code{parsedDataOLD}, that containns annotation, and
#' \code{AnnotationType}, a character string that specifies
#' the formant of the main original annotation file
#'
#' @format inner annotation object
#' @return seed with example metadata information about the package
"seed_example"
#' Example of an annotation \code{data.frame}
#'
#'Example of a reference genome annotation \code{data.frame}
#'
#' @format Annotation
#' @return example Annotation \code{data.frame}
"Annotation_example"
\ No newline at end of file
SENSE <- 0
ANTISENSE <- 1
ALLELE_A <- 0
ALLELE_B <- 1
## setPageSizeSql <- ('
## pragma page_size = 8192;
## ')
## dbsnp_rs_id could be integer if we strip the leading 'rs' Also, chrom could
## be integer and we could have a separate mapping tabel to the cromosome label
## (so that names are more meaningful).
# NB -- VC added cytoband
createSnpFeatureSetSql <- ('
create table featureSet (
fsetid integer primary key,
man_fsetid text,
dbsnp_rs_id text,
chrom text,
physical_pos integer,
strand integer,
cytoband text,
allele_a text,
allele_b text,
gene_assoc text,
dbsnp integer,
cnv text)
')
createSnp6FeatureSetSql <- ('
create table featureSet (
fsetid integer primary key,
man_fsetid text,
affy_snp_id integer,
dbsnp_rs_id text,
chrom text,
physical_pos integer,
strand integer,
cytoband text,
allele_a text,
allele_b text,
gene_assoc text,
dbsnp integer,
cnv text)
')
createSnpFeatureSql <- ('
create table %s (
fid integer primary key,
strand integer,
allele integer,
fsetid integer not null references "featureSet" ("fsetid"),
pos integer,
x integer,
y integer)
')
createSnpPm_MmSql <- ('
create table %s (
pm_fid integer primary key references "pmfeature" ("fid"),
mm_fid integer references "mmfeature" ("fid"))
')
createSnpSequenceSql <- ('
create table sequence (
fid integer primary key,
offset integer,
tstrand text,
tallele text,
seq text)
')
createCnvFeatureSetSql <- ('
create table featureSetCNV (
fsetid integer primary key,
man_fsetid text,
chrom text,
chrom_start integer,
chrom_stop integer,
strand integer,
cytoband text,
gene_assoc text,
xpar integer,
cnv text)
')
createCnv6FeatureSetSql <- ('
create table featureSetCNV (
fsetid integer primary key,
man_fsetid text,
chrom text,
chrom_start integer,
chrom_stop integer,
strand integer,
cytoband text,
gene_assoc text,
xpar integer,
cnv text)
')
createCnvFeatureSql <- ('
create table %s (
fid integer primary key,
strand integer,
fsetid integer not null references "featureSetCNV" ("fsetid"),
x integer,
y integer)
')
createCnvSequenceSql <- ('
create table sequenceCNV (
fid integer primary key,
offset integer,
tstrand text,
seq text)
')
createSnpFragmentLengthSql <- ('
create table fragmentLength (
fsetid integer not null references "featureSet" ("fsetid"),
enzyme text,
length integer,
start integer,
stop integer)
')
createCnvFragmentLengthSql <- ('
create table fragmentLengthCNV (
fsetid integer not null references "featureSetCNV" ("fsetid"),
enzyme text,
length integer,
start integer,
stop integer)
')
## Utility functions for producing PDInfo packages
ST <- system.time
setPageSizeSql <- ('
pragma page_size = 32768;
')
setSyncOff <- ('
pragma synchronous = OFF;
')
increaseDbPerformance <- function(conn){
dbGetQuery(conn, setPageSizeSql)
dbGetQuery(conn, setSyncOff)
}
printTime <- function(msg, t) {
m <- paste(msg, "took %.2f sec\n")
message(sprintf(m, t))
}
validInput <- function(object, required=c(), optional=c()) {
msg <- NULL
ok <- sapply(required,
function(slt) file.exists(slot(object, slt)))
if (!all(ok))
msg <- paste("missing required file(s):",
paste(sapply(names(ok[!ok]), function(slt) slt), "='",
sapply(names(ok[!ok]),
function(slt) slot(object, slt)),
"'",
collapse=", ",
sep=""))
ok <- sapply(optional,
function(slt) ifelse(is.na(slot(object, slt)),
TRUE,
file.exists(slot(object, slt))))
if(!all(ok))
msg <- paste(msg, "\n", "missing optional file(s):",
paste(sapply(names(ok[!ok]), function(slt) slt), "='",
sapply(names(ok[!ok]),
function(slt) slot(object, slt)),
"'",
collapse=", ", sep=""))
if (is.null(msg)) TRUE else msg
} # end validInput
setupPackage <- function(object, pkgName, destDir, dbFileName, unlink, quiet){
geometry <- getGeometry(object)
oligoc_dbi_version =installed.packages()['oligoClasses', 'Version']
syms <- list(
## present in DESCRIPTION
PKGNAME =pkgName,
MANUF =object@manufacturer,
CHIPNAME =object@chipName,
PKGVERSION =object@version,
AUTHOR =object@author,
AUTHOREMAIL =object@email,
MANUFURL =object@url,
LIC =object@license,
ORGANISM =object@organism,
GENOMEBUILD =object@genomebuild,
SPECIES =object@species,
BIOCVIEWS =object@biocViews,
OLIGOCVERSION=oligoc_dbi_version,
## present in namespace
PDINFONAME =pkgName,
## present in all.R
## not sure where these are used
GEOMETRY =paste(geometry$nrows, geometry$ncols, sep=";"),
## remove in future
DBFILE =dbFileName,
PDINFOCLASS = sub("PkgSeed", "", class(object)))
templateDir <- system.file("pd.PKG.template", package="pdInfoBuilder")
createPackage(pkgname=pkgName, destinationDir=destDir,
originDir=templateDir, symbolValues=syms,
unlink = unlink, quiet=quiet)
file.rename(file.path(destDir, pkgName, "man/template.Rd"),
file.path(destDir, pkgName,
"man", paste(pkgName, "Rd", sep=".")))
}
connectDb <- function(dbfile) {
db <- dbConnect(SQLite(), dbname=dbfile, cache_size=6400, synchronous=0)
sql <- ('
pragma page_size = 8192;
')
dbGetQuery(db, sql)
db
}
closeDb <- function(db){
dbDisconnect(db)
}
###################
## create chrom dictionary
## the function takes a vector with all chromosome
## values and returns a data frame with an integer id
## for each value.
## The integer ID is used in the pmfeature-like tables
##################
createChrDict <- function(x){
possible <- as.character(na.omit(unique(x)))
possible <- gsub("chr([1-9])$", "chr0\\1", possible)
possible <- gsub("chr([1-9]_)", "chr0\\1", possible)
dataSplit <- strsplit(possible, "_")
len <- sapply(dataSplit, length)
idx <- which(len == 1)
basic <- unlist(dataSplit[idx])
if (length(idx) < length(len)){
suffixes <- sapply(dataSplit[-idx], function(x) paste(x[-1], collapse="_"))
suffixes <- sort(unique(suffixes))
out <- list()
out[[1]] <- sort(basic)
for (i in 1:length(suffixes))
out[[i+1]] <- paste(basic, suffixes[i], sep="_")
out <- unlist(out)
}else{
out <- sort(basic)
}
out <- gsub("chr0([1-9])", "chr\\1", out)
data.frame(chrom=as.integer(1:length(out)),
chrom_id=out,
stringsAsFactors=FALSE)
}
### helpers
### table creation
dbCreateTableInfo <- function(db, verbose=FALSE) {
tables <- dbListTables(db)
counts <- integer(length(tables))
sql <- "select count(*) from %s"
for (i in seq(along=counts)) {
if (verbose) message("Counting rows in ", tables[i])
counts[i] <- dbGetQuery(db, sprintf(sql, tables[i]))[[1]][1]
}
df <- data.frame(tbl=tables, row_count=counts,
stringsAsFactors=FALSE)
dbWriteTable(db, "table_info", df, row.names=FALSE)
}
dbCreateTable <- function(conn, tablename, col2type, col2key){
col2type[names(col2key)] <- paste(col2type[names(col2key)], col2key,
sep=" ")
sql <- paste(names(col2type), col2type, sep=" ")
sql <- paste(sql, collapse=", ")
sql <- paste("CREATE TABLE ", tablename, " (", sql, ")", sep="")
dbGetQuery(conn, sql)
}
dbInsertDataFrame <- function(conn, tablename, data, col2type, verbose=FALSE){
cols <- names(col2type)
if (!identical(sort(names(data)), sort(cols)))
stop("cols in data frame 'data' don't match cols in table \"",
tablename, "\"")
values_template <- paste(paste(":", cols, sep=""), collapse=", ")
sql_template <- paste("INSERT INTO ", tablename,
" VALUES (", values_template, ")", sep="")
if (verbose)
simpleMessage("Inserting ", nrow(data), " rows into table ",
tablename, "... ")
dbBegin(conn)
on.exit(dbCommit(conn))
dbGetPreparedQuery(conn, sql_template, bind.data=data)
if (verbose) msgOK()
}
dbCreateIndex <- function(conn, idxname, tblname, fieldname, unique=TRUE,
verbose=TRUE){
if (verbose) simpleMessage("Creating index ", idxname, " on ",
tblname, "... ")
sql <- paste("CREATE", ifelse(unique, "UNIQUE", ""),
"INDEX", idxname, "ON", tblname,
paste("(", fieldname, ")", sep=""))
dbGetQuery(conn, sql)
if (verbose) msgOK()
NULL
}
dbCreateIndicesBg <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_bgfsetid", "bgfeature", "fsetid", FALSE,
verbose=verbose)
dbCreateIndex(conn, "idx_bgfid", "bgfeature", "fid", TRUE,
verbose=verbose)
}
dbCreateIndicesBgTiling <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_bgfid", "bgfeature", "fid", TRUE,
verbose=verbose)
}
dbCreateIndicesPm <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_pmfsetid", "pmfeature", "fsetid", FALSE,
verbose=verbose)
dbCreateIndex(conn, "idx_pmfid", "pmfeature", "fid", TRUE, verbose=verbose)
}
dbCreateIndicesMm <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_mmfid", "mmfeature",
"fid", FALSE, verbose=verbose)
dbCreateIndex(conn, "idx_mmpmfid", "mmfeature",
"fidpm", FALSE, verbose=verbose)
}
dbCreateIndicesSnpPm <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_pmfsetid", "pmfeature", "fsetid",
FALSE, verbose=verbose)
dbCreateIndex(conn, "idx_pmfid", "pmfeature", "fid",
TRUE, verbose=verbose)
}
dbCreateIndicesCnvPm <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_pmfsetidcnv", "pmfeatureCNV", "fsetid",
FALSE, verbose=verbose)
dbCreateIndex(conn, "idx_pmfidcnv", "pmfeatureCNV", "fid",
TRUE, verbose=verbose)
}
dbCreateIndicesPmTiling <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_pmfid", "pmfeature", "fid",
TRUE, verbose=verbose)
}
dbCreateIndicesFs <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_fsfsetid", "featureSet", "fsetid",
TRUE, verbose=verbose)
}
dbCreateIndicesFsCnv <- function(conn, verbose=TRUE){
dbCreateIndex(conn, "idx_fscnvfsetid", "featureSetCNV", "fsetid",
TRUE, verbose=verbose)
}
## Messages
msgParsingFile <- function(fname)
simpleMessage("Parsing file: ", basename(fname), "... ")
msgOK <- function() message("OK\n")
msgBar <- function(){
n <- options()[["width"]]
message(paste(c(rep("=", n), "\n"), collapse=""))
}
##
createSeqMat <- function(db) {
allseq <- dbGetQuery(db, "select seq from sequence")[[1]]
m <- .Call("PIB_25mers_to_mat", allseq, PACKAGE="pdInfoBuilder")
remove(allseq)
allfid <- dbGetQuery(db, "select fid from sequence")[[1]]
rownames(m) <- allfid
remove(allfid)
m
}
createSeqMat.cnv <- function(db) {
allseq <- dbGetQuery(db, "select seq from sequenceCNV")[[1]]
m <- .Call("PIB_25mers_to_mat", allseq, PACKAGE="pdInfoBuilder")
remove(allseq)
allfid <- dbGetQuery(db, "select fid from sequence")[[1]]
rownames(m) <- allfid
remove(allfid)
m
}
seqToMat <- function(seq) {
.Call("PIB_25mers_to_mat", seq, PACKAGE="pdInfoBuilder")
}
annot2fdata <- function(csv){
annot <- utils::read.csv(csv, header=TRUE, comment.char="#",
na.strings="---", stringsAsFactors=FALSE)
nms <- tolower(names(annot))
nms <- gsub("_", "", nms)
nms <- gsub("\\.", "", nms)
names(annot) <- nms
stopifnot('probesetid' %in% nms)
rownames(annot) <- annot[['probesetid']]
annot[['probeset_id']] <- NULL
new("AnnotatedDataFrame", data=annot)
}
## insert in fragmentLengthTable
insertInFragmentLengthTable <- function(db, tblTarget, fragCol,
manfsetid){
tblref <- ifelse(length(grep("CNV", tblTarget)) == 0,
'featureSet', 'featureSetCNV')
ref <- dbGetQuery(db, paste('SELECT man_fsetid, fsetid FROM', tblref))
idx <- manfsetid %in% ref[['man_fsetid']]
manfsetid <- manfsetid[idx]
fragCol <- fragCol[idx]
idx <- match(manfsetid, ref[['man_fsetid']])
idx <- ref[idx, 'fsetid']
rm(ref)
flTable <- getFragmentLengthTable(fragCol, idx)
rm(idx)
dbInsertDataFrame(db, tablename=tblTarget, data=flTable,
col2type=c(fsetid='INTEGER', enzyme='TEXT',
length='INTEGER', start='INTEGER',
stop='INTEGER'),
verbose=TRUE)
}
This diff is collapsed.
### original code from pdInfoBUilder package ###
checkFields <- function(needed, found){
if (!all(needed %in% found))
stop("Looking for fields '",
paste(needed, sep="", collapse="', '"), "',
but found '",
paste(found, sep="", collapse="', '"), "'.")
TRUE
}
#######################################################################
## SECTION A - Db Schema
#######################################################################
affyHTExpressionFeatureSetSchema <- list(col2type=c(
fsetid="INTEGER",
strand="INTEGER",
man_fsetid="INTEGER"),
col2key=c(
fsetid="PRIMARY KEY"
))
affyHTExpressionPmFeatureSchema <- list(col2type=c(
fid="INTEGER",
fsetid="INTEGER",
x="INTEGER",
y="INTEGER"
),
col2key=c(
fid="PRIMARY KEY"
))
affyHTExpressionMmFeatureSchema <- list(col2type=c(
fid="INTEGER",
fsetid="INTEGER",
x="INTEGER",