BiocIntro 0.1.0
Extensible statistical programming language
x <- rnorm(50)
y <- x + rnorm(50)
df <- data.frame(Indep = x, Dep = y)
fit <- lm(Dep ~ Indep, df)
summary(fit)
##
## Call:
## lm(formula = Dep ~ Indep, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.87002 -0.50512 -0.03011 0.55959 1.36743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1898 0.1088 1.745 0.0874 .
## Indep 1.1713 0.1199 9.771 5.38e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7692 on 48 degrees of freedom
## Multiple R-squared: 0.6654, Adjusted R-squared: 0.6585
## F-statistic: 95.46 on 1 and 48 DF, p-value: 5.385e-13
Vectors
numeric()
, character()
, integer()
, logical()
, list()
, …NA
, factor()
Objects
data.frame
, lm
, matrix
, …Function, generic, method
rnorm()
, lm()
; summary()
generic, summary.lm()
method.Programming constructs
apply()
(array), lapply()
vector or list –> list, sapply()
; if () {} else {}
, for () {}
/ repeat {}
function() {}
library(ggplot2)
ggplot(df, aes(x = Dep, y = Indep)) + geom_point() + geom_smooth(method="lm")
?"summary"
, ?"summary.lm"
Statistical analysis and comprehension of high-throughput genomic data
suppressPackageStartupMessages({
library(Biostrings)
})
dna <- DNAStringSet( c("AAACTG", "CCCAACCA") )
dna
## A DNAStringSet instance of length 2
## width seq
## [1] 6 AAACTG
## [2] 8 CCCAACCA
reverseComplement(dna)
## A DNAStringSet instance of length 2
## width seq
## [1] 6 CAGTTT
## [2] 8 TGGTTGGG
DNAStringSet
pays off in other packages.Help!
class(dna)
methods(class = "DNAStringSet")
?"DNAStringSet"
, ?"reverseComplement,DNAStringSet-method"
(tab completion!)browseVignettes(package="Biostrings")
suppressPackageStartupMessages({
library(GenomicRanges)
})
gr <- GRanges(
c("chr1:10-19", "chr1:15-24", "chr1:30-39")
)
gr
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [10, 19] *
## [2] chr1 [15, 24] *
## [3] chr1 [30, 39] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Operations
Accessors: seqnames()
, start()
, end()
, width()
, strand()
width(gr)
## [1] 10 10 10
Intra-range: shift()
, narrow()
, resize()
, flank()
, restrict()
, trim()
… See ?"intra-range-methods"
.
shift(gr, 1)
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [11, 20] *
## [2] chr1 [16, 25] *
## [3] chr1 [31, 40] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
shift(gr, c(1, 2, 3))
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [11, 20] *
## [2] chr1 [17, 26] *
## [3] chr1 [33, 42] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Inter-range: range()
, gaps()
, reduce()
, disjoin()
, coverage()
, … See ?"inter-range-methods"
.
gaps(gr)
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [ 1, 9] *
## [2] chr1 [25, 29] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
reduce(gr)
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [10, 24] *
## [2] chr1 [30, 39] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
disjoin(gr)
## GRanges object with 4 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [10, 14] *
## [2] chr1 [15, 19] *
## [3] chr1 [20, 24] *
## [4] chr1 [30, 39] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
as(coverage(gr), "GRanges")
## GRanges object with 6 ranges and 1 metadata column:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr1 [ 1, 9] * | 0
## [2] chr1 [10, 14] * | 1
## [3] chr1 [15, 19] * | 2
## [4] chr1 [20, 24] * | 1
## [5] chr1 [25, 29] * | 0
## [6] chr1 [30, 39] * | 1
## -------
## seqinfo: 1 sequence from an unspecified genome
Between-ranges: findOverlaps()
/ countOverlaps()
; set operations (e.g., union()
, punion()
); subsetByOverlaps()
; …
snps <- GRanges("chr1", IRanges(c(7, 12, 17, 22), width = 1))
snps
## GRanges object with 4 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [ 7, 7] *
## [2] chr1 [12, 12] *
## [3] chr1 [17, 17] *
## [4] chr1 [22, 22] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
countOverlaps(gr, snps)
## [1] 2 2 0
subsetByOverlaps(snps, gr)
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr1 [12, 12] *
## [2] chr1 [17, 17] *
## [3] chr1 [22, 22] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Data associated with ranges
mcols()
or $
gr$p.value <- runif(3)
gr
## GRanges object with 3 ranges and 1 metadata column:
## seqnames ranges strand | p.value
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr1 [10, 19] * | 0.337758973706514
## [2] chr1 [15, 24] * | 0.0958204425405711
## [3] chr1 [30, 39] * | 0.827171147800982
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
GRangesList
List of GRanges
, e.g., exons
gene <- c("A", "A", "B")
grl <- splitAsList(gr, gene)
grl
## GRangesList object of length 2:
## $A
## GRanges object with 2 ranges and 1 metadata column:
## seqnames ranges strand | p.value
## <Rle> <IRanges> <Rle> | <numeric>
## [1] chr1 [10, 19] * | 0.337758973706514
## [2] chr1 [15, 24] * | 0.0958204425405711
##
## $B
## GRanges object with 1 range and 1 metadata column:
## seqnames ranges strand | p.value
## [1] chr1 [30, 39] * | 0.827171147800982
##
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
A common paradigm – unlist()
, transform, relist()
gr1 <- unlist(grl, use.names=FALSE)
gr1$neg.log10.pvalue <- -log10(gr1$p.value)
relist(gr1, grl)
## GRangesList object of length 2:
## $A
## GRanges object with 2 ranges and 2 metadata columns:
## seqnames ranges strand | p.value neg.log10.pvalue
## <Rle> <IRanges> <Rle> | <numeric> <numeric>
## [1] chr1 [10, 19] * | 0.337758973706514 0.471393103654713
## [2] chr1 [15, 24] * | 0.0958204425405711 1.01854182771202
##
## $B
## GRanges object with 1 range and 2 metadata columns:
## seqnames ranges strand | p.value neg.log10.pvalue
## [1] chr1 [30, 39] * | 0.827171147800982 0.0824046224239173
##
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Other *List
, e.g., NumericList()
seqinfo()
GRanges
or GRangesList
, analogous to factor levels.assay()
data (e.g., RNAseq expression counts) with information about rows (rowData()
, e.g., genes) and columns (colData()
, e.g., samples).Example: ‘airway’ RNAseq data
suppressPackageStartupMessages({
library(airway)
})
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
Information about samples, and coordinated manipulation
colData(airway)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
airway[, airway$dex == "trt"]
## class: RangedSummarizedExperiment
## dim: 64102 4
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
Information about regions of interest (genes, including exon coordinates)
rowRanges(airway)
## GRangesList object of length 64102:
## $ENSG00000000003
## GRanges object with 17 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] X [99883667, 99884983] - | 667145 ENSE00001459322
## [2] X [99885756, 99885863] - | 667146 ENSE00000868868
## [3] X [99887482, 99887565] - | 667147 ENSE00000401072
## [4] X [99887538, 99887565] - | 667148 ENSE00001849132
## [5] X [99888402, 99888536] - | 667149 ENSE00003554016
## ... ... ... ... . ... ...
## [13] X [99890555, 99890743] - | 667156 ENSE00003512331
## [14] X [99891188, 99891686] - | 667158 ENSE00001886883
## [15] X [99891605, 99891803] - | 667159 ENSE00001855382
## [16] X [99891790, 99892101] - | 667160 ENSE00001863395
## [17] X [99894942, 99894988] - | 667161 ENSE00001828996
##
## ...
## <64101 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
rowData(airway)$p.value <- runif(nrow(airway))
Access to assay information (matrix of counts of RNAseq reads in each region of interest and sample).
libSize <- colSums(assay(airway))
libSize
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## 20637971 18809481 25348649 15163415 24448408 30818215 19126151
## SRR1039521
## 21164133
airway$libSize <- libSize
table(rowSums(assay(airway)) != 0)
##
## FALSE TRUE
## 30633 33469
BED, GFF, GTF, … import
import()
Gene sets
File management
Parallel evaluation
Annotation resources
org.*
(e.g., org.Hs.eg.db) packages: 6-month snapshots for symbol mapping
suppressPackageStartupMessages({
library(org.Hs.eg.db)
})
rowData(airway)$Symbol <- mapIds(
org.Hs.eg.db, rownames(airway), "SYMBOL", "ENSEMBL"
)
## 'select()' returned 1:many mapping between keys and columns
TxDb.*
(e.g., TxDb.Hsapiens.UCSC.hg38.knownGene) packages: gene models; GenomicFeatures: makeTxDbFrom...
BSgenome.*
(e.g., BSgenome.Hsapiens.UCSC.hg38suppressPackageStartupMessages({
library(AnnotationHub)
library(ExperimentHub)
})
query(AnnotationHub(), c("Homo sapiens", "gtf", "release-90"))
## AnnotationHub with 1 record
## # snapshotDate(): 2017-10-27
## # names(): AH57150
## # $dataprovider: Ensembl
## # $species: Homo sapiens
## # $rdataclass: GRanges
## # $rdatadateadded: 2017-08-28
## # $title: Homo_sapiens.GRCh38.90.gtf
## # $description: Gene Annotation for Homo sapiens
## # $taxonomyid: 9606
## # $genome: GRCh38
## # $sourcetype: GTF
## # $sourceurl: ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_...
## # $sourcesize: 41837030
## # $tags: c("GTF", "ensembl", "Gene", "Transcript", "Annotation")
## # retrieve record with 'object[["AH57150"]]'
query(AnnotationHub(), "EnsDb")
## AnnotationHub with 2 records
## # snapshotDate(): 2017-10-27
## # $dataprovider: Ensembl
## # $species: Homo Sapiens
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH53715"]]'
##
## title
## AH53715 | Ensembl 88 EnsDb for Homo Sapiens
## AH57757 | Ensembl 90 EnsDb for Homo Sapiens
query(ExperimentHub(), "curatedMetagenomic")
## ExperimentHub with 1 record
## # snapshotDate(): 2017-10-30
## # names(): EH549
## # package(): curatedMetagenomicData
## # $dataprovider: INRA, Institut National de la Recherche Agronomique, US1...
## # $species: Homo Sapiens
## # $rdataclass: ExpressionSet
## # $rdatadateadded: 2017-06-09
## # $title: 20170526.LeChatelierE_2013.pathcoverage.stool
## # $description: Pathcoverage data from the LeChatelierE_2013 dataset spec...
## # $taxonomyid: 9606
## # $genome: NA
## # $sourcetype: FASTQ
## # $sourceurl: NA
## # $sourcesize: NA
## # $tags: c("Homo_sapiens_Data", "ReproducibleResearch",
## # "MicrobiomeData")
## # retrieve record with 'object[["EH549"]]'
NB: requires R-devel / Bioc-devel
if (!"TENxBrainData" %in% rownames(installed.packages()))
biocLite("LTLA/TENxBrainData") # install from github
library(TENxBrainData)
se <- TENxBrainData()
se
## class: SingleCellExperiment
## dim: 27998 1306127
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): Ensembl Symbol
## colnames(1306127): AAACCTGAGATAGGAG-1 AAACCTGAGCGGCTTC-1 ...
## TTTGTCAGTTAAAGTG-133 TTTGTCATCTGAAAGA-133
## colData names(4): Barcode Sequence Library Mouse
## reducedDimNames(0):
## spikeNames(0):
libSize <- colSums(assay(se)[, 1:1000])
range(libSize)
## [1] 1453 34233