1 R

Extensible statistical programming language

1.1 R Langauge

x <- rnorm(50)
y <- x + rnorm(50)
df <- data.frame(Indep = x, Dep = y)
fit <- lm(Dep ~ Indep, df)
summary(fit)
## 
## Call:
## lm(formula = Dep ~ Indep, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.87002 -0.50512 -0.03011  0.55959  1.36743 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.1898     0.1088   1.745   0.0874 .  
## Indep         1.1713     0.1199   9.771 5.38e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7692 on 48 degrees of freedom
## Multiple R-squared:  0.6654, Adjusted R-squared:  0.6585 
## F-statistic: 95.46 on 1 and 48 DF,  p-value: 5.385e-13

Vectors

  • numeric(), character(), integer(), logical(), list(), …
  • Statistical concepts: NA, factor()

Objects

  • Class: data.frame, lm, matrix, …

Function, generic, method

  • rnorm(), lm(); summary() generic, summary.lm() method.

Programming constructs

  • apply() (array), lapply() vector or list –> list, sapply(); if () {} else {}, for () {} / repeat {}
  • function() {}
  • Garbage collection

1.2 Packages

library(ggplot2)
ggplot(df, aes(x = Dep, y = Indep)) + geom_point() + geom_smooth(method="lm")

  • Base, recommended, contributed
  • CRAN
  • Domain expertise + author ‘opinion’ about statistics, programming, …

1.3 Help!

2 Bioconductor

Statistical analysis and comprehension of high-throughput genomic data

2.1 A First Workflow

suppressPackageStartupMessages({
    library(Biostrings)
})
dna <- DNAStringSet( c("AAACTG", "CCCAACCA") )
dna
##   A DNAStringSet instance of length 2
##     width seq
## [1]     6 AAACTG
## [2]     8 CCCAACCA
reverseComplement(dna)
##   A DNAStringSet instance of length 2
##     width seq
## [1]     6 CAGTTT
## [2]     8 TGGTTGGG
  • Biological context
  • ‘S4’ classes and methods
  • Inter-operable packages – learning to work with Biostrings and DNAStringSet pays off in other packages.

Help!

2.2 GenomicRanges

suppressPackageStartupMessages({
    library(GenomicRanges)
})
gr <- GRanges(
    c("chr1:10-19", "chr1:15-24", "chr1:30-39")
)
gr
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1  [10, 19]      *
##   [2]     chr1  [15, 24]      *
##   [3]     chr1  [30, 39]      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
  • Closed-interval (start and end coordinates include in range, like Ensembl; UCSC uses 1/2-open)
  • 1-based (like Ensembl; UCSC uses 0-based)

Operations

  • Accessors: seqnames(), start(), end(), width(), strand()

    width(gr)
    ## [1] 10 10 10
  • Intra-range: shift(), narrow(), resize(), flank(), restrict(), trim()… See ?"intra-range-methods".

    shift(gr, 1)
    ## GRanges object with 3 ranges and 0 metadata columns:
    ##       seqnames    ranges strand
    ##          <Rle> <IRanges>  <Rle>
    ##   [1]     chr1  [11, 20]      *
    ##   [2]     chr1  [16, 25]      *
    ##   [3]     chr1  [31, 40]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    shift(gr, c(1, 2, 3))
    ## GRanges object with 3 ranges and 0 metadata columns:
    ##       seqnames    ranges strand
    ##          <Rle> <IRanges>  <Rle>
    ##   [1]     chr1  [11, 20]      *
    ##   [2]     chr1  [17, 26]      *
    ##   [3]     chr1  [33, 42]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
  • Inter-range: range(), gaps(), reduce(), disjoin(), coverage(), … See ?"inter-range-methods".

    gaps(gr)
    ## GRanges object with 2 ranges and 0 metadata columns:
    ##       seqnames    ranges strand
    ##          <Rle> <IRanges>  <Rle>
    ##   [1]     chr1  [ 1,  9]      *
    ##   [2]     chr1  [25, 29]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    reduce(gr)
    ## GRanges object with 2 ranges and 0 metadata columns:
    ##       seqnames    ranges strand
    ##          <Rle> <IRanges>  <Rle>
    ##   [1]     chr1  [10, 24]      *
    ##   [2]     chr1  [30, 39]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    disjoin(gr)
    ## GRanges object with 4 ranges and 0 metadata columns:
    ##       seqnames    ranges strand
    ##          <Rle> <IRanges>  <Rle>
    ##   [1]     chr1  [10, 14]      *
    ##   [2]     chr1  [15, 19]      *
    ##   [3]     chr1  [20, 24]      *
    ##   [4]     chr1  [30, 39]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    as(coverage(gr), "GRanges")
    ## GRanges object with 6 ranges and 1 metadata column:
    ##       seqnames    ranges strand |     score
    ##          <Rle> <IRanges>  <Rle> | <integer>
    ##   [1]     chr1  [ 1,  9]      * |         0
    ##   [2]     chr1  [10, 14]      * |         1
    ##   [3]     chr1  [15, 19]      * |         2
    ##   [4]     chr1  [20, 24]      * |         1
    ##   [5]     chr1  [25, 29]      * |         0
    ##   [6]     chr1  [30, 39]      * |         1
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome
  • Between-ranges: findOverlaps() / countOverlaps(); set operations (e.g., union(), punion()); subsetByOverlaps(); …

    snps <- GRanges("chr1", IRanges(c(7, 12, 17, 22), width = 1))
    snps
    ## GRanges object with 4 ranges and 0 metadata columns:
    ##       seqnames    ranges strand
    ##          <Rle> <IRanges>  <Rle>
    ##   [1]     chr1  [ 7,  7]      *
    ##   [2]     chr1  [12, 12]      *
    ##   [3]     chr1  [17, 17]      *
    ##   [4]     chr1  [22, 22]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    countOverlaps(gr, snps)
    ## [1] 2 2 0
    subsetByOverlaps(snps, gr)
    ## GRanges object with 3 ranges and 0 metadata columns:
    ##       seqnames    ranges strand
    ##          <Rle> <IRanges>  <Rle>
    ##   [1]     chr1  [12, 12]      *
    ##   [2]     chr1  [17, 17]      *
    ##   [3]     chr1  [22, 22]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Data associated with ranges

  • mcols() or $

    gr$p.value <- runif(3)
    gr
    ## GRanges object with 3 ranges and 1 metadata column:
    ##       seqnames    ranges strand |            p.value
    ##          <Rle> <IRanges>  <Rle> |          <numeric>
    ##   [1]     chr1  [10, 19]      * |  0.337758973706514
    ##   [2]     chr1  [15, 24]      * | 0.0958204425405711
    ##   [3]     chr1  [30, 39]      * |  0.827171147800982
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

GRangesList

  • List of GRanges, e.g., exons

    gene <- c("A", "A", "B")
    grl <- splitAsList(gr, gene)
    grl
    ## GRangesList object of length 2:
    ## $A 
    ## GRanges object with 2 ranges and 1 metadata column:
    ##       seqnames    ranges strand |            p.value
    ##          <Rle> <IRanges>  <Rle> |          <numeric>
    ##   [1]     chr1  [10, 19]      * |  0.337758973706514
    ##   [2]     chr1  [15, 24]      * | 0.0958204425405711
    ## 
    ## $B 
    ## GRanges object with 1 range and 1 metadata column:
    ##       seqnames   ranges strand |           p.value
    ##   [1]     chr1 [30, 39]      * | 0.827171147800982
    ## 
    ## -------
    ## seqinfo: 1 sequence from an unspecified genome; no seqlengths
  • A common paradigm – unlist(), transform, relist()

    gr1 <- unlist(grl, use.names=FALSE)
    gr1$neg.log10.pvalue <- -log10(gr1$p.value)
    relist(gr1, grl)
    ## GRangesList object of length 2:
    ## $A 
    ## GRanges object with 2 ranges and 2 metadata columns:
    ##       seqnames    ranges strand |            p.value  neg.log10.pvalue
    ##          <Rle> <IRanges>  <Rle> |          <numeric>         <numeric>
    ##   [1]     chr1  [10, 19]      * |  0.337758973706514 0.471393103654713
    ##   [2]     chr1  [15, 24]      * | 0.0958204425405711  1.01854182771202
    ## 
    ## $B 
    ## GRanges object with 1 range and 2 metadata columns:
    ##       seqnames   ranges strand |           p.value   neg.log10.pvalue
    ##   [1]     chr1 [30, 39]      * | 0.827171147800982 0.0824046224239173
    ## 
    ## -------
    ## seqinfo: 1 sequence from an unspecified genome; no seqlengths
  • Other *List, e.g., NumericList()

seqinfo()

  • Sequences associated with GRanges or GRangesList, analogous to factor levels.

2.3 SummarizedExperiment

  • Coordinate assay() data (e.g., RNAseq expression counts) with information about rows (rowData(), e.g., genes) and columns (colData(), e.g., samples).

Example: ‘airway’ RNAseq data

suppressPackageStartupMessages({
   library(airway)
})
data(airway)
airway
## class: RangedSummarizedExperiment 
## dim: 64102 8 
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
  • Information about samples, and coordinated manipulation

    colData(airway)
    ## DataFrame with 8 rows and 9 columns
    ##            SampleName     cell      dex    albut        Run avgLength
    ##              <factor> <factor> <factor> <factor>   <factor> <integer>
    ## SRR1039508 GSM1275862   N61311    untrt    untrt SRR1039508       126
    ## SRR1039509 GSM1275863   N61311      trt    untrt SRR1039509       126
    ## SRR1039512 GSM1275866  N052611    untrt    untrt SRR1039512       126
    ## SRR1039513 GSM1275867  N052611      trt    untrt SRR1039513        87
    ## SRR1039516 GSM1275870  N080611    untrt    untrt SRR1039516       120
    ## SRR1039517 GSM1275871  N080611      trt    untrt SRR1039517       126
    ## SRR1039520 GSM1275874  N061011    untrt    untrt SRR1039520       101
    ## SRR1039521 GSM1275875  N061011      trt    untrt SRR1039521        98
    ##            Experiment    Sample    BioSample
    ##              <factor>  <factor>     <factor>
    ## SRR1039508  SRX384345 SRS508568 SAMN02422669
    ## SRR1039509  SRX384346 SRS508567 SAMN02422675
    ## SRR1039512  SRX384349 SRS508571 SAMN02422678
    ## SRR1039513  SRX384350 SRS508572 SAMN02422670
    ## SRR1039516  SRX384353 SRS508575 SAMN02422682
    ## SRR1039517  SRX384354 SRS508576 SAMN02422673
    ## SRR1039520  SRX384357 SRS508579 SAMN02422683
    ## SRR1039521  SRX384358 SRS508580 SAMN02422677
    airway[, airway$dex == "trt"]
    ## class: RangedSummarizedExperiment 
    ## dim: 64102 4 
    ## metadata(1): ''
    ## assays(1): counts
    ## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
    ## rowData names(0):
    ## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
    ## colData names(9): SampleName cell ... Sample BioSample
  • Information about regions of interest (genes, including exon coordinates)

    rowRanges(airway)
    ## GRangesList object of length 64102:
    ## $ENSG00000000003 
    ## GRanges object with 17 ranges and 2 metadata columns:
    ##        seqnames               ranges strand |   exon_id       exon_name
    ##           <Rle>            <IRanges>  <Rle> | <integer>     <character>
    ##    [1]        X [99883667, 99884983]      - |    667145 ENSE00001459322
    ##    [2]        X [99885756, 99885863]      - |    667146 ENSE00000868868
    ##    [3]        X [99887482, 99887565]      - |    667147 ENSE00000401072
    ##    [4]        X [99887538, 99887565]      - |    667148 ENSE00001849132
    ##    [5]        X [99888402, 99888536]      - |    667149 ENSE00003554016
    ##    ...      ...                  ...    ... .       ...             ...
    ##   [13]        X [99890555, 99890743]      - |    667156 ENSE00003512331
    ##   [14]        X [99891188, 99891686]      - |    667158 ENSE00001886883
    ##   [15]        X [99891605, 99891803]      - |    667159 ENSE00001855382
    ##   [16]        X [99891790, 99892101]      - |    667160 ENSE00001863395
    ##   [17]        X [99894942, 99894988]      - |    667161 ENSE00001828996
    ## 
    ## ...
    ## <64101 more elements>
    ## -------
    ## seqinfo: 722 sequences (1 circular) from an unspecified genome
    rowData(airway)$p.value <- runif(nrow(airway))
  • Access to assay information (matrix of counts of RNAseq reads in each region of interest and sample).

    libSize <- colSums(assay(airway))
    libSize
    ## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
    ##   20637971   18809481   25348649   15163415   24448408   30818215   19126151 
    ## SRR1039521 
    ##   21164133
    airway$libSize <- libSize
    table(rowSums(assay(airway)) != 0)
    ## 
    ## FALSE  TRUE 
    ## 30633 33469

2.4 Additional ‘Core’ Infrastructure

BED, GFF, GTF, … import

Gene sets

File management

  • BiocFileCache – e.g., retrieve remote files to disk, subsequent references read from disk.
  • GenomicFiles – iteration and other operations on large files or collections of files (e.g., by-chromosome VCF files).

Parallel evaluation

Annotation resources

  • biomaRt, KEGGREST, …:
  • org.* (e.g., org.Hs.eg.db) packages: 6-month snapshots for symbol mapping

    suppressPackageStartupMessages({
        library(org.Hs.eg.db)
    })
    rowData(airway)$Symbol <- mapIds(
        org.Hs.eg.db, rownames(airway), "SYMBOL", "ENSEMBL"
    )
    ## 'select()' returned 1:many mapping between keys and columns
  • TxDb.* (e.g., TxDb.Hsapiens.UCSC.hg38.knownGene) packages: gene models; GenomicFeatures: makeTxDbFrom...
  • BSgenome.* (e.g., BSgenome.Hsapiens.UCSC.hg38
  • AnnotationHub, ExperimentHub

    • Ready access to lightly or heavily curated resources
    suppressPackageStartupMessages({
        library(AnnotationHub)
        library(ExperimentHub)
    })
    query(AnnotationHub(), c("Homo sapiens", "gtf", "release-90"))
    ## AnnotationHub with 1 record
    ## # snapshotDate(): 2017-10-27 
    ## # names(): AH57150
    ## # $dataprovider: Ensembl
    ## # $species: Homo sapiens
    ## # $rdataclass: GRanges
    ## # $rdatadateadded: 2017-08-28
    ## # $title: Homo_sapiens.GRCh38.90.gtf
    ## # $description: Gene Annotation for Homo sapiens
    ## # $taxonomyid: 9606
    ## # $genome: GRCh38
    ## # $sourcetype: GTF
    ## # $sourceurl: ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_...
    ## # $sourcesize: 41837030
    ## # $tags: c("GTF", "ensembl", "Gene", "Transcript", "Annotation") 
    ## # retrieve record with 'object[["AH57150"]]'
    query(AnnotationHub(), "EnsDb")
    ## AnnotationHub with 2 records
    ## # snapshotDate(): 2017-10-27 
    ## # $dataprovider: Ensembl
    ## # $species: Homo Sapiens
    ## # $rdataclass: EnsDb
    ## # additional mcols(): taxonomyid, genome, description,
    ## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
    ## #   tags, rdatapath, sourceurl, sourcetype 
    ## # retrieve records with, e.g., 'object[["AH53715"]]' 
    ## 
    ##             title                            
    ##   AH53715 | Ensembl 88 EnsDb for Homo Sapiens
    ##   AH57757 | Ensembl 90 EnsDb for Homo Sapiens
    query(ExperimentHub(), "curatedMetagenomic")
    ## ExperimentHub with 1 record
    ## # snapshotDate(): 2017-10-30 
    ## # names(): EH549
    ## # package(): curatedMetagenomicData
    ## # $dataprovider: INRA, Institut National de la Recherche Agronomique, US1...
    ## # $species: Homo Sapiens
    ## # $rdataclass: ExpressionSet
    ## # $rdatadateadded: 2017-06-09
    ## # $title: 20170526.LeChatelierE_2013.pathcoverage.stool
    ## # $description: Pathcoverage data from the LeChatelierE_2013 dataset spec...
    ## # $taxonomyid: 9606
    ## # $genome: NA
    ## # $sourcetype: FASTQ
    ## # $sourceurl: NA
    ## # $sourcesize: NA
    ## # $tags: c("Homo_sapiens_Data", "ReproducibleResearch",
    ## #   "MicrobiomeData") 
    ## # retrieve record with 'object[["EH549"]]'

2.5 Emerging Area: Very Large Data

NB: requires R-devel / Bioc-devel

  • Library accessing 10x genomics ‘million neuron’ data set
if (!"TENxBrainData" %in% rownames(installed.packages()))
    biocLite("LTLA/TENxBrainData")    # install from github
library(TENxBrainData)
  • Retrieve data from ExperimentHub
  • Represent ‘big’ data as DelayedArray (HDF5Array)
  • Incorporate into SingleCellExperiment
se <- TENxBrainData()
se
## class: SingleCellExperiment 
## dim: 27998 1306127 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): Ensembl Symbol
## colnames(1306127): AAACCTGAGATAGGAG-1 AAACCTGAGCGGCTTC-1 ...
##   TTTGTCAGTTAAAGTG-133 TTTGTCATCTGAAAGA-133
## colData names(4): Barcode Sequence Library Mouse
## reducedDimNames(0):
## spikeNames(0):
  • Familiar operation on very large data, e.g., subset & calculate
libSize <- colSums(assay(se)[, 1:1000])
range(libSize)
## [1]  1453 34233