The purpose of this quick start is to introduce the four newly implemented functions, toRanges
, annoGO
, annotatePeakInBatch
, and addGeneIDs
in the new version of the ChIPpeakAnno. With those wrapper functions, the annotation of ChIP-Seq peaks becomes streamlined into four major steps:
1 Read peak data with toGRanges
2 Generate annotation data with toGRanges
3 Annotate peaks with annotatePeakInBatch
4 Add additional informations with addGeneIDs
Most of time user can use the default settings of the arguments of those functions. This makes the annotation pipeline straightforward and easy to use.
GRanges
with toGRanges
## First, load the ChIPpeakAnno package
library(ChIPpeakAnno)
path <- system.file("extdata", "Tead4.broadPeak", package="ChIPpeakAnno")
peaks <- toGRanges(path, format="broadPeak")
peaks[1:2]
## GRanges object with 2 ranges and 4 metadata columns:
## seqnames ranges strand | score signalValue
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## peak12338 chr2 [175473, 176697] * | 206 668.42
## peak12339 chr2 [246412, 246950] * | 31 100.23
## pValue qValue
## <numeric> <numeric>
## peak12338 -1 -1
## peak12339 -1 -1
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
toGRanges
library(EnsDb.Hsapiens.v75)
annoData <- toGRanges(EnsDb.Hsapiens.v75)
annoData[1:2]
## GRanges object with 2 ranges and 1 metadata column:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000210049 chrM [577, 647] + | MT-TF
## ENSG00000211459 chrM [648, 1601] + | MT-RNR1
## -------
## seqinfo: 273 sequences from GRCh37 genome
annotatePeakInBatch
## keep the seqnames in the same style
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)
## do annotation by nearest TSS
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData)
anno[1:2]
## GRanges object with 2 ranges and 13 metadata columns:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## peak12338.ENSG00000227061 chr2 [175473, 176697] * | 206
## peak12339.ENSG00000143727 chr2 [246412, 246950] * | 31
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <character>
## peak12338.ENSG00000227061 668.42 -1 -1 peak12338
## peak12339.ENSG00000143727 100.23 -1 -1 peak12339
## feature start_position end_position
## <character> <integer> <integer>
## peak12338.ENSG00000227061 ENSG00000227061 197569 202605
## peak12339.ENSG00000143727 ENSG00000143727 264140 278283
## feature_strand insideFeature distancetoFeature
## <character> <factor> <numeric>
## peak12338.ENSG00000227061 + upstream -22096
## peak12339.ENSG00000143727 + upstream -17728
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## peak12338.ENSG00000227061 20872 NearestLocation
## peak12339.ENSG00000143727 17190 NearestLocation
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(anno$insideFeature))
## Step 4: Add additional annotation with addGeneIDs
library(org.Hs.eg.db)
anno <- addGeneIDs(anno, orgAnn="org.Hs.eg.db",
feature_id_type="ensembl_gene_id",
IDs2Add=c("symbol"))
head(anno)
## GRanges object with 6 ranges and 14 metadata columns:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## peak12338.ENSG00000227061 chr2 [175473, 176697] * | 206
## peak12339.ENSG00000143727 chr2 [246412, 246950] * | 31
## peak12340.ENSG00000143727 chr2 [249352, 250233] * | 195
## peak12341.ENSG00000143727 chr2 [259896, 261404] * | 510
## peak12342.ENSG00000143727 chr2 [261931, 263148] * | 48
## peak12343.ENSG00000236856 chr2 [378232, 378871] * | 132
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <character>
## peak12338.ENSG00000227061 668.42 -1 -1 peak12338
## peak12339.ENSG00000143727 100.23 -1 -1 peak12339
## peak12340.ENSG00000143727 630.65 -1 -1 peak12340
## peak12341.ENSG00000143727 1649.19 -1 -1 peak12341
## peak12342.ENSG00000143727 155.56 -1 -1 peak12342
## peak12343.ENSG00000236856 426.52 -1 -1 peak12343
## feature start_position end_position
## <character> <integer> <integer>
## peak12338.ENSG00000227061 ENSG00000227061 197569 202605
## peak12339.ENSG00000143727 ENSG00000143727 264140 278283
## peak12340.ENSG00000143727 ENSG00000143727 264140 278283
## peak12341.ENSG00000143727 ENSG00000143727 264140 278283
## peak12342.ENSG00000143727 ENSG00000143727 264140 278283
## peak12343.ENSG00000236856 ENSG00000236856 388412 416885
## feature_strand insideFeature distancetoFeature
## <character> <factor> <numeric>
## peak12338.ENSG00000227061 + upstream -22096
## peak12339.ENSG00000143727 + upstream -17728
## peak12340.ENSG00000143727 + upstream -14788
## peak12341.ENSG00000143727 + upstream -4244
## peak12342.ENSG00000143727 + upstream -2209
## peak12343.ENSG00000236856 + upstream -10180
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## peak12338.ENSG00000227061 20872 NearestLocation
## peak12339.ENSG00000143727 17190 NearestLocation
## peak12340.ENSG00000143727 13907 NearestLocation
## peak12341.ENSG00000143727 2736 NearestLocation
## peak12342.ENSG00000143727 992 NearestLocation
## peak12343.ENSG00000236856 9541 NearestLocation
## symbol
## <character>
## peak12338.ENSG00000227061 <NA>
## peak12339.ENSG00000143727 ACP1
## peak12340.ENSG00000143727 ACP1
## peak12341.ENSG00000143727 ACP1
## peak12342.ENSG00000143727 ACP1
## peak12343.ENSG00000236856 <NA>
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
This section demonstrates how to annotate the same peak data as in quick start 1 using a new annotation based on TxDb with toGRanges
.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData[1:2]
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## 1 chr19 [58858172, 58874214] -
## 10 chr8 [18248755, 18258723] +
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)
The same annotatePeakInBatch
function is used to annotate the peaks using annotation data just created. This time we want the peaks within 2kb upstream and up to 300bp downstream of TSS within the gene body.
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData,
output="overlapping",
FeatureLocForDistance="TSS",
bindingRegion=c(-2000, 300))
anno$symbol <- xget(anno$feature, org.Hs.egSYMBOL)
head(anno)
## GRanges object with 6 ranges and 12 metadata columns:
## seqnames ranges strand | score signalValue
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## peak12342 chr2 [ 261931, 263148] * | 48 155.56
## peak12345 chr2 [ 677052, 677862] * | 103 334.74
## peak12348 chr2 [3380709, 3382315] * | 110 357.22
## peak12348 chr2 [3380709, 3382315] * | 110 357.22
## peak12349 chr2 [3383131, 3384541] * | 199 645.56
## peak12349 chr2 [3383131, 3384541] * | 199 645.56
## pValue qValue feature peak feature.ranges
## <numeric> <numeric> <character> <character> <IRanges>
## peak12342 -1 -1 52 peak12342 [ 264869, 278282]
## peak12345 -1 -1 129787 peak12345 [ 667973, 677439]
## peak12348 -1 -1 7260 peak12348 [3192741, 3381653]
## peak12348 -1 -1 51112 peak12348 [3383446, 3488857]
## peak12349 -1 -1 7260 peak12349 [3192741, 3381653]
## peak12349 -1 -1 51112 peak12349 [3383446, 3488857]
## feature.strand distance insideFeature distanceToStart
## <Rle> <integer> <factor> <numeric>
## peak12342 + 1720 upstream 1721
## peak12345 - 0 overlapStart 387
## peak12348 - 0 overlapStart 662
## peak12348 + 1130 upstream 1131
## peak12349 - 1477 upstream 1478
## peak12349 + 0 overlapStart 315
## symbol
## <character>
## peak12342 ACP1
## peak12345 TMEM18
## peak12348 TSSC1
## peak12348 TRAPPC12
## peak12349 TSSC1
## peak12349 TRAPPC12
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
This section demonstrates the flexibility of the annotaition functions in the ChIPpeakAnno. Instead of building a new annotation data, the argument bindingTypes and bindingRegion in annoPeak
function can be set to find the peaks within 5000 bp upstream and downstream of the TSS, which could be the user defined promoter region.
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData,
output="nearestBiDirectionalPromoters",
bindingRegion=c(-5000, 500))
anno$symbol <- xget(anno$feature, org.Hs.egSYMBOL)
anno[anno$peak=="peak12725"]
## GRanges object with 2 ranges and 12 metadata columns:
## seqnames ranges strand | score signalValue
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## peak12725 chr2 [28112981, 28113476] * | 34 110.72
## peak12725 chr2 [28112981, 28113476] * | 34 110.72
## pValue qValue feature peak
## <numeric> <numeric> <character> <character>
## peak12725 -1 -1 9577 peak12725
## peak12725 -1 -1 64080 peak12725
## feature.ranges feature.strand distance insideFeature
## <IRanges> <Rle> <integer> <factor>
## peak12725 [28113482, 28561767] + 5 upstream
## peak12725 [28004266, 28113223] - 0 overlapStart
## distanceToStart symbol
## <numeric> <character>
## peak12725 6 BRE
## peak12725 242 RBKS
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
The annotated peaks can be visualized with R/Bioconductor package trackViewer developed by our group. Following is the sample code to plot the tracks.
if(require(trackViewer, quietly=TRUE)){
gr <- peak <- peaks["peak12725"]
start(gr) <- start(gr) - 5000
end(gr) <- end(gr) + 5000
if(.Platform$OS.type != "windows"){
peak12725 <- importScore(file=system.file("extdata", "Tead4.bigWig",
package="ChIPpeakAnno"),
ranges=peak, format = "BigWig")
}else{## rtracklayer can not import bigWig files on Windows
load(file.path(dirname(path), "cvglist.rds"))
peak12725 <- Views(cvglists[["Tead4"]][[as.character(seqnames(peak))]],
start(peak),
end(peak))
peak12725 <- viewApply(peak12725, as.numeric)
tmp <- rep(peak, width(peak))
width(tmp) <- 1
tmp <- shift(tmp, shift=0:(width(peak)-1))
mcols(tmp) <- peak12725
colnames(mcols(tmp)) <- "score"
peak12725 <- new("track", dat=tmp,
name="peak12725",
type="data",
format="BED")
}
trs <- geneModelFromTxdb(TxDb.Hsapiens.UCSC.hg19.knownGene,
org.Hs.eg.db, gr)
names(trs) <- paste(sapply(trs, function(.ele) .ele@name), names(trs), sep=":")
optSty <- optimizeStyle(trackList(peak12725, trs, heightDist = c(.3, .7)),
theme="bw")
viewTracks(optSty$tracks, gr=gr, viewerStyle=optSty$style)
}