引入四个新实现的函数toRanges,annoGO,annotatePeakInBatch和addGeneID。ChIP-Seq peak的注释变成了四个主要步骤:
library(ChIPpeakAnno)
library(EnsDb.Hsapiens.v75)
library(org.Hs.eg.db)
path <- system.file("extdata", "Tead4.broadPeak", package="ChIPpeakAnno")
peaks <- toGRanges(path, format="broadPeak")
peaks[1:2]
## GRanges object with 2 ranges and 4 metadata columns:
## seqnames ranges strand | score signalValue
## <Rle> <IRanges> <Rle> | <integer> <numeric>
## peak12338 chr2 [175473, 176697] * | 206 668.42
## peak12339 chr2 [246412, 246950] * | 31 100.23
## pValue qValue
## <numeric> <numeric>
## peak12338 -1 -1
## peak12339 -1 -1
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
annoData <- toGRanges(EnsDb.Hsapiens.v75)
annoData[1:2]
## GRanges object with 2 ranges and 1 metadata column:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000223972 chr1 [11869, 14412] + | DDX11L1
## ENSG00000227232 chr1 [14363, 29806] - | WASH7P
## -------
## seqinfo: 273 sequences from GRCh37 genome
## keep the seqnames in the same style
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)
## do annotation by nearest TSS
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData)
anno[1:2]
## GRanges object with 2 ranges and 13 metadata columns:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## peak12338.ENSG00000227061 chr2 [175473, 176697] * | 206
## peak12339.ENSG00000143727 chr2 [246412, 246950] * | 31
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <character>
## peak12338.ENSG00000227061 668.42 -1 -1 peak12338
## peak12339.ENSG00000143727 100.23 -1 -1 peak12339
## feature start_position end_position
## <character> <integer> <integer>
## peak12338.ENSG00000227061 ENSG00000227061 197569 202605
## peak12339.ENSG00000143727 ENSG00000143727 264140 278283
## feature_strand insideFeature distancetoFeature
## <character> <factor> <numeric>
## peak12338.ENSG00000227061 + upstream -22096
## peak12339.ENSG00000143727 + upstream -17728
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## peak12338.ENSG00000227061 20872 NearestLocation
## peak12339.ENSG00000143727 17190 NearestLocation
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(anno$insideFeature))
anno <- addGeneIDs(anno, orgAnn="org.Hs.eg.db",
feature_id_type="ensembl_gene_id",
IDs2Add=c("symbol"))
head(anno)
## GRanges object with 6 ranges and 14 metadata columns:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## peak12338.ENSG00000227061 chr2 [175473, 176697] * | 206
## peak12339.ENSG00000143727 chr2 [246412, 246950] * | 31
## peak12340.ENSG00000143727 chr2 [249352, 250233] * | 195
## peak12341.ENSG00000143727 chr2 [259896, 261404] * | 510
## peak12342.ENSG00000143727 chr2 [261931, 263148] * | 48
## peak12343.ENSG00000236856 chr2 [378232, 378871] * | 132
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <character>
## peak12338.ENSG00000227061 668.42 -1 -1 peak12338
## peak12339.ENSG00000143727 100.23 -1 -1 peak12339
## peak12340.ENSG00000143727 630.65 -1 -1 peak12340
## peak12341.ENSG00000143727 1649.19 -1 -1 peak12341
## peak12342.ENSG00000143727 155.56 -1 -1 peak12342
## peak12343.ENSG00000236856 426.52 -1 -1 peak12343
## feature start_position end_position
## <character> <integer> <integer>
## peak12338.ENSG00000227061 ENSG00000227061 197569 202605
## peak12339.ENSG00000143727 ENSG00000143727 264140 278283
## peak12340.ENSG00000143727 ENSG00000143727 264140 278283
## peak12341.ENSG00000143727 ENSG00000143727 264140 278283
## peak12342.ENSG00000143727 ENSG00000143727 264140 278283
## peak12343.ENSG00000236856 ENSG00000236856 388412 416885
## feature_strand insideFeature distancetoFeature
## <character> <factor> <numeric>
## peak12338.ENSG00000227061 + upstream -22096
## peak12339.ENSG00000143727 + upstream -17728
## peak12340.ENSG00000143727 + upstream -14788
## peak12341.ENSG00000143727 + upstream -4244
## peak12342.ENSG00000143727 + upstream -2209
## peak12343.ENSG00000236856 + upstream -10180
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## peak12338.ENSG00000227061 20872 NearestLocation
## peak12339.ENSG00000143727 17190 NearestLocation
## peak12340.ENSG00000143727 13907 NearestLocation
## peak12341.ENSG00000143727 2736 NearestLocation
## peak12342.ENSG00000143727 992 NearestLocation
## peak12343.ENSG00000236856 9541 NearestLocation
## symbol
## <character>
## peak12338.ENSG00000227061 <NA>
## peak12339.ENSG00000143727 ACP1
## peak12340.ENSG00000143727 ACP1
## peak12341.ENSG00000143727 ACP1
## peak12342.ENSG00000143727 ACP1
## peak12343.ENSG00000236856 <NA>
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths