Four steps for peak annotation for quick start

引入四个新实现的函数toRanges,annoGO,annotatePeakInBatch和addGeneID。ChIP-Seq peak的注释变成了四个主要步骤:

annotate peak data with EnsDb

Step 1: 用toGRanges将 peak data 转换为 GRanges

library(ChIPpeakAnno)
library(EnsDb.Hsapiens.v75)
library(org.Hs.eg.db)
path <- system.file("extdata", "Tead4.broadPeak", package="ChIPpeakAnno")
peaks <- toGRanges(path, format="broadPeak")
peaks[1:2]
## GRanges object with 2 ranges and 4 metadata columns:
##             seqnames           ranges strand |     score signalValue
##                <Rle>        <IRanges>  <Rle> | <integer>   <numeric>
##   peak12338     chr2 [175473, 176697]      * |       206      668.42
##   peak12339     chr2 [246412, 246950]      * |        31      100.23
##                pValue    qValue
##             <numeric> <numeric>
##   peak12338        -1        -1
##   peak12339        -1        -1
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Step 2: 用toGRanges准备注释信息

annoData <- toGRanges(EnsDb.Hsapiens.v75)
annoData[1:2]
## GRanges object with 2 ranges and 1 metadata column:
##                   seqnames         ranges strand |   gene_name
##                      <Rle>      <IRanges>  <Rle> | <character>
##   ENSG00000223972     chr1 [11869, 14412]      + |     DDX11L1
##   ENSG00000227232     chr1 [14363, 29806]      - |      WASH7P
##   -------
##   seqinfo: 273 sequences from GRCh37 genome

Step 3: 用 annotatePeakInBatch注释peaks

## keep the seqnames in the same style
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)
## do annotation by nearest TSS
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData)
anno[1:2]
## GRanges object with 2 ranges and 13 metadata columns:
##                             seqnames           ranges strand |     score
##                                <Rle>        <IRanges>  <Rle> | <integer>
##   peak12338.ENSG00000227061     chr2 [175473, 176697]      * |       206
##   peak12339.ENSG00000143727     chr2 [246412, 246950]      * |        31
##                             signalValue    pValue    qValue        peak
##                               <numeric> <numeric> <numeric> <character>
##   peak12338.ENSG00000227061      668.42        -1        -1   peak12338
##   peak12339.ENSG00000143727      100.23        -1        -1   peak12339
##                                     feature start_position end_position
##                                 <character>      <integer>    <integer>
##   peak12338.ENSG00000227061 ENSG00000227061         197569       202605
##   peak12339.ENSG00000143727 ENSG00000143727         264140       278283
##                             feature_strand insideFeature distancetoFeature
##                                <character>      <factor>         <numeric>
##   peak12338.ENSG00000227061              +      upstream            -22096
##   peak12339.ENSG00000143727              +      upstream            -17728
##                             shortestDistance fromOverlappingOrNearest
##                                    <integer>              <character>
##   peak12338.ENSG00000227061            20872          NearestLocation
##   peak12339.ENSG00000143727            17190          NearestLocation
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(anno$insideFeature))

Step 4: 用 addGeneIDs添加附加信息

anno <- addGeneIDs(anno, orgAnn="org.Hs.eg.db", 
                   feature_id_type="ensembl_gene_id",
                   IDs2Add=c("symbol"))
head(anno)
## GRanges object with 6 ranges and 14 metadata columns:
##                             seqnames           ranges strand |     score
##                                <Rle>        <IRanges>  <Rle> | <integer>
##   peak12338.ENSG00000227061     chr2 [175473, 176697]      * |       206
##   peak12339.ENSG00000143727     chr2 [246412, 246950]      * |        31
##   peak12340.ENSG00000143727     chr2 [249352, 250233]      * |       195
##   peak12341.ENSG00000143727     chr2 [259896, 261404]      * |       510
##   peak12342.ENSG00000143727     chr2 [261931, 263148]      * |        48
##   peak12343.ENSG00000236856     chr2 [378232, 378871]      * |       132
##                             signalValue    pValue    qValue        peak
##                               <numeric> <numeric> <numeric> <character>
##   peak12338.ENSG00000227061      668.42        -1        -1   peak12338
##   peak12339.ENSG00000143727      100.23        -1        -1   peak12339
##   peak12340.ENSG00000143727      630.65        -1        -1   peak12340
##   peak12341.ENSG00000143727     1649.19        -1        -1   peak12341
##   peak12342.ENSG00000143727      155.56        -1        -1   peak12342
##   peak12343.ENSG00000236856      426.52        -1        -1   peak12343
##                                     feature start_position end_position
##                                 <character>      <integer>    <integer>
##   peak12338.ENSG00000227061 ENSG00000227061         197569       202605
##   peak12339.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12340.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12341.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12342.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12343.ENSG00000236856 ENSG00000236856         388412       416885
##                             feature_strand insideFeature distancetoFeature
##                                <character>      <factor>         <numeric>
##   peak12338.ENSG00000227061              +      upstream            -22096
##   peak12339.ENSG00000143727              +      upstream            -17728
##   peak12340.ENSG00000143727              +      upstream            -14788
##   peak12341.ENSG00000143727              +      upstream             -4244
##   peak12342.ENSG00000143727              +      upstream             -2209
##   peak12343.ENSG00000236856              +      upstream            -10180
##                             shortestDistance fromOverlappingOrNearest
##                                    <integer>              <character>
##   peak12338.ENSG00000227061            20872          NearestLocation
##   peak12339.ENSG00000143727            17190          NearestLocation
##   peak12340.ENSG00000143727            13907          NearestLocation
##   peak12341.ENSG00000143727             2736          NearestLocation
##   peak12342.ENSG00000143727              992          NearestLocation
##   peak12343.ENSG00000236856             9541          NearestLocation
##                                  symbol
##                             <character>
##   peak12338.ENSG00000227061        <NA>
##   peak12339.ENSG00000143727        ACP1
##   peak12340.ENSG00000143727        ACP1
##   peak12341.ENSG00000143727        ACP1
##   peak12342.ENSG00000143727        ACP1
##   peak12343.ENSG00000236856        <NA>
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths