生信菜鸟团 » bioconductor

用BioNet这个bioconductor包来找 maximal-scoring subgraph

ulwvfje — Fri, 25 Nov 2016 14:54:20 +0000

## 此包是为了解决一个难题： maximal-scoring subgraph (MSS) problem ，在一个巨大的复杂网络里面找到significantly differentially expressed subnetworks，就是说，得到了几百个差异基因，去PPI数据库做网络图的时候，发现还是巨大无比，所以需要用这个包来精简我们的网络图。

heuristically的中文意思：启发性地

## 而这个R包可以整合多种数据结果来给一个网络打分，

包的主页是：https://www.bioconductor.org/packages/release/bioc/html/BioNet.html

paper：BioNet: an R-Package for the Functional Analysis of ... - Bioinformatics

它整合了PPI网络分析和寻找功能模块的需求。

脚本：https://www.bioconductor.org/packages/release/bioc/vignettes/BioNet/inst/doc/Tutorial.R

教程：https://www.bioconductor.org/packages/release/bioc/vignettes/BioNet/inst/doc/Tutorial.pdf

重点就是根据一个"igraph" or "graphNEL"对象和打分来找最大的MSS

# The whole workflow in three calls: restrict the interactome to the genes of
# interest, search for the high-scoring module, then draw it.
# `scores` and `logFC` are built in the full walkthrough further down.
subnet <- subNetwork(nodeList = dataLym$label, network = interactome)
module <- runFastHeinz(network = subnet, scores = scores)
# The plotted module is the slimmed-down version of the network.
plotModule(network = module, diff.expr = logFC, scores = scores)

其实另外一个函数也有类似的功能，dNetFind https://rdrr.io/cran/dnet/man/dNetFind.html

## 里面用到的网络，都是基于igraph的包： A graph object, either in graphNEL or igraph format.

## 首先加载一系列的包和内置数据

# Walkthrough 1: combine two sets of p-values and extract a high-scoring
# module with the fast heuristic search.
library(BioNet)
library(DLBCL)
data(dataLym)
data(interactome)

# Only the label, t.pval, s.pval and diff columns of dataLym are used below.
# interactome is the bundled PPI network object; it can be subset by a list
# of gene labels.

# Collect the two p-value columns, key them by gene label, and aggregate
# them into a single p-value per gene.
pvals <- with(dataLym, cbind(t = t.pval, s = s.pval))
rownames(pvals) <- dataLym$label
pval <- aggrPvals(pvals, order = 2, plot = FALSE)

# Restrict the interactome to the genes in dataLym$label. The labels look
# like "TP53(7157)", apparently a gene symbol fused with an Entrez ID.
# Author's note: removing self-loops is a standard first step for any
# network, and a gene list like this usually yields a few thousand nodes and
# on the order of ten thousand edges.
subnet <- rmSelfLoops(subNetwork(dataLym$label, interactome))
subnet

# Fit a Beta-Uniform-Mixture (BUM) model to the aggregated p-values, score
# the nodes at the chosen FDR, then use the fast heuristic to approximate
# the optimal-scoring subnetwork.
fb <- fitBumModel(pval, plot = FALSE)
scores <- scoreNodes(subnet, fb, fdr = 0.001)
module <- runFastHeinz(subnet, scores)

# Plot the module: diff.expr drives the node colours, scores drive the node
# shapes.
logFC <- setNames(dataLym$diff, dataLym$label)
plotModule(module, scores = scores, diff.expr = logFC)

# Author's notes:
# - plotModule is a customised plot for graphNEL / igraph objects; the igraph
#   package can also be used directly for drawing.
# - The network can be exported in a Cytoscape format and drawn there.
# - Red = up-regulated, green = down-regulated, circle = positive score,
#   diamond = negative score (NOTE(review): confirm shapes against the
#   plotModule documentation).

## 下面是一个实际的例子，如何使用BioNet包来做网络分析

# Walkthrough 2: a worked example that goes from expression data to a module
# read back from a heinz result file.
library(BioNet)
library(DLBCL)
data(exprLym)
data(interactome)

exprLym # bundled object, so its gene labels already match the interactome
interactome

# Subset the interactome to the measured genes, then keep only the largest
# connected component.
network <- subNetwork(featureNames(exprLym), interactome)
network
network <- largestComp(network)
network

library(genefilter)
library(impute)

# The raw expression values contain gaps; impute.knn fills in the missing
# expression data.
expressions <- impute.knn(exprs(exprLym))$data

# Differential expression via genefilter's rowttests.
t.test <- rowttests(expressions, fac = exprLym$Subgroup)
t.test[seq_len(10), ]

# Pair the t-test p-values with the s.pval column of dataLym and aggregate.
data(dataLym)
ttest.pval <- t.test$p.value
surv.pval <- setNames(dataLym$s.pval, dataLym$label)
pvals <- cbind(ttest.pval, surv.pval)
pval <- aggrPvals(pvals, order = 2, plot = FALSE)
fb <- fitBumModel(pval, plot = FALSE)

# Show what fitBumModel produced: histogram and plot of the fit side by side
# on a fresh device, which is closed afterwards.
dev.new(width = 13, height = 7)
par(mfrow = c(1, 2))
hist(fb)
plot(fb)
dev.off()

# Author's note: this plot shows how the two parameters of the
# Beta-Uniform-Mixture (BUM) model come into play.
plotLLSurface(pval, fb)

# Score every node of the network from its p-value at the given FDR.
scores <- scoreNodes(network = network, fb = fb, fdr = 0.001)
network <- rmSelfLoops(network)

# Write the network to text files so it can be imported into Cytoscape
# (and fed to heinz).
writeHeinzEdges(network = network, file = "lymphoma_edges_001", use.score = FALSE)
writeHeinzNodes(network = network, file = "lymphoma_nodes_001", node.scores = scores)

datadir <- file.path(path.package("BioNet"), "extdata")
dir(datadir)

# Different algorithm this time: heinz computes the maximum-scoring
# subnetwork. The result file is produced by the heinz.py script; this
# example uses the copy shipped with the package. Script invocation:
#   heinz.py -e lymphoma_edges_001.txt -n lymphoma_nodes_001.txt -N True -E False
module <- readHeinzGraph(
  node.file = file.path(datadir, "lymphoma_nodes_001.txt.0.hnz"),
  network = network
)

# Colour the module by the "dm" column of the t-test result.
diff <- setNames(t.test[, "dm"], rownames(t.test))
plotModule(module, diff.expr = diff, scores = scores)

# Total module score, then the counts of positive and negative nodes.
module_scores <- scores[nodes(module)]
sum(module_scores)
sum(module_scores > 0)
sum(module_scores < 0)

###################################################

### code chunk number 27: Tutorial.Rnw:375-380

###################################################

# Walkthrough 3: the ALL data set. Its features are probe IDs, so they have
# to be mapped onto interactome nodes before BioNet can use them.
library(BioNet)
library(DLBCL)
library(ALL)
data(ALL)
data(interactome)

# Map the ALL features onto the interactome through its "geneID" attribute.
mapped.eset <- mapByVar(ALL, network = interactome, attr = "geneID")
mapped.eset[1:5, 1:5]
length(intersect(rownames(mapped.eset), nodes(interactome)))

# Subset the interactome, keep the largest component, drop self-loops.
network <- subNetwork(rownames(mapped.eset), interactome)
network
network <- rmSelfLoops(largestComp(network))
network

# Differential expression with limma. The design has one column per first
# letter of ALL$BT, named "B" and "T" below, and the contrast is B - T.
library(limma)
design <- model.matrix(~ -1 + factor(c(substr(unlist(ALL$BT), 0, 1))))
colnames(design) <- c("B", "T")
contrast.matrix <- makeContrasts(B - T, levels = design)
contrast.matrix

fit <- lmFit(mapped.eset, design)
fit2 <- eBayes(contrasts.fit(fit, contrast.matrix))
pval <- fit2$p.value[, 1]

# Fit the BUM model and look at the fit on a fresh device.
fb <- fitBumModel(pval, plot = FALSE)
dev.new(width = 13, height = 7)
par(mfrow = c(1, 2))
hist(fb)
plot(fb)

scores <- scoreNodes(network = network, fb = fb, fdr = 1e-14)

# Again write the network to disk for heinz / Cytoscape.
writeHeinzEdges(network = network, file = "ALL_edges_001", use.score = FALSE)
writeHeinzNodes(network = network, file = "ALL_nodes_001", node.scores = scores)

# The maximum-scoring subnetwork again comes from the heinz algorithm; the
# result file shipped with the package is read back here.
# A newer implementation, Heinz v2.0, is available at
# https://software.cwi.nl/software/heinz
datadir <- file.path(path.package("BioNet"), "extdata")
module <- readHeinzGraph(
  node.file = file.path(datadir, "ALL_nodes_001.txt.0.hnz"),
  network = network
)

# Attach the limma coefficient ("diff") and the node score ("score") to every
# node of the module.
module_nodes <- nodes(module)
nodeDataDefaults(module, attr = "diff") <- ""
nodeData(module, n = module_nodes, attr = "diff") <- fit2$coefficients[module_nodes, 1]
nodeDataDefaults(module, attr = "score") <- ""
nodeData(module, n = module_nodes, attr = "score") <- scores[module_nodes]
nodeData(module)[1]

# Save as an XGMML file for Cytoscape.
saveNetwork(module, file = "ALL_module", type = "XGMML")

# Author's note: red = up-regulated, green = down-regulated, circle =
# positive score, diamond = negative score.

用R的bioconductor里面的stringDB包来做PPI分析

ulwvfje — Wed, 23 Nov 2016 11:37:37 +0000

PPI本质上是根据一系列感兴趣的蛋白质或者基因（可以是几百个甚至上千个）来去PPI数据库里面找到跟这系列蛋白质或者基因的相互作用关系！

本次的主角是stringDB，顾名思义用得是大名鼎鼎的string数据库，

paper见：https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4383874/

主页见：http://string-db.org/cgi/input.pl

本来还以为需要自己上传自己的基因给这个数据库去做分析，没想到他们也开发了R包，主页见： http://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html 而我比较喜欢用编程来解决问题，所以就学了一下这个包，非常好用！

它只需要一个3列的data.frame，分别是logFC,p.value,gene ID,就是标准的差异分析的结果。

然后用string_db$map函数给它加上一列是 string 数据库的蛋白ID，然后用string_db$add_diff_exp_color函数给它加上一列是color。

用string_db$plot_network函数画网络图，只需要 string 数据库的蛋白ID，如果需要给蛋白标记不同的颜色，需要用string_db$post_payload来把color对应到每个蛋白，然后再画网络图。

也可以直接用get_interactions函数得到所有的PPI数据，然后写入到本地，再导入到cytoscape进行画图

还有几个小功能，对我可能没什么用，但是比较适合初学者，仅仅根据string 数据库的蛋白ID就可以做GO/KEGG的enrichment分析啦，还可以查找两个蛋白的interaction呀，还有两个蛋白直接相互作用的paper呀，还有找某个蛋白在其它物种的同源蛋白呀！

软件运行中需要下载以下文件，悲催的是每次都在下载，很坑呀！因为它默认把这些文件存储在电脑的临时文件夹里面！

所有的网络图本质上是基于iGraph的深度定制，包括后面的cluster方法，还有可能要结合cytoscape的MCODE插件来找hub基因

基本上只需要把下面的代码运行一遍，就明白了：http://www.bioconductor.org/packages/release/bioc/vignettes/STRINGdb/inst/doc/STRINGdb.R

library(STRINGdb)

# Author's note: the package does not document its functions roxygen2-style.
# Every function lives on the string_db object and is called with `$`; help
# for a method is available through STRINGdb$help().

# Pick the species and the database version first.
string_db <- STRINGdb$new(
  version = "10",
  species = 9606,
  score_threshold = 0,
  input_directory = ""
)

# ---- help ----
STRINGdb$methods()         # list all the methods available
STRINGdb$help("get_graph") # show the documentation of one method

# ---- load_data ----
# Example data with three columns, i.e. a typical differential-expression
# result:
#   pvalue    logFC    gene
#   0.0001018 3.333461 VSTM2L
#   0.0001392 3.822383 TBC1D2
data(diff_exp_example1)
head(diff_exp_example1)

# ---- map ----
# The table is keyed by gene, so map the "gene" column to STRING protein
# IDs. Per the author's note the result is the input data.frame with one
# extra column; see the help page for details.
example1_mapped <- string_db$map(diff_exp_example1, "gene", removeUnmappedRows = TRUE)
STRINGdb$help("map")

###################################################

### code chunk number 6: STRINGdb.Rnw:118-121

###################################################

# Figure hook that sets the plot margins. An alternative kept for reference:
# par(mar = c(1.1, 0.1, 4.1, 2.1)).
fig_hook <- function() par(mar = c(2.1, 0.1, 4.1, 2.1))
options(SweaveHooks = list(fig = fig_hook))

# ---- get_hits ----
# Take the first 200 mapped proteins for the next steps. This is only a
# demonstration; in a real analysis pick your own differentially expressed
# genes. NOTE(review): the original note calls this a random pick, but the
# sample rows shown for this table have increasing p-values - confirm.
hits <- example1_mapped$STRING_id[seq_len(200)]

# ---- plot_network ----
# Protein IDs are all that is needed to draw the network; the more IDs, the
# longer it takes. Per the author's note the function looks up every
# interaction among the given IDs in STRING and draws them.
# See STRINGdb$help("plot_network").
string_db$plot_network(hits)

###################################################

### code chunk number 9: add_diff_exp_color

###################################################

# ---- add_diff_exp_color ----
# Keep rows with p-value below 0.05 and add a colour column (green for
# down-regulated genes, red for up-regulated genes). The plain network above
# does not show the direction of regulation; the colour shades do. The result
# is still a data.frame, with an extra `color` column.
sig_rows <- subset(example1_mapped, pvalue < 0.05)
example1_mapped_pval05 <- string_db$add_diff_exp_color(sig_rows, logFcColStr = "logFC")
STRINGdb$help("add_diff_exp_color")

# ---- post_payload ----
# Post the ID-to-colour pairing to the STRING server; the returned payload
# id is what the plotting function needs to draw the colours.
payload_id <- string_db$post_payload(
  example1_mapped_pval05$STRING_id,
  colors = example1_mapped_pval05$color
)
STRINGdb$help("post_payload")

# ---- plot_halo_network ----
# Same network plot as before, now with the colour "halo". Author's note:
# with this many genes the picture gets crowded.
string_db$plot_network(hits, payload_id = payload_id)

###################################################

### code chunk number 13: plot_ppi_enrichment

###################################################

# ---- plot_ppi_enrichment ----
# Plot the enrichment for the best 1000 genes. (Author's note: it was not
# clear to the author what this plot conveys; see the help page.)
string_db$plot_ppi_enrichment(example1_mapped$STRING_id[1:1000], quiet = TRUE)
STRINGdb$help("plot_ppi_enrichment")

# ---- enrichment ----
# Enrichment analysis straight from STRING protein IDs. Author's notes: the
# function downloads some data automatically, and the default background is
# the whole human proteome, which usually needs to be changed for the
# p-values to be meaningful.
run_enrichment <- function(category) {
  string_db$get_enrichment(hits, category = category, methodMT = "fdr", iea = TRUE)
}
enrichmentGO <- run_enrichment("Process")
enrichmentKEGG <- run_enrichment("KEGG")
head(enrichmentGO, n = 7)
head(enrichmentKEGG, n = 7)

#################################################

# code chunk number 15: background (eval = FALSE)

#################################################

# ---- background ----
# Change the enrichment background: instead of the whole proteome (a bit over
# twenty thousand human genes per the author's note) use only the first 2000
# mapped genes, purely as an example.
backgroundV <- example1_mapped$STRING_id[1:2000]
string_db$set_background(backgroundV)

# string_db is a shared (global) object, so the background set above sticks
# to it. This is only a demonstration - remember to restore it afterwards.

# ---- new_background_inst ----
# Alternatively create a new instance with the background set up front.
# version and species are given explicitly so that this instance uses the
# same STRING release and organism as the one the IDs were mapped with,
# instead of whatever the package defaults to.
string_db <- STRINGdb$new(
  version = "10", species = 9606,
  score_threshold = 0, backgroundV = backgroundV
)

# ---- enrichmentHeatmap ----
eh <- string_db$enrichment_heatmap(
  list(hits[1:100], hits[101:200]),
  list("list1", "list2"),
  title = "My Lists"
)

# Restore string_db to the original, background-free instance.
string_db <- STRINGdb$new(
  version = "10", species = 9606,
  score_threshold = 0, input_directory = ""
)

###################################################

### code chunk number 18: clustering1

###################################################

# ---- clustering1 ----
# Cluster the first 600 mapped proteins.
clustersList <- string_db$get_clusters(example1_mapped$STRING_id[1:600])

# Figure hook that sets the plot margins.
options(SweaveHooks = list(fig = function() par(mar = c(2.1, 0.1, 4.1, 2.1))))

# ---- clustering2 ----
# Draw the first clusters (at most four) on one 2 x 2 canvas. The loop is
# bounded by the number of clusters actually returned so it cannot index past
# the end of the list, and the previous layout is restored afterwards so that
# later plots are not squeezed into a quarter of the device.
old_par <- par(mfrow = c(2, 2))
for (i in seq_len(min(4L, length(clustersList)))) {
  string_db$plot_network(clustersList[[i]])
}
par(old_par)

###################################################

### code chunk number 21: proteins

###################################################

# ---- proteins ----
string_proteins <- string_db$get_proteins()

# A few smaller helpers follow: interactions between two proteins, papers
# describing their direct interaction, homologs in other species.

# ---- atmtp ----
# Look up the identifiers for "tp53" and "atm".
tp53 <- string_db$mp("tp53")
atm <- string_db$mp("atm")

# ---- neighbors (not evaluated) ----
# string_db$get_neighbors(c(tp53, atm))

# ---- interactions ----
string_db$get_interactions(c(tp53, atm))

# ---- pubmedInteractions (not evaluated) ----
# string_db$get_pubmed_interaction(tp53, atm)

# ---- homologs (not evaluated) ----
# Reciprocal best hits of the protein in all the STRING species:
# string_db$get_homologs_besthits(tp53, symbets = TRUE)

# ---- homologs2 (not evaluated) ----
# Homologs of the two proteins in the mouse (i.e. species_id = 10090):
# string_db$get_homologs(c(tp53, atm), target_species_id = 10090, bitscore_threshold = 60)

###################################################

### code chunk number 28: benchmark1

###################################################

# ---- benchmark1 ----
# Benchmark the example interaction list against KEGG pathways.
data(interactions_example)
interactions_benchmark <- string_db$benchmark_ppi(
  interactions_example,
  pathwayType = "KEGG",
  max_homology_bitscore = 60,
  precision_window = 400,
  exclude_pathways = "blacklist"
)

# Figure hook that sets the plot margins for the precision curve.
options(SweaveHooks = list(fig = function() par(mar = c(4.1, 4.1, 4.1, 2.1))))

# ---- benchmark2 ----
# Precision along the list of interactions.
plot(
  interactions_benchmark$precision,
  type = "l",
  xlim = c(0, 700), ylim = c(0, 1),
  xlab = "interactions", ylab = "precision"
)

# ---- benchmark3 ----
interactions_pathway_view <- string_db$benchmark_ppi_pathway_view(
  interactions_benchmark,
  precision_threshold = 0.2,
  pathwayType = "KEGG"
)
head(interactions_pathway_view)

R一大利器之对象的操作函数查询

ulwvfje — Sat, 15 Oct 2016 13:44:09 +0000

对于生物出身的部分生物信息学工程师来说，很多计算机概念让人很头疼，尤其是计算机语言里面的高级对象。我以前学编程的时候，给我一个变量，一个数据，一个hash，我就心满意足了，可以解决大部分我数据处理问题，可事情远比想象之中复杂。因为很多高手喜欢用封装，代码复用，喜欢用高级对象。在R的bioconductor里面尤其是如此，经常会遇到各种包装好的S3，S4对象，看过说明书，倒是知道一些对象里面有什么，可以去如何处理那些对象，提取我们想要的信息，比如我就写过一系列的帖子：

Bioconductor系列之GenomicAlignments

Bioconductor系列之GenomicFeatures

R的bioconductor包TxDb.Hsapiens.UCSC.hg19.knownGene详解

R的bioconductor包里面的txdb对象及GRange对象详解

那个时候傻傻的去搜集总结每个对象的操作函数，辛苦死了，一直想有没有地方可以查询这些对象，到底应该用什么函数呢？人怎么能记住一堆函数呢《比如seqnames(),strand(),cigar(),qwidth(),start(),end(),width(),njunc() 这些函数对这个GAlignments对象进行处理》

今天我又遇到了一个LumiBatch对象，也是很复杂，我明明知道里面有基因和探针，但就是拿它没办法：

Summary of data information:
Illumina Inc. BeadStudio version 1.4.0.1
Normalization = none
Array Content = 11188230_100CP_MAGE-ML.XML
Error Model = none
DateTime = 2/3/2005 3:21 PM
Local Settings = en-US

Major Operation History:
submitted finished command lumiVersion
1 2007-04-22 00:08:36 2007-04-22 00:10:36 lumiR("../data/Barnes_gene_profile.txt") 1.1.6
2 2007-04-22 00:10:36 2007-04-22 00:10:38 lumiQ(x.lumi = x.lumi) 1.1.6
3 2007-04-22 00:13:06 2007-04-22 00:13:10 addNuId2lumi(x.lumi = x.lumi, lib = "lumiHumanV1") 1.1.6
4 2007-04-22 00:59:20 2007-04-22 00:59:36 Subsetting 8000 features and 4 samples. 1.1.6

Object Information:
LumiBatch (storageMode: lockedEnvironment)
assayData: 8000 features, 4 samples
element names: beadNum, detection, exprs, se.exprs
protocolData: none
phenoData
sampleNames: A01 A02 B01 B02
varLabels: sampleID label
varMetadata: labelDescription
featureData
featureNames: oZsQEQXp9ccVIlwoQo 9qedFRd_5Cul.ueZeQ ... 33KnLHy.RFaieogAF4 (8000 total)
fvarLabels: TargetID
fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation: lumiHumanAll.db
Control Data: Available
QC information: Please run summary(x, 'QC') for details!

看起来极度的复杂，教程里面有提到一些函数可以操作这个对象，用来画图，提取数据，但是不能满足我的需要。搜索了好久，终于找到了解决方法：

https://www.rdocumentation.org/packages/Biobase/versions/2.26.0/topics/AnnotatedDataFrame?

https://www.rdocumentation.org/packages/Biobase/versions/2.26.0/topics/ExpressionSet?

https://www.rdocumentation.org/packages/Biobase/versions/2.26.0/topics/eSet?

https://www.rdocumentation.org/packages/lumi/versions/2.24.0/topics/LumiBatch-class

https://www.rdocumentation.org/packages/GenomicFeatures/versions/1.24.4/topics/TxDb-class

这些函数是有规律的，而且这个网站也提供了查询接口，很容易就可以了解每个对象是如何设置的，有哪些属性，定义好了哪些函数可以去操作它。

我需要自己组合 pData(featureData(x.lumi)) 才能从 x.lumi这个对象里面提取到我想要的 ProbeID TargetID

> head(pData(featureData(x.lumi)))
ProbeID TargetID
6450255 6450255 7A5
2570615 2570615 A1BG
6370619 6370619 A1BG
2600039 2600039 A1CF
2650615 2650615 A1CF
5340672 5340672 A1CF

以前就是把说明书给翻烂也找不到！

而且，你只需要class一下你的对象，就知道它的具体名字，然后用method就可以看到它所有可供操作的函数！

> class(x.lumi)

[1] "LumiBatch"

attr(,"package")

[1] "lumi"

> methods(class='LumiBatch')

[1] $ $<- [ [[ [[<- abstract annotation annotation<-

[9] as.matrix asBigMatrix assayData assayData<- beadNum beadNum<- boxplot classVersion

[17] classVersion<- coerce combine controlData controlData<- density description description<-

[25] detection detection<- dim dimnames dimnames<- dims esApply experimentData

[33] experimentData<- exprs exprs<- fData fData<- featureData featureData<- featureNames

[41] featureNames<- fvarLabels fvarLabels<- fvarMetadata fvarMetadata<- getHistory hist initialize

[49] isCurrent isVersioned makeDataPackage MAplot notes notes<- pairs pData

[57] pData<- phenoData phenoData<- plot preproc preproc<- protocolData protocolData<-

[65] pubMedIds pubMedIds<- rowMedians rowQ sampleNames sampleNames<- se.exprs se.exprs<-

[73] show storageMode storageMode<- summary updateObject updateObjectTo varLabels varLabels<-

[81] varMetadata varMetadata<- write.exprs

see '?methods' for accessing help and source code

用lumi包来处理illumina的bead系列表达芯片

ulwvfje — Sat, 15 Oct 2016 12:01:03 +0000

表达芯片大家最熟悉的当然是affymetrix系列芯片啦，而且分析套路很简单，直接用R的affy包，就可以把cel文件经过RMA或者MAS5方法得到表达矩阵。illumina出厂的芯片略微有点不一样，它的原始数据有3个层级，一般拿到的是Processed data (示例), 但仍然需要一系列的统计学方法才能提取到表达矩阵。我比较喜欢用bioconductor，所以下面讲一讲如何用lumi包来处理这个芯片数据！

这个lumi包的使用代码和说明书都有，按部就班的学一遍就好了。

http://www.bioconductor.org/packages/release/bioc/vignettes/lumi/inst/doc/lumi.R

http://www.bioconductor.org/packages/release/bioc/vignettes/lumi/inst/doc/lumi.pdf

如果仅仅是分析数据，那么并不难，但是每个分析步骤后面都隐含着一系列的统计学方法，想彻底搞清楚它们，就很难了。

# Load the example data set used for this snippet.
data(example.lumi)

# lumiExpresso runs the preprocessing steps on the LumiBatch object in one call.
lumi.N.Q <- lumiExpresso(example.lumi)

# Pull the expression matrix out of the processed object.
dataMatrix <- exprs(lumi.N.Q)

重点就是得到表达矩阵，它封装好了一个函数，lumiExpresso可以直接处理LumiBatch对象，这个函数结合了N,T,B,Q(normalization, transformation, background correction, quality control)四个步骤，其中Q这个步骤又包括8种统计学图片。在该包的文章有详细说明：http://bioinformatics.oxfordjournals.org/content/24/13/1547.full

而 LumiBatch 对象是通过 lumiR.batch 读取的芯片文件被Illumina Bead Studio toolkit 处理的结果，也就是通常我们从公司或者GEO下载的数据( level 3 的 process data)，如下所示：

这个包用的测试文件Barnes_gene_profile.txt可以在http://www.chibi.ubc.ca/wp-content/uploads/2013/02/ 下载。

如果是在GEO下载公共数据，每个study都会给芯片描述文件，基本没有用，只需要下载non-normalized.txt.gz类似的文件就好了

GPL10558_HumanHT-12_V4_0_R1_15002873_B.txt.gz 13.1 Mb

GPL10558_HumanHT-12_V4_0_R2_15002873_B.txt.gz 13.1 Mb

比如我下载了：ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE30nnn/GSE30669/suppl/GSE30669_HEK_Sample_Probe_Profile.txt.gz 这个文件，就可以直接用lumi包的lumiR.batch 函数读取文件成为LumiBatch对象，然后被lumiExpresso函数直接处理，然后被exprs函数提取表达矩阵。

# Build the expression matrix from an Illumina result file with lumi.
# The script no longer wipes the global environment or changes the working
# directory; the input file is addressed by its full path instead, so running
# it has no side effects on the rest of the session.
library(lumi)

# Alternative input (not run): the package's own test file
#   file.path("G:/array/illumina-beadseed-v4/lumi_example", "Barnes_gene_profile.txt")
data_dir <- "G:/array/illumina-beadseed-v4/GSE30669"
fileName <- file.path(data_dir, "GSE30669_HEK_Sample_Probe_Profile.txt")

# Read the file into a LumiBatch object. A sample sheet can optionally be
# supplied via sampleInfoFile = "sampleInfo.txt".
x.lumi <- lumiR.batch(fileName)
pData(phenoData(x.lumi))

# Do all the default preprocessing in one step.
lumi.N.Q <- lumiExpresso(x.lumi)

# Retrieve the normalized data.
dataMatrix <- exprs(lumi.N.Q)

## 下面是从GEO里面下载表达矩阵

# Download the processed expression matrix from GEO instead.
# Changes from the original: the global environment is no longer cleared,
# FALSE is spelled out (F can be reassigned), and the expression matrix is
# extracted once rather than twice.
library(GEOquery)
library(limma)

# getGEO returns a list of ExpressionSet-like objects; the series matrix is
# cached in the current directory and the platform annotation is skipped.
GSE30669 <- getGEO("GSE30669", destdir = ".", getGPL = FALSE)
GSE30669[[1]]

# Sample annotation and expression matrix of the first (only used) element.
pdata <- pData(GSE30669[[1]])
exprSet <- exprs(GSE30669[[1]])

很明显可以看到前面得到的dataMatrix 和后面得到的 exprSet 都是我们想要的表达矩阵

## 因为你有时候获取别人处理好的表达矩阵，不符合你的normalization要求。

这个芯片一般是处理12个样本，从GEO里面很容易看到样品是如何分组的。

lumi这个包甚至还提供了一个函数produceGEOSubmissionFile来直接把我们的芯片数据转换成NCBI的GEO要求的格式

最后，官网链接很重要：https://support.illumina.com/array/array_kits/humanht-12_v4_expression_beadchip_kit/downloads.html

用SomaticSignatures包来解析maf突变数据获得mutation signature

ulwvfje — Fri, 06 May 2016 12:26:19 +0000

mutation signature这个概念提出来还不久，我看了看文献，最早见于2013年的一篇nature文章，主要是用来描述癌症患者的somatic mutation情况的。

首先要自己分析癌症样本数据，拿到somatic mutation，TCGA计划发展到现在已经有非常多的somatic mutation结果啦，大家可以自行选择感兴趣的癌症数据拿来研究，解析一下mutation signature 。

我这里给大家推荐一个工具，是R语言的Bioconductor系列包中的一个，SomaticSignatures

其实它的说明书已经写的非常详细了，如果你理解了mutation signature的概念，很容易用那个包，其实你自己写一个脚本也是非常容易的，就是根据mutation的位置在基因组中找到它的前后一个碱基，然后组成三碱基突变模式，最后统计一下那96种突变模式的分布状况！

我这里简单讲一讲这个包如何用吧！

首先下载并加载几个必须的包：

library(SomaticSignatures)                    # the analysis package
library(SomaticCancerAlterations)             # bundled test data
library(BSgenome.Hsapiens.1000genomes.hs37d5) # the reference genome
library(VariantAnnotation)

# The central data structure is the GRanges class of the GenomicRanges
# package. Author's note: SomaticCancerAlterations provides test data from
# exome sequencing projects of 8 different cancers.

# Study descriptions. Author's note: each project sequenced several hundred
# samples; only the somatic mutation calls matter here.
sca_metadata <- scaMetadata()

# Load every data set and flatten the result into a single object.
sca_data <- unlist(scaLoadDatasets())

然后根据突变数据做好一个GRanges对象，这个可以看我以前的博客

# Derive the study label from the element names: upper-case them and keep
# the part before the last underscore.
study_id <- gsub("(.*)_(.*)", "\\1", toupper(names(sca_data)))
sca_data$study <- factor(study_id)

# Keep SNPs only, drop the element names, and restrict to the autosomes.
snp_only <- subset(sca_data, Variant_Type %in% "SNP")
sca_data <- keepSeqlevels(unname(snp_only), hsAutosomes())

# Build the VRanges object that serves as the input for the package.
# Author's note: readVcf or readMutect can be used instead to read local
# somatic mutation files.
sca_vr <- VRanges(
  seqnames = seqnames(sca_data),
  ranges = ranges(sca_data),
  ref = sca_data$Reference_Allele,
  alt = sca_data$Tumor_Seq_Allele2,
  sampleNames = sca_data$Patient_ID,
  seqinfo = seqinfo(sca_data),
  study = sca_data$study
)
sca_vr

# Number of somatic mutations per study, largest first.
sort(table(sca_vr$study), decreasing = TRUE)

LUAD SKCM HNSC LUSC KIRC GBM THCA OV

208724 200589 67125 61485 24158 19938 6716 5872

# Look up the sequence context of every mutation in the reference genome.
# Per the author's note the result gains two columns: alteration and context.
sca_motifs <- mutationContext(sca_vr, BSgenome.Hsapiens.1000genomes.hs37d5)
head(sca_motifs)

# Build the motifs x studies matrix. With normalize = TRUE it holds
# frequencies of the 96 mutation types rather than raw counts.
sca_mm <- motifMatrix(sca_motifs, group = "study", normalize = TRUE)
head(round(sca_mm, 4))

# Mutation spectrum of each study.
plotMutationSpectrum(sca_motifs, "study")

# Decompose the spectrum into signatures with both NMF and PCA.
n_sigs <- 5
sigs_nmf <- identifySignatures(sca_mm, n_sigs, nmfDecomposition)
sigs_pca <- identifySignatures(sca_mm, n_sigs, pcaDecomposition)

# Author's note: further accessors exist for exploring the result
# (signatures, samples, observed and fitted).

需要我们掌握的是assessNumberSignatures，用来探索我们到底应该把spectrum分成多少个signature

# Assess the decomposition for 2 to 8 signatures with both methods.
n_sigs <- 2:8
gof_nmf <- assessNumberSignatures(sca_mm, n_sigs, nReplicates = 5)
gof_pca <- assessNumberSignatures(sca_mm, n_sigs, pcaDecomposition)
plotNumberSignatures(gof_nmf) # visualise the assessment

# Visualise the NMF result: signatures, spectra, and the contribution of the
# signatures to each sample group.
library(ggplot2)
plotSignatureMap(sigs_nmf) + ggtitle("Somatic Signatures: NMF - Heatmap")
plotSignatures(sigs_nmf) + ggtitle("Somatic Signatures: NMF - Barchart")
plotObservedSpectrum(sigs_nmf)
plotFittedSpectrum(sigs_nmf)
plotSampleMap(sigs_nmf)
plotSamples(sigs_nmf)

同理，PCA的结果也可以同样的可视化展现：

# Heatmap of the PCA signatures.
plotSignatureMap(sigs_pca) + ggtitle("Somatic Signatures: PCA - Heatmap")

# Bar chart of the PCA signatures.
plotSignatures(sigs_pca) + ggtitle("Somatic Signatures: PCA - Barchart")

# Fitted spectrum for the PCA decomposition.
plotFittedSpectrum(sigs_pca)

# Observed spectrum for the PCA decomposition.
plotObservedSpectrum(sigs_pca)

值得一提的是，所有的plot系列函数，都是基于ggplot的，所以可以继续深度定制化绘图细节。

# The plot* functions are ggplot-based, so their result can be customised
# with ordinary ggplot2 layers. Fixes over the original: the title said
# "TGCA" instead of "TCGA", and the customised plot was assembled but never
# drawn.
p <- plotSamples(sigs_nmf) +
  theme(legend.position = "none") +                 # remove the legend
  xlab("Studies") +                                 # relabel the axis
  ggtitle("Somatic Signatures in TCGA WES Data") +  # add a title
  scale_fill_brewer(palette = "Blues") +            # change the colour scale
  theme(axis.text.x = element_text(size = 9))       # smaller x-axis labels
print(p)

# The motif matrix can also be clustered and shown as a dendrogram.
clu_motif <- clusterSpectrum(sca_mm, "motif")
library(ggdendro)
p <- ggdendrogram(clu_motif, rotate = TRUE)
print(p)

# Author's note: because 8 different studies are combined, batch effects are
# to be expected and should be removed where possible.

用samr包对芯片数据做差异分析

ulwvfje — Thu, 05 May 2016 11:43:04 +0000

本来搞差异分析的工具和包就一大堆了，而且limma那个包已经非常完善了，我是不准备再讲这个的，正好有个同学问了一下这个包，我就随手测试了一下，顺便看看它跟limma有什么差异没有！手痒了就记录了测试流程！

学习一个包其实非常简单，就是找到包的官网看看说明书即可！说明书链接

samr这个包更简单，就一个函数SAM,但是根据分析数据的不同被包装成了两个函数，分别是处理高通量测序数据的SAMseq和处理芯片数据的samr,本次我只讲解芯片数据的处理，然后跟limma这个包做一个简单比较~

所以，我们只需要制作好数据，然后学会用samr这个函数即可！

我们还是利用CLL这个包的测试数据来讲解这个包的用法，首先也是制作表达矩阵和分组信息。

# Build the expression matrix and the group labels from the CLL test data.
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
exprSet <- exprs(sCLLex)   # sCLLex is an object shipped with the CLL package
samples <- sampleNames(sCLLex)
pdata <- pData(sCLLex)
# The second phenotype column holds the group label of each sample.
group_list <- as.character(pdata[, 2])
group_list

##  [1] "progres." "stable"   "progres." "progres." "progres." "progres."
##  [7] "stable"   "stable"   "progres." "stable"   "progres." "stable"  
## [13] "progres." "stable"   "stable"   "progres." "progres." "progres."
## [19] "progres." "progres." "progres." "stable"

# The same labels as integer codes.
as.numeric(as.factor(group_list))

##  [1] 1 2 1 1 1 1 2 2 1 2 1 2 1 2 2 1 1 1 1 1 1 2

这个表达矩阵exprSet和分组信息group_list就可以直接用来做差异分析啦~！它对分组信息的要求比较特殊，需要1,1,1,2,2,2这样的数值向量，所以我用了as.numeric(as.factor(group_list))，具体见下面的代码！

suppressPackageStartupMessages(library(samr))

# Assemble the input list for samr: expression matrix, integer group codes,
# running gene ids, probe names as gene names, and the flag that the values
# are already log2-transformed.
group_codes <- as.numeric(as.factor(group_list))
data <- list(
  x = exprSet,
  y = group_codes,
  geneid = as.character(1:nrow(exprSet)),
  genenames = rownames(exprSet),
  logged2 = TRUE
)

# Two-class unpaired analysis with 100 permutations.
samr.obj <- samr(data, resp.type = "Two class unpaired", nperms = 100)

这样其实已经OK啦，重点是如何调整这个函数的参数，以及如何理解这个函数返回的结果(samr.obj这个对象非常重要，关乎你能否真正用好samr)~

我这里的genenames其实是探针名，如果真正要做分析，可以修改，而且我的nperms次数为100，也可以修改，一般是1000.

除了直接应用它找差异基因外，它还有几个单独的函数

首先是对表达矩阵进行normalization

# Normalize the expression matrix with samr and compare the per-sample
# distributions before and after.
x.norm <- samr.norm.data(data$x)

# rainbow() expects the NUMBER of colours. The original passed the whole
# matrix, so the palette size was taken from the first expression value
# instead of the number of samples. One colour per sample (column) is what
# the boxplots need.
sample_cols <- rainbow(ncol(exprSet))

# Two panels side by side; the previous layout is restored afterwards.
old_par <- par(mfrow = c(1, 2))
boxplot(exprSet, col = sample_cols, main = "before normalization", las = 2)
boxplot(x.norm, col = sample_cols, main = "after normalization", las = 2)
par(old_par)

看图好像没什么区别

另外几个函数，我就不一一介绍了，大家可以自行探索。

* samr.plot(samr.obj, del, min.foldchange=0)

* samr.plot(samr.obj, del=.3)

* samr.assess.samplesize.obj<- samr.assess.samplesize(samr.obj, data, log2(1.5))

* samr.assess.samplesize.plot(samr.assess.samplesize.obj)

我们重点看看这个samr得到的差异与limma的差异区别在哪里

# Compare the p-values from samr with those from limma.
# Changes from the original: topTable arguments are spelled out in full
# (adjust.method, number) instead of relying on partial matching, and both
# vectors are indexed by the same sorted set of shared probe names, so the
# comparison cannot silently pair different probes if the name sets differ.

# p-values of the samr test, derived from the permutation statistics.
pv <- samr.pvalues.from.perms(samr.obj$tt, samr.obj$ttstar)

# p-values of the limma test (second coefficient of the design).
library(limma)
design <- model.matrix(~factor(sCLLex$Disease))
fit <- lmFit(sCLLex, design)
fit <- eBayes(fit)
options(digits = 4)
DEG_limma <- topTable(fit, coef = 2, adjust.method = "BH", number = Inf)
pv_limma <- setNames(DEG_limma$P.Value, rownames(DEG_limma))

# Probes present in both results, in sorted order.
shared_probes <- sort(intersect(names(pv), names(pv_limma)))
head(pv[shared_probes])

##  100_g_at   1000_at   1001_at 1002_f_at 1003_s_at   1004_at 
##    0.2531    0.4144    0.5671    0.5686    0.4687    0.6340

head(pv_limma[shared_probes])

##  100_g_at   1000_at   1001_at 1002_f_at 1003_s_at   1004_at 
##    0.2497    0.4312    0.5349    0.5498    0.4361    0.6473

cor(pv[shared_probes], pv_limma[shared_probes])

## [1] 0.9976

从数据上来看，没什么本质区别,而且相关系数高达0.9976.

所以结论是，没必要搞那么多的包，用limma就好了，甚至直接用t检验也是OK的

还有plot和summary也是可以直接作用于samr的结果samr.obj对象的

用oligo包来读取affymetrix的基因表达芯片数据-CEL格式数据

ulwvfje — Sat, 23 Apr 2016 14:58:31 +0000

前面讲到affy处理的芯片平台是有限的，一般是hgu 95系列和133系列，[HuGene-1_1-st] Affymetrix Human Gene 1.1 ST Array这个平台虽然也是affymetrix公司的，但是affy包就无法处理了，这时候就需要oligo包了！

oligo包是R语言的bioconductor系列包的一个，就一个功能，读取affymetrix的基因表达芯片数据-CEL格式数据，处理成表达矩阵！！！

同理，我们也是要下载原始数据：一个例子：GSE48452

下载之后，解压到指定目录，就可以直接用oligo包啦！

# Read Affymetrix CEL files with oligo and summarise them with RMA.
# Changes from the original: the package is loaded explicitly, TRUE is
# spelled out instead of T, the argument is written as full.names (the
# original relied on partial matching of `full.name`), and the full-width
# space that preceded a trailing comment is gone.
library(oligo)

# Full paths of the CEL files; gzipped files are listed too, so there is no
# need to decompress them first.
geneCELs <- list.celfiles(
  "/path/GSE48452/cel_files/",
  listGzipped = TRUE,
  full.names = TRUE
)

# Read the CEL files.
affyGeneFS <- read.celfiles(geneCELs)

# Normalization; this step takes a while. Two summarisation targets are
# computed. Author's note: the transcript-level ("core") one is usually the
# one to use.
geneCore <- rma(affyGeneFS, target = "core")
genePS <- rma(affyGeneFS, target = "probeset")

# On this platform the probe annotation has to be attached by hand.
featureData(genePS) <- getNetAffx(genePS, "probeset")
featureData(geneCore) <- getNetAffx(geneCore, "transcript")

# Mapping probe IDs to gene IDs is a separate step and is not covered here.

处理之后得到的表达矩阵应该是与GEO官网的一致，大家可以自己对照检查一下：

ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE48nnn/GSE48452/matrix/GSE48452_series_matrix.txt.gz

用affy包读取affymetrix的基因表达芯片数据-CEL格式数据

ulwvfje — Sat, 23 Apr 2016 14:50:46 +0000

Affymetrix的探针（probe）一般是长为25碱基的寡聚核苷酸；探针总是以perfect match 和mismatch成对出现，其信号值称为PM和MM，成对的perfect match 和mismatch有一个共同的affyID。
CEL文件：信号值和定位信息。
CDF文件：探针对在芯片上的定位信息

affy包是R语言的bioconductor系列包的一个，就一个功能，读取affymetrix的基因表达芯片数据-CEL格式数据，处理成表达矩阵！！！

一般我们都是去GEO数据库里面找到CEL文件的下载地址~~~比如GSE1428，测了10 young (19-25 years old) and 12 older (70-80 years old) male的样品，然后找差异基因，从GEO数据库我们找到cel文件下载地址是：

ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE1nnn/GSE1428/suppl/GSE1428_RAW.tar

我们是为了讲解affy才下载原始数据的，其实GEO也提供处理好的表达矩阵供下载

下载后解压到指定目录即可

下载到本地之后就可以用代码读取它了！

# Read every CEL file in the directory and compute MAS5 expression values.
library(affy)
dir_cels <- 'D:\\test_analysis\\TNBC\\cel_files'
affy_data <- ReadAffy(celfile.path = dir_cels)
eset.mas5 <- mas5(affy_data)

读取的过程还是蛮耗时间的，也可以选择rma函数而不是mas5函数对表达数据进行normalization

读取之后的表达矩阵如图所示：

理论上，处理得到的数据应该与直接在GEO官网下载的表达量是一样的，下载链接都是有规律的！

ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE1nnn/GSE1428/matrix/GSE1428_series_matrix.txt.gz

当然这个affy包支持的芯片平台是有限的！

一般是hgu 95系列和133系列~~

其实严格来说，这个芯片得到的表达矩阵，是需要过滤的。

比如像下面的代码：

# RMA-normalize the CEL files and drop the probesets that are called
# 'absent' (A) in every sample.
# Changes from the original:
# - no setwd(): the CEL directory is addressed relative to the parent
#   directory instead of changing the session's working directory;
# - the filter is guarded: with a negative index, an empty `absent` vector
#   would select zero rows and throw away the whole data set.
library(affy)
dir_cels <- file.path("..", "GSE34824_RAW")
data <- ReadAffy(celfile.path = dir_cels)
eset <- rma(data)

# Present/Marginal/Absent calls, as a character matrix (probesets x samples).
calls <- exprs(mas5calls(data))

# Row indices of the probesets that are 'absent' in all samples.
absent <- which(rowSums(calls == "A") == ncol(calls))

# Remove them, but only if there is something to remove.
rmaFiltered <- if (length(absent) > 0) eset[-absent, ] else eset

54675 features 经过过滤后，剩下 42482 features

R包精讲第四篇：4种R包安装方式

ulwvfje — Tue, 12 Apr 2016 15:45:07 +0000

请先看：R包精讲第一篇：如何查看你已经安装了和可以安装哪些R包？

第一种方式，当然是R自带的函数直接安装包了，这个是最简单的，而且不需要考虑各种包之间的依赖关系。

对普通的R包，直接install.packages()即可，一般下载不了都是包的名字打错了，或者是R的版本不够，如果下载了安装不了，一般是依赖包没弄好，或者你的电脑缺少一些库文件，如果实在是找不到或者下载慢，一般就用repos=来切换一些镜像。

> install.packages("ape")  ##直接输入包名字即可
Installing package into ‘C:/Users/jmzeng/Documents/R/win-library/3.1’
(as ‘lib’ is unspecified)  ##一般不指定lib，除非你明确知道你的lib是在哪里
trying URL 'http://mirror.bjtu.edu.cn/cran/bin/windows/contrib/3.1/ape_3.4.zip'
Content type 'application/zip' length 1418322 bytes (1.4 Mb)
opened URL   ## 根据你选择的镜像，程序会自动拼接好下载链接url
downloaded 1.4 Mb

package ‘ape’ successfully unpacked and MD5 sums checked  ##表明你已经安装好包啦

The downloaded binary packages are in  ##程序自动下载的原始文件一般放在临时目录，会自动删除
	C:\Users\jmzeng\AppData\Local\Temp\Rtmpy0OivY\downloaded_packages

对于bioconductor的包，我们一般是

# Load the biocLite() installer script from bioconductor.org; per the
# author's note this installs BiocInstaller.
source("http://bioconductor.org/biocLite.R")

# To switch to a mirror, set this option before installing:
# options(BioC_mirror = "http://mirrors.ustc.edu.cn/bioc/")
biocLite("ggbio")

或者直接BiocInstaller::biocLite('ggbio') ## 前提是你已经安装好了BiocInstaller

某些时候你还需要卸载remove.packages("BiocInstaller") 然后安装新的

第二种方式，是直接找到包的下载地址，需要进入包的主页

# Install specific package versions straight from their archive URLs.
# The original assigned `packageurl` twice before the first install, so the
# ggplot2 URL was overwritten and gridExtra was installed twice. Each URL is
# now installed exactly once, in the order listed.
# NOTE(review): assumes both packages were meant to be installed - confirm.
package_urls <- c(
  "http://cran.r-project.org/src/contrib/Archive/ggplot2/ggplot2_0.9.1.tar.gz",
  "http://cran.r-project.org/src/contrib/Archive/gridExtra/gridExtra_0.9.1.tar.gz"
)
# Other URLs that can be installed the same way:
#   "http://www.bioconductor.org/packages/2.11/bioc/src/contrib/ggbio_1.6.6.tar.gz"
#   "http://cran.r-project.org/src/contrib/Archive/ggplot2/ggplot2_1.0.1.tar.gz"
for (packageurl in package_urls) {
  install.packages(packageurl, repos = NULL, type = "source")
}

这样安装的就不需要选择镜像了，也跨越了安装器的版本！

第三种是，先把包下载到本地，然后安装：

# Download the source tarball first (a browser works just as well), then
# install it from the local file.
tarball_url <- "http://bioconductor.org/packages/release/bioc/src/contrib/BiocInstaller_1.20.1.tar.gz"
tarball_file <- "BiocInstaller_1.20.1.tar.gz"
download.file(tarball_url, tarball_file)
install.packages(tarball_file, repos = NULL)
# With an IDE such as RStudio the same can be done with the mouse.
或者用choose.files()来手动交互的选择你把下载的源码BiocInstaller_1.20.1.tar.gz放到了哪里。

这种形式大部分安装都无法成功，因为R包之间的依赖性很强！

第四种是：命令行版本安装

如果是linux版本，命令行从网上自动下载包如下：
sudo su - -c \
"R -e \"install.packages('shiny', repos='https://cran.rstudio.com/')\""
如果是linux，命令行安装本地包，在shell的终端
sudo R CMD INSTALL package.tar.gz
Windows或者mac平台一般不推荐命令行格式，可视化那么舒心，何必自讨苦吃

R包精讲第三篇：如何切换镜像？

ulwvfje — Tue, 12 Apr 2016 13:11:53 +0000

这个技巧很重要，一般来说，R语言自带的install.packages函数来安装一个包时，都是用的默认的镜像！

如果你是用的Rstudio这个IDE，你的默认镜像就是： https://cran.rstudio.com/

如果你直接用的R语言，那么就是："http://cran.us.r-project.org" 但是一般你安装的时候会提醒你选择。

而我们一般需要更改成自己最方便的

install.packages(pkgs, lib, repos = getOption("repos"),
contriburl = contrib.url(repos, type),
method, available = NULL, destdir = NULL,
dependencies = NA, type = getOption("pkgType"),
configure.args = getOption("configure.args"),
configure.vars = getOption("configure.vars"),
clean = FALSE, Ncpus = getOption("Ncpus", 1L),
verbose = getOption("verbose"),
libs_only = FALSE, INSTALL_opts, quiet = FALSE,
keep_outputs = FALSE, ...)

如果是在国内， install.packages("ABC", repos="http://mirror.bjtu.edu.cn/cran/")，换成北京交通大学的镜像，飞一般的感觉！

如果想永久设置，就用options修改即可。

如果你是Rstudio的IDE，那么直接进入全局设置，一劳永逸的选择好镜像！

你可以check一下每个镜像的包是不是一致的：

# Fetch the package index of one mirror and report its dimensions.
mirror_index <- available.packages(
  contriburl = "http://cran.rstudio.com/bin/windows/contrib/3.2/"
)
dim(mirror_index)

更改镜像主页及包的版本即可查看所有镜像各提供哪些包！

当然，我们的bioconductor其实也是有镜像的，只是大部分人都不知道，也不会去用而已！

# Load the biocLite() installer script.
source("http://bioconductor.org/biocLite.R")

# Point Bioconductor downloads at a mirror (here mirrors.ustc.edu.cn).
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")

# Install a package; per the author's note this now downloads from the USTC mirror.
biocLite("RGalaxy")

# More Bioconductor mirrors: https://www.bioconductor.org/about/mirrors/

# See also: https://stat.ethz.ch/R-manual/R-devel/library/utils/html/chooseBioCmirror.html