# 线性混合效应模型

### 下载数据

step1.从GEO下载表达数据和表型信息

``````rm(list = ls())
### 下载GEO数据
library(GEOquery)
gset <- getGEO('GSE90627', destdir="./raw-data/",
AnnotGPL = F, ## 注释文件
getGPL = F)
gset
#注释文件
gpl <- getGEO("GPL17077",destdir="./raw-data/")
ids <- Table(gpl)
### 取表达矩阵
a=gset[[1]]
expr=exprs(a)
dim(expr)
expr[1:4,1:4]
#把探针号换成gene symbol
rownames(expr) <- ids[match(rownames(expr),ids\$ID),2]
### 表型信息
pd=pData(a)
pd <- pd[,c(1,2,37:40)]
colnames(pd)[3:6] <- c("age","gender","patient","tissue")
# 构建出patient和distance变量
library(stringr)
pd\$distance <- str_split(pd\$title,"_",simplify = T)[,2]
pd\$distance[97:128] <- rep("0cm",32)
pd\$patient <- str_split(pd\$title,"_",simplify = T)[,3]
pd\$patient[97:128] <- str_split(pd\$title[97:128],"_",simplify = T)[,2]
str(pd) #查看数据结构
save(expr,pd,file = "./Rdata/GSE90627_eSet.Rdata")
``````

step2.文章的Supplementary Figure 1记录了每个患者的末端样本距离，复制到excel中，保存为sTable1.csv

``````rm(list=ls())
#读入取样末端距离数据
#将距离改为数字
pd\$distance2 <- substr(pd\$distance,1,1)
pd\$distance2[1:32] <- NP\$NP.cm. #前32个恰好是patient1-32的ProximalEnd样本
pd\$distance2 <- as.numeric(pd\$distance2)
pd\$patient <- as.factor(pd\$patient)
``````

### 学习线性混合效应模型

• 固定效应：被研究的变量，本例中是“距离”
• 随机效应：希望被控制的影响因素，本例中是“患者”

(1 | g) ：斜率固定，截距变化

x + (x | g) ：斜率和截距都变化

``````#以CXCL1为例
df <- data.frame(CXCL1=expr["CXCL1",],pd[,c(5,7,8)])
library(lme4)
lmer.fit <- lmer(CXCL1~distance2+(1+distance2|patient),data = df)
summary(lmer.fit)
``````

``````# 画图查看随机效应
library(sjPlot)
plot_model(lmer.fit, type = "re", show.values = TRUE)
#查看每个患者分别计算的截距和斜率
coefficients(lmer.fit)
``````

``````library(ggeffects)
pred.mm <- ggpredict(lmer.fit, terms = c("distance2"))

ggplot(pred.mm) +
geom_point(data = df,aes(x = distance2, y = CXCL1, colour = distance),position = "jitter") +
geom_line(aes(x = x, y = predicted)) + # slope
geom_ribbon(aes(x = x, ymin = predicted - std.error, ymax = predicted + std.error),
fill = "lightgrey", alpha = 0.5) + # error band
theme_minimal()
``````

``````#固定效应是否显著
fit.full <- lmer(CXCL1~distance2+(1+distance2|patient),data = df,REML = F)
fit.null <- lmer(CXCL1~(1+distance2|patient),data = df,REML = F)
anova(fit.full,fit.null,test="LRT") #LRT代表极大似然检验
``````

### 计算所有基因与距离的相关性

``````## 批量计算
coeff <- list()
pval <- list()
for(i in 1:nrow(expr)){
#i=1
lmer.fit <- lmer(expr[i,]~pd\$distance2+(pd\$distance2|pd\$patient))
coeff[[i]] <- summary(lmer.fit)\$coefficients[2,1]

fit.full <- lmer(expr[i,]~pd\$distance2+(pd\$distance2|pd\$patient),REML = F)
fit.null <- lmer(expr[i,]~(pd\$distance2|pd\$patient),REML = F)

pval[[i]] <- anova(fit.full,fit.null,test="LRT")\$`Pr(>Chisq)`[2]
}

lmer.res <- data.frame(gene=rownames(expr),
coeff=unlist(coeff),
pval=unlist(pval),
stringsAsFactors = F)
#去掉重复的基因名
lmer.res <- lmer.res[order(lmer.res\$gene,lmer.res\$pval),]
lmer.res <- lmer.res[!duplicated(lmer.res\$gene),]
#校正FDR
save(lmer.res,file = "./Rdata/lmer_result.Rdata")
``````

### 与文章结果比较

``````load("./Rdata/lmer_result.Rdata")
# 按文章的FDR cutoff过滤结果
diff <- lmer.res[lmer.res\$FDR < 0.0001,]
dim(diff) #611个基因
table(diff\$coeff>0) #上调204个，下调407个

# 读入原文结果（从文章的补充数据下载）
up_df <- read_excel("./raw-data/oncotarget-08-61107-s002.xls", skip = 1)
down_df <- read_excel("./raw-data/oncotarget-08-61107-s003.xls", skip = 1)
df_paper <- rbind(up_df,down_df)

length(intersect(diff\$gene,df_paper\$GeneSymbol)) #交集580个

#韦恩图看交集
require("VennDiagram")
VENN.LIST=list(paper=df_paper\$GeneSymbol,my_res=diff\$gene)
venn.plot <- venn.diagram(VENN.LIST , NULL,
fill=c("#F4E99A", "#95D6F4"),
alpha=c(0.7,0.7), cex = 1.5,
cat.fontface=4)
grid.draw(venn.plot)
``````