新的任務,找到這 276 genes encompassed nine major DDR pathways:
base excision repair (BER),
nucleotide excision repair (NER),
mismatch repair (MMR),
the Fanconi anemia (FA) pathway,
homology-dependent recombination (HR),
non-homologous DNA end joining (NHEJ),
direct damage reversal repair (DR),
translesion DNA synthesis (TLS),
nucleotide pool maintenance (NP)
然后搞清楚每個基因出現在多少個通路,跟上次的任務比較像,來自于Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlashttps://www.cell.com/cell-reports/pdf/S2211-1247(18)30437-6.pdf
文章在線網址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5961503/
276個基因在補充材料表格1,截圖如下:
根據Jimmy老師提示KEGGREST包含KEGG通路里的所有基因,那么反過來理論上也是可行的。所以查找KEGGREST包的說明,了解這個包的使用。
rm (list=ls())
Sys.setenv(LANGUAGE = "en") #顯示英文報錯信息
options(stringsAsFactors = FALSE) #禁止chr轉成factor
{ #install.packages
options(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
if(!require("xlsx")) install.packages("xlsx",update = F,ask = F)
if(!require("ggplot2")) install.packages("ggplot2",update = F,ask = F)
if(!require("tidyverse")) install.packages("tidyverse",update = F,ask = F)
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
if(!require("KEGGREST")) BiocManager::install("KEGGREST",update = F,ask = F)
if(!require("clusterProfiler")) BiocManager::install("clusterProfiler",update = F,ask = F)
if(!require("UpSetR")) BiocManager::install("UpSetR",update = F,ask = F)
}
# 載入數據,補充材料表1
library("xlsx")
res<- read.xlsx("1-s2.0-S2211124718304376-mmc2.xlsx", 1, header=F, colClasses=NA)[-(1:2),1:2]
這里可以查看差異基因在哪些KEGG通路
library(KEGGREST)
listDatabases()
res1 <- paste("hsa:",res[-1, 1]) #根據包說明書對變量重命名
res2 <- gsub(" ", "", res1) #正則表達式去掉空格
res3 <- keggLink("pathway", res2)#查找通路
res3
開始進行分析
DDR.list <- list("hsa03410", "hsa03420", "hsa03430", "hsa03440", "hsa03450", "hsa03460")
# 批量提取幾個pathway的基因
DDRgene <- lapply(DDR.list, function(i){
#i="hsa03410"
d1 <- KEGGREST::keggGet(i)[[1]]$GENE
gene <- sapply(seq(2, length(d1), 2), function(x){
d2 <- unlist(strsplit(d1[x], ";"))[1]
})
})
#有三個通路在KEGG數據庫中找不到,在其他數據庫
#Reactome/wikipathways/SMPDB/Reactome/BioCarta Pathway/Pathway Commons找到的,但是與原文有些不一致
DDR.list2 <- as.list(read.csv("NP.csv"))
DDR.list2
# $Gene.Name
# [1] "NUDT1" "NUDT15" "NUDT18" "RRM1" "RRM2"
DDR.list3 <- as.list(read.csv("TLS.csv"))
DDR.list3
#$Gene.Name
[1] "REV3L" "SPRTN" "DDX11" "TACC3" "NUP160" "SPDL1" "PARP3"
[8] "TPR" "POLQ" "EIF2AK2" "NDC1" "ZW10" "AURKA" "TTC28"
[15] "POLE2" "NIN" "CEP128" "PABPN1" "PABPC1L" "RAE1" "CDC25B"
[22] "FAM83D" "POLI" "STAG2" "EMD" "BEX4" "PARP4" "RPA3"
[29] "POLE4" "MAD2L2" "ADPRHL2" "CDC20" "NEK2" "CENPF" "RPA2"
[36] "POLK" "DLGAP5" "PARP2" "RPA1" "VPS4A" "PCNA" "REV1"
[43] "FLNB" "CKAP2" "NUPL2" "ODF2" "CDK5RAP2" "NUMA1" "MAPKBP1"
[50] "TUBGCP4" "RMDN3" "PPM1B" "ACTR2" "KIF11" "KIF20B" "PARP9"
[57] "EIF4A3" "PARP1" "POLE3" "TUT1" "DSN1" "LATS2" "UBC"
[64] "TIPARP" "RCHY1" "TOPBP1" "POC1A" "MAD2L1" "CEP44" "PAPD4"
[71] "VCP" "UBB" "NUDCD2" "POLH" "USP32" "WDR73" "POLE"
[78] "CETN1" "UBE2N" "CALML3" "CALML5" "PARP10" "PARPBP" "BRCC3"
[85] "PARG"
DDR.list4<- as.list (read.csv("DR.csv"))
DDR.list4
# $Gene.Name
# [1] "ALKBH5" "ASCC2" "ASCC3" "ASCC1" "FTO" "ALKBH3" "MGMT" "ALKBH2"
merged.list <- c( DDRgene , DDR.list2, DDR.list3, DDR.list4)
#names(merged.list) <- c(1:9)
intersect(res[-1, 2], DDRgene[[1]])
overlap <- lapply(1:9, function(x){
res <- intersect(res[-1, 2], merged.list[[x]])
})
Reduce(intersect,overlap) # 也沒有overlap
library(ggplot2)
library(tidyverse)
count <- unlist(overlap) %>% table()
count <- as.data.frame(count) %>% base::subset(Freq > 1)
names(count)[1] <- "gene"
ggplot(count, aes(gene, Freq,colour=Freq, size =Freq))+
geom_point(stat="identity")+coord_flip()
做個韋恩圖
library(UpSetR)
listinput <- list(dfgene = res[-1, 2],
BER = merged.list[[1]],
NER = merged.list[[2]],
MMR = merged.list[[3]],
HR = merged.list[[4]],
NHEJ = merged.list[[5]],
FA = merged.list[[6]],
NP = merged.list[[7]],
TLS = merged.list[[8]],
DR = merged.list[[9]])
pdf(file='upset.pdf',height = 8,width = 8)
p <- upset(fromList(listinput),nsets = 9, order.by = "freq")
dev.off()
單個基因/單個通路
keggLink("pathway", "hsa:7157" )#單個基因,比如TP53
png <- keggGet("path:hsa01522", "image")# 看下通路
t <- tempfile()
library(png)
writePNG(png, t)
if (interactive()) browseURL(t)
做個富集分析
library(clusterProfiler)
kegg.result <- enrichKEGG(gene=res[-1,1],
organism="hsa",
pvalueCutoff=0.05,
pAdjustMethod="BH",
qvalueCutoff=0.1,
keyType = "kegg")
barplot(kegg.result)
dotplot(kegg.result)
write.csv(kegg.result,"kegg.csv")
注意:這個任務中最關鍵的是如何確定九條信號通路中有哪些基因,但是各個數據庫并不一致,這是造成與作者原圖有出路的主要原因,作者在補充材料表格3中有276個基因的分類,可以參考。
特別感謝內蒙古生信菜鳥團尋找DDR通路中關鍵基因
,部分代碼參考他寫的帖子!
參考文獻
Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlas
KEGG學習筆記
簡介Bioconductor中的幾個注釋信息數據庫
生信技能樹公益視頻合輯:學習順序是linux,r,軟件安裝,geo,小技巧,ngs組學!
B站鏈接:https://m.bilibili.com/space/338686099
YouTube鏈接:https://m.youtube.com/channel/UC67sImqK7V8tSWHMG8azIVA/playlists
生信工程師入門最佳指南:https://mp.weixin.qq.com/s/vaX4ttaLIa19MefD86WfUA
學徒培養:https://mp.weixin.qq.com/s/3jw3_PgZXYd7FomxEMxFmw
生信技能樹 - 簡書 http://www.lxweimin.com/u/d645f768d2d5
https://www.wikipathways.org/index.php/Pathway:WP1928
https://www.cnblogs.com/muchen/p/5412278.html