getgeo geo表达矩阵
矩阵 表达 Geo
2023-09-14 09:09:48 时间
glp平台基因和探针转化
getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE163224_a549_contact_esophil/")
library(GEOquery)
f='GSE163224_eSet.Rdata'
if(!file.exists(f)){
gset <- getGEO('GSE163224', destdir=".",
AnnotGPL = F, ## 注释文件
getGPL = F) ## 平台文件
save(gset,file=f) ## 保存到本地
}
getwd()
load('GSE163224_eSet.Rdata') ## 载入数据
class(gset) #查看数据类型
length(gset) #
class(gset[[1]])
gset
exprs(gset[[1]])
gset[[1]]
a=exprs(gset[[1]])
head(a)
a['220570_at',]
a[rownames(a)=='220570_at',]
library(dplyr)
myexpression=as.data.frame(a)
head(myexpression)
myexpression$probe_id=rownames(myexpression)#将行名转化为列名方便合并
head(myexpression)
#方法1##下载p值为1的geo2r 获取探针和基因的对应关系 对此数据不适用
if(1==1){
input=read.table("G:\\silicosis\\geo\\GSE103548_rna-seq_llc_mle-12\\GSE163224_a549_contact_esophil\\GSE163224.top.table.tsv",
header = T,
sep = "\t",
comment.char = "!")
head(input)
input$probe_id=input$ID
head(input)
mydata=merge(myexpression,input,by='probe_id')
head(mydata)
colnames(a)
boxplot(mydata[,2:42])
getwd()
save(mydata,file = "G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")
load("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")
}
#方法二###下载基因芯片平台的探针和基因的对应关系 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL24324
#http://zouyawen.top/2020/10/21/GEO/#more
getwd()
#gpGPL24324<-getGEO('GPL24324')#获取探针信息进行ID转换
library(readr)
gene_info <- read_delim("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE163224_a549_contact_esophil/GPL24324-69524.txt",
"\t", escape_double = FALSE, comment = "#",
trim_ws = TRUE)
head(gene_info)
gene_info$probe_id=gene_info$ID
gene_info=gene_info %>% dplyr::select(ID, Gene_Symbol = 'SPOT_ID...10')
head(gene_info)
#gene_info=as.data.frame(gene_info)
head(gene_info,4)
myexpression[1:3,1:3]
head(myexpression)
myexpression['TC0100006437.hg.1',]
expr_id_symbol <- merge(myexpression,gene_info,by='probe_id')#合并表达矩阵和基因信息表
#表明有合并后的列表中有NA值,应该是gene_info里的
head(expr_id_symbol)
expr_id_symbol[10000:100003,]
head(expr_id_symbol$Gene_Symbol)
table(is.na(expr_id_symbol))
pos <- which(is.na(expr_id_symbol$Gene_Symbol))#找出NA值的行号
#expr_id_symbol <- expr_id_symbol[-pos,]#去除NA值所在的行,保留剩下的行
table(is.na(expr_id_symbol))#表明所有NA值都已经去除了
#test
if(1==1){
mystring1="NM_001142401 // Ref(fad)Seq // Homo (cd64) asf(fda)daf af (aavd) Afadf"
strsplit(mystring1,split = "(",fixed = T) ##设置fixed = T,正常拆分,参数作用为精确匹配,屏蔽正则表达式
str_detect(mystring1, regex(".\\(.", dotall = TRUE))
str_split(mystring1,pattern = "\\(.+\\)")
str_split(mystring1,pattern = "\\(*\\)")
str_extract(mystring1,pattern ="\\(.+\\)" )
str_extract(mystring1,pattern ="\\(.{2,15}\\)" )
#https://blog.csdn.net/violin256/article/details/115189643
str_extract(mystring1,pattern = "[\\(|\\(].*[\\)|\\)]$")
str_extract(mystring1,pattern = "[\\(].*[\\)]")
str_extract(mystring1,pattern = "[\\(^].*[\\)$]")
str_extract(mystring1,pattern = "^[^}]*\}")
str_extract(mystring1,pattern = "^[^\\)]*")
str_extract(mystring1,pattern = "^[^\\(]*")
str_extract(mystring1,pattern = "[^\\(]*")
#String skh ="(?<=\\《)[^\\》]+";//用于匹配《》里面的文本
#https://blog.csdn.net/WuLex/article/details/82116701
str_extract(mystring1,pattern = "[\\(]*\\)")
str_extract(mystring1,pattern ="(?<=\\()[^\\)]+") #匹配第一个括号内的内容
library(stringr)
strsplit(mystring1,split = "(*)")
str_split(mystring1,pattern = "(*)",simplify = T)
rs <- ("This&is&First&R&String&Example")
strsplit(rs, split = "&")
library(stringi)
stri_split_regex("Lorem ipsum dolor sit amet",
"\\p{Z}+") # see also stri_split_charclass
stri_split_regex(mystring1,
pattern = "?+") # see also stri_split_charclass
}
#一个探针对应多个基因的情况下取第一个个基因 匹配第一个括号里的内容
#https://blog.csdn.net/WuLex/article/details/82116701
symbol<-lapply(expr_id_symbol$Gene_Symbol,
FUN = function(x){
str_extract(x,pattern ="(?<=\\()[^\\)]+")
})
head(symbol)
symbol[10000:100003]
expr_id_symbol$Gene_Symbol2 <- as.character(symbol)#得到的唯一的symbol替换原来的Gene_Symbol列
head(expr_id_symbol)
colnames(expr_id_symbol)
ncol(expr_id_symbol)
#多个探针对应一个基因的情况下,取平均值
library(dplyr)
data4 <- expr_id_symbol[,2:27] %>% #数据从第二列开始,第一列是ID,最后一列是Gene_Symbol
arrange(Gene_Symbol2) %>% #按Gene_Symbol排序
group_by(Gene_Symbol2) %>% #Gene_Symbol分组
summarise_all(mean)
#将Gene_Symbol列变为行名
head(data4)
data4 <- column_to_rownames(data4,var = 'Gene_Symbol')
nrow(myexpression)
nrow(expr_id_symbol)
nrow(data4)
getwd()
save(data4,file = "G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE163224_a549_contact_esophil/data4.rds")
https://blog.csdn.net/WuLex/article/details/82116701