zl程序教程

您现在的位置是:首页 >  其它

当前栏目

getgeo geo表达矩阵

矩阵 表达 Geo
2023-09-14 09:09:48 时间

glp平台基因和探针转化


getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE163224_a549_contact_esophil/")

library(GEOquery)

f='GSE163224_eSet.Rdata'
if(!file.exists(f)){
  gset <- getGEO('GSE163224', destdir=".",
                 AnnotGPL = F,     ## 注释文件
                 getGPL = F)       ## 平台文件
  save(gset,file=f)   ## 保存到本地
}

getwd()
load('GSE163224_eSet.Rdata')  ## 载入数据
class(gset)  #查看数据类型
length(gset)  #
class(gset[[1]])
gset
exprs(gset[[1]])
gset[[1]]


a=exprs(gset[[1]])
head(a)
a['220570_at',]
a[rownames(a)=='220570_at',]
library(dplyr)
myexpression=as.data.frame(a)
head(myexpression)
myexpression$probe_id=rownames(myexpression)#将行名转化为列名方便合并
head(myexpression)


#方法1##下载p值为1的geo2r  获取探针和基因的对应关系  对此数据不适用
if(1==1){
  input=read.table("G:\\silicosis\\geo\\GSE103548_rna-seq_llc_mle-12\\GSE163224_a549_contact_esophil\\GSE163224.top.table.tsv",
                   header = T,
                   sep = "\t",
                   comment.char = "!")
  head(input)
  input$probe_id=input$ID
  head(input)
  
  mydata=merge(myexpression,input,by='probe_id')
  head(mydata)
  
  colnames(a)
  boxplot(mydata[,2:42])
  getwd()
  
  save(mydata,file = "G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")
  load("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")
  
}


#方法二###下载基因芯片平台的探针和基因的对应关系 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL24324
#http://zouyawen.top/2020/10/21/GEO/#more
getwd()
#gpGPL24324<-getGEO('GPL24324')#获取探针信息进行ID转换
library(readr)
gene_info <- read_delim("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE163224_a549_contact_esophil/GPL24324-69524.txt", 
                        "\t", escape_double = FALSE, comment = "#", 
                        trim_ws = TRUE) 
head(gene_info)
gene_info$probe_id=gene_info$ID
gene_info=gene_info %>%    dplyr::select(ID, Gene_Symbol = 'SPOT_ID...10')
head(gene_info)
#gene_info=as.data.frame(gene_info)
head(gene_info,4)


myexpression[1:3,1:3]
head(myexpression)
myexpression['TC0100006437.hg.1',]



expr_id_symbol <- merge(myexpression,gene_info,by='probe_id')#合并表达矩阵和基因信息表
#表明有合并后的列表中有NA值,应该是gene_info里的
head(expr_id_symbol)
expr_id_symbol[10000:100003,]
head(expr_id_symbol$Gene_Symbol)
table(is.na(expr_id_symbol))
pos <- which(is.na(expr_id_symbol$Gene_Symbol))#找出NA值的行号
#expr_id_symbol <- expr_id_symbol[-pos,]#去除NA值所在的行,保留剩下的行
table(is.na(expr_id_symbol))#表明所有NA值都已经去除了

#test
if(1==1){
  mystring1="NM_001142401 // Ref(fad)Seq // Homo (cd64) asf(fda)daf  af (aavd) Afadf"
  
  strsplit(mystring1,split = "(",fixed = T) ##设置fixed = T,正常拆分,参数作用为精确匹配,屏蔽正则表达式
  str_detect(mystring1, regex(".\\(.", dotall = TRUE))
  str_split(mystring1,pattern = "\\(.+\\)")
  str_split(mystring1,pattern = "\\(*\\)")
  
  str_extract(mystring1,pattern ="\\(.+\\)" )
  str_extract(mystring1,pattern ="\\(.{2,15}\\)" )
  
  #https://blog.csdn.net/violin256/article/details/115189643
  str_extract(mystring1,pattern = "[\\(|\\(].*[\\)|\\)]$")
  str_extract(mystring1,pattern = "[\\(].*[\\)]")
  str_extract(mystring1,pattern = "[\\(^].*[\\)$]")
  str_extract(mystring1,pattern = "^[^}]*\}")
  str_extract(mystring1,pattern = "^[^\\)]*")
  str_extract(mystring1,pattern = "^[^\\(]*")
  str_extract(mystring1,pattern = "[^\\(]*")
  
  #String skh ="(?<=\\《)[^\\》]+";//用于匹配《》里面的文本 
  #https://blog.csdn.net/WuLex/article/details/82116701
  str_extract(mystring1,pattern = "[\\(]*\\)")
  str_extract(mystring1,pattern ="(?<=\\()[^\\)]+") #匹配第一个括号内的内容
  
  library(stringr)
  strsplit(mystring1,split = "(*)")
  str_split(mystring1,pattern = "(*)",simplify = T)
  
  
  rs <- ("This&is&First&R&String&Example")
  strsplit(rs, split = "&")
  
  
  library(stringi)
  stri_split_regex("Lorem ipsum dolor sit amet",
                   "\\p{Z}+") # see also stri_split_charclass
  
  stri_split_regex(mystring1,
                   pattern = "?+") # see also stri_split_charclass
  
}
#一个探针对应多个基因的情况下取第一个个基因 匹配第一个括号里的内容
#https://blog.csdn.net/WuLex/article/details/82116701
symbol<-lapply(expr_id_symbol$Gene_Symbol,
               FUN = function(x){
                 str_extract(x,pattern ="(?<=\\()[^\\)]+")
                                             })
head(symbol)
symbol[10000:100003]
expr_id_symbol$Gene_Symbol2 <- as.character(symbol)#得到的唯一的symbol替换原来的Gene_Symbol列

head(expr_id_symbol)
colnames(expr_id_symbol)
ncol(expr_id_symbol)

#多个探针对应一个基因的情况下,取平均值
library(dplyr)
data4 <- expr_id_symbol[,2:27] %>% #数据从第二列开始,第一列是ID,最后一列是Gene_Symbol
  arrange(Gene_Symbol2) %>% #按Gene_Symbol排序
  group_by(Gene_Symbol2) %>% #Gene_Symbol分组
  summarise_all(mean)
#将Gene_Symbol列变为行名
head(data4)
data4 <- column_to_rownames(data4,var = 'Gene_Symbol')


nrow(myexpression)
nrow(expr_id_symbol)
nrow(data4)

getwd()
save(data4,file = "G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE163224_a549_contact_esophil/data4.rds")


在这里插入图片描述

https://blog.csdn.net/WuLex/article/details/82116701