zl程序教程

您现在的位置是:首页 >  其它

当前栏目

自建函数取某个字符串固定位置的元素

函数 字符串 元素 位置 某个 固定
2023-09-14 09:09:49 时间

library(dplyr)



getwd()
dir.create("G:/r/duqiang_IPF/filtered_genes_from_different_dataset")
setwd("G:/r/duqiang_IPF/filtered_genes_from_different_dataset")

IPF_UIP_controlsgse32537=read.table(file ="G:/r/duqiang_IPF/GSE32537_cilium_Gene_signures_IPF/geo2r/GSE32537.top.table ncbi id.tsv"
                                    ,header = T,sep = "\t",
                                    fill = TRUE
                                    ,row.names = NULL)
head(IPF_UIP_controlsgse32537) 
str(IPF_UIP_controlsgse32537)

IPF_UIP_controlsgse32537_filtered=IPF_UIP_controlsgse32537 %>%
  transform(adj.P.Val=as.numeric(adj.P.Val)) %>%
  transform(logFC=as.numeric(logFC)) %>%
                         filter(adj.P.Val<0.05 &
                                  abs(logFC)>1)

head(IPF_UIP_controlsgse32537_filtered)

getwd()
#去掉

# 判断是否存在双斜杠 CBSL///CBS 如果存在则分开并取出元素
library(stringr)
str_split("CBSL///CBS",pattern = "///")
str_split("CBSLCBS",pattern = "///")
str_split("",pattern = "///")

wheter_double<-function(x){
  genes=str_split(x,pattern = "///")[[1]]
  return(genes)
}

IPF_UIP_controlsgse32537_filtered$Gene.symbol



genes=wheter_double(IPF_UIP_controlsgse32537_filtered$Gene.symbol)
genes

genes=sapply(IPF_UIP_controlsgse32537_filtered$Gene.symbol, wheter_double)
head(genes)
names(genes)=NULL
unlist(genes)
genes=data.frame(genes=unlist(genes))

openxlsx::write.xlsx(IPF_UIP_controlsgse32537_filtered,
                     file ="G:/r/duqiang_IPF/filtered_genes_from_different_dataset/IPF_UIP_controlsgse32537_filtered_both_abs(logFC).xlsx" )

openxlsx::write.xlsx(genes,
                     file ="G:/r/duqiang_IPF/filtered_genes_from_different_dataset/IPF_UIP_controlsgse32537_filtered_genesboth_abs(logFC).xlsx" )



#提取每行的行名第一个| 竖线之前的字符串
res$ensembl <- sapply( strsplit( rownames(res),split="\\+" ), "[", 1 ) 


#自建函数,只要输入一个长的类似于"ENST00000641515.1|OR4F5-202|OR4F5|2618|protein_coding|"的字符串,
#就可以返回第一个 | 之前的名字
return_desired_position_value<-function(x,myposition){
  strsplit(x,split = "|",fixed = T)[[1]][myposition]
}
return_desired_position_value("ENST00000641515.1|OR4F5-202|OR4F5|2618|protein_coding|",4)



dir.create("G:/r/duqiang_IPF/filtered_genes_from_different_dataset/surval_analysis_for_intersecte_genes")
setwd("G:/r/duqiang_IPF/filtered_genes_from_different_dataset/surval_analysis_for_intersecte_genes")
#yll duqiang 制作geo三个独立数据集IPF基因集合 用于分析某个基因是否与生存期相关THBS2
load("G:/r/duqiang_IPF/surval_analysis_3_independent_dataset_IPF/combined_data_for_surval.RDdata")

getwd()

1#输入想要查询的基因名称或者向量
gene_interested="MMP7"  #输入想要查询的基因名称或者向量
library(stringr)
gene_interested=readClipboard() %>% str_split(pattern = ",",gene_interested)[[1]]
gene_interested= str_split(pattern = " ",gene_interested)[[1]]

2#首先查看基因是否存在数据集中,如果不存在则去掉该基因
table( gene_interested %in% rownames(expr.17077clean) & 
         gene_interested %in% rownames(expr.freibrug.IPF))

gene_interested=gene_interested[which(gene_interested %in% rownames(expr.17077clean) ==TRUE)]

2.2#去掉空字符串
table(nchar(gene_interested)>0)
gene_interested=gene_interested[nchar(gene_interested)>0]



if(1==1){
  3#制作phedata数据用于存活分析
  if(1==1){#制作phedata数据用于存活分析
    for (eachgene in gene_interested) {
      phe.freigbrug[paste0(eachgene)]=ifelse(expr.freibrug.IPF[eachgene,]>median(expr.freibrug.IPF[eachgene,]),
                                             "High",'Low')
    }
    head(phe.freigbrug)
    
    
    for (eachgene in gene_interested) {
      phe.senia[paste0(eachgene)]=ifelse(expr.siena.IPF[eachgene,]>median(expr.siena.IPF[eachgene,]),
                                         "High",'Low')
    }
    head(phe.senia)
    
    for (eachgene in gene_interested) {
      phe.17077[paste0(eachgene)]=ifelse(expr.17077clean[eachgene,]>median(expr.17077clean[eachgene,]),
                                         "High",'Low')
    }
    head(phe.17077)
    
    
  }
  
  
  4###开始合并三个数据集的phe数据
  
  phe_final_3=rbind(phe.freigbrug,phe.senia,phe.17077)
  dim(phe_final_3) #[1] 176  5
  head(phe_final_3)
  
  library(dplyr)
  phe_final_3=phe_final_3 %>% transform(time=as.numeric(time))%>% transform(event=as.numeric(event))
  getwd()
  
  5.#批量基因差异分析
  library(survival)
  library(survminer)
  getwd()
  for (eachgene in names(gene_interested_for_surval_analysis)) {
    p=ggsurvplot(survfit(Surv(time, event)~phe_final_3[,eachgene], 
                         data=phe_final_3), conf.int=F, pval=TRUE)
    
    pdf(paste0(eachgene, "_surval_analysis_for_intersected_genes.pdf"),width = 5, height = 5)
    print(p, newpage = FALSE)
    dev.off()
    
  }
  
}

load("G:/r/duqiang_IPF/surval_analysis_3_independent_dataset_IPF_pval_filtered/surval_genes_filtered_log_rank_p.RData")

getwd()
## 批量生存分析 使用  logrank test 方法
phe=phe_final_3
head(phe)[1:3,1:19]
mySurv=with(phe,Surv(time, event))
log_rank_p=list()

for (eachgene in colnames(phe)[5:ncol(phe)] ) {
  
  data.survdiff=survdiff(mySurv~phe[,eachgene],data=phe)
  p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
  log_rank_p[[eachgene]]=p.val
}
head(log_rank_p)
log_rank_p=unlist(log_rank_p)
log_rank_p=sort(log_rank_p)
head(log_rank_p)
log_rank_p[1000]
table(log_rank_p<0.005)
boxplot(log_rank_p) 
phe$H6PD=ifelse(exprSet['H6PD',]>median(exprSet['H6PD',]),'high','low')
table(phe$H6PD)
ggsurvplot(survfit(Surv(time, event)~MRVI1, data=phe), conf.int=F, pval=TRUE)


log_rank_p
log_rank_p["HNF4A"]
log_rank_p["HNF4G"]
gene_interested=readClipboard()
log_rank_p[gene_interested]

myinterested_gene_pval=sort(log_rank_p[gene_interested])

gene_interested_for_surval_analysis=myinterested_gene_pval[myinterested_gene_pval<0.05]

library(survminer)
library(survival)
ggsurvplot(survfit(Surv(time, event)~CXCL14, data=phe), conf.int=F, pval=TRUE)
ggsurvplot(survfit(Surv(time, event)~TMEM45A, data=phe), conf.int=F, pval=TRUE)
ggsurvplot(survfit(Surv(time, event)~SCP2, data=phe), conf.int=F, pval=TRUE)
ggsurvplot(survfit(Surv(time, event)~ACOX2, data=phe), conf.int=F, pval=TRUE)
ggsurvplot(survfit(Surv(time, event)~CPT1A, data=phe), conf.int=F, pval=TRUE)


names(gene_interested_for_surval_analysis)
data.frame(names(gene_interested_for_surval_analysis),
           row.names = NULL)

getwd()
mydat=data.frame(names(gene_interested_for_surval_analysis),
                 row.names = gene_interested_for_surval_analysis)
openxlsx::write.xlsx(mydat,overwrite = T,
                     file ="G:/r/duqiang_IPF/filtered_genes_from_different_dataset/surval_analysis_for_intersecte_genes/gene_interested_for_surval_analysis.xlsx" )