zl程序教程

您现在的位置是:首页 >  其他

当前栏目

R语言小专题

2023-03-07 09:14:46 时间

一、字符串专题(stringr包)

1)str_length() 统计字符串长度

str_length("hello world")
[1] 11 #这个字符串的长度,包括空格和一些符号在内

2)str_split() 拆分字符串

str_split("hello world"," ")
[[1]]
[1] "hello" "world"

⚠️注意:str_split的第二个参数是用来分割的符号。上面代码中"hello world"以空格分隔,因此输入" ";同样也可以换成其他符号。

> y = c("jimmy 150","nicker 140","tony 152") 
> str_split(y," ") #按照空格进行拆分,会变成list格式
[[1]]
[1] "jimmy" "150"  

[[2]]
[1] "nicker" "140"  

[[3]]
[1] "tony" "152"


> str_split(y,",")  #按照逗号进行拆分
[[1]]
[1] "jimmy 150"

[[2]]
[1] "nicker 140"

[[3]]
[1] "tony 152"

> z<-str_split(y," ",simplify = T) #加上simplity参数后,输出的结果变成矩阵
> z     
      [,1]     [,2] 
[1,] "jimmy"  "150"
[2,] "nicker" "140"
[3,] "tony"   "152"
> class(z)
[1] "matrix" "array" 

3)str_sub() 按位置取字符串

x <- "The birch canoe slid on the smooth planks."
str_sub(x,5,9) #取x字符串第五到第九位
[1] "birch"

4)str_detect() 查找字符(检测是否包含指定模式)

x2 = str_split(x," ")[[1]];x2
[1] "The"     "birch"   "canoe"   "slid"  
[5] "on"      "the"     "smooth"  "planks."

str_detect(x2,"h")
[1]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE
[8] FALSE

根据搜索的内容,对向量中的每个元素返回TRUE或FALSE(逻辑向量)

5)str_replace() / str_replace_all() 字符替换

 x2
[1] "The"     "birch"   "canoe"   "slid"   
[5] "on"      "the"     "smooth"  "planks."
 str_replace(x2,"o","A") #同一个字符串只替换了一次
 [1] "The"     "birch"   "canAe"   "slid"  
 [5] "An"      "the"     "smAoth"  "planks."
 
 str_replace_all(x2,"o","A") #全部替换
 [1] "The"     "birch"   "canAe"   "slid"   
 [5] "An"      "the"     "smAAth"  "planks."

6) str_remove() / str_remove_all () 字符删除

> x
[1] "The birch canoe slid on the smooth planks."
> str_remove(x,"o") #只删一个字符
[1] "The birch cane slid on the smooth planks."

> str_remove_all(x,"o")
[1] "The birch cane slid n the smth planks."

二、数据框专题

 #数据处理
 test <- iris[c(1:2,51:52,101:102),]
 rownames(test) =NULL # 去掉行名,NULL是“什么都没有”
 test

1)arrange() 排序

library(dplyr)
arrange(test, Sepal.Length) #从小到大
arrange(test, desc(Sepal.Length)) #从大到小

2)distinct()去重复

distinct(test,Species,.keep_all = T)  #把Species列的重复去掉

3)数据框新增一列

mutate(test, new = Sepal.Length * Sepal.Width)
test$new<-test$Sepal.Length*test$Sepal.Width

‼️‼️管道符的妙用-----%>%

x = iris %>%   filter(Sepal.Width>3) %>%   select(Sepal.Length,Sepal.Width)%>%  arrange(Sepal.Length)

从iris数据集出发——筛选出Sepal.Width大于3的行——只保留Sepal.Length和Sepal.Width两列——按Sepal.Length从小到大排序——最后把结果赋值给x

三、条件语句和循环语句专题

1)if 条件语句

基本格式:

if (i>1) {
  print("+")
} else {
  print("-")
}

2)‼️重点函数:ifelse()

ifelse(x,yes,no)

x:逻辑值或逻辑向量

yes:逻辑值为true时的返回值

no:逻辑值是false时的返回值

 x = rnorm(3)
 x
 [1]  0.9616716 -0.1292150  1.7251983
 ifelse(x>0,"+","-")
 [1] "+" "-" "+"

⚠️ifelse()和str_detect()函数连用的超牛用途

samples = c("tumor1","tumor2","tumor3","normal1","normal2","normal3")
 k1 = str_detect(samples,"tumor");k1
 [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE
 ifelse(k1,"tumor","normal")
 [1] "tumor"  "tumor"  "tumor"  "normal" "normal"
 [6] "normal"

(以后可以方便对数据进行分组)

3)多个条件

i=0
if (i>0) {
  print("+")
} else if (i==0) {
  print("0")
} else if (i<0) {
  print("-")
}
[1] "0"

#当然也可以使用ifelse函数
ifelse(i>0,'+',ifelse(i<0,'-','0'))
[1] "0"

4)for循环

#元素循环
x <- c(5,6,0,3)
s=0
for (i in x){
  s=s+i
  print(c(i,s))
}
[1]  5 5
[1]  6 11
[1]  0 11
[1]  3 14

#下标循环
x <- c(5,6,0,3)
s = 0
for (i in 1:length(x)){
  s=s+x[[i]]
  print(c(x[[i]],s))
}
[1]  5 5
[1]  6 11
[1]  0 11
[1]  3 14

#储存结果
s = 0 #上面s已经变成14,重新设置一下
result = list()
for(i in 1:length(x)){
  s=s+x[[i]]
  result[[i]] = c(x[[i]],s)
}
result
[[1]]
[1] 5 5

[[2]]
[1]  6 11

[[3]]
[1]  0 11

[[4]]
[1]  3 14

四、隐式循环

1)apply(x,margin,function)

其中x代表数据框或矩阵,margin=1代表行;margin=2代表列;function表示对行或列采取的函数

test<- iris[1:6,1:4]
apply(test, 2, mean) #计算出每列的平均值
Sepal.Length  Sepal.Width Petal.Length    Petal.Width
 4.9500000    3.3833333    1.4500000     0.2333333 

2)对列表中的元素进行操作的 lapply()

test <- list(x = 36:33,y = 32:35,z = 30:27);test
$x
[1] 36 35 34 33
$y
[1] 32 33 34 35
$z
[1] 30 29 28 27

lapply(test,mean)
$x
[1] 34.5
$y
[1] 33.5
$z
[1] 28.5

3)简化的隐式循环

(由于lapply输出的格式也是列表不便于观看,因此可以使用sapply函数)

sapply(test,mean)  #输出形式是带名字的向量(每个元素返回多个值时才会简化成矩阵)
 x    y    z
  34.5 33.5 28.5 

五、数据框的链接 (dplyr包)

数据设置

 test1 <- data.frame(name = c('jimmy','nicker','Damon','Sophie'), blood_type = c("A","B","O","AB"))
 test1   
    name         blood_type
 1  jimmy            A
 2  nicker           B
 3  Damon            O
 4 Sophie            AB
 
 test2 <- data.frame(name = c('Damon','jimmy','nicker','tony'),group = c("group1","group1","group2","group2"), vision = c(4.2,4.3,4.9,4.5))
 test2   
  name       group vision
  1  Damon  group1    4.2
  2  jimmy  group1    4.3
  3 nicker  group2    4.9
  4  tony   group2    4.5
  > library(dplyr)

1)inner_join() 根据共同的列名取交集再合并

inner_join(test1,test2,by="name")    
name          blood_type  group   vision
1  jimmy          A       group1    4.3
2 nicker          B       group2    4.9
3  Damon          O       group1    4.2

2)left_join() 左连接(保留左边数据所有,相同的会被合并,空的数据为NA)

left_join(test1,test2,by="name")    
name          blood_type  group   vision
1  jimmy          A       group1    4.3
2 nicker          B       group2    4.9
3  Damon          O       group1    4.2
4 Sophie         AB        <NA>     NA

3)right_join( )右连接 (保留右边数据所有,相同的会被合并,空的数据为NA)

right_join(test1,test2,by="name")   
 name         blood_type  group    vision
 1  jimmy          A      group1    4.3
 2 nicker          B      group2    4.9
 3  Damon          O      group1    4.2
 4   tony         <NA>    group2    4.5

4)full_join() 全连接(保留两边的所有数据,按指定的列连接,匹配不到的位置为NA)

full_join(test1,test2,by="name")  
  name          blood_type  group    vision
  1  jimmy          A       group1    4.3
  2 nicker          B       group2    4.9
  3  Damon          O       group1    4.2
  4 Sophie         AB        <NA>     NA
  5   tony         <NA>     group2    4.5

5)semi_join ()半连接 (从前一个数据中选出在后一个数据里能匹配到的行,只保留前一个数据的列)

semi_join(test1,test2,by="name")    
name         blood_type
1  jimmy          A
2 nicker          B
3  Damon          O

6)anti_join( ) 反连接 (输出前一个数据中在后一个数据里匹配不到的行)

anti_join(test1,test2,by="name")   
     name     blood_type
 1  Sophie       AB