dplyr is a grammar of data manipulation, providing a consistent set of verbs that help you solve the most common data manipulation challenges:
- mutate() adds new variables that are functions of existing variables.
- select() picks variables based on their names.
- filter() picks cases based on their values.
- summarise() reduces multiple values down to a single summary.
- arrange() changes the ordering of the rows.

必须使用 tidy data(即整洁数据);dplyr 中可以使用 tibble() 创建数据框。
# 管道操作符
# %>%  快捷键:Ctrl+Shift+M
# Attach dplyr up front: it provides the verbs used in these notes and
# re-exports tibble(), as_tibble() and the %>% pipe.
library(dplyr)

# Create a tibble with two integer columns.
df <- tibble(x = 1:3, y = 3:1)
# Convert between a base data frame and a tibble, in both directions.
tib <- as_tibble(mtcars)
# Note: this reassigns `df`, replacing the small tibble created above.
df <- as.data.frame(tib)
# Whole-table summaries of the `age` column of the Orange data set:
# total, mean, variance and standard deviation.
Orange %>%
  summarise(
    a1 = sum(age),
    a2 = mean(age),
    a3 = var(age),
    a4 = sd(age)
  )
# Number of rows for each distinct value of `age`.
count(Orange, age)
# Compute a new column and keep only that column.
# transmute() is superseded in dplyr; mutate(.keep = "none") is the
# recommended equivalent.
mtcars %>%
  mutate(gpm = 1 / mpg, .keep = "none")
## Add new columns to a data frame and chain further operations
# Build example data: five columns of six uniform random numbers each.
df <- tibble(
  x1 = runif(6),
  x2 = runif(6),
  x3 = runif(6),
  x4 = runif(6),
  x5 = runif(6)
)
# Select specific columns, then derive their row-wise sum and mean.
# (The mean was previously computed as (x1 * x3) / 2, i.e. half the product.)
df %>%
  select(x1, x3) %>%
  mutate(sum = x1 + x3, mean = (x1 + x3) / 2)
# Reusing existing names replaces the original x1 and x3. Expressions are
# evaluated in order, so x3 here is computed from the already-updated x1.
df %>%
  select(x1, x3) %>%
  mutate(x1 = x1 + x3, x3 = (x1 * x3) / 2)
# Assigning NULL drops the x3 column.
df %>%
  select(x1, x3) %>%
  mutate(x1 = x1 + x3, x3 = NULL)
# Some details: .keep controls which columns are retained in the output;
# .before / .after control where the new columns are placed and accept
# either a position or a column name.
df %>%
  select(x1, x3) %>%
  mutate(
    sum = x1 + x3,
    mean = (x1 + x3) / 2,
    .keep = "all",
    .before = 1
  )
# Combining mutate() with group_by() and rowwise():
# see the examples below.
# Example data: a character key x1 (three values, two rows each) plus four
# columns of uniform random numbers.
df <- tibble(
  x1 = rep(c("a", "b", "c"), each = 2),
  x2 = runif(6),
  x3 = runif(6),
  x4 = runif(6),
  x5 = runif(6)
)
# group_by() + summarise(): one row of statistics per group.
df %>%
  group_by(x1) %>%
  summarise(m = mean(x2), s = sum(x3))
# Pool several columns into a single vector with c() before averaging.
df %>%
  group_by(x1) %>%
  summarise(m = mean(c(x2, x3, x5)))
# c_across() selects a range of columns (x2 through x5) in one go.
df %>%
  group_by(x1) %>%
  summarise(m = mean(x2), s = sum(c_across(x2:x5)))
# rowwise() makes the following verbs operate on one row at a time;
# similar results can also be obtained with mutate().
df %>%
  rowwise() %>%
  summarise(m = mean(x2), s = sum(x3))
df %>%
  rowwise() %>%
  summarise(m = mean(c(x2, x3, x5)))
df %>%
  rowwise() %>%
  summarise(m = mean(x2), s = sum(c_across(x2:x5)))
# The per-row quantities above, written with mutate() instead.
# Unlike summarise(), mutate() keeps the existing columns alongside the
# new ones.
mutate(df, m = x2, s = x3)
mutate(df, m = (x2 + x3 + x5) / 3)
mutate(df, m = x2, s = x2 + x3 + x4 + x5)
主要函数是 filter(),其中要用到逻辑运算和关系(比较)运算;详见 ?base::Logic 和 ?Comparison。