full_name <- c("id", "name", "parent_id", "initial", "initials",
"pinyin", "extra", "suffix", "code", "area_code", "order")
url_csv <- "https://raw.githubusercontent.com/eduosi/district/master/district-full.csv"
district <- readr::read_delim(
url(url_csv),
col_types = cols(.default = "c"),
col_names = F, delim = "\t") %>%
as_tibble() %>%
rename_all(., ~all_of(full_name))
knitr::kable(head(district), caption = "地区区号信息")1 R函数避坑
1.1 读取文件
1.1.1 如何避免readr对列格式自作主张
默认情况下,readr包读取文件函数会自动猜测数据列的格式,体现在col_types里。
如下所示,我们使用参数col_types = cols(.default = "c"),就可以强制读取所有列为character格式。
1.1.2 How to read the .sav data format
- Reading SPSS files: to avoid encoding errors (multi-byte issues with Chinese text, etc.), prefer
foreign::read.spss() over haven::read_sav().
# this raises a warning (caught below)
file_tar <- here("data/case-3.6-computer-sales.sav")
tryCatch(df_computer <- haven::read_sav(file_tar, encoding = "UTF-8", .name_repair = "unique"),
         warning = function(c) "warning")
# this reads and displays correctly
file_tar <- here("data/case-3.6-computer-sales.sav")
df_computer <- foreign::read.spss(file_tar, to.data.frame = TRUE)
re-encoding from CP936
knitr::kable(head(df_computer), align = "c")
| 月份 | 北京 | 长春 | 南京 | 郑州 | 武汉 | 广州 | 成都 | 昆明 | 兰州 | 西安 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 49 | 70 | 76 | 57 | 77 | 72 | 79 | 65 | 51 | 67 |
| 2 | 41 | 68 | 71 | 57 | 75 | 80 | 83 | 65 | 41 | 67 |
| 3 | 47 | 50 | 77 | 68 | 81 | 80 | 81 | 58 | 49 | 74 |
| 4 | 50 | 39 | 72 | 67 | 75 | 84 | 79 | 61 | 46 | 70 |
| 5 | 55 | 56 | 68 | 63 | 71 | 83 | 75 | 58 | 41 | 58 |
| 6 | 57 | 54 | 73 | 57 | 74 | 87 | 82 | 72 | 43 | 42 |
1.1.3 How to read JSON correctly
R packages needed:
library("jsonlite")
library("tidyjson")
The key steps are:
- use map_if() to convert data-frame elements into lists;
- apply unnest() repeatedly. To handle column-name conflicts, set names_repair = tidyr_legacy.
#download.file("http://arxiv.org/pdf/1403.2805.pdf", "data-raw/pdf/1403.2805.pdf", mode = "wb")
# Table of contents
file_tar <- here("data-raw/pdf/1403.2805.pdf")
toc <-pdftools::pdf_toc(file_tar)
# Show as JSON
text_json<- jsonlite::toJSON(toc, auto_unbox = TRUE, pretty = TRUE)
# json style
y_list <- jsonlite::fromJSON(text_json)
# pure list
y_list_new <- map_if(y_list, is.data.frame, list)
# flatten tibble
y_df_new <- as_tibble(y_list_new) %>%
unnest(title,names_repair = tidyr_legacy) %>%
unnest(children, names_repair = tidyr_legacy) %>%
unnest(children, names_repair = tidyr_legacy) %>%
mutate(children = map(children, as_tibble))%>%
unnest(children, names_repair = tidyr_legacy,
keep_empty = T) %>%
select(tidyselect::matches("title\\d{1}", perl =T))
DT::datatable(
y_df_new,
options = list(
dom = "tip",
pageLength = 10)
)1.2 操作日期对象
1.2.1 Numeric to date
Rule of thumb: when converting to dates, mind the default origin date (origin =), which differs across environments. Stata's default origin is origin = "1960-01-01", while R's default is origin = "1970-01-01".
First, look at a common piece of date-object code:
start_d <- lubridate::as_date("1989-01-01")
end_d <- as_date("2006-12-31")
tot_d <- lubridate::as.duration(interval(start_d, end_d)) %/%
  as.duration(days(1)) + 1
tot_y <- as.duration(interval(start_d, end_d)) %/%
  as.duration(years(1)) + 1
The results above show:
- The monitoring station operated for at least tot_y = 18 years, with no interruption of operation longer than two months.
- The total observation span is tot_d = 6574 days, from start_d = 1989-01-01 to end_d = 2006-12-31.
Now look at converting numeric data to dates:
# random select 10 start dates
start_dates <- c(12274, 12614, 12717,
                 12804, 13061, 13344,
                 13712, 14471, 14910,
                 15678)
## default origin specification
### origin = "1970-01-01"
start_dates1 <- lubridate::as_date(
  start_dates,
  origin = lubridate::origin)
## custom origin specification
start_dates2 <- lubridate::as_date(
  start_dates,
  origin = "1960-01-01")
We get different results:
- Under the default (R) origin, the first date in start_dates1 is 2003-08-10.
- Under the specified (Stata) origin, the first date in start_dates2 is 1993-08-09.
Going further:
# option 1
lubridate::as_date(0, origin = lubridate::origin) # "1970-01-01"
[1] "1970-01-01"
# option 2
lubridate::as_date(0, origin = "1960-01-01")
[1] "1960-01-01"
Also note the following behavior: fractional day values are neither rounded to the nearest day nor rounded up; as the output shows, they are simply truncated, in the manner of floor():
# note the difference
lubridate::as_date(10593 + 4*365, origin = "1960-01-01")
[1] "1992-12-31"
lubridate::as_date(10593 + 4*365 + 0.6, origin = "1960-01-01")
[1] "1992-12-31"
lubridate::as_date(10593 + 4*365 + 0.4, origin = "1960-01-01")
[1] "1992-12-31"
Finally, when R reads a Stata data file (.dta) that contains dates:
file_url <- "D:/ProgramData/AER20090377_FinalData.dta"
df_raw <- foreign::read.dta(file_url, convert.dates = TRUE)
Here foreign::read.dta converts dates using Stata's default origin = "1960-01-01", not R's default origin = "1970-01-01".
1.2.2 Date to weekday and week number
Rule of thumb: cultural conventions matter here.
lubridate::wday() starts the week on Sunday by default (week_start = 7); in East Asian convention the week starts on Monday, so remember to set week_start = 1.
# some sunday
one_sunday <- as.Date("2022-06-12")
# weekday of today
# week_start: 1 = Monday, while default 7 = Sunday
## show weekday label
lubridate::wday(one_sunday, label = TRUE)
[1] 周日
Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## show weekday number with `week_start = 1`
lubridate::wday(one_sunday, week_start = 1)
[1] 7
## show weekday number with `week_start = 7`
lubridate::wday(one_sunday, week_start = 7)
[1] 1
The same difference carries over to converting dates to week numbers:
check_dates <- as.Date(
  c("2022-04-10", "2022-04-11",
    "2022-05-21", "2022-05-22",
    "2022-06-12",
    "2022-06-19"
  )
)
tbl_date <- tibble(dates = check_dates) %>%
  mutate(weekday_label = lubridate::wday(dates, label = TRUE),
         weekday_us = lubridate::wday(dates),
         weekday_iso = lubridate::wday(dates, week_start = 1),
  ) %>%
  mutate(weeks_us = week(dates),
         weeks_iso = isoweek(dates))
1.2.3 Date to character
time_initor <- lubridate::as_datetime("2022-12-16 13:30:00", tz = "Asia/Shanghai")
duration <- 30 # minutes
time_start <- strftime(time_initor, format = "%H:%M")
time_end <- strftime(time_initor + 60*duration, format = "%H:%M")
cat(paste0("initiate time (CST): ", time_initor,
           "\n duration: ", duration, " minutes.",
           "\n start hm: ", time_start,
           "\n end hm: ", time_end))
initiate time (CST): 2022-12-16 13:30:00
 duration: 30 minutes.
 start hm: 13:30
 end hm: 14:00
1.3 Data frame operations
1.3.1 Assigning names dynamically with setNames
(demo_vct <- c(1, 2))
[1] 1 2
(demo_vct <- setNames(demo_vct, paste0("V", 1:2)))
V1 V2
1 2
(demo_list <- list("a", "b", "c"))
[[1]]
[1] "a"
[[2]]
[1] "b"
[[3]]
[1] "c"
(demo_list <- setNames(demo_list, paste0("X", 1:3)))
$X1
[1] "a"
$X2
[1] "b"
$X3
[1] "c"
1.3.2 Batch-renaming columns with rename_with()
Scenario: for a multi-column dataframe (figure @ref(fig:before-rename-iris)), we want to batch-rename columns according to a specified naming pattern (figure @ref(fig:after-rename-iris)).
Key point: use rename_with(., .fn = ~paste0(., to_app), .cols = all_of(cols)).
tbl_demo <- iris %>%
  select(Petal.Length:Species) %>%
  head(10)
DT::datatable(tbl_demo)
cols <- c("Petal.Length", "Petal.Width")
to_app <- "_xxx"
tbl_demo %>%
  rename_with(
    .fn = ~paste0(., to_app),
    .cols = all_of(cols)
  ) %>%
  DT::datatable()
References:
- Q&A: How to rename selected columns using dplyr with new column names as strings (see link)
1.3.3 Prefixed/suffixed column names with unnest()/nest()
Scenario: for a dataframe with list columns (figure @ref(fig:ls-dt)), we want to unnest it while keeping the link between the parent list names and the child dataframe column names; see figure @ref(fig:ls-dt-unnest) for the desired naming.
Key point: use unnest(., names_sep = "_").
library(dplyr, warn.conflicts = FALSE)
msd_c <- function(x) c(mn = mean(x), sd = sd(x))
msd_df <- function(x) bind_rows(c(mn = mean(x), sd = sd(x)))
tbl_demo <- iris %>%
  select(Petal.Length:Species) %>%
  group_by(Species) %>%
  tidyr::nest() %>%
  mutate(
    Petal.Length = purrr::map(data, ~ msd_df(.$Petal.Length)),
    Petal.Width = purrr::map(data, ~ msd_df(.$Petal.Width))
  ) %>%
  select(-data)
DT::datatable(tbl_demo)
tbl_unnest <- tbl_demo %>%
  tidyr::unnest(c(Petal.Length, Petal.Width), names_sep = "_")
DT::datatable(tbl_unnest) %>%
  formatRound(columns = 2:5, digits = 4)
References:
- Q&A: tidyr unnest, prefix column names with nested name during unnesting (see link)
1.3.4 Using prefixed/suffixed column names directly in pivot_longer()
Scenario: for a dataframe whose columns follow a regular naming pattern (prefix or suffix; see figure @ref(fig:ls-dt-unnest) in the previous section), we want to pivot_longer it while exploiting those prefix/suffix relations; the desired result is shown in figure @ref(fig:dt-pivot-name-sep).
Key point: use pivot_longer(., names_to = c(".value", "stat"), names_sep = "_").
tbl_pivot <- tbl_unnest %>%
  pivot_longer(
    cols = starts_with('Petal'),
    names_to = c(".value", "stat"),
    names_sep = "_")
tbl_pivot %>%
  DT::datatable() %>%
  formatRound(columns = 3:4, digits = 4)
References:
- Q&A: How to pivot_longer a set of multiple columns? and How to go back from that long format to original wide? (see link)
1.3.5 Row-wise operations: rowwise() with c_across()
# demo data (hypothetical): two answer columns coded "yes"/"no"
df <- tibble(q1 = c("yes", "no", "yes"),
             q2 = c("yes", "yes", "no"))
text_cols <- c("q1", "q2")
df %>%
  rowwise() %>% # operate row by row
  mutate(sum_yes = sum(c_across(all_of(text_cols)) == "yes")) %>%
  ungroup() # rowwise() essentially behaves like group_by(), with one group per row
1.3.6 Long-run group indexing with rleid()
Goal: assign sequential long-run group indices to the identified target groups ("B", "D") in the demo dataset.
library(data.table)
Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:dplyr':
between, first, last
The following object is masked from 'package:purrr':
transpose
DT <- data.table(
  grp = rep(c("A", "B", "C", "D", "A", "F", "B", "E"), c(2, 2, 3, 2, 2, 5, 4, 5)),
  value = 1:25,
  tar = rep(c(FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE), c(2, 2, 3, 2, 2, 5, 4, 5)))
| grp | value | tar |
|---|---|---|
| A | 1 | FALSE |
| A | 2 | FALSE |
| B | 3 | TRUE |
| B | 4 | TRUE |
| C | 5 | FALSE |
| C | 6 | FALSE |
| C | 7 | FALSE |
| D | 8 | TRUE |
| D | 9 | TRUE |
| A | 10 | FALSE |
| A | 11 | FALSE |
| F | 12 | TRUE |
| F | 13 | TRUE |
| F | 14 | TRUE |
| F | 15 | TRUE |
| F | 16 | TRUE |
| B | 17 | FALSE |
| B | 18 | FALSE |
| B | 19 | FALSE |
| B | 20 | FALSE |
| E | 21 | FALSE |
| E | 22 | FALSE |
| E | 23 | FALSE |
| E | 24 | FALSE |
| E | 25 | FALSE |
Given the target-group indicator tar (runs of "B" and "D"), dplyr::cur_group_id() produces an ordinary group id grp_id (not what we want here), while data.table::rleid(tar) produces the long-run group id grp_rleid (what we want).
dt_group <- DT %>%
  group_by(tar) %>%
  mutate(grp_id = dplyr::cur_group_id()) %>%
  ungroup() %>%
  mutate(grp_rleid = data.table::rleid(tar))
| grp | value | tar | grp_id | grp_rleid |
|---|---|---|---|---|
| A | 1 | FALSE | 1 | 1 |
| A | 2 | FALSE | 1 | 1 |
| B | 3 | TRUE | 2 | 2 |
| B | 4 | TRUE | 2 | 2 |
| C | 5 | FALSE | 1 | 3 |
| C | 6 | FALSE | 1 | 3 |
| C | 7 | FALSE | 1 | 3 |
| D | 8 | TRUE | 2 | 4 |
| D | 9 | TRUE | 2 | 4 |
| A | 10 | FALSE | 1 | 5 |
| A | 11 | FALSE | 1 | 5 |
| F | 12 | TRUE | 2 | 6 |
| F | 13 | TRUE | 2 | 6 |
| F | 14 | TRUE | 2 | 6 |
| F | 15 | TRUE | 2 | 6 |
| F | 16 | TRUE | 2 | 6 |
| B | 17 | FALSE | 1 | 7 |
| B | 18 | FALSE | 1 | 7 |
| B | 19 | FALSE | 1 | 7 |
| B | 20 | FALSE | 1 | 7 |
| E | 21 | FALSE | 1 | 7 |
| E | 22 | FALSE | 1 | 7 |
| E | 23 | FALSE | 1 | 7 |
| E | 24 | FALSE | 1 | 7 |
| E | 25 | FALSE | 1 | 7 |
Further, a pair of expressions yields group indices only for the target groups ("B", "D"), with every non-target row assigned to baseline group 1. The expression factor((FALSE^!tar)*grp_rleid) labels the target runs with their run id and all non-target rows with 0 (variable grp_factor); as.integer(grp_factor) then converts those labels into consecutive indices (variable grp_integer). This achieves our goal.
dt_group %>%
  mutate(
    grp_factor = factor((FALSE^!tar)*grp_rleid),
    grp_integer = as.integer(grp_factor)
  ) %>%
  kable(align = "c")
| grp | value | tar | grp_id | grp_rleid | grp_factor | grp_integer |
|---|---|---|---|---|---|---|
| A | 1 | FALSE | 1 | 1 | 0 | 1 |
| A | 2 | FALSE | 1 | 1 | 0 | 1 |
| B | 3 | TRUE | 2 | 2 | 2 | 2 |
| B | 4 | TRUE | 2 | 2 | 2 | 2 |
| C | 5 | FALSE | 1 | 3 | 0 | 1 |
| C | 6 | FALSE | 1 | 3 | 0 | 1 |
| C | 7 | FALSE | 1 | 3 | 0 | 1 |
| D | 8 | TRUE | 2 | 4 | 4 | 3 |
| D | 9 | TRUE | 2 | 4 | 4 | 3 |
| A | 10 | FALSE | 1 | 5 | 0 | 1 |
| A | 11 | FALSE | 1 | 5 | 0 | 1 |
| F | 12 | TRUE | 2 | 6 | 6 | 4 |
| F | 13 | TRUE | 2 | 6 | 6 | 4 |
| F | 14 | TRUE | 2 | 6 | 6 | 4 |
| F | 15 | TRUE | 2 | 6 | 6 | 4 |
| F | 16 | TRUE | 2 | 6 | 6 | 4 |
| B | 17 | FALSE | 1 | 7 | 0 | 1 |
| B | 18 | FALSE | 1 | 7 | 0 | 1 |
| B | 19 | FALSE | 1 | 7 | 0 | 1 |
| B | 20 | FALSE | 1 | 7 | 0 | 1 |
| E | 21 | FALSE | 1 | 7 | 0 | 1 |
| E | 22 | FALSE | 1 | 7 | 0 | 1 |
| E | 23 | FALSE | 1 | 7 | 0 | 1 |
| E | 24 | FALSE | 1 | 7 | 0 | 1 |
| E | 25 | FALSE | 1 | 7 | 0 | 1 |
The heart of the matter is still the long-run indexing call data.table::rleid(tar): starting from grp_rleid we can reach the target result quickly, for instance replacing the rather cryptic expression above with the following tidyverse syntax:
dt_group %>%
  mutate(
    grp_factor_tidy = as.factor(ifelse(tar, grp_rleid, 0)),
    grp_integer_tidy = as.integer(grp_factor_tidy)
  ) %>%
  kable(align = "c")
| grp | value | tar | grp_id | grp_rleid | grp_factor_tidy | grp_integer_tidy |
|---|---|---|---|---|---|---|
| A | 1 | FALSE | 1 | 1 | 0 | 1 |
| A | 2 | FALSE | 1 | 1 | 0 | 1 |
| B | 3 | TRUE | 2 | 2 | 2 | 2 |
| B | 4 | TRUE | 2 | 2 | 2 | 2 |
| C | 5 | FALSE | 1 | 3 | 0 | 1 |
| C | 6 | FALSE | 1 | 3 | 0 | 1 |
| C | 7 | FALSE | 1 | 3 | 0 | 1 |
| D | 8 | TRUE | 2 | 4 | 4 | 3 |
| D | 9 | TRUE | 2 | 4 | 4 | 3 |
| A | 10 | FALSE | 1 | 5 | 0 | 1 |
| A | 11 | FALSE | 1 | 5 | 0 | 1 |
| F | 12 | TRUE | 2 | 6 | 6 | 4 |
| F | 13 | TRUE | 2 | 6 | 6 | 4 |
| F | 14 | TRUE | 2 | 6 | 6 | 4 |
| F | 15 | TRUE | 2 | 6 | 6 | 4 |
| F | 16 | TRUE | 2 | 6 | 6 | 4 |
| B | 17 | FALSE | 1 | 7 | 0 | 1 |
| B | 18 | FALSE | 1 | 7 | 0 | 1 |
| B | 19 | FALSE | 1 | 7 | 0 | 1 |
| B | 20 | FALSE | 1 | 7 | 0 | 1 |
| E | 21 | FALSE | 1 | 7 | 0 | 1 |
| E | 22 | FALSE | 1 | 7 | 0 | 1 |
| E | 23 | FALSE | 1 | 7 | 0 | 1 |
| E | 24 | FALSE | 1 | 7 | 0 | 1 |
| E | 25 | FALSE | 1 | 7 | 0 | 1 |
1.4 Data cleaning
1.4.1 How to generate repeated data correctly with rep()
Scenario: generate repeated data, given a vector of values to repeat ("char vec") and a vector of repetition counts ("times vec").
- Pitfall-prone approach 1: plain rep().
brand<- c("果汁","矿泉水","绿茶","其他","碳酸饮料")
reps <- c(6L, 10L,11L, 8L,15L)
rep(brand, reps) [1] "果汁" "果汁" "果汁" "果汁" "果汁" "果汁"
[7] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水"
[13] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "绿茶" "绿茶"
[19] "绿茶" "绿茶" "绿茶" "绿茶" "绿茶" "绿茶"
[25] "绿茶" "绿茶" "绿茶" "其他" "其他" "其他"
[31] "其他" "其他" "其他" "其他" "其他" "碳酸饮料"
[37] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[43] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[49] "碳酸饮料" "碳酸饮料"
- Pitfall-prone approach 2: mapply() + rep(). This one is a bit more involved.
drink <- mapply(FUN = rep, x = brand, times = reps) %>%
  unlist()
names(drink) <- NULL
drink
 [1] "果汁" "果汁" "果汁" "果汁" "果汁" "果汁"
[7] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水" "矿泉水"
[13] "矿泉水" "矿泉水" "矿泉水" "矿泉水" "绿茶" "绿茶"
[19] "绿茶" "绿茶" "绿茶" "绿茶" "绿茶" "绿茶"
[25] "绿茶" "绿茶" "绿茶" "其他" "其他" "其他"
[31] "其他" "其他" "其他" "其他" "其他" "碳酸饮料"
[37] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[43] "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料" "碳酸饮料"
[49] "碳酸饮料" "碳酸饮料"
1.4.2 How to check a dataset for missing values
- Problem: a regression call lm(formula = lm.mod, data = lm.dt) issues a warning, indicating missing values in the dataset lm.dt:
12 observation deleted due to missingness
- Fix: use stats::complete.cases(lm.dt) to work with the complete rows only, or inspect the rows containing missing data with which(!stats::complete.cases(lm.dt)). A sketch follows.
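A minimal sketch using the built-in airquality dataset, which contains NAs; the model and variable names here are illustrative placeholders, not the original lm.mod/lm.dt:
# rows that contain at least one missing value
which(!stats::complete.cases(airquality))
# keep only complete rows before fitting, so lm() drops nothing silently
aq_complete <- airquality[stats::complete.cases(airquality), ]
lm(Ozone ~ Wind + Temp, data = aq_complete)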
1.4.3 How to handle invisible Unicode characters
When cleaning text data we frequently have to deal with invisible Unicode characters, such as tabs, line breaks, and ideographic spaces. There are in fact far more of these invisible code points than one might expect (see the list below).
U+0009 CHARACTER TABULATION # tab
U+0020 SPACE
U+00A0 NO-BREAK SPACE (not matched by \s) # the invisible space in the Chinese-text example below
U+1680 OGHAM SPACE MARK
U+2000 EN QUAD
U+2001 EM QUAD
U+2002 EN SPACE
U+2003 EM SPACE
U+2004 THREE-PER-EM SPACE
U+2005 FOUR-PER-EM SPACE
U+2006 SIX-PER-EM SPACE
U+2007 FIGURE SPACE
U+2008 PUNCTUATION SPACE
U+2009 THIN SPACE
U+200A HAIR SPACE
U+202F NARROW NO-BREAK SPACE
U+205F MEDIUM MATHEMATICAL SPACE
U+3000 IDEOGRAPHIC SPACE
We then need to identify which Unicode code points the invisible characters in a string actually are, which calls for a conversion with the iconv() function.
mystr <- "这是一个\u00a0不可见的中文空格符"
cat("这是一个\u00a0不可见的中文空格符")这是一个 不可见的中文空格符
# not working
str_replace(mystr, " ","")[1] "这是一个 不可见的中文空格符"
# detect
iconv(mystr,
from = "UTF-8", to = "ASCII",
sub = "Unicode") # ! important[1] "<U+8FD9><U+662F><U+4E00><U+4E2A><U+00A0><U+4E0D><U+53EF><U+89C1><U+7684><U+4E2D><U+6587><U+7A7A><U+683C><U+7B26>"
# now it works
str_replace(mystr, "\u00a0","")[1] "这是一个不可见的中文空格符"
1.4.4 String splitting
Scenario: split a string (or a text paragraph) while keeping the delimiters, because the delimiters carry information of value for later analysis.
Key implementation points:
- Use the right regular-expression flavor. R offers two main flavors: the default extended (POSIX) regex and Perl-compatible regex (perl = TRUE). There are also literal patterns (fixed = TRUE).
- Use the base function base::strsplit().
The code example below splits a whole paragraph on dollar signs while keeping the delimiter (\$).
# minimal example
txt <- "One way to interpret the CEF $m(x)=\\mathbb{E}[Y \\mid X=x]$ is in terms of how marginal changes in the regressors $X$ imply changes in the conditional expectation of the response variable $Y$."
txt
[1] "One way to interpret the CEF $m(x)=\\mathbb{E}[Y \\mid X=x]$ is in terms of how marginal changes in the regressors $X$ imply changes in the conditional expectation of the response variable $Y$."
The regular expression below uses a lookahead assertion:
# split paragraph
split_raw <- strsplit(
  txt,
  split = "(?=[\\$])",
  perl = TRUE
) %>%
  unlist()
split_raw
 [1] "One way to interpret the CEF "
[2] "$"
[3] "m(x)=\\mathbb{E}[Y \\mid X=x]"
[4] "$"
[5] " is in terms of how marginal changes in the regressors "
[6] "$"
[7] "X"
[8] "$"
[9] " imply changes in the conditional expectation of the response variable "
[10] "$"
[11] "Y"
[12] "$"
[13] "."
Note that stringr::str_split() does not produce the same result:
stringr::str_split(string = txt,
                   pattern = "(?=[\\$])")
[[1]]
[1] "One way to interpret the CEF "
[2] "$m(x)=\\mathbb{E}[Y \\mid X=x]"
[3] "$ is in terms of how marginal changes in the regressors "
[4] "$X"
[5] "$ imply changes in the conditional expectation of the response variable "
[6] "$Y"
[7] "$."
1.4.5 Odd and even positions
Scenario: in text processing, math environments are delimited by paired double dollar signs ($$ ... $$, display math) or paired single dollar signs ($ ... $, inline math). Paired delimiters mean that odd-numbered occurrences open an environment and even-numbered ones close it, so locating the odd/even delimiter positions is an important part of string processing.
Key points:
- base::seq_len()
- the modulo operator %% 2
We continue with the text example from section @ref(string-split) (string splitting).
# demo txt lines
split_raw
 [1] "One way to interpret the CEF "
[2] "$"
[3] "m(x)=\\mathbb{E}[Y \\mid X=x]"
[4] "$"
[5] " is in terms of how marginal changes in the regressors "
[6] "$"
[7] "X"
[8] "$"
[9] " imply changes in the conditional expectation of the response variable "
[10] "$"
[11] "Y"
[12] "$"
[13] "."
# find the line positions of the delimiters
(lines <- grep("\\$", split_raw))
[1] 2 4 6 8 10 12
# modulo division
(row_mod <- seq_len(length(lines)) %% 2)
 [1] 1 0 1 0 1 0
# start row (indexing odd position)
(lines_star <- lines[row_mod == 1])
[1] 2 6 10
# end row (indexing even position)
(lines_end <- lines[row_mod == 0])
 [1] 4 8 12
We can of course also write our own helper functions:
odd <- function(x) x %% 2 != 0
even <- function(x) x %% 2 == 0
evenb <- function(x) !odd(x)
# odd position
odd(1:10)
 [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
# even position
even(1:10)
 [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
1.4.6 Letters to numbers with chartr()
The base function base::chartr() does the job:
char2num <- function(x, char = "ABCDE", num = "12345") {
  out <- base::chartr(old = char, new = num, x) %>%
    as.numeric()
  out <- ifelse(is.na(out), 0, out)
  return(out)
}
char2num("BADDBDC")
[1] 2144243
1.5 Data transformation
1.5.1 Rounding versus displaying decimal places
Rule of thumb: (a) you have to balance what the reader's eye sees against the "true" values in the data; rounding with round() simplifies the visual result but inevitably loses information. (b) In most cases, be wary of rounding with round() and prefer formatting the displayed decimals with formatC().
Typical scenario: how do we keep the per-category values of a stacked ggplot bar chart consistent with the total computed in inline code?
Use janitor::round_half_up(value, digits = 0) for true "round half up" behavior, in both places:
- When building the ggplot, label the bar segments in geom_text() with label = janitor::round_half_up(value, digits = 0).
- In the inline code of the text, apply the same rounding before summing the category values, e.g. dt %>% mutate(value = janitor::round_half_up(value, digits = 0)) %>% summarize(total = sum(value)).
After these two steps, the figures will agree! (The note below shows why plain round() is not enough.)
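Why not plain round()? Base R follows the IEC 60559 "round half to even" rule, while janitor::round_half_up() always rounds .5 away from zero; a minimal illustration (my example, not from the original text):
# base round(): half to even; janitor::round_half_up(): half up
round(c(0.5, 1.5, 2.5))
[1] 0 2 2
janitor::round_half_up(c(0.5, 1.5, 2.5), digits = 0)
[1] 1 2 3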
require(janitor)
require(kableExtra)
Loading required package: kableExtra
Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':
    group_rows
df_test <- read_rds(here("data/test-round.rds"))
Below is a typical table output (values shown rounded to integers):
df_test %>%
  mutate(year = as.character(year)) %>%
  janitor::adorn_totals() %>%
  kable(align = "r",
        caption = "kable with row sum")
| region_pro | year | vars | value |
|---|---|---|---|
| 非旱区 | 2019 | 教育 | 19673.74 |
| 非旱区 | 2019 | 科学技术 | 4371.65 |
| 非旱区 | 2019 | 农林水 | 11655.07 |
| Total | - | - | 35700.46 |
If you look closely, you will see that the displayed total of the value column, 35700, does not equal the direct sum of the displayed row values: 19674 + 4372 + 11655 = 35701. (From a publisher's copy editor's point of view, this is an error!)
The immediate suspicion is that the "true" values were rounded somewhere along the way, producing this little glitch.
Let us display the decimal places of the "true" data:
df_test %>%
  mutate(year = as.character(year)) %>%
  janitor::adorn_totals() %>%
  mutate(value = formatC(value, format = 'f', digits = 2)) %>%
  kable(align = "r",
        caption = "kable with row sum by format")
| region_pro | year | vars | value |
|---|---|---|---|
| 非旱区 | 2019 | 教育 | 19673.74 |
| 非旱区 | 2019 | 科学技术 | 4371.65 |
| 非旱区 | 2019 | 农林水 | 11655.07 |
| Total | - | - | 35700.46 |
Now the total agrees entirely with "common sense"!
So, before formally publishing a report, how exactly should we present decimal results "properly"?
Display two decimal places directly:
# formatC with digits 2
formatC(df_test$value, format = "f", digits = 2)
[1] "19673.74" "4371.65" "11655.07"
Round to one decimal place, then display two decimals. (Watch how the last two numbers change!)
# formatC after round function
formatC(round(df_test$value, 1), format = "f", digits = 2)
[1] "19673.70" "4371.60" "11655.10"
Round to two decimal places, sum, then display two decimals:
# formatC function
formatC(sum(round(df_test$value, 2)), format = "f", digits = 2)
[1] "35700.46"
Round to one decimal place, sum, then display two decimals:
# formatC function
formatC(sum(round(df_test$value, 1)), format = "f", digits = 2)
[1] "35700.40"
Round to integers, sum, then display two decimals:
# formatC function
formatC(sum(round(df_test$value, 0)), format = "f", digits = 2)
[1] "35701.00"
So, to cope with "sharp-eyed and picky" copy editors and readers, round first and then sum. Of course, this "clever move" comes with a side effect: over repeated rounds of rounding and summing, the gap between the result and the "true" floating-point values grows larger and larger.
df_test %>%
  mutate(year = as.character(year)) %>%
  mutate(value = round(value)) %>%
  janitor::adorn_totals() %>%
  #mutate(value = formatC(value, format = 'f', digits = 2)) %>%
  kable(align = "r",
        caption = "kable seems much normal!")
| region_pro | year | vars | value |
|---|---|---|---|
| 非旱区 | 2019 | 教育 | 19674 |
| 非旱区 | 2019 | 科学技术 | 4372 |
| 非旱区 | 2019 | 农林水 | 11655 |
| Total | - | - | 35701 |
1.5.2 Lagged variables done right
Rule of thumb: when creating lags with lag(), mind the fill value for the missing positions, which defaults to default = NA. If the lagged variable feeds into further computation, the default NA can have side effects, since e.g. 4 - NA = NA.
## default origin specification
### origin = "1970-01-01"
start_dates <- lubridate::as_date(
  12274:12279,
  origin = lubridate::origin)
set.seed(123)
df_demo <- tibble(date = start_dates,
                  value = rnorm(n = length(start_dates), 5, 2),
                  cat = rep(c("A", "B"), each = 3))
kable(df_demo, caption = "Simulated data")
| date | value | cat |
|---|---|---|
| 2003-08-10 | 3.879049 | A |
| 2003-08-11 | 4.539645 | A |
| 2003-08-12 | 8.117417 | A |
| 2003-08-13 | 5.141017 | B |
| 2003-08-14 | 5.258576 | B |
| 2003-08-15 | 8.430130 | B |
df_lag <- df_demo %>%
  arrange(cat, date) %>%
  #group_by(cat) %>%
  mutate(cat_lag1 = lag(cat),
         cat_lag2 = lag(cat, default = "xx")) %>%
  mutate(detect1 = ifelse(cat != cat_lag1, 1, 0),
         detect2 = ifelse(cat != cat_lag2, 1, 0))
df_lag %>%
  kable(caption = "The default fill value of lag()", digits = 2)
| date | value | cat | cat_lag1 | cat_lag2 | detect1 | detect2 |
|---|---|---|---|---|---|---|
| 2003-08-10 | 3.88 | A | | xx | | 1 |
| 2003-08-11 | 4.54 | A | A | A | 0 | 0 |
| 2003-08-12 | 8.12 | A | A | A | 0 | 0 |
| 2003-08-13 | 5.14 | B | A | A | 1 | 1 |
| 2003-08-14 | 5.26 | B | B | B | 0 | 0 |
| 2003-08-15 | 8.43 | B | B | B | 0 | 0 |
1.5.3 Constructing polynomial columns
stats::poly() builds polynomial terms (of a specified degree p) as a matrix, but to use it inside a dplyr mutate() pipe we need the rarely seen !!! splice operator:
use !!! to splice the columns (turning each element of a list/data.frame into its own argument).
data("mtcars")
df_test<- mtcars %>%
mutate(
!!!as.data.frame(
stats::poly(x = .$wt,
degree = 3,
raw = TRUE)
)
) %>%
rename_at(all_of(as.character(1:3)),
~all_of(paste0("wt_p",1:3)))Warning: Using `all_of()` outside of a selecting function was deprecated in tidyselect
1.2.0.
ℹ See details at
<https://tidyselect.r-lib.org/reference/faq-selection-context.html>
df_test %>%
  select(starts_with("wt")) %>%
  DT::datatable(caption = "Polynomial terms of wt (p = 3)",
                options = list(dom = "tip", pageLength = 10)) %>%
  formatRound(c(1:4), digits = 2)
References:
- dplyr: Using poly function to generate polynomial coefficients (see the Q&A)
1.6 Data computation
1.6.1 Counting occurrences
Rule of thumb: when counting non-missing values over multiple columns with dplyr::summarise() combined with across(), use sum(!is.na(.)), not n().
References:
- How to Use na.rm=TRUE with n() While Using Dplyr's Group_by and Summarise_at (Q&A)
- Why I love dplyr's across (blog post)
df_fake <- mtcars %>%
  mutate(carb = ifelse(carb %in% c(3, 8), NA, carb))
df_fake$gear[c(3, 6, 9)] <- NA
kable(df_fake, caption = "fake data with NA")
| | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
| Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
| Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 1 | |
| Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
| Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
| Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 1 | |
| Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
| Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
| Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 2 | |
| Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
| Merc 280C | 17.8 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.90 | 1 | 0 | 4 | 4 |
| Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0 | 0 | 3 | |
| Merc 450SL | 17.3 | 8 | 275.8 | 180 | 3.07 | 3.730 | 17.60 | 0 | 0 | 3 | |
| Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.780 | 18.00 | 0 | 0 | 3 | |
| Cadillac Fleetwood | 10.4 | 8 | 472.0 | 205 | 2.93 | 5.250 | 17.98 | 0 | 0 | 3 | 4 |
| Lincoln Continental | 10.4 | 8 | 460.0 | 215 | 3.00 | 5.424 | 17.82 | 0 | 0 | 3 | 4 |
| Chrysler Imperial | 14.7 | 8 | 440.0 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
| Fiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.200 | 19.47 | 1 | 1 | 4 | 1 |
| Honda Civic | 30.4 | 4 | 75.7 | 52 | 4.93 | 1.615 | 18.52 | 1 | 1 | 4 | 2 |
| Toyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.90 | 1 | 1 | 4 | 1 |
| Toyota Corona | 21.5 | 4 | 120.1 | 97 | 3.70 | 2.465 | 20.01 | 1 | 0 | 3 | 1 |
| Dodge Challenger | 15.5 | 8 | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0 | 0 | 3 | 2 |
| AMC Javelin | 15.2 | 8 | 304.0 | 150 | 3.15 | 3.435 | 17.30 | 0 | 0 | 3 | 2 |
| Camaro Z28 | 13.3 | 8 | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0 | 0 | 3 | 4 |
| Pontiac Firebird | 19.2 | 8 | 400.0 | 175 | 3.08 | 3.845 | 17.05 | 0 | 0 | 3 | 2 |
| Fiat X1-9 | 27.3 | 4 | 79.0 | 66 | 4.08 | 1.935 | 18.90 | 1 | 1 | 4 | 1 |
| Porsche 914-2 | 26.0 | 4 | 120.3 | 91 | 4.43 | 2.140 | 16.70 | 0 | 1 | 5 | 2 |
| Lotus Europa | 30.4 | 4 | 95.1 | 113 | 3.77 | 1.513 | 16.90 | 1 | 1 | 5 | 2 |
| Ford Pantera L | 15.8 | 8 | 351.0 | 264 | 4.22 | 3.170 | 14.50 | 0 | 1 | 5 | 4 |
| Ferrari Dino | 19.7 | 6 | 145.0 | 175 | 3.62 | 2.770 | 15.50 | 0 | 1 | 5 | 6 |
| Maserati Bora | 15.0 | 8 | 301.0 | 335 | 3.54 | 3.570 | 14.60 | 0 | 1 | 5 | |
| Volvo 142E | 21.4 | 4 | 121.0 | 109 | 4.11 | 2.780 | 18.60 | 1 | 1 | 4 | 2 |
df_fake %>%
  group_by(carb) %>%
  dplyr::summarise(n_carb = n()) %>%
  kable(caption = "summarise with `n()`")
| carb | n_carb |
|---|---|
| 1 | 7 |
| 2 | 10 |
| 4 | 10 |
| 6 | 1 |
| 4 |
# Helper function
summarizer <- function(data, numeric_cols = NULL, ...) {
  data %>%
    group_by(...) %>%
    dplyr::summarise(
      across({{numeric_cols}},
             list(
               count = ~sum(!is.na(.)) # NB: do not use `n()`
             ),
             .names = "{col}_{fn}"))
}
df_fake %>%
  summarizer(.,
             numeric_cols = all_of(c("gear", "cyl")),
             carb) %>%
  kable(caption = "summarise with `sum()`")
| carb | gear_count | cyl_count |
|---|---|---|
| 1 | 5 | 7 |
| 2 | 9 | 10 |
| 4 | 10 | 10 |
| 6 | 1 | 1 |
| | 4 | 4 |
options(knitr.kable.NA = 'NA')
1.6.2 Descriptive statistics
Rule of thumb: dplyr::summarise() combined with across() greatly improves efficiency when running descriptive statistics over many specified columns.
References:
- Why I love dplyr's across (blog post)
- Chapter 40, "The beauty of across() in the tidyverse" (bookdown)
# Helper function
summarizer <- function(data, numeric_cols = NULL, ...) {
  data %>%
    group_by(...) %>%
    dplyr::summarise(
      across({{numeric_cols}},
             list(
               count = ~sum(!is.na(.)), # NB: do not use `n()`
               mean = ~mean(.x, na.rm = TRUE),
               sd = ~sd(.x, na.rm = TRUE),
               min = ~min(.x, na.rm = TRUE),
               max = ~max(.x, na.rm = TRUE)
             ),
             .names = "{col}_{fn}"))
}
var_quanty <- names(df_fake)
smry_eda <- df_fake %>%
  summarizer(., numeric_cols = all_of(var_quanty)) %>%
  pivot_longer(., contains('_'),
               names_to = "variables",
               values_to = "value") %>%
  separate(., col = variables,
           into = c("var", "measure"),
           sep = "_") %>%
  pivot_wider(., names_from = measure, values_from = value) %>%
  select(var, count, mean, sd, min, max)
kable(smry_eda,
      caption = "Descriptive statistics",
      align = "c")
|---|---|---|---|---|---|
| mpg | 32 | 20.090625 | 6.0269481 | 10.400 | 33.900 |
| cyl | 32 | 6.187500 | 1.7859216 | 4.000 | 8.000 |
| disp | 32 | 230.721875 | 123.9386938 | 71.100 | 472.000 |
| hp | 32 | 146.687500 | 68.5628685 | 52.000 | 335.000 |
| drat | 32 | 3.596563 | 0.5346787 | 2.760 | 4.930 |
| wt | 32 | 3.217250 | 0.9784574 | 1.513 | 5.424 |
| qsec | 32 | 17.848750 | 1.7869432 | 14.500 | 22.900 |
| vs | 32 | 0.437500 | 0.5040161 | 0.000 | 1.000 |
| am | 32 | 0.406250 | 0.4989909 | 0.000 | 1.000 |
| gear | 29 | 3.689655 | 0.7608007 | 3.000 | 5.000 |
| carb | 28 | 2.607143 | 1.3968028 | 1.000 | 6.000 |
1.7 Custom functions
1.7.1 Strings as variable names
When developing custom functions we may need dynamically defined variable names, i.e. turning a given string into a variable name that can then be used in tidyverse mutate() operations.
Several approaches to dynamic variable naming follow:
df <- tibble(
  province = c("陕西", "山东", "湖南"),
  value = rnorm(3),
  cat = LETTERS[1:3]
)
kable(df, align = "c", caption = "Demo data")
| province | value | cat |
|---|---|---|
| 陕西 | 0.4609162 | A |
| 山东 | -1.2650612 | B |
| 湖南 | -0.6868529 | C |
Method 1: with dplyr (version > 1.0, which integrates glue-style argument interpolation), combine {{}}, := and get(). Drawback: the resulting function reads rather poorly! (See the Q&A.)
col_tar <- "province"
col_media <- paste0(col_tar,"_raw")
n<-3
col_num <- "value"
# this will work
out1 <- df %>%
# duplicate new named columns
mutate({{col_media}} := get(col_tar)) %>%
mutate("{col_media}.{3}" := value*n) %>%
mutate("mean of {col_num}" := mean(get(col_num)))
# NOTE: this will work in another way unintended
out2 <- df %>%
# duplicate new named columns
mutate({{col_media}} := {{col_tar}})kable(out1,align = "c", caption = "可行1:{{}}结合:=以及get()")| province | value | cat | province_raw | province_raw.3 | mean of value |
|---|---|---|---|---|---|
| 陕西 | 0.4609162 | A | 陕西 | 1.382749 | -0.4969993 |
| 山东 | -1.2650612 | B | 山东 | -3.795184 | -0.4969993 |
| 湖南 | -0.6868529 | C | 湖南 | -2.060559 | -0.4969993 |
kable(out2, align = "c", caption = "Not as intended (2): {{}} with := only")
| province | value | cat | province_raw |
|---|---|---|---|
| 陕西 | 0.4609162 | A | province |
| 山东 | -1.2650612 | B | province |
| 湖南 | -0.6868529 | C | province |
Method 2: combine !!rlang::sym() with :=. Drawback: the code is rather verbose.
# this will work
out3 <- df %>%
  # duplicate new named columns
  mutate(!!rlang::sym(col_media) := !!rlang::sym(col_tar))
kable(out3, align = "c", caption = "Works (3): !!rlang::sym() with :=")
| province | value | cat | province_raw |
|---|---|---|---|
| 陕西 | 0.4609162 | A | 陕西 |
| 山东 | -1.2650612 | B | 山东 |
| 湖南 | -0.6868529 | C | 湖南 |
1.7.2 A task counter
task_counter <- function() {
  i <- 0
  function() {
    i <<- i + 1
    i
  }
}
n_task <- task_counter()
# first call
n_task()
[1] 1
# second call
n_task()
[1] 2
# reset the counter
n_task <- task_counter()
# first call again
n_task()
[1] 1
References:
- Hadley Wickham, "Advanced R", the discussion of mutable state
- R-bloggers post: A Little R Counter
1.7.3 Permutations and combinations
When counting selections, distinguish the with-repetition and without-repetition cases:
- with repetition (ordered draws), the count is \(n^k\)
- without repetition, the number of combinations is \(\left(\begin{array}{l}n \\ k\end{array}\right)=\frac{n!}{k!(n-k)!}\)
For permutations and combinations in R, see: Learning R: Permutations and Combinations with base R
# combination function
## combinations without repetitions
n1 <- length(combn(1:151, 2))/2
n2 <- choose(151, 2)
# permutation function
perm <- function(v) {
  n <- length(v)
  if (n == 1) v
  else {
    X <- NULL
    for (i in 1:n) X <- rbind(X, cbind(v[i], perm(v[-i])))
    X
  }
}
# nrow(perm(1:2))
1.8 The janitor package
1.8.1 Tabulation vs janitor::tabyl
tabyls: a tidy, fully-featured approach to counting things. see link
(1) Strengths:
- convenient counting of frequencies and proportions
- easy row totals
- easy adjustment of display formats (percentages, etc.)
- combines well with knitr::kable() output
(2) Weakness:
janitor::tabyl() only handles raw data, supplied as a data.frame or a vector. A minimal sketch follows.
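A minimal sketch on the built-in mtcars data (my example, not from the original text):
# frequency table of cyl: counts (n) and proportions (percent)
mtcars %>%
  janitor::tabyl(cyl) %>%
  knitr::kable()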
1.8.2 Mind the order of the adorn functions
Take particular care with the order in which the janitor::adorn_xx() functions are applied. The points to watch are:
- Compute totals first, then set formats. The crux is that reformatting a column changes its type, e.g. from numeric to character. So run adorn_totals(where = "row") before converting proportions to percentages with adorn_pct_formatting().
- Columns that were not totaled by adorn_totals() will not be reformatted by adorn_pct_formatting() either. In particular, a manually computed share column (decimals such as 0.32) cannot be turned into percentages (such as 32%) by adorn_pct_formatting().
- Once totals are added with adorn_totals(), the result is no longer a tidy data table, so columns or rows that should not be totaled need extra handling; cumulative percentages and cumulative counts, for example, should have no Total value. A sketch of the correct ordering follows this list.
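A minimal sketch of the ordering rule, again on mtcars (my example, not from the original text):
mtcars %>%
  janitor::tabyl(cyl) %>%
  janitor::adorn_totals(where = "row") %>%  # 1) total while `percent` is still numeric
  janitor::adorn_pct_formatting(digits = 1) # 2) then format `percent` as character "xx.x%"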
1.8.3 Totals and formatting for selected columns
- First, totaling or formatting selected columns must still respect the function ordering described above.
- Second, when targeting specific columns you must spell out all of the parameters of the janitor::adorn_xx() functions, which is quite unfriendly.
- Finally, target columns using dplyr select semantics, e.g. all_of(contains("p")).
char <-c("非常不满意","不满意","一般","满意","非常满意")
reps <- c(24L, 108L,93L, 45L,30L)
sat_list <- rep(char, reps)
sat <- data.frame(groups=LETTERS[1:5],
satisfication = sat_list,
row.names=NULL) %>%
mutate(satisfication = factor(satisfication, levels = char))
cum_calc <- sat %>%
janitor::tabyl(satisfication) %>%
mutate(min_cum_n = cumsum(n),
min_cum_p = cumsum(percent), #<<
max_cum_n =rev( cumsum(rev(n))), #<<
max_cum_p =rev( cumsum(rev(percent))))
cum_format<- cum_calc %>%
mutate(min_cum_p = scales::percent(min_cum_p,accuracy = 0.1), #<<
max_cum_p = scales::percent(max_cum_p,accuracy = 0.1)) %>%
janitor::adorn_totals(.,where = "row",fill = "-",na.rm = TRUE, #<<
name = "Total", all_of(c("n","percent"))) %>%
janitor::adorn_pct_formatting(.,digits = 1, #<<
rounding ="half to even",
affix_sign = TRUE,
all_of(c("percent"))) #<<1.8.4 相关系数与回归系数
require(techme)
Loading required package: techme
data("AgriFertilizer", package = "techme")
names(AgriFertilizer)
[1] "province"   "year"       "chn_block4" "value"      "units"
[6] "variables"
df <- AgriFertilizer %>%
  filter(year == 2019) %>%
  pivot_wider(id_cols = province, names_from = variables, values_from = value)
lm(data = df, v7_sctj_nyhf_df ~ 0 + v7_sctj_nyhf_lf)
Call:
lm(formula = v7_sctj_nyhf_df ~ 0 + v7_sctj_nyhf_lf, data = df)
Coefficients:
v7_sctj_nyhf_lf
2.811