R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.3     √ purrr   0.3.4
## √ tibble  3.0.4     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.4.0     √ forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rmarkdown)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(emmeans)
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(haven)
library(sjmisc)
## 
## Attaching package: 'sjmisc'
## The following object is masked from 'package:purrr':
## 
##     is_empty
## The following object is masked from 'package:tidyr':
## 
##     replace_na
## The following object is masked from 'package:tibble':
## 
##     add_case
library(sjlabelled)
## 
## Attaching package: 'sjlabelled'
## The following objects are masked from 'package:haven':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following object is masked from 'package:dplyr':
## 
##     as_label
library(gtsummary)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(ggstatsplot)
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## In case you would like cite this package, cite it as:
##      Patil, I. (2018). ggstatsplot: "ggplot2" Based Plots with Statistical Details. CRAN.
##      Retrieved from https://cran.r-project.org/web/packages/ggstatsplot/index.html
library(knitr)
cgss2017 <- read_dta("C:/Users/huawei/Desktop/R/cgss/cgss2017.dta")

descr(cgss2017$a4)
## 
## ## Basic descriptive statistics
## 
##  var    type        label     n NA.prc mean   sd   se md trimmed   range iqr
##   dd numeric 您的民族是: 12582      0 1.36 1.38 0.01  1    1.36 7 (1-8)   0
##  skew
##  4.03
descr(cgss2017$a10)
## 
## ## Basic descriptive statistics
## 
##  var    type              label     n NA.prc mean  sd   se md trimmed     range
##   dd numeric 目前的政治面貌是: 12582      0 1.47 2.9 0.03  1    1.47 98 (1-99)
##  iqr  skew
##    0 29.89
descr(cgss2017$a59g)
## 
## ## Basic descriptive statistics
## 
##  var    type                                                        label    n
##   dd numeric 在您目前的工作中,您在多大程度上能自主决定您工作的具体方式: 4792
##  NA.prc mean   sd   se md trimmed     range iqr  skew
##   61.91 2.51 6.11 0.09  2    2.51 98 (1-99)   2 15.28
descr(cgss2017$a89b)
## 
## ## Basic descriptive statistics
## 
##  var    type                    label     n NA.prc mean    sd   se md trimmed
##   dd numeric 您父亲的最高教育程度是: 12582      0 9.27 23.74 0.21  3    9.27
##      range iqr skew
##  98 (1-99)   3 3.43
table(cgss2017$a4)
## 
##     1     2     3     4     5     6     7     8 
## 11636    48    94   261    10   140     2   391
table(cgss2017$a58)
## 
##    1    2    3    4    5    6 
## 4794  616 1625 1493 3370  684
descr(cgss2017$a52)
## 
## ## Basic descriptive statistics
## 
##  var    type                              label     n NA.prc mean   sd   se md
##   dd numeric 您觉得自己说英语的能力是什么水平? 12582      0 1.55 3.43 0.03  1
##  trimmed     range iqr  skew
##     1.55 98 (1-99)   1 26.71
cgss2017[cgss2017 >= 9999997] <- NA
cgss2017[cgss2017 == 9999996] <- 999999
cgss2017[cgss2017 == 98] <- NA
cgss2017[cgss2017 == 99] <- NA
cgss2017 <- sjlabelled::drop_labels(cgss2017)
cgss2017 <- sjmisc::to_label(cgss2017)
descr(cgss2017$a8a)
## 
## ## Basic descriptive statistics
## 
##  var    type                        label     n NA.prc     mean       sd
##   dd numeric 您个人去年(2016年)全年总收入 11900   5.42 41156.86 233509.9
##       se    md  trimmed               range   iqr  skew
##  2140.58 20000 41156.86 9993600 (0-9993600) 42000 37.51
cgss2017$a8a <- as.numeric(cgss2017$a8a)
cgss2017$a8a2 <-
  car::recode(cgss2017$a8a,"0:6000='1';6200:15000='2';15300:25000='3';25200:35000='4';35280:70000='5';72000:9993600='6' ")

table(cgss2017$a8a2)
## 
##    1    2    3    4    5    6 
## 3821 1411 1483 1162 2714 1309
cgss2017$a8a2 <-
  factor(cgss2017$a8a2,
         levels = c(1, 2, 3,4,5,6),
         labels = c("低下", "低上", "中下","中上","上下","上上"))
table(cgss2017$a8a2)
## 
## 低下 低上 中下 中上 上下 上上 
## 3821 1411 1483 1162 2714 1309
cgssGPA4.0 <-cgss2017 %>% select(
    MZ = a4,
    ZZMM = a10,
    JYY = a52,
    F.edu = a89b,
    ZZD = a59g,
    income = a8a2
    )




cgssGPA4.0$MZ <- as.numeric(cgssGPA4.0$MZ)
cgssGPA4.0$ZZMM <- as.numeric(cgssGPA4.0$ZZMM)
cgssGPA4.0$JYY <- as.numeric(cgssGPA4.0$JYY)
cgssGPA4.0$F.edu <- as.numeric(cgssGPA4.0$F.edu)
cgssGPA4.0$ZZD <- as.numeric(cgssGPA4.0$ZZD)
cgssGPA4.0$income <- as.numeric(cgssGPA4.0$income)

tbl_summary(cgssGPA4.0)
Characteristic N = 12,5821
MZ
1 11,636 (92%)
2 48 (0.4%)
3 94 (0.7%)
4 261 (2.1%)
5 10 (<0.1%)
6 140 (1.1%)
7 2 (<0.1%)
8 391 (3.1%)
ZZMM
1 10,512 (84%)
2 636 (5.1%)
3 19 (0.2%)
4 1,405 (11%)
Unknown 10
JYY
1 8,887 (71%)
2 2,260 (18%)
3 1,098 (8.7%)
4 246 (2.0%)
5 76 (0.6%)
Unknown 15
F.edu 3.00 (1.00, 4.00)
Unknown 832
ZZD
1 1,246 (26%)
2 2,101 (44%)
3 1,002 (21%)
4 424 (8.9%)
Unknown 7,809
income
1 3,821 (32%)
2 1,411 (12%)
3 1,483 (12%)
4 1,162 (9.8%)
5 2,714 (23%)
6 1,309 (11%)
Unknown 682

1 n (%); Median (IQR)

tbl_summary(cgssGPA4.0,statistic = list(all_continuous()~"{mean}({sd}){median}({min},{p25},{p75},{max})"),)
Characteristic N = 12,5821
MZ
1 11,636 (92%)
2 48 (0.4%)
3 94 (0.7%)
4 261 (2.1%)
5 10 (<0.1%)
6 140 (1.1%)
7 2 (<0.1%)
8 391 (3.1%)
ZZMM
1 10,512 (84%)
2 636 (5.1%)
3 19 (0.2%)
4 1,405 (11%)
Unknown 10
JYY
1 8,887 (71%)
2 2,260 (18%)
3 1,098 (8.7%)
4 246 (2.0%)
5 76 (0.6%)
Unknown 15
F.edu 2.98(2.47)3.00(1.00,1.00,4.00,14.00)
Unknown 832
ZZD
1 1,246 (26%)
2 2,101 (44%)
3 1,002 (21%)
4 424 (8.9%)
Unknown 7,809
income
1 3,821 (32%)
2 1,411 (12%)
3 1,483 (12%)
4 1,162 (9.8%)
5 2,714 (23%)
6 1,309 (11%)
Unknown 682

1 n (%); Mean(SD)Median(Minimum,25%,75%,Maximum)

cgssGPA4.0 %>%
  select(MZ, ZZMM, JYY, F.edu, ZZD,income) %>% tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean}({sd}) {median} ({min}, {p25}, {p75},{max})"
    ),
    digits = list(c(income) ~ 1)
  )
Characteristic N = 12,5821
MZ
1 11,636 (92%)
2 48 (0.4%)
3 94 (0.7%)
4 261 (2.1%)
5 10 (<0.1%)
6 140 (1.1%)
7 2 (<0.1%)
8 391 (3.1%)
ZZMM
1 10,512 (84%)
2 636 (5.1%)
3 19 (0.2%)
4 1,405 (11%)
Unknown 10
JYY
1 8,887 (71%)
2 2,260 (18%)
3 1,098 (8.7%)
4 246 (2.0%)
5 76 (0.6%)
Unknown 15
F.edu 2.98(2.47) 3.00 (1.00, 1.00, 4.00,14.00)
Unknown 832
ZZD
1 1,246 (26%)
2 2,101 (44%)
3 1,002 (21%)
4 424 (8.9%)
Unknown 7,809
income
1 3,821.0 (32.1%)
2 1,411.0 (11.9%)
3 1,483.0 (12.5%)
4 1,162.0 (9.8%)
5 2,714.0 (22.8%)
6 1,309.0 (11.0%)
Unknown 682

1 n (%); Mean(SD) Median (Minimum, IQR,Maximum)

cgssGPA4.0 %>%
  select(MZ, ZZMM, JYY, F.edu,ZZD,income) %>% tbl_summary(
    by = income, missing = "no",
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{N_nonmiss}",
                           "{median} ({p25}, {p75})", 
                           "{min}, {max}")
    ),
    digits = list(c(income) ~ 1)
  ) %>%
  add_n() %>% 
  add_p() %>% 
  modify_header(label = "**Variable**") %>% 
  bold_labels()
## 682 observations missing `income` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `income` column before passing to `tbl_summary()`.
## There was an error in 'add_p()/add_difference()' for variable 'MZ', p-value omitted:
## Error in stats::fisher.test(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, : FEXACT error 5.
## The hash table key cannot be computed because the largest key
## is larger than the largest representable int.
## The algorithm cannot proceed.
## Reduce the workspace, consider using 'simulate.p.value=TRUE' or another algorithm.
## There was an error in 'add_p()/add_difference()' for variable 'ZZMM', p-value omitted:
## Error in stats::fisher.test(c(4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 1, 2, 1, : FEXACT error 501.
## The hash table key cannot be computed because the largest key
## is larger than the largest representable int.
## The algorithm cannot proceed.
## Reduce the workspace, consider using 'simulate.p.value=TRUE' or another algorithm.
Variable N 1, N = 3,821 2, N = 1,411 3, N = 1,483 4, N = 1,162 5, N = 2,714 6, N = 1,309 p-value1
MZ 11,900
1 3,432 (90%) 1,277 (91%) 1,372 (93%) 1,100 (95%) 2,575 (95%) 1,250 (95%)
2 7 (0.2%) 8 (0.6%) 4 (0.3%) 6 (0.5%) 10 (0.4%) 7 (0.5%)
3 24 (0.6%) 9 (0.6%) 18 (1.2%) 5 (0.4%) 22 (0.8%) 13 (1.0%)
4 86 (2.3%) 31 (2.2%) 39 (2.6%) 21 (1.8%) 55 (2.0%) 20 (1.5%)
5 3 (<0.1%) 1 (<0.1%) 1 (<0.1%) 1 (<0.1%) 3 (0.1%) 1 (<0.1%)
6 75 (2.0%) 20 (1.4%) 17 (1.1%) 6 (0.5%) 10 (0.4%) 4 (0.3%)
7 0 (0%) 0 (0%) 0 (0%) 0 (0%) 2 (<0.1%) 0 (0%)
8 194 (5.1%) 65 (4.6%) 32 (2.2%) 23 (2.0%) 37 (1.4%) 14 (1.1%)
ZZMM 11,894
1 3,435 (90%) 1,302 (92%) 1,311 (89%) 979 (84%) 2,055 (76%) 897 (69%)
2 250 (6.5%) 33 (2.3%) 48 (3.2%) 51 (4.4%) 98 (3.6%) 68 (5.2%)
3 0 (0%) 0 (0%) 1 (<0.1%) 1 (<0.1%) 8 (0.3%) 9 (0.7%)
4 135 (3.5%) 74 (5.3%) 121 (8.2%) 131 (11%) 552 (20%) 335 (26%)
JYY 11,886 <0.001
1 3,174 (83%) 1,237 (88%) 1,169 (79%) 835 (72%) 1,629 (60%) 418 (32%)
2 366 (9.6%) 126 (8.9%) 230 (16%) 238 (21%) 710 (26%) 452 (35%)
3 216 (5.7%) 42 (3.0%) 77 (5.2%) 70 (6.0%) 297 (11%) 311 (24%)
4 54 (1.4%) 3 (0.2%) 3 (0.2%) 15 (1.3%) 61 (2.3%) 88 (6.7%)
5 8 (0.2%) 0 (0%) 3 (0.2%) 2 (0.2%) 14 (0.5%) 38 (2.9%)
F.edu 11,148 <0.001
N 3,564 1,324 1,380 1,099 2,541 1,240
Median (IQR) 1.00 (1.00, 3.00) 1.00 (1.00, 3.00) 3.00 (1.00, 3.00) 3.00 (1.00, 4.00) 3.00 (1.00, 4.00) 4.00 (3.00, 6.00)
Range 1.00, 14.00 1.00, 14.00 1.00, 14.00 1.00, 14.00 1.00, 14.00 1.00, 14.00
ZZD 4,548 <0.001
1 94 (33%) 129 (29%) 150 (23%) 125 (21%) 357 (24%) 314 (29%)
2 102 (35%) 145 (33%) 237 (36%) 237 (40%) 710 (48%) 585 (54%)
3 56 (19%) 114 (26%) 180 (27%) 157 (27%) 306 (21%) 142 (13%)
4 37 (13%) 54 (12%) 90 (14%) 70 (12%) 109 (7.4%) 48 (4.4%)

1 Pearson's Chi-squared test; Kruskal-Wallis rank sum test

table1(~ MZ+ZZMM+JYY+F.edu+ZZD+income, data=cgssGPA4.0)
Overall
(N=12582)
MZ
Mean (SD) 1.36 (1.38)
Median [Min, Max] 1.00 [1.00, 8.00]
ZZMM
Mean (SD) 1.39 (0.955)
Median [Min, Max] 1.00 [1.00, 4.00]
Missing 10 (0.1%)
JYY
Mean (SD) 1.44 (0.782)
Median [Min, Max] 1.00 [1.00, 5.00]
Missing 15 (0.1%)
F.edu
Mean (SD) 2.98 (2.47)
Median [Min, Max] 3.00 [1.00, 14.0]
Missing 832 (6.6%)
ZZD
Mean (SD) 2.13 (0.900)
Median [Min, Max] 2.00 [1.00, 4.00]
Missing 7809 (62.1%)
income
Mean (SD) 3.12 (1.84)
Median [Min, Max] 3.00 [1.00, 6.00]
Missing 682 (5.4%)
table1(~ MZ+ZZMM+JYY+F.edu+ZZD|income, data=cgssGPA4.0)
## Warning in table1.formula(~MZ + ZZMM + JYY + F.edu + ZZD | income, data =
## cgssGPA4.0): Terms to the right of '|' in formula 'x' define table columns and
## are expected to be factors with meaningful labels.
1
(N=3821)
2
(N=1411)
3
(N=1483)
4
(N=1162)
5
(N=2714)
6
(N=1309)
Overall
(N=12582)
MZ
Mean (SD) 1.54 (1.71) 1.48 (1.62) 1.32 (1.24) 1.24 (1.12) 1.20 (0.991) 1.16 (0.877) 1.36 (1.38)
Median [Min, Max] 1.00 [1.00, 8.00] 1.00 [1.00, 8.00] 1.00 [1.00, 8.00] 1.00 [1.00, 8.00] 1.00 [1.00, 8.00] 1.00 [1.00, 8.00] 1.00 [1.00, 8.00]
ZZMM
Mean (SD) 1.17 (0.595) 1.18 (0.681) 1.28 (0.833) 1.38 (0.957) 1.65 (1.21) 1.83 (1.30) 1.39 (0.955)
Median [Min, Max] 1.00 [1.00, 4.00] 1.00 [1.00, 4.00] 1.00 [1.00, 4.00] 1.00 [1.00, 4.00] 1.00 [1.00, 4.00] 1.00 [1.00, 4.00] 1.00 [1.00, 4.00]
Missing 1 (0.0%) 2 (0.1%) 2 (0.1%) 0 (0%) 1 (0.0%) 0 (0%) 10 (0.1%)
JYY
Mean (SD) 1.26 (0.645) 1.16 (0.452) 1.27 (0.582) 1.37 (0.673) 1.57 (0.813) 2.14 (1.03) 1.44 (0.782)
Median [Min, Max] 1.00 [1.00, 5.00] 1.00 [1.00, 4.00] 1.00 [1.00, 5.00] 1.00 [1.00, 5.00] 1.00 [1.00, 5.00] 2.00 [1.00, 5.00] 1.00 [1.00, 5.00]
Missing 3 (0.1%) 3 (0.2%) 1 (0.1%) 2 (0.2%) 3 (0.1%) 2 (0.2%) 15 (0.1%)
F.edu
Mean (SD) 2.33 (2.12) 2.23 (1.80) 2.79 (2.30) 3.01 (2.28) 3.41 (2.48) 4.80 (3.05) 2.98 (2.47)
Median [Min, Max] 1.00 [1.00, 14.0] 1.00 [1.00, 14.0] 3.00 [1.00, 14.0] 3.00 [1.00, 14.0] 3.00 [1.00, 14.0] 4.00 [1.00, 14.0] 3.00 [1.00, 14.0]
Missing 257 (6.7%) 87 (6.2%) 103 (6.9%) 63 (5.4%) 173 (6.4%) 69 (5.3%) 832 (6.6%)
ZZD
Mean (SD) 2.12 (1.01) 2.21 (0.998) 2.32 (0.974) 2.29 (0.933) 2.11 (0.854) 1.93 (0.769) 2.13 (0.900)
Median [Min, Max] 2.00 [1.00, 4.00] 2.00 [1.00, 4.00] 2.00 [1.00, 4.00] 2.00 [1.00, 4.00] 2.00 [1.00, 4.00] 2.00 [1.00, 4.00] 2.00 [1.00, 4.00]
Missing 3532 (92.4%) 969 (68.7%) 826 (55.7%) 573 (49.3%) 1232 (45.4%) 220 (16.8%) 7809 (62.1%)
ggstatsplot::ggbetweenstats(data=cgssGPA4.0,x=income,y=MZ,title="民族与收入")

ggstatsplot::ggpiestats(
  data = cgssGPA4.0,
  x = ZZMM,
  y = income,
  title = "政治面貌与收入", 
  legend.title = "政治面貌", 
  caption = substitute(paste(italic("Source"), ": cgss2017"))
)
## Warning in stats::chisq.test(x = x_arg, correct = FALSE): Chi-squared
## approximation may be incorrect

ggstatsplot::ggbarstats(
 data = cgssGPA4.0,
  x = JYY,
  y = income,
  title = "讲英文水平与收入",
  xlab = "收入",
  legend.title = "讲英文水平"
)

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=income)) +
  geom_bar(stat="count",position = "dodge",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=ZZD,fill=income,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~income)
## Warning: Removed 7352 rows containing non-finite values (stat_count).

ggstatsplot::ggbarstats(
 data = cgssGPA4.0,
  x = F.edu,
  y = income,
  title = "父亲受教育程度与收入",
  xlab = "收入",
  legend.title = "父亲受教育程度"
)
## Warning in stats::chisq.test(x = x_arg, correct = FALSE): Chi-squared
## approximation may be incorrect
## Warning: Number of labels is greater than default palette color count.
##  Try using another color `palette` (and/or `package`).
## 

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=ZZMM,fill=income,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~income)
## Warning: Removed 6 rows containing non-finite values (stat_count).

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=JYY,fill=income,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~income)
## Warning: Removed 14 rows containing non-finite values (stat_count).