Untitled

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## √ ggplot2 3.3.3     √ purrr   0.3.4
## √ tibble  3.0.4     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.4.0     √ forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(rmarkdown)
library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(emmeans)
library(effects)

## Loading required package: carData

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(haven)
library(sjmisc)

## 
## Attaching package: 'sjmisc'

## The following object is masked from 'package:purrr':
## 
##     is_empty

## The following object is masked from 'package:tidyr':
## 
##     replace_na

## The following object is masked from 'package:tibble':
## 
##     add_case

library(sjlabelled)

## 
## Attaching package: 'sjlabelled'

## The following objects are masked from 'package:haven':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels

## The following object is masked from 'package:forcats':
## 
##     as_factor

## The following object is masked from 'package:dplyr':
## 
##     as_label

library(gtsummary)
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(ggstatsplot)

## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

## In case you would like cite this package, cite it as:
##      Patil, I. (2018). ggstatsplot: "ggplot2" Based Plots with Statistical Details. CRAN.
##      Retrieved from https://cran.r-project.org/web/packages/ggstatsplot/index.html

library(knitr)
cgss2017 <- read_dta("C:/Users/huawei/Desktop/R/cgss/cgss2017.dta")

descr(cgss2017$a4)

## 
## ## Basic descriptive statistics
## 
##  var    type        label     n NA.prc mean   sd   se md trimmed   range iqr
##   dd numeric 您的民族是： 12582      0 1.36 1.38 0.01  1    1.36 7 (1-8)   0
##  skew
##  4.03

descr(cgss2017$a10)

## 
## ## Basic descriptive statistics
## 
##  var    type              label     n NA.prc mean  sd   se md trimmed     range
##   dd numeric 目前的政治面貌是： 12582      0 1.47 2.9 0.03  1    1.47 98 (1-99)
##  iqr  skew
##    0 29.89

descr(cgss2017$a59g)

## 
## ## Basic descriptive statistics
## 
##  var    type                                                        label    n
##   dd numeric 在您目前的工作中，您在多大程度上能自主决定您工作的具体方式： 4792
##  NA.prc mean   sd   se md trimmed     range iqr  skew
##   61.91 2.51 6.11 0.09  2    2.51 98 (1-99)   2 15.28

descr(cgss2017$a89b)

## 
## ## Basic descriptive statistics
## 
##  var    type                    label     n NA.prc mean    sd   se md trimmed
##   dd numeric 您父亲的最高教育程度是： 12582      0 9.27 23.74 0.21  3    9.27
##      range iqr skew
##  98 (1-99)   3 3.43

table(cgss2017$a4)

## 
##     1     2     3     4     5     6     7     8 
## 11636    48    94   261    10   140     2   391

table(cgss2017$a58)

## 
##    1    2    3    4    5    6 
## 4794  616 1625 1493 3370  684

descr(cgss2017$a52)

## 
## ## Basic descriptive statistics
## 
##  var    type                              label     n NA.prc mean   sd   se md
##   dd numeric 您觉得自己说英语的能力是什么水平？ 12582      0 1.55 3.43 0.03  1
##  trimmed     range iqr  skew
##     1.55 98 (1-99)   1 26.71

cgss2017[cgss2017 >= 9999997] <- NA
cgss2017[cgss2017 == 9999996] <- 999999
cgss2017[cgss2017 == 98] <- NA
cgss2017[cgss2017 == 99] <- NA
cgss2017 <- sjlabelled::drop_labels(cgss2017)
cgss2017 <- sjmisc::to_label(cgss2017)
descr(cgss2017$a8a)

## 
## ## Basic descriptive statistics
## 
##  var    type                        label     n NA.prc     mean       sd
##   dd numeric 您个人去年(2016年)全年总收入 11900   5.42 41156.86 233509.9
##       se    md  trimmed               range   iqr  skew
##  2140.58 20000 41156.86 9993600 (0-9993600) 42000 37.51

cgss2017$a8a <- as.numeric(cgss2017$a8a)
cgss2017$a8a2 <-
  car::recode(cgss2017$a8a,"0:6000='1';6200:15000='2';15300:25000='3';25200:35000='4';35280:70000='5';72000:9993600='6' ")

table(cgss2017$a8a2)

## 
##    1    2    3    4    5    6 
## 3821 1411 1483 1162 2714 1309

cgss2017$a8a2 <-
  factor(cgss2017$a8a2,
         levels = c(1, 2, 3,4,5,6),
         labels = c("低下", "低上", "中下","中上","上下","上上"))
table(cgss2017$a8a2)

## 
## 低下 低上 中下 中上 上下 上上 
## 3821 1411 1483 1162 2714 1309

cgssGPA4.0 <-cgss2017 %>% select(
    MZ = a4,
    ZZMM = a10,
    JYY = a52,
    F.edu = a89b,
    ZZD = a59g,
    income = a8a2
    )




cgssGPA4.0$MZ <- as.numeric(cgssGPA4.0$MZ)
cgssGPA4.0$ZZMM <- as.numeric(cgssGPA4.0$ZZMM)
cgssGPA4.0$JYY <- as.numeric(cgssGPA4.0$JYY)
cgssGPA4.0$F.edu <- as.numeric(cgssGPA4.0$F.edu)
cgssGPA4.0$ZZD <- as.numeric(cgssGPA4.0$ZZD)
cgssGPA4.0$income <- as.numeric(cgssGPA4.0$income)

tbl_summary(cgssGPA4.0)

Characteristic	N = 12,582¹
MZ
1	11,636 (92%)
2	48 (0.4%)
3	94 (0.7%)
4	261 (2.1%)
5	10 (<0.1%)
6	140 (1.1%)
7	2 (<0.1%)
8	391 (3.1%)
ZZMM
1	10,512 (84%)
2	636 (5.1%)
3	19 (0.2%)
4	1,405 (11%)
Unknown	10
JYY
1	8,887 (71%)
2	2,260 (18%)
3	1,098 (8.7%)
4	246 (2.0%)
5	76 (0.6%)
Unknown	15
F.edu	3.00 (1.00, 4.00)
Unknown	832
ZZD
1	1,246 (26%)
2	2,101 (44%)
3	1,002 (21%)
4	424 (8.9%)
Unknown	7,809
income
1	3,821 (32%)
2	1,411 (12%)
3	1,483 (12%)
4	1,162 (9.8%)
5	2,714 (23%)
6	1,309 (11%)
Unknown	682
¹ n (%); Median (IQR)

tbl_summary(cgssGPA4.0,statistic = list(all_continuous()~"{mean}({sd}){median}({min},{p25},{p75},{max})"),)

Characteristic	N = 12,582¹
MZ
1	11,636 (92%)
2	48 (0.4%)
3	94 (0.7%)
4	261 (2.1%)
5	10 (<0.1%)
6	140 (1.1%)
7	2 (<0.1%)
8	391 (3.1%)
ZZMM
1	10,512 (84%)
2	636 (5.1%)
3	19 (0.2%)
4	1,405 (11%)
Unknown	10
JYY
1	8,887 (71%)
2	2,260 (18%)
3	1,098 (8.7%)
4	246 (2.0%)
5	76 (0.6%)
Unknown	15
F.edu	2.98(2.47)3.00(1.00,1.00,4.00,14.00)
Unknown	832
ZZD
1	1,246 (26%)
2	2,101 (44%)
3	1,002 (21%)
4	424 (8.9%)
Unknown	7,809
income
1	3,821 (32%)
2	1,411 (12%)
3	1,483 (12%)
4	1,162 (9.8%)
5	2,714 (23%)
6	1,309 (11%)
Unknown	682
¹ n (%); Mean(SD)Median(Minimum,25%,75%,Maximum)

cgssGPA4.0 %>%
  select(MZ, ZZMM, JYY, F.edu, ZZD,income) %>% tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean}({sd}) {median} ({min}, {p25}, {p75},{max})"
    ),
    digits = list(c(income) ~ 1)
  )

Characteristic	N = 12,582¹
MZ
1	11,636 (92%)
2	48 (0.4%)
3	94 (0.7%)
4	261 (2.1%)
5	10 (<0.1%)
6	140 (1.1%)
7	2 (<0.1%)
8	391 (3.1%)
ZZMM
1	10,512 (84%)
2	636 (5.1%)
3	19 (0.2%)
4	1,405 (11%)
Unknown	10
JYY
1	8,887 (71%)
2	2,260 (18%)
3	1,098 (8.7%)
4	246 (2.0%)
5	76 (0.6%)
Unknown	15
F.edu	2.98(2.47) 3.00 (1.00, 1.00, 4.00,14.00)
Unknown	832
ZZD
1	1,246 (26%)
2	2,101 (44%)
3	1,002 (21%)
4	424 (8.9%)
Unknown	7,809
income
1	3,821.0 (32.1%)
2	1,411.0 (11.9%)
3	1,483.0 (12.5%)
4	1,162.0 (9.8%)
5	2,714.0 (22.8%)
6	1,309.0 (11.0%)
Unknown	682
¹ n (%); Mean(SD) Median (Minimum, IQR,Maximum)

cgssGPA4.0 %>%
  select(MZ, ZZMM, JYY, F.edu,ZZD,income) %>% tbl_summary(
    by = income, missing = "no",
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{N_nonmiss}",
                           "{median} ({p25}, {p75})", 
                           "{min}, {max}")
    ),
    digits = list(c(income) ~ 1)
  ) %>%
  add_n() %>% 
  add_p() %>% 
  modify_header(label = "**Variable**") %>% 
  bold_labels()

## 682 observations missing `income` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `income` column before passing to `tbl_summary()`.

## There was an error in 'add_p()/add_difference()' for variable 'MZ', p-value omitted:
## Error in stats::fisher.test(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, : FEXACT error 5.
## The hash table key cannot be computed because the largest key
## is larger than the largest representable int.
## The algorithm cannot proceed.
## Reduce the workspace, consider using 'simulate.p.value=TRUE' or another algorithm.

## There was an error in 'add_p()/add_difference()' for variable 'ZZMM', p-value omitted:
## Error in stats::fisher.test(c(4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 1, 2, 1, : FEXACT error 501.
## The hash table key cannot be computed because the largest key
## is larger than the largest representable int.
## The algorithm cannot proceed.
## Reduce the workspace, consider using 'simulate.p.value=TRUE' or another algorithm.

Variable	N	1, N = 3,821	2, N = 1,411	3, N = 1,483	4, N = 1,162	5, N = 2,714	6, N = 1,309	p-value¹
MZ	11,900
1		3,432 (90%)	1,277 (91%)	1,372 (93%)	1,100 (95%)	2,575 (95%)	1,250 (95%)
2		7 (0.2%)	8 (0.6%)	4 (0.3%)	6 (0.5%)	10 (0.4%)	7 (0.5%)
3		24 (0.6%)	9 (0.6%)	18 (1.2%)	5 (0.4%)	22 (0.8%)	13 (1.0%)
4		86 (2.3%)	31 (2.2%)	39 (2.6%)	21 (1.8%)	55 (2.0%)	20 (1.5%)
5		3 (<0.1%)	1 (<0.1%)	1 (<0.1%)	1 (<0.1%)	3 (0.1%)	1 (<0.1%)
6		75 (2.0%)	20 (1.4%)	17 (1.1%)	6 (0.5%)	10 (0.4%)	4 (0.3%)
7		0 (0%)	0 (0%)	0 (0%)	0 (0%)	2 (<0.1%)	0 (0%)
8		194 (5.1%)	65 (4.6%)	32 (2.2%)	23 (2.0%)	37 (1.4%)	14 (1.1%)
ZZMM	11,894
1		3,435 (90%)	1,302 (92%)	1,311 (89%)	979 (84%)	2,055 (76%)	897 (69%)
2		250 (6.5%)	33 (2.3%)	48 (3.2%)	51 (4.4%)	98 (3.6%)	68 (5.2%)
3		0 (0%)	0 (0%)	1 (<0.1%)	1 (<0.1%)	8 (0.3%)	9 (0.7%)
4		135 (3.5%)	74 (5.3%)	121 (8.2%)	131 (11%)	552 (20%)	335 (26%)
JYY	11,886							<0.001
1		3,174 (83%)	1,237 (88%)	1,169 (79%)	835 (72%)	1,629 (60%)	418 (32%)
2		366 (9.6%)	126 (8.9%)	230 (16%)	238 (21%)	710 (26%)	452 (35%)
3		216 (5.7%)	42 (3.0%)	77 (5.2%)	70 (6.0%)	297 (11%)	311 (24%)
4		54 (1.4%)	3 (0.2%)	3 (0.2%)	15 (1.3%)	61 (2.3%)	88 (6.7%)
5		8 (0.2%)	0 (0%)	3 (0.2%)	2 (0.2%)	14 (0.5%)	38 (2.9%)
F.edu	11,148							<0.001
N		3,564	1,324	1,380	1,099	2,541	1,240
Median (IQR)		1.00 (1.00, 3.00)	1.00 (1.00, 3.00)	3.00 (1.00, 3.00)	3.00 (1.00, 4.00)	3.00 (1.00, 4.00)	4.00 (3.00, 6.00)
Range		1.00, 14.00	1.00, 14.00	1.00, 14.00	1.00, 14.00	1.00, 14.00	1.00, 14.00
ZZD	4,548							<0.001
1		94 (33%)	129 (29%)	150 (23%)	125 (21%)	357 (24%)	314 (29%)
2		102 (35%)	145 (33%)	237 (36%)	237 (40%)	710 (48%)	585 (54%)
3		56 (19%)	114 (26%)	180 (27%)	157 (27%)	306 (21%)	142 (13%)
4		37 (13%)	54 (12%)	90 (14%)	70 (12%)	109 (7.4%)	48 (4.4%)
¹ Pearson's Chi-squared test; Kruskal-Wallis rank sum test

table1(~ MZ+ZZMM+JYY+F.edu+ZZD+income, data=cgssGPA4.0)

	Overall (N=12582)
MZ
Mean (SD)	1.36 (1.38)
Median [Min, Max]	1.00 [1.00, 8.00]
ZZMM
Mean (SD)	1.39 (0.955)
Median [Min, Max]	1.00 [1.00, 4.00]
Missing	10 (0.1%)
JYY
Mean (SD)	1.44 (0.782)
Median [Min, Max]	1.00 [1.00, 5.00]
Missing	15 (0.1%)
F.edu
Mean (SD)	2.98 (2.47)
Median [Min, Max]	3.00 [1.00, 14.0]
Missing	832 (6.6%)
ZZD
Mean (SD)	2.13 (0.900)
Median [Min, Max]	2.00 [1.00, 4.00]
Missing	7809 (62.1%)
income
Mean (SD)	3.12 (1.84)
Median [Min, Max]	3.00 [1.00, 6.00]
Missing	682 (5.4%)

table1(~ MZ+ZZMM+JYY+F.edu+ZZD|income, data=cgssGPA4.0)

## Warning in table1.formula(~MZ + ZZMM + JYY + F.edu + ZZD | income, data =
## cgssGPA4.0): Terms to the right of '|' in formula 'x' define table columns and
## are expected to be factors with meaningful labels.

	1 (N=3821)	2 (N=1411)	3 (N=1483)	4 (N=1162)	5 (N=2714)	6 (N=1309)	Overall (N=12582)
MZ
Mean (SD)	1.54 (1.71)	1.48 (1.62)	1.32 (1.24)	1.24 (1.12)	1.20 (0.991)	1.16 (0.877)	1.36 (1.38)
Median [Min, Max]	1.00 [1.00, 8.00]	1.00 [1.00, 8.00]	1.00 [1.00, 8.00]	1.00 [1.00, 8.00]	1.00 [1.00, 8.00]	1.00 [1.00, 8.00]	1.00 [1.00, 8.00]
ZZMM
Mean (SD)	1.17 (0.595)	1.18 (0.681)	1.28 (0.833)	1.38 (0.957)	1.65 (1.21)	1.83 (1.30)	1.39 (0.955)
Median [Min, Max]	1.00 [1.00, 4.00]	1.00 [1.00, 4.00]	1.00 [1.00, 4.00]	1.00 [1.00, 4.00]	1.00 [1.00, 4.00]	1.00 [1.00, 4.00]	1.00 [1.00, 4.00]
Missing	1 (0.0%)	2 (0.1%)	2 (0.1%)	0 (0%)	1 (0.0%)	0 (0%)	10 (0.1%)
JYY
Mean (SD)	1.26 (0.645)	1.16 (0.452)	1.27 (0.582)	1.37 (0.673)	1.57 (0.813)	2.14 (1.03)	1.44 (0.782)
Median [Min, Max]	1.00 [1.00, 5.00]	1.00 [1.00, 4.00]	1.00 [1.00, 5.00]	1.00 [1.00, 5.00]	1.00 [1.00, 5.00]	2.00 [1.00, 5.00]	1.00 [1.00, 5.00]
Missing	3 (0.1%)	3 (0.2%)	1 (0.1%)	2 (0.2%)	3 (0.1%)	2 (0.2%)	15 (0.1%)
F.edu
Mean (SD)	2.33 (2.12)	2.23 (1.80)	2.79 (2.30)	3.01 (2.28)	3.41 (2.48)	4.80 (3.05)	2.98 (2.47)
Median [Min, Max]	1.00 [1.00, 14.0]	1.00 [1.00, 14.0]	3.00 [1.00, 14.0]	3.00 [1.00, 14.0]	3.00 [1.00, 14.0]	4.00 [1.00, 14.0]	3.00 [1.00, 14.0]
Missing	257 (6.7%)	87 (6.2%)	103 (6.9%)	63 (5.4%)	173 (6.4%)	69 (5.3%)	832 (6.6%)
ZZD
Mean (SD)	2.12 (1.01)	2.21 (0.998)	2.32 (0.974)	2.29 (0.933)	2.11 (0.854)	1.93 (0.769)	2.13 (0.900)
Median [Min, Max]	2.00 [1.00, 4.00]	2.00 [1.00, 4.00]	2.00 [1.00, 4.00]	2.00 [1.00, 4.00]	2.00 [1.00, 4.00]	2.00 [1.00, 4.00]	2.00 [1.00, 4.00]
Missing	3532 (92.4%)	969 (68.7%)	826 (55.7%)	573 (49.3%)	1232 (45.4%)	220 (16.8%)	7809 (62.1%)

ggstatsplot::ggbetweenstats(data=cgssGPA4.0,x=income,y=MZ,title="民族与收入")

ggstatsplot::ggpiestats(
  data = cgssGPA4.0,
  x = ZZMM,
  y = income,
  title = "政治面貌与收入", 
  legend.title = "政治面貌", 
  caption = substitute(paste(italic("Source"), ": cgss2017"))
)

## Warning in stats::chisq.test(x = x_arg, correct = FALSE): Chi-squared
## approximation may be incorrect

ggstatsplot::ggbarstats(
 data = cgssGPA4.0,
  x = JYY,
  y = income,
  title = "讲英文水平与收入",
  xlab = "收入",
  legend.title = "讲英文水平"
)

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=income)) +
  geom_bar(stat="count",position = "dodge",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=ZZD,fill=income,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~income)

## Warning: Removed 7352 rows containing non-finite values (stat_count).

ggstatsplot::ggbarstats(
 data = cgssGPA4.0,
  x = F.edu,
  y = income,
  title = "父亲受教育程度与收入",
  xlab = "收入",
  legend.title = "父亲受教育程度"
)

## Warning in stats::chisq.test(x = x_arg, correct = FALSE): Chi-squared
## approximation may be incorrect

## Warning: Number of labels is greater than default palette color count.
##  Try using another color `palette` (and/or `package`).

##

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=ZZMM,fill=income,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~income)

## Warning: Removed 6 rows containing non-finite values (stat_count).

ggplot(cgssGPA4.0[is.na(cgssGPA4.0$income) == F,], aes(x=JYY,fill=income,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~income)

## Warning: Removed 14 rows containing non-finite values (stat_count).

Untitled

R Markdown

Including Plots