#示例和作业/广州大学公共管理学院社会学系创新班课程

library(haven)
library(sjmisc)
library(sjlabelled)
## 
## Attaching package: 'sjlabelled'
## The following objects are masked from 'package:sjmisc':
## 
##     to_character, to_factor, to_label, to_numeric
## The following objects are masked from 'package:haven':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
library(tidyverse)
## -- Attaching packages ------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.3     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts ---------------- tidyverse_conflicts() --
## x tibble::add_case()       masks sjmisc::add_case()
## x forcats::as_factor()     masks sjlabelled::as_factor(), haven::as_factor()
## x dplyr::as_label()        masks sjlabelled::as_label()
## x dplyr::filter()          masks stats::filter()
## x purrr::is_empty()        masks sjmisc::is_empty()
## x dplyr::lag()             masks stats::lag()
## x sjlabelled::read_sas()   masks haven::read_sas()
## x sjlabelled::read_spss()  masks haven::read_spss()
## x sjlabelled::read_stata() masks haven::read_stata()
## x tidyr::replace_na()      masks sjmisc::replace_na()
## x sjlabelled::write_sas()  masks haven::write_sas()
## x sjlabelled::zap_labels() masks haven::zap_labels()
#setwd("E:/course/cgss2017")
##模块1 数据读入和简单清理
cgss2017 <- read_dta("E:/course/cgss2017/cgss2017.dta")
#View(cgss2017)
cgss2017backup <- cgss2017 #backup
table(cgss2017$a15)
## 
##    1    2    3    4    5   98   99 
##  593 2014 3261 4409 2300    3    2
descr(cgss2017$a8a)  #注意excel 编码表  收入编码有坑
## 
## ## Basic descriptive statistics
## 
##  var    type                        label     n NA.prc   mean      sd       se
##   dd numeric 您个人去年(2016年)全年总收入 12580   0.02 580188 2265003 20194.29
##     md trimmed               range   iqr skew
##  24000  580188 9999999 (0-9999999) 46400 3.91
##处理异常值 读入数据
cgss2017[cgss2017 >= 9999997] <- NA
cgss2017[cgss2017 == 9999996] <- 999999
cgss2017[cgss2017 == 98] <- NA
cgss2017[cgss2017 == 99] <- NA
cgss2017 <- sjlabelled::drop_labels(cgss2017)
#str(cgss2017)
cgss2017 <- sjmisc::to_label(cgss2017)
descr(cgss2017$a8a)  #测试
## 
## ## Basic descriptive statistics
## 
##  var    type                        label     n NA.prc     mean       sd
##   dd numeric 您个人去年(2016年)全年总收入 11900   5.42 41156.86 233509.9
##       se    md  trimmed               range   iqr  skew
##  2140.58 20000 41156.86 9993600 (0-9993600) 42000 37.51
table(cgss2017$a15) #测试
## 
##   很不健康 比较不健康       一般   比较健康     很健康 
##        593       2014       3261       4409       2300
#cgss2017[cgss2017 == "拒绝回答"] <- NA
#cgss2017[cgss2017 == "不知道"] <- NA
#c2017[c2017=="不适用"] <- NA  #c2017[c2017=="无法回答"] <- NA  c2017[c2017=="无法选择"] <- NA c2017[c2017=="不好说"] <- NA
cgss2017 <- droplevels(cgss2017)
#copy&past 数据读入结束
#数据清理  select  rename label 编码和转化示例
library(tidyverse)
cgssx <-
  cgss2017 %>% select(
    gender = a2,
    birth = a31,
    edu = a7a,
    income = a8a,
    salary = a8b,
    gpg = a35
  )

cgssx <- rename(cgssx,  fairplay = gpg)
#cgss2017 %>% select(contains("a"))
#cgss2017 %>% select(a2,a31,a7a,a8a,a8b,a35)
cgssx$age <- 2017 - cgssx$birth
#cgssx<-filter(cgssx,cgssx$age>18)
#table(cgssx$edu)
cgssx$education <- as.numeric(cgssx$edu)
cgssx$educ <-
  car::recode(cgssx$education, "9:13='3';5:8='2';1;4='1';'14'=NA")#recode car package
table(cgssx$educ)
## 
##    1    2    3 
## 3511 2326 5196
cgssx$educ <-
  factor(cgssx$educ,
         levels = c(1, 2, 3),
         labels = c("pri", "middle", "high"))
table(cgssx$educ)#从类型到数值再到类型 教育变量为例
## 
##    pri middle   high 
##   3511   2326   5196
cgssx <- droplevels(cgssx)

###作业1  完成数据读入 选取感兴趣的变量生成子集  并且进行编码

### 模块2  描述性统计
# 2.1表格制作
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.0.3
cgssx$salary <- as.numeric(cgssx$salary)#数值变量转换
cgssx$income <- as.numeric(cgssx$income)
tbl_summary(cgssx)
Characteristic N = 12,5821
gender
5,935 (47%)
6,647 (53%)
[年]  您的出生日期是什么? 1,965 (1,953, 1,979)
edu
没有受过任何教育 1,528 (12%)
私塾、扫盲班 91 (0.7%)
小学 2,726 (22%)
初中 3,511 (28%)
职业高中 163 (1.3%)
普通高中 1,470 (12%)
中专 530 (4.2%)
技校 72 (0.6%)
大学专科(成人高等教育) 377 (3.0%)
大学专科(正规高等教育) 674 (5.4%)
大学本科(成人高等教育) 299 (2.4%)
大学本科(正规高等教育) 948 (7.5%)
研究生及以上 172 (1.4%)
其他 21 (0.2%)
income 20,000 (3,000, 45,000)
Unknown 682
salary 10,000 (0, 40,000)
Unknown 890
fairplay
完全不公平 962 (7.7%)
比较不公平 3,182 (25%)
说不上公平但也不能说不公平 2,444 (20%)
比较公平 5,490 (44%)
完全公平 444 (3.5%)
Unknown 60
[年]  您的出生日期是什么? 52 (38, 64)
education 4.0 (3.0, 6.0)
educ
pri 3,511 (32%)
middle 2,326 (21%)
high 5,196 (47%)
Unknown 1,549

1 Statistics presented: n (%); Median (IQR)

tbl_summary(cgssx,statistic = list(all_continuous() ~ "{mean}({sd}) {median} ({min}, {p25}, {p75},{max})"),label = list(age = "age"),digits = list(age ~ c(2, 2), c(income) ~ 1))
Characteristic N = 12,5821
gender
5,935 (47%)
6,647 (53%)
[年]  您的出生日期是什么? 1,966(17) 1,965 (1,914, 1,953, 1,979,1,999)
edu
没有受过任何教育 1,528 (12%)
私塾、扫盲班 91 (0.7%)
小学 2,726 (22%)
初中 3,511 (28%)
职业高中 163 (1.3%)
普通高中 1,470 (12%)
中专 530 (4.2%)
技校 72 (0.6%)
大学专科(成人高等教育) 377 (3.0%)
大学专科(正规高等教育) 674 (5.4%)
大学本科(成人高等教育) 299 (2.4%)
大学本科(正规高等教育) 948 (7.5%)
研究生及以上 172 (1.4%)
其他 21 (0.2%)
income 41,156.9(233,509.9) 20,000.0 (0.0, 3,000.0, 45,000.0,9,993,600.0)
Unknown 682
salary 33,982(212,555) 10,000 (0, 0, 40,000,9,980,000)
Unknown 890
fairplay
完全不公平 962 (7.7%)
比较不公平 3,182 (25%)
说不上公平但也不能说不公平 2,444 (20%)
比较公平 5,490 (44%)
完全公平 444 (3.5%)
Unknown 60
age 51.01(16.86) 52.00 (18.00, 38.00, 64.00,103.00)
education 5.2(3.3) 4.0 (1.0, 3.0, 6.0,14.0)
educ
pri 3,511 (32%)
middle 2,326 (21%)
high 5,196 (47%)
Unknown 1,549

1 Statistics presented: n (%); Mean(SD) Median (Minimum, IQR,Maximum)

cgssx %>%
  select(gender, edu, income, age, educ) %>% tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean}({sd}) {median} ({min}, {p25}, {p75},{max})"
    ),
    label = list(age = "age"),
    digits = list(age ~ c(2, 2), c(income) ~ 1)
  )
Characteristic N = 12,5821
gender
5,935 (47%)
6,647 (53%)
edu
没有受过任何教育 1,528 (12%)
私塾、扫盲班 91 (0.7%)
小学 2,726 (22%)
初中 3,511 (28%)
职业高中 163 (1.3%)
普通高中 1,470 (12%)
中专 530 (4.2%)
技校 72 (0.6%)
大学专科(成人高等教育) 377 (3.0%)
大学专科(正规高等教育) 674 (5.4%)
大学本科(成人高等教育) 299 (2.4%)
大学本科(正规高等教育) 948 (7.5%)
研究生及以上 172 (1.4%)
其他 21 (0.2%)
income 41,156.9(233,509.9) 20,000.0 (0.0, 3,000.0, 45,000.0,9,993,600.0)
Unknown 682
age 51.01(16.86) 52.00 (18.00, 38.00, 64.00,103.00)
educ
pri 3,511 (32%)
middle 2,326 (21%)
high 5,196 (47%)
Unknown 1,549

1 Statistics presented: n (%); Mean(SD) Median (Minimum, IQR,Maximum)

cgssx %>%
  select(gender, edu, income, age, educ) %>% tbl_summary(
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{N_nonmiss}",
                           "{median} ({p25}, {p75})", 
                           "{min}, {max}")
    ),
    label = list(age = "age"),
    digits = list(age ~ c(2, 2), c(income) ~ 1)
  )
Characteristic N = 12,582
gender
5,935 (47%)
6,647 (53%)
edu
没有受过任何教育 1,528 (12%)
私塾、扫盲班 91 (0.7%)
小学 2,726 (22%)
初中 3,511 (28%)
职业高中 163 (1.3%)
普通高中 1,470 (12%)
中专 530 (4.2%)
技校 72 (0.6%)
大学专科(成人高等教育) 377 (3.0%)
大学专科(正规高等教育) 674 (5.4%)
大学本科(成人高等教育) 299 (2.4%)
大学本科(正规高等教育) 948 (7.5%)
研究生及以上 172 (1.4%)
其他 21 (0.2%)
income
N 11,900.0
Median (IQR) 20,000.0 (3,000.0, 45,000.0)
Range 0.0, 9,993,600.0
Unknown 682
age
N 12,582.00
Median (IQR) 52.00 (38.00, 64.00)
Range 18.00, 103.00
educ
pri 3,511 (32%)
middle 2,326 (21%)
high 5,196 (47%)
Unknown 1,549
cgssx %>%
  select(gender, income, age, educ) %>% tbl_summary(
    by = gender, missing = "no",
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{N_nonmiss}",
                           "{median} ({p25}, {p75})", 
                           "{min}, {max}")
    ),
    label = list(age = "age"),
    digits = list(age ~ c(2, 2), c(income) ~ 1)
  ) %>%
  add_n() %>% # add column with total number of non-missing observations
  add_p() %>% # test for a difference between groups
  modify_header(label = "**Variable**") %>% # update the column header
  bold_labels() 
Variable N , N = 5,935 , N = 6,647 p-value1
income 11,900.0 <0.001
N 5,651.0 6,249.0
Median (IQR) 30,000.0 (7,000.0, 50,000.0) 15,000.0 (1,000.0, 36,000.0)
Range 0.0, 9,993,600.0 0.0, 9,930,000.0
age 12,582.00 0.2
N 5,935.00 6,647.00
Median (IQR) 52.00 (37.00, 64.00) 51.00 (38.00, 64.00)
Range 18.00, 103.00 18.00, 101.00
educ 11,033 0.004
pri 1,818 (33%) 1,693 (31%)
middle 1,202 (22%) 1,124 (20%)
high 2,524 (46%) 2,672 (49%)

1 Statistical tests performed: Wilcoxon rank-sum test; chi-square test of independence

#solution2
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~ gender+edu+income+ age+educ, data=cgssx)
Overall
(N=12582)
gender
5935 (47.2%)
6647 (52.8%)
edu
没有受过任何教育 1528 (12.1%)
私塾、扫盲班 91 (0.7%)
小学 2726 (21.7%)
初中 3511 (27.9%)
职业高中 163 (1.3%)
普通高中 1470 (11.7%)
中专 530 (4.2%)
技校 72 (0.6%)
大学专科(成人高等教育) 377 (3.0%)
大学专科(正规高等教育) 674 (5.4%)
大学本科(成人高等教育) 299 (2.4%)
大学本科(正规高等教育) 948 (7.5%)
研究生及以上 172 (1.4%)
其他 21 (0.2%)
income
Mean (SD) 41200 (234000)
Median [Min, Max] 20000 [0, 9990000]
Missing 682 (5.4%)
[年]  您的出生日期是什么?
Mean (SD) 51.0 (16.9)
Median [Min, Max] 52.0 [18.0, 103]
educ
pri 3511 (27.9%)
middle 2326 (18.5%)
high 5196 (41.3%)
Missing 1549 (12.3%)
table1(~ edu+income+educ|gender, data=cgssx)

(N=5935)

(N=6647)
Overall
(N=12582)
edu
没有受过任何教育 383 (6.5%) 1145 (17.2%) 1528 (12.1%)
私塾、扫盲班 38 (0.6%) 53 (0.8%) 91 (0.7%)
小学 1258 (21.2%) 1468 (22.1%) 2726 (21.7%)
初中 1818 (30.6%) 1693 (25.5%) 3511 (27.9%)
职业高中 85 (1.4%) 78 (1.2%) 163 (1.3%)
普通高中 768 (12.9%) 702 (10.6%) 1470 (11.7%)
中专 264 (4.4%) 266 (4.0%) 530 (4.2%)
技校 47 (0.8%) 25 (0.4%) 72 (0.6%)
大学专科(成人高等教育) 205 (3.5%) 172 (2.6%) 377 (3.0%)
大学专科(正规高等教育) 338 (5.7%) 336 (5.1%) 674 (5.4%)
大学本科(成人高等教育) 147 (2.5%) 152 (2.3%) 299 (2.4%)
大学本科(正规高等教育) 486 (8.2%) 462 (7.0%) 948 (7.5%)
研究生及以上 90 (1.5%) 82 (1.2%) 172 (1.4%)
其他 8 (0.1%) 13 (0.2%) 21 (0.2%)
income
Mean (SD) 48200 (204000) 34800 (257000) 41200 (234000)
Median [Min, Max] 30000 [0, 9990000] 15000 [0, 9930000] 20000 [0, 9990000]
Missing 284 (4.8%) 398 (6.0%) 682 (5.4%)
educ
pri 1818 (30.6%) 1693 (25.5%) 3511 (27.9%)
middle 1202 (20.3%) 1124 (16.9%) 2326 (18.5%)
high 2524 (42.5%) 2672 (40.2%) 5196 (41.3%)
Missing 391 (6.6%) 1158 (17.4%) 1549 (12.3%)
table1(~ income|gender*educ, data=cgssx)
Overall
pri
(N=1818)
middle
(N=1202)
high
(N=2524)
pri
(N=1693)
middle
(N=1124)
high
(N=2672)
pri
(N=3511)
middle
(N=2326)
high
(N=5196)
income
Mean (SD) 32000 (43400) 55100 (299000) 62400 (232000) 24200 (113000) 43700 (311000) 48900 (339000) 28300 (84400) 49600 (305000) 55500 (292000)
Median [Min, Max] 25000 [0, 960000] 36000 [0, 9930000] 30000 [0, 9990000] 20000 [0, 4420000] 30000 [0, 9930000] 20000 [0, 9930000] 20000 [0, 4420000] 30000 [0, 9930000] 24000 [0, 9990000]
Missing 59 (3.2%) 62 (5.2%) 143 (5.7%) 90 (5.3%) 75 (6.7%) 162 (6.1%) 149 (4.2%) 137 (5.9%) 305 (5.9%)
table1(~ income|gender*educ, data=cgssx,overall=FALSE)
pri
(N=1818)
middle
(N=1202)
high
(N=2524)
pri
(N=1693)
middle
(N=1124)
high
(N=2672)
income
Mean (SD) 32000 (43400) 55100 (299000) 62400 (232000) 24200 (113000) 43700 (311000) 48900 (339000)
Median [Min, Max] 25000 [0, 960000] 36000 [0, 9930000] 30000 [0, 9990000] 20000 [0, 4420000] 30000 [0, 9930000] 20000 [0, 9930000]
Missing 59 (3.2%) 62 (5.2%) 143 (5.7%) 90 (5.3%) 75 (6.7%) 162 (6.1%)
# 作业2.1  根据自己所选问题 完成表格
# 阅读相关package文档 丰富修改自己的表格 不能和例题完全一致

#2.2 图形制作 
library(ggstatsplot)
## Registered S3 method overwritten by 'broom.mixed':
##   method      from 
##   tidy.gamlss broom
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## In case you would like cite this package, cite it as:
##      Patil, I. (2018). ggstatsplot: "ggplot2" Based Plots with Statistical Details. CRAN.
##      Retrieved from https://cran.r-project.org/web/packages/ggstatsplot/index.html
# plot
ggstatsplot::ggbetweenstats(
  data = cgssx,
  x = gender,
  y = income,
  title = "收入与性别"
)

cgssx$income[cgssx$income>= 999999] <- 999999 #清除异常值对比
ggstatsplot::ggbetweenstats(
  data = cgssx,
  x = gender,
  y = income,
  title = "收入与性别"
)

#To study an interaction between two categorical variables
ggstatsplot::ggpiestats(
  data = cgssx,
  x = educ,
  y = gender,
  title = "教育与性别", # title for the plot
  legend.title = "性别", # title for the legend
  caption = substitute(paste(italic("Source"), ": 1974 Motor Trend US magazine"))
)
## # A tibble: 2 x 12
##   gender counts  perc N     pri   middle high  statistic   p.value parameter
##   <fct>   <int> <dbl> <chr> <chr> <chr>  <chr>     <dbl>     <dbl>     <dbl>
## 1 女       5489  49.8 (n =~ 30.8~ 20.48% 48.6~      670. 3.00e-146         2
## 2 男       5544  50.2 (n =~ 32.7~ 21.68% 45.5~      474. 1.45e-103         2
## # ... with 2 more variables: method <chr>, significance <chr>

ggstatsplot::ggbarstats(
 data = cgssx,
  x = educ,
  y = gender,
  title = "Education",
  xlab = "性别",
  legend.title = "gender"
  #ggtheme = hrbrthemes::theme_ipsum_pub(),
)
## # A tibble: 2 x 12
##   gender counts  perc N     pri   middle high  statistic   p.value parameter
##   <fct>   <int> <dbl> <chr> <chr> <chr>  <chr>     <dbl>     <dbl>     <dbl>
## 1 女       5489  49.8 (n =~ 30.8~ 20.48% 48.6~      670. 3.00e-146         2
## 2 男       5544  50.2 (n =~ 32.7~ 21.68% 45.5~      474. 1.45e-103         2
## # ... with 2 more variables: method <chr>, significance <chr>

#使用ggplot
descr(cgssx$income)
## 
## ## Basic descriptive statistics
## 
##  var    type label     n NA.prc     mean       sd  se    md  trimmed
##   dd numeric    dd 11900   5.42 36291.57 71779.17 658 20000 24005.94
##              range   iqr skew
##  999999 (0-999999) 42000 7.86
cgssx$income1<-log(cgssx$income+1)
descr(cgssx$income1)#连续变量示例
## 
## ## Basic descriptive statistics
## 
##  var    type label     n NA.prc mean   sd   se  md trimmed           range
##   dd numeric    dd 11900   5.42 8.35 3.85 0.04 9.9    8.96 13.82 (0-13.82)
##       iqr  skew
##  2.707739 -1.48
c <- ggplot(cgssx[is.na(cgssx$income) == F,], aes(income))
c + geom_dotplot(binwidth = 500)+scale_x_continuous(limits = c(0, 999999))

c + geom_histogram()+ylim(0,1500)+xlim(0, 999999)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).

cgssx$income1<-log(cgssx$income+1)#收入数值取对数
descr(cgssx$income1)
## 
## ## Basic descriptive statistics
## 
##  var    type label     n NA.prc mean   sd   se  md trimmed           range
##   dd numeric    dd 11900   5.42 8.35 3.85 0.04 9.9    8.96 13.82 (0-13.82)
##       iqr  skew
##  2.707739 -1.48
d <- ggplot(cgssx[is.na(cgssx$income1) == F,], aes(income1))
d + geom_dotplot()+scale_x_continuous(limits = c(0, 14))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

d + geom_histogram()+ylim(0,2000)+xlim(0, 14)+ scale_x_continuous(
  breaks = c(2, 4, 6,8,10,12,14),
  label = c("two", "four", "six","eight","ten","twelve","fourteen")
)
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing missing values (geom_bar).

#常用的barplot
ggplot(cgssx[is.na(cgssx$fairplay) == F,], aes(x=fairplay)) +
  geom_bar(stat="count",position = "dodge",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))

ggplot(cgssx[is.na(cgssx$fairplay) == F,], aes(x=fairplay,fill=educ,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~gender)

ggplot(cgssx[is.na(cgssx$fairplay) == F,], aes(x=fairplay,fill=educ)) +geom_bar(stat="count",position = "dodge")+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))

cgssx%>%filter(!is.na(educ)&!is.na(fairplay))%>% 
  ggplot(.,aes(x=fairplay,fill=educ)) +
  geom_bar(stat="count",position = "dodge")+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))+theme(axis.text.x = element_text(size = 8, color = "blue", face = "bold", vjust = 0.5, hjust = 0.5, angle = 15))

# 作业2.2  根据自己所选问题和需要 完成图、表 
# 阅读相关package文档 丰富、修改、变化自己的图表 

#3 完成模型  https://rpubs.com/xiexiexiexiexie/685875
library(emmeans)
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(ggeffects)
names(cgssx)
##  [1] "gender"    "birth"     "edu"       "income"    "salary"    "fairplay" 
##  [7] "age"       "education" "educ"      "income1"
#linear models
sat_mod1 <- lm(income ~ age+educ, # regression formula
               data = cgssx) # data 
sat_mod2<-lm(income ~ age+educ + gender, data = cgssx) 
sat_mod3<-lm(income ~ age+educ + gender+age*educ*gender, data = cgssx) 
# model marginsplot ggeffects
mydf<-ggpredict(sat_mod3, terms = c("gender", "educ"),data = cgssx)
ggplot(mydf, aes(x = x, y = predicted, colour = group)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

mydf1<-ggpredict(sat_mod3, terms = c("educ", "gender"),data = cgssx)
ggplot(mydf1, aes(x = x, y = predicted, colour = group)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

mydf2<-ggpredict(sat_mod3, terms = c("age[30,40,50,60,70]","educ"),data = cgssx)
ggplot(mydf2, aes(x = x, y = predicted, colour = group)) +
  geom_line() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

mydf3<-ggpredict(sat_mod3, terms = c("age[30,40,50,60,70]","educ","gender"),data = cgssx)
ggplot(mydf3, aes(x = x, y = predicted, colour = group)) +
  geom_line() +
  stat_smooth(method = "lm", se = FALSE) +
  facet_wrap(~facet)
## `geom_smooth()` using formula 'y ~ x'

#观察以上图形的区别
library(emmeans) #emmeans

emmip(sat_mod3, educ ~ gender, cov.reduce = range)
## NOTE: Results may be misleading due to involvement in interactions

emmip(sat_mod3, gender ~ educ, cov.reduce = range)
## NOTE: Results may be misleading due to involvement in interactions

###  作业3.1 重新code教育变量、加入更多变量,完善对收入影响的研究
##课程论文作业 结合相关理论,解释模型结果,完成课程论文1

library(modelsummary) #model results output
## Warning: package 'modelsummary' was built under R version 4.0.3
models <- list(
  sat_mod1,
  sat_mod2 ,
  sat_mod3    
)
#word格式# modelsummary(models,"table3.docx",stars = TRUE, stars_note = TRUE)
modelsummary(models,stars = TRUE, stars_note = TRUE)
Model 1 Model 2 Model 3
(Intercept) 48782.716*** 57534.715*** 50061.329***
(2584.534) (2694.067) (6414.001)
age -424.360*** -449.533*** -349.990***
(44.228) (44.044) (119.472)
educmiddle 13314.416*** 13298.609*** 4264.826
(2038.347) (2027.067) (9034.030)
educhigh 19950.661*** 20474.229*** 66465.521***
(1662.987) (1654.489) (7869.612)
gender女 -15674.531*** -43825.011***
(1446.045) (9176.788)
age × educmiddle 203.730
(171.683)
age × educhigh -814.730***
(147.024)
age × gender女 666.757***
(173.996)
educmiddle × gender女 19187.717
(13209.903)
educhigh × gender女 -9599.947
(11141.823)
age × educmiddle × gender女 -408.235
(253.448)
age × educhigh × gender女 -19.055
(212.359)
Num.Obs. 10442 10442 10442
R2 0.023 0.034 0.047
R2 Adj. 0.023 0.034 0.046
AIC 263826.1 263711.2 263584.5
BIC 263862.3 263754.7 263678.8
Log.Lik. -131908.040 -131849.592 -131779.251
F 83.048 92.355 46.877
* p < 0.1, ** p < 0.05, *** p < 0.01
#glm probit models
m1 <- glm(fairplay ~ age+educ + gender+income1,
                data = cgssx, 
                family = binomial(link = "probit"))
m2 <- glm(fairplay ~ age+educ + gender+income1+income1*educ ,
                data = cgssx, 
                family = binomial(link = "probit"))
models1 <- list(
  m1,
  m2   
)
modelsummary(models1,stars = TRUE, stars_note = TRUE)
Model 1 Model 2
(Intercept) 1.087*** 1.067***
(0.077) (0.095)
age 0.003*** 0.003***
(0.001) (0.001)
educmiddle 0.030 0.070
(0.050) (0.123)
educhigh 0.127*** 0.150
(0.041) (0.098)
gender女 0.028 0.029
(0.037) (0.037)
income1 0.012*** 0.015*
(0.005) (0.008)
educmiddle × income1 -0.005
(0.013)
educhigh × income1 -0.003
(0.011)
Num.Obs. 10402 10402
AIC 5662.8 5666.6
BIC 5706.3 5724.6
Log.Lik. -2825.386 -2825.316
* p < 0.1, ** p < 0.05, *** p < 0.01
modelsummary(models1,stars = TRUE, stars_note = TRUE,exponentiate = TRUE)#change to OR
Model 1 Model 2
(Intercept) 2.964*** 2.906***
(0.077) (0.095)
age 1.003*** 1.003***
(0.001) (0.001)
educmiddle 1.030 1.073
(0.050) (0.123)
educhigh 1.136*** 1.162
(0.041) (0.098)
gender女 1.029 1.029
(0.037) (0.037)
income1 1.013*** 1.015*
(0.005) (0.008)
educmiddle × income1 0.995
(0.013)
educhigh × income1 0.997
(0.011)
Num.Obs. 10402 10402
AIC 5662.8 5666.6
BIC 5706.3 5724.6
Log.Lik. -2825.386 -2825.316
* p < 0.1, ** p < 0.05, *** p < 0.01
library(ordinal)#Cumulative Link Models
## 
## Attaching package: 'ordinal'
## The following object is masked from 'package:dplyr':
## 
##     slice
m3 <- clm(fairplay ~ age+educ + gender+income1,
          data = cgssx,na.action=na.omit)
m4 <- clm(fairplay ~ age+educ + gender+income1+income1*educ ,
          data = cgssx,na.action=na.omit)
modelsc <- list( m3,m4)
modelsummary(modelsc,stars = TRUE, coef_omit = "公平",stars_note = TRUE)
Model 1 Model 2
age 0.012*** 0.012***
(0.001) (0.001)
educmiddle -0.035 -0.102
(0.050) (0.132)
educhigh 0.182*** 0.055
(0.042) (0.105)
gender女 -0.091** -0.093**
(0.037) (0.037)
income1 0.006 -0.003
(0.005) (0.009)
educmiddle × income1 0.008
(0.014)
educhigh × income1 0.015
(0.011)
Num.Obs. 10402 10402
AIC 27636.4 27638.6
BIC 27701.7 27718.4
Log.Lik. -13809.2101519416 -13808.3156106251
edf 9 11
* p < 0.1, ** p < 0.05, *** p < 0.01
###对系数进行调整示例
cm <- c('gender女' = '性别女/男',
        'educhigh' = '高等教育/义务教育以下',
        'income1' = '收入'
        )
modelsummary(modelsc,coef_map = cm, stars = TRUE,stars_note = TRUE)
Model 1 Model 2
性别女/男 -0.091** -0.093**
(0.037) (0.037)
高等教育/义务教育以下 0.182*** 0.055
(0.042) (0.105)
收入 0.006 -0.003
(0.005) (0.009)
Num.Obs. 10402 10402
AIC 27636.4 27638.6
BIC 27701.7 27718.4
Log.Lik. -13809.2101519416 -13808.3156106251
edf 9 11
* p < 0.1, ** p < 0.05, *** p < 0.01
# 回归模型表格输出变化参见 modelsummary文档
library(effects)
plot(predictorEffects(m2), lines=list(multiline=TRUE))

#e2.lm1 <- predictorEffect("educ", m4)
#plot(e2.lm1)
ee<-emmeans(m2, "educ")
## NOTE: Results may be misleading due to involvement in interactions
plot(ee,xlab="公平感预测值",ylab="教育",intervals=TRUE)

plot(ee,xlab="公平感预测值",ylab="教育",intervals=FALSE)

emmip(m2,educ~income1, cov.reduce = range)

### 以上为模型可视化  
### 模型系数可视化
modelplot(sat_mod1,coef_omit = 'Interc')#use modelsummary

modelplot(models,coef_omit = 'Interc') +
  labs(x = 'Coefficients',
       y = '变量',
       title = 'Linear regression models of 收入',
       caption = "数据来源CGSS2017") 

###  作业3.2 选择自己感兴趣的问题 ,完成广义模型数据分析
# 结合相关理论,解释模型结果,完成课程论文2
# 阅读相关R包文档,变化、美化、丰富结果

##选作作业 完成对孝敬观念问题的因子分析,对因子进行解释
##选择因子得分为因变量,选取变量完成孝敬观念分析论文
library("dplyr")
d19<-dplyr::select(cgss2017,d191,d192,d193,d194,d195,d196,d197)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:modelsummary':
## 
##     SD
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
d19$d191<-as.numeric(d19$d191)
d19$d192<-as.numeric(d19$d192)
d19$d193<-as.numeric(d19$d193)
d19$d196<-as.numeric(d19$d196)
d19$d194<-as.numeric(d19$d194)
d19$d195<-as.numeric(d19$d194)
d19$d197<-as.numeric(d19$d197)
lowerCor(d19)
##      d191 d192 d193 d194 d195 d196 d197
## d191 1.00                              
## d192 0.51 1.00                         
## d193 0.26 0.29 1.00                    
## d194 0.30 0.40 0.03 1.00               
## d195 0.30 0.40 0.03 1.00 1.00          
## d196 0.24 0.20 0.30 0.06 0.06 1.00     
## d197 0.29 0.38 0.04 0.63 0.63 0.13 1.00
cor.plot(d19,numbers=TRUE,main="7 variables from Test Score")

test<-d19
uls <- fa(test,2,rotate="varimax")
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## In factor.scores, the correlation matrix is singular, an approximation is used
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
print(uls,sort=TRUE)
## Factor Analysis using method =  minres
## Call: fa(r = test, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      item   MR1  MR2   h2    u2 com
## d195    5  0.99 0.08 0.99 0.011 1.0
## d194    4  0.99 0.08 0.99 0.011 1.0
## d197    7  0.62 0.21 0.43 0.571 1.2
## d192    2  0.37 0.60 0.50 0.495 1.7
## d191    1  0.27 0.60 0.43 0.567 1.4
## d193    3 -0.03 0.51 0.26 0.740 1.0
## d196    6  0.03 0.42 0.18 0.823 1.0
## 
##                        MR1  MR2
## SS loadings           2.56 1.22
## Proportion Var        0.37 0.17
## Cumulative Var        0.37 0.54
## Proportion Explained  0.68 0.32
## Cumulative Proportion 0.68 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  21  and the objective function was  23.62 with Chi Square of  297047.1
## The degrees of freedom for the model are 8  and the objective function was  17.59 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic number of observations is  4109 with the empirical chi square  168.12  with prob <  3.2e-32 
## The total number of observations was  12582  with Likelihood Chi Square =  221165.6  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  -0.955
## RMSEA index =  1.482  and the 90 % confidence intervals are  1.477 1.488
## BIC =  221090.1
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.99 0.81
## Multiple R square of scores with factors          0.99 0.65
## Minimum correlation of possible factor scores     0.97 0.30
fa.diagram(uls,digits = 2,main="Test Score Factors")

fs <- factor.scores(test,uls)
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
fs <- fs$scores                   #get the columns of factor scores for each case
cgssff <- cbind(cgss2017,fs)              #append factor scores to dataset  
##因子得分以加入数据cgssff (顺从因子与关心因子)
#作业发送至 xysoc@gzhu.edu.cn