homework.R

#示例和作业/广州大学公共管理学院社会学系创新班课程

library(haven)
library(sjmisc)
library(sjlabelled)

## 
## Attaching package: 'sjlabelled'

## The following objects are masked from 'package:sjmisc':
## 
##     to_character, to_factor, to_label, to_numeric

## The following objects are masked from 'package:haven':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels

library(tidyverse)

## -- Attaching packages ------------- tidyverse 1.3.0 --

## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.3     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0

## -- Conflicts ---------------- tidyverse_conflicts() --
## x tibble::add_case()       masks sjmisc::add_case()
## x forcats::as_factor()     masks sjlabelled::as_factor(), haven::as_factor()
## x dplyr::as_label()        masks sjlabelled::as_label()
## x dplyr::filter()          masks stats::filter()
## x purrr::is_empty()        masks sjmisc::is_empty()
## x dplyr::lag()             masks stats::lag()
## x sjlabelled::read_sas()   masks haven::read_sas()
## x sjlabelled::read_spss()  masks haven::read_spss()
## x sjlabelled::read_stata() masks haven::read_stata()
## x tidyr::replace_na()      masks sjmisc::replace_na()
## x sjlabelled::write_sas()  masks haven::write_sas()
## x sjlabelled::zap_labels() masks haven::zap_labels()

#setwd("E:/course/cgss2017")
##模块1 数据读入和简单清理
cgss2017 <- read_dta("E:/course/cgss2017/cgss2017.dta")
#View(cgss2017)
cgss2017backup <- cgss2017 #backup
table(cgss2017$a15)

## 
##    1    2    3    4    5   98   99 
##  593 2014 3261 4409 2300    3    2

descr(cgss2017$a8a)  #注意excel 编码表  收入编码有坑

## 
## ## Basic descriptive statistics
## 
##  var    type                        label     n NA.prc   mean      sd       se
##   dd numeric 您个人去年(2016年)全年总收入 12580   0.02 580188 2265003 20194.29
##     md trimmed               range   iqr skew
##  24000  580188 9999999 (0-9999999) 46400 3.91

##处理异常值 读入数据
cgss2017[cgss2017 >= 9999997] <- NA
cgss2017[cgss2017 == 9999996] <- 999999
cgss2017[cgss2017 == 98] <- NA
cgss2017[cgss2017 == 99] <- NA
cgss2017 <- sjlabelled::drop_labels(cgss2017)
#str(cgss2017)
cgss2017 <- sjmisc::to_label(cgss2017)
descr(cgss2017$a8a)  #测试

## 
## ## Basic descriptive statistics
## 
##  var    type                        label     n NA.prc     mean       sd
##   dd numeric 您个人去年(2016年)全年总收入 11900   5.42 41156.86 233509.9
##       se    md  trimmed               range   iqr  skew
##  2140.58 20000 41156.86 9993600 (0-9993600) 42000 37.51

table(cgss2017$a15) #测试

## 
##   很不健康 比较不健康       一般   比较健康     很健康 
##        593       2014       3261       4409       2300

#cgss2017[cgss2017 == "拒绝回答"] <- NA
#cgss2017[cgss2017 == "不知道"] <- NA
#c2017[c2017=="不适用"] <- NA  #c2017[c2017=="无法回答"] <- NA  c2017[c2017=="无法选择"] <- NA c2017[c2017=="不好说"] <- NA
cgss2017 <- droplevels(cgss2017)
#copy&past 数据读入结束
#数据清理  select  rename label 编码和转化示例
library(tidyverse)
cgssx <-
  cgss2017 %>% select(
    gender = a2,
    birth = a31,
    edu = a7a,
    income = a8a,
    salary = a8b,
    gpg = a35
  )

cgssx <- rename(cgssx,  fairplay = gpg)
#cgss2017 %>% select(contains("a"))
#cgss2017 %>% select(a2,a31,a7a,a8a,a8b,a35)
cgssx$age <- 2017 - cgssx$birth
#cgssx<-filter(cgssx,cgssx$age>18)
#table(cgssx$edu)
cgssx$education <- as.numeric(cgssx$edu)
cgssx$educ <-
  car::recode(cgssx$education, "9:13='3';5:8='2';1;4='1';'14'=NA")#recode car package
table(cgssx$educ)

## 
##    1    2    3 
## 3511 2326 5196

cgssx$educ <-
  factor(cgssx$educ,
         levels = c(1, 2, 3),
         labels = c("pri", "middle", "high"))
table(cgssx$educ)#从类型到数值再到类型 教育变量为例

## 
##    pri middle   high 
##   3511   2326   5196

cgssx <- droplevels(cgssx)

###作业1  完成数据读入 选取感兴趣的变量生成子集  并且进行编码

### 模块2  描述性统计
# 2.1表格制作
library(gtsummary)

## Warning: package 'gtsummary' was built under R version 4.0.3

cgssx$salary <- as.numeric(cgssx$salary)#数值变量转换
cgssx$income <- as.numeric(cgssx$income)
tbl_summary(cgssx)

Characteristic	N = 12,582¹
gender
男	5,935 (47%)
女	6,647 (53%)
[年] 您的出生日期是什么？	1,965 (1,953, 1,979)
edu
没有受过任何教育	1,528 (12%)
私塾、扫盲班	91 (0.7%)
小学	2,726 (22%)
初中	3,511 (28%)
职业高中	163 (1.3%)
普通高中	1,470 (12%)
中专	530 (4.2%)
技校	72 (0.6%)
大学专科（成人高等教育）	377 (3.0%)
大学专科（正规高等教育）	674 (5.4%)
大学本科（成人高等教育）	299 (2.4%)
大学本科（正规高等教育）	948 (7.5%)
研究生及以上	172 (1.4%)
其他	21 (0.2%)
income	20,000 (3,000, 45,000)
Unknown	682
salary	10,000 (0, 40,000)
Unknown	890
fairplay
完全不公平	962 (7.7%)
比较不公平	3,182 (25%)
说不上公平但也不能说不公平	2,444 (20%)
比较公平	5,490 (44%)
完全公平	444 (3.5%)
Unknown	60
[年] 您的出生日期是什么？	52 (38, 64)
education	4.0 (3.0, 6.0)
educ
pri	3,511 (32%)
middle	2,326 (21%)
high	5,196 (47%)
Unknown	1,549
¹ Statistics presented: n (%); Median (IQR)

tbl_summary(cgssx,statistic = list(all_continuous() ~ "{mean}({sd}) {median} ({min}, {p25}, {p75},{max})"),label = list(age = "age"),digits = list(age ~ c(2, 2), c(income) ~ 1))

Characteristic	N = 12,582¹
gender
男	5,935 (47%)
女	6,647 (53%)
[年] 您的出生日期是什么？	1,966(17) 1,965 (1,914, 1,953, 1,979,1,999)
edu
没有受过任何教育	1,528 (12%)
私塾、扫盲班	91 (0.7%)
小学	2,726 (22%)
初中	3,511 (28%)
职业高中	163 (1.3%)
普通高中	1,470 (12%)
中专	530 (4.2%)
技校	72 (0.6%)
大学专科（成人高等教育）	377 (3.0%)
大学专科（正规高等教育）	674 (5.4%)
大学本科（成人高等教育）	299 (2.4%)
大学本科（正规高等教育）	948 (7.5%)
研究生及以上	172 (1.4%)
其他	21 (0.2%)
income	41,156.9(233,509.9) 20,000.0 (0.0, 3,000.0, 45,000.0,9,993,600.0)
Unknown	682
salary	33,982(212,555) 10,000 (0, 0, 40,000,9,980,000)
Unknown	890
fairplay
完全不公平	962 (7.7%)
比较不公平	3,182 (25%)
说不上公平但也不能说不公平	2,444 (20%)
比较公平	5,490 (44%)
完全公平	444 (3.5%)
Unknown	60
age	51.01(16.86) 52.00 (18.00, 38.00, 64.00,103.00)
education	5.2(3.3) 4.0 (1.0, 3.0, 6.0,14.0)
educ
pri	3,511 (32%)
middle	2,326 (21%)
high	5,196 (47%)
Unknown	1,549
¹ Statistics presented: n (%); Mean(SD) Median (Minimum, IQR,Maximum)

cgssx %>%
  select(gender, edu, income, age, educ) %>% tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean}({sd}) {median} ({min}, {p25}, {p75},{max})"
    ),
    label = list(age = "age"),
    digits = list(age ~ c(2, 2), c(income) ~ 1)
  )

Characteristic	N = 12,582¹
gender
男	5,935 (47%)
女	6,647 (53%)
edu
没有受过任何教育	1,528 (12%)
私塾、扫盲班	91 (0.7%)
小学	2,726 (22%)
初中	3,511 (28%)
职业高中	163 (1.3%)
普通高中	1,470 (12%)
中专	530 (4.2%)
技校	72 (0.6%)
大学专科（成人高等教育）	377 (3.0%)
大学专科（正规高等教育）	674 (5.4%)
大学本科（成人高等教育）	299 (2.4%)
大学本科（正规高等教育）	948 (7.5%)
研究生及以上	172 (1.4%)
其他	21 (0.2%)
income	41,156.9(233,509.9) 20,000.0 (0.0, 3,000.0, 45,000.0,9,993,600.0)
Unknown	682
age	51.01(16.86) 52.00 (18.00, 38.00, 64.00,103.00)
educ
pri	3,511 (32%)
middle	2,326 (21%)
high	5,196 (47%)
Unknown	1,549
¹ Statistics presented: n (%); Mean(SD) Median (Minimum, IQR,Maximum)

cgssx %>%
  select(gender, edu, income, age, educ) %>% tbl_summary(
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{N_nonmiss}",
                           "{median} ({p25}, {p75})", 
                           "{min}, {max}")
    ),
    label = list(age = "age"),
    digits = list(age ~ c(2, 2), c(income) ~ 1)
  )

Characteristic	N = 12,582
gender
男	5,935 (47%)
女	6,647 (53%)
edu
没有受过任何教育	1,528 (12%)
私塾、扫盲班	91 (0.7%)
小学	2,726 (22%)
初中	3,511 (28%)
职业高中	163 (1.3%)
普通高中	1,470 (12%)
中专	530 (4.2%)
技校	72 (0.6%)
大学专科（成人高等教育）	377 (3.0%)
大学专科（正规高等教育）	674 (5.4%)
大学本科（成人高等教育）	299 (2.4%)
大学本科（正规高等教育）	948 (7.5%)
研究生及以上	172 (1.4%)
其他	21 (0.2%)
income
N	11,900.0
Median (IQR)	20,000.0 (3,000.0, 45,000.0)
Range	0.0, 9,993,600.0
Unknown	682
age
N	12,582.00
Median (IQR)	52.00 (38.00, 64.00)
Range	18.00, 103.00
educ
pri	3,511 (32%)
middle	2,326 (21%)
high	5,196 (47%)
Unknown	1,549

cgssx %>%
  select(gender, income, age, educ) %>% tbl_summary(
    by = gender, missing = "no",
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c("{N_nonmiss}",
                           "{median} ({p25}, {p75})", 
                           "{min}, {max}")
    ),
    label = list(age = "age"),
    digits = list(age ~ c(2, 2), c(income) ~ 1)
  ) %>%
  add_n() %>% # add column with total number of non-missing observations
  add_p() %>% # test for a difference between groups
  modify_header(label = "**Variable**") %>% # update the column header
  bold_labels()

Variable	N	男, N = 5,935	女, N = 6,647	p-value¹
income	11,900.0			<0.001
N		5,651.0	6,249.0
Median (IQR)		30,000.0 (7,000.0, 50,000.0)	15,000.0 (1,000.0, 36,000.0)
Range		0.0, 9,993,600.0	0.0, 9,930,000.0
age	12,582.00			0.2
N		5,935.00	6,647.00
Median (IQR)		52.00 (37.00, 64.00)	51.00 (38.00, 64.00)
Range		18.00, 103.00	18.00, 101.00
educ	11,033			0.004
pri		1,818 (33%)	1,693 (31%)
middle		1,202 (22%)	1,124 (20%)
high		2,524 (46%)	2,672 (49%)
¹ Statistical tests performed: Wilcoxon rank-sum test; chi-square test of independence

#solution2
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

table1(~ gender+edu+income+ age+educ, data=cgssx)

	Overall (N=12582)
gender
男	5935 (47.2%)
女	6647 (52.8%)
edu
没有受过任何教育	1528 (12.1%)
私塾、扫盲班	91 (0.7%)
小学	2726 (21.7%)
初中	3511 (27.9%)
职业高中	163 (1.3%)
普通高中	1470 (11.7%)
中专	530 (4.2%)
技校	72 (0.6%)
大学专科（成人高等教育）	377 (3.0%)
大学专科（正规高等教育）	674 (5.4%)
大学本科（成人高等教育）	299 (2.4%)
大学本科（正规高等教育）	948 (7.5%)
研究生及以上	172 (1.4%)
其他	21 (0.2%)
income
Mean (SD)	41200 (234000)
Median [Min, Max]	20000 [0, 9990000]
Missing	682 (5.4%)
[年] 您的出生日期是什么？
Mean (SD)	51.0 (16.9)
Median [Min, Max]	52.0 [18.0, 103]
educ
pri	3511 (27.9%)
middle	2326 (18.5%)
high	5196 (41.3%)
Missing	1549 (12.3%)

table1(~ edu+income+educ|gender, data=cgssx)

	男 (N=5935)	女 (N=6647)	Overall (N=12582)
edu
没有受过任何教育	383 (6.5%)	1145 (17.2%)	1528 (12.1%)
私塾、扫盲班	38 (0.6%)	53 (0.8%)	91 (0.7%)
小学	1258 (21.2%)	1468 (22.1%)	2726 (21.7%)
初中	1818 (30.6%)	1693 (25.5%)	3511 (27.9%)
职业高中	85 (1.4%)	78 (1.2%)	163 (1.3%)
普通高中	768 (12.9%)	702 (10.6%)	1470 (11.7%)
中专	264 (4.4%)	266 (4.0%)	530 (4.2%)
技校	47 (0.8%)	25 (0.4%)	72 (0.6%)
大学专科（成人高等教育）	205 (3.5%)	172 (2.6%)	377 (3.0%)
大学专科（正规高等教育）	338 (5.7%)	336 (5.1%)	674 (5.4%)
大学本科（成人高等教育）	147 (2.5%)	152 (2.3%)	299 (2.4%)
大学本科（正规高等教育）	486 (8.2%)	462 (7.0%)	948 (7.5%)
研究生及以上	90 (1.5%)	82 (1.2%)	172 (1.4%)
其他	8 (0.1%)	13 (0.2%)	21 (0.2%)
income
Mean (SD)	48200 (204000)	34800 (257000)	41200 (234000)
Median [Min, Max]	30000 [0, 9990000]	15000 [0, 9930000]	20000 [0, 9990000]
Missing	284 (4.8%)	398 (6.0%)	682 (5.4%)
educ
pri	1818 (30.6%)	1693 (25.5%)	3511 (27.9%)
middle	1202 (20.3%)	1124 (16.9%)	2326 (18.5%)
high	2524 (42.5%)	2672 (40.2%)	5196 (41.3%)
Missing	391 (6.6%)	1158 (17.4%)	1549 (12.3%)

table1(~ income|gender*educ, data=cgssx)

	男			女			Overall
	pri (N=1818)	middle (N=1202)	high (N=2524)	pri (N=1693)	middle (N=1124)	high (N=2672)	pri (N=3511)	middle (N=2326)	high (N=5196)
income
Mean (SD)	32000 (43400)	55100 (299000)	62400 (232000)	24200 (113000)	43700 (311000)	48900 (339000)	28300 (84400)	49600 (305000)	55500 (292000)
Median [Min, Max]	25000 [0, 960000]	36000 [0, 9930000]	30000 [0, 9990000]	20000 [0, 4420000]	30000 [0, 9930000]	20000 [0, 9930000]	20000 [0, 4420000]	30000 [0, 9930000]	24000 [0, 9990000]
Missing	59 (3.2%)	62 (5.2%)	143 (5.7%)	90 (5.3%)	75 (6.7%)	162 (6.1%)	149 (4.2%)	137 (5.9%)	305 (5.9%)

table1(~ income|gender*educ, data=cgssx,overall=FALSE)

	男			女
	pri (N=1818)	middle (N=1202)	high (N=2524)	pri (N=1693)	middle (N=1124)	high (N=2672)
income
Mean (SD)	32000 (43400)	55100 (299000)	62400 (232000)	24200 (113000)	43700 (311000)	48900 (339000)
Median [Min, Max]	25000 [0, 960000]	36000 [0, 9930000]	30000 [0, 9990000]	20000 [0, 4420000]	30000 [0, 9930000]	20000 [0, 9930000]
Missing	59 (3.2%)	62 (5.2%)	143 (5.7%)	90 (5.3%)	75 (6.7%)	162 (6.1%)

# 作业2.1  根据自己所选问题 完成表格
# 阅读相关package文档 丰富修改自己的表格 不能和例题完全一致

#2.2 图形制作 
library(ggstatsplot)

## Registered S3 method overwritten by 'broom.mixed':
##   method      from 
##   tidy.gamlss broom

## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car

## In case you would like cite this package, cite it as:
##      Patil, I. (2018). ggstatsplot: "ggplot2" Based Plots with Statistical Details. CRAN.
##      Retrieved from https://cran.r-project.org/web/packages/ggstatsplot/index.html

# plot
ggstatsplot::ggbetweenstats(
  data = cgssx,
  x = gender,
  y = income,
  title = "收入与性别"
)

cgssx$income[cgssx$income>= 999999] <- 999999 #清除异常值对比
ggstatsplot::ggbetweenstats(
  data = cgssx,
  x = gender,
  y = income,
  title = "收入与性别"
)

#To study an interaction between two categorical variables
ggstatsplot::ggpiestats(
  data = cgssx,
  x = educ,
  y = gender,
  title = "教育与性别", # title for the plot
  legend.title = "性别", # title for the legend
  caption = substitute(paste(italic("Source"), ": 1974 Motor Trend US magazine"))
)

## # A tibble: 2 x 12
##   gender counts  perc N     pri   middle high  statistic   p.value parameter
##   <fct>   <int> <dbl> <chr> <chr> <chr>  <chr>     <dbl>     <dbl>     <dbl>
## 1 女       5489  49.8 (n =~ 30.8~ 20.48% 48.6~      670. 3.00e-146         2
## 2 男       5544  50.2 (n =~ 32.7~ 21.68% 45.5~      474. 1.45e-103         2
## # ... with 2 more variables: method <chr>, significance <chr>

ggstatsplot::ggbarstats(
 data = cgssx,
  x = educ,
  y = gender,
  title = "Education",
  xlab = "性别",
  legend.title = "gender"
  #ggtheme = hrbrthemes::theme_ipsum_pub(),
)

## # A tibble: 2 x 12
##   gender counts  perc N     pri   middle high  statistic   p.value parameter
##   <fct>   <int> <dbl> <chr> <chr> <chr>  <chr>     <dbl>     <dbl>     <dbl>
## 1 女       5489  49.8 (n =~ 30.8~ 20.48% 48.6~      670. 3.00e-146         2
## 2 男       5544  50.2 (n =~ 32.7~ 21.68% 45.5~      474. 1.45e-103         2
## # ... with 2 more variables: method <chr>, significance <chr>

#使用ggplot
descr(cgssx$income)

## 
## ## Basic descriptive statistics
## 
##  var    type label     n NA.prc     mean       sd  se    md  trimmed
##   dd numeric    dd 11900   5.42 36291.57 71779.17 658 20000 24005.94
##              range   iqr skew
##  999999 (0-999999) 42000 7.86

cgssx$income1<-log(cgssx$income+1)
descr(cgssx$income1)#连续变量示例

## 
## ## Basic descriptive statistics
## 
##  var    type label     n NA.prc mean   sd   se  md trimmed           range
##   dd numeric    dd 11900   5.42 8.35 3.85 0.04 9.9    8.96 13.82 (0-13.82)
##       iqr  skew
##  2.707739 -1.48

c <- ggplot(cgssx[is.na(cgssx$income) == F,], aes(income))
c + geom_dotplot(binwidth = 500)+scale_x_continuous(limits = c(0, 999999))

c + geom_histogram()+ylim(0,1500)+xlim(0, 999999)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 3 rows containing missing values (geom_bar).

cgssx$income1<-log(cgssx$income+1)#收入数值取对数
descr(cgssx$income1)

## 
## ## Basic descriptive statistics
## 
##  var    type label     n NA.prc mean   sd   se  md trimmed           range
##   dd numeric    dd 11900   5.42 8.35 3.85 0.04 9.9    8.96 13.82 (0-13.82)
##       iqr  skew
##  2.707739 -1.48

d <- ggplot(cgssx[is.na(cgssx$income1) == F,], aes(income1))
d + geom_dotplot()+scale_x_continuous(limits = c(0, 14))

## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

d + geom_histogram()+ylim(0,2000)+xlim(0, 14)+ scale_x_continuous(
  breaks = c(2, 4, 6,8,10,12,14),
  label = c("two", "four", "six","eight","ten","twelve","fourteen")
)

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing missing values (geom_bar).

#常用的barplot
ggplot(cgssx[is.na(cgssx$fairplay) == F,], aes(x=fairplay)) +
  geom_bar(stat="count",position = "dodge",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))

ggplot(cgssx[is.na(cgssx$fairplay) == F,], aes(x=fairplay,fill=educ,na.rm = TRUE)) +
  geom_bar(stat="count",na.rm = TRUE)+geom_text(stat='count',aes(label=..count..),position=position_stack(vjust=0.5))+
 facet_wrap(~gender)

ggplot(cgssx[is.na(cgssx$fairplay) == F,], aes(x=fairplay,fill=educ)) +geom_bar(stat="count",position = "dodge")+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))

cgssx%>%filter(!is.na(educ)&!is.na(fairplay))%>% 
  ggplot(.,aes(x=fairplay,fill=educ)) +
  geom_bar(stat="count",position = "dodge")+geom_text(stat='count',aes(label=..count..),vjust=-1,position=position_dodge(width=0.9))+theme(axis.text.x = element_text(size = 8, color = "blue", face = "bold", vjust = 0.5, hjust = 0.5, angle = 15))

# 作业2.2  根据自己所选问题和需要 完成图、表 
# 阅读相关package文档 丰富、修改、变化自己的图表 

#3 完成模型  https://rpubs.com/xiexiexiexiexie/685875
library(emmeans)
library(effects)

## Loading required package: carData

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(ggeffects)
names(cgssx)

##  [1] "gender"    "birth"     "edu"       "income"    "salary"    "fairplay" 
##  [7] "age"       "education" "educ"      "income1"

#linear models
sat_mod1 <- lm(income ~ age+educ, # regression formula
               data = cgssx) # data 
sat_mod2<-lm(income ~ age+educ + gender, data = cgssx) 
sat_mod3<-lm(income ~ age+educ + gender+age*educ*gender, data = cgssx) 
# model marginsplot ggeffects
mydf<-ggpredict(sat_mod3, terms = c("gender", "educ"),data = cgssx)
ggplot(mydf, aes(x = x, y = predicted, colour = group)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

mydf1<-ggpredict(sat_mod3, terms = c("educ", "gender"),data = cgssx)
ggplot(mydf1, aes(x = x, y = predicted, colour = group)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

mydf2<-ggpredict(sat_mod3, terms = c("age[30,40,50,60,70]","educ"),data = cgssx)
ggplot(mydf2, aes(x = x, y = predicted, colour = group)) +
  geom_line() +
  stat_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

mydf3<-ggpredict(sat_mod3, terms = c("age[30,40,50,60,70]","educ","gender"),data = cgssx)
ggplot(mydf3, aes(x = x, y = predicted, colour = group)) +
  geom_line() +
  stat_smooth(method = "lm", se = FALSE) +
  facet_wrap(~facet)

## `geom_smooth()` using formula 'y ~ x'

#观察以上图形的区别
library(emmeans) #emmeans

emmip(sat_mod3, educ ~ gender, cov.reduce = range)

## NOTE: Results may be misleading due to involvement in interactions

emmip(sat_mod3, gender ~ educ, cov.reduce = range)

## NOTE: Results may be misleading due to involvement in interactions

###  作业3.1 重新code教育变量、加入更多变量，完善对收入影响的研究
##课程论文作业 结合相关理论，解释模型结果，完成课程论文1

library(modelsummary) #model results output

## Warning: package 'modelsummary' was built under R version 4.0.3

models <- list(
  sat_mod1,
  sat_mod2 ,
  sat_mod3    
)
#word格式# modelsummary(models,"table3.docx",stars = TRUE, stars_note = TRUE)
modelsummary(models,stars = TRUE, stars_note = TRUE)

	Model 1	Model 2	Model 3
(Intercept)	48782.716***	57534.715***	50061.329***
	(2584.534)	(2694.067)	(6414.001)
age	-424.360***	-449.533***	-349.990***
	(44.228)	(44.044)	(119.472)
educmiddle	13314.416***	13298.609***	4264.826
	(2038.347)	(2027.067)	(9034.030)
educhigh	19950.661***	20474.229***	66465.521***
	(1662.987)	(1654.489)	(7869.612)
gender女		-15674.531***	-43825.011***
		(1446.045)	(9176.788)
age × educmiddle			203.730
			(171.683)
age × educhigh			-814.730***
			(147.024)
age × gender女			666.757***
			(173.996)
educmiddle × gender女			19187.717
			(13209.903)
educhigh × gender女			-9599.947
			(11141.823)
age × educmiddle × gender女			-408.235
			(253.448)
age × educhigh × gender女			-19.055
			(212.359)
Num.Obs.	10442	10442	10442
R2	0.023	0.034	0.047
R2 Adj.	0.023	0.034	0.046
AIC	263826.1	263711.2	263584.5
BIC	263862.3	263754.7	263678.8
Log.Lik.	-131908.040	-131849.592	-131779.251
F	83.048	92.355	46.877
* p < 0.1, p < 0.05, * p < 0.01

#glm probit models
m1 <- glm(fairplay ~ age+educ + gender+income1,
                data = cgssx, 
                family = binomial(link = "probit"))
m2 <- glm(fairplay ~ age+educ + gender+income1+income1*educ ,
                data = cgssx, 
                family = binomial(link = "probit"))
models1 <- list(
  m1,
  m2   
)
modelsummary(models1,stars = TRUE, stars_note = TRUE)

	Model 1	Model 2
(Intercept)	1.087***	1.067***
	(0.077)	(0.095)
age	0.003***	0.003***
	(0.001)	(0.001)
educmiddle	0.030	0.070
	(0.050)	(0.123)
educhigh	0.127***	0.150
	(0.041)	(0.098)
gender女	0.028	0.029
	(0.037)	(0.037)
income1	0.012***	0.015*
	(0.005)	(0.008)
educmiddle × income1		-0.005
		(0.013)
educhigh × income1		-0.003
		(0.011)
Num.Obs.	10402	10402
AIC	5662.8	5666.6
BIC	5706.3	5724.6
Log.Lik.	-2825.386	-2825.316
* p < 0.1, p < 0.05, * p < 0.01

modelsummary(models1,stars = TRUE, stars_note = TRUE,exponentiate = TRUE)#change to OR

	Model 1	Model 2
(Intercept)	2.964***	2.906***
	(0.077)	(0.095)
age	1.003***	1.003***
	(0.001)	(0.001)
educmiddle	1.030	1.073
	(0.050)	(0.123)
educhigh	1.136***	1.162
	(0.041)	(0.098)
gender女	1.029	1.029
	(0.037)	(0.037)
income1	1.013***	1.015*
	(0.005)	(0.008)
educmiddle × income1		0.995
		(0.013)
educhigh × income1		0.997
		(0.011)
Num.Obs.	10402	10402
AIC	5662.8	5666.6
BIC	5706.3	5724.6
Log.Lik.	-2825.386	-2825.316
* p < 0.1, p < 0.05, * p < 0.01

library(ordinal)#Cumulative Link Models

## 
## Attaching package: 'ordinal'

## The following object is masked from 'package:dplyr':
## 
##     slice

m3 <- clm(fairplay ~ age+educ + gender+income1,
          data = cgssx,na.action=na.omit)
m4 <- clm(fairplay ~ age+educ + gender+income1+income1*educ ,
          data = cgssx,na.action=na.omit)
modelsc <- list( m3,m4)
modelsummary(modelsc,stars = TRUE, coef_omit = "公平",stars_note = TRUE)

	Model 1	Model 2
age	0.012***	0.012***
	(0.001)	(0.001)
educmiddle	-0.035	-0.102
	(0.050)	(0.132)
educhigh	0.182***	0.055
	(0.042)	(0.105)
gender女	-0.091**	-0.093**
	(0.037)	(0.037)
income1	0.006	-0.003
	(0.005)	(0.009)
educmiddle × income1		0.008
		(0.014)
educhigh × income1		0.015
		(0.011)
Num.Obs.	10402	10402
AIC	27636.4	27638.6
BIC	27701.7	27718.4
Log.Lik.	-13809.2101519416	-13808.3156106251
edf	9	11
* p < 0.1, p < 0.05, * p < 0.01

###对系数进行调整示例
cm <- c('gender女' = '性别女/男',
        'educhigh' = '高等教育/义务教育以下',
        'income1' = '收入'
        )
modelsummary(modelsc,coef_map = cm, stars = TRUE,stars_note = TRUE)

	Model 1	Model 2
性别女/男	-0.091**	-0.093**
	(0.037)	(0.037)
高等教育/义务教育以下	0.182***	0.055
	(0.042)	(0.105)
收入	0.006	-0.003
	(0.005)	(0.009)
Num.Obs.	10402	10402
AIC	27636.4	27638.6
BIC	27701.7	27718.4
Log.Lik.	-13809.2101519416	-13808.3156106251
edf	9	11
* p < 0.1, p < 0.05, * p < 0.01

# 回归模型表格输出变化参见 modelsummary文档
library(effects)
plot(predictorEffects(m2), lines=list(multiline=TRUE))

#e2.lm1 <- predictorEffect("educ", m4)
#plot(e2.lm1)
ee<-emmeans(m2, "educ")

## NOTE: Results may be misleading due to involvement in interactions

plot(ee,xlab="公平感预测值",ylab="教育",intervals=TRUE)

plot(ee,xlab="公平感预测值",ylab="教育",intervals=FALSE)

emmip(m2,educ~income1, cov.reduce = range)

### 以上为模型可视化  
### 模型系数可视化
modelplot(sat_mod1,coef_omit = 'Interc')#use modelsummary

modelplot(models,coef_omit = 'Interc') +
  labs(x = 'Coefficients',
       y = '变量',
       title = 'Linear regression models of 收入',
       caption = "数据来源CGSS2017")

###  作业3.2 选择自己感兴趣的问题 ，完成广义模型数据分析
# 结合相关理论，解释模型结果，完成课程论文2
# 阅读相关R包文档，变化、美化、丰富结果

##选作作业 完成对孝敬观念问题的因子分析，对因子进行解释
##选择因子得分为因变量，选取变量完成孝敬观念分析论文
library("dplyr")
d19<-dplyr::select(cgss2017,d191,d192,d193,d194,d195,d196,d197)
library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:modelsummary':
## 
##     SD

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

d19$d191<-as.numeric(d19$d191)
d19$d192<-as.numeric(d19$d192)
d19$d193<-as.numeric(d19$d193)
d19$d196<-as.numeric(d19$d196)
d19$d194<-as.numeric(d19$d194)
d19$d195<-as.numeric(d19$d194)
d19$d197<-as.numeric(d19$d197)
lowerCor(d19)

##      d191 d192 d193 d194 d195 d196 d197
## d191 1.00                              
## d192 0.51 1.00                         
## d193 0.26 0.29 1.00                    
## d194 0.30 0.40 0.03 1.00               
## d195 0.30 0.40 0.03 1.00 1.00          
## d196 0.24 0.20 0.30 0.06 0.06 1.00     
## d197 0.29 0.38 0.04 0.63 0.63 0.13 1.00

cor.plot(d19,numbers=TRUE,main="7 variables from Test Score")

test<-d19
uls <- fa(test,2,rotate="varimax")

## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done

## In smc, smcs < 0 were set to .0

## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done

## In smc, smcs < 0 were set to .0

## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done

## In smc, smcs < 0 were set to .0

## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done

## In factor.scores, the correlation matrix is singular, an approximation is used

## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done

print(uls,sort=TRUE)

## Factor Analysis using method =  minres
## Call: fa(r = test, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      item   MR1  MR2   h2    u2 com
## d195    5  0.99 0.08 0.99 0.011 1.0
## d194    4  0.99 0.08 0.99 0.011 1.0
## d197    7  0.62 0.21 0.43 0.571 1.2
## d192    2  0.37 0.60 0.50 0.495 1.7
## d191    1  0.27 0.60 0.43 0.567 1.4
## d193    3 -0.03 0.51 0.26 0.740 1.0
## d196    6  0.03 0.42 0.18 0.823 1.0
## 
##                        MR1  MR2
## SS loadings           2.56 1.22
## Proportion Var        0.37 0.17
## Cumulative Var        0.37 0.54
## Proportion Explained  0.68 0.32
## Cumulative Proportion 0.68 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  21  and the objective function was  23.62 with Chi Square of  297047.1
## The degrees of freedom for the model are 8  and the objective function was  17.59 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic number of observations is  4109 with the empirical chi square  168.12  with prob <  3.2e-32 
## The total number of observations was  12582  with Likelihood Chi Square =  221165.6  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  -0.955
## RMSEA index =  1.482  and the 90 % confidence intervals are  1.477 1.488
## BIC =  221090.1
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.99 0.81
## Multiple R square of scores with factors          0.99 0.65
## Minimum correlation of possible factor scores     0.97 0.30

fa.diagram(uls,digits = 2,main="Test Score Factors")

fs <- factor.scores(test,uls)

## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done

fs <- fs$scores                   #get the columns of factor scores for each case
cgssff <- cbind(cgss2017,fs)              #append factor scores to dataset  
##因子得分以加入数据cgssff （顺从因子与关心因子）
#作业发送至 xysoc@gzhu.edu.cn

homework.R

Administrator

2020-11-29