library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(gtsummary)
library(readstata13)
# Load the CGSS 2021 Stata file into a data frame.
CGSS <- readstata13::read.dta13(file = "D:/cgss2021/cgss2021/CGSS2021.dta")
## Warning in read.dta13("D:/cgss2021/cgss2021/CGSS2021.dta"): 
##    Duplicated factor levels for variables
## 
##    XA1, XA6, XB1, XB6, XC1, XC6, isco08_a57e,
##    isco08_a59d, isco08_a60d, isco08_sp,
##    isco08_f, isco08_m
## 
##    Unique labels for these variables have been generated.
## Warning in read.dta13("D:/cgss2021/cgss2021/CGSS2021.dta"): 
##    Factor codes of type double or float detected in variables
## 
##    type, A8a, A8b, L2, L4b, L5, L8a, L8b, L9,
##    A60m, A62, A75a, A75b
## 
##    No labels have been assigned.
##    Set option 'nonint.factors = TRUE' to assign labels anyway.
## Warning in read.dta13("D:/cgss2021/cgss2021/CGSS2021.dta"): 
##    Missing factor labels for variables
## 
##    P18
## 
##    No labels have been assigned.
##    Set option 'generate.factors=TRUE' to generate labels.
# Values of 800 or more in A13 and A14 are set to NA before plotting.
# NOTE(review): presumably these are the survey's special "don't know /
# refused" codes -- confirm against the codebook.
CGSS$A13 <- replace(CGSS$A13, CGSS$A13 >= 800, NA)
CGSS$A14 <- replace(CGSS$A14, CGSS$A14 >= 800, NA)

# Plain scatter plot of A13 (x) against A14 (y).
CGSS %>%
  ggplot(aes(A13, A14)) +
  geom_point()
## Warning: Removed 179 rows containing missing values (`geom_point()`).

# Same scatter plot, with point colour mapped to A2 at the plot level.
CGSS %>%
  ggplot(aes(A13, A14, colour = A2)) +
  geom_point()
## Warning: Removed 179 rows containing missing values (`geom_point()`).

# Colour is mapped at the plot level, so both layers inherit it and
# geom_smooth() fits one linear model per A2 group.
CGSS %>%
  ggplot(aes(A13, A14, colour = A2)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 179 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 179 rows containing missing values (`geom_point()`).

# Colour is mapped only inside geom_point(), so the points are coloured by A2
# while geom_smooth() fits a single linear model to all rows.
CGSS %>%
  ggplot(aes(A13, A14)) +
  geom_point(aes(colour = A2)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 179 rows containing non-finite values (`stat_smooth()`).
## Removed 179 rows containing missing values (`geom_point()`).

# Points encode A2 through both colour and shape; the linear fit remains a
# single line because the smooth layer inherits only x and y.
CGSS %>%
  ggplot(aes(A13, A14)) +
  geom_point(aes(colour = A2, shape = A2)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 179 rows containing non-finite values (`stat_smooth()`).
## Removed 179 rows containing missing values (`geom_point()`).

# Scatter plot of A13 (height) against A14 (weight), with points coloured and
# shaped by A2, one overall linear fit, and the ggthemes colorblind palette.
# The title and axis labels describe the variables actually plotted (height
# and weight); the previous labels referred to penguin flipper length and
# body mass, which do not exist in this data set.
ggplot(
  data = CGSS,
  mapping = aes(x = A13, y = A14)
) +
  geom_point(aes(color = A2, shape = A2)) +
  geom_smooth(method = "lm") +
  labs(
    title = "身高与体重",
    subtitle = "CGSS2021",
    x = "身高", y = "体重",
    # Same legend title for both aesthetics so ggplot2 merges them into one.
    color = "A2", shape = "A2"
  ) +
  scale_color_colorblind()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 179 rows containing non-finite values (`stat_smooth()`).
## Removed 179 rows containing missing values (`geom_point()`).

# Height/weight scatter plot with points coloured and shaped by A2, a single
# linear fit, and explicit title, subtitle, axis and legend labels.
CGSS %>%
  ggplot(aes(A13, A14)) +
  geom_point(aes(colour = A2, shape = A2)) +
  geom_smooth(method = "lm") +
  ggtitle(label = "身高与体重", subtitle = " CGSS2021") +
  xlab("身高") +
  ylab("体重") +
  # Identical legend titles keep colour and shape in one merged legend.
  labs(colour = "A2", shape = "A2")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 179 rows containing non-finite values (`stat_smooth()`).
## Removed 179 rows containing missing values (`geom_point()`).

# Keep only the analysis variables, giving each survey item a readable name
# (new_name = original_column).
cgss2021 <- select(
  CGSS,
  stype     = A27d,
  gender    = A2,
  birth     = A3_1,
  height    = A13,
  weight    = A14,
  edu       = A7a,
  poli      = A10,
  income    = A8a,
  famincome = A62,
  health    = A15,
  happy     = A36
)

# Height/weight scatter plot on the renamed data: points are coloured and
# shaped by gender, with one linear fit across all rows.
cgss2021 %>%
  ggplot(aes(height, weight)) +
  geom_point(aes(colour = gender, shape = gender)) +
  geom_smooth(method = "lm") +
  ggtitle(label = "身高与体重", subtitle = " CGSS2021") +
  xlab("身高") +
  ylab("体重")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 179 rows containing non-finite values (`stat_smooth()`).
## Removed 179 rows containing missing values (`geom_point()`).

# Derive age as of 2021 from the birth column.
cgss2021 <- mutate(cgss2021, age = 2021 - birth)
# One-time setup, left disabled:
# install.packages("gtsummary")

# Descriptive summary table of every column in cgss2021.
cgss2021 %>%
  tbl_summary()
Characteristic N = 8,148¹
stype
    农村 5,931 (73%)
    乡镇 533 (6.5%)
    县城 535 (6.6%)
    城郊 76 (0.9%)
    城市市区 1,019 (13%)
    境外 2 (<0.1%)
    其他(请注明) 9 (0.1%)
    不知道 41 (0.5%)
    拒绝回答 2 (<0.1%)
gender
    男 3,679 (45%)
    女 4,469 (55%)
birth 1,968 (1,955, 1,984)
height 163 (158, 170)
    Unknown 144
weight 120 (108, 140)
    Unknown 66
edu
    没有受过任何教育 858 (11%)
    私塾、扫盲班 47 (0.6%)
    小学 1,751 (22%)
    初中 2,311 (28%)
    职业高中 127 (1.6%)
    普通高中 960 (12%)
    中专 374 (4.6%)
    技校 28 (0.3%)
    大学专科(成人高等教育) 261 (3.2%)
    大学专科(正规高等教育) 361 (4.4%)
    大学本科(成人高等教育) 194 (2.4%)
    大学本科(正规高等教育) 738 (9.1%)
    研究生及以上 117 (1.4%)
    其他(请注明) 0 (0%)
    Unknown 21
poli
    群众 6,557 (80%)
    共青团员 606 (7.4%)
    民主党派 6 (<0.1%)
    共产党员 966 (12%)
    不知道 12 (0.1%)
    拒绝回答 1 (<0.1%)
income 30,000 (3,000, 70,000)
famincome 80,000 (30,000, 300,000)
health
    很不健康 429 (5.3%)
    比较不健康 1,069 (13%)
    一般 2,288 (28%)
    比较健康 2,864 (35%)
    很健康 1,492 (18%)
    不知道 6 (<0.1%)
    拒绝回答 0 (0%)
happy
    非常不幸福 75 (1.4%)
    比较不幸福 243 (4.5%)
    说不上幸福不幸福 698 (13%)
    比较幸福 3,113 (57%)
    非常幸福 1,319 (24%)
    不知道 7 (0.1%)
    拒绝回答 3 (<0.1%)
    Unknown 2,690
age 53 (37, 66)
¹ n (%); Median (IQR)
# Restrict the summary to gender, age and happiness.
trial2 <- select(cgss2021, gender, age, happy)
tbl_summary(trial2)
Characteristic N = 8,148¹
gender
    男 3,679 (45%)
    女 4,469 (55%)
age 53 (37, 66)
happy
    非常不幸福 75 (1.4%)
    比较不幸福 243 (4.5%)
    说不上幸福不幸福 698 (13%)
    比较幸福 3,113 (57%)
    非常幸福 1,319 (24%)
    不知道 7 (0.1%)
    拒绝回答 3 (<0.1%)
    Unknown 2,690
¹ n (%); Median (IQR)
# Same summary, split into one column per gender.
tbl_summary(trial2, by = gender)
Characteristic 男, N = 3,679¹ 女, N = 4,469¹
age 54 (38, 67) 52 (37, 65)
happy
    非常不幸福 29 (1.2%) 46 (1.5%)
    比较不幸福 103 (4.1%) 140 (4.7%)
    说不上幸福不幸福 349 (14%) 349 (12%)
    比较幸福 1,369 (55%) 1,744 (59%)
    非常幸福 636 (26%) 683 (23%)
    不知道 2 (<0.1%) 5 (0.2%)
    拒绝回答 0 (0%) 3 (0.1%)
    Unknown 1,191 1,499
¹ Median (IQR); n (%)