# Load the CHNS dataset and inspect its structure.
# The path lives in one variable so the existence check and the read can never
# point at different files.
data_path <- "/Users/giangnguyen/Desktop/GiangNT/2025/Course_R/Datasets/CHNS data full.csv"
file.exists(data_path)
## [1] TRUE
df <- read.csv(data_path)
# Print each column's name, type and first few values.
str(df)
## 'data.frame': 9317 obs. of 29 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ whours : num 35 48 40 48 32 40 40 40 40 40 ...
## $ wgroup : int 1 3 1 3 2 1 1 1 1 1 ...
## $ dead : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fu.time : int 4 4 4 4 4 4 4 4 4 4 ...
## $ gender : int 2 1 2 2 1 1 2 1 2 1 ...
## $ age : int 52 36 31 51 58 42 43 53 53 52 ...
## $ edu : int 3 3 3 2 2 3 3 3 3 3 ...
## $ marital : int 3 2 2 2 2 2 2 2 2 2 ...
## $ residence: int 1 1 1 1 1 1 1 1 1 1 ...
## $ income : num 116000 25200 27000 27600 34800 77000 63000 92000 70000 48000 ...
## $ occu : int 1 1 1 2 2 1 1 1 1 1 ...
## $ smoking : int 0 1 0 0 0 0 0 0 0 1 ...
## $ drinking : int 0 0 1 0 0 1 0 1 1 1 ...
## $ height : num 168 173 167 164 175 179 164 166 158 172 ...
## $ weight : num 83.5 85 50 80 65 75 55 68 57 70 ...
## $ bmi : num 29.6 28.4 17.9 29.7 21.2 ...
## $ sys1 : int 120 120 110 120 120 110 120 120 110 120 ...
## $ sys2 : int 126 120 108 110 120 112 124 116 110 120 ...
## $ sys3 : int 120 120 110 120 120 110 120 120 110 120 ...
## $ dias1 : int 80 90 70 80 80 72 70 80 76 80 ...
## $ dias2 : int 82 80 70 82 82 76 70 78 72 80 ...
## $ dias3 : int 76 80 70 80 80 70 68 78 74 80 ...
## $ tsf1 : int 28 25 18 27 23 24 20 20 23 25 ...
## $ tsf2 : int 27 44 17 26 22 23 21 19 24 25 ...
## $ tsf3 : int 28 25 18 27 22 24 20 20 23 25 ...
## $ uac : num 36 35 25 32 35 28 23 43 32 32 ...
## $ hc : num 111 102 96 104 102 96 92 100 93 98 ...
## $ wc : num 103 95 72 97 90 90 72 90 77 90 ...
# Number of rows and columns in df.
dim(df)
## [1] 9317 29
# Preview the first rows of df (head() shows six rows by default).
head(df)
## id whours wgroup dead fu.time gender age edu marital residence income occu
## 1 1 35 1 0 4 2 52 3 3 1 116000 1
## 2 2 48 3 0 4 1 36 3 2 1 25200 1
## 3 3 40 1 0 4 2 31 3 2 1 27000 1
## 4 4 48 3 0 4 2 51 2 2 1 27600 2
## 5 5 32 2 0 4 1 58 2 2 1 34800 2
## 6 6 40 1 0 4 1 42 3 2 1 77000 1
## smoking drinking height weight bmi sys1 sys2 sys3 dias1 dias2 dias3 tsf1
## 1 0 0 168 83.5 29.58 120 126 120 80 82 76 28
## 2 1 0 173 85.0 28.40 120 120 120 90 80 80 25
## 3 0 1 167 50.0 17.93 110 108 110 70 70 70 18
## 4 0 0 164 80.0 29.74 120 110 120 80 82 80 27
## 5 0 0 175 65.0 21.22 120 120 120 80 82 80 23
## 6 0 1 179 75.0 23.41 110 112 110 72 76 70 24
## tsf2 tsf3 uac hc wc
## 1 27 28 36 111 103
## 2 44 25 35 102 95
## 3 17 18 25 96 72
## 4 26 27 32 104 97
## 5 22 22 35 102 90
## 6 23 24 28 96 90
# Preview the last rows of df (tail() shows six rows by default).
tail(df)
## id whours wgroup dead fu.time gender age edu marital residence income
## 9312 9312 20 2 0 4 1 62 1 2 2 14568.9
## 9313 9313 20 2 0 4 2 58 1 2 2 23986.1
## 9314 9314 35 1 0 4 1 43 1 2 2 14995.0
## 9315 9315 35 1 0 4 1 48 2 2 2 15460.8
## 9316 9316 35 1 0 4 2 48 1 2 2 16444.2
## 9317 9317 42 3 0 4 1 52 1 2 2 14170.0
## occu smoking drinking height weight bmi sys1 sys2 sys3 dias1 dias2 dias3
## 9312 2 1 0 161.6 62.8 24.05 130 124 124 90 84 86
## 9313 2 0 0 149.6 62.2 27.79 130 128 128 90 92 92
## 9314 2 1 1 155.9 47.4 19.50 138 136 136 90 88 90
## 9315 2 1 0 155.6 59.8 24.70 150 148 150 96 94 94
## 9316 2 0 0 150.4 54.2 23.96 108 108 106 64 66 62
## 9317 2 1 0 166.4 56.1 20.26 124 126 126 88 90 88
## tsf1 tsf2 tsf3 uac hc wc
## 9312 16 12 10 27.0 94.0 82.3
## 9313 19 14 16 32.0 93.0 81.0
## 9314 16 14 11 26.0 84.2 70.5
## 9315 11 10 14 29.0 90.5 75.2
## 9316 16 14 12 28.2 86.8 76.9
## 9317 10 9 10 25.0 88.0 72.5
# Per-column summary: minimum, quartiles, mean, maximum and NA count.
# NOTE(review): the printed output shows extremes such as whours = 168, age = 106,
# weight = 5 and bmi = 138.57 — these look like data-entry errors; confirm before
# using those columns.
summary(df)
## id whours wgroup dead
## Min. : 1 Min. : 20.00 Min. :1.000 Min. :0.0000
## 1st Qu.:2330 1st Qu.: 40.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :4659 Median : 48.00 Median :3.000 Median :0.0000
## Mean :4659 Mean : 47.26 Mean :2.483 Mean :0.0381
## 3rd Qu.:6988 3rd Qu.: 56.00 3rd Qu.:4.000 3rd Qu.:0.0000
## Max. :9317 Max. :168.00 Max. :4.000 Max. :1.0000
##
## fu.time gender age edu
## Min. : 0.00 Min. :1.00 Min. : 19.00 Min. :1.000
## 1st Qu.: 4.00 1st Qu.:1.00 1st Qu.: 42.00 1st Qu.:1.000
## Median :11.00 Median :1.00 Median : 49.00 Median :2.000
## Mean :11.58 Mean :1.46 Mean : 51.21 Mean :1.871
## 3rd Qu.:18.00 3rd Qu.:2.00 3rd Qu.: 60.00 3rd Qu.:2.000
## Max. :26.00 Max. :2.00 Max. :106.00 Max. :3.000
## NA's :2
## marital residence income occu
## Min. :1.000 Min. :1.000 Min. : 0 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 1525 1st Qu.:1.000
## Median :2.000 Median :2.000 Median : 5773 Median :2.000
## Mean :1.882 Mean :1.546 Mean : 15829 Mean :1.673
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.: 19600 3rd Qu.:2.000
## Max. :3.000 Max. :2.000 Max. :2400000 Max. :2.000
## NA's :224
## smoking drinking height weight
## Min. :0.0000 Min. :0.0000 Min. : 63.0 Min. : 5.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:157.5 1st Qu.: 53.00
## Median :0.0000 Median :0.0000 Median :163.0 Median : 60.00
## Mean :0.3505 Mean :0.4198 Mean :163.2 Mean : 61.26
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:169.5 3rd Qu.: 68.00
## Max. :1.0000 Max. :1.0000 Max. :193.6 Max. :171.40
## NA's :651 NA's :681
## bmi sys1 sys2 sys3
## Min. : 1.43 Min. : 70.0 Min. : 70.0 Min. : 70.0
## 1st Qu.: 20.45 1st Qu.:110.0 1st Qu.:108.0 1st Qu.:108.0
## Median : 22.44 Median :120.0 Median :118.0 Median :118.0
## Mean : 22.92 Mean :118.1 Mean :117.8 Mean :117.7
## 3rd Qu.: 24.90 3rd Qu.:126.0 3rd Qu.:126.0 3rd Qu.:126.0
## Max. :138.57 Max. :220.0 Max. :224.0 Max. :220.0
## NA's :687 NA's :651 NA's :732 NA's :876
## dias1 dias2 dias3 tsf1
## Min. : 18.00 Min. : 30.00 Min. : 30.00 Min. : 1.00
## 1st Qu.: 70.00 1st Qu.: 70.00 1st Qu.: 70.00 1st Qu.: 9.00
## Median : 78.00 Median : 78.00 Median : 78.00 Median :14.00
## Mean : 77.26 Mean : 77.09 Mean : 76.97 Mean :15.38
## 3rd Qu.: 82.00 3rd Qu.: 82.00 3rd Qu.: 82.00 3rd Qu.:20.00
## Max. :140.00 Max. :145.00 Max. :140.00 Max. :91.00
## NA's :652 NA's :732 NA's :877 NA's :814
## tsf2 tsf3 uac hc
## Min. : 0.00 Min. : 0.00 Min. : 2.00 Min. : 10.50
## 1st Qu.:10.00 1st Qu.:10.00 1st Qu.:24.00 1st Qu.: 89.00
## Median :15.00 Median :15.00 Median :26.40 Median : 94.00
## Mean :16.61 Mean :16.62 Mean :26.69 Mean : 93.99
## 3rd Qu.:22.00 3rd Qu.:22.00 3rd Qu.:29.00 3rd Qu.: 99.00
## Max. :80.00 Max. :80.00 Max. :95.50 Max. :196.00
## NA's :2795 NA's :2817 NA's :751 NA's :2758
## wc
## Min. : 7.0
## 1st Qu.: 73.0
## Median : 80.0
## Mean : 80.8
## 3rd Qu.: 88.0
## Max. :130.0
## NA's :2735
# Raw income distribution: a default histogram next to a styled one.
# NOTE(review): ggplot2 and gridExtra must already be attached; no library()
# call is visible in this part of the file.
# Keep only the rows that have a recorded income.
df_clean <- subset(df, !is.na(income))
p <- ggplot(df_clean, mapping = aes(x = income))
# Left panel: ggplot2 defaults.
p1 <- p + geom_histogram()
# Right panel: blue bars with white outlines plus axis labels and a title.
p2 <- p +
  geom_histogram(fill = "blue", colour = "white") +
  labs(x = "Thu nhập", y = "Số người") +
  ggtitle("Phân bố thu nhập")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Income on a log scale. Rows with missing or non-positive income are dropped
# first, since log() of those values is not finite.
df_clean <- df[which(!is.na(df$income) & df$income > 0), ]
p <- ggplot(df_clean, mapping = aes(x = log(income)))
p1 <- p +
  geom_histogram(fill = "blue", colour = "white") +
  labs(x = "Thu nhập (logarithm scale)", y = "Số người") +
  ggtitle("Phân bố thu nhập")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Log-income distribution by gender: histogram first, then overlaid densities.
# Rows need a recorded gender and a strictly positive income.
df_clean <- subset(df, !is.na(income) & !is.na(gender) & income > 0)
p <- ggplot(df_clean, mapping = aes(x = log(income), fill = factor(gender)))
p1 <- p +
  geom_histogram(colour = "white") +
  labs(x = "Thu nhập (logarithm scale)", y = "Số người") +
  ggtitle("Phân bố thu nhập")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Semi-transparent fills keep both gender curves visible where they overlap.
p2 <- p +
  geom_density(alpha = 0.5) +
  labs(x = "Thu nhập (logarithm scale)", y = "Tỉ lệ") +
  ggtitle("Phân bố thu nhập")
p2
# Histogram and density of log income by gender, side by side.
# Built from df_clean (recorded, strictly positive income) rather than df:
# log(income) is NA or -Inf for the other rows, and ggplot2 only discards them
# with a "Removed ... rows" warning per layer.
p <- ggplot(data = df_clean, aes(x = log(income), fill = factor(gender)))
p1 <- p +
  geom_histogram(col = "white") +
  labs(x = "Thu nhập (logarithm scale)", y = "Số người", title = "Phân bố thu nhập")
p2 <- p +
  geom_density(alpha = 0.5) +
  labs(x = "Thu nhập (logarithm scale)", y = "Tỉ lệ", title = "Phân bố thu nhập")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Median income for each education level.
# NOTE(review): df_clean is rebuilt here, but the medians are computed from df,
# so zero incomes still contribute — confirm whether temp should be derived
# from df_clean instead.
df_clean <- filter(df, !is.na(income), !is.na(edu), income > 0)
temp <- summarise(group_by(df, edu), income = median(income, na.rm = TRUE))
temp
## # A tibble: 3 × 2
## edu income
## <int> <dbl>
## 1 1 1750.
## 2 2 6420
## 3 3 31200
# Bar chart of median income by education level.
# The education codes are relabelled on temp as well as on df: the chart is
# drawn from temp, so relabelling df alone leaves the axis and fill on the raw
# codes 1/2/3.
edu_labels <- c("Primary", "Secondary", "Tertiary")
df$edu <- as.character(factor(df$edu, levels = c(1, 2, 3), labels = edu_labels))
# Kept as a factor so the bars follow the level order, not alphabetical order.
temp$edu <- factor(temp$edu, levels = c(1, 2, 3), labels = edu_labels)
p <- ggplot(data = temp, aes(x = edu, y = income, fill = edu))
# The legend would only repeat the x axis, so it is hidden.
p1 <- p +
  geom_bar(stat = "identity") +
  labs(x = "Education", y = "Income") +
  theme(legend.position = "none")
p1
# Same bar chart with the median value printed above each bar.
# One bar layer is enough; a second identical layer after geom_text() would
# only paint over the labels. vjust is a fixed setting, not a data mapping, so
# it is passed outside aes().
p2 <- p +
  geom_bar(stat = "identity") +
  geom_text(aes(label = income), vjust = -0.5) +
  labs(x = "Education", y = "Income") +
  theme(legend.position = "none")
p2
# Plain and labelled education bar charts, side by side.
p1 <- p +
  geom_bar(stat = "identity") +
  labs(x = "Education", y = "Income") +
  theme(legend.position = "none")
# One bar layer only (a repeated geom_bar() after the text would draw over the
# labels); vjust is a constant, so it is set outside aes().
p2 <- p +
  geom_bar(stat = "identity") +
  geom_text(aes(label = income), vjust = -0.5) +
  labs(x = "Education", y = "Income") +
  theme(legend.position = "none")
grid.arrange(p1, p2, ncol = 2)
# Box plots of log income by gender.
# Gender is relabelled on df_clean, so the plot is built from df_clean as well.
# Plotting df instead shows the raw codes 1/2 and makes ggplot2 drop the rows
# with missing or zero income, with a warning each time.
df_clean$gender <- as.character(
  factor(df_clean$gender, levels = c(1, 2), labels = c("Men", "Women"))
)
p <- ggplot(data = df_clean, aes(x = factor(gender), y = log(income), col = factor(gender)))
p + geom_boxplot() + labs(x = "Giới tính", y = "Thu nhập (logarithm scale)") + ggtitle("Thu nhập theo giới tính")
# Same plot with the individual observations overlaid as faint jittered points.
p + geom_boxplot() + geom_jitter(alpha = 0.05) + labs(x = "Giới tính", y = "Thu nhập (logarithm scale)") + ggtitle("Thu nhập theo giới tính")
# Relabel gender on df itself, then show the plain and the jittered box plot
# side by side.
df$gender <- as.character(
  factor(df$gender, levels = c(1, 2), labels = c("Men", "Women"))
)
p <- ggplot(
  df,
  mapping = aes(x = factor(gender), y = log(income), colour = factor(gender))
)
p1 <- p +
  geom_boxplot() +
  labs(
    x = "Giới tính",
    y = "Thu nhập (logarithm scale)",
    title = "Thu nhập theo giới tính"
  )
# The second panel is the first one plus a faint jittered point layer.
p2 <- p1 + geom_jitter(alpha = 0.05)
grid.arrange(p1, p2, ncol = 2)
## Warning: Removed 241 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Removed 241 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 224 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Scatter plot of log income against age, drawn from the full data frame.
p <- ggplot(df, mapping = aes(x = age, y = log(income)))
p +
  geom_point() +
  labs(
    x = "Tuổi",
    y = "Thu nhập (logarithm scale)",
    title = "Thu nhập theo tuổi"
  )
## Warning: Removed 226 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Same scatter plot with the points coloured by gender.
p <- ggplot(
  df,
  mapping = aes(x = age, y = log(income), colour = factor(gender))
)
p1 <- p +
  geom_point() +
  labs(
    x = "Tuổi",
    y = "Thu nhập (logarithm scale)",
    title = "Mối liên quan giữa thu nhập và tuổi theo giới tính"
  )
p1
## Warning: Removed 226 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Add a smoothed trend line to the scatter plot. No method is given, so ggplot2
# picks one itself (the printed message reports 'gam').
p2 = p1 + geom_smooth()
p2
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 243 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 226 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Add a trend line fitted by lm() with a cubic polynomial in age.
p3 <- p1 +
  geom_smooth(
    method = "lm",
    formula = y ~ x + I(x^2) + I(x^3)
  )
p3
## Warning: Removed 243 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 226 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Show the default smoother (p2) and the cubic lm fit (p3) side by side.
grid.arrange(p2, p3, ncol = 2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 243 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 226 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 243 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 226 rows containing missing values or values outside the scale range
## (`geom_point()`).