# Load the survey CSV; the first line of the file supplies the column names.
data <- read.csv(file = "khao_sat_ho_gia_dinh.csv", header = TRUE)
# Number of rows and columns that were read.
dim(data)
## [1] 100 8
# Column names of the loaded data frame.
colnames(data)
## [1] "ho.id" "khu.vuc" "gioi.tinh" "tuoi" "hoc.van" "thu.nhap"
## [7] "chi.tieu" "tiet.kiem"
# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns.
summary(data)
## ho.id khu.vuc gioi.tinh tuoi
## Min. : 1.00 Length:100 Length:100 Min. :30
## 1st Qu.: 25.75 Class :character Class :character 1st Qu.:36
## Median : 50.50 Mode :character Mode :character Median :42
## Mean : 50.50 Mean :42
## 3rd Qu.: 75.25 3rd Qu.:48
## Max. :100.00 Max. :54
## hoc.van thu.nhap chi.tieu tiet.kiem
## Length:100 Min. : 7150000 Min. : 5100000 Min. :2050000
## Class :character 1st Qu.:10862500 1st Qu.: 7575000 1st Qu.:3287500
## Mode :character Median :14575000 Median :10050000 Median :4525000
## Mean :14575000 Mean :10050000 Mean :4525000
## 3rd Qu.:18287500 3rd Qu.:12525000 3rd Qu.:5762500
## Max. :22000000 Max. :15000000 Max. :7000000
# Mean of the thu.nhap column, ignoring missing values.
mean(data[["thu.nhap"]], na.rm = TRUE)
## [1] 14575000
# Mean of the chi.tieu column, ignoring missing values.
mean(data[["chi.tieu"]], na.rm = TRUE)
## [1] 10050000
# Mean of the tiet.kiem column, ignoring missing values.
mean(data[["tiet.kiem"]], na.rm = TRUE)
## [1] 4525000
# Compact structure listing: the type and first few values of every column.
str(data)
## 'data.frame': 100 obs. of 8 variables:
## $ ho.id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ khu.vuc : chr "Nong-thon" "Thanh-thi" "Nong-thon" "Thanh-thi" ...
## $ gioi.tinh: chr "Nu" "Nam" "Nu" "Nam" ...
## $ tuoi : int 31 32 33 34 35 36 37 38 39 40 ...
## $ hoc.van : chr "THCS" "THCS" "THPT" "Dai-hoc" ...
## $ thu.nhap : int 7150000 7300000 7450000 7600000 7750000 7900000 8050000 8200000 8350000 8500000 ...
## $ chi.tieu : int 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 6000000 ...
## $ tiet.kiem: int 2050000 2100000 2150000 2200000 2250000 2300000 2350000 2400000 2450000 2500000 ...
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive-statistics table for thu.nhap, chi.tieu and tiet.kiem,
# stratified by khu.vuc (the term after `|` in the formula).
table1(~ thu.nhap + chi.tieu + tiet.kiem | khu.vuc, data = data)
| | Nong-thon (N=50) | Thanh-thi (N=50) | Overall (N=100) |
|---|---|---|---|
| thu.nhap | |||
| Mean (SD) | 14500000 (4370000) | 14700000 (4370000) | 14600000 (4350000) |
| Median [Min, Max] | 14500000 [7150000, 21900000] | 14700000 [7300000, 22000000] | 14600000 [7150000, 22000000] |
| chi.tieu | |||
| Mean (SD) | 10000000 (2920000) | 10100000 (2920000) | 10100000 (2900000) |
| Median [Min, Max] | 10000000 [5100000, 14900000] | 10100000 [5200000, 15000000] | 10100000 [5100000, 15000000] |
| tiet.kiem | |||
| Mean (SD) | 4500000 (1460000) | 4550000 (1460000) | 4530000 (1450000) |
| Median [Min, Max] | 4500000 [2050000, 6950000] | 4550000 [2100000, 7000000] | 4530000 [2050000, 7000000] |
# Descriptive-statistics table for thu.nhap, chi.tieu and tiet.kiem,
# stratified by hoc.van (the term after `|` in the formula).
table1(~ thu.nhap + chi.tieu + tiet.kiem | hoc.van, data = data)
| | Dai-hoc (N=25) | THCS (N=50) | THPT (N=25) | Overall (N=100) |
|---|---|---|---|---|
| thu.nhap | ||||
| Mean (SD) | 14800000 (4420000) | 14500000 (4370000) | 14500000 (4410000) | 14600000 (4350000) |
| Median [Min, Max] | 14800000 [7600000, 22000000] | 14400000 [7150000, 21700000] | 14700000 [7450000, 21900000] | 14600000 [7150000, 22000000] |
| chi.tieu | ||||
| Mean (SD) | 10200000 (2940000) | 10000000 (2920000) | 10000000 (2940000) | 10100000 (2900000) |
| Median [Min, Max] | 10200000 [5400000, 15000000] | 9950000 [5100000, 14800000] | 10100000 [5300000, 14900000] | 10100000 [5100000, 15000000] |
| tiet.kiem | ||||
| Mean (SD) | 4600000 (1470000) | 4500000 (1460000) | 4500000 (1470000) | 4530000 (1450000) |
| Median [Min, Max] | 4600000 [2200000, 7000000] | 4480000 [2050000, 6900000] | 4550000 [2150000, 6950000] | 4530000 [2050000, 7000000] |
library(ggplot2)
# Histogram of thu.nhap (30 bins) with a dashed red vertical line marking the
# mean of the column.
ggplot(data, aes(x = thu.nhap)) +
  geom_histogram(bins = 30, fill = "#2E86C1", color = "white") +
  # The mean is a single constant, so it is supplied as a fixed parameter
  # rather than mapped through aes(), which would evaluate it against every
  # row of the data. `linewidth` replaces `size`, which ggplot2 3.4.0
  # deprecated for line geoms.
  geom_vline(
    xintercept = mean(data$thu.nhap, na.rm = TRUE),
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  labs(
    title = "Phan bo thu nhap",
    subtitle = "Duong đo: gia tri trung binh",
    x = "Thu nhap",
    y = "Tan suat"
  ) +
  theme_minimal(base_size = 14)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Box plot of chi.tieu for each khu.vuc level; the group mean is overlaid as
# a yellow-filled diamond (shape 23). The fill legend is hidden because it
# would only repeat the x axis.
ggplot(data = data, mapping = aes(khu.vuc, chi.tieu, fill = khu.vuc)) +
  geom_boxplot(alpha = 0.7) +
  stat_summary(
    geom = "point", fun = mean,
    fill = "yellow", shape = 23, size = 3
  ) +
  labs(
    x = "Khu vuc",
    y = "Chi tiêu",
    title = "Chi tieu theo khu vuc",
    subtitle = "Cham vang: gia tri trung binh"
  ) +
  # The complete theme must come first; the legend tweak is applied on top.
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")
# Scatter plot of chi.tieu against thu.nhap with a fitted linear-regression
# line (red) and its confidence band.
ggplot(data = data, mapping = aes(thu.nhap, chi.tieu)) +
  geom_point(size = 2, alpha = 0.6, color = "#28B463") +
  geom_smooth(se = TRUE, method = "lm", color = "red") +
  labs(
    x = "Thu nhap",
    y = "Chi tieu",
    title = "Moi quan he giua thu nhap va chi tieu",
    subtitle = "Duong do: hoi quy tuyen tinh"
  ) +
  theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'