# Load the household survey file; the first row of the CSV supplies the
# column names.
data <- read.csv(file = "khao_sat_ho_gia_dinh.csv", header = TRUE)

# Table shape as (rows, columns).
dim(data)
## [1] 100   8

# Column names exactly as read from the header row.
colnames(data)
## [1] "ho.id"     "khu.vuc"   "gioi.tinh" "tuoi"      "hoc.van"   "thu.nhap" 
## [7] "chi.tieu"  "tiet.kiem"

# Per-column overview: quartiles and mean for numeric columns, length and
# class for character columns.
summary(object = data)
##      ho.id          khu.vuc           gioi.tinh              tuoi   
##  Min.   :  1.00   Length:100         Length:100         Min.   :30  
##  1st Qu.: 25.75   Class :character   Class :character   1st Qu.:36  
##  Median : 50.50   Mode  :character   Mode  :character   Median :42  
##  Mean   : 50.50                                         Mean   :42  
##  3rd Qu.: 75.25                                         3rd Qu.:48  
##  Max.   :100.00                                         Max.   :54  
##    hoc.van             thu.nhap           chi.tieu          tiet.kiem      
##  Length:100         Min.   : 7150000   Min.   : 5100000   Min.   :2050000  
##  Class :character   1st Qu.:10862500   1st Qu.: 7575000   1st Qu.:3287500  
##  Mode  :character   Median :14575000   Median :10050000   Median :4525000  
##                     Mean   :14575000   Mean   :10050000   Mean   :4525000  
##                     3rd Qu.:18287500   3rd Qu.:12525000   3rd Qu.:5762500  
##                     Max.   :22000000   Max.   :15000000   Max.   :7000000
# Column means for the three monetary variables; missing values, if any,
# are dropped before averaging.
mean(data[["thu.nhap"]], na.rm = TRUE)
## [1] 14575000
mean(data[["chi.tieu"]], na.rm = TRUE)
## [1] 10050000
mean(data[["tiet.kiem"]], na.rm = TRUE)
## [1] 4525000

# Compact listing of every column's storage type and first few values.
str(object = data)
## 'data.frame':    100 obs. of  8 variables:
##  $ ho.id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ khu.vuc  : chr  "Nong-thon" "Thanh-thi" "Nong-thon" "Thanh-thi" ...
##  $ gioi.tinh: chr  "Nu" "Nam" "Nu" "Nam" ...
##  $ tuoi     : int  31 32 33 34 35 36 37 38 39 40 ...
##  $ hoc.van  : chr  "THCS" "THCS" "THPT" "Dai-hoc" ...
##  $ thu.nhap : int  7150000 7300000 7450000 7600000 7750000 7900000 8050000 8200000 8350000 8500000 ...
##  $ chi.tieu : int  5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 6000000 ...
##  $ tiet.kiem: int  2050000 2100000 2150000 2200000 2250000 2300000 2350000 2400000 2450000 2500000 ...
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of the three monetary variables, one column per khu.vuc
# level (table1 appends the "Overall" column itself).
table1(
  ~ thu.nhap + chi.tieu + tiet.kiem | khu.vuc,
  data = data
)
Nong-thon
(N=50)
Thanh-thi
(N=50)
Overall
(N=100)
thu.nhap
Mean (SD) 14500000 (4370000) 14700000 (4370000) 14600000 (4350000)
Median [Min, Max] 14500000 [7150000, 21900000] 14700000 [7300000, 22000000] 14600000 [7150000, 22000000]
chi.tieu
Mean (SD) 10000000 (2920000) 10100000 (2920000) 10100000 (2900000)
Median [Min, Max] 10000000 [5100000, 14900000] 10100000 [5200000, 15000000] 10100000 [5100000, 15000000]
tiet.kiem
Mean (SD) 4500000 (1460000) 4550000 (1460000) 4530000 (1450000)
Median [Min, Max] 4500000 [2050000, 6950000] 4550000 [2100000, 7000000] 4530000 [2050000, 7000000]
# Descriptive table of the three monetary variables, one column per hoc.van
# level (table1 appends the "Overall" column itself).
table1(
  ~ thu.nhap + chi.tieu + tiet.kiem | hoc.van,
  data = data
)
Dai-hoc
(N=25)
THCS
(N=50)
THPT
(N=25)
Overall
(N=100)
thu.nhap
Mean (SD) 14800000 (4420000) 14500000 (4370000) 14500000 (4410000) 14600000 (4350000)
Median [Min, Max] 14800000 [7600000, 22000000] 14400000 [7150000, 21700000] 14700000 [7450000, 21900000] 14600000 [7150000, 22000000]
chi.tieu
Mean (SD) 10200000 (2940000) 10000000 (2920000) 10000000 (2940000) 10100000 (2900000)
Median [Min, Max] 10200000 [5400000, 15000000] 9950000 [5100000, 14800000] 10100000 [5300000, 14900000] 10100000 [5100000, 15000000]
tiet.kiem
Mean (SD) 4600000 (1470000) 4500000 (1460000) 4500000 (1470000) 4530000 (1450000)
Median [Min, Max] 4600000 [2200000, 7000000] 4480000 [2050000, 6900000] 4550000 [2150000, 6950000] 4530000 [2050000, 7000000]
library(ggplot2)
# Histogram of thu.nhap (30 bins) with a dashed red reference line at the
# sample mean.
ggplot(data, aes(x = thu.nhap)) +
  geom_histogram(bins = 30, fill = "#2E86C1", color = "white") +
  # The mean is passed as a fixed parameter rather than through aes(), so a
  # single line is drawn instead of one overlapping line per data row.
  # `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
  geom_vline(
    xintercept = mean(data$thu.nhap, na.rm = TRUE),
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  labs(
    title = "Phan bo thu nhap",
    subtitle = "Duong đo: gia tri trung binh",
    x = "Thu nhap",
    y = "Tan suat"
  ) +
  theme_minimal(base_size = 14)

# Box plot of chi.tieu for each khu.vuc level. The fill colour repeats the
# x-axis grouping, so the legend is switched off.
ggplot(
  data = data,
  mapping = aes(x = khu.vuc, y = chi.tieu, fill = khu.vuc)
) +
  geom_boxplot(alpha = 0.7) +
  # Overlay each group's mean as a yellow-filled diamond (shape 23).
  stat_summary(
    geom = "point", fun = mean,
    fill = "yellow", shape = 23, size = 3
  ) +
  theme_minimal(base_size = 14) +
  # Must follow theme_minimal(): a complete theme would reset the legend.
  theme(legend.position = "none") +
  labs(
    x = "Khu vuc",
    y = "Chi tiêu",
    title = "Chi tieu theo khu vuc",
    subtitle = "Cham vang: gia tri trung binh"
  )

# Scatter plot of chi.tieu against thu.nhap with a fitted straight line and
# its confidence band (se = TRUE).
ggplot(data, aes(x = thu.nhap, y = chi.tieu)) +
  geom_point(color = "#28B463", alpha = 0.6, size = 2) +
  # The model formula is spelled out so the fit is explicit and geom_smooth()
  # no longer prints a message about the default it picked.
  geom_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  labs(
    title = "Moi quan he giua thu nhap va chi tieu",
    subtitle = "Duong do: hoi quy tuyen tinh",
    x = "Thu nhap",
    y = "Chi tieu"
  ) +
  theme_minimal(base_size = 14)