# Gọi các thư viện xử lý và trực quan hóa dữ liệu
library(lessR)
## Warning: package 'lessR' was built under R version 4.5.3
##
## lessR 4.5.2 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is the default data frame, data= in analysis routines optional
##
## Find examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
## Enter: browseVignettes("lessR")
##
## Although most previous function calls still work, most
## visualization functions are now reorganized to three functions:
## Chart(): type = "bar", "pie", "radar", "bubble", "dot",
## "sunburst", "treemap", "icicle"
## X(): type="histogram", "density", "vbs", and more
## XY(): type="scatter" for a scatterplot, or "contour", "smooth"
## There is also Flows() for Sankey flow diagrams.
##
## View lessR updates, now including modern time series forecasting.
## Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
##
## Interactive data analysis for constructing visualizations.
## Enter: interact()
library(table1)
## Warning: package 'table1' was built under R version 4.5.3
##
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
##
## label
## The following objects are masked from 'package:base':
##
## units, units<-
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
# Load the household survey data from the CSV file.
dulieu_hh <- read.csv(file = "C:/DataR/khao_sat_ho_gia_dinh.csv")
# Dimensions as c(rows, columns); run View(dulieu_hh) to browse the data.
dim(dulieu_hh)
## [1] 100 8
# Print the number of observations and the number of variables.
cat(
  "So quan sat:", NROW(dulieu_hh),
  "\nSo bien", NCOL(dulieu_hh),
  "\n"
)
## So quan sat: 100
## So bien 8
# Column names of the data frame.
colnames(dulieu_hh)
## [1] "ho_id" "khu_vuc" "gioi_tinh" "tuoi" "hoc_van" "thu_nhap"
## [7] "chi_tieu" "tiet_kiem"
# Compact overview of each column's type and first values.
str(object = dulieu_hh)
## 'data.frame': 100 obs. of 8 variables:
## $ ho_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ khu_vuc : chr "Nong_thon" "Thanh_thi" "Nong_thon" "Thanh_thi" ...
## $ gioi_tinh: chr "Nu" "Nam" "Nu" "Nam" ...
## $ tuoi : int 31 32 33 34 35 36 37 38 39 40 ...
## $ hoc_van : chr "THCS" "THCS" "THPT" "Dai_hoc" ...
## $ thu_nhap : int 7150000 7300000 7450000 7600000 7750000 7900000 8050000 8200000 8350000 8500000 ...
## $ chi_tieu : int 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 6000000 ...
## $ tiet_kiem: int 2050000 2100000 2150000 2200000 2250000 2300000 2350000 2400000 2450000 2500000 ...
# Average income, spending and savings over all rows of the survey data.
mean_thu_nhap <- mean(dulieu_hh[["thu_nhap"]])
mean_chi_tieu <- mean(dulieu_hh[["chi_tieu"]])
mean_tiet_kiem <- mean(dulieu_hh[["tiet_kiem"]])
# Report each average on its own labelled line.
print(paste("Thu nhập trung bình:", mean_thu_nhap, sep = " "))
## [1] "Thu nhập trung bình: 14575000"
print(paste("Chi tiêu trung bình:", mean_chi_tieu, sep = " "))
## [1] "Chi tiêu trung bình: 10050000"
print(paste("Tiết kiệm trung bình:", mean_tiet_kiem, sep = " "))
## [1] "Tiết kiệm trung bình: 4525000"
# Min, quartiles, median, mean and max for the same three columns.
summary(dulieu_hh[, c("thu_nhap", "chi_tieu", "tiet_kiem")])
## thu_nhap chi_tieu tiet_kiem
## Min. : 7150000 Min. : 5100000 Min. :2050000
## 1st Qu.:10862500 1st Qu.: 7575000 1st Qu.:3287500
## Median :14575000 Median :10050000 Median :4525000
## Mean :14575000 Mean :10050000 Mean :4525000
## 3rd Qu.:18287500 3rd Qu.:12525000 3rd Qu.:5762500
## Max. :22000000 Max. :15000000 Max. :7000000
# Activity 2: descriptive tables for income, spending and savings.
# 1. Compare income, spending and savings across areas (khu_vuc).
table1(
  ~ thu_nhap + chi_tieu + tiet_kiem | khu_vuc,
  data = dulieu_hh
)
| | Nong_thon (N=50) | Thanh_thi (N=50) | Overall (N=100) |
|---|---|---|---|
| thu_nhap | |||
| Mean (SD) | 14500000 (4370000) | 14700000 (4370000) | 14600000 (4350000) |
| Median [Min, Max] | 14500000 [7150000, 21900000] | 14700000 [7300000, 22000000] | 14600000 [7150000, 22000000] |
| chi_tieu | |||
| Mean (SD) | 10000000 (2920000) | 10100000 (2920000) | 10100000 (2900000) |
| Median [Min, Max] | 10000000 [5100000, 14900000] | 10100000 [5200000, 15000000] | 10100000 [5100000, 15000000] |
| tiet_kiem | |||
| Mean (SD) | 4500000 (1460000) | 4550000 (1460000) | 4530000 (1450000) |
| Median [Min, Max] | 4500000 [2050000, 6950000] | 4550000 [2100000, 7000000] | 4530000 [2050000, 7000000] |
# Compare income, spending and savings across education levels (hoc_van).
table1(
  ~ thu_nhap + chi_tieu + tiet_kiem | hoc_van,
  data = dulieu_hh
)
| | Dai_hoc (N=25) | THCS (N=50) | THPT (N=25) | Overall (N=100) |
|---|---|---|---|---|
| thu_nhap | ||||
| Mean (SD) | 14800000 (4420000) | 14500000 (4370000) | 14500000 (4410000) | 14600000 (4350000) |
| Median [Min, Max] | 14800000 [7600000, 22000000] | 14400000 [7150000, 21700000] | 14700000 [7450000, 21900000] | 14600000 [7150000, 22000000] |
| chi_tieu | ||||
| Mean (SD) | 10200000 (2940000) | 10000000 (2920000) | 10000000 (2940000) | 10100000 (2900000) |
| Median [Min, Max] | 10200000 [5400000, 15000000] | 9950000 [5100000, 14800000] | 10100000 [5300000, 14900000] | 10100000 [5100000, 15000000] |
| tiet_kiem | ||||
| Mean (SD) | 4600000 (1470000) | 4500000 (1460000) | 4500000 (1470000) | 4530000 (1450000) |
| Median [Min, Max] | 4600000 [2200000, 7000000] | 4480000 [2050000, 6900000] | 4550000 [2150000, 6950000] | 4530000 [2050000, 7000000] |
# 1. Histogram of household income (15 bins).
# Income values are in the tens of millions, which ggplot2's default labels
# print in scientific notation (e.g. 1.0e+07). The x axis is therefore
# formatted as plain numbers with thousands separators.
ggplot(dulieu_hh, aes(x = thu_nhap)) +
  geom_histogram(fill = "#4e79a7", color = "white", bins = 15) +
  scale_x_continuous(
    labels = function(x) {
      format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
    }
  ) +
  theme_minimal() +
  labs(
    title = "Hình 1: Phân bố thu nhập của các hộ gia đình",
    x = "Mức thu nhập (VNĐ)",
    y = "Số lượng hộ"
  )
# 2. Box plot of spending by area.
# Spending values are in the millions, which ggplot2's default labels print
# in scientific notation (e.g. 1.0e+07). The y axis is therefore formatted as
# plain numbers with thousands separators.
ggplot(dulieu_hh, aes(x = khu_vuc, y = chi_tieu, fill = khu_vuc)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(
    values = c("Nong_thon" = "#f28e2b", "Thanh_thi" = "#59a14f")
  ) +
  scale_y_continuous(
    labels = function(x) {
      format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
    }
  ) +
  theme_minimal() +
  labs(
    title = "Hình 2: So sánh biến động chi tiêu theo khu vực",
    x = "Khu vực",
    y = "Chi tiêu (VNĐ)"
  ) +
  # The fill colour duplicates the x axis, so the legend is hidden.
  theme(legend.position = "none")
# 3. Scatter plot of spending against income with a linear trend line.
# Both variables are in the millions, which ggplot2's default labels print in
# scientific notation (e.g. 1.0e+07). Both axes are therefore formatted as
# plain numbers with thousands separators.
ggplot(dulieu_hh, aes(x = thu_nhap, y = chi_tieu)) +
  geom_point(color = "#e15759", size = 2, alpha = 0.6) +
  # Dashed black line: linear-model fit with its confidence band (se = TRUE).
  geom_smooth(method = "lm", color = "black", linetype = "dashed", se = TRUE) +
  scale_x_continuous(
    labels = function(x) {
      format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
    }
  ) +
  scale_y_continuous(
    labels = function(x) {
      format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
    }
  ) +
  theme_minimal() +
  labs(
    title = "Hình 3: Mối quan hệ giữa Thu nhập và Chi tiêu",
    subtitle = "Đường nét đứt biểu diễn xu hướng hồi quy tuyến tính",
    x = "Thu nhập (VNĐ)",
    y = "Chi tiêu (VNĐ)"
  )
## `geom_smooth()` using formula = 'y ~ x'