1 Đọc dữ liệu

library(readr)

## Warning: package 'readr' was built under R version 4.4.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(psych)

## Warning: package 'psych' was built under R version 4.4.3

library(DT)

## Warning: package 'DT' was built under R version 4.4.3

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

# Load the supermarket transactions from a CSV file whose first row holds the
# column names, then show the whole table as an interactive DT widget.
# NOTE(review): the path is a hard-coded absolute Windows path; confirm it
# exists on the machine that knits this document.
data <- read.csv("D:/PTDLDT/Supermarket_Transactions.csv", header = TRUE)
datatable(data)

## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

# Print the column names of the loaded data frame.
colnames(data)

##  [1] "X"                 "PurchaseDate"      "CustomerID"       
##  [4] "Gender"            "MaritalStatus"     "Homeowner"        
##  [7] "Children"          "AnnualIncome"      "City"             
## [10] "StateorProvince"   "Country"           "ProductFamily"    
## [13] "ProductDepartment" "ProductCategory"   "UnitsSold"        
## [16] "Revenue"

2 Lọc dữ liệu định tính và kiểm tra lại

# Names of the qualitative (categorical) columns kept for the analysis.
categorical_columns <- c(
  "Gender", "MaritalStatus", "City", "Homeowner", "StateorProvince",
  "Country", "ProductFamily", "ProductDepartment", "ProductCategory"
)

Tạo bộ dữ liệu mới chỉ chứa các biến định tính

# Keep only the categorical columns. Selecting several columns by name with
# list-style indexing yields the same data frame as data[, columns].
df <- data[categorical_columns]

Xem trước dữ liệu

# Preview the first six rows of the categorical subset.
head(df, n = 6L)

##   Gender MaritalStatus          City Homeowner StateorProvince Country
## 1      F             S   Los Angeles         Y              CA     USA
## 2      M             M   Los Angeles         Y              CA     USA
## 3      F             M     Bremerton         N              WA     USA
## 4      M             M      Portland         Y              OR     USA
## 5      F             S Beverly Hills         Y              CA     USA
## 6      F             M Beverly Hills         Y              CA     USA
##   ProductFamily ProductDepartment      ProductCategory
## 1          Food       Snack Foods          Snack Foods
## 2          Food           Produce           Vegetables
## 3          Food       Snack Foods          Snack Foods
## 4          Food            Snacks                Candy
## 5         Drink         Beverages Carbonated Beverages
## 6          Food              Deli          Side Dishes

3 Lập bảng tần số và vẽ biểu đồ

Gender (Giới tính):
Bảng tần số:

# Share of rows per Gender value: counts divided by the total row count.
table(df[["Gender"]]) / nrow(df)

## 
##         F         M 
## 0.5099936 0.4900064

Vậy trong bộ dữ liệu này có khoảng 51.00% là nữ và 49.00% là nam.

Vẽ biểu đồ:

Chuẩn bị số liệu

Biểu đồ cột

# Bar chart of Frequency by Gender with a percentage label above each bar.
# gender_d is built outside this chunk; it is expected to provide the
# Gender, Frequency and Percent columns used below.
ggplot(gender_d, aes(Gender, Frequency, fill = Gender)) +
  geom_bar(stat = "identity") +
  # add percentage labels
  geom_text(aes(label = paste0(Percent, "%")), vjust = -0.5) +
  scale_fill_manual(values = c("M" = "yellow", "F" = "pink")) +
  labs(
    title = "Biểu đồ phân bố giới tính",
    x = "Giới tính",
    y = "Số lượng"
  ) +
  theme_minimal()

Biểu đồ tròn

# Pie chart of Frequency by Gender: a single stacked bar bent into a circle
# by mapping y onto the polar angle. gender_d is built outside this chunk;
# it is expected to provide the Gender, Frequency and Label columns.
ggplot(gender_d, aes(x = "", y = Frequency, fill = Gender)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  # place each label in the middle of its slice
  geom_text(aes(label = Label), position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("M" = "gray", "F" = "pink")) +
  labs(title = "Biểu đồ phân bố giới tính") +
  theme_void()

MaritalStatus (Tình trạng hôn nhân):
Bảng tần số:

# Share of rows per MaritalStatus value: counts divided by the row count.
table(df[["MaritalStatus"]]) / nrow(df)

## 
##         M         S 
## 0.4883704 0.5116296

Vậy trong bộ dữ liệu này có khoảng 48.84% là đã kết hôn và 51.16% là còn độc thân.

Vẽ biểu đồ:

Biểu đồ cột

# Bar chart of Frequency by MaritalStatus with a percentage label above each
# bar. marital_d is built outside this chunk; it is expected to provide the
# MaritalStatus, Frequency and Percent columns used below.
ggplot(marital_d, aes(MaritalStatus, Frequency, fill = MaritalStatus)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(Percent, "%")), vjust = -0.5) +
  labs(
    title = "Bar chart",
    x = "MaritalStatus",
    y = "Count"
  ) +
  theme_minimal()

Biểu đồ tròn

# Pie chart of Frequency by MaritalStatus: a single stacked bar bent into a
# circle by mapping y onto the polar angle. marital_d is built outside this
# chunk; it is expected to provide MaritalStatus, Frequency and Label.
ggplot(marital_d, aes(x = "", y = Frequency, fill = MaritalStatus)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  # place each label in the middle of its slice
  geom_text(aes(label = Label), position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("S" = "gray", "M" = "pink")) +
  labs(title = "Pie chart") +
  theme_void()

Thực hành 1

Nguyễn Thị Hoàng Yến

2025-05-19

1 Đọc dữ liệu

2 Lọc dữ liệu định tính và kiểm tra lại

3 Lập bảng tần số và vẽ biểu đồ