library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Load the supermarket transactions; the first line of the CSV holds the
# column names. TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved word.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
data <- read.csv("D:/PTDLDT/Supermarket_Transactions.csv", header = TRUE)
# Render the whole data set as an interactive HTML table.
datatable(data)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# List the column names of the imported data frame.
colnames(data)
## [1] "X" "PurchaseDate" "CustomerID"
## [4] "Gender" "MaritalStatus" "Homeowner"
## [7] "Children" "AnnualIncome" "City"
## [10] "StateorProvince" "Country" "ProductFamily"
## [13] "ProductDepartment" "ProductCategory" "UnitsSold"
## [16] "Revenue"
# Names of the qualitative (categorical) variables analysed below.
categorical_columns <- c(
  "Gender", "MaritalStatus", "City", "Homeowner", "StateorProvince",
  "Country", "ProductFamily", "ProductDepartment", "ProductCategory"
)
# Keep only those columns, in the order listed above.
df <- data[categorical_columns]
# Preview the first rows of the categorical subset.
head(df)
## Gender MaritalStatus City Homeowner StateorProvince Country
## 1 F S Los Angeles Y CA USA
## 2 M M Los Angeles Y CA USA
## 3 F M Bremerton N WA USA
## 4 M M Portland Y OR USA
## 5 F S Beverly Hills Y CA USA
## 6 F M Beverly Hills Y CA USA
## ProductFamily ProductDepartment ProductCategory
## 1 Food Snack Foods Snack Foods
## 2 Food Produce Vegetables
## 3 Food Snack Foods Snack Foods
## 4 Food Snacks Candy
## 5 Drink Beverages Carbonated Beverages
## 6 Food Deli Side Dishes
# Relative frequency of each gender: count per level divided by the number of
# rows in `df`. nrow() already returns a single number, so no sum() is needed.
# Note: table() leaves out NA by default, so the shares add up to 1 only when
# Gender has no missing values.
table(df$Gender) / nrow(df)
##
## F M
## 0.5099936 0.4900064
Vậy trong bộ dữ liệu này có 50.9993598% là nữ và 49.0006402% là nam.
Chuẩn bị số liệu
Biểu đồ cột
# Bar chart of the gender counts held in `gender_d` (uses its Gender,
# Frequency and Percent columns); one bar per gender, filled by gender.
ggplot(
  data = gender_d,
  mapping = aes(x = Gender, y = Frequency, fill = Gender)
) +
  geom_col() +
  # Add the percentage label just above each bar.
  geom_text(mapping = aes(label = paste0(Percent, "%")), vjust = -0.5) +
  labs(
    x = "Giới tính",
    y = "Số lượng",
    title = "Biểu đồ phân bố giới tính"
  ) +
  scale_fill_manual(values = c("M" = "yellow", "F" = "pink")) +
  theme_minimal()
Biểu đồ tròn
# Pie chart of the gender counts in `gender_d`: a single stacked bar
# (constant x) drawn in polar coordinates, so each slice's angle follows
# Frequency.
ggplot(
  data = gender_d,
  mapping = aes(x = "", y = Frequency, fill = Gender)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  # Place each slice's text (the `Label` column) at the middle of the slice.
  geom_text(
    mapping = aes(label = Label),
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Biểu đồ phân bố giới tính") +
  scale_fill_manual(values = c("M" = "gray", "F" = "pink")) +
  theme_void()
MaritalStatus (Tình trạng hôn nhân):
Bảng tần số:
# Relative frequency of each marital status: count per level divided by the
# number of rows in `df`. nrow() already returns a single number, so no sum()
# is needed.
# Note: table() leaves out NA by default, so the shares add up to 1 only when
# MaritalStatus has no missing values.
table(df$MaritalStatus) / nrow(df)
##
## M S
## 0.4883704 0.5116296
Vậy trong bộ dữ liệu này có 48.8370439% là đã kết hôn và 51.1629561% là còn độc thân.
Biểu đồ cột
# Bar chart of the marital-status counts held in `marital_d` (uses its
# MaritalStatus, Frequency and Percent columns); one bar per status.
ggplot(
  data = marital_d,
  mapping = aes(x = MaritalStatus, y = Frequency, fill = MaritalStatus)
) +
  geom_col() +
  # Add the percentage label just above each bar.
  geom_text(mapping = aes(label = paste0(Percent, "%")), vjust = -0.5) +
  labs(
    x = "MaritalStatus",
    y = "Count",
    title = "Bar chart"
  ) +
  theme_minimal()
Biểu đồ tròn
# Pie chart of the marital-status counts in `marital_d`: a single stacked bar
# (constant x) drawn in polar coordinates, so each slice's angle follows
# Frequency.
ggplot(
  data = marital_d,
  mapping = aes(x = "", y = Frequency, fill = MaritalStatus)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  # Place each slice's text (the `Label` column) at the middle of the slice.
  geom_text(
    mapping = aes(label = Label),
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Pie chart") +
  scale_fill_manual(values = c("S" = "gray", "M" = "pink")) +
  theme_void()