library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(epitools)
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.3
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
library(energy)
## Warning: package 'energy' was built under R version 4.4.3
library(readr)
options(digits = 4)
d <- read_csv("C:/Users/Hoang Quyen/Downloads/Supermarket Transactions.csv")
## New names:
## Rows: 14059 Columns: 16
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (10): Gender, MaritalStatus, Homeowner, AnnualIncome, City, StateorProv... dbl
## (5): ...1, CustomerID, Children, UnitsSold, Revenue date (1): PurchaseDate
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
str(d)
## spc_tbl_ [14,059 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:14059] 1 2 3 4 5 6 7 8 9 10 ...
## $ PurchaseDate : Date[1:14059], format: "2007-12-18" "2007-12-20" ...
## $ CustomerID : num [1:14059] 7223 7841 8374 9619 1900 ...
## $ Gender : chr [1:14059] "F" "M" "F" "M" ...
## $ MaritalStatus : chr [1:14059] "S" "M" "M" "M" ...
## $ Homeowner : chr [1:14059] "Y" "Y" "N" "Y" ...
## $ Children : num [1:14059] 2 5 2 3 3 3 2 2 3 1 ...
## $ AnnualIncome : chr [1:14059] "$30K - $50K" "$70K - $90K" "$50K - $70K" "$30K - $50K" ...
## $ City : chr [1:14059] "Los Angeles" "Los Angeles" "Bremerton" "Portland" ...
## $ StateorProvince : chr [1:14059] "CA" "CA" "WA" "OR" ...
## $ Country : chr [1:14059] "USA" "USA" "USA" "USA" ...
## $ ProductFamily : chr [1:14059] "Food" "Food" "Food" "Food" ...
## $ ProductDepartment: chr [1:14059] "Snack Foods" "Produce" "Snack Foods" "Snacks" ...
## $ ProductCategory : chr [1:14059] "Snack Foods" "Vegetables" "Snack Foods" "Candy" ...
## $ UnitsSold : num [1:14059] 5 5 3 4 4 3 4 6 1 2 ...
## $ Revenue : num [1:14059] 27.38 14.9 5.52 4.44 14 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. PurchaseDate = col_date(format = ""),
## .. CustomerID = col_double(),
## .. Gender = col_character(),
## .. MaritalStatus = col_character(),
## .. Homeowner = col_character(),
## .. Children = col_double(),
## .. AnnualIncome = col_character(),
## .. City = col_character(),
## .. StateorProvince = col_character(),
## .. Country = col_character(),
## .. ProductFamily = col_character(),
## .. ProductDepartment = col_character(),
## .. ProductCategory = col_character(),
## .. UnitsSold = col_double(),
## .. Revenue = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
bdt <- c("Gender","MaritalStatus", "Homeowner", "City", "StateorProvince", "Country",
"ProductFamily", "ProductDepartment", "ProductCategory" )
print(bdt)
## [1] "Gender" "MaritalStatus" "Homeowner"
## [4] "City" "StateorProvince" "Country"
## [7] "ProductFamily" "ProductDepartment" "ProductCategory"
dldt <- d[, bdt]
head(dldt)
## # A tibble: 6 × 9
## Gender MaritalStatus Homeowner City StateorProvince Country ProductFamily
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 F S Y Los Ange… CA USA Food
## 2 M M Y Los Ange… CA USA Food
## 3 F M N Bremerton WA USA Food
## 4 M M Y Portland OR USA Food
## 5 F S Y Beverly … CA USA Drink
## 6 F M Y Beverly … CA USA Food
## # ℹ 2 more variables: ProductDepartment <chr>, ProductCategory <chr>
table(dldt$Gender)
##
## F M
## 7170 6889
table(dldt$Gender) / sum(table(d$Gender))
##
## F M
## 0.51 0.49
Vậy trong bộ dữ liệu này có 50.9994 % là nữ và 49.0006% là nam.
ggplot(dldt, aes(Gender)) +
geom_bar(fill = 'steelblue') +
xlab('Giới tính') + ylab('Số lượng')
ggplot(dldt, aes(Gender)) +
geom_bar(aes(y = after_stat(count)/sum(after_stat(count))), fill = 'steelblue') +
xlab('Giới tính') + ylab('Tỷ lệ %')
gender <- dldt |> group_by(Gender) |> summarise(freq = n()) |> mutate(per = freq/sum(freq))
ggplot(gender, aes(x = '', y = per, fill = Gender)) +
geom_bar(stat = 'identity') +
coord_polar('y') +
theme_void() +
labs(fill = 'Giới tính')
table(dldt$MaritalStatus)
##
## M S
## 6866 7193
table(dldt$MaritalStatus) / sum(table(d$MaritalStatus))
##
## M S
## 0.4884 0.5116
Vậy trong bộ dữ liệu này có 48.837 % là đã kết hôn và 51.163% là chưa kết hôn.
ggplot(dldt, aes(MaritalStatus)) +
geom_bar(fill = 'darkorange') +
xlab('Tình trạng hôn nhân') + ylab('Số lượng')
ggplot(dldt, aes(MaritalStatus)) +
geom_bar(aes(y = after_stat(count)/sum(after_stat(count))), fill = 'darkorange') +
xlab('Tình trạng hôn nhân') + ylab('Tỷ lệ %')
marital <- dldt |> group_by(MaritalStatus) |> summarise(freq = n()) |> mutate(per = freq/sum(freq))
ggplot(marital, aes(x = '', y = per, fill = MaritalStatus)) +
geom_bar(stat = 'identity') +
coord_polar('y') +
theme_void() +
labs(fill = 'Tình trạng hôn nhân')
## Biến StateorProvince (Bang/Tỉnh)
table(dldt$StateorProvince)
##
## BC CA DF Guerrero Jalisco OR Veracruz WA
## 809 2733 815 383 75 2262 464 4567
## Yucatan Zacatecas
## 654 1297
table(dldt$StateorProvince) / sum(table(d$StateorProvince))
##
## BC CA DF Guerrero Jalisco OR Veracruz WA
## 0.057543 0.194395 0.057970 0.027242 0.005335 0.160893 0.033004 0.324845
## Yucatan Zacatecas
## 0.046518 0.092254
ggplot(dldt, aes(fct_infreq(StateorProvince))) +
geom_bar(color = 'blue', fill = 'coral') +
xlab('Bang/Tỉnh') + ylab('Số lượng') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(dldt, aes(fct_infreq(StateorProvince))) +
geom_bar(aes(y = after_stat(count)/sum(after_stat(count))), color = 'blue', fill = 'coral') +
xlab('Bang/Tỉnh') + ylab('Tỷ lệ %') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
state <- dldt |> group_by(StateorProvince) |> summarise(freq = n()) |> mutate(per = freq/sum(freq))
ggplot(state, aes(x = '', y = per, fill = StateorProvince)) +
geom_bar(stat = 'identity') +
coord_polar('y') +
theme_void() +
labs(fill = 'Bang/Tỉnh')
table(dldt$Homeowner)
##
## N Y
## 5615 8444
table(dldt$Homeowner) / sum(table(d$Homeowner))
##
## N Y
## 0.3994 0.6006
Vậy trong bộ dữ liệu này có 39.9388 % là không sỡ hữu nhà và 60.0612% là sỡ hữu nhà.
ggplot(dldt, aes(Homeowner)) +
geom_bar(fill = 'forestgreen') +
xlab('Sở hữu nhà') + ylab('Số lượng')
ggplot(dldt, aes(Homeowner)) +
geom_bar(aes(y = after_stat(count)/sum(after_stat(count))), fill = 'forestgreen') +
xlab('Sở hữu nhà') + ylab('Tỷ lệ %')
homeowner <- dldt |> group_by(Homeowner) |> summarise(freq = n()) |> mutate(per = freq/sum(freq))
ggplot(homeowner, aes(x = '', y = per, fill = Homeowner)) +
geom_bar(stat = 'identity') +
coord_polar('y') +
theme_void() +
labs(fill = 'Sở hữu nhà')
table(dldt$ProductFamily)
##
## Drink Food Non-Consumable
## 1250 10153 2656
table(dldt$ProductFamily) / sum(table(dldt$ProductFamily))
##
## Drink Food Non-Consumable
## 0.08891 0.72217 0.18892
ggplot(dldt, aes(ProductFamily)) +
geom_bar(fill = 'purple') +
xlab('Nhóm sản phẩm') + ylab('Số lượng')
ggplot(dldt, aes(ProductFamily)) +
geom_bar(aes(y = after_stat(count)/sum(after_stat(count))), fill = 'purple') +
xlab('Nhóm sản phẩm') + ylab('Tỷ lệ %')
product_family <- dldt |> group_by(ProductFamily) |> summarise(freq = n()) |> mutate(per = freq/sum(freq))
ggplot(product_family, aes(x = '', y = per, fill = ProductFamily)) +
geom_bar(stat = 'identity') +
coord_polar('y') +
theme_void() +
labs(fill = 'Nhóm sản phẩm')
## Biến ProductDepartment (Bộ phận sản phẩm)
table(dldt$ProductDepartment)
##
## Alcoholic Beverages Baked Goods Baking Goods Beverages
## 356 425 1072 680
## Breakfast Foods Canned Foods Canned Products Carousel
## 188 977 109 59
## Checkout Dairy Deli Eggs
## 82 903 699 198
## Frozen Foods Health and Hygiene Household Meat
## 1382 893 1420 89
## Periodicals Produce Seafood Snack Foods
## 202 1994 102 1600
## Snacks Starchy Foods
## 352 277
table(dldt$ProductDepartment) / sum(table(dldt$ProductDepartment))
##
## Alcoholic Beverages Baked Goods Baking Goods Beverages
## 0.025322 0.030230 0.076250 0.048368
## Breakfast Foods Canned Foods Canned Products Carousel
## 0.013372 0.069493 0.007753 0.004197
## Checkout Dairy Deli Eggs
## 0.005833 0.064229 0.049719 0.014084
## Frozen Foods Health and Hygiene Household Meat
## 0.098300 0.063518 0.101003 0.006330
## Periodicals Produce Seafood Snack Foods
## 0.014368 0.141831 0.007255 0.113806
## Snacks Starchy Foods
## 0.025037 0.019703
ggplot(dldt, aes(ProductDepartment)) +
geom_bar(fill = 'purple') +
xlab('Bộ phận sản phẩm') + ylab('Số lượng')
ggplot(dldt, aes(ProductDepartment)) +
geom_bar(aes(y = after_stat(count)/sum(after_stat(count))), fill = 'purple') +
xlab('Bộ phận sản phẩm') + ylab('Tỷ lệ %')
product_department <- d |> group_by(ProductDepartment) |> summarise(freq = n()) |> mutate(per = freq/sum(freq))
ggplot(product_department, aes(x = '', y = per, fill = ProductDepartment)) +
geom_bar(stat = 'identity') +
coord_polar('y') +
theme_void() +
labs(fill = 'Bộ phận sản phẩm')
gender_homeowner <- table(dldt$Gender, dldt$Homeowner)
gender_homeowner #tạo bảng chéo
##
## N Y
## F 2826 4344
## M 2789 4100
Giải thích bảng: Hàng F (nữ): có 2826 giao dịch của nữ không sở hữu nhà (N) và 4344 giao dịch của nữ sở hữu nhà (Y). Hàng M (nam): có 2789 giao dịch của nam không sở hữu nhà (N) và 4100 giao dịch của nam sở hữu nhà (Y).
gender_homeowner_prop<- prop.table(gender_homeowner)
addmargins(gender_homeowner_prop) #Tính tỷ lệ và thêm tổng
##
## N Y Sum
## F 0.2010 0.3090 0.5100
## M 0.1984 0.2916 0.4900
## Sum 0.3994 0.6006 1.0000
Nữ chiếm 51.00% giao dịch. Trong đó, 20.10% giao dịch là nữ không sở hữu nhà và 30.90% giao dịch là nữ sở hữu nhà.
Nam chiếm 49.00% giao dịch.Trong đó, 19.84% giao dịch là nam không sở hữu nhà và 29.16% (4100/14,059) giao dịch là nam sở hữu nhà.