Thực hành tuần 2 (ngày 19/5)

Đọc dữ liệu

# NOTE(review): read.csv() is provided by base R's utils package, so this
# library(csv) call looks unnecessary — confirm before removing it.
library(csv)
# Load the supermarket transactions; the first row of the file holds the
# column names. TRUE is spelled out because T is an ordinary, reassignable
# variable.
d <- read.csv("D:/UFM/2025- Kì 2/Phân tích dữ liệu định tính - Trần Mạnh Tường/Supermarket Transactions.csv", header = TRUE)
# List the column names of the loaded data frame.
names(d)
##  [1] "X"                 "PurchaseDate"      "CustomerID"       
##  [4] "Gender"            "MaritalStatus"     "Homeowner"        
##  [7] "Children"          "AnnualIncome"      "City"             
## [10] "StateorProvince"   "Country"           "ProductFamily"    
## [13] "ProductDepartment" "ProductCategory"   "UnitsSold"        
## [16] "Revenue"
# Names of the columns treated as qualitative (categorical) variables.
tbdt <- c(
  "Gender", "MaritalStatus", "Homeowner",
  "City", "StateorProvince", "Country",
  "ProductFamily", "ProductDepartment", "ProductCategory"
)
# Keep only those columns, then preview the first rows of the subset.
dc <- d[, tbdt]
head(dc)
| Gender | MaritalStatus | Homeowner | City | StateorProvince | Country | ProductFamily | ProductDepartment | ProductCategory |
|---|---|---|---|---|---|---|---|---|
| F | S | Y | Los Angeles | CA | USA | Food | Snack Foods | Snack Foods |
| M | M | Y | Los Angeles | CA | USA | Food | Produce | Vegetables |
| F | M | N | Bremerton | WA | USA | Food | Snack Foods | Snack Foods |
| M | M | Y | Portland | OR | USA | Food | Snacks | Candy |
| F | S | Y | Beverly Hills | CA | USA | Drink | Beverages | Carbonated Beverages |
| F | M | Y | Beverly Hills | CA | USA | Food | Deli | Side Dishes |

Biến Gender

Tần số giới tính

# Absolute frequency (number of rows) for each value of Gender.
table(d[["Gender"]])
## 
##    F    M 
## 7170 6889

Tần suất giới tính

# Relative frequency of each Gender value: count divided by the total number
# of rows. nrow(d) is already a single number, so no sum() is needed around it.
table(d$Gender) / nrow(d)
## 
##         F         M 
## 0.5099936 0.4900064

Vậy trong bộ dữ liệu này có 50.9993598% nữ và 49.0006402% nam.

Biểu đồ tần số

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Bar chart of absolute frequencies (counts) by gender.
# The bar height is the row count n, so the axis label and caption say
# "tần số" (frequency) rather than "tần suất" (relative frequency).
d %>%
  group_by(Gender) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = Gender, y = n)) +
  geom_col(fill = "lightblue") +
  labs(
    x = "Giới tính",
    y = "tần số",
    caption = "Biểu đồ tần số giới tính"
  ) +
  # Print each count just below the top edge of its bar.
  geom_text(aes(label = n), vjust = 2, color = "black")

Biểu đồ tần suất

# Bar chart of relative frequencies (proportions) by gender.
# The proportion is computed inside the pipeline so that the bar height and
# the label both come from the same summarised row, instead of mapping an
# external table object into aes().
d %>%
  group_by(Gender) %>%
  summarise(n = n()) %>%
  # After summarise() there is one row per gender, so sum(n) is the total
  # number of rows in d.
  mutate(p = n / sum(n)) %>%
  ggplot(aes(x = Gender, y = p)) +
  geom_col(fill = "pink") +
  labs(
    x = "Giới tính",
    y = "tần suất",
    caption = "Biểu đồ tần suất giới tính"
  ) +
  # Label each bar with its share as a percentage, two decimal places.
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * p)),
    vjust = 2,
    color = "black"
  )

LS0tDQp0aXRsZTogIkLDoGkgdOG6rXAgdGjhu7FjIGjDoG5oIGPDoWMgdHXhuqduIFBURExEVCINCmF1dGhvcjogIlbFqSBRdeG7s25oIFRyw7pjIFZ5Ig0KZGF0ZTogImByIGZvcm1hdChTeXMudGltZSgpLCAnJUg6JU06JVMsICVkIC0gJW0gLSAlWScpYCINCm91dHB1dDoNCiAgaHRtbF9kb2N1bWVudDoNCiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlDQogICAgY29kZV9mb2xkaW5nOiBoaWRlDQogICAgdG9jX2Zsb2F0OiB0cnVlDQogICAgdG9jOiB0cnVlDQogICAgZGZfcHJpbnQ6IGthYmxlDQotLS0NCg0KYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9DQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpDQpgYGANCg0KDQoNCg0KIyAqKlRo4buxYyBow6BuaCB0deG6p24gMiAobmfDoHkgMTkvNSkqKg0KDQojIyAqKsSQ4buNYyBk4buvIGxp4buHdSoqDQoNCmBgYHtyfQ0KbGlicmFyeShjc3YpDQpkIDwtIHJlYWQuY3N2KCJEOi9VRk0vMjAyNS0gS8OsIDIvUGjDom4gdMOtY2ggZOG7ryBsaeG7h3UgxJHhu4tuaCB0w61uaCAtIFRy4bqnbiBN4bqhbmggVMaw4budbmcvU3VwZXJtYXJrZXQgVHJhbnNhY3Rpb25zLmNzdiIsIGhlYWRlciA9IFQpDQpgYGANCg0KDQoNCmBgYHtyfQ0KbmFtZXMoZCkNCnRiZHQgPC0gYygiR2VuZGVyIiwiTWFyaXRhbFN0YXR1cyIsIkhvbWVvd25lciIsIkNpdHkiLCJTdGF0ZW9yUHJvdmluY2UiLCJDb3VudHJ5IiwiUHJvZHVjdEZhbWlseSIsIlByb2R1Y3REZXBhcnRtZW50IiwiUHJvZHVjdENhdGVnb3J5IikNCmRjIDwtIGRbLHRiZHRdDQpoZWFkKGRjKQ0KYGBgDQoNCiMjICoqQmnhur9uIEdlbmRlcioqDQoNCiMjIyAqKlThuqduIHPhu5EgZ2nhu5tpIHTDrW5oKioNCg0KYGBge3J9DQp0YWJsZShkJEdlbmRlcikNCmBgYA0KDQoNCiMjIyAqKlThuqduIHN14bqldCBnaeG7m2kgdMOtbmgqKg0KDQpgYGB7cn0NCnRhYmxlKGQkR2VuZGVyKS9zdW0obnJvdyhkKSkNCmBgYA0KDQpW4bqteSB0cm9uZyBiZGwgbsOgeSBjw7MgYHIgdGFibGUoZCRHZW5kZXIpWzFdL3N1bShucm93KGQpKSoxMDBgJSBu4buvIHbDoCBgciB0YWJsZShkJEdlbmRlcilbMl0vc3VtKG5yb3coZCkpKjEwMGAlIG5hbS4NCg0KIyMjICoqQmnhu4N1IMSR4buTIHThuqduIHPhu5EqKg0KDQpgYGB7cn0NCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KGdncGxvdDIpDQoNCmQgJT4lIGdyb3VwX2J5KEdlbmRlcikgJT4lIHN1bW1hcmlzZShuID0gbigpKSAlPiUNCiAgZ2dwbG90KGFlcyh4ID0gR2VuZGVyLCB5ID0gbikpKw0KICBnZW9tX2NvbChmaWxsPSdsaWdodGJsdWUnKSsNCiAgbGFicyh4ID0gIkdp4bubaSB0w61uaCIsIHk9InThuqduIHN14bqldCIpKw0KICBsYWJzKGNhcHRpb24gPSAiQmnhu4N1IMSR4buTIHThuqduIHN14bqldCBnaeG7m2kgdMOtbmgiKSsNCiAgZ2VvbV90ZXh0KGFlcyhsYWJlbCA9biksIHZqdXN0PTIsIGNvbG9yID0gJ2JsYWNrJykNCmBgYA0KDQojIyMgKipCaeG7g3UgxJHhu5MgdOG6p24gc3XhuqV0KioN
Cg0KYGBge3J9DQpkICU+JSBncm91cF9ieShHZW5kZXIpICU+JSBzdW1tYXJpc2UobiA9IG4oKSkgJT4lDQogIGdncGxvdChhZXMoeCA9IEdlbmRlciwgeSA9IG4pKSsNCiAgZ2VvbV9jb2woZmlsbD0ncGluaycpKw0KICBsYWJzKHggPSAiR2nhu5tpIHTDrW5oIiwgeT0idOG6p24gc3XhuqV0IikrDQogIGxhYnMoY2FwdGlvbiA9ICJCaeG7g3UgxJHhu5MgdOG6p24gc3XhuqV0IGdp4bubaSB0w61uaCIpKw0KICBnZW9tX3RleHQoYWVzKGxhYmVsID0gdGFibGUoZCRHZW5kZXIpL3N1bShucm93KGQpKSksIHZqdXN0PTIsIGNvbG9yID0gJ2JsYWNrJykNCmBgYA0KDQoNCg0KDQoNCg0K