1. Đọc dữ liệu

# Read the CSV file into a data frame and show its first ten rows.
data2 <- read.csv(
  file = "/Users/lengoctuongvy/Downloads/TLHK2:2025/T2_PHÂN TÍCH DỮ LIỆU ĐỊNH TÍNH/Book2.csv"
)
head(data2, n = 10)
X PurchaseDate CustomerID Gender MaritalStatus Homeowner Children AnnualIncome City StateorProvince Country ProductFamily ProductDepartment ProductCategory UnitsSold Revenue
1 12/18/2007 7223 F S Y 2 $30K - $50K Los Angeles CA USA Food Snack Foods Snack Foods 5 27.38
2 12/20/2007 7841 M M Y 5 $70K - $90K Los Angeles CA USA Food Produce Vegetables 5 14.90
3 12/21/2007 8374 F M N 2 $50K - $70K Bremerton WA USA Food Snack Foods Snack Foods 3 5.52
4 12/21/2007 9619 M M Y 3 $30K - $50K Portland OR USA Food Snacks Candy 4 4.44
5 12/22/2007 1900 F S Y 3 $130K - $150K Beverly Hills CA USA Drink Beverages Carbonated Beverages 4 14.00
6 12/22/2007 6696 F M Y 3 $10K - $30K Beverly Hills CA USA Food Deli Side Dishes 3 4.37
7 12/23/2007 9673 M S Y 2 $30K - $50K Salem OR USA Food Frozen Foods Breakfast Foods 4 13.78
8 12/25/2007 354 F M Y 2 $150K + Yakima WA USA Food Canned Foods Canned Soup 6 7.34
9 12/25/2007 1293 M M Y 3 $10K - $30K Bellingham WA USA Non-Consumable Household Cleaning Supplies 1 2.41
10 12/25/2007 7938 M S N 1 $50K - $70K San Diego CA USA Non-Consumable Health and Hygiene Pain Relievers 2 8.96

2. Chọn biến định tính

# List every column name of the loaded data frame.
colnames(data2)
##  [1] "X"                 "PurchaseDate"      "CustomerID"       
##  [4] "Gender"            "MaritalStatus"     "Homeowner"        
##  [7] "Children"          "AnnualIncome"      "City"             
## [10] "StateorProvince"   "Country"           "ProductFamily"    
## [13] "ProductDepartment" "ProductCategory"   "UnitsSold"        
## [16] "Revenue"
# Names of the columns selected as qualitative (categorical) variables.
tbdt <- c(
  "Gender", "MaritalStatus", "Homeowner", "AnnualIncome",
  "City", "StateorProvince", "Country",
  "ProductFamily", "ProductDepartment", "ProductCategory"
)

3. Tạo dữ liệu mới chỉ có biến định tính

# Keep only the columns named in `tbdt`, then preview the first rows.
dt <- data2[tbdt]
head(dt)
Gender MaritalStatus Homeowner AnnualIncome City StateorProvince Country ProductFamily ProductDepartment ProductCategory
F S Y $30K - $50K Los Angeles CA USA Food Snack Foods Snack Foods
M M Y $70K - $90K Los Angeles CA USA Food Produce Vegetables
F M N $50K - $70K Bremerton WA USA Food Snack Foods Snack Foods
M M Y $30K - $50K Portland OR USA Food Snacks Candy
F S Y $130K - $150K Beverly Hills CA USA Drink Beverages Carbonated Beverages
F M Y $10K - $30K Beverly Hills CA USA Food Deli Side Dishes
# Proportion of each Gender value.
# prop.table() divides the counts by their own total, so the shares always
# sum to 1 even when table() drops NA entries; dividing by nrow(data2) would
# not guarantee that. This also matches how the pie-chart shares are computed.
gt <- prop.table(table(data2$Gender))
gt
## 
##         F         M 
## 0.5099936 0.4900064

Vậy trong bộ dữ liệu này có 50.9993598% nữ và 49.0006402% nam.

# Frequency table of the Gender column.
gender_freq <- table(data2$Gender)

# Bar chart of the counts. barplot() returns the bar midpoints, which are
# reused as the x positions of the count labels; pos = 1 places each label
# just below the top of its bar.
gender_bar_x <- barplot(
  gender_freq,
  col = c("purple", "green"),
  main = "Biểu đồ cột: Gender",
  xlab = "Gender",
  ylab = "Tần số",
  ylim = c(0, max(gender_freq) + 1)
)
text(x = gender_bar_x, y = gender_freq, labels = gender_freq, pos = 1)

# Pie chart whose slice labels show each gender's share in percent
# (rounded to one decimal place).
gender_prop <- prop.table(gender_freq)
labels_percent <- paste0(
  names(gender_prop), ": ",
  round(100 * gender_prop, digits = 1), "%"
)

pie(
  gender_freq,
  col = c("purple", "green"),
  main = "Pie chart with percentages",
  labels = labels_percent
)

Ghi chú: `eval = TRUE` là thực hiện câu lệnh, `eval = FALSE` là không thực hiện câu lệnh; `echo = TRUE` là hiển thị câu lệnh trong kết quả.

# Proportion of each Country value.
# NOTE: despite its name, `thanhpho` holds shares of the Country column.
# prop.table() divides the counts by their own total, so the shares always
# sum to 1 even when table() drops NA entries; dividing by nrow(data2) would
# not guarantee that.
thanhpho <- prop.table(table(data2$Country))
thanhpho
## 
##     Canada     Mexico        USA 
## 0.05754321 0.26232307 0.68013372

Vậy trong bộ dữ liệu này có 26.232307% Mexico, 5.754321% Canada và 68.013372% USA.

# Frequency table of the Country column.
f2 <- table(data2$Country)

# Bar chart of the counts. barplot() returns the bar midpoints, which are
# reused as the x positions of the count labels; pos = 1 places each label
# just below the top of its bar.
country_bar_x <- barplot(
  f2,
  col = c("purple", "green", "orange"),
  main = "Biểu đồ cột: Country",
  xlab = "Country",
  ylab = "Tần số",
  ylim = c(0, max(f2) + 1)
)
text(x = country_bar_x, y = f2, labels = f2, pos = 1)

# Pie chart whose slice labels show each country's share in percent
# (rounded to one decimal place). Note that `labels_percent` is overwritten.
country_prop <- prop.table(f2)
labels_percent <- paste0(
  names(country_prop), ": ",
  round(100 * country_prop, digits = 1), "%"
)

pie(
  f2,
  col = c("purple", "green", "orange"),
  main = "Pie chart with percentages",
  labels = labels_percent
)