library(nnet)
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
multinom <- nnet::multinom
data_raw <- read.csv("fashion_data_uk_us.csv", stringsAsFactors = FALSE)
colnames(data_raw) <- make.names(colnames(data_raw))
cat("Dimensi awal:", dim(data_raw), "\n")
## Dimensi awal: 1000000 20
cat("Kolom:", paste(colnames(data_raw), collapse = ", "), "\n")
## Kolom: Product.Name, Price, Brand, Category, Description, Rating, Review.Count, Style.Attributes, Total.Sizes, Available.Sizes, Color, Purchase.History, Age, Fashion.Magazines, Fashion.Influencers, Season, Time.Period.Highest.Purchase, Customer.Reviews, Social.Media.Comments, feedback
Dataset memiliki 1.000.000 baris. Diambil sampel 10.000 baris agar komputasi tidak crash.
set.seed(42)
idx <- sample(nrow(data_raw), 10000)
data <- data_raw[idx, ]
rownames(data) <- NULL
cat("Dimensi setelah sampling:", dim(data), "\n")
## Dimensi setelah sampling: 10000 20
head(data[, 1:8])
## Product.Name Price Brand Category Description Rating
## 1 Q9M5 51.19296 Burberry Accessories Good 1.297044
## 2 W4U9 73.42901 Burberry Lingerie Not Good 2.080760
## 3 E9F9 93.26544 Jigsaw Dresses Best 1.835937
## 4 A8R9 72.03617 Ted Baker Lingerie Best 1.998143
## 5 U9E8 82.52572 Ted Baker Dresses Not Good 4.780790
## 6 R6T9 87.26570 Calvin Klein Swimwear Good 1.491257
## Review.Count Style.Attributes
## 1 69 Bohemian
## 2 417 Edgy
## 3 358 Vintage
## 4 458 Preppy
## 5 349 Edgy
## 6 263 Vintage
kolom_hapus <- c("Product.Name", "Description", "Style.Attributes",
"Fashion.Magazines", "Fashion.Influencers",
"Time.Period.Highest.Purchase", "Customer.Reviews",
"Social.Media.Comments", "feedback", "Total.Sizes")
kolom_hapus <- kolom_hapus[kolom_hapus %in% colnames(data)]
data[kolom_hapus] <- NULL
cat("Kolom tersisa:\n")
## Kolom tersisa:
print(colnames(data))
## [1] "Price" "Brand" "Category" "Rating"
## [5] "Review.Count" "Available.Sizes" "Color" "Purchase.History"
## [9] "Age" "Season"
Kolom yang tidak relevan untuk model dihapus, tersisa 10 variabel penting.
data$Price <- as.numeric(gsub("[^0-9.]", "", data$Price))
data$Review.Count <- as.numeric(gsub("[^0-9.]", "", data$Review.Count))
data$Age <- as.numeric(data$Age)
cat("Cek tipe numerik:\n")
## Cek tipe numerik:
str(data[, c("Price", "Review.Count", "Age")])
## 'data.frame': 10000 obs. of 3 variables:
## $ Price : num 51.2 73.4 93.3 72 82.5 ...
## $ Review.Count: num 69 417 358 458 349 263 446 33 147 235 ...
## $ Age : num 61 63 25 63 49 27 21 18 46 36 ...
Variabel angka sudah dipastikan numeric.
data$Category <- as.factor(data$Category)
data$Brand <- as.factor(data$Brand)
data$Color <- as.factor(data$Color)
data$Season <- as.factor(data$Season)
data$Available.Sizes <- as.factor(data$Available.Sizes)
data$Purchase.History <- as.factor(data$Purchase.History)
cat("Cek tipe faktor:\n")
## Cek tipe faktor:
str(data[, c("Category","Brand","Color","Season","Available.Sizes","Purchase.History")])
## 'data.frame': 10000 obs. of 6 variables:
## $ Category : Factor w/ 10 levels "Accessories",..: 1 7 4 7 4 9 3 6 8 6 ...
## $ Brand : Factor w/ 8 levels "Alexander McQueen",..: 2 2 4 7 7 3 8 3 4 2 ...
## $ Color : Factor w/ 4 levels "Black","Blue",..: 1 2 2 3 3 3 2 2 4 2 ...
## $ Season : Factor w/ 6 levels "Fall","Fall/Winter",..: 1 2 5 4 5 6 2 2 1 3 ...
## $ Available.Sizes : Factor w/ 4 levels "L","M","S","XL": 4 2 1 3 1 2 1 4 3 3 ...
## $ Purchase.History: Factor w/ 10 levels "Above Average",..: 6 8 4 9 4 4 3 8 6 4 ...
Variabel kategori difaktorkan.
cat("Jumlah NA per kolom:\n")
## Jumlah NA per kolom:
print(colSums(is.na(data)))
## Price Brand Category Rating
## 0 0 0 0
## Review.Count Available.Sizes Color Purchase.History
## 0 0 0 0
## Age Season
## 0 0
Tidak ada NA, data bersih.
cat("Frekuensi Category:\n")
## Frekuensi Category:
print(sort(table(data$Category), decreasing = TRUE))
##
## Activewear Swimwear Lingerie Jewelry Tops Dresses
## 1022 1019 1012 1003 1000 995
## Footwear Accessories Outerwear Bottoms
## 993 991 983 982
kategori_valid <- names(table(data$Category))[table(data$Category) >= 50]
data <- data %>% filter(Category %in% kategori_valid)
data$Category <- droplevels(data$Category)
cat("\nKategori digunakan:", nlevels(data$Category), "kelas\n")
##
## Kategori digunakan: 10 kelas
print(levels(data$Category))
## [1] "Accessories" "Activewear" "Bottoms" "Dresses" "Footwear"
## [6] "Jewelry" "Lingerie" "Outerwear" "Swimwear" "Tops"
Semua kategori memiliki ≥50 observasi, hasilnya 10 kelas valid.
str(data)
## 'data.frame': 10000 obs. of 10 variables:
## $ Price : num 51.2 73.4 93.3 72 82.5 ...
## $ Brand : Factor w/ 8 levels "Alexander McQueen",..: 2 2 4 7 7 3 8 3 4 2 ...
## $ Category : Factor w/ 10 levels "Accessories",..: 1 7 4 7 4 9 3 6 8 6 ...
## $ Rating : num 1.3 2.08 1.84 2 4.78 ...
## $ Review.Count : num 69 417 358 458 349 263 446 33 147 235 ...
## $ Available.Sizes : Factor w/ 4 levels "L","M","S","XL": 4 2 1 3 1 2 1 4 3 3 ...
## $ Color : Factor w/ 4 levels "Black","Blue",..: 1 2 2 3 3 3 2 2 4 2 ...
## $ Purchase.History: Factor w/ 10 levels "Above Average",..: 6 8 4 9 4 4 3 8 6 4 ...
## $ Age : num 61 63 25 63 49 27 21 18 46 36 ...
## $ Season : Factor w/ 6 levels "Fall","Fall/Winter",..: 1 2 5 4 5 6 2 2 1 3 ...
ggplot(data, aes(x = forcats::fct_infreq(Category), fill = Category)) +
geom_bar() +
labs(title = "Distribusi Category Fashion",
x = "Category", y = "Frekuensi") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
Semua kategori relatif seimbang, tiap kelas sekitar 1.000 observasi.
ggplot(data, aes(x = Category, y = Price, fill = Category)) +
geom_boxplot() +
labs(title = "Distribusi Price per Category", x = "Category", y = "Price") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
Harga bervariasi antar kategori, terlihat perbedaan median.
ggplot(data, aes(x = Category, y = Age, fill = Category)) +
geom_boxplot() +
labs(title = "Distribusi Age per Category", x = "Category", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
Usia konsumen berbeda antar kategori, ada variasi jelas.
ggplot(data, aes(x = Category, y = log1p(Review.Count), fill = Category)) +
geom_boxplot() +
labs(title = "Distribusi Log(Review Count+1) per Category",
x = "Category", y = "Log(Review Count+1)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
Jumlah review berbeda antar kategori, distribusi log lebih stabil.
vars_plot <- c("Price", "Age", "Review.Count")
cat_num <- as.numeric(data$Category)
for (v in vars_plot) {
df_plot <- data.frame(x = data[[v]], y = cat_num)
df_plot <- df_plot[is.finite(df_plot$x) & is.finite(df_plot$y), ]
ggplot(df_plot, aes(x = x, y = y)) +
geom_point(alpha = 0.05) +
geom_smooth(method = "gam", formula = y ~ s(x), color = "red") +
theme_minimal()
}
Untuk keperluan visualisasi hubungan dengan variabel numerik, variabel kategori dikonversi menjadi numerik sebagai pendekatan eksploratif, bukan untuk analisis inferensial.
data_bt <- data %>%
mutate(
Price_lnPrice = Price * log(Price + 1),
Age_lnAge = Age * log(Age + 1),
RC_lnRC = ifelse(Review.Count > 0,
Review.Count * log(Review.Count), 0)
)
Uji Box-Tidwell digunakan untuk mendeteksi kemungkinan hubungan non-linear antara variabel numerik dan log-odds. Berdasarkan analisis sebelumnya, sebagian besar variabel memenuhi asumsi linearitas, meskipun terdapat indikasi pelanggaran pada variabel Age.
model_linear <- nnet::multinom(
Category ~ Price + Age + Review.Count,
data = data, trace = FALSE
)
model_kuadrat <- nnet::multinom(
Category ~ Price + Age + I(Age^2) + Review.Count,
data = data, trace = FALSE
)
aic_tbl <- AIC(model_linear, model_kuadrat)
print(aic_tbl)
## df AIC
## model_linear 36 46089.99
## model_kuadrat 45 46099.44
summary(model_linear)
## Call:
## nnet::multinom(formula = Category ~ Price + Age + Review.Count,
## data = data, trace = FALSE)
##
## Coefficients:
## (Intercept) Price Age Review.Count
## Activewear 0.0001609043 1.112262e-03 -0.0021191799 0.0002160210
## Bottoms -0.0005949900 -1.876722e-03 0.0003157833 0.0003667968
## Dresses 0.0011567793 -5.020047e-04 -0.0004696930 0.0001508038
## Footwear -0.0014411318 -2.765038e-04 -0.0002331293 0.0001870367
## Jewelry 0.0003999798 -1.270003e-03 0.0002228524 0.0002838164
## Lingerie -0.0000676065 -2.876643e-05 -0.0008431689 0.0002403303
## Outerwear -0.0004879089 -4.079024e-03 0.0001444544 0.0008643704
## Swimwear -0.0003381086 -2.500550e-03 0.0006363072 0.0005777117
## Tops 0.0011726959 -3.451248e-03 0.0034971355 0.0001611699
##
## Std. Errors:
## (Intercept) Price Age Review.Count
## Activewear 0.1874111 0.001721470 0.003288932 0.0003097966
## Bottoms 0.1886722 0.001733748 0.003313814 0.0003120684
## Dresses 0.1889571 0.001737013 0.003319739 0.0003127499
## Footwear 0.1877894 0.001724792 0.003296265 0.0003105299
## Jewelry 0.1882347 0.001729301 0.003305309 0.0003113129
## Lingerie 0.1876529 0.001723491 0.003293752 0.0003102573
## Outerwear 0.1888359 0.001735695 0.003315967 0.0003122956
## Swimwear 0.1873866 0.001720348 0.003288152 0.0003096261
## Tops 0.1888329 0.001735847 0.003316996 0.0003123720
##
## Residual Deviance: 46017.99
## AIC: 46089.99
Meskipun terdapat indikasi pelanggaran linearitas pada variabel Age, hasil evaluasi AIC menunjukkan bahwa model linear memiliki nilai AIC yang lebih rendah dibandingkan model kuadratik, sehingga model linear tetap dipilih sebagai model akhir dengan mempertimbangkan keterbatasan tersebut.
Data bersifat cross-sectional dan setiap observasi merepresentasikan individu yang berbeda, sehingga asumsi independensi terpenuhi.
cols_num <- c("Price", "Review.Count", "Age")
numerik_df <- as.data.frame(lapply(data[, cols_num], as.numeric))
numerik_df <- na.omit(numerik_df)
cor_mat <- cor(numerik_df)
cat("Matriks Korelasi:\n")
## Matriks Korelasi:
print(round(cor_mat, 3))
## Price Review.Count Age
## Price 1.000 0.015 -0.017
## Review.Count 0.015 1.000 -0.001
## Age -0.017 -0.001 1.000
corrplot(cor_mat, method = "color", type = "upper",
addCoef.col = "black", tl.col = "black",
tl.cex = 0.8, number.cex = 0.7,
title = "Matriks Korelasi Prediktor Numerik",
mar = c(0, 0, 1.5, 0))
model_vif <- lm(Price ~ Review.Count + Age, data = data)
vif(model_vif)
## Review.Count Age
## 1.000002 1.000002
Pengujian VIF dilakukan dengan menjadikan salah satu variabel independen sebagai variabel dependen dalam model regresi linear untuk mendeteksi multikolinearitas antar prediktor. Semua VIF ≈1, tidak ada multikolinieritas.
num_maha <- data[, c("Price","Review.Count","Age")]
md <- mahalanobis(num_maha, colMeans(num_maha), cov(num_maha))
cutoff <- qchisq(0.975, df = 3)
sum(md > cutoff)
## [1] 0
Tidak ditemukan outlier, data aman.
df_md <- data.frame(
index = seq_along(md),
md = md,
outlier = md > cutoff
)
ggplot(df_md, aes(x = index, y = md, color = outlier)) +
geom_point(alpha = 0.4, size = 0.8) +
geom_hline(yintercept = cutoff, linetype = "dashed",
color = "red", linewidth = 1) +
scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "tomato"),
labels = c("Normal", "Outlier")) +
labs(title = "Mahalanobis Distance per Observasi",
x = "Index", y = "Mahalanobis Distance", color = "") +
theme_minimal()
pred <- predict(model_linear, data)
confusionMatrix(pred, data$Category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Accessories Activewear Bottoms Dresses Footwear Jewelry Lingerie
## Accessories 63 78 71 64 76 60 58
## Activewear 407 427 397 400 396 383 408
## Bottoms 0 0 0 0 0 0 0
## Dresses 0 0 0 0 0 0 0
## Footwear 9 7 3 6 6 4 2
## Jewelry 0 0 0 0 0 0 0
## Lingerie 0 0 0 0 0 0 0
## Outerwear 228 247 244 241 243 248 244
## Swimwear 86 90 79 92 104 100 98
## Tops 198 173 188 192 168 208 202
## Reference
## Prediction Outerwear Swimwear Tops
## Accessories 53 65 72
## Activewear 358 385 375
## Bottoms 0 0 0
## Dresses 0 0 0
## Footwear 2 2 3
## Jewelry 0 0 0
## Lingerie 0 0 0
## Outerwear 282 265 255
## Swimwear 97 98 87
## Tops 191 204 208
##
## Overall Statistics
##
## Accuracy : 0.1084
## 95% CI : (0.1024, 0.1147)
## No Information Rate : 0.1022
## P-Value [Acc > NIR] : 0.02185
##
## Kappa : 0.0087
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Accessories Class: Activewear Class: Bottoms
## Sensitivity 0.06357 0.4178 0.0000
## Specificity 0.93373 0.6092 1.0000
## Pos Pred Value 0.09545 0.1085 NaN
## Neg Pred Value 0.90064 0.9019 0.9018
## Prevalence 0.09910 0.1022 0.0982
## Detection Rate 0.00630 0.0427 0.0000
## Detection Prevalence 0.06600 0.3936 0.0000
## Balanced Accuracy 0.49865 0.5135 0.5000
## Class: Dresses Class: Footwear Class: Jewelry
## Sensitivity 0.0000 0.006042 0.0000
## Specificity 1.0000 0.995781 1.0000
## Pos Pred Value NaN 0.136364 NaN
## Neg Pred Value 0.9005 0.900864 0.8997
## Prevalence 0.0995 0.099300 0.1003
## Detection Rate 0.0000 0.000600 0.0000
## Detection Prevalence 0.0000 0.004400 0.0000
## Balanced Accuracy 0.5000 0.500912 0.5000
## Class: Lingerie Class: Outerwear Class: Swimwear
## Sensitivity 0.0000 0.2869 0.09617
## Specificity 1.0000 0.7544 0.90725
## Pos Pred Value NaN 0.1129 0.10526
## Neg Pred Value 0.8988 0.9066 0.89845
## Prevalence 0.1012 0.0983 0.10190
## Detection Rate 0.0000 0.0282 0.00980
## Detection Prevalence 0.0000 0.2497 0.09310
## Balanced Accuracy 0.5000 0.5206 0.50171
## Class: Tops
## Sensitivity 0.2080
## Specificity 0.8084
## Pos Pred Value 0.1077
## Neg Pred Value 0.9018
## Prevalence 0.1000
## Detection Rate 0.0208
## Detection Prevalence 0.1932
## Balanced Accuracy 0.5082