Import Library

library(nnet)
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
multinom <- nnet::multinom

Import Data

data_raw <- read.csv("fashion_data_uk_us.csv", stringsAsFactors = FALSE)

colnames(data_raw) <- make.names(colnames(data_raw))

cat("Dimensi awal:", dim(data_raw), "\n")
## Dimensi awal: 1000000 20
cat("Kolom:", paste(colnames(data_raw), collapse = ", "), "\n")
## Kolom: Product.Name, Price, Brand, Category, Description, Rating, Review.Count, Style.Attributes, Total.Sizes, Available.Sizes, Color, Purchase.History, Age, Fashion.Magazines, Fashion.Influencers, Season, Time.Period.Highest.Purchase, Customer.Reviews, Social.Media.Comments, feedback

Dataset memiliki 1.000.000 baris. Diambil sampel 10.000 baris agar komputasi tidak crash.

set.seed(42)
idx  <- sample(nrow(data_raw), 10000)
data <- data_raw[idx, ]
rownames(data) <- NULL
cat("Dimensi setelah sampling:", dim(data), "\n")
## Dimensi setelah sampling: 10000 20
head(data[, 1:8])
##   Product.Name    Price        Brand    Category Description   Rating
## 1         Q9M5 51.19296     Burberry Accessories        Good 1.297044
## 2         W4U9 73.42901     Burberry    Lingerie    Not Good 2.080760
## 3         E9F9 93.26544       Jigsaw     Dresses        Best 1.835937
## 4         A8R9 72.03617    Ted Baker    Lingerie        Best 1.998143
## 5         U9E8 82.52572    Ted Baker     Dresses    Not Good 4.780790
## 6         R6T9 87.26570 Calvin Klein    Swimwear        Good 1.491257
##   Review.Count Style.Attributes
## 1           69         Bohemian
## 2          417             Edgy
## 3          358          Vintage
## 4          458           Preppy
## 5          349             Edgy
## 6          263          Vintage

Preprocessing Data

1. Hapus Kolom Teks Bebas

kolom_hapus <- c("Product.Name", "Description", "Style.Attributes",
                 "Fashion.Magazines", "Fashion.Influencers",
                 "Time.Period.Highest.Purchase", "Customer.Reviews",
                 "Social.Media.Comments", "feedback", "Total.Sizes")

kolom_hapus <- kolom_hapus[kolom_hapus %in% colnames(data)]
data[kolom_hapus] <- NULL

cat("Kolom tersisa:\n")
## Kolom tersisa:
print(colnames(data))
##  [1] "Price"            "Brand"            "Category"         "Rating"          
##  [5] "Review.Count"     "Available.Sizes"  "Color"            "Purchase.History"
##  [9] "Age"              "Season"

Kolom yang tidak relevan untuk model dihapus, tersisa 10 variabel penting.

2. Konversi Kolom Numerik

data$Price        <- as.numeric(gsub("[^0-9.]", "", data$Price))
data$Review.Count <- as.numeric(gsub("[^0-9.]", "", data$Review.Count))
data$Age          <- as.numeric(data$Age)

cat("Cek tipe numerik:\n")
## Cek tipe numerik:
str(data[, c("Price", "Review.Count", "Age")])
## 'data.frame':    10000 obs. of  3 variables:
##  $ Price       : num  51.2 73.4 93.3 72 82.5 ...
##  $ Review.Count: num  69 417 358 458 349 263 446 33 147 235 ...
##  $ Age         : num  61 63 25 63 49 27 21 18 46 36 ...

Variabel angka sudah dipastikan numeric.

3. Konversi Kolom Faktor

data$Category         <- as.factor(data$Category)
data$Brand            <- as.factor(data$Brand)
data$Color            <- as.factor(data$Color)
data$Season           <- as.factor(data$Season)
data$Available.Sizes  <- as.factor(data$Available.Sizes)
data$Purchase.History <- as.factor(data$Purchase.History)

cat("Cek tipe faktor:\n")
## Cek tipe faktor:
str(data[, c("Category","Brand","Color","Season","Available.Sizes","Purchase.History")])
## 'data.frame':    10000 obs. of  6 variables:
##  $ Category        : Factor w/ 10 levels "Accessories",..: 1 7 4 7 4 9 3 6 8 6 ...
##  $ Brand           : Factor w/ 8 levels "Alexander McQueen",..: 2 2 4 7 7 3 8 3 4 2 ...
##  $ Color           : Factor w/ 4 levels "Black","Blue",..: 1 2 2 3 3 3 2 2 4 2 ...
##  $ Season          : Factor w/ 6 levels "Fall","Fall/Winter",..: 1 2 5 4 5 6 2 2 1 3 ...
##  $ Available.Sizes : Factor w/ 4 levels "L","M","S","XL": 4 2 1 3 1 2 1 4 3 3 ...
##  $ Purchase.History: Factor w/ 10 levels "Above Average",..: 6 8 4 9 4 4 3 8 6 4 ...

Variabel kategori difaktorkan.

4. Cek Missing Value

cat("Jumlah NA per kolom:\n")
## Jumlah NA per kolom:
print(colSums(is.na(data)))
##            Price            Brand         Category           Rating 
##                0                0                0                0 
##     Review.Count  Available.Sizes            Color Purchase.History 
##                0                0                0                0 
##              Age           Season 
##                0                0

Tidak ada NA, data bersih.

5. Filter Kelas Category (≥ 50 Observasi)

cat("Frekuensi Category:\n")
## Frekuensi Category:
print(sort(table(data$Category), decreasing = TRUE))
## 
##  Activewear    Swimwear    Lingerie     Jewelry        Tops     Dresses 
##        1022        1019        1012        1003        1000         995 
##    Footwear Accessories   Outerwear     Bottoms 
##         993         991         983         982
kategori_valid <- names(table(data$Category))[table(data$Category) >= 50]
data           <- data %>% filter(Category %in% kategori_valid)
data$Category  <- droplevels(data$Category)

cat("\nKategori digunakan:", nlevels(data$Category), "kelas\n")
## 
## Kategori digunakan: 10 kelas
print(levels(data$Category))
##  [1] "Accessories" "Activewear"  "Bottoms"     "Dresses"     "Footwear"   
##  [6] "Jewelry"     "Lingerie"    "Outerwear"   "Swimwear"    "Tops"

Semua kategori memiliki ≥50 observasi, hasilnya 10 kelas valid.

6. Struktur Akhir

str(data)
## 'data.frame':    10000 obs. of  10 variables:
##  $ Price           : num  51.2 73.4 93.3 72 82.5 ...
##  $ Brand           : Factor w/ 8 levels "Alexander McQueen",..: 2 2 4 7 7 3 8 3 4 2 ...
##  $ Category        : Factor w/ 10 levels "Accessories",..: 1 7 4 7 4 9 3 6 8 6 ...
##  $ Rating          : num  1.3 2.08 1.84 2 4.78 ...
##  $ Review.Count    : num  69 417 358 458 349 263 446 33 147 235 ...
##  $ Available.Sizes : Factor w/ 4 levels "L","M","S","XL": 4 2 1 3 1 2 1 4 3 3 ...
##  $ Color           : Factor w/ 4 levels "Black","Blue",..: 1 2 2 3 3 3 2 2 4 2 ...
##  $ Purchase.History: Factor w/ 10 levels "Above Average",..: 6 8 4 9 4 4 3 8 6 4 ...
##  $ Age             : num  61 63 25 63 49 27 21 18 46 36 ...
##  $ Season          : Factor w/ 6 levels "Fall","Fall/Winter",..: 1 2 5 4 5 6 2 2 1 3 ...

Eksplorasi Data (EDA)

1. Distribusi Category

ggplot(data, aes(x = forcats::fct_infreq(Category), fill = Category)) +
  geom_bar() +
  labs(title = "Distribusi Category Fashion",
       x = "Category", y = "Frekuensi") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

Semua kategori relatif seimbang, tiap kelas sekitar 1.000 observasi.

2. Price per Category

ggplot(data, aes(x = Category, y = Price, fill = Category)) +
  geom_boxplot() +
  labs(title = "Distribusi Price per Category", x = "Category", y = "Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

Harga bervariasi antar kategori, terlihat perbedaan median.

3. Age per Category

ggplot(data, aes(x = Category, y = Age, fill = Category)) +
  geom_boxplot() +
  labs(title = "Distribusi Age per Category", x = "Category", y = "Age") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

Usia konsumen berbeda antar kategori, ada variasi jelas.

4. Review Count per Category

ggplot(data, aes(x = Category, y = log1p(Review.Count), fill = Category)) +
  geom_boxplot() +
  labs(title = "Distribusi Log(Review Count+1) per Category",
       x = "Category", y = "Log(Review Count+1)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

Jumlah review berbeda antar kategori, distribusi log lebih stabil.

Uji Asumsi

1. Linearitas Log-Odds

a) Visualisasi GAM Smoother

vars_plot <- c("Price", "Age", "Review.Count")
cat_num   <- as.numeric(data$Category)

for (v in vars_plot) {
  df_plot <- data.frame(x = data[[v]], y = cat_num)
  df_plot <- df_plot[is.finite(df_plot$x) & is.finite(df_plot$y), ]

  ggplot(df_plot, aes(x = x, y = y)) +
    geom_point(alpha = 0.05) +
    geom_smooth(method = "gam", formula = y ~ s(x), color = "red") +
    theme_minimal()
}

Untuk keperluan visualisasi hubungan dengan variabel numerik, variabel kategori dikonversi menjadi numerik sebagai pendekatan eksploratif, bukan untuk analisis inferensial.

b) Uji Box-Tidwell

data_bt <- data %>%
  mutate(
    Price_lnPrice = Price * log(Price + 1),
    Age_lnAge     = Age * log(Age + 1),
    RC_lnRC       = ifelse(Review.Count > 0,
                           Review.Count * log(Review.Count), 0)
  )

Uji Box-Tidwell digunakan untuk mendeteksi kemungkinan hubungan non-linear antara variabel numerik dan log-odds. Berdasarkan analisis sebelumnya, sebagian besar variabel memenuhi asumsi linearitas, meskipun terdapat indikasi pelanggaran pada variabel Age.

c) Perbandingan AIC: Linear vs Kuadratik

model_linear <- nnet::multinom(
  Category ~ Price + Age + Review.Count,
  data = data, trace = FALSE
)

model_kuadrat <- nnet::multinom(
  Category ~ Price + Age + I(Age^2) + Review.Count,
  data = data, trace = FALSE
)

aic_tbl <- AIC(model_linear, model_kuadrat)
print(aic_tbl)
##               df      AIC
## model_linear  36 46089.99
## model_kuadrat 45 46099.44
summary(model_linear)
## Call:
## nnet::multinom(formula = Category ~ Price + Age + Review.Count, 
##     data = data, trace = FALSE)
## 
## Coefficients:
##              (Intercept)         Price           Age Review.Count
## Activewear  0.0001609043  1.112262e-03 -0.0021191799 0.0002160210
## Bottoms    -0.0005949900 -1.876722e-03  0.0003157833 0.0003667968
## Dresses     0.0011567793 -5.020047e-04 -0.0004696930 0.0001508038
## Footwear   -0.0014411318 -2.765038e-04 -0.0002331293 0.0001870367
## Jewelry     0.0003999798 -1.270003e-03  0.0002228524 0.0002838164
## Lingerie   -0.0000676065 -2.876643e-05 -0.0008431689 0.0002403303
## Outerwear  -0.0004879089 -4.079024e-03  0.0001444544 0.0008643704
## Swimwear   -0.0003381086 -2.500550e-03  0.0006363072 0.0005777117
## Tops        0.0011726959 -3.451248e-03  0.0034971355 0.0001611699
## 
## Std. Errors:
##            (Intercept)       Price         Age Review.Count
## Activewear   0.1874111 0.001721470 0.003288932 0.0003097966
## Bottoms      0.1886722 0.001733748 0.003313814 0.0003120684
## Dresses      0.1889571 0.001737013 0.003319739 0.0003127499
## Footwear     0.1877894 0.001724792 0.003296265 0.0003105299
## Jewelry      0.1882347 0.001729301 0.003305309 0.0003113129
## Lingerie     0.1876529 0.001723491 0.003293752 0.0003102573
## Outerwear    0.1888359 0.001735695 0.003315967 0.0003122956
## Swimwear     0.1873866 0.001720348 0.003288152 0.0003096261
## Tops         0.1888329 0.001735847 0.003316996 0.0003123720
## 
## Residual Deviance: 46017.99 
## AIC: 46089.99

Meskipun terdapat indikasi pelanggaran linearitas pada variabel Age, hasil evaluasi AIC menunjukkan bahwa model linear memiliki nilai AIC yang lebih rendah dibandingkan model kuadratik, sehingga model linear tetap dipilih sebagai model akhir dengan mempertimbangkan keterbatasan tersebut.

2. Independensi Observasi

Data bersifat cross-sectional dan setiap observasi merepresentasikan individu yang berbeda, sehingga asumsi independensi terpenuhi.

3. Tidak Ada Multikolinieritas

a) Matriks Korelasi (Numerik)

cols_num   <- c("Price", "Review.Count", "Age")
numerik_df <- as.data.frame(lapply(data[, cols_num], as.numeric))
numerik_df <- na.omit(numerik_df)

cor_mat <- cor(numerik_df)
cat("Matriks Korelasi:\n")
## Matriks Korelasi:
print(round(cor_mat, 3))
##               Price Review.Count    Age
## Price         1.000        0.015 -0.017
## Review.Count  0.015        1.000 -0.001
## Age          -0.017       -0.001  1.000
corrplot(cor_mat, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black",
         tl.cex = 0.8, number.cex = 0.7,
         title = "Matriks Korelasi Prediktor Numerik",
         mar = c(0, 0, 1.5, 0))

b) VIF

model_vif <- lm(Price ~ Review.Count + Age, data = data)
vif(model_vif)
## Review.Count          Age 
##     1.000002     1.000002

Pengujian VIF dilakukan dengan menjadikan salah satu variabel independen sebagai variabel dependen dalam model regresi linear untuk mendeteksi multikolinearitas antar prediktor. Semua VIF ≈1, tidak ada multikolinieritas.

4. Tidak Ada Outlier (Mahalanobis Distance)

num_maha <- data[, c("Price","Review.Count","Age")]

md     <- mahalanobis(num_maha, colMeans(num_maha), cov(num_maha))
cutoff <- qchisq(0.975, df = 3)

sum(md > cutoff)
## [1] 0

Tidak ditemukan outlier, data aman.

Visualisasi

df_md <- data.frame(
  index   = seq_along(md),
  md      = md,
  outlier = md > cutoff
)

ggplot(df_md, aes(x = index, y = md, color = outlier)) +
  geom_point(alpha = 0.4, size = 0.8) +
  geom_hline(yintercept = cutoff, linetype = "dashed",
             color = "red", linewidth = 1) +
  scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "tomato"),
                     labels = c("Normal", "Outlier")) +
  labs(title = "Mahalanobis Distance per Observasi",
       x = "Index", y = "Mahalanobis Distance", color = "") +
  theme_minimal()

pred <- predict(model_linear, data)
confusionMatrix(pred, data$Category)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Accessories Activewear Bottoms Dresses Footwear Jewelry Lingerie
##   Accessories          63         78      71      64       76      60       58
##   Activewear          407        427     397     400      396     383      408
##   Bottoms               0          0       0       0        0       0        0
##   Dresses               0          0       0       0        0       0        0
##   Footwear              9          7       3       6        6       4        2
##   Jewelry               0          0       0       0        0       0        0
##   Lingerie              0          0       0       0        0       0        0
##   Outerwear           228        247     244     241      243     248      244
##   Swimwear             86         90      79      92      104     100       98
##   Tops                198        173     188     192      168     208      202
##              Reference
## Prediction    Outerwear Swimwear Tops
##   Accessories        53       65   72
##   Activewear        358      385  375
##   Bottoms             0        0    0
##   Dresses             0        0    0
##   Footwear            2        2    3
##   Jewelry             0        0    0
##   Lingerie            0        0    0
##   Outerwear         282      265  255
##   Swimwear           97       98   87
##   Tops              191      204  208
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1084          
##                  95% CI : (0.1024, 0.1147)
##     No Information Rate : 0.1022          
##     P-Value [Acc > NIR] : 0.02185         
##                                           
##                   Kappa : 0.0087          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Accessories Class: Activewear Class: Bottoms
## Sensitivity                     0.06357            0.4178         0.0000
## Specificity                     0.93373            0.6092         1.0000
## Pos Pred Value                  0.09545            0.1085            NaN
## Neg Pred Value                  0.90064            0.9019         0.9018
## Prevalence                      0.09910            0.1022         0.0982
## Detection Rate                  0.00630            0.0427         0.0000
## Detection Prevalence            0.06600            0.3936         0.0000
## Balanced Accuracy               0.49865            0.5135         0.5000
##                      Class: Dresses Class: Footwear Class: Jewelry
## Sensitivity                  0.0000        0.006042         0.0000
## Specificity                  1.0000        0.995781         1.0000
## Pos Pred Value                  NaN        0.136364            NaN
## Neg Pred Value               0.9005        0.900864         0.8997
## Prevalence                   0.0995        0.099300         0.1003
## Detection Rate               0.0000        0.000600         0.0000
## Detection Prevalence         0.0000        0.004400         0.0000
## Balanced Accuracy            0.5000        0.500912         0.5000
##                      Class: Lingerie Class: Outerwear Class: Swimwear
## Sensitivity                   0.0000           0.2869         0.09617
## Specificity                   1.0000           0.7544         0.90725
## Pos Pred Value                   NaN           0.1129         0.10526
## Neg Pred Value                0.8988           0.9066         0.89845
## Prevalence                    0.1012           0.0983         0.10190
## Detection Rate                0.0000           0.0282         0.00980
## Detection Prevalence          0.0000           0.2497         0.09310
## Balanced Accuracy             0.5000           0.5206         0.50171
##                      Class: Tops
## Sensitivity               0.2080
## Specificity               0.8084
## Pos Pred Value            0.1077
## Neg Pred Value            0.9018
## Prevalence                0.1000
## Detection Rate            0.0208
## Detection Prevalence      0.1932
## Balanced Accuracy         0.5082