# install.packages(c("MASS", "car", "brant", "dplyr", "lubridate",
# "generalhoslem", "DescTools", "heplots", "rstatix"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(brant)
## Warning: package 'brant' was built under R version 4.5.3
library(generalhoslem)
## Warning: package 'generalhoslem' was built under R version 4.5.3
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 4.5.3
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:dplyr':
##
## rename
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.5.3
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
# Load Data
df <- read.csv("Ecommerce_DBS.csv", check.names = FALSE)
# Standardisasi nama kolom
colnames(df) <- tolower(trimws(gsub(" ", "_", colnames(df))))
colnames(df)[colnames(df) == "longituide"] <- "longitude"
# Kategorisasi NPS → Variabel Respon Ordinal
df$customer_category <- ifelse(df$nps <= 6, "Detractor",
ifelse(df$nps <= 8, "Passive", "Promoter"))
df$customer_category <- factor(df$customer_category,
levels = c("Detractor", "Passive", "Promoter"),
ordered = TRUE)
# Hapus kolom tidak diperlukan
df <- df %>% dplyr::select(-c(nps, customer_id, state, country))
# Ekstraksi bulan & hari dari tanggal pembelian
df$purchase_date <- dmy(df$purchase_date)
df$month <- month(df$purchase_date)
df$day <- day(df$purchase_date)
df <- df %>% dplyr::select(-purchase_date)
# Format variabel kategorik
cat_cols <- c("product_category", "gender", "source")
df[cat_cols] <- lapply(df[cat_cols], as.factor)
# Standardisasi variabel numerik
num_cols <- c("product_price", "quantity", "total_purchase_amount",
"customer_age", "latitude", "longitude")
df[num_cols] <- scale(df[num_cols])
# Hapus baris dengan NA
df <- na.omit(df)
# Distribusi variabel respon
cat("Distribusi Customer Category:\n")
## Distribusi Customer Category:
print(table(df$customer_category))
##
## Detractor Passive Promoter
## 159322 45173 45505
cat("\nProporsi:\n")
##
## Proporsi:
print(prop.table(table(df$customer_category)))
##
## Detractor Passive Promoter
## 0.637288 0.180692 0.182020
# Visualisasi
barplot(table(df$customer_category),
main = "Distribusi Customer Category (NPS)",
xlab = "Kategori",
ylab = "Frekuensi",
col = c("tomato", "steelblue", "seagreen"),
ylim = c(0, max(table(df$customer_category)) * 1.2))
Tujuan: Mengetahui apakah terdapat hubungan antara variabel prediktor kategorik
dengan variabel responcustomer_category.
Hipotesis:
- H0: Tidak ada hubungan antara variabel prediktor dengan
customer_category
- H1: Ada hubungan antara variabel prediktor dengan
customer_category
- alfa = 0,05
cat("UJI INDEPENDENSI (CHI-SQUARE)")
## UJI INDEPENDENSI (CHI-SQUARE)
for (col in cat_cols) {
tbl <- table(df[[col]], df$customer_category)
test <- chisq.test(tbl)
keputusan <- ifelse(test$p.value < 0.05, "Tolak H0 → Ada hubungan",
"Gagal Tolak H0 → Tidak ada hubungan")
cat("Variabel :", col, "\n")
cat("Chi-Square:", round(test$statistic, 3),
"| df:", test$parameter,
"| p-value:", round(test$p.value, 4), "\n")
cat("Keputusan :", keputusan, "\n\n")
}
## Variabel : product_category
## Chi-Square: 11.147 | df: 6 | p-value: 0.0839
## Keputusan : Gagal Tolak H0 → Tidak ada hubungan
##
## Variabel : gender
## Chi-Square: 1.607 | df: 2 | p-value: 0.4479
## Keputusan : Gagal Tolak H0 → Tidak ada hubungan
##
## Variabel : source
## Chi-Square: 2.716 | df: 6 | p-value: 0.8436
## Keputusan : Gagal Tolak H0 → Tidak ada hubungan
model_ordinal <- polr(customer_category ~ ., data = df, Hess = TRUE)
summary(model_ordinal)
## Call:
## polr(formula = customer_category ~ ., data = df, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## product_categoryClothing -5.428e-03 0.0104753 -0.51815
## product_categoryElectronics -2.331e-02 0.0117155 -1.98944
## product_categoryHome 1.183e-02 0.0117056 1.01097
## product_price -1.291e-03 0.0040569 -0.31833
## quantity -1.264e-03 0.0040584 -0.31156
## total_purchase_amount -2.588e-03 0.0040618 -0.63715
## customer_age -2.280e-03 0.0040618 -0.56144
## genderMale 4.552e-04 0.0081136 0.05610
## sourceInstagram Campign 3.609e-03 0.0109657 0.32915
## sourceOrganic Search -5.253e-03 0.0122940 -0.42724
## sourceSEM -6.933e-04 0.0111030 -0.06244
## latitude 4.076e-04 0.0044002 0.09264
## longitude -6.186e-03 0.0044093 -1.40282
## month 3.315e-05 0.0012098 0.02740
## day 1.220e-03 0.0004614 2.64471
##
## Intercepts:
## Value Std. Error t value
## Detractor|Passive 0.5790 0.0150 38.5219
## Passive|Promoter 1.5182 0.0153 98.9073
##
## Residual Deviance: 453167.70
## AIC: 453201.70
Asumsi utama model ordinal: odds ratio antar kategori bersifat konstan
(proporsional) di semua level prediktor.
Hipotesis:
- H0: Asumsi Proportional Odds terpenuhi
- H1: Asumsi Proportional Odds tidak terpenuhi
- Asumsi terpenuhi jika p-value > 0,05
cat("UJI PROPORTIONAL ODDS (BRANT TEST)")
## UJI PROPORTIONAL ODDS (BRANT TEST)
brant_result <- brant(model_ordinal)
## ------------------------------------------------------------
## Test for X2 df probability
## ------------------------------------------------------------
## Omnibus 14.96 15 0.45
## product_categoryClothing 0 1 0.96
## product_categoryElectronics 2.13 1 0.14
## product_categoryHome 0.11 1 0.74
## product_price 0.08 1 0.78
## quantity 2.4 1 0.12
## total_purchase_amount 0.91 1 0.34
## customer_age 0 1 0.98
## genderMale 1.58 1 0.21
## sourceInstagram Campign 0.99 1 0.32
## sourceOrganic Search 0.19 1 0.67
## sourceSEM 1.8 1 0.18
## latitude 1.57 1 0.21
## longitude 0.86 1 0.35
## month 0.08 1 0.77
## day 2.82 1 0.09
## ------------------------------------------------------------
##
## H0: Parallel Regression Assumption holds
print(brant_result)
## X2 df probability
## Omnibus 1.495763e+01 15 0.45447383
## product_categoryClothing 2.228098e-03 1 0.96235165
## product_categoryElectronics 2.130014e+00 1 0.14443947
## product_categoryHome 1.083105e-01 1 0.74207590
## product_price 7.703925e-02 1 0.78135057
## quantity 2.401493e+00 1 0.12121948
## total_purchase_amount 9.132271e-01 1 0.33925963
## customer_age 3.652649e-04 1 0.98475183
## genderMale 1.579978e+00 1 0.20876387
## sourceInstagram Campign 9.875503e-01 1 0.32034183
## sourceOrganic Search 1.866618e-01 1 0.66570963
## sourceSEM 1.801753e+00 1 0.17950072
## latitude 1.570828e+00 1 0.21008686
## longitude 8.623676e-01 1 0.35307698
## month 8.314654e-02 1 0.77307775
## day 2.821358e+00 1 0.09301766
Menguji apakah hubungan antara variabel numerik dengan log-odds bersifat linear.
Dilakukan dengan menambahkan interaksi X × log(X) ke dalam modelpolr().
Asumsi terpenuhi jika koefisien interaksi tidak signifikan (p-value > 0,05).
cat("UJI LINEARITAS LOGIT (BOX-TIDWELL)")
## UJI LINEARITAS LOGIT (BOX-TIDWELL)
# Geser variabel agar semua bernilai positif sebelum transformasi log
df_bt <- df
for (col in num_cols) {
min_val <- min(df_bt[[col]], na.rm = TRUE)
if (min_val <= 0) df_bt[[col]] <- df_bt[[col]] - min_val + 1
df_bt[[paste0(col, "_x_log")]] <- df_bt[[col]] * log(df_bt[[col]])
}
# Model Box-Tidwell menggunakan polr() (BUKAN multinom)
model_bt <- polr(customer_category ~ ., data = df_bt, Hess = TRUE)
# Ambil koefisien interaksi X*log(X)
coef_bt <- coef(summary(model_bt))
log_vars <- grep("_x_log", rownames(coef_bt), value = TRUE)
p_bt <- pnorm(abs(coef_bt[log_vars, "t value"]), lower.tail = FALSE) * 2
result_bt <- cbind(coef_bt[log_vars, ], "p-value" = round(p_bt, 4))
cat("Koefisien interaksi X*log(X) — signifikan = linearitas TIDAK terpenuhi:\n")
## Koefisien interaksi X*log(X) — signifikan = linearitas TIDAK terpenuhi:
print(result_bt)
## Value Std. Error t value p-value
## product_price_x_log -0.006517742 0.02301569 -0.2831869 0.7770
## quantity_x_log 0.008293432 0.02168250 0.3824943 0.7021
## total_purchase_amount_x_log -0.025171853 0.02382404 -1.0565737 0.2907
## customer_age_x_log 0.003063892 0.02275533 0.1346450 0.8929
## latitude_x_log -0.029757888 0.02310850 -1.2877462 0.1978
## longitude_x_log 0.022843183 0.01227283 1.8612812 0.0627
cat("\nInterpretasi: p-value > 0.05 → linearitas logit TERPENUHI\n")
##
## Interpretasi: p-value > 0.05 → linearitas logit TERPENUHI
Asumsi terpenuhi jika VIF < 5 (ideal), maksimum toleransi VIF < 10.
cat("UJI MULTIKOLINEARITAS (VIF)")
## UJI MULTIKOLINEARITAS (VIF)
vif_values <- vif(model_ordinal)
print(vif_values)
## GVIF Df GVIF^(1/(2*Df))
## product_category 1.000203 3 1.000034
## product_price 1.000071 1 1.000035
## quantity 1.000090 1 1.000045
## total_purchase_amount 1.002695 1 1.001347
## customer_age 1.002950 1 1.001474
## gender 1.000050 1 1.000025
## source 1.005186 3 1.000862
## latitude 1.177144 1 1.084963
## longitude 1.176746 1 1.084779
## month 1.000053 1 1.000026
## day 1.000085 1 1.000042
cat("\nInterpretasi: VIF < 5 = tidak ada masalah multikolinearitas\n")
##
## Interpretasi: VIF < 5 = tidak ada masalah multikolinearitas
Karena variabel numerik telah distandarisasi (z-score), outlier diidentifikasi
sebagai observasi dengan |z| > 3.
cat("DETEKSI OUTLIER (|z| > 3)")
## DETEKSI OUTLIER (|z| > 3)
check_outliers <- function(x) sum(abs(x) > 3, na.rm = TRUE)
outliers_count <- sapply(df[num_cols], check_outliers)
print(outliers_count)
## product_price quantity total_purchase_amount
## 0 0 0
## customer_age latitude longitude
## 0 0 7480
cat("\nTotal data:", nrow(df), "baris\n")
##
## Total data: 250000 baris
cat("Interpretasi: Jumlah observasi ekstrem per variabel numerik\n")
## Interpretasi: Jumlah observasi ekstrem per variabel numerik
Menguji apakah minimal satu prediktor berpengaruh signifikan terhadap
variabel respon secara bersama-sama.
Hipotesis:
- Tolak H0 jika p-value < 0,05
cat("UJI SERENTAK (LIKELIHOOD RATIO TEST)")
## UJI SERENTAK (LIKELIHOOD RATIO TEST)
model_null <- polr(customer_category ~ 1, data = df, Hess = TRUE)
lrt <- anova(model_null, model_ordinal)
print(lrt)
## Likelihood ratio tests of ordinal regression models
##
## Response: customer_category
## Model
## 1 1
## 2 product_category + product_price + quantity + total_purchase_amount + customer_age + gender + source + latitude + longitude + month + day
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 249998 453186.5
## 2 249983 453167.7 1 vs 2 15 18.8017 0.2228608
Menguji pengaruh masing-masing prediktor secara individu terhadap variabel respon.
Hipotesis:
- Tolak H0 jika p-value < 0,05
cat("UJI PARSIAL (WALD TEST)")
## UJI PARSIAL (WALD TEST)
coef_tbl <- coef(summary(model_ordinal))
p_values <- pnorm(abs(coef_tbl[, "t value"]), lower.tail = FALSE) * 2
result_parsial <- cbind(coef_tbl, "p-value" = round(p_values, 4))
keputusan <- ifelse(p_values < 0.05, "Tolak H0", "Gagal Tolak H0")
result_parsial <- cbind(result_parsial, "Keputusan" = keputusan)
print(result_parsial)
## Value Std. Error
## product_categoryClothing "-0.00542772613745639" "0.0104752916928211"
## product_categoryElectronics "-0.0233073141834052" "0.0117154921266062"
## product_categoryHome "0.011833996571253" "0.0117056077906622"
## product_price "-0.00129143863465004" "0.00405690157195208"
## quantity "-0.00126444848469551" "0.00405840438289163"
## total_purchase_amount "-0.00258801612485333" "0.00406184573014308"
## customer_age "-0.00228045314959403" "0.00406179292225994"
## genderMale "0.00045520205772619" "0.00811355955777092"
## sourceInstagram Campign "0.00360939046685077" "0.0109657340241937"
## sourceOrganic Search "-0.0052525216067492" "0.0122939696383565"
## sourceSEM "-0.000693302146227106" "0.0111030239950248"
## latitude "0.000407642350587165" "0.00440022220479336"
## longitude "-0.00618550043965871" "0.00440934609584473"
## month "3.31542283913096e-05" "0.00120983073148356"
## day "0.00122017149732816" "0.000461363390154428"
## Detractor|Passive "0.579043061780652" "0.015031521888375"
## Passive|Promoter "1.51820712408755" "0.0153497930142354"
## t value p-value Keputusan
## product_categoryClothing "-0.518145584545022" "0.6044" "Gagal Tolak H0"
## product_categoryElectronics "-1.98944388605525" "0.0467" "Tolak H0"
## product_categoryHome "1.01096814303766" "0.312" "Gagal Tolak H0"
## product_price "-0.318331271228902" "0.7502" "Gagal Tolak H0"
## quantity "-0.311562960562986" "0.7554" "Gagal Tolak H0"
## total_purchase_amount "-0.637152737152864" "0.524" "Gagal Tolak H0"
## customer_age "-0.561440032330652" "0.5745" "Gagal Tolak H0"
## genderMale "0.0561038659400992" "0.9553" "Gagal Tolak H0"
## sourceInstagram Campign "0.329151742955589" "0.742" "Gagal Tolak H0"
## sourceOrganic Search "-0.427243743173209" "0.6692" "Gagal Tolak H0"
## sourceSEM "-0.0624426414405455" "0.9502" "Gagal Tolak H0"
## latitude "0.0926413102827174" "0.9262" "Gagal Tolak H0"
## longitude "-1.40281581558948" "0.1607" "Gagal Tolak H0"
## month "0.0274040223384425" "0.9781" "Gagal Tolak H0"
## day "2.64470810507904" "0.0082" "Tolak H0"
## Detractor|Passive "38.5219185442873" "0" "Tolak H0"
## Passive|Promoter "98.907335276741" "0" "Tolak H0"
Nilai odds ratio digunakan untuk menginterpretasikan koefisien estimasi parameter
model regresi logistik ordinal.
cat("ODDS RATIO")
## ODDS RATIO
odds_ratio <- exp(coef(model_ordinal))
ci_or <- exp(confint(model_ordinal))
## Waiting for profiling to be done...
or_tbl <- cbind("Odds Ratio" = round(odds_ratio, 4),
round(ci_or, 4))
print(or_tbl)
## Odds Ratio 2.5 % 97.5 %
## product_categoryClothing 0.9946 0.9744 1.0152
## product_categoryElectronics 0.9770 0.9550 0.9994
## product_categoryHome 1.0119 0.9890 1.0353
## product_price 0.9987 0.9908 1.0067
## quantity 0.9987 0.9908 1.0067
## total_purchase_amount 0.9974 0.9895 1.0054
## customer_age 0.9977 0.9898 1.0057
## genderMale 1.0005 0.9847 1.0165
## sourceInstagram Campign 1.0036 0.9824 1.0254
## sourceOrganic Search 0.9948 0.9714 1.0189
## sourceSEM 0.9993 0.9779 1.0212
## latitude 1.0004 0.9919 1.0091
## longitude 0.9938 0.9853 1.0024
## month 1.0000 0.9977 1.0024
## day 1.0012 1.0003 1.0021
Menguji apakah model yang terbentuk sudah sesuai dengan data (tidak ada
perbedaan signifikan antara nilai observasi dan prediksi).
Hipotesis:
- ho: Model sesuai
- h1: Model tidak sesuai
- Gagal Tolak H0 (model sesuai) jika p-value >
0,05
cat("UJI KESESUAIAN MODEL")
## UJI KESESUAIAN MODEL
lipsitz_result <- lipsitz.test(model_ordinal)
print(lipsitz_result)
##
## Lipsitz goodness of fit test for ordinal response models
##
## data: formula: customer_category ~ product_category + product_price + quantity + formula: total_purchase_amount + customer_age + gender + source + formula: latitude + longitude + month + day
## LR statistic = 17.576, df = 9, p-value = 0.04043
Mengukur seberapa besar variabilitas variabel respon yang mampu dijelaskan
oleh model (analog dengan R pada regresi linear).
cat("PSEUDO R-SQUARE")
## PSEUDO R-SQUARE
pseudo_r2 <- PseudoR2(model_ordinal,
which = c("CoxSnell", "Nagelkerke", "McFadden"))
print(round(pseudo_r2, 4))
## CoxSnell Nagelkerke McFadden
## 1e-04 1e-04 0e+00
cat("\nInterpretasi: Nilai Nagelkerke menunjukkan persentase variabilitas yang\n")
##
## Interpretasi: Nilai Nagelkerke menunjukkan persentase variabilitas yang
cat("dijelaskan oleh model (semakin mendekati 1 = semakin baik)\n")
## dijelaskan oleh model (semakin mendekati 1 = semakin baik)