library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(ranger)
## Warning: package 'ranger' was built under R version 4.5.2
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# 1) Load data
# The workbook location lives in one named variable so it is easy to repoint.
data_path <- "C:/Users/Shrutika Ruia/Desktop/DAR/samples_Champo Carpets.xlsx"
samples <- read_excel(data_path)
# 2) Minimal prep
# Fail early with a clear message if the target column is absent; everything
# below reads `samples$Converted`.
if (!"Converted" %in% names(samples)) {
  stop("Column 'Converted' not found in the loaded workbook", call. = FALSE)
}
# Drop the order number column when present so that the later `Converted ~ .`
# formula does not use it as a predictor.
if ("CustomerOrderNo" %in% names(samples)) {
  samples$CustomerOrderNo <- NULL
}
# Ensure target is a 0/1 factor with consistent level order
samples$Converted <- as.factor(samples$Converted)
if (all(levels(samples$Converted) %in% c("0", "1"))) {
  samples$Converted <- factor(samples$Converted, levels = c("0", "1"))
}
# Convert character predictors to factors.
# vapply() always returns a logical vector (even for a zero-column frame),
# whereas sapply() would return an empty list there.
char_cols <- vapply(samples, is.character, logical(1))
samples[char_cols] <- lapply(samples[char_cols], as.factor)
# Drop rows with missing values (simple, safe)
samples <- na.omit(samples)
# 3) Train / test split
# Fixed seed so the 80/20 partition is reproducible.
set.seed(3)
n_rows <- nrow(samples)
idx <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
train <- samples[idx, , drop = FALSE]
# Pick the hold-out rows by position instead of with `-idx`: a negative
# zero-length index selects no rows at all, so `-idx` would leave `test` empty
# whenever `idx` is empty. setdiff() yields the same rows, in the same order,
# as `-idx` does when `idx` is non-empty.
test <- samples[setdiff(seq_len(n_rows), idx), , drop = FALSE]
# 4) Fit random forest with probability output
# Every remaining column of `train` is used as a predictor of `Converted`.
# probability = TRUE makes predict() return per-class probabilities, and
# impurity importance is stored for the ranking in step 7.
rf <- ranger(
  formula = Converted ~ .,
  data = train,
  num.trees = 500,
  importance = "impurity",
  probability = TRUE
)
# 5) Predict probabilities and classes
pred_mat <- predict(rf, data = test)$predictions # matrix of class probs
# The class labels come from the training target so that the predicted factor
# always shares its levels with `test$Converted`; hard-coded "0"/"1" labels
# would make the later `==` comparison fail for any other level names.
class_levels <- levels(train$Converted)
stopifnot(
  "Converted must have exactly two classes" = length(class_levels) == 2L
)
# Pick probability for class "1" robustly (fall back to the second level)
pos_class <- if ("1" %in% colnames(pred_mat)) "1" else class_levels[2]
neg_class <- setdiff(class_levels, pos_class)
pred_prob <- pred_mat[, pos_class]
# Classify as positive when the positive-class probability reaches 0.5.
pred_class <- factor(
  ifelse(pred_prob >= 0.5, pos_class, neg_class),
  levels = class_levels
)
# 6) Accuracy + confusion matrix
# Share of hold-out rows whose predicted class equals the recorded outcome.
is_correct <- pred_class == test$Converted
acc <- mean(is_correct)
acc
## [1] 0.9639175
# Cross-tabulate predictions (rows) against actual outcomes (columns).
conf_mat <- table(Predicted = pred_class, Actual = test$Converted)
conf_mat
## Actual
## Predicted 0 1
## 0 915 26
## 1 16 207
# 7) Top drivers (feature importance)
# head() caps the ranking at ten entries without padding; `[1:10]` would
# append NA entries whenever the model has fewer than ten predictors.
head(sort(rf$variable.importance, decreasing = TRUE), 10)
## combo_code DesignName CountryName Custorderdate CustomerCode
## 292.96655 222.89873 182.34810 152.97037 138.74252
## QualityName TotalArea AreaFt ColorName QtyRequired
## 87.38011 71.26858 57.80269 53.74229 51.00201
# 8) ROC + AUC
# Order the response levels as (control, case), with `pos_class` as the case,
# so they stay in step with the probability column chosen in step 5.
roc_levels <- c(setdiff(levels(test$Converted), pos_class), pos_class)
# `pred_prob` is the probability of the case class, so higher values should
# indicate cases. Fixing direction = "<" stops roc() from choosing the
# direction from the data, which could flip the comparison and overstate the
# AUC of a poorly performing model.
roc_obj <- roc(
  response = test$Converted,
  predictor = pred_prob,
  levels = roc_levels,
  direction = "<"
)
plot(roc_obj, lwd = 3, main = "ROC Curve - Random Forest (ranger)")

auc(roc_obj)
## Area under the curve: 0.9908