library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(ranger)
## Warning: package 'ranger' was built under R version 4.5.2
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# 1) Load data
# Read the workbook into `samples`; with no `sheet` argument read_excel()
# uses the first sheet.
samples <- read_excel(
  path = "C:/Users/Shrutika Ruia/Desktop/DAR/samples_Champo Carpets.xlsx"
)

# 2) Minimal prep
# Fail fast with a clear message when the target column is absent, instead of
# erroring later on a zero-length column assignment.
stopifnot(
  "`samples` must contain a `Converted` column" =
    "Converted" %in% names(samples)
)
# Remove the order-number column when present so it is not used as a predictor.
if ("CustomerOrderNo" %in% names(samples)) {
  samples$CustomerOrderNo <- NULL
}
# Ensure target is a 0/1 factor with consistent level order.
# Targets coded with other labels keep as.factor()'s default level order.
samples$Converted <- as.factor(samples$Converted)
if (all(levels(samples$Converted) %in% c("0", "1"))) {
  samples$Converted <- factor(samples$Converted, levels = c("0", "1"))
}
# Convert character predictors to factors.
# vapply() guarantees a logical vector (one flag per column); sapply() can
# return a list for degenerate input, which would break the indexing below.
char_cols <- vapply(samples, is.character, logical(1))
samples[char_cols] <- lapply(samples[char_cols], as.factor)
# Drop rows with missing values (simple, safe)
samples <- na.omit(samples)
# 3) Train / test split
# Fixed seed so the same random 80/20 partition is drawn on every run.
set.seed(3)
n_rows  <- nrow(samples)
n_train <- floor(0.8 * n_rows)
# Row positions for the training set; every remaining row goes to the test set.
idx   <- sample(seq_len(n_rows), size = n_train)
train <- samples[idx, , drop = FALSE]
test  <- samples[-idx, , drop = FALSE]
# 4) Fit random forest with probability output
# Every remaining column of `train` is a predictor of `Converted`.
# probability = TRUE grows a probability forest (class probabilities rather
# than hard votes); importance = "impurity" records impurity-based variable
# importance on the fitted object.
rf <- ranger(
  Converted ~ .,
  data        = train,
  num.trees   = 500,
  probability = TRUE,
  importance  = "impurity"
)
# 5) Predict probabilities and classes
pred_mat <- predict(rf, data = test)$predictions  # matrix of class probs
# Pick probability for class "1" robustly: use the "1" column when it exists,
# otherwise fall back to the second level of the training target.
class_levels <- levels(train$Converted)
pos_class <- if ("1" %in% colnames(pred_mat)) "1" else class_levels[2]
# The other level is the negative class ("0" for a 0/1 target).
neg_class <- setdiff(class_levels, pos_class)[1]
pred_prob  <- pred_mat[, pos_class]
# Threshold at 0.5 and label with the target's own levels, so the predicted
# factor shares its level set with test$Converted. Hard-coding "0"/"1" here
# would make a later `==` comparison fail for any other target coding.
pred_class <- factor(
  ifelse(pred_prob >= 0.5, pos_class, neg_class),
  levels = class_levels
)
# 6) Accuracy + confusion matrix
# Flag each test row whose predicted class equals the actual label;
# accuracy is the share of flagged rows.
is_correct <- pred_class == test$Converted
acc <- mean(is_correct)
acc
## [1] 0.9639175
# Cross-tabulate predicted classes (rows) against actual classes (columns).
table(
  Predicted = pred_class,
  Actual    = test$Converted
)
##          Actual
## Predicted   0   1
##         0 915  26
##         1  16 207
# 7) Top drivers (feature importance)
# Rank predictors by impurity importance and show at most the ten largest.
# head() stops at the end of the vector, whereas `[1:10]` would pad the result
# with NA entries if the model had fewer than ten predictors.
head(sort(rf$variable.importance, decreasing = TRUE), 10)
##    combo_code    DesignName   CountryName Custorderdate  CustomerCode 
##     292.96655     222.89873     182.34810     152.97037     138.74252 
##   QualityName     TotalArea        AreaFt     ColorName   QtyRequired 
##      87.38011      71.26858      57.80269      53.74229      51.00201
# 8) ROC + AUC
# levels = c("0", "1") declares "0" as the control class and "1" as the case.
# direction = "<" pins the orientation to "controls score lower than cases",
# which is what a predicted probability of the case class implies. Left on
# auto, roc() picks the direction from the data and can silently flip a
# worse-than-chance model to an AUC above 0.5.
roc_obj <- roc(
  response  = test$Converted,
  predictor = pred_prob,
  levels    = c("0", "1"),
  direction = "<"
)
plot(roc_obj, lwd = 3, main = "ROC Curve - Random Forest (ranger)")

auc(roc_obj)
## Area under the curve: 0.9908