# Load the customer complaints data.
# NOTE(review): absolute, machine-specific path; the script only runs where the
# file exists at exactly this location.
data <- read.csv("C:/Users/nicho/OneDrive/Desktop/MSDA/MK 6460 - Marketing Research & Analytics/Week 7 - Decision Tree & Random Forest/Customer_Complaints_Dataset.csv")

# Fix the RNG state so the random sample, the train/test partition and any later
# RNG-driven fit are repeatable. No seed is set in the visible script before
# this point; 123 matches the seed used for the FIFA analysis further down.
# Captured output elsewhere in this file was produced before this seed was
# added, so regenerated numbers may differ from it.
set.seed(123)

sample_data <- data %>%
  # Sample 2000 observations (slice_sample() replaces the superseded sample_n()
  # and returns all rows instead of erroring if fewer than 2000 are available)
  slice_sample(n = 2000) %>%
  # Convert character columns to factors (across() replaces the superseded
  # mutate_if()); this also turns the Filed.Complaint target into a factor
  mutate(across(where(is.character), as.factor)) %>%
  # Drop ID-like columns
  select(-Customer)

# Train/Test Split: 70% of the sampled rows go to train, the rest to test
index <- createDataPartition(sample_data$Filed.Complaint, p = 0.7, list = FALSE)
train <- sample_data[index, ]
test <- sample_data[-index, ]

# Per-column summary of the sampled data
summary(sample_data)
## State Customer.Lifetime.Value Response Coverage
## Arizona :385 Min. : 1898 No :1685 Basic :1211
## California:685 1st Qu.: 4128 Yes: 315 Extended: 603
## Nevada :190 Median : 5915 Premium : 186
## Oregon :567 Mean : 8376
## Washington:173 3rd Qu.: 9095
## Max. :83325
##
## Education EmploymentStatus Gender Income
## Bachelor :633 Disabled : 87 F:1042 Min. : 0
## College :584 Employed :1261 M: 958 1st Qu.:11432
## Doctor : 80 Medical Leave: 91 Median :34326
## High School or Below:534 Retired : 70 Mean :38253
## Master :169 Unemployed : 491 3rd Qu.:63371
## Max. :99981
##
## Location.Code Marital.Status Monthly.Premium.Auto Months.Since.Last.Claim
## Rural : 373 Divorced: 281 Min. : 61.00 Min. : 0.00
## Suburban:1283 Married :1195 1st Qu.: 69.00 1st Qu.: 6.00
## Urban : 344 Single : 524 Median : 84.00 Median :14.00
## Mean : 94.61 Mean :15.12
## 3rd Qu.:110.00 3rd Qu.:23.00
## Max. :297.00 Max. :35.00
##
## Months.Since.Policy.Inception Number.of.Open.Complaints Number.of.Policies
## Min. : 0.00 Min. :0.000 Min. :1.000
## 1st Qu.:23.75 1st Qu.:0.000 1st Qu.:1.000
## Median :47.00 Median :0.000 Median :2.000
## Mean :47.46 Mean :0.378 Mean :2.989
## 3rd Qu.:71.00 3rd Qu.:0.000 3rd Qu.:4.000
## Max. :99.00 Max. :5.000 Max. :9.000
##
## Policy.Type Policy Renew.Offer.Type Sales.Channel
## Corporate Auto: 424 Personal L3 :779 Offer1:829 Agent :758
## Personal Auto :1485 Personal L2 :425 Offer2:648 Branch :560
## Special Auto : 91 Personal L1 :281 Offer3:308 Call Center:393
## Corporate L3:197 Offer4:215 Web :289
## Corporate L2:147
## Corporate L1: 80
## (Other) : 91
## Vehicle.Class Vehicle.Size Filed.Complaint
## Four-Door Car:992 Large : 209 no :1028
## Luxury Car : 36 Medsize:1414 yes: 972
## Luxury SUV : 45 Small : 377
## Sports Car :106
## SUV :432
## Two-Door Car :389
##
# Compact structure dump of the sample: one line per column with its type.
utils::str(sample_data)
## 'data.frame': 2000 obs. of 22 variables:
## $ State : Factor w/ 5 levels "Arizona","California",..: 4 2 2 2 4 3 1 2 2 2 ...
## $ Customer.Lifetime.Value : num 24179 5462 4890 7433 2237 ...
## $ Response : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Coverage : Factor w/ 3 levels "Basic","Extended",..: 2 1 1 1 1 1 1 2 1 1 ...
## $ Education : Factor w/ 5 levels "Bachelor","College",..: 2 4 1 2 2 3 4 4 1 1 ...
## $ EmploymentStatus : Factor w/ 5 levels "Disabled","Employed",..: 2 2 2 2 5 1 5 2 5 5 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 1 1 1 1 1 2 2 1 1 ...
## $ Income : int 43704 39626 46819 58839 0 25703 0 44379 0 0 ...
## $ Location.Code : Factor w/ 3 levels "Rural","Suburban",..: 2 2 3 1 2 2 2 2 2 2 ...
## $ Marital.Status : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 2 2 3 2 3 3 ...
## $ Monthly.Premium.Auto : int 201 69 62 65 61 62 107 83 101 116 ...
## $ Months.Since.Last.Claim : int 29 23 15 4 19 4 22 10 10 12 ...
## $ Months.Since.Policy.Inception: int 67 0 0 65 44 64 96 12 48 76 ...
## $ Number.of.Open.Complaints : int 0 0 0 3 0 0 0 1 0 1 ...
## $ Number.of.Policies : int 2 3 5 2 1 1 4 8 2 2 ...
## $ Policy.Type : Factor w/ 3 levels "Corporate Auto",..: 2 2 2 2 2 2 2 3 2 2 ...
## $ Policy : Factor w/ 9 levels "Corporate L1",..: 6 6 5 6 5 6 5 9 6 6 ...
## $ Renew.Offer.Type : Factor w/ 4 levels "Offer1","Offer2",..: 1 2 1 1 3 1 4 2 1 1 ...
## $ Sales.Channel : Factor w/ 4 levels "Agent","Branch",..: 2 1 1 3 1 3 4 3 1 1 ...
## $ Vehicle.Class : Factor w/ 6 levels "Four-Door Car",..: 3 1 1 6 1 1 5 1 5 5 ...
## $ Vehicle.Size : Factor w/ 3 levels "Large","Medsize",..: 2 1 1 2 3 3 2 2 2 3 ...
## $ Filed.Complaint : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 2 2 ...
# Share of each Filed.Complaint level in the sample (class balance check).
prop.table(table(sample_data[["Filed.Complaint"]]))
##
## no yes
## 0.514 0.486
# Logistic regression for Filed.Complaint on the training split.
# Policy.Type is excluded from the formula: in this data every Policy level
# ("Corporate L1", "Personal L2", ...) falls under exactly one Policy.Type, so
# fitting both makes the design matrix rank-deficient and predict() warns that
# "prediction from a rank-deficient fit may be misleading". Policy alone spans
# the same information, so the fitted probabilities are unchanged.
logit_model <- glm(Filed.Complaint ~ . - Policy.Type, data = train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# type = "response" returns the probability of the second factor level ("yes").
logit_probs <- predict(logit_model, test, type = "response")
logit_preds <- ifelse(logit_probs > 0.5, "yes", "no")
# NOTE: confusionMatrix() reports "no" (the first level) as the positive class,
# so Sensitivity below refers to "no" and Specificity to "yes".
logit_cm <- confusionMatrix(factor(logit_preds, levels = levels(test$Filed.Complaint)), test$Filed.Complaint)
# State control/case levels and the comparison direction explicitly instead of
# letting roc() auto-detect them; these are the values it previously reported
# ("control = no, case = yes", "controls < cases").
logit_roc <- roc(test$Filed.Complaint, as.numeric(logit_probs), levels = c("no", "yes"), direction = "<")
logit_auc <- auc(logit_roc)
logit_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 267 47
## yes 41 244
##
## Accuracy : 0.8531
## 95% CI : (0.8222, 0.8805)
## No Information Rate : 0.5142
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7058
##
## Mcnemar's Test P-Value : 0.594
##
## Sensitivity : 0.8669
## Specificity : 0.8385
## Pos Pred Value : 0.8503
## Neg Pred Value : 0.8561
## Prevalence : 0.5142
## Detection Rate : 0.4457
## Detection Prevalence : 0.5242
## Balanced Accuracy : 0.8527
##
## 'Positive' Class : no
##
# Classification tree for Filed.Complaint on the training split.
tree_model <- rpart(Filed.Complaint ~ ., data = train, method = "class")
# Keep only the predicted probability of the "yes" class for each test row.
tree_probs <- predict(tree_model, test, type = "prob")[, "yes"]
tree_preds <- ifelse(tree_probs > 0.5, "yes", "no")
# NOTE: confusionMatrix() reports "no" (the first level) as the positive class,
# so Sensitivity below refers to "no" and Specificity to "yes".
tree_cm <- confusionMatrix(factor(tree_preds, levels = levels(test$Filed.Complaint)), test$Filed.Complaint)
# State control/case levels and the comparison direction explicitly instead of
# letting roc() auto-detect them; these are the values it previously reported
# ("control = no, case = yes", "controls < cases").
tree_roc <- roc(test$Filed.Complaint, as.numeric(tree_probs), levels = c("no", "yes"), direction = "<")
tree_auc <- auc(tree_roc)
tree_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 286 47
## yes 22 244
##
## Accuracy : 0.8848
## 95% CI : (0.8565, 0.9093)
## No Information Rate : 0.5142
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7689
##
## Mcnemar's Test P-Value : 0.003861
##
## Sensitivity : 0.9286
## Specificity : 0.8385
## Pos Pred Value : 0.8589
## Neg Pred Value : 0.9173
## Prevalence : 0.5142
## Detection Rate : 0.4775
## Detection Prevalence : 0.5559
## Balanced Accuracy : 0.8835
##
## 'Positive' Class : no
##
# Random forest (100 trees) for Filed.Complaint on the training split.
rf_model <- randomForest(Filed.Complaint ~ ., data = train, ntree = 100)
# Keep only the predicted probability of the "yes" class for each test row.
rf_probs <- predict(rf_model, test, type = "prob")[, "yes"]
rf_preds <- ifelse(rf_probs > 0.5, "yes", "no")
# NOTE: confusionMatrix() reports "no" (the first level) as the positive class,
# so Sensitivity below refers to "no" and Specificity to "yes".
rf_cm <- confusionMatrix(factor(rf_preds, levels = levels(test$Filed.Complaint)), test$Filed.Complaint)
# State control/case levels and the comparison direction explicitly instead of
# letting roc() auto-detect them; these are the values it previously reported
# ("control = no, case = yes", "controls < cases").
rf_roc <- roc(test$Filed.Complaint, as.numeric(rf_probs), levels = c("no", "yes"), direction = "<")
rf_auc <- auc(rf_roc)
rf_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 283 44
## yes 25 247
##
## Accuracy : 0.8848
## 95% CI : (0.8565, 0.9093)
## No Information Rate : 0.5142
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.769
##
## Mcnemar's Test P-Value : 0.03024
##
## Sensitivity : 0.9188
## Specificity : 0.8488
## Pos Pred Value : 0.8654
## Neg Pred Value : 0.9081
## Prevalence : 0.5142
## Detection Rate : 0.4725
## Detection Prevalence : 0.5459
## Balanced Accuracy : 0.8838
##
## 'Positive' Class : no
##
# One row per complaint model with its hold-out metrics.
# Sensitivity/Specificity come straight from the confusion matrices, which use
# "no" as the positive class; AUC comes from the ROC objects.
model_results <- local({
  # Confusion matrices and AUCs, in the same order as the Model labels.
  cms <- list(logit_cm, tree_cm, rf_cm)
  aucs <- list(logit_auc, tree_auc, rf_auc)

  overall_stat <- function(stat) {
    vapply(cms, function(cm) cm$overall[[stat]], numeric(1))
  }
  class_stat <- function(stat) {
    vapply(cms, function(cm) cm$byClass[[stat]], numeric(1))
  }

  data.frame(
    Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
    Accuracy = overall_stat("Accuracy"),
    Sensitivity = class_stat("Sensitivity"),
    Specificity = class_stat("Specificity"),
    AUC = vapply(aucs, as.numeric, numeric(1))
  )
})
model_results
All three models were fit to a 2,000-record sample of the customer complaints data and evaluated on a 30% hold-out set. After ensuring that factor levels aligned and the dependent variable was treated consistently, the models were able to generate valid results. Random Forest showed the strongest performance in terms of AUC, and it tied with the Decision Tree for the highest hold-out accuracy (88.5%); Logistic Regression was somewhat lower at 85.3% accuracy. The Decision Tree additionally provides an interpretable structure. The dataset’s balance between complaint classes (~50/50) allowed for effective model comparisons.
# Load the FIFA player data and make the target a factor so the models below
# treat it as a class label.
fifa <- read.csv(file = "C:/Users/nicho/OneDrive/Desktop/MSDA/MK 6460 - Marketing Research & Analytics/Week 7 - Decision Tree & Random Forest/Fifa_preferred_foot.csv")
fifa[["preferred_foot"]] <- as.factor(fifa[["preferred_foot"]])

# Train/Test Split: seeded so the partition is repeatable; 70% of rows go to
# train and the remaining rows to test.
set.seed(123)
index <- createDataPartition(y = fifa[["preferred_foot"]], p = 0.7, list = FALSE)
train <- fifa[index, ]
test <- fifa[-index, ]
# Logistic regression for preferred_foot using every other column as predictor.
fifa_logit <- glm(
  formula = preferred_foot ~ .,
  family = "binomial",
  data = train
)

# Predicted probabilities on the hold-out rows; above 0.5 is labelled "Right",
# otherwise "Left".
fifa_logit_probs <- predict(fifa_logit, newdata = test, type = "response")
fifa_logit_preds <- ifelse(fifa_logit_probs > 0.5, "Right", "Left")

# Compare predictions (recast with the reference's factor levels) to the truth.
fifa_logit_cm <- confusionMatrix(
  data = factor(fifa_logit_preds, levels = levels(test$preferred_foot)),
  reference = test$preferred_foot
)
fifa_logit_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Right
## Left 833 437
## Right 418 814
##
## Accuracy : 0.6583
## 95% CI : (0.6393, 0.6769)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3165
##
## Mcnemar's Test P-Value : 0.5382
##
## Sensitivity : 0.6659
## Specificity : 0.6507
## Pos Pred Value : 0.6559
## Neg Pred Value : 0.6607
## Prevalence : 0.5000
## Detection Rate : 0.3329
## Detection Prevalence : 0.5076
## Balanced Accuracy : 0.6583
##
## 'Positive' Class : Left
##
# Classification tree for preferred_foot on the FIFA training split.
fifa_tree <- rpart(preferred_foot ~ ., data = train, method = "class")
# Select the "Right" probability column by name rather than by position, so the
# 0.5 threshold below is always applied to the intended class (and to match how
# the complaints models select their "yes" column).
fifa_tree_probs <- predict(fifa_tree, test, type = "prob")[, "Right"]
fifa_tree_preds <- ifelse(fifa_tree_probs > 0.5, "Right", "Left")
# Recast predictions with the reference's factor levels before comparing.
fifa_tree_cm <- confusionMatrix(factor(fifa_tree_preds, levels = levels(test$preferred_foot)), test$preferred_foot)
fifa_tree_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Right
## Left 706 367
## Right 545 884
##
## Accuracy : 0.6355
## 95% CI : (0.6163, 0.6544)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.271
##
## Mcnemar's Test P-Value : 4.599e-09
##
## Sensitivity : 0.5643
## Specificity : 0.7066
## Pos Pred Value : 0.6580
## Neg Pred Value : 0.6186
## Prevalence : 0.5000
## Detection Rate : 0.2822
## Detection Prevalence : 0.4289
## Balanced Accuracy : 0.6355
##
## 'Positive' Class : Left
##
# Random forest (150 trees) for preferred_foot on the FIFA training split.
fifa_rf <- randomForest(
  preferred_foot ~ .,
  data = train,
  ntree = 150
)

# Predicted class labels for the hold-out rows, compared to the true labels.
fifa_rf_preds <- predict(fifa_rf, newdata = test)
fifa_rf_cm <- confusionMatrix(data = fifa_rf_preds, reference = test$preferred_foot)
fifa_rf_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Right
## Left 822 413
## Right 429 838
##
## Accuracy : 0.6635
## 95% CI : (0.6446, 0.682)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.3269
##
## Mcnemar's Test P-Value : 0.6052
##
## Sensitivity : 0.6571
## Specificity : 0.6699
## Pos Pred Value : 0.6656
## Neg Pred Value : 0.6614
## Prevalence : 0.5000
## Detection Rate : 0.3285
## Detection Prevalence : 0.4936
## Balanced Accuracy : 0.6635
##
## 'Positive' Class : Left
##
# One row per FIFA model with its hold-out metrics, taken from the confusion
# matrices (which report "Left" as the positive class).
fifa_results <- local({
  # Confusion matrices in the same order as the Model labels.
  cms <- list(fifa_logit_cm, fifa_tree_cm, fifa_rf_cm)

  overall_stat <- function(stat) {
    vapply(cms, function(cm) cm$overall[[stat]], numeric(1))
  }
  class_stat <- function(stat) {
    vapply(cms, function(cm) cm$byClass[[stat]], numeric(1))
  }

  data.frame(
    Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
    Accuracy = overall_stat("Accuracy"),
    Sensitivity = class_stat("Sensitivity"),
    Specificity = class_stat("Specificity")
  )
})
fifa_results
The FIFA foot prediction task yielded insightful comparisons across modeling techniques. Random Forest performed best overall, achieving the highest accuracy (66.4%) and balancing sensitivity (65.7%) and specificity (67.0%) well, although its margin over Logistic Regression (65.8% accuracy) is small and lies within Random Forest's 95% confidence interval. Its ensemble approach may capture the complexity of player skill profiles better than the simpler models. Logistic Regression performed reasonably, though it is limited in capturing non-linear patterns in skill interactions. The Decision Tree gave slightly lower predictive power (63.6% accuracy) and can be prone to overfitting, but it provides valuable interpretability. Overall, Random Forest is recommended in this context due to its robustness and marginally better performance on this balanced dataset.