# Load necessary libraries
library(rpart)
library(rpart.plot)
# Read the dataset
data <- read.csv("RidingMowers.csv")
# Convert Ownership to a factor, since we are performing classification
data$Ownership <- as.factor(data$Ownership)
# View the first few rows of the dataset
head(data)
## Income Lot_Size Ownership
## 1 60.0 18.4 Owner
## 2 85.5 16.8 Owner
## 3 64.8 21.6 Owner
## 4 61.5 20.8 Owner
## 5 87.0 23.6 Owner
## 6 110.1 19.2 Owner
# Build a CART model using rpart
# Ownership is the dependent variable, and Income and Lot_Size are the features
cart_model <- rpart(Ownership ~ Income + Lot_Size, data = data, method = "class")
# Print the model summary
summary(cart_model)
## Call:
## rpart(formula = Ownership ~ Income + Lot_Size, data = data, method = "class")
## n= 24
##
## CP nsplit rel error xerror xstd
## 1 0.50 0 1.0 1.416667 0.1855610
## 2 0.01 1 0.5 1.000000 0.2041241
##
## Variable importance
## Income Lot_Size
## 80 20
##
## Node number 1: 24 observations, complexity param=0.5
## predicted class=Nonowner expected loss=0.5 P(node) =1
## class counts: 12 12
## probabilities: 0.500 0.500
## left son=2 (8 obs) right son=3 (16 obs)
## Primary splits:
## Income < 59.7 to the left, improve=3.375000, (0 missing)
## Lot_Size < 19.8 to the left, improve=3.085714, (0 missing)
## Surrogate splits:
## Lot_Size < 16.6 to the left, agree=0.75, adj=0.25, (0 split)
##
## Node number 2: 8 observations
## predicted class=Nonowner expected loss=0.125 P(node) =0.3333333
## class counts: 7 1
## probabilities: 0.875 0.125
##
## Node number 3: 16 observations
## predicted class=Owner expected loss=0.3125 P(node) =0.6666667
## class counts: 5 11
## probabilities: 0.312 0.688
# Plot the CART decision tree: type = 4 labels all nodes, and extra = 104
# shows the class probabilities plus the percentage of observations per node
rpart.plot(cart_model, type = 4, extra = 104)
# Make predictions on the training data
predictions <- predict(cart_model, data, type = "class")
# Create a confusion matrix to evaluate the performance of the model
conf_matrix <- table(Predicted = predictions, Actual = data$Ownership)
# Print the confusion matrix
print(conf_matrix)
## Actual
## Predicted Nonowner Owner
## Nonowner 7 1
## Owner 5 11
# Calculate the accuracy of the model
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy: ", round(accuracy * 100, 2), "%", sep = ""))
## [1] "Accuracy: 75%"
CHAID Algorithm (Approximated with a Conditional Inference Tree)
# Install the party package if you don't have it already
# install.packages("party")
# Load necessary libraries
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
# Read the dataset
data <- read.csv("RidingMowers.csv")
# Convert Ownership to a factor as we are performing classification
data$Ownership <- as.factor(data$Ownership)
# View the first few rows of the dataset
head(data)
## Income Lot_Size Ownership
## 1 60.0 18.4 Owner
## 2 85.5 16.8 Owner
## 3 64.8 21.6 Owner
## 4 61.5 20.8 Owner
## 5 87.0 23.6 Owner
## 6 110.1 19.2 Owner
# Build a conditional inference tree, which, like CHAID, chooses splits via
# statistical significance tests rather than an impurity measure
ctree_model <- ctree(Ownership ~ Income + Lot_Size, data = data)
# Note: summary() on party's S4 BinaryTree object reports only its class
summary(ctree_model)
## Length Class Mode
## 1 BinaryTree S4
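# The fitted splits can be inspected directly by printing the tree object
# (console output omitted here):
print(ctree_model)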
# Plot the decision tree
plot(ctree_model)
# Make predictions on the training data
predictions <- predict(ctree_model, data)
# Create a confusion matrix to evaluate the performance of the model
conf_matrix <- table(Predicted = predictions, Actual = data$Ownership)
# Print the confusion matrix
print(conf_matrix)
## Actual
## Predicted Nonowner Owner
## Nonowner 11 5
## Owner 1 7
# Calculate the accuracy of the model
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy: ", round(accuracy * 100, 2), "%", sep = ""))
## [1] "Accuracy: 75%"
# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:party':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the dataset
data <- read.csv("RidingMowers.csv")
# Convert Ownership to a factor as it's a classification problem
data$Ownership <- as.factor(data$Ownership)
# Function to calculate Gini index
gini_index <- function(class_counts) {
  total <- sum(class_counts)
  proportions <- class_counts / total
  return(1 - sum(proportions^2))
}
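# Quick sanity check: this dataset has 12 owners and 12 non-owners (see the
# class counts in the rpart summary above), so the root Gini index should be
# 1 - (0.5^2 + 0.5^2) = 0.5
gini_index(table(data$Ownership)) # expected: 0.5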
# Function to find the optimum binary split of a feature by Gini decrease
gini_decrease <- function(data, split_feature, target_feature) {
  # The parent node's Gini index is the same for every candidate split,
  # so compute it once up front
  parent_gini <- gini_index(table(data[[target_feature]]))
  # Initialize variables to store the best split and the maximum Gini decrease
  best_gini_decrease <- -Inf
  best_split_value <- NA
  best_left_gini <- NA
  best_right_gini <- NA
  # Loop through all unique values of the split feature to find the optimum split
  for (split_value in unique(data[[split_feature]])) {
    # Split the dataset at the candidate value
    left_node <- data %>% filter(.data[[split_feature]] <= split_value)
    right_node <- data %>% filter(.data[[split_feature]] > split_value)
    # Skip splits that leave either node empty (avoids division by zero)
    if (nrow(left_node) == 0 || nrow(right_node) == 0) {
      next
    }
    # Gini index for the left and right child nodes
    left_gini <- gini_index(table(left_node[[target_feature]]))
    right_gini <- gini_index(table(right_node[[target_feature]]))
    # Weighted Gini of the split
    n <- nrow(data)
    n_left <- nrow(left_node)
    n_right <- nrow(right_node)
    weighted_gini <- (n_left / n) * left_gini + (n_right / n) * right_gini
    # Gini decrease relative to the parent
    gini_decrease_value <- parent_gini - weighted_gini
    # Keep the split that gives the largest Gini decrease
    if (gini_decrease_value > best_gini_decrease) {
      best_gini_decrease <- gini_decrease_value
      best_split_value <- split_value
      best_left_gini <- left_gini
      best_right_gini <- right_gini
    }
  }
  return(list(
    "split_value" = best_split_value,
    "parent_gini" = parent_gini,
    "left_gini" = best_left_gini,
    "right_gini" = best_right_gini,
    "gini_decrease" = best_gini_decrease
  ))
}
# Calculate Gini decrease for "Income"
gini_income <- gini_decrease(data, "Income", "Ownership")
# Print all values for the optimum split of Income
print("Optimum Gini Index and Decrease for Income:")
## [1] "Optimum Gini Index and Decrease for Income:"
print(paste("Optimum Split Value: ", gini_income$split_value))
## [1] "Optimum Split Value: 75"
print(paste("Parent Gini: ", gini_income$parent_gini))
## [1] "Parent Gini: 0.5"
print(paste("Left Node Gini: ", gini_income$left_gini))
## [1] "Left Node Gini: 0.4296875"
print(paste("Right Node Gini: ", gini_income$right_gini))
## [1] "Right Node Gini: 0.21875"
print(paste("Gini Decrease: ", gini_income$gini_decrease))
## [1] "Gini Decrease: 0.140625"
# Calculate Gini decrease for "Lot_Size"
gini_lot_size <- gini_decrease(data, "Lot_Size", "Ownership")
# Print all values for the optimum split of Lot_Size
print("Optimum Gini Index and Decrease for Lot_Size:")
## [1] "Optimum Gini Index and Decrease for Lot_Size:"
print(paste("Optimum Split Value: ", gini_lot_size$split_value))
## [1] "Optimum Split Value: 19.6"
print(paste("Parent Gini: ", gini_lot_size$parent_gini))
## [1] "Parent Gini: 0.5"
print(paste("Left Node Gini: ", gini_lot_size$left_gini))
## [1] "Left Node Gini: 0.408163265306122"
print(paste("Right Node Gini: ", gini_lot_size$right_gini))
## [1] "Right Node Gini: 0.32"
print(paste("Gini Decrease: ", gini_lot_size$gini_decrease))
## [1] "Gini Decrease: 0.128571428571429"