# Load necessary libraries
library(rpart)
library(rpart.plot)
# Read the dataset
data <- read.csv("RidingMowers.csv")
# Convert Ownership to a factor, since we are performing classification
data$Ownership <- as.factor(data$Ownership)
# View the first few rows of the dataset
head(data)
## Income Lot_Size Ownership
## 1 60.0 18.4 Owner
## 2 85.5 16.8 Owner
## 3 64.8 21.6 Owner
## 4 61.5 20.8 Owner
## 5 87.0 23.6 Owner
## 6 110.1 19.2 Owner
# Build a CART model using rpart
# Ownership is the dependent variable, and Income and Lot_Size are the features
cart_model <- rpart(Ownership ~ Income + Lot_Size, data = data, method = "class")
# Print the model summary
summary(cart_model)
## Call:
## rpart(formula = Ownership ~ Income + Lot_Size, data = data, method = "class")
## n= 24
##
## CP nsplit rel error xerror xstd
## 1 0.50 0 1.0 1.416667 0.1855610
## 2 0.01 1 0.5 1.000000 0.2041241
##
## Variable importance
## Income Lot_Size
## 80 20
##
## Node number 1: 24 observations, complexity param=0.5
## predicted class=Nonowner expected loss=0.5 P(node) =1
## class counts: 12 12
## probabilities: 0.500 0.500
## left son=2 (8 obs) right son=3 (16 obs)
## Primary splits:
## Income < 59.7 to the left, improve=3.375000, (0 missing)
## Lot_Size < 19.8 to the left, improve=3.085714, (0 missing)
## Surrogate splits:
## Lot_Size < 16.6 to the left, agree=0.75, adj=0.25, (0 split)
##
## Node number 2: 8 observations
## predicted class=Nonowner expected loss=0.125 P(node) =0.3333333
## class counts: 7 1
## probabilities: 0.875 0.125
##
## Node number 3: 16 observations
## predicted class=Owner expected loss=0.3125 P(node) =0.6666667
## class counts: 5 11
## probabilities: 0.312 0.688
# Plot the CART decision tree: type = 4 labels all nodes, and extra = 104
# shows the class probabilities plus the percentage of observations per node
rpart.plot(cart_model, type = 4, extra = 104)
# Make predictions on the training data
predictions <- predict(cart_model, data, type = "class")
# Create a confusion matrix to evaluate the performance of the model
conf_matrix <- table(Predicted = predictions, Actual = data$Ownership)
# Print the confusion matrix
print(conf_matrix)
## Actual
## Predicted Nonowner Owner
## Nonowner 7 1
## Owner 5 11
# Calculate the accuracy of the model
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy: ", round(accuracy * 100, 2), "%", sep = ""))
## [1] "Accuracy: 75%"
CHAID Algorithm (Approximated with a Conditional Inference Tree)
# Install the party package if you don't have it already
# install.packages("party")
# Load necessary libraries
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
# Read the dataset
data <- read.csv("RidingMowers.csv")
# Convert Ownership to a factor as we are performing classification
data$Ownership <- as.factor(data$Ownership)
# View the first few rows of the dataset
head(data)
## Income Lot_Size Ownership
## 1 60.0 18.4 Owner
## 2 85.5 16.8 Owner
## 3 64.8 21.6 Owner
## 4 61.5 20.8 Owner
## 5 87.0 23.6 Owner
## 6 110.1 19.2 Owner
# Build a conditional inference tree, which, like CHAID, chooses splits via
# statistical significance tests rather than an impurity measure
ctree_model <- ctree(Ownership ~ Income + Lot_Size, data = data)
# Note: summary() on party's S4 BinaryTree object reports only its class
summary(ctree_model)
## Length Class Mode
## 1 BinaryTree S4
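# The fitted splits can be inspected directly by printing the tree object
# (console output omitted here):
print(ctree_model)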
# Plot the decision tree
plot(ctree_model)
# Make predictions on the training data
predictions <- predict(ctree_model, data)
# Create a confusion matrix to evaluate the performance of the model
conf_matrix <- table(Predicted = predictions, Actual = data$Ownership)
# Print the confusion matrix
print(conf_matrix)
## Actual
## Predicted Nonowner Owner
## Nonowner 11 5
## Owner 1 7
# Calculate the accuracy of the model
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy: ", round(accuracy * 100, 2), "%", sep = ""))
## [1] "Accuracy: 75%"
# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:party':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the dataset
data <- read.csv("RidingMowers.csv")
# Convert Ownership to a factor as it's a classification problem
data$Ownership <- as.factor(data$Ownership)
# Function to calculate Gini index
gini_index <- function(class_counts) {
  total <- sum(class_counts)
  proportions <- class_counts / total
  return(1 - sum(proportions^2))
}
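# Quick sanity check: this dataset has 12 owners and 12 non-owners (see the
# class counts in the rpart summary above), so the root Gini index should be
# 1 - (0.5^2 + 0.5^2) = 0.5
gini_index(table(data$Ownership)) # expected: 0.5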
# Function to find the optimum binary split of a feature by Gini decrease
gini_decrease <- function(data, split_feature, target_feature) {
  # The parent node's Gini index is the same for every candidate split,
  # so compute it once up front
  parent_gini <- gini_index(table(data[[target_feature]]))
  # Initialize variables to store the best split and the maximum Gini decrease
  best_gini_decrease <- -Inf
  best_split_value <- NA
  best_left_gini <- NA
  best_right_gini <- NA
  # Loop through all unique values of the split feature to find the optimum split
  for (split_value in unique(data[[split_feature]])) {
    # Split the dataset at the candidate value
    left_node <- data %>% filter(.data[[split_feature]] <= split_value)
    right_node <- data %>% filter(.data[[split_feature]] > split_value)
    # Skip splits that leave either node empty (avoids division by zero)
    if (nrow(left_node) == 0 || nrow(right_node) == 0) {
      next
    }
    # Gini index for the left and right child nodes
    left_gini <- gini_index(table(left_node[[target_feature]]))
    right_gini <- gini_index(table(right_node[[target_feature]]))
    # Weighted Gini of the split
    n <- nrow(data)
    n_left <- nrow(left_node)
    n_right <- nrow(right_node)
    weighted_gini <- (n_left / n) * left_gini + (n_right / n) * right_gini
    # Gini decrease relative to the parent
    gini_decrease_value <- parent_gini - weighted_gini
    # Keep the split that gives the largest Gini decrease
    if (gini_decrease_value > best_gini_decrease) {
      best_gini_decrease <- gini_decrease_value
      best_split_value <- split_value
      best_left_gini <- left_gini
      best_right_gini <- right_gini
    }
  }
  return(list(
    "split_value" = best_split_value,
    "parent_gini" = parent_gini,
    "left_gini" = best_left_gini,
    "right_gini" = best_right_gini,
    "gini_decrease" = best_gini_decrease
  ))
}
# Calculate Gini decrease for "Income"
gini_income <- gini_decrease(data, "Income", "Ownership")
# Print all values for the optimum split of Income
print("Optimum Gini Index and Decrease for Income:")
## [1] "Optimum Gini Index and Decrease for Income:"
print(paste("Optimum Split Value: ", gini_income$split_value))
## [1] "Optimum Split Value: 75"
print(paste("Parent Gini: ", gini_income$parent_gini))
## [1] "Parent Gini: 0.5"
print(paste("Left Node Gini: ", gini_income$left_gini))
## [1] "Left Node Gini: 0.4296875"
print(paste("Right Node Gini: ", gini_income$right_gini))
## [1] "Right Node Gini: 0.21875"
print(paste("Gini Decrease: ", gini_income$gini_decrease))
## [1] "Gini Decrease: 0.140625"
# Calculate Gini decrease for "Lot_Size"
gini_lot_size <- gini_decrease(data, "Lot_Size", "Ownership")
# Print all values for the optimum split of Lot_Size
print("Optimum Gini Index and Decrease for Lot_Size:")
## [1] "Optimum Gini Index and Decrease for Lot_Size:"
print(paste("Optimum Split Value: ", gini_lot_size$split_value))
## [1] "Optimum Split Value: 19.6"
print(paste("Parent Gini: ", gini_lot_size$parent_gini))
## [1] "Parent Gini: 0.5"
print(paste("Left Node Gini: ", gini_lot_size$left_gini))
## [1] "Left Node Gini: 0.408163265306122"
print(paste("Right Node Gini: ", gini_lot_size$right_gini))
## [1] "Right Node Gini: 0.32"
print(paste("Gini Decrease: ", gini_lot_size$gini_decrease))
## [1] "Gini Decrease: 0.128571428571429"