#### CSUEB - STAT 6620 - Spring 2017 - Prof E. Suess
#### Gui Larange - Homework 5
#### Regression Trees and Model Trees -------------------
## Understanding regression trees and model trees ----
## Example: Calculating Standard Deviation Reduction ----
# Target values of all examples before any split
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
# Candidate split a: the first 9 values vs. the last 6
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
# Candidate split b: the first 7 values vs. the last 8
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
# Standard deviation reduction (SDR) of splitting `parent` into `left` and
# `right`: sd(parent) minus the size-weighted sum of the partitions' sds.
sdr <- function(parent, left, right) {
  n_parent <- length(parent)
  weighted_sd <- length(left) / n_parent * sd(left) +
    length(right) / n_parent * sd(right)
  sd(parent) - weighted_sd
}
# SDR of each candidate split
sdr_a <- sdr(tee, at1, at2)
sdr_b <- sdr(tee, bt1, bt2)
# Is the reduction achieved by split a smaller than that of split b?
sdr_a < sdr_b
## [1] TRUE
# Since sdr_b is larger, split b reduces the standard deviation more, i.e. it produces the more homogeneous partitions, so b is the preferred split.
## Example: Estimating Red Wine Quality ----
## Step 2: Exploring and preparing the data ----
# Download the red wine data set from the course web site into a data frame
wine <- read.csv(
  file = "http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml10/redwines.csv"
)
# Per-column summary statistics of the wine data
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median :0.07900 Median :14.00 Median : 38.00
## Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
# Split the data by row position: the first 1200 rows train the models and
# every remaining row is held out for testing. The test set is selected by
# dropping the training rows (negative indexing) rather than by a hard-coded
# upper bound of 1599, so it never contains all-NA rows and never silently
# loses rows if the file's row count differs.
wine_train <- wine[seq_len(1200), ]
wine_test <- wine[-seq_len(1200), ]
## Step 3: Training a model on the data ----
# Regression tree using rpart
library(rpart)
# Fit a regression tree predicting quality from all other columns
m.rpart <- rpart(formula = quality ~ ., data = wine_train)
# Print basic information about the fitted tree. As in the case of the white
# wine data, alcohol is the top-most split.
m.rpart
## n= 1200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1200 771.63250 5.642500
## 2) alcohol< 11.45 982 519.39820 5.489817
## 4) sulphates< 0.585 393 147.11960 5.201018
## 8) volatile.acidity>=1.0125 16 11.75000 4.375000 *
## 9) volatile.acidity< 1.0125 377 123.98940 5.236074 *
## 5) sulphates>=0.585 589 317.62990 5.682513
## 10) alcohol< 9.975 248 114.22180 5.415323
## 20) volatile.acidity>=0.555 109 35.02752 5.165138 *
## 21) volatile.acidity< 0.555 139 67.02158 5.611511 *
## 11) alcohol>=9.975 341 172.82700 5.876833
## 22) volatile.acidity>=0.405 217 91.85253 5.718894 *
## 23) volatile.acidity< 0.405 124 66.08871 6.153226 *
## 3) alcohol>=11.45 218 126.22020 6.330275
## 6) sulphates< 0.635 95 52.00000 6.000000
## 12) pH>=3.265 65 29.53846 5.769231 *
## 13) pH< 3.265 30 11.50000 6.500000 *
## 7) sulphates>=0.635 123 55.85366 6.585366 *
library(rpart.plot)
# Draw the fitted tree. Per the rpart.plot documentation: type = 3 labels the
# left and right branches of each split separately, extra = 101 shows the
# number and percentage of observations in each node, digits = 4 sets the
# significant digits shown, and fallen.leaves = TRUE aligns the leaves at the
# bottom of the plot.
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)
## Step 4: Evaluate model performance ----
# Predict quality for the held-out test rows with the regression tree
p.rpart <- predict(m.rpart, newdata = wine_test)
# Distribution of the predicted values ...
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.375 5.236 5.612 5.610 5.769 6.585
# ... versus the distribution of the actual quality scores
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.617 6.000 8.000
# Correlation between predicted and actual quality
cor(x = p.rpart, y = wine_test$quality)
## [1] 0.5986281
# Mean absolute error (MAE) between observed and predicted values.
#
# Args:
#   actual:    numeric vector of observed values.
#   predicted: numeric vector of predicted values. R's usual recycling rules
#              apply, so a single constant prediction may be passed.
#   na.rm:     if TRUE, missing absolute errors are dropped before averaging.
#              The default (FALSE) keeps the original behaviour: any NA in
#              either input makes the result NA.
#
# Returns:
#   A single number: the mean of abs(actual - predicted).
MAE <- function(actual, predicted, na.rm = FALSE) {
  mean(abs(actual - predicted), na.rm = na.rm)
}
# Mean absolute error of the regression tree on the test set. The arguments
# follow the order MAE() declares (actual first, predicted second); the value
# is the same either way because the absolute error is symmetric.
MAE(wine_test$quality, p.rpart)
## [1] 0.5384269
## Step 5: Improving model performance ----
# Train an M5' model tree
library(RWeka)

# Fit the model tree on the same training rows, using all other columns as
# predictors of quality
m.m5p <- M5P(formula = quality ~ ., data = wine_train)
# Summarise the fitted model's performance
summary(m.m5p)
##
## === Summary ===
##
## Correlation coefficient 0.6136
## Mean absolute error 0.5051
## Root mean squared error 0.6332
## Relative absolute error 74.5819 %
## Root relative squared error 78.967 %
## Total Number of Instances 1200
# Predict quality for the held-out test rows with the model tree
p.m5p <- predict(m.m5p, newdata = wine_test)
# Distribution of the model tree's predictions
summary(p.m5p)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.768 5.259 5.461 5.603 5.916 7.049
# Correlation between predicted and true quality; higher than the 0.5986
# obtained with the simple regression tree.
cor(x = p.m5p, y = wine_test$quality)
## [1] 0.6639448
# Mean absolute error between true and predicted quality; lower than the
# 0.5384 of the simple regression tree.
MAE(actual = wine_test$quality, predicted = p.m5p)
## [1] 0.4908107