#### CSUEB - STAT 6620 - Spring 2017 - Prof E. Suess
#### Gui Larange - Homework 5
#### Regression Trees and Model Trees -------------------

## Understanding regression trees and model trees ----
## Example: Calculating Standard Deviation Reduction ----
# Standard deviation reduction (SDR) obtained by splitting `parent` into the
# two groups `left` and `right`: the sd of the parent minus the sd of each
# child weighted by the share of observations it receives.
split_sdr <- function(parent, left, right) {
  n_parent <- length(parent)
  weighted_child_sd <- length(left) / n_parent * sd(left) +
    length(right) / n_parent * sd(right)
  sd(parent) - weighted_child_sd
}

# Target values before any split
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)

# Candidate split a: first 9 values vs. the remaining 6
at1 <- tee[1:9]
at2 <- tee[10:15]

# Candidate split b: first 7 values vs. the remaining 8
bt1 <- tee[1:7]
bt2 <- tee[8:15]

# SDR of each candidate split
sdr_a <- split_sdr(tee, at1, at2)
sdr_b <- split_sdr(tee, bt1, bt2)

# Does split b reduce the standard deviation more than split a?
sdr_a < sdr_b
## [1] TRUE
# Since sdr_b is larger, b is the split that produces the most homogeneity and is the preferred partition.


## Example: Estimating Red Wine Quality ----
## Step 2: Exploring and preparing the data ----
# Read the red wine data straight from the course web site into a data frame
wine <- read.csv(
  file = "http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml10/redwines.csv"
)

# Per-column summary statistics of the wine data
summary(object = wine)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00      
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00      
##  Median :0.07900   Median :14.00       Median : 38.00      
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47      
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00      
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00      
##     density             pH          sulphates         alcohol     
##  Min.   :0.9901   Min.   :2.740   Min.   :0.3300   Min.   : 8.40  
##  1st Qu.:0.9956   1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50  
##  Median :0.9968   Median :3.310   Median :0.6200   Median :10.20  
##  Mean   :0.9967   Mean   :3.311   Mean   :0.6581   Mean   :10.42  
##  3rd Qu.:0.9978   3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10  
##  Max.   :1.0037   Max.   :4.010   Max.   :2.0000   Max.   :14.90  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.636  
##  3rd Qu.:6.000  
##  Max.   :8.000
# Split the data by row position: the first 1200 rows train the models and
# every remaining row is held out for evaluation. The end of the test range is
# taken from nrow(wine) instead of a hard-coded 1599 so that no rows are
# dropped (longer file) or padded with NA (shorter file).
# NOTE(review): a positional split assumes the rows are already in random
# order -- confirm against the data source.
n_train <- 1200
# Fail loudly rather than build an empty or all-NA test set.
stopifnot(nrow(wine) > n_train)
wine_train <- wine[seq_len(n_train), ]
wine_test <- wine[(n_train + 1):nrow(wine), ]

## Step 3: Training a model on the data ----
# Fit a regression tree with rpart: quality is modelled from all other
# columns of the training set.
library(rpart)
m.rpart <- rpart(formula = quality ~ ., data = wine_train)

# Print the fitted tree (node, split, n, deviance, yval). In the recorded
# output the first split is on alcohol; the author notes the same was true for
# the white wine data.
m.rpart
## n= 1200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1200 771.63250 5.642500  
##    2) alcohol< 11.45 982 519.39820 5.489817  
##      4) sulphates< 0.585 393 147.11960 5.201018  
##        8) volatile.acidity>=1.0125 16  11.75000 4.375000 *
##        9) volatile.acidity< 1.0125 377 123.98940 5.236074 *
##      5) sulphates>=0.585 589 317.62990 5.682513  
##       10) alcohol< 9.975 248 114.22180 5.415323  
##         20) volatile.acidity>=0.555 109  35.02752 5.165138 *
##         21) volatile.acidity< 0.555 139  67.02158 5.611511 *
##       11) alcohol>=9.975 341 172.82700 5.876833  
##         22) volatile.acidity>=0.405 217  91.85253 5.718894 *
##         23) volatile.acidity< 0.405 124  66.08871 6.153226 *
##    3) alcohol>=11.45 218 126.22020 6.330275  
##      6) sulphates< 0.635 95  52.00000 6.000000  
##       12) pH>=3.265 65  29.53846 5.769231 *
##       13) pH< 3.265 30  11.50000 6.500000 *
##      7) sulphates>=0.635 123  55.85366 6.585366 *
# Draw the fitted regression tree as a diagram
library(rpart.plot)
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)

## Step 4: Evaluate model performance ----

# Predict quality for the held-out test rows with the regression tree
p.rpart <- predict(m.rpart, newdata = wine_test)

# Distribution of the predicted values ...
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.375   5.236   5.612   5.610   5.769   6.585
# ... compared with the distribution of the actual test-set quality scores
summary(wine_test[["quality"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.617   6.000   8.000
# Correlation between predicted and actual quality
cor(x = p.rpart, y = wine_test[["quality"]])
## [1] 0.5986281
# Mean absolute error between observed and predicted values.
#
# Arguments:
#   actual    - numeric vector of observed values
#   predicted - numeric vector of predictions; must have the same length as
#               `actual`, unless one of the two is a single value (e.g. a
#               constant baseline prediction), which is recycled
# Returns:
#   a single number, mean(|actual - predicted|). NA values are not removed,
#   so any NA in the inputs makes the result NA.
MAE <- function(actual, predicted) {
  n_actual <- length(actual)
  n_predicted <- length(predicted)
  # R would silently recycle the shorter vector; reject that unless one side
  # is deliberately a scalar.
  if (n_actual != n_predicted && n_actual != 1L && n_predicted != 1L) {
    stop(
      "`actual` and `predicted` must have the same length (got ",
      n_actual, " and ", n_predicted, ")",
      call. = FALSE
    )
  }
  mean(abs(actual - predicted))
}

# Mean absolute error of the regression tree on the test set. Arguments are
# passed in the order MAE() declares them (actual first, then predicted); the
# value is unchanged because the absolute difference is symmetric.
MAE(wine_test$quality, p.rpart)
## [1] 0.5384269
## Step 5: Improving model performance ----
# Train an M5' model tree on the same training data and formula
library(RWeka)

m.m5p <- M5P(formula = quality ~ ., data = wine_train)

# Performance summary of the fitted model tree
summary(object = m.m5p)
## 
## === Summary ===
## 
## Correlation coefficient                  0.6136
## Mean absolute error                      0.5051
## Root mean squared error                  0.6332
## Relative absolute error                 74.5819 %
## Root relative squared error             78.967  %
## Total Number of Instances             1200
# Predict quality for the held-out test rows with the model tree
p.m5p <- predict(m.m5p, newdata = wine_test)

# Distribution of the model tree's predictions
summary(p.m5p)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.768   5.259   5.461   5.603   5.916   7.049
# Correlation between predicted and actual quality. Higher than the
# correlation recorded for the simple regression tree.
cor(x = p.m5p, y = wine_test[["quality"]])
## [1] 0.6639448
# Mean absolute error of the model tree on the test set. Lower than the value
# recorded for the simple regression tree.
MAE(actual = wine_test[["quality"]], predicted = p.m5p)
## [1] 0.4908107