Used Libraries

library(DMwR) # library including the dataset
## Warning: package 'DMwR' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: grid
library(car)
library(lattice)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.2.3
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(rpart) # regression trees
library(rattle)
## Warning: package 'rattle' was built under R version 3.2.3
## Rattle: A free graphical interface for data mining with R.
## Version 4.0.5 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.2.3
library(RColorBrewer)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:Hmisc':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(plyr)
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
## 
## The following object is masked from 'package:DMwR':
## 
##     join

Load Data

The dataset was downloaded from https://archive.ics.uci.edu/ml/datasets/Coil+1999+Competition+Data.

algae <- read.csv('analysis.data',
                    header=F,
                    dec='.',
                  col.names=c('season','size','speed','mxPH','mnO2','Cl',
                              'NO3','NH4','oPO4','PO4','Chla','a1','a2','a3','a4',
                              'a5','a6','a7'),
                  na.strings=c('XXXXXXX'))

algae$NO3 <- as.numeric(algae$NO3)

Data Summaries

summary(algae)
##     season       size       speed         mxPH            mnO2       
##  autumn:40   large_:45   high__:84   Min.   :5.600   Min.   : 1.500  
##  spring:53   medium:84   low___:33   1st Qu.:7.700   1st Qu.: 7.725  
##  summer:45   small_:71   medium:83   Median :8.060   Median : 9.800  
##  winter:62                           Mean   :8.012   Mean   : 9.118  
##                                      3rd Qu.:8.400   3rd Qu.:10.800  
##                                      Max.   :9.700   Max.   :13.400  
##                                      NA's   :1       NA's   :2       
##        Cl               NO3              NH4              oPO4       
##  Min.   :  0.222   Min.   :  1.00   Min.   :  5.00   Min.   :  1.00  
##  1st Qu.: 10.981   1st Qu.: 48.25   1st Qu.: 35.62   1st Qu.: 16.00  
##  Median : 32.730   Median : 96.50   Median : 99.67   Median : 41.40  
##  Mean   : 43.636   Mean   : 95.86   Mean   :154.45   Mean   : 83.33  
##  3rd Qu.: 57.824   3rd Qu.:143.75   3rd Qu.:203.73   3rd Qu.:102.25  
##  Max.   :391.500   Max.   :192.00   Max.   :931.83   Max.   :771.60  
##  NA's   :10        NA's   :2        NA's   :2        NA's   :2       
##       PO4              Chla              a1               a2        
##  Min.   :  0.90   Min.   :  0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 19.39   1st Qu.:  2.00   1st Qu.: 1.475   1st Qu.: 0.000  
##  Median : 84.50   Median :  5.20   Median : 7.400   Median : 2.100  
##  Mean   :111.55   Mean   : 13.54   Mean   :16.863   Mean   : 6.934  
##  3rd Qu.:182.16   3rd Qu.: 18.30   3rd Qu.:24.075   3rd Qu.: 9.075  
##  Max.   :558.75   Max.   :110.46   Max.   :89.800   Max.   :72.600  
##  NA's   :2        NA's   :12                                        
##        a3               a4               a5              a6        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.00   1st Qu.: 0.000  
##  Median : 1.750   Median : 0.000   Median : 1.90   Median : 0.000  
##  Mean   : 4.729   Mean   : 1.885   Mean   : 5.63   Mean   : 5.199  
##  3rd Qu.: 6.150   3rd Qu.: 2.225   3rd Qu.: 7.70   3rd Qu.: 6.725  
##  Max.   :44.600   Max.   :35.600   Max.   :77.60   Max.   :52.500  
##                                                                    
##        a7        
##  Min.   : 0.000  
##  1st Qu.: 0.000  
##  Median : 0.000  
##  Mean   : 2.506  
##  3rd Qu.: 2.400  
##  Max.   :31.600  
##  NA's   :17
str(algae)
## 'data.frame':    200 obs. of  18 variables:
##  $ season: Factor w/ 4 levels "autumn","spring",..: 4 2 1 2 1 4 3 1 4 4 ...
##  $ size  : Factor w/ 3 levels "large_","medium",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ speed : Factor w/ 3 levels "high__","low___",..: 3 3 3 3 3 1 1 1 3 1 ...
##  $ mxPH  : num  8 8.35 8.1 8.07 8.06 8.25 8.15 8.05 8.7 7.93 ...
##  $ mnO2  : num  9.8 8 11.4 4.8 9 13.1 10.3 10.6 3.4 9.9 ...
##  $ Cl    : num  60.8 57.8 40 77.4 55.4 ...
##  $ NO3   : num  177 48 165 86 80 190 64 158 32 53 ...
##  $ NH4   : num  578 370 346.7 98.2 233.7 ...
##  $ oPO4  : num  105 428.8 125.7 61.2 58.2 ...
##  $ PO4   : num  170 558.8 187.1 138.7 97.6 ...
##  $ Chla  : num  50 1.3 15.6 1.4 10.5 ...
##  $ a1    : num  0 1.4 3.3 3.1 9.2 15.1 2.4 18.2 25.4 17 ...
##  $ a2    : num  0 7.6 53.6 41 2.9 14.6 1.2 1.6 5.4 0 ...
##  $ a3    : num  0 4.8 1.9 18.9 7.5 1.4 3.2 0 2.5 0 ...
##  $ a4    : num  0 1.9 0 0 0 0 3.9 0 0 2.9 ...
##  $ a5    : num  34.2 6.7 0 1.4 7.5 22.5 5.8 5.5 0 0 ...
##  $ a6    : num  8.3 0 0 0 4.1 12.6 6.8 8.7 0 0 ...
##  $ a7    : num  0 2.1 9.7 1.4 1 2.9 0 0 0 1.7 ...

Univariate Analysis

#hist(algae$mxPH, prob = T)

par(mfrow=c(1,2)) # get two graphs side by side 
hist(algae$mxPH, prob=T, xlab='',
     main='Histogram of maximum pH value',ylim=0:1)
lines(density(algae$mxPH,na.rm=T))
rug(jitter(algae$mxPH))
qqPlot(algae$mxPH, main='Normal QQ plot of maximum pH')

par(mfrow=c(1,1))

The QQ plot shows several low values of the variable that fall outside the 95% confidence bands, clearly breaking the assumption of a normal distribution.

Next plot is for variable oPO4:

boxplot(algae$oPO4, ylab = "Orthophosphate (oPO4)")
rug(jitter(algae$oPO4), side = 2)
abline(h = mean(algae$oPO4, na.rm = T), lty = 2)

Here, the horizontal dashed line marks the mean value of the variable. The distribution of oPO4 is clearly concentrated on low values, i.e., it is positively skewed. In most of the water samples the value of oPO4 is low, but there are several observations with high, and even extremely high, values.

Now, let’s plot the values of variable NH4.

plot(algae$NH4, xlab = "")
abline(h = mean(algae$NH4, na.rm = T), lty = 1)
abline(h = mean(algae$NH4, na.rm = T) + sd(algae$NH4, na.rm = T),
       lty = 2)
abline(h = median(algae$NH4, na.rm = T), lty = 3)

# Below - interactive plots that do not work in a presentation format.
# identify(algae$NH4)
# 
# plot(algae$NH4, xlab = "")
# clicked.lines <- identify(algae$NH4)
# algae[clicked.lines, ]

The plot shows the outliers have NH4 > 800, so I will inspect these cases:

algae[!is.na(algae$NH4) & algae$NH4 > 800, ]
##     season   size  speed mxPH mnO2     Cl NO3     NH4  oPO4     PO4 Chla
## 86  winter medium medium  7.6  6.0 53.000 126 914.000 137.6 254.600  4.3
## 108 autumn medium high__  8.3 11.1 41.500 149 931.833  39.0 124.200 13.1
## 144 winter medium low___  8.0  4.5 79.077 188 920.000  70.0 200.231 19.4
##       a1   a2  a3  a4  a5   a6   a7
## 86   0.0  0.0 0.0 4.6 9.0 13.1 30.1
## 108 23.7 13.7 0.0 1.7 6.4  2.6  0.0
## 144  2.5  1.4 1.4 6.2 4.1  1.8  3.9

Bivariate Analysis

bwplot(size ~ a1, data=algae, ylab='River Size',xlab='Algal A1')

bwplot(size ~ a1, data=algae,panel=panel.bpplot,
       probs=seq(.01,.49,by=.01), datadensity=TRUE,
       ylab='River Size',xlab='Algal A1')

The first plot (a simple boxplot) shows that higher frequencies of algal a1 are expected in smaller rivers. The second plot (a box percentile plot) confirms the previous observation that smaller rivers have higher frequencies of this alga, but it also shows that the value of the observed frequencies for these small rivers is much more widespread across the domain of frequencies than for other types of rivers.

Let’s now see how the frequency of algal a3 depends on season and mnO2.

# create a factorized version of continuous mnO2 variable
# number sets the number of desired bins
# overlap sets the overlap between the bins near their respective boundaries 
# thus certain observations will be assigned to adjacent bins:
minO2 <- equal.count(na.omit(algae$mnO2),
                     number=4,overlap=1/5)


# create the plot:
stripplot(season ~ a3|minO2,
          data=algae[!is.na(algae$mnO2),])

The bottom-left plot corresponds to lower values of mnO2.

Dealing with Unknown Values

Here, I show commented-out code for several different ways of dealing with unknown values. Only one part of the code is left uncommented and active: the approach that I will actually use for dealing with unknown values in this dataset.

Before dealing with unknown values, I will count them.

nrow(algae[!complete.cases(algae),]) # number of rows with NA
## [1] 33
apply(algae, 1, function(x) sum(is.na(x))) # number of NA in each row
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1
##  [36] 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 2 2 2 2 2 6 1 0 0 0 0 0 1 1
##  [71] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [106] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [141] 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0
## [176] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0
manyNAs(algae, 0.2) # row numbers of observations with more than 20% of their values missing
## [1]  62 199

Removing the Observations with Unknown Values

In order to remove water samples with NA values from the data frame, simply do:

# algae <- na.omit(algae)       # remove all the rows with NA cases
# algae <- algae[-c(62, 199), ] # remove selected observations
# algae <- algae[-manyNAs(algae), ] # same as the previous line, if you do not know the row numbers

Filling in the Unknowns with the Most Frequent Values

An alternative to eliminating the cases with unknown values is to fill in each unknown with its most probable value.

For approximately normal distributions, the mean is a good choice. For skewed distributions, or in the presence of outliers, the median is a better one.

# algae[48, "mxPH"] <- mean(algae$mxPH, na.rm = T) # fill NA of mxPH with mean
# algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = T) # filling NA with median

The function centralImputation() of the package “DMwR” fills in all unknowns in a dataset using a statistic of centrality. This function uses the median for numeric columns and uses the most frequent value (mode) for nominal variables.

# algae <- centralImputation(algae)

Filling in the unknown values by exploring the correlations between variables

An alternative for getting less biased estimators of the unknown values is to explore the relationships between variables.

# cor(algae[, 4:18], use = "complete.obs") # correlation matrix of the numeric variables
symnum(cor(algae[,4:18],use="complete.obs")) 
##      mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1                                         
## mnO2    1                                      
## Cl         1                                   
## NO3        .  1                                
## NH4           .  1                             
## oPO4       .  .     1                          
## PO4     .  .  .  .  + 1                        
## Chla .                . 1                      
## a1         .  .     . .    1                   
## a2   .                  .     1                
## a3      .                        1             
## a4      .        .                  1          
## a5            .                        1       
## a6            .                        .  1    
## a7                                           1 
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1

The correlation matrix shows that the only reasonably strong correlation is between PO4 and oPO4 (above 0.8), which allows me to fill in the unknowns of one of these variables from the other. To do so, I first find the form of the linear relationship between them:

lm(PO4 ~ oPO4, data = algae)
## 
## Call:
## lm(formula = PO4 ~ oPO4, data = algae)
## 
## Coefficients:
## (Intercept)         oPO4  
##     91.0375       0.2509

Now, I could use the discovered linear relation and create a function that would return the value of PO4 given the value of oPO4, and then apply this function to all unknown values of PO4:

# fillPO4 <- function(oP) {
#   if (is.na(oP))
#     return(NA)
#   else return(91.0375 + 0.2509 * oP)  # coefficients taken from the lm() fit above
#   }
# algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4),"oPO4"], fillPO4)

Filling in the unknown values by exploring the similarity between cases

Instead of exploring the correlation between the columns (variables) of a dataset, we can try to use the similarities between the rows (observations) to fill in the unknown values.

A common choice of similarity metric is the Euclidean distance. It is used by the function knnImputation() from the package “DMwR”, where k is the number of nearest neighbours to use.

clean.algae <- knnImputation(algae, k = 10)
nrow(algae[!complete.cases(algae),])
## [1] 33

After applying the knn imputation, the new data frame clean.algae no longer contains NA values (the count above refers to the original algae data frame, which is left unchanged).
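
Instead of a distance-weighted average of the neighbours' values, knnImputation() can also fill each unknown with the median (for numeric columns) or the mode (for nominal columns) of the k nearest neighbours. A sketch of that variant, which I do not use here:

# clean.algae <- knnImputation(algae, k = 10, meth = "median")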

Prediction Models

The main goal of this case study is to obtain predictions for the frequency values of the seven algae in a set of 140 water samples. This is a regression task.

I will explore two different predictive models:

Multiple Linear Regression

lm.a1 <- lm(a1 ~ ., data = clean.algae[, 1:12]) # predict a1 using all other variables
summary(lm.a1)
## 
## Call:
## lm(formula = a1 ~ ., data = clean.algae[, 1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.743 -11.056  -2.242   7.917  62.621 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  27.077728  23.611308   1.147  0.25295   
## seasonspring  3.770756   4.079948   0.924  0.35658   
## seasonsummer  0.106507   3.812980   0.028  0.97775   
## seasonwinter  4.882924   3.680132   1.327  0.18621   
## sizemedium    5.017141   3.525988   1.423  0.15646   
## sizesmall_   10.872536   3.989929   2.725  0.00705 **
## speedlow___   4.025678   4.533596   0.888  0.37572   
## speedmedium   1.622210   3.088080   0.525  0.60000   
## mxPH         -1.823119   2.653886  -0.687  0.49297   
## mnO2          1.481458   0.675167   2.194  0.02947 * 
## Cl           -0.019811   0.032857  -0.603  0.54729   
## NO3          -0.090714   0.029202  -3.106  0.00219 **
## NH4          -0.003584   0.008093  -0.443  0.65840   
## oPO4         -0.030759   0.013031  -2.360  0.01930 * 
## PO4          -0.043188   0.016346  -2.642  0.00895 **
## Chla         -0.116731   0.075785  -1.540  0.12520   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.93 on 184 degrees of freedom
## Multiple R-squared:  0.4185, Adjusted R-squared:  0.3711 
## F-statistic: 8.829 on 15 and 184 DF,  p-value: 3.36e-15

The proportion of variance explained by this model is not very impressive (adjusted R² of about 37%). Still, I can reject the hypothesis that the target variable does not depend on the predictors (the p-value of the F test is very small). Looking at the significance of some of the coefficients, I may question the inclusion of some of them in the model. I will explore the backward elimination method for simplifying regression models.

I will start the study of simplifying the linear model using the anova() function.

anova(lm.a1)
## Analysis of Variance Table
## 
## Response: a1
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## season      3    215    71.6  0.2499 0.8613482    
## size        2  11770  5884.8 20.5323 8.933e-09 ***
## speed       2   4025  2012.6  7.0222 0.0011510 ** 
## mxPH        1   1016  1016.1  3.5452 0.0612974 .  
## mnO2        1   2661  2661.2  9.2852 0.0026500 ** 
## Cl          1   4319  4318.9 15.0690 0.0001443 ***
## NO3         1   8616  8616.3 30.0630 1.366e-07 ***
## NH4         1    441   440.9  1.5384 0.2164367    
## oPO4        1   1426  1426.2  4.9761 0.0269074 *  
## PO4         1   2788  2788.3  9.7286 0.0021063 ** 
## Chla        1    680   680.0  2.3725 0.1252037    
## Residuals 184  52736   286.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The results indicate that season is the variable that contributes least to the reduction of the fitting error of the model, as it has the smallest sum of squares. Let’s remove it from the model:

lm2.a1 <- update(lm.a1, . ~ . - season)
summary(lm2.a1)
## 
## Call:
## lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + 
##     oPO4 + PO4 + Chla, data = clean.algae[, 1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.615 -11.237  -2.685   7.465  63.857 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 28.343420  22.639819   1.252  0.21216   
## sizemedium   5.002188   3.521347   1.421  0.15712   
## sizesmall_  11.626057   3.956956   2.938  0.00372 **
## speedlow___  3.110949   4.464468   0.697  0.48678   
## speedmedium  1.030734   3.051908   0.338  0.73594   
## mxPH        -1.335101   2.606168  -0.512  0.60906   
## mnO2         1.195072   0.636049   1.879  0.06181 . 
## Cl          -0.017843   0.032700  -0.546  0.58596   
## NO3         -0.093062   0.028732  -3.239  0.00142 **
## NH4         -0.003897   0.007924  -0.492  0.62347   
## oPO4        -0.031115   0.012954  -2.402  0.01728 * 
## PO4         -0.039892   0.015936  -2.503  0.01316 * 
## Chla        -0.119554   0.075705  -1.579  0.11598   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.92 on 187 degrees of freedom
## Multiple R-squared:  0.4096, Adjusted R-squared:  0.3718 
## F-statistic: 10.81 on 12 and 187 DF,  p-value: 3.451e-16
anova(lm.a1,lm2.a1)
## Analysis of Variance Table
## 
## Model 1: a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + 
##     PO4 + Chla
## Model 2: a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + 
##     Chla
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    184 52736                           
## 2    187 53541 -3   -804.97 0.9362 0.4243

The fit is essentially unchanged (adjusted R² of about 37%) and still not impressive. The anova() comparison shows that the differences between the two models are not significant (Pr(>F) = 0.42, well above 0.05); however, the second model is simpler.

To continue the backward elimination of unimportant coefficients and simplify the model further, I use the function step(); since no scope is given, its direction parameter defaults to backward:

final.lm <- step(lm.a1)
## Start:  AIC=1146.95
## a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + 
##     PO4 + Chla
## 
##          Df Sum of Sq   RSS    AIC
## - speed   2    227.83 52964 1143.8
## - season  3    804.97 53541 1144.0
## - NH4     1     56.21 52792 1145.2
## - Cl      1    104.19 52840 1145.3
## - mxPH    1    135.26 52871 1145.5
## <none>                52736 1147.0
## - Chla    1    679.99 53416 1147.5
## - mnO2    1   1379.90 54116 1150.1
## - oPO4    1   1596.86 54333 1150.9
## - size    2   2177.48 54914 1151.0
## - PO4     1   2000.67 54737 1152.4
## - NO3     1   2765.73 55502 1155.2
## 
## Step:  AIC=1143.81
## a1 ~ season + size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + 
##     Chla
## 
##          Df Sum of Sq   RSS    AIC
## - season  3    716.23 53680 1140.5
## - NH4     1     38.27 53002 1142.0
## - Cl      1     84.82 53049 1142.1
## - mxPH    1    169.97 53134 1142.5
## - Chla    1    516.62 53481 1143.8
## <none>                52964 1143.8
## - mnO2    1   1216.90 54181 1146.3
## - size    2   1950.77 54915 1147.0
## - oPO4    1   1515.17 54479 1147.5
## - PO4     1   1872.60 54837 1148.8
## - NO3     1   2883.01 55847 1152.4
## 
## Step:  AIC=1140.5
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## - NH4   1     48.90 53729 1138.7
## - Cl    1     76.78 53757 1138.8
## - mxPH  1     95.67 53776 1138.8
## <none>              53680 1140.5
## - Chla  1    590.23 54271 1140.7
## - mnO2  1    931.60 54612 1141.9
## - oPO4  1   1606.70 55287 1144.4
## - PO4   1   1720.76 55401 1144.8
## - size  2   2461.83 56142 1145.5
## - NO3   1   3093.31 56774 1149.7
## 
## Step:  AIC=1138.68
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + oPO4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## - Cl    1      74.5 53804 1137.0
## - mxPH  1      86.6 53816 1137.0
## <none>              53729 1138.7
## - Chla  1     595.8 54325 1138.9
## - mnO2  1     995.4 54725 1140.3
## - oPO4  1    1749.5 55479 1143.1
## - PO4   1    1811.7 55541 1143.3
## - size  2    2529.8 56259 1143.9
## - NO3   1    3469.9 57199 1149.2
## 
## Step:  AIC=1136.96
## a1 ~ size + mxPH + mnO2 + NO3 + oPO4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## - mxPH  1     100.5 53904 1135.3
## <none>              53804 1137.0
## - Chla  1     577.0 54381 1137.1
## - mnO2  1    1141.5 54945 1139.2
## - oPO4  1    1871.0 55675 1141.8
## - PO4   1    2032.8 55837 1142.4
## - size  2    2651.3 56455 1142.6
## - NO3   1    4102.2 57906 1149.7
## 
## Step:  AIC=1135.33
## a1 ~ size + mnO2 + NO3 + oPO4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## <none>              53904 1135.3
## - Chla  1     746.3 54650 1136.1
## - mnO2  1    1102.2 55006 1137.4
## - oPO4  1    1987.6 55892 1140.6
## - PO4   1    2418.2 56322 1142.1
## - size  2    3747.6 57652 1144.8
## - NO3   1    4058.2 57962 1147.8
summary(final.lm)
## 
## Call:
## lm(formula = a1 ~ size + mnO2 + NO3 + oPO4 + PO4 + Chla, data = clean.algae[, 
##     1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.100 -10.847  -3.178   6.987  65.162 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.21750    6.38537   2.853 0.004806 ** 
## sizemedium   4.54614    3.23192   1.407 0.161152    
## sizesmall_  11.83813    3.33811   3.546 0.000491 ***
## mnO2         1.18380    0.59745   1.981 0.048972 *  
## NO3         -0.09642    0.02536  -3.802 0.000193 ***
## oPO4        -0.03299    0.01240  -2.661 0.008457 ** 
## PO4         -0.04351    0.01482  -2.935 0.003744 ** 
## Chla        -0.11090    0.06802  -1.630 0.104663    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.76 on 192 degrees of freedom
## Multiple R-squared:  0.4056, Adjusted R-squared:  0.384 
## F-statistic: 18.72 on 7 and 192 DF,  p-value: < 2.2e-16

The proportion of variance explained by this model is still not very interesting (around 38%). This kind of result is usually a sign that the linearity assumption of the model is inadequate for the domain.
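
One quick way to probe the linearity assumption is to inspect the standard diagnostic plots of the fitted linear model. A minimal sketch (not run here); plot() applied to an lm object produces the residuals-vs-fitted, normal QQ, scale-location and leverage plots:

# par(mfrow = c(2, 2))   # arrange the four diagnostic plots in a 2x2 grid
# plot(final.lm)         # residuals vs fitted, normal QQ, scale-location, residuals vs leverage
# par(mfrow = c(1, 1))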

Regression Trees

A different type of regression model is the regression tree.

rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])
rt.a1
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 200 90693.8700 16.863000  
##    2) oPO4>=27.125 124 11273.0000  6.335484  
##      4) oPO4>=51.118 87  1836.2570  3.319540 *
##      5) oPO4< 51.118 37  6784.6730 13.427030  
##       10) NH4< 124.6665 21  2831.8180  9.090476 *
##       11) NH4>=124.6665 16  3039.6040 19.118750 *
##    3) oPO4< 27.125 76 43255.7400 34.039470  
##      6) Chla>=4.15 18  3838.5450 13.883330 *
##      7) Chla< 4.15 58 29834.8300 40.294830  
##       14) speed=high__,low___ 47 19839.6100 36.372340  
##         28) Cl>=7.4315 23  6941.4460 27.513040  
##           56) PO4>=43.818 7   309.5343 10.728570 *
##           57) PO4< 43.818 16  3797.1190 34.856250 *
##         29) Cl< 7.4315 24  9362.9760 44.862500  
##           58) mxPH< 7.865 12  4780.1900 35.150000 *
##           59) mxPH>=7.865 12  2318.8030 54.575000 *
##       15) speed=medium 11  6182.3070 57.054550 *
prettyTree(rt.a1)

fancyRpartPlot(rt.a1, cex=0.7)

Trees are usually obtained in two steps. Initially, a large tree is grown, and then this tree is pruned by deleting bottom nodes through a process of statistical estimation. This process has the goal of avoiding overfitting.

rpart() stops growing the tree when the proportional decrease in deviance achieved by a split falls below the value of the cp parameter (default 0.01). I will check the cp information for the produced tree:

printcp(rt.a1)
## 
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
## 
## Variables actually used in tree construction:
## [1] Chla  Cl    mxPH  NH4   oPO4  PO4   speed
## 
## Root node error: 90694/200 = 453.47
## 
## n= 200 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.398760      0   1.00000 1.01305 0.13190
## 2 0.105656      1   0.60124 0.69962 0.11122
## 3 0.042042      2   0.49558 0.67956 0.11341
## 4 0.038979      3   0.45354 0.62910 0.10314
## 5 0.031257      4   0.41456 0.62394 0.10323
## 6 0.029242      5   0.38331 0.59826 0.10157
## 7 0.024963      6   0.35406 0.58485 0.10197
## 8 0.010070      7   0.32910 0.63583 0.10648
## 9 0.010000      8   0.31903 0.66263 0.11145

The tree grown by rpart() is tree number 9, with CP = 0.01 and a relative error of 0.32. However, R estimates, using an internal process of ten-fold cross-validation, that this tree will have an average relative error of 0.663 +/- 0.111. Using these more reliable estimates of performance, which help to avoid the overfitting problem, I would select tree number 8 as a more reliable one, with a lower estimated relative error (0.636 +/- 0.106). I am going to obtain this tree by using its cp value:

rt2.a1 <- prune(rt.a1, cp = 0.010070)
rt2.a1
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 200 90693.8700 16.863000  
##    2) oPO4>=27.125 124 11273.0000  6.335484  
##      4) oPO4>=51.118 87  1836.2570  3.319540 *
##      5) oPO4< 51.118 37  6784.6730 13.427030 *
##    3) oPO4< 27.125 76 43255.7400 34.039470  
##      6) Chla>=4.15 18  3838.5450 13.883330 *
##      7) Chla< 4.15 58 29834.8300 40.294830  
##       14) speed=high__,low___ 47 19839.6100 36.372340  
##         28) Cl>=7.4315 23  6941.4460 27.513040  
##           56) PO4>=43.818 7   309.5343 10.728570 *
##           57) PO4< 43.818 16  3797.1190 34.856250 *
##         29) Cl< 7.4315 24  9362.9760 44.862500  
##           58) mxPH< 7.865 12  4780.1900 35.150000 *
##           59) mxPH>=7.865 12  2318.8030 54.575000 *
##       15) speed=medium 11  6182.3070 57.054550 *
printcp(rt2.a1)
## 
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
## 
## Variables actually used in tree construction:
## [1] Chla  Cl    mxPH  oPO4  PO4   speed
## 
## Root node error: 90694/200 = 453.47
## 
## n= 200 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.398760      0   1.00000 1.01305 0.13190
## 2 0.105656      1   0.60124 0.69962 0.11122
## 3 0.042042      2   0.49558 0.67956 0.11341
## 4 0.038979      3   0.45354 0.62910 0.10314
## 5 0.031257      4   0.41456 0.62394 0.10323
## 6 0.029242      5   0.38331 0.59826 0.10157
## 7 0.024963      6   0.35406 0.58485 0.10197
## 8 0.010070      7   0.32910 0.63583 0.10648
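
This "within one standard error" style of tree selection can also be computed directly from the cp table instead of being read off the printout. A minimal sketch (not run here), using the rt.a1 tree grown above; it only illustrates the rule and is not the exact implementation used by the DMwR package:

# cp.tab <- rt.a1$cptable                     # columns: CP, nsplit, rel error, xerror, xstd
# best   <- which.min(cp.tab[, "xerror"])     # sub-tree with the lowest cross-validated error
# thr    <- cp.tab[best, "xerror"] + cp.tab[best, "xstd"]  # 1-SE threshold
# chosen <- which(cp.tab[, "xerror"] <= thr)[1]            # smallest sub-tree within the threshold
# cp.tab[chosen, c("CP", "nsplit")]           # with the table above, this picks the sub-tree with 2 splits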

There is a function rpartXse() in the package “DMwR” that automates this grow-and-prune process; it takes an se argument (defaulting to 1) that controls how far from the lowest cross-validated error the selected sub-tree may be:

(rt.a1 <- rpartXse(a1 ~ ., data = algae[, 1:12]))
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 200 90693.870 16.863000  
##   2) oPO4>=27.125 124 11273.000  6.335484 *
##   3) oPO4< 27.125 76 43255.740 34.039470  
##     6) Chla>=4.15 18  3838.545 13.883330 *
##     7) Chla< 4.15 58 29834.830 40.294830 *

It is also possible to prune a tree interactively with the function snip.rpart(), either by indicating the numbers of the nodes at which you want to prune the tree, or by using snip.rpart() graphically on a plotted tree.

# first option:
first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
my.tree <- snip.rpart(first.tree, c(4, 7))
my.tree
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 200 90693.870 16.863000  
##    2) oPO4>=27.125 124 11273.000  6.335484  
##      4) oPO4>=51.118 87  1836.257  3.319540 *
##      5) oPO4< 51.118 37  6784.673 13.427030  
##       10) NH4< 124.6665 21  2831.818  9.090476 *
##       11) NH4>=124.6665 16  3039.604 19.118750 *
##    3) oPO4< 27.125 76 43255.740 34.039470  
##      6) Chla>=4.15 18  3838.545 13.883330 *
##      7) Chla< 4.15 58 29834.830 40.294830 *
# # second option (not active in presentation):
# prettyTree(first.tree)
# snip.rpart(first.tree)

Model Evaluation and Selection

Now, I have two prediction models. The obvious question is which one to use for obtaining the predictions for the seven algae of the 140 test samples. I will evaluate and compare the predictive performance of the two models.

Obtain predictions

lm.predictions.a1 <- predict(final.lm, clean.algae)
rt.predictions.a1 <- predict(rt.a1, algae)

Calculate MAE, MSE, NMSE

# MAE:
(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[, "a1"])))
## [1] 12.46743
(mae.a1.rt <- mean(abs(rt.predictions.a1 - algae[, "a1"])))
## [1] 10.73599
# MSE:
(mse.a1.lm <- mean((lm.predictions.a1 - algae[, "a1"])^2))
## [1] 269.5209
(mse.a1.rt <- mean((rt.predictions.a1 - algae[, "a1"])^2))
## [1] 224.7319
# NMSE:
(nmse.a1.lm <- mean((lm.predictions.a1-algae[,'a1'])^2)/
  mean((mean(algae[,'a1'])-algae[,'a1'])^2))
## [1] 0.5943531
(nmse.a1.rt <- mean((rt.predictions.a1-algae[,'a1'])^2)/
  mean((mean(algae[,'a1'])-algae[,'a1'])^2))
## [1] 0.4955834
# all together:
regr.eval(algae[, "a1"], lm.predictions.a1, train.y = algae[,"a1"])
##         mae         mse        rmse        mape        nmse        nmae 
##  12.4674296 269.5208958  16.4170916         Inf   0.5943531   0.7530437
regr.eval(algae[, "a1"], rt.predictions.a1, train.y = algae[,"a1"])
##         mae         mse        rmse        mape        nmse        nmae 
##  10.7359946 224.7318866  14.9910602         Inf   0.4955834   0.6484635

Visualization of the predictions of the models:

old.par <- par(mfrow = c(1, 2))
plot(lm.predictions.a1, algae[, "a1"], main = "Linear Model",
     xlab = "Predictions", ylab = "True Values")
abline(0, 1, lty = 2)
plot(rt.predictions.a1, algae[, "a1"], main = "Regression Tree",
     xlab = "Predictions", ylab = "True Values")
abline(0, 1, lty = 2)

par(old.par)

Both models have rather poor performance in several cases. If they made correct predictions for all cases, all the circles in the plots would lie on the dashed lines; the plots show that this is clearly not the case.

Looking at the predictions of the linear model, I can see that this model predicts negative algae frequencies for some cases. In this application domain, it makes no sense to say that the occurrence of an alga in a water sample is negative (at most, it can be zero). As such, I can truncate the negative predictions at zero as a simple way of improving the linear model’s performance:

sensible.lm.predictions.a1 <- ifelse(lm.predictions.a1 < 0, 
                                     0, 
                                     lm.predictions.a1)

# compare the performance of the improved and initial models:
regr.eval(algae[, "a1"], sensible.lm.predictions.a1, stats = c("mae", "mse")) #improved
##       mae       mse 
##  11.94207 262.81421
regr.eval(algae[, "a1"], lm.predictions.a1, stats = c("mae", "mse")) # initial
##       mae       mse 
##  12.46743 269.52090

The results show that this small adjustment improved the performance of the model (both MAE and MSE decreased).

In the package “DMwR” there is the function experimentalComparison(), which is designed to help in the model selection/comparison tasks. It can be used with different estimation methods, including cross-validation.

# 1. defining functions that will carry out 
# the learning and testing phase of the models
cv.rpart <- function(form,train,test,...) {
  m <- rpartXse(form,train,...)
  p <- predict(m,test)
  mse <- mean((p-resp(form,test))^2)
  c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))
  }

cv.lm <- function(form,train,test,...) {
  m <- lm(form,train,...)
  p <- predict(m,test)
  p <- ifelse(p < 0,0,p)
  mse <- mean((p-resp(form,test))^2)
  c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))
}

# 2. cross-validation comparison.
# The experiment includes one variant of linear regression
# and three variants of regression trees (with 3 different se).
# cvSettings defines the number of repetitions of the CV process (3),
# the number of folds k (10), 
# seed for the random number generator (1234)
res <- experimentalComparison(
  c(dataset(a1 ~ .,clean.algae[,1:12],'a1')),
  c(variants('cv.lm'),
    variants('cv.rpart',se=c(0,0.5,1))),
  cvSettings(3,10,1234))
## 
## 
## #####  CROSS VALIDATION  EXPERIMENTAL COMPARISON #####
## 
## ** DATASET :: a1
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
summary(res)
## 
## == Summary of a  Cross Validation  Experiment ==
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## 
## * Data sets ::  a1
## * Learners  ::  cv.lm.v1, cv.rpart.v1, cv.rpart.v2, cv.rpart.v3
## 
## * Summary of Experiment Results:
## 
## 
## -> Datataset:  a1 
## 
##  *Learner: cv.lm.v1 
##              nmse
## avg     0.6740195
## std     0.1943386
## min     0.4469701
## max     1.3202826
## invalid 0.0000000
## 
##  *Learner: cv.rpart.v1 
##              nmse
## avg     0.7040149
## std     0.3276007
## min     0.2694526
## max     1.6241139
## invalid 0.0000000
## 
##  *Learner: cv.rpart.v2 
##              nmse
## avg     0.7048697
## std     0.2841595
## min     0.2934157
## max     1.6241139
## invalid 0.0000000
## 
##  *Learner: cv.rpart.v3 
##              nmse
## avg     0.7200008
## std     0.2756571
## min     0.2934157
## max     1.6241139
## invalid 0.0000000
plot(res)

In order to know the specific parameter settings corresponding to a specific model:

getVariant("cv.rpart.v1", res)
## 
## Learner::  "cv.rpart" 
## 
## Parameter values
##   se  =  0

I will carry out a similar comparative experiment for all seven prediction tasks at the same time:

DSs <- sapply(names(clean.algae)[12:18],
              function(x,names.attrs) {
                f <- as.formula(paste(x,"~ ."))
                dataset(f,clean.algae[,c(names.attrs,x)],x)
                },
              names(clean.algae)[1:11])

res.all <- experimentalComparison(
  DSs,
  c(variants('cv.lm'),
    variants('cv.rpart',se=c(0,0.5,1))),
  cvSettings(3,10,1234))
## 
## 
## #####  CROSS VALIDATION  EXPERIMENTAL COMPARISON #####
## 
## ** DATASET :: a1
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a2
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a3
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a4
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a5
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a6
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a7
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
plot(res.all)

There are several very poor results, i.e., NMSE scores clearly above 1. An NMSE of 1 is the baseline obtained by always predicting the average target value for all test cases, so scores above 1 are worse than this trivial predictor.
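
As a quick sanity check of that baseline: the NMSE of a model that always predicts the training mean is exactly 1 by construction, since numerator and denominator coincide. A small sketch on the a1 column (not run here):

# baseline.preds <- rep(mean(algae[,'a1']), nrow(algae))   # always predict the mean of a1
# mean((baseline.preds - algae[,'a1'])^2) /
#   mean((mean(algae[,'a1']) - algae[,'a1'])^2)             # identical numerator and denominator, so the ratio is 1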

In order to check which is the best model for each problem, we can use the function bestScores() from the “DMwR” package:

bestScores(res.all)
## $a1
##        system     score
## nmse cv.lm.v1 0.6740195
## 
## $a2
##        system     score
## nmse cv.lm.v1 0.7738637
## 
## $a3
##           system score
## nmse cv.rpart.v3     1
## 
## $a4
##           system score
## nmse cv.rpart.v1     1
## 
## $a5
##        system  score
## nmse cv.lm.v1 0.9692
## 
## $a6
##        system     score
## nmse cv.lm.v1 0.9680376
## 
## $a7
##           system score
## nmse cv.rpart.v2     1

The output of this function confirms that, with the exception of alga 1, the results are rather disappointing. The variability of the results is a good indication that this problem might be a good candidate for an ensemble approach. I will use the function randomForest() from the library “randomForest”, which obtains predictions for regression tasks by averaging the predictions of the trees in the ensemble.
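
For reference, a single random forest can also be fitted directly, outside the cross-validation wrapper. A minimal sketch (not run here); ntree = 500 is an arbitrary choice and the exact figures depend on the random seed:

# rf.a1 <- randomForest(a1 ~ ., data = clean.algae[, 1:12], ntree = 500, importance = TRUE)
# importance(rf.a1)            # variable importance scores for predicting a1
# predict(rf.a1, clean.algae)  # fitted values for the training samples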

The following code repeats the previous cross-validation experiment, this time including three variants of random forests, each with a different number of trees in the ensemble.

cv.rf <- function(form,train,test,...) {
  m <- randomForest(form,train,...)
  p <- predict(m,test)
  mse <- mean((p-resp(form,test))^2)
  c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))
  }

res.all <- experimentalComparison(
  DSs,
  c(variants('cv.lm'),
    variants('cv.rpart',se=c(0,0.5,1)),
    variants('cv.rf',ntree=c(200,500,700))
    ),
  cvSettings(3,10,1234))
## 
## 
## #####  CROSS VALIDATION  EXPERIMENTAL COMPARISON #####
## 
## ** DATASET :: a1
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a2
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a3
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a4
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a5
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a6
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ** DATASET :: a7
## 
## ++ LEARNER :: cv.lm  variant ->  cv.lm.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rpart  variant ->  cv.rpart.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v1 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v2 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
## 
## 
## ++ LEARNER :: cv.rf  variant ->  cv.rf.v3 
## 
##  3 x 10 - Fold Cross Validation run with seed =  1234 
## Repetition  1 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  2 
## Fold:  1  2  3  4  5  6  7  8  9  10
## Repetition  3 
## Fold:  1  2  3  4  5  6  7  8  9  10
bestScores(res.all)
## $a1
##        system     score
## nmse cv.rf.v3 0.5172045
## 
## $a2
##        system     score
## nmse cv.rf.v2 0.7553715
## 
## $a3
##           system score
## nmse cv.rpart.v3     1
## 
## $a4
##           system score
## nmse cv.rpart.v1     1
## 
## $a5
##        system     score
## nmse cv.rf.v1 0.8148144
## 
## $a6
##        system    score
## nmse cv.rf.v2 0.894367
## 
## $a7
##           system score
## nmse cv.rpart.v2     1

The results show the advantage of the ensemble approach: for algae 1, 2, 5, and 6 the best score is obtained by some variant of a random forest. Still, the results are not always very good; for algae 3, 4, and 7 the best NMSE is 1, meaning no model does better than simply predicting the mean. In addition, the output of bestScores() does not tell us whether the difference between the scores of these best models and the remaining alternatives is statistically significant. The function compAnalysis() in the package “DMwR” provides this information.

compAnalysis(res.all,against='cv.rf.v3',
             datasets=c('a1','a2','a4','a6'))
## 
## == Statistical Significance Analysis of Comparison Results ==
## 
## Baseline Learner::    cv.rf.v3  (Learn.1)
## 
## ** Evaluation Metric::    nmse 
## 
## - Dataset: a1 
##       Learn.1   Learn.2 sig.2   Learn.3 sig.3   Learn.4 sig.4   Learn.5
## AVG 0.5172045 0.6740195    ++ 0.7040149    ++ 0.7048697    ++ 0.7200008
## STD 0.2275834 0.1943386       0.3276007       0.2841595       0.2756571
##     sig.5   Learn.6 sig.6   Learn.7 sig.7
## AVG    ++ 0.5201657       0.5193789      
## STD       0.2319076       0.2288161      
## 
## - Dataset: a2 
##       Learn.1   Learn.2 sig.2   Learn.3 sig.3    Learn.4 sig.4
## AVG 0.7566567 0.7738637       1.0028767    ++ 0.99787737    ++
## STD 0.1706870 0.2055161       0.1570531       0.02030462      
##          Learn.5 sig.5   Learn.6 sig.6   Learn.7 sig.7
## AVG 1.000000e+00    ++ 0.7634266       0.7553715      
## STD 3.220373e-16       0.1778386       0.1610061      
## 
## - Dataset: a4 
##      Learn.1   Learn.2 sig.2      Learn.3 sig.3      Learn.4 sig.4
## AVG 1.416682 1.2777027       1.000000e+00       1.000000e+00      
## STD 1.193849 0.7680887       1.147868e-16       1.147868e-16      
##          Learn.5 sig.5  Learn.6 sig.6  Learn.7 sig.7
## AVG 1.000000e+00       1.416414       1.425637      
## STD 1.147868e-16       1.126784       1.196226      
## 
## - Dataset: a6 
##       Learn.1   Learn.2 sig.2    Learn.3 sig.3      Learn.4 sig.4
## AVG 0.9002171 0.9680376       1.00962926       1.000000e+00      
## STD 0.2976772 0.4216282       0.05274165       1.457794e-16      
##          Learn.5 sig.5   Learn.6 sig.6   Learn.7 sig.7
## AVG 1.000000e+00       0.9018138       0.8943670      
## STD 1.457794e-16       0.3139281       0.2803978      
## 
## Legends:
## Learners -> Learn.1 = cv.rf.v3 ; Learn.2 = cv.lm.v1 ; Learn.3 = cv.rpart.v1 ; Learn.4 = cv.rpart.v2 ; Learn.5 = cv.rpart.v3 ; Learn.6 = cv.rf.v1 ; Learn.7 = cv.rf.v2 ; 
## Signif. Codes -> 0 '++' or '--' 0.001 '+' or '-' 0.05 ' ' 1

The columns “sig.X” provide the information on statistical significance. An empty cell means that the observed difference between the respective model and cv.rf.v3 is not statistically significant at the 95% confidence level. Plus signs mean that the model's average evaluation metric is significantly higher than that of cv.rf.v3, which is bad here because lower NMSE scores are better. Minus signs represent the opposite.

The results of compAnalysis() show that the differences between this variant of random forests and the other random forest variants are not statistically significant. With respect to the other models, the advantage of this variant is significant for algae 1 and 2 (mainly over the regression trees), but not for algae 4 and 6, where no alternative is significantly different.
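
The call above covered algae 1, 2, 4, and 6 only. The same kind of check could be run for the remaining algae against the variant that scored best there; for example, for alga 5 the baseline would be cv.rf.v1 (an illustrative call, output omitted):

compAnalysis(res.all,against='cv.rf.v1',
             datasets=c('a5'))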

In summary, the cross-validation process indicated the following models as the “best” for each alga: cv.rf.v3 (a1), cv.rf.v2 (a2 and a6), cv.rpart.v3 (a3), cv.rpart.v1 (a4), cv.rf.v1 (a5), and cv.rpart.v2 (a7).

Predictions for the Seven Algae

I start by re-learning the best model for each alga using all the available training data, so that I can apply these models to the test set later.

# names of the winning variant for each alga (taken from bestScores())
bestModelsNames <- sapply(bestScores(res.all),
                          function(x) x['nmse','system'])

# map the variant prefix ('rf'/'rpart') to the corresponding learning function
learners <- c(rf='randomForest',rpart='rpartXse')

funcs <- learners[sapply(strsplit(bestModelsNames,'\\.'),
                         function(x) x[2])]

# parameter settings of each winning variant
parSetts <- lapply(bestModelsNames,
                   function(x) getVariant(x,res.all)@pars)

bestModels <- list()

# re-learn each best model on the full (cleaned) training set
for(a in 1:7) {
  form <- as.formula(paste(names(clean.algae)[11+a],'~ .'))
  bestModels[[a]] <- do.call(funcs[a],
                        c(list(form,clean.algae[,c(1:11,11+a)]),parSetts[[a]]))
}
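
As a quick check, the selected learning functions and parameter settings can be inspected directly (these objects were created above; output omitted):

funcs      # which learning function is used for each of the seven algae
parSetts   # the parameter settings of each winning variant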

Now I have a list with seven models, one for each alga, ready for making predictions on the test set.

The data frame test.algae is a test set containing the 140 test samples. I fill in its NA values with knnImputation(), using the training data as the reference set (distData), and recode the size and speed factor levels so they match the level names used in the training data.

clean.test.algae <- knnImputation(test.algae, 
                                  k = 10, 
                                  distData = algae[,1:11])
clean.test.algae$size <- revalue(clean.test.algae$size, c("large"="large_", "small"="small_"))
clean.test.algae$speed <- revalue(clean.test.algae$speed, c("low"="low___", "high"="high__"))
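
Before predicting, it is worth confirming that no missing values remain and that the recoded factor levels now match those of the training data (a simple check on the objects created above):

sum(is.na(clean.test.algae))   # expected to be 0 after imputation
levels(clean.test.algae$size)
levels(clean.test.algae$speed)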

Now, I obtain the matrix with the predictions for the entire test set:

preds <- matrix(ncol=7,nrow=140)

for(i in 1:nrow(clean.test.algae)){
  preds[i,] <- sapply(1:7,
                      function(x) predict(bestModels[[x]],clean.test.algae[i,]))
  }
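
The same matrix could also be obtained without the explicit row-by-row loop, by predicting over the whole test set one model at a time (an equivalent alternative, not used below):

preds.alt <- sapply(1:7,
                    function(x) predict(bestModels[[x]],clean.test.algae))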

Now I can compare these predictions with the true values to get some feedback on the quality of my approach to this prediction problem. The true values for the test set are contained in the algae.sols data frame. I calculate the NMSE score for each of the seven models.

avg.preds <- apply(algae[,12:18],2,mean)
apply(((algae.sols-preds)^2),2,mean) /
  apply( (scale(algae.sols,avg.preds,F)^2),2,mean)
##        a1        a2        a3        a4        a5        a6        a7 
## 0.4944927 0.8861564 1.0000000 1.0000000 0.7792627 0.9333836        NA
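
The expression above computes the normalized mean squared error: the model's MSE divided by the MSE of a baseline that always predicts the training-set mean of the respective alga. The same computation can be wrapped in a small helper (a sketch; nmse() is not part of DMwR):

# NMSE = mean((true - pred)^2) / mean((true - training mean)^2)
nmse <- function(trues, preds, train.mean)
  mean((trues - preds)^2) / mean((trues - train.mean)^2)

sapply(1:7, function(x) nmse(algae.sols[,x], preds[,x], avg.preds[x]))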

These results are in accordance with the cross-validation estimates obtained previously. They confirm the difficulty of alga 7, for which no valid NMSE is even produced here, and algae 3 and 4 again score exactly 1, i.e. no better than predicting the training mean, while the remaining problems, in particular alga 1, show more competitive results.