library(DMwR) # library including the dataset
## Warning: package 'DMwR' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: grid
library(car)
library(lattice)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.2.3
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(rpart) # regression trees
library(rattle)
## Warning: package 'rattle' was built under R version 3.2.3
## Rattle: A free graphical interface for data mining with R.
## Version 4.0.5 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.2.3
library(RColorBrewer)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:Hmisc':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(plyr)
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
##
## The following object is masked from 'package:DMwR':
##
## join
The dataset was downloaded from https://archive.ics.uci.edu/ml/datasets/Coil+1999+Competition+Data .
algae <- read.csv('analysis.data',
header=F,
dec='.',
col.names=c('season','size','speed','mxPH','mnO2','Cl',
'NO3','NH4','oPO4','PO4','Chla','a1','a2','a3','a4',
'a5','a6','a7'),
na.strings=c('XXXXXXX'))
# convert NO3 to numeric via its character representation
# (as.numeric() applied directly to a factor would return the level codes, not the values)
algae$NO3 <- as.numeric(as.character(algae$NO3))
summary(algae)
## season size speed mxPH mnO2
## autumn:40 large_:45 high__:84 Min. :5.600 Min. : 1.500
## spring:53 medium:84 low___:33 1st Qu.:7.700 1st Qu.: 7.725
## summer:45 small_:71 medium:83 Median :8.060 Median : 9.800
## winter:62 Mean :8.012 Mean : 9.118
## 3rd Qu.:8.400 3rd Qu.:10.800
## Max. :9.700 Max. :13.400
## NA's :1 NA's :2
## Cl NO3 NH4 oPO4
## Min. : 0.222 Min. : 1.00 Min. : 5.00 Min. : 1.00
## 1st Qu.: 10.981 1st Qu.: 48.25 1st Qu.: 35.62 1st Qu.: 16.00
## Median : 32.730 Median : 96.50 Median : 99.67 Median : 41.40
## Mean : 43.636 Mean : 95.86 Mean :154.45 Mean : 83.33
## 3rd Qu.: 57.824 3rd Qu.:143.75 3rd Qu.:203.73 3rd Qu.:102.25
## Max. :391.500 Max. :192.00 Max. :931.83 Max. :771.60
## NA's :10 NA's :2 NA's :2 NA's :2
## PO4 Chla a1 a2
## Min. : 0.90 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 19.39 1st Qu.: 2.00 1st Qu.: 1.475 1st Qu.: 0.000
## Median : 84.50 Median : 5.20 Median : 7.400 Median : 2.100
## Mean :111.55 Mean : 13.54 Mean :16.863 Mean : 6.934
## 3rd Qu.:182.16 3rd Qu.: 18.30 3rd Qu.:24.075 3rd Qu.: 9.075
## Max. :558.75 Max. :110.46 Max. :89.800 Max. :72.600
## NA's :2 NA's :12
## a3 a4 a5 a6
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 1.750 Median : 0.000 Median : 1.90 Median : 0.000
## Mean : 4.729 Mean : 1.885 Mean : 5.63 Mean : 5.199
## 3rd Qu.: 6.150 3rd Qu.: 2.225 3rd Qu.: 7.70 3rd Qu.: 6.725
## Max. :44.600 Max. :35.600 Max. :77.60 Max. :52.500
##
## a7
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 2.506
## 3rd Qu.: 2.400
## Max. :31.600
## NA's :17
str(algae)
## 'data.frame': 200 obs. of 18 variables:
## $ season: Factor w/ 4 levels "autumn","spring",..: 4 2 1 2 1 4 3 1 4 4 ...
## $ size : Factor w/ 3 levels "large_","medium",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ speed : Factor w/ 3 levels "high__","low___",..: 3 3 3 3 3 1 1 1 3 1 ...
## $ mxPH : num 8 8.35 8.1 8.07 8.06 8.25 8.15 8.05 8.7 7.93 ...
## $ mnO2 : num 9.8 8 11.4 4.8 9 13.1 10.3 10.6 3.4 9.9 ...
## $ Cl : num 60.8 57.8 40 77.4 55.4 ...
## $ NO3 : num 177 48 165 86 80 190 64 158 32 53 ...
## $ NH4 : num 578 370 346.7 98.2 233.7 ...
## $ oPO4 : num 105 428.8 125.7 61.2 58.2 ...
## $ PO4 : num 170 558.8 187.1 138.7 97.6 ...
## $ Chla : num 50 1.3 15.6 1.4 10.5 ...
## $ a1 : num 0 1.4 3.3 3.1 9.2 15.1 2.4 18.2 25.4 17 ...
## $ a2 : num 0 7.6 53.6 41 2.9 14.6 1.2 1.6 5.4 0 ...
## $ a3 : num 0 4.8 1.9 18.9 7.5 1.4 3.2 0 2.5 0 ...
## $ a4 : num 0 1.9 0 0 0 0 3.9 0 0 2.9 ...
## $ a5 : num 34.2 6.7 0 1.4 7.5 22.5 5.8 5.5 0 0 ...
## $ a6 : num 8.3 0 0 0 4.1 12.6 6.8 8.7 0 0 ...
## $ a7 : num 0 2.1 9.7 1.4 1 2.9 0 0 0 1.7 ...
#hist(algae$mxPH, prob = T)
par(mfrow=c(1,2)) # get two graphs side by side
hist(algae$mxPH, prob=T, xlab='',
main='Histogram of maximum pH value',ylim=0:1)
lines(density(algae$mxPH,na.rm=T))
rug(jitter(algae$mxPH))
qqPlot(algae$mxPH, main='Normal QQ plot of maximum pH') # qq.plot() is deprecated in 'car'
par(mfrow=c(1,1))
The QQ plot shows several low values of mxPH falling outside the 95% confidence envelope, which clearly breaks the assumption of a normal distribution.
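As a numeric complement to the visual inspection (my own addition, not part of the original analysis), a Shapiro-Wilk test can be applied to mxPH:
# hedged addition: a small p-value would support rejecting normality of mxPH
shapiro.test(algae$mxPH)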
The next plot is for the variable oPO4:
boxplot(algae$oPO4, ylab = "Orthophosphate (oPO4)")
rug(jitter(algae$oPO4), side = 2)
abline(h = mean(algae$oPO4, na.rm = T), lty = 2)
Here, the horizontal dashed line marks the mean value of the variable. The distribution of the observed oPO4 values is clearly concentrated on low values, i.e., positively skewed. In most water samples the value of oPO4 is low, but there are several observations with high, and even extremely high, values.
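The positive skew can also be checked numerically; a small sketch of my own, comparing the mean with the median and the upper quantiles:
# hedged addition: a mean well above the median confirms the positive skew of oPO4
mean(algae$oPO4, na.rm = TRUE)
median(algae$oPO4, na.rm = TRUE)
quantile(algae$oPO4, probs = c(0.5, 0.9, 0.99), na.rm = TRUE)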
Now, let’s plot the values of variable NH4.
plot(algae$NH4, xlab = "")
abline(h = mean(algae$NH4, na.rm = T), lty = 1)
abline(h = mean(algae$NH4, na.rm = T) + sd(algae$NH4, na.rm = T),
lty = 2)
abline(h = median(algae$NH4, na.rm = T), lty = 3)
# Below - interactive plots that do not work in a presentation format.
# identify(algae$NH4)
#
# plot(algae$NH4, xlab = "")
# clicked.lines <- identify(algae$NH4)
# algae[clicked.lines, ]
The plot shows the outliers have NH4 > 800, so I will inspect these cases:
algae[!is.na(algae$NH4) & algae$NH4 > 800, ]
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla
## 86 winter medium medium 7.6 6.0 53.000 126 914.000 137.6 254.600 4.3
## 108 autumn medium high__ 8.3 11.1 41.500 149 931.833 39.0 124.200 13.1
## 144 winter medium low___ 8.0 4.5 79.077 188 920.000 70.0 200.231 19.4
## a1 a2 a3 a4 a5 a6 a7
## 86 0.0 0.0 0.0 4.6 9.0 13.1 30.1
## 108 23.7 13.7 0.0 1.7 6.4 2.6 0.0
## 144 2.5 1.4 1.4 6.2 4.1 1.8 3.9
bwplot(size ~ a1, data=algae, ylab='River Size',xlab='Algal A1')
bwplot(size ~ a1, data=algae,panel=panel.bpplot,
probs=seq(.01,.49,by=.01), datadensity=TRUE,
ylab='River Size',xlab='Algal A1')
The first plot (a simple boxplot) shows that higher frequencies of algal a1 are expected in smaller rivers. The second plot (a box percentile plot) confirms the previous observation that smaller rivers have higher frequencies of this alga, but it also shows that the value of the observed frequencies for these small rivers is much more widespread across the domain of frequencies than for other types of rivers.
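As a quick numeric counterpart to these plots (my own addition), the median frequency of alga a1 per river size can be computed directly:
# hedged addition: median a1 frequency for each river size
# (small rivers are expected to show the highest median)
tapply(algae$a1, algae$size, median)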
Let’s now see how the frequency of alga a3 depends on season and mnO2.
# create a factorized version of continuous mnO2 variable
# number sets the number of desired bins
# overlap sets the overlap between the bins near their respective boundaries
# thus certain observations will be assigned to adjacent bins:
minO2 <- equal.count(na.omit(algae$mnO2),
number=4,overlap=1/5)
# create the plot:
stripplot(season ~ a3|minO2,
data=algae[!is.na(algae$mnO2),])
The bottom-left plot corresponds to lower values of mnO2.
Here, I will show commented-out code for several different ways of dealing with unknown values. Only one part of the code will be uncommented and active: the approach that I actually use for dealing with unknown values in this dataset.
Before dealing with unknown values, I will count them.
nrow(algae[!complete.cases(algae),]) # number of rows with NA
## [1] 33
apply(algae, 1, function(x) sum(is.na(x))) # number of NA in each row
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1
## [36] 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 2 2 2 2 2 6 1 0 0 0 0 0 1 1
## [71] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [106] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [141] 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0
## [176] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0
manyNAs(algae, 0.2) # gives the numbers of the rows that have more than 20% NA values
## [1] 62 199
In order to remove water samples with NA values from the data frame, simply do:
# algae <- na.omit(algae) # remove all the rows with NA cases
# algae <- algae[-c(62, 199), ] # remove selected observations
# algae <- algae[-manyNAs(algae), ] # same as above, if you do not know the row numbers
An alternative to eliminating the cases with unknown values is to fill in each unknown with its most probable value.
For approximately normal distributions, the mean is the best choice; for skewed distributions or in the presence of outliers, the median is a better choice.
# algae[48, "mxPH"] <- mean(algae$mxPH, na.rm = T) # fill NA of mxPH with mean
# algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = T) # filling NA with median
The function centralImputation() of the package “DMwR” fills in all unknowns in a dataset using a statistic of centrality. This function uses the median for numeric columns and uses the most frequent value (mode) for nominal variables.
# algae <- centralImputation(algae)
An alternative for getting less biased estimators of the unknown values is to explore the relationships between variables.
# cor(algae[, 4:18], use = "complete.obs") # get the matrix of variables correlation
symnum(cor(algae[,4:18],use="complete.obs"))
## mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1
## mnO2 1
## Cl 1
## NO3 . 1
## NH4 . 1
## oPO4 . . 1
## PO4 . . . . + 1
## Chla . . 1
## a1 . . . . 1
## a2 . . 1
## a3 . 1
## a4 . . 1
## a5 . 1
## a6 . . 1
## a7 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
The correlation matrix shows that the only strong correlation is between PO4 and oPO4 (above 0.8), which allows me to fill in the unknowns in one of these variables from the other. To achieve this, I will find the form of the linear relationship between them:
lm(PO4 ~ oPO4, data = algae)
##
## Call:
## lm(formula = PO4 ~ oPO4, data = algae)
##
## Coefficients:
## (Intercept) oPO4
## 91.0375 0.2509
Now, I could use the discovered linear relationship to create a function that returns the value of PO4 given the value of oPO4, and then apply this function to all unknown values of PO4:
# fillPO4 <- function(oP) {
#   if (is.na(oP))
#     return(NA)
#   else
#     return(91.0375 + 0.2509 * oP) # coefficients taken from the lm() fit above
# }
# algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4), "oPO4"], fillPO4)
Instead of exploring the correlation between the columns (variables) of a dataset, we can try to use the similarities between the rows (observations) to fill in the unknown values.
A common choice of similarity measure is the Euclidean distance. It is used by the function knnImputation() available in the package “DMwR”, whose argument k is the number of nearest neighbours to use.
clean.algae <- knnImputation(algae, k = 10)
nrow(algae[!complete.cases(algae),])
## [1] 33
The imputed values are stored in the new data frame clean.algae; the original algae data frame is left untouched, which is why the check above still reports 33 incomplete rows. After the kNN imputation, clean.algae itself should contain no NA values.
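A direct check on the imputed data frame (no output shown here; the expected count is zero):
# verify that the imputed data frame has no remaining incomplete rows
nrow(clean.algae[!complete.cases(clean.algae), ])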
The main goal of this case study is to obtain predictions for the frequency values of the seven algae in a set of 140 water samples. This is a regression task.
I will explore two different predictive models:
lm.a1 <- lm(a1 ~ ., data = clean.algae[, 1:12]) # predict a1 using all other variables
summary(lm.a1)
##
## Call:
## lm(formula = a1 ~ ., data = clean.algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.743 -11.056 -2.242 7.917 62.621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.077728 23.611308 1.147 0.25295
## seasonspring 3.770756 4.079948 0.924 0.35658
## seasonsummer 0.106507 3.812980 0.028 0.97775
## seasonwinter 4.882924 3.680132 1.327 0.18621
## sizemedium 5.017141 3.525988 1.423 0.15646
## sizesmall_ 10.872536 3.989929 2.725 0.00705 **
## speedlow___ 4.025678 4.533596 0.888 0.37572
## speedmedium 1.622210 3.088080 0.525 0.60000
## mxPH -1.823119 2.653886 -0.687 0.49297
## mnO2 1.481458 0.675167 2.194 0.02947 *
## Cl -0.019811 0.032857 -0.603 0.54729
## NO3 -0.090714 0.029202 -3.106 0.00219 **
## NH4 -0.003584 0.008093 -0.443 0.65840
## oPO4 -0.030759 0.013031 -2.360 0.01930 *
## PO4 -0.043188 0.016346 -2.642 0.00895 **
## Chla -0.116731 0.075785 -1.540 0.12520
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.93 on 184 degrees of freedom
## Multiple R-squared: 0.4185, Adjusted R-squared: 0.3711
## F-statistic: 8.829 on 15 and 184 DF, p-value: 3.36e-15
The proportion of variance explained by this model is not very impressive (a multiple R-squared of about 0.42, or 0.37 adjusted). Still, I can reject the hypothesis that the target variable does not depend on the predictors (the p-value of the F-test is very small). Looking at the significance of the coefficients, I may question the inclusion of some of them in the model. I will explore the backward elimination method for simplifying regression models.
I will start the study of simplifying the linear model using the anova() function.
anova(lm.a1)
## Analysis of Variance Table
##
## Response: a1
## Df Sum Sq Mean Sq F value Pr(>F)
## season 3 215 71.6 0.2499 0.8613482
## size 2 11770 5884.8 20.5323 8.933e-09 ***
## speed 2 4025 2012.6 7.0222 0.0011510 **
## mxPH 1 1016 1016.1 3.5452 0.0612974 .
## mnO2 1 2661 2661.2 9.2852 0.0026500 **
## Cl 1 4319 4318.9 15.0690 0.0001443 ***
## NO3 1 8616 8616.3 30.0630 1.366e-07 ***
## NH4 1 441 440.9 1.5384 0.2164367
## oPO4 1 1426 1426.2 4.9761 0.0269074 *
## PO4 1 2788 2788.3 9.7286 0.0021063 **
## Chla 1 680 680.0 2.3725 0.1252037
## Residuals 184 52736 286.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The results indicate that season is the variable that contributes least to the reduction of the fitting error of the model: its sum of squares (Sum Sq) is by far the smallest. Let’s remove it from the model:
lm2.a1 <- update(lm.a1, . ~ . - season)
summary(lm2.a1)
##
## Call:
## lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 +
## oPO4 + PO4 + Chla, data = clean.algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.615 -11.237 -2.685 7.465 63.857
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.343420 22.639819 1.252 0.21216
## sizemedium 5.002188 3.521347 1.421 0.15712
## sizesmall_ 11.626057 3.956956 2.938 0.00372 **
## speedlow___ 3.110949 4.464468 0.697 0.48678
## speedmedium 1.030734 3.051908 0.338 0.73594
## mxPH -1.335101 2.606168 -0.512 0.60906
## mnO2 1.195072 0.636049 1.879 0.06181 .
## Cl -0.017843 0.032700 -0.546 0.58596
## NO3 -0.093062 0.028732 -3.239 0.00142 **
## NH4 -0.003897 0.007924 -0.492 0.62347
## oPO4 -0.031115 0.012954 -2.402 0.01728 *
## PO4 -0.039892 0.015936 -2.503 0.01316 *
## Chla -0.119554 0.075705 -1.579 0.11598
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.92 on 187 degrees of freedom
## Multiple R-squared: 0.4096, Adjusted R-squared: 0.3718
## F-statistic: 10.81 on 12 and 187 DF, p-value: 3.451e-16
anova(lm.a1,lm2.a1)
## Analysis of Variance Table
##
## Model 1: a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
## PO4 + Chla
## Model 2: a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
## Chla
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 184 52736
## 2 187 53541 -3 -804.97 0.9362 0.4243
The fit changed little (the adjusted R-squared moved from 0.371 to 0.372) and is still not impressive. The anova comparison shows that the difference between the two models is not significant: Pr(>F) is 0.42, well above 0.05. However, the second model is simpler than the first one.
To continue the backward elimination of unimportant coefficients and simplify the model further, I use the function step(), which performs an AIC-based stepwise search (effectively backward elimination here):
final.lm <- step(lm.a1)
## Start: AIC=1146.95
## a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
## PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - speed 2 227.83 52964 1143.8
## - season 3 804.97 53541 1144.0
## - NH4 1 56.21 52792 1145.2
## - Cl 1 104.19 52840 1145.3
## - mxPH 1 135.26 52871 1145.5
## <none> 52736 1147.0
## - Chla 1 679.99 53416 1147.5
## - mnO2 1 1379.90 54116 1150.1
## - oPO4 1 1596.86 54333 1150.9
## - size 2 2177.48 54914 1151.0
## - PO4 1 2000.67 54737 1152.4
## - NO3 1 2765.73 55502 1155.2
##
## Step: AIC=1143.81
## a1 ~ season + size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
## Chla
##
## Df Sum of Sq RSS AIC
## - season 3 716.23 53680 1140.5
## - NH4 1 38.27 53002 1142.0
## - Cl 1 84.82 53049 1142.1
## - mxPH 1 169.97 53134 1142.5
## - Chla 1 516.62 53481 1143.8
## <none> 52964 1143.8
## - mnO2 1 1216.90 54181 1146.3
## - size 2 1950.77 54915 1147.0
## - oPO4 1 1515.17 54479 1147.5
## - PO4 1 1872.60 54837 1148.8
## - NO3 1 2883.01 55847 1152.4
##
## Step: AIC=1140.5
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - NH4 1 48.90 53729 1138.7
## - Cl 1 76.78 53757 1138.8
## - mxPH 1 95.67 53776 1138.8
## <none> 53680 1140.5
## - Chla 1 590.23 54271 1140.7
## - mnO2 1 931.60 54612 1141.9
## - oPO4 1 1606.70 55287 1144.4
## - PO4 1 1720.76 55401 1144.8
## - size 2 2461.83 56142 1145.5
## - NO3 1 3093.31 56774 1149.7
##
## Step: AIC=1138.68
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + oPO4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - Cl 1 74.5 53804 1137.0
## - mxPH 1 86.6 53816 1137.0
## <none> 53729 1138.7
## - Chla 1 595.8 54325 1138.9
## - mnO2 1 995.4 54725 1140.3
## - oPO4 1 1749.5 55479 1143.1
## - PO4 1 1811.7 55541 1143.3
## - size 2 2529.8 56259 1143.9
## - NO3 1 3469.9 57199 1149.2
##
## Step: AIC=1136.96
## a1 ~ size + mxPH + mnO2 + NO3 + oPO4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - mxPH 1 100.5 53904 1135.3
## <none> 53804 1137.0
## - Chla 1 577.0 54381 1137.1
## - mnO2 1 1141.5 54945 1139.2
## - oPO4 1 1871.0 55675 1141.8
## - PO4 1 2032.8 55837 1142.4
## - size 2 2651.3 56455 1142.6
## - NO3 1 4102.2 57906 1149.7
##
## Step: AIC=1135.33
## a1 ~ size + mnO2 + NO3 + oPO4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## <none> 53904 1135.3
## - Chla 1 746.3 54650 1136.1
## - mnO2 1 1102.2 55006 1137.4
## - oPO4 1 1987.6 55892 1140.6
## - PO4 1 2418.2 56322 1142.1
## - size 2 3747.6 57652 1144.8
## - NO3 1 4058.2 57962 1147.8
summary(final.lm)
##
## Call:
## lm(formula = a1 ~ size + mnO2 + NO3 + oPO4 + PO4 + Chla, data = clean.algae[,
## 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.100 -10.847 -3.178 6.987 65.162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.21750 6.38537 2.853 0.004806 **
## sizemedium 4.54614 3.23192 1.407 0.161152
## sizesmall_ 11.83813 3.33811 3.546 0.000491 ***
## mnO2 1.18380 0.59745 1.981 0.048972 *
## NO3 -0.09642 0.02536 -3.802 0.000193 ***
## oPO4 -0.03299 0.01240 -2.661 0.008457 **
## PO4 -0.04351 0.01482 -2.935 0.003744 **
## Chla -0.11090 0.06802 -1.630 0.104663
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.76 on 192 degrees of freedom
## Multiple R-squared: 0.4056, Adjusted R-squared: 0.384
## F-statistic: 18.72 on 7 and 192 DF, p-value: < 2.2e-16
The proportion of variance explained by this model is still not very impressive (about 38%). Such a low proportion is usually a sign that the linearity assumption of the model is inadequate for this domain.
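One common way to probe the linearity assumption (my own addition, assuming final.lm is the model obtained above) is to inspect the standard diagnostic plots of the fitted model:
# hedged addition: residual diagnostics for the final linear model;
# curvature in the residuals-vs-fitted panel would support the inadequacy noted above
par(mfrow = c(2, 2))
plot(final.lm)
par(mfrow = c(1, 1))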
A different type of regression model is a regression tree.
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])
rt.a1
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90693.8700 16.863000
## 2) oPO4>=27.125 124 11273.0000 6.335484
## 4) oPO4>=51.118 87 1836.2570 3.319540 *
## 5) oPO4< 51.118 37 6784.6730 13.427030
## 10) NH4< 124.6665 21 2831.8180 9.090476 *
## 11) NH4>=124.6665 16 3039.6040 19.118750 *
## 3) oPO4< 27.125 76 43255.7400 34.039470
## 6) Chla>=4.15 18 3838.5450 13.883330 *
## 7) Chla< 4.15 58 29834.8300 40.294830
## 14) speed=high__,low___ 47 19839.6100 36.372340
## 28) Cl>=7.4315 23 6941.4460 27.513040
## 56) PO4>=43.818 7 309.5343 10.728570 *
## 57) PO4< 43.818 16 3797.1190 34.856250 *
## 29) Cl< 7.4315 24 9362.9760 44.862500
## 58) mxPH< 7.865 12 4780.1900 35.150000 *
## 59) mxPH>=7.865 12 2318.8030 54.575000 *
## 15) speed=medium 11 6182.3070 57.054550 *
prettyTree(rt.a1)
fancyRpartPlot(rt.a1, cex=0.7)
Trees are usually obtained in two steps. Initially, a large tree is grown, and then this tree is pruned by deleting bottom nodes through a process of statistical estimation. This process has the goal of avoiding overfitting.
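The growth step is controlled by the stopping parameters of rpart(); as an illustrative sketch (the parameter values and the object name big.rt.a1 are arbitrary choices of mine, not part of this analysis), a larger tree can be grown by relaxing them before pruning:
# hedged addition: grow a deliberately larger tree by relaxing the stopping rules
# (cp is the complexity threshold, minsplit the minimum number of cases to try a split)
big.rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12],
                   control = rpart.control(cp = 0.001, minsplit = 10))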
The tree grown by rpart() stopped growing when the relative decrease in deviance from an additional split (controlled by the parameter cp) fell below the default threshold of 0.01. I will check the cp information for the tree produced:
printcp(rt.a1)
##
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
##
## Variables actually used in tree construction:
## [1] Chla Cl mxPH NH4 oPO4 PO4 speed
##
## Root node error: 90694/200 = 453.47
##
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.398760 0 1.00000 1.01305 0.13190
## 2 0.105656 1 0.60124 0.69962 0.11122
## 3 0.042042 2 0.49558 0.67956 0.11341
## 4 0.038979 3 0.45354 0.62910 0.10314
## 5 0.031257 4 0.41456 0.62394 0.10323
## 6 0.029242 5 0.38331 0.59826 0.10157
## 7 0.024963 6 0.35406 0.58485 0.10197
## 8 0.010070 7 0.32910 0.63583 0.10648
## 9 0.010000 8 0.31903 0.66263 0.11145
The fully grown tree is tree number 9, with cp = 0.01 and a relative (resubstitution) error of about 0.32. However, R estimates, using an internal process of ten-fold cross-validation, that this tree has an average relative error of 0.663 +/- 0.111. Using these more reliable estimates of performance, which help avoid the overfitting problem, I will select tree number 8 instead, which has a lower cross-validated error estimate (0.636). I obtain this tree by pruning with the corresponding cp value:
rt2.a1 <- prune(rt.a1, cp = 0.010070)
rt2.a1
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90693.8700 16.863000
## 2) oPO4>=27.125 124 11273.0000 6.335484
## 4) oPO4>=51.118 87 1836.2570 3.319540 *
## 5) oPO4< 51.118 37 6784.6730 13.427030 *
## 3) oPO4< 27.125 76 43255.7400 34.039470
## 6) Chla>=4.15 18 3838.5450 13.883330 *
## 7) Chla< 4.15 58 29834.8300 40.294830
## 14) speed=high__,low___ 47 19839.6100 36.372340
## 28) Cl>=7.4315 23 6941.4460 27.513040
## 56) PO4>=43.818 7 309.5343 10.728570 *
## 57) PO4< 43.818 16 3797.1190 34.856250 *
## 29) Cl< 7.4315 24 9362.9760 44.862500
## 58) mxPH< 7.865 12 4780.1900 35.150000 *
## 59) mxPH>=7.865 12 2318.8030 54.575000 *
## 15) speed=medium 11 6182.3070 57.054550 *
printcp(rt2.a1)
##
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
##
## Variables actually used in tree construction:
## [1] Chla Cl mxPH oPO4 PO4 speed
##
## Root node error: 90694/200 = 453.47
##
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.398760 0 1.00000 1.01305 0.13190
## 2 0.105656 1 0.60124 0.69962 0.11122
## 3 0.042042 2 0.49558 0.67956 0.11341
## 4 0.038979 3 0.45354 0.62910 0.10314
## 5 0.031257 4 0.41456 0.62394 0.10323
## 6 0.029242 5 0.38331 0.59826 0.10157
## 7 0.024963 6 0.35406 0.58485 0.10197
## 8 0.010070 7 0.32910 0.63583 0.10648
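Rather than reading the cp value off the printed table, the same information can be pulled programmatically from the cptable component of the fitted tree. A minimal sketch (the lowest-xerror rule used here is just one possible criterion; the rpartXse() function described next uses an SE-based rule instead):
# hedged addition: select a cp value from the cross-validation estimates in the cptable
cp.tab <- rt.a1$cptable
cp.tab[, c("CP", "nsplit", "xerror", "xstd")]
# prune to the row with the lowest cross-validated error estimate:
prune(rt.a1, cp = cp.tab[which.min(cp.tab[, "xerror"]), "CP"])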
There is a function rpartXse() in the package “DMwR” that automates the process and takes as argument the se value, defaulting to 1:
(rt.a1 <- rpartXse(a1 ~ ., data = algae[, 1:12]))
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90693.870 16.863000
## 2) oPO4>=27.125 124 11273.000 6.335484 *
## 3) oPO4< 27.125 76 43255.740 34.039470
## 6) Chla>=4.15 18 3838.545 13.883330 *
## 7) Chla< 4.15 58 29834.830 40.294830 *
It is also possible to prune a tree interactively with the function snip.rpart(), either by indicating the numbers of the nodes at which to prune the tree, or graphically, by clicking on nodes in a plot of the tree.
# first option:
first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
my.tree <- snip.rpart(first.tree, c(4, 7))
my.tree
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90693.870 16.863000
## 2) oPO4>=27.125 124 11273.000 6.335484
## 4) oPO4>=51.118 87 1836.257 3.319540 *
## 5) oPO4< 51.118 37 6784.673 13.427030
## 10) NH4< 124.6665 21 2831.818 9.090476 *
## 11) NH4>=124.6665 16 3039.604 19.118750 *
## 3) oPO4< 27.125 76 43255.740 34.039470
## 6) Chla>=4.15 18 3838.545 13.883330 *
## 7) Chla< 4.15 58 29834.830 40.294830 *
# # second option (not active in presentation):
# prettyTree(first.tree)
# snip.rpart(first.tree)
Now, I have two prediction models. The obvious question is which one should be used to obtain the predictions for the seven algae in the 140 test samples. I will evaluate and compare the predictive performance of the two models.
lm.predictions.a1 <- predict(final.lm, clean.algae)
rt.predictions.a1 <- predict(rt.a1, algae)
# MAE:
(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[, "a1"])))
## [1] 12.46743
(mae.a1.rt <- mean(abs(rt.predictions.a1 - algae[, "a1"])))
## [1] 10.73599
# MSE:
(mse.a1.lm <- mean((lm.predictions.a1 - algae[, "a1"])^2))
## [1] 269.5209
(mse.a1.rt <- mean((rt.predictions.a1 - algae[, "a1"])^2))
## [1] 224.7319
# NMSE:
(nmse.a1.lm <- mean((lm.predictions.a1-algae[,'a1'])^2)/
mean((mean(algae[,'a1'])-algae[,'a1'])^2))
## [1] 0.5943531
(nmse.a1.rt <- mean((rt.predictions.a1-algae[,'a1'])^2)/
mean((mean(algae[,'a1'])-algae[,'a1'])^2))
## [1] 0.4955834
# all together:
regr.eval(algae[, "a1"], lm.predictions.a1, train.y = algae[,"a1"])
## mae mse rmse mape nmse nmae
## 12.4674296 269.5208958 16.4170916 Inf 0.5943531 0.7530437
regr.eval(algae[, "a1"], rt.predictions.a1, train.y = algae[,"a1"])
## mae mse rmse mape nmse nmae
## 10.7359946 224.7318866 14.9910602 Inf 0.4955834 0.6484635
Visualization of the predictions of the models:
old.par <- par(mfrow = c(1, 2))
plot(lm.predictions.a1, algae[, "a1"], main = "Linear Model",
xlab = "Predictions", ylab = "True Values")
abline(0, 1, lty = 2)
plot(rt.predictions.a1, algae[, "a1"], main = "Regression Tree",
xlab = "Predictions", ylab = "True Values")
abline(0, 1, lty = 2)
par(old.par)
The models have rather poor performance in several cases. In the ideal scenario of correct predictions for all cases, all the circles in the plots would lie on the dashed lines (where predicted equals true). The plots show that this is clearly not the case.
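To see which water samples drive this poor fit, the cases with the largest absolute errors of the linear model can be listed; a small sketch (the threshold of 30 and the object name worst.cases are illustrative choices of mine):
# hedged addition: list the cases where the linear model misses a1 by more than 30
worst.cases <- which(abs(lm.predictions.a1 - algae[, "a1"]) > 30)
algae[worst.cases, c("season", "size", "speed", "a1")]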
Looking at the predictions of the linear model, I can see that it predicts negative algae frequencies for some cases. In this application domain it makes no sense for the frequency of an alga in a water sample to be negative (at most, it can be zero). As such, I can truncate the predictions at a minimum value of zero as a simple way of improving the linear model's performance:
sensible.lm.predictions.a1 <- ifelse(lm.predictions.a1 < 0,
0,
lm.predictions.a1)
# compare the performance of the improved and initial models:
regr.eval(algae[, "a1"], sensible.lm.predictions.a1, stats = c("mae", "mse")) #improved
## mae mse
## 11.94207 262.81421
regr.eval(algae[, "a1"], lm.predictions.a1, stats = c("mae", "mse")) # initial
## mae mse
## 12.46743 269.52090
The results show that this small detail has increased the performance of the model.
In the package “DMwR” there is the function experimentalComparison(), which is designed to help in the model selection/comparison tasks. It can be used with different estimation methods, including cross-validation.
# 1. defining functions that will carry out
# the learning and testing phase of the models
cv.rpart <- function(form,train,test,...) {
m <- rpartXse(form,train,...)
p <- predict(m,test)
mse <- mean((p-resp(form,test))^2)
c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))
}
cv.lm <- function(form,train,test,...) {
m <- lm(form,train,...)
p <- predict(m,test)
p <- ifelse(p < 0,0,p)
mse <- mean((p-resp(form,test))^2)
c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))
}
# 2. cross-validation comparison.
# The experiment includes one variant of linear regression
# and three variants of regression trees (with 3 different se).
# cvSettings defines the number of repetitions of the CV process (3),
# the value of k (10),
# and the seed for the random number generator (1234)
res <- experimentalComparison(
c(dataset(a1 ~ .,clean.algae[,1:12],'a1')),
c(variants('cv.lm'),
variants('cv.rpart',se=c(0,0.5,1))),
cvSettings(3,10,1234))
##
##
## ##### CROSS VALIDATION EXPERIMENTAL COMPARISON #####
##
## ** DATASET :: a1
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
summary(res)
##
## == Summary of a Cross Validation Experiment ==
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
##
## * Data sets :: a1
## * Learners :: cv.lm.v1, cv.rpart.v1, cv.rpart.v2, cv.rpart.v3
##
## * Summary of Experiment Results:
##
##
## -> Datataset: a1
##
## *Learner: cv.lm.v1
## nmse
## avg 0.6740195
## std 0.1943386
## min 0.4469701
## max 1.3202826
## invalid 0.0000000
##
## *Learner: cv.rpart.v1
## nmse
## avg 0.7040149
## std 0.3276007
## min 0.2694526
## max 1.6241139
## invalid 0.0000000
##
## *Learner: cv.rpart.v2
## nmse
## avg 0.7048697
## std 0.2841595
## min 0.2934157
## max 1.6241139
## invalid 0.0000000
##
## *Learner: cv.rpart.v3
## nmse
## avg 0.7200008
## std 0.2756571
## min 0.2934157
## max 1.6241139
## invalid 0.0000000
plot(res)
To check the specific parameter settings corresponding to a given model variant:
getVariant("cv.rpart.v1", res)
##
## Learner:: "cv.rpart"
##
## Parameter values
## se = 0
I will carry out a similar comparative experiment for all seven prediction tasks at the same time:
DSs <- sapply(names(clean.algae)[12:18],
function(x,names.attrs) {
f <- as.formula(paste(x,"~ ."))
dataset(f,clean.algae[,c(names.attrs,x)],x)
},
names(clean.algae)[1:11])
res.all <- experimentalComparison(
DSs,
c(variants('cv.lm'),
variants('cv.rpart',se=c(0,0.5,1))),
cvSettings(3,10,1234))
##
##
## ##### CROSS VALIDATION EXPERIMENTAL COMPARISON #####
##
## ** DATASET :: a1
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a2
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a3
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a4
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a5
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a6
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a7
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
plot(res.all)
There are several very bad results, that is, NMSE scores clearly above 1. An NMSE of 1 corresponds to the baseline of always predicting the average target value for all test cases, so scores above 1 are worse than that trivial predictor.
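To make the baseline concrete, a short sketch of my own: predicting the training mean of a1 for every case yields an NMSE of exactly 1 by construction.
# hedged addition: the trivial "always predict the mean" model has NMSE = 1
base.preds <- rep(mean(algae[, "a1"]), nrow(algae))
mean((base.preds - algae[, "a1"])^2) /
  mean((mean(algae[, "a1"]) - algae[, "a1"])^2)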
In order to check which is the best model for each problem, we can use the function bestScores() from the “DMwR” package:
bestScores(res.all)
## $a1
## system score
## nmse cv.lm.v1 0.6740195
##
## $a2
## system score
## nmse cv.lm.v1 0.7738637
##
## $a3
## system score
## nmse cv.rpart.v3 1
##
## $a4
## system score
## nmse cv.rpart.v1 1
##
## $a5
## system score
## nmse cv.lm.v1 0.9692
##
## $a6
## system score
## nmse cv.lm.v1 0.9680376
##
## $a7
## system score
## nmse cv.rpart.v2 1
The output of this function confirms that, with the exception of alga 1, the results are rather disappointing. The variability of the results across models and folds suggests that this problem might be a good candidate for an ensemble approach. I will use the function randomForest() from the package “randomForest”, which obtains predictions for regression tasks by averaging the predictions of the trees in the ensemble.
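Before the full comparison, a single random forest can be fitted directly to get a feel for this model type (a quick sketch; the object name rf.a1 and the ntree value are my choices):
# hedged addition: a single random forest for alga a1 on the imputed data;
# printing the fit reports the out-of-bag MSE and % variance explained
rf.a1 <- randomForest(a1 ~ ., data = clean.algae[, 1:12], ntree = 500)
rf.a1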
The following code repeats the previous cross-validation experiment, this time including three variants of random forests, each with a different number of trees in the ensemble.
cv.rf <- function(form,train,test,...) {
m <- randomForest(form,train,...)
p <- predict(m,test)
mse <- mean((p-resp(form,test))^2)
c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))
}
res.all <- experimentalComparison(
DSs,
c(variants('cv.lm'),
variants('cv.rpart',se=c(0,0.5,1)),
variants('cv.rf',ntree=c(200,500,700))
),
cvSettings(3,10,1234))
##
##
## ##### CROSS VALIDATION EXPERIMENTAL COMPARISON #####
##
## ** DATASET :: a1
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a2
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a3
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a4
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a5
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a6
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ** DATASET :: a7
##
## ++ LEARNER :: cv.lm variant -> cv.lm.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rpart variant -> cv.rpart.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v1
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v2
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
##
##
## ++ LEARNER :: cv.rf variant -> cv.rf.v3
##
## 3 x 10 - Fold Cross Validation run with seed = 1234
## Repetition 1
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 2
## Fold: 1 2 3 4 5 6 7 8 9 10
## Repetition 3
## Fold: 1 2 3 4 5 6 7 8 9 10
bestScores(res.all)
## $a1
## system score
## nmse cv.rf.v3 0.5172045
##
## $a2
## system score
## nmse cv.rf.v2 0.7553715
##
## $a3
## system score
## nmse cv.rpart.v3 1
##
## $a4
## system score
## nmse cv.rpart.v1 1
##
## $a5
## system score
## nmse cv.rf.v1 0.8148144
##
## $a6
## system score
## nmse cv.rf.v2 0.894367
##
## $a7
## system score
## nmse cv.rpart.v2 1
The results show the advantages of the ensemble approach: for algae 1, 2, 5, and 6 the best score is now obtained by some variant of a random forest. Still, the results are not always good; for algae 3, 4, and 7 no model does better than the trivial baseline (an NMSE of 1). In addition, the output of the function bestScores() does not tell whether the difference between the scores of these best models and the remaining alternatives is statistically significant. The function compAnalysis() in the package “DMwR” provides this information.
compAnalysis(res.all,against='cv.rf.v3',
datasets=c('a1','a2','a4','a6'))
##
## == Statistical Significance Analysis of Comparison Results ==
##
## Baseline Learner:: cv.rf.v3 (Learn.1)
##
## ** Evaluation Metric:: nmse
##
## - Dataset: a1
## Learn.1 Learn.2 sig.2 Learn.3 sig.3 Learn.4 sig.4 Learn.5
## AVG 0.5172045 0.6740195 ++ 0.7040149 ++ 0.7048697 ++ 0.7200008
## STD 0.2275834 0.1943386 0.3276007 0.2841595 0.2756571
## sig.5 Learn.6 sig.6 Learn.7 sig.7
## AVG ++ 0.5201657 0.5193789
## STD 0.2319076 0.2288161
##
## - Dataset: a2
## Learn.1 Learn.2 sig.2 Learn.3 sig.3 Learn.4 sig.4
## AVG 0.7566567 0.7738637 1.0028767 ++ 0.99787737 ++
## STD 0.1706870 0.2055161 0.1570531 0.02030462
## Learn.5 sig.5 Learn.6 sig.6 Learn.7 sig.7
## AVG 1.000000e+00 ++ 0.7634266 0.7553715
## STD 3.220373e-16 0.1778386 0.1610061
##
## - Dataset: a4
## Learn.1 Learn.2 sig.2 Learn.3 sig.3 Learn.4 sig.4
## AVG 1.416682 1.2777027 1.000000e+00 1.000000e+00
## STD 1.193849 0.7680887 1.147868e-16 1.147868e-16
## Learn.5 sig.5 Learn.6 sig.6 Learn.7 sig.7
## AVG 1.000000e+00 1.416414 1.425637
## STD 1.147868e-16 1.126784 1.196226
##
## - Dataset: a6
## Learn.1 Learn.2 sig.2 Learn.3 sig.3 Learn.4 sig.4
## AVG 0.9002171 0.9680376 1.00962926 1.000000e+00
## STD 0.2976772 0.4216282 0.05274165 1.457794e-16
## Learn.5 sig.5 Learn.6 sig.6 Learn.7 sig.7
## AVG 1.000000e+00 0.9018138 0.8943670
## STD 1.457794e-16 0.3139281 0.2803978
##
## Legends:
## Learners -> Learn.1 = cv.rf.v3 ; Learn.2 = cv.lm.v1 ; Learn.3 = cv.rpart.v1 ; Learn.4 = cv.rpart.v2 ; Learn.5 = cv.rpart.v3 ; Learn.6 = cv.rf.v1 ; Learn.7 = cv.rf.v2 ;
## Signif. Codes -> 0 '++' or '--' 0.001 '+' or '-' 0.05 ' ' 1
The columns “sig.X” provide the information on statistical significance. The absence of a symbol means that the confidence in the observed difference between the respective model and “cv.rf.v3” being statistically significant is below 95%. Plus signs mean that the average evaluation metric of the model is significantly higher than that of “cv.rf.v3”, which is bad, since lower NMSE scores are better. Minus signs represent the opposite.
The results of compAnalysis() show that the differences between this random forest variant and the other random forest variants are not statistically significant. With respect to the other types of models, the random forest has a significant advantage for algae 1 and 2 (over the regression tree variants, and over the linear model for alga 1), whereas for algae 4 and 6 none of the observed differences are statistically significant.
To summarize, for each of the seven prediction tasks the cross-validation process has indicated a “best” model: a random forest variant for algae 1, 2, 5, and 6, and a regression tree variant for algae 3, 4, and 7.
I start by obtaining these best models, now trained on all the available training data, so that I can apply them to the test set later.
bestModelsNames <- sapply(bestScores(res.all),
function(x) x['nmse','system'])
learners <- c(rf='randomForest',rpart='rpartXse')
funcs <- learners[sapply(strsplit(bestModelsNames,'\\.'),
function(x) x[2])]
parSetts <- lapply(bestModelsNames,
function(x) getVariant(x,res.all)@pars)
bestModels <- list()
for(a in 1:7) {
form <- as.formula(paste(names(clean.algae)[11+a],'~ .'))
bestModels[[a]] <- do.call(funcs[a],
c(list(form,clean.algae[,c(1:11,11+a)]),parSetts[[a]]))
}
Now, I have a list with seven models, one per alga, ready for making predictions on the test set.
The data frame test.algae is a test set containing the 140 test samples. I fill in its NA values with knnImputation(), using the training data (passed through the distData argument) to find the nearest neighbours.
clean.test.algae <- knnImputation(test.algae,
k = 10,
distData = algae[,1:11])
# harmonize the factor levels of the test set with the padded level names used in the training data
clean.test.algae$size <- revalue(clean.test.algae$size, c("large"="large_", "small"="small_"))
clean.test.algae$speed <- revalue(clean.test.algae$speed, c("low"="low___", "high"="high__"))
Now, I obtain the matrix with the predictions for the entire test set:
preds <- matrix(ncol=7,nrow=140)
for(i in 1:nrow(clean.test.algae)){
preds[i,] <- sapply(1:7,
function(x) predict(bestModels[[x]],clean.test.algae[i,]))
}
Now, I can compare these predictions with the true values to get some feedback on the quality of my approach to this prediction problem. The true values for the test set are contained in the algae.sols data frame. I calculate the NMSE score for each of the seven algae:
avg.preds <- apply(algae[,12:18],2,mean)
apply(((algae.sols-preds)^2),2,mean) /
apply( (scale(algae.sols,avg.preds,F)^2),2,mean)
## a1 a2 a3 a4 a5 a6 a7
## 0.4944927 0.8861564 1.0000000 1.0000000 0.7792627 0.9333836 NA
The results I obtained are in accordance with the cross-validation estimates obtained previously. They confirm the difficulty of obtaining good scores for alga 7 (the NA for a7 most likely arises because the missing a7 values in the training data enter the mean used in the NMSE denominator, which is computed without na.rm = TRUE), while for the other problems the results are more competitive, in particular for alga 1.