High concentrations of certain harmful algae in rivers constitute a serious ecological problem with a strong impact not only on river life forms, but also on water quality. Being able to monitor and perform an early forecast of algae blooms is essential to improving the quality of rivers. With the goal of addressing this prediction problem, several water samples were collected in different European rivers at different times during a period of approximately one year.
One of the main motivations behind this application lies in the fact that chemical monitoring is cheap and easily automated, therefore, obtaining models that are able to accurately predict the algae frequencies based on chemical properties would facilitate the creation of cheap and automated systems for monitoring harmful algae blooms.
Another objective of this study is to provide a better understanding of the factors influencing the algae frequencies. Namely, to understand how these frequencies are related to certain chemical attributes of water samples as well as other characteristics of the samples.
library(DMwR)
data(algae)
dim(algae) # 200 rows, 18 columns
## [1] 200 18
head(algae) # show the first 6 rows
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla
## 1 winter small medium 8.00 9.8 60.800 6.238 578.000 105.000 170.000 50.0
## 2 spring small medium 8.35 8.0 57.750 1.288 370.000 428.750 558.750 1.3
## 3 autumn small medium 8.10 11.4 40.020 5.330 346.667 125.667 187.057 15.6
## 4 spring small medium 8.07 4.8 77.364 2.302 98.182 61.182 138.700 1.4
## 5 autumn small medium 8.06 9.0 55.350 10.416 233.700 58.222 97.580 10.5
## 6 winter small high 8.25 13.1 65.750 9.248 430.000 18.250 56.667 28.4
## a1 a2 a3 a4 a5 a6 a7
## 1 0.0 0.0 0.0 0.0 34.2 8.3 0.0
## 2 1.4 7.6 4.8 1.9 6.7 0.0 2.1
## 3 3.3 53.6 1.9 0.0 0.0 0.0 9.7
## 4 3.1 41.0 18.9 0.0 1.4 0.0 1.4
## 5 9.2 2.9 7.5 0.0 7.5 4.1 1.0
## 6 15.1 14.6 1.4 0.0 22.5 12.6 2.9
str(algae) # display the structure of the data
## 'data.frame': 200 obs. of 18 variables:
## $ season: Factor w/ 4 levels "autumn","spring",..: 4 2 1 2 1 4 3 1 4 4 ...
## $ size : Factor w/ 3 levels "large","medium",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ speed : Factor w/ 3 levels "high","low","medium": 3 3 3 3 3 1 1 1 3 1 ...
## $ mxPH : num 8 8.35 8.1 8.07 8.06 8.25 8.15 8.05 8.7 7.93 ...
## $ mnO2 : num 9.8 8 11.4 4.8 9 13.1 10.3 10.6 3.4 9.9 ...
## $ Cl : num 60.8 57.8 40 77.4 55.4 ...
## $ NO3 : num 6.24 1.29 5.33 2.3 10.42 ...
## $ NH4 : num 578 370 346.7 98.2 233.7 ...
## $ oPO4 : num 105 428.8 125.7 61.2 58.2 ...
## $ PO4 : num 170 558.8 187.1 138.7 97.6 ...
## $ Chla : num 50 1.3 15.6 1.4 10.5 ...
## $ a1 : num 0 1.4 3.3 3.1 9.2 15.1 2.4 18.2 25.4 17 ...
## $ a2 : num 0 7.6 53.6 41 2.9 14.6 1.2 1.6 5.4 0 ...
## $ a3 : num 0 4.8 1.9 18.9 7.5 1.4 3.2 0 2.5 0 ...
## $ a4 : num 0 1.9 0 0 0 0 3.9 0 0 2.9 ...
## $ a5 : num 34.2 6.7 0 1.4 7.5 22.5 5.8 5.5 0 0 ...
## $ a6 : num 8.3 0 0 0 4.1 12.6 6.8 8.7 0 0 ...
## $ a7 : num 0 2.1 9.7 1.4 1 2.9 0 0 0 1.7 ...
summary(algae) # summarize the data
## season size speed mxPH mnO2
## autumn:40 large :45 high :84 Min. :5.600 Min. : 1.500
## spring:53 medium:84 low :33 1st Qu.:7.700 1st Qu.: 7.725
## summer:45 small :71 medium:83 Median :8.060 Median : 9.800
## winter:62 Mean :8.012 Mean : 9.118
## 3rd Qu.:8.400 3rd Qu.:10.800
## Max. :9.700 Max. :13.400
## NA's :1 NA's :2
## Cl NO3 NH4 oPO4
## Min. : 0.222 Min. : 0.050 Min. : 5.00 Min. : 1.00
## 1st Qu.: 10.981 1st Qu.: 1.296 1st Qu.: 38.33 1st Qu.: 15.70
## Median : 32.730 Median : 2.675 Median : 103.17 Median : 40.15
## Mean : 43.636 Mean : 3.282 Mean : 501.30 Mean : 73.59
## 3rd Qu.: 57.824 3rd Qu.: 4.446 3rd Qu.: 226.95 3rd Qu.: 99.33
## Max. :391.500 Max. :45.650 Max. :24064.00 Max. :564.60
## NA's :10 NA's :2 NA's :2 NA's :2
## PO4 Chla a1 a2
## Min. : 1.00 Min. : 0.200 Min. : 0.00 Min. : 0.000
## 1st Qu.: 41.38 1st Qu.: 2.000 1st Qu.: 1.50 1st Qu.: 0.000
## Median :103.29 Median : 5.475 Median : 6.95 Median : 3.000
## Mean :137.88 Mean : 13.971 Mean :16.92 Mean : 7.458
## 3rd Qu.:213.75 3rd Qu.: 18.308 3rd Qu.:24.80 3rd Qu.:11.375
## Max. :771.60 Max. :110.456 Max. :89.80 Max. :72.600
## NA's :2 NA's :12
## a3 a4 a5 a6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.550 Median : 0.000 Median : 1.900 Median : 0.000
## Mean : 4.309 Mean : 1.992 Mean : 5.064 Mean : 5.964
## 3rd Qu.: 4.925 3rd Qu.: 2.400 3rd Qu.: 7.500 3rd Qu.: 6.925
## Max. :42.800 Max. :44.600 Max. :44.400 Max. :77.600
##
## a7
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 2.495
## 3rd Qu.: 2.400
## Max. :31.600
##
# relevel the factor variables
library(dplyr)
library(forcats)
algae.clean <- algae %>%
mutate(season = fct_relevel(season, c("spring", "summer", "autumn", "winter")),
size = fct_relevel(size, c("small", "medium", "large")),
speed = fct_relevel(speed, c("low", "medium", "high")))
summary(algae.clean)
## season size speed mxPH mnO2
## spring:53 small :71 low :33 Min. :5.600 Min. : 1.500
## summer:45 medium:84 medium:83 1st Qu.:7.700 1st Qu.: 7.725
## autumn:40 large :45 high :84 Median :8.060 Median : 9.800
## winter:62 Mean :8.012 Mean : 9.118
## 3rd Qu.:8.400 3rd Qu.:10.800
## Max. :9.700 Max. :13.400
## NA's :1 NA's :2
## Cl NO3 NH4 oPO4
## Min. : 0.222 Min. : 0.050 Min. : 5.00 Min. : 1.00
## 1st Qu.: 10.981 1st Qu.: 1.296 1st Qu.: 38.33 1st Qu.: 15.70
## Median : 32.730 Median : 2.675 Median : 103.17 Median : 40.15
## Mean : 43.636 Mean : 3.282 Mean : 501.30 Mean : 73.59
## 3rd Qu.: 57.824 3rd Qu.: 4.446 3rd Qu.: 226.95 3rd Qu.: 99.33
## Max. :391.500 Max. :45.650 Max. :24064.00 Max. :564.60
## NA's :10 NA's :2 NA's :2 NA's :2
## PO4 Chla a1 a2
## Min. : 1.00 Min. : 0.200 Min. : 0.00 Min. : 0.000
## 1st Qu.: 41.38 1st Qu.: 2.000 1st Qu.: 1.50 1st Qu.: 0.000
## Median :103.29 Median : 5.475 Median : 6.95 Median : 3.000
## Mean :137.88 Mean : 13.971 Mean :16.92 Mean : 7.458
## 3rd Qu.:213.75 3rd Qu.: 18.308 3rd Qu.:24.80 3rd Qu.:11.375
## Max. :771.60 Max. :110.456 Max. :89.80 Max. :72.600
## NA's :2 NA's :12
## a3 a4 a5 a6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.550 Median : 0.000 Median : 1.900 Median : 0.000
## Mean : 4.309 Mean : 1.992 Mean : 5.064 Mean : 5.964
## 3rd Qu.: 4.925 3rd Qu.: 2.400 3rd Qu.: 7.500 3rd Qu.: 6.925
## Max. :42.800 Max. :44.600 Max. :44.400 Max. :77.600
##
## a7
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 2.495
## 3rd Qu.: 2.400
## Max. :31.600
##
There are 7 numeric varaibles in the dataset, I explored their distribution by plotting the histograms, checking and correcting extreme values, and log-transformation the highly right-skewed variables.
According to the result below, it’s reasonable to say that mxPH nearly follows a normal distribution.
summary(algae.clean$mxPH) # 1 missing value
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.600 7.700 8.060 8.012 8.400 9.700 1
# check the missing value
algae.clean %>%
filter(is.na(mxPH))
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla a1 a2 a3 a4 a5
## 1 winter small low NA 12.6 9 0.23 10 5 6 1.1 35.5 0 0 0 0
## a6 a7
## 1 0 0
library(ggplot2)
ggplot(algae.clean, aes(x = mxPH)) + geom_histogram(aes(y = ..density..), na.rm = T) + geom_density() +
geom_rug() + labs(title = "Histogram of maximum pH value")
# qq-plot (test for normality)
ggplot(algae.clean, aes(sample = mxPH)) +
stat_qq() + stat_qq_line() + labs(title = "Normal QQ plot of maximum pH")
Based on the histograms and summary output of the numeric varaibles, the max value of Cl, NO3, and NH4 is pretty strange. Therefore, I explored the specific observations of the extreme values, and corrected the unreasonable values.
max(algae.clean$Cl, na.rm = T) # 391.5
max(algae.clean$NO3, na.rm = T) # 45.65
max(algae.clean$NH4, na.rm = T) # 2406.4
which.max(algae.clean$Cl) # row 134
which.max(algae.clean$NH4) # row 153
which.max(algae.clean$NO3) # row 153
# filter out observations with extreme values
algae.clean %>%
filter(Cl == max(Cl, na.rm = T) | NO3 == max(NO3, na.rm = T) | NH4 == max(NH4, na.rm = T))
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla a1
## 1 spring medium medium 7.9 8.3 391.500 6.045 380 173 317 5.5 2.4
## 2 autumn medium high 7.3 11.8 44.205 45.650 24064 44 34 53.1 2.2
## a2 a3 a4 a5 a6 a7
## 1 1.7 4.2 8.3 1.7 0.0 2.4
## 2 0.0 0.0 1.2 5.9 77.6 0.0
For the observations above, the values of all other varaibles are near the median/mean level, therefore, I will chane 391.5 to 39.15, 45.65 to 4.565, and 2406.4 to 240.64 to make them also near the median/mean level, which seems more reasonable.
algae.clean$Cl[134] <- 39.15
algae.clean$NO3[153] <- 4.565
algae.clean$NH4[153] <- 240.64
In this step, I took logarithmic transformation of highly right skewed variables to make them more normalized. Take Cl as an example, I compared the histograms of Cl before and after log-transformation, and do the same comparison on the other 4 variables.
ggplot(algae.clean, aes(x = Cl)) + geom_histogram(aes(y = ..density..), na.rm = T) +
geom_density() + geom_rug() +
labs(title = "Histogram of mean Cl value")
ggplot(algae.clean, aes(x = log(Cl))) + geom_histogram(aes(y = ..density..), na.rm = T) +
geom_density() + geom_rug() +
labs(title = "Histogram of log(mean Cl value)")
Conditioned Boxplots are graphical representations that depend on a certain factor. In this dataset, I could explore the effects of season, size, and speed on the 7 numeric variables. There are 21 combinations in total, however, I will only list the plots that show some significant results below.
## method1: ggplot2::ggplot
ggplot(algae.clean, aes(x = size, y = mxPH)) +
geom_boxplot(na.rm = T) +
geom_hline(aes(yintercept = mean(algae.clean$mxPH,
na.rm = T)),
linetype = 2, colour = "red")
According to the conditioned plot above, it seems that the larger the river is, the higher the mxPH value is.
# method2
par(mfrow = c(1,1))
boxplot(algae.clean$mxPH ~ algae.clean$size,
ylab = "Maximum pH value")
rug(jitter(algae.clean$mxPH), side = 2)
abline(h = mean(algae.clean$mxPH, na.rm = T), lty = 2)
Next, I conducted an experiment design to check if the impact of river size on mxPH is statistically significant.
aov1 <- aov(mxPH ~ size, data = algae.clean)
# conduct Tukey's HSD test
library(broom)
out1 <- TukeyHSD(aov1, which = "size", conf.level = 0.95)
tidy(out1) # significant difference
## # A tibble: 3 x 6
## term comparison estimate conf.low conf.high adj.p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 size medium-small 0.444 0.242 0.646 1.53e- 6
## 2 size large-small 0.739 0.501 0.978 1.85e-11
## 3 size large-medium 0.295 0.0646 0.526 7.93e- 3
Based on the result of the Tukey’s test, the difference of mxPH value between any two groups of river size is significant.
# method3:lattice::bwplot
library(lattice)
bwplot(speed ~ mxPH, data = algae.clean, ylab='River Size',xlab='Algal A1')
# experiment design
aov2 <- aov(mxPH ~ speed, data = algae.clean)
# Tukey's HSD test
out2 <- TukeyHSD(aov2, which = "speed", conf.level = 0.95)
tidy(out2)
## # A tibble: 3 x 6
## term comparison estimate conf.low conf.high adj.p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 speed medium-low -0.349 -0.633 -0.0655 0.0113
## 2 speed high-low -0.491 -0.774 -0.208 0.000180
## 3 speed high-medium -0.142 -0.353 0.0688 0.252
This time, it’s easy to find that the difference between the mxPH value of high and medium speed is not statistically significant.
summary(algae.clean$a1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.50 6.95 16.92 24.80 89.80
ggplot(algae.clean, aes(x = a1)) + geom_histogram(aes(y = ..density..)) + geom_density() +
geom_rug() + ggtitle("Distribution of a1") # highly right skewed
Based on the plot below, we can observe that for rivers of medium size most values of a1 are packed near zero, while for smaller rivers the values are more spread across the range (thinner violin).
ggplot(algae.clean, aes(x = size, y = a1)) + geom_violin() +
geom_jitter() + xlab("River Size") + ylab("Algae a1")
We could explore the linear relationship between the numeric independent variables and the numeric dependent variable by looking at correlation matrix and the scatter plot.
# explore the correlation
log <- algae.clean %>%
select(Cl, NH4, oPO4, PO4, Chla) %>%
mutate_all(log)
names(log) <- c("log.Cl", "log.NH4", "log.oPO4", "log.PO4", "log.Chla")
new.algae <- cbind(log, algae.clean[,4:18])
library(corrplot)
cm <- cor(new.algae, use = "complete.obs")
corrplot(cm, type="upper", tl.pos="d")
corrplot(cm, add = T, type = "lower", method = "number",
diag = F, tl.pos = "n", cl.pos = "n")
# scatter plot to explore the potential interactions
ggplot(algae.clean, aes(x = mxPH, y = a1, col = speed)) + geom_point() +
ggtitle("Scatter plot of a1 by mxPH with different speed")
There are several water samples with unknown values in some of the variables, I will use several method to deal with missing values.
nrow(algae.clean[!complete.cases(algae),]) # there are 16 incomplete cases
apply(algae.clean, 1, function(x) sum(is.na(x))) # the number of unknown values in each row of the dataset
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [36] 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 2 2 2 2 2 6 1 0 0 0 0 0 0 0
## [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [106] 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [141] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [176] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0
manyNAs(algae.clean, 0.2) # returns the row numbers that have more than 20% of the columns with an NA
## [1] 62 199
For observation 62 and 199, the number of unknown values is so high that they are almost useless, and even complex methods of filling in these values will be too unreliable. Therefore, I directly remove them from the datasets.
algae.clean <- algae.clean[-c(62, 199), ]
cor(algae.clean[, 4:18], use = "complete.obs")
symnum(cor(algae.clean[, 4:18],use="complete.obs"))
## mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1
## mnO2 1
## Cl 1
## NO3 . 1
## NH4 1
## oPO4 . . . 1
## PO4 . . . . * 1
## Chla . 1
## a1 . . . . 1
## a2 . . 1
## a3 1
## a4 . . . . 1
## a5 . 1
## a6 . . 1
## a7 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
Based on the table above, PO4 is highly correlated with oPO4.
summary(algae.clean$oPO4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 15.70 40.15 73.59 99.33 564.60
summary(algae.clean$PO4)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 43.5 104.0 138.5 214.0 771.6 1
lm(PO4 ~ oPO4, data = algae.clean)
##
## Call:
## lm(formula = PO4 ~ oPO4, data = algae.clean)
##
## Coefficients:
## (Intercept) oPO4
## 42.897 1.293
After removing the sample 62 and 199, we are left with only one observation with an unknow value on variable PO4, therefore, we could fill the missing values of PO4 with a linear regression model of PO4 on oPO4.
# manually fill the missing value of PO4
which(is.na(algae.clean$PO4)) # row 28
algae.clean[28, "PO4"] <- 42.897 + 1.293 * algae.clean[28, "oPO4"]
algae.clean$PO4[28]
## [1] 48.069
summary(algae.clean$PO4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 43.66 103.29 138.05 213.75 771.60
# create a function to fill multiple NAs
fillPO4 <- function(oPo4) {
if(is.na(oPO4)) {
return(NA)
} else{
return(42.897 + 1.293 * oPO4)
}
}
algae.clean[is.na(algae.clean$PO4), "PO4"] <- sapply(algae.clean[is.na(algae.clean$PO4), "oPO4"], fillPO4)
Now, we could go further to apply KNN Imputation method to fill the missing values based on the K-nearest neighbours.
algae.clean <- knnImputation(algae.clean, k = 10, meth = "median")
summary(algae.clean)
## season size speed mxPH mnO2
## spring:53 small :70 low :33 Min. :5.600 Min. : 1.500
## summer:44 medium:84 medium:81 1st Qu.:7.705 1st Qu.: 7.825
## autumn:40 large :44 high :84 Median :8.060 Median : 9.800
## winter:61 Mean :8.019 Mean : 9.135
## 3rd Qu.:8.400 3rd Qu.:10.800
## Max. :9.700 Max. :13.400
## Cl NO3 NH4 oPO4
## Min. : 0.222 Min. : 0.050 Min. : 5.00 Min. : 1.00
## 1st Qu.: 10.425 1st Qu.: 1.296 1st Qu.: 38.33 1st Qu.: 15.70
## Median : 32.178 Median : 2.675 Median : 103.17 Median : 40.15
## Mean : 40.650 Mean : 3.075 Mean : 380.98 Mean : 73.59
## 3rd Qu.: 56.375 3rd Qu.: 4.446 3rd Qu.: 226.95 3rd Qu.: 99.33
## Max. :208.364 Max. :10.416 Max. :8777.60 Max. :564.60
## PO4 Chla a1 a2
## Min. : 1.00 Min. : 0.200 Min. : 0.000 Min. : 0.000
## 1st Qu.: 43.66 1st Qu.: 2.000 1st Qu.: 1.525 1st Qu.: 0.000
## Median :103.29 Median : 5.155 Median : 6.950 Median : 3.000
## Mean :138.05 Mean : 13.386 Mean :16.996 Mean : 7.471
## 3rd Qu.:213.75 3rd Qu.: 17.200 3rd Qu.:24.800 3rd Qu.:11.275
## Max. :771.60 Max. :110.456 Max. :89.800 Max. :72.600
## a3 a4 a5 a6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.550 Median : 0.000 Median : 2.000 Median : 0.000
## Mean : 4.334 Mean : 1.997 Mean : 5.116 Mean : 6.005
## 3rd Qu.: 4.975 3rd Qu.: 2.400 3rd Qu.: 7.500 3rd Qu.: 6.975
## Max. :42.800 Max. :44.600 Max. :44.400 Max. :77.600
## a7
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 2.487
## 3rd Qu.: 2.400
## Max. :31.600
The main goal of this case study is to obtain predictions for the frequency values of the seven algae in a set of 140 water samples. Therefore, it’s a regression problem. In this section, I will explore 3 different predictive models: linear regression model, regression tree model, and random forest model.
Use the stepwise method to perform the linear regression model as below:
lm.null.a1 <- lm(a1 ~ 1, data = algae.clean[, 1:12])
lm1.full.a1 <- lm(a1 ~ season + size + speed + mxPH + mnO2 + log(Cl) + NO3 +
log(NH4) + log(oPO4) + log(PO4) + log(Chla), data = algae.clean[,1:12])
lm1.a1 <-step(lm.null.a1, scope = list(lower = lm.null.a1, upper = lm1.full.a1), direction = "both")
## Start: AIC=1214.5
## a1 ~ 1
##
## Df Sum of Sq RSS AIC
## + log(PO4) 1 39716 50685 1101.9
## + log(oPO4) 1 37579 52822 1110.1
## + log(Cl) 1 30616 59786 1134.6
## + log(Chla) 1 26198 64203 1148.8
## + log(NH4) 1 21100 69301 1163.9
## + size 2 11400 79001 1191.8
## + NO3 1 10394 80008 1192.3
## + mnO2 1 7615 82786 1199.1
## + speed 2 8140 82261 1199.8
## + mxPH 1 6581 83820 1201.5
## <none> 90401 1214.5
## + season 3 85 90317 1220.3
##
## Step: AIC=1101.93
## a1 ~ log(PO4)
##
## Df Sum of Sq RSS AIC
## + log(Chla) 1 2221 48464 1095.1
## + log(Cl) 1 1277 49408 1098.9
## + size 2 1366 49319 1100.5
## + log(oPO4) 1 761 49924 1100.9
## <none> 50685 1101.9
## + NO3 1 408 50277 1102.3
## + log(NH4) 1 222 50463 1103.1
## + mnO2 1 126 50559 1103.4
## + mxPH 1 80 50605 1103.6
## + speed 2 88 50598 1105.6
## + season 3 299 50386 1106.8
## - log(PO4) 1 39716 90401 1214.5
##
## Step: AIC=1095.06
## a1 ~ log(PO4) + log(Chla)
##
## Df Sum of Sq RSS AIC
## + log(oPO4) 1 845.3 47619 1093.6
## + log(Cl) 1 582.6 47882 1094.7
## <none> 48464 1095.1
## + NO3 1 306.5 48158 1095.8
## + log(NH4) 1 266.1 48198 1096.0
## + mnO2 1 65.7 48398 1096.8
## + mxPH 1 45.7 48418 1096.9
## + size 2 498.9 47965 1097.0
## + speed 2 314.3 48150 1097.8
## + season 3 424.2 48040 1099.3
## - log(Chla) 1 2221.2 50685 1101.9
## - log(PO4) 1 15739.2 64203 1148.8
##
## Step: AIC=1093.58
## a1 ~ log(PO4) + log(Chla) + log(oPO4)
##
## Df Sum of Sq RSS AIC
## + log(Cl) 1 666.13 46953 1092.8
## <none> 47619 1093.6
## + NO3 1 323.58 47295 1094.2
## + mxPH 1 249.12 47370 1094.5
## + log(NH4) 1 228.45 47390 1094.6
## - log(oPO4) 1 845.35 48464 1095.1
## + mnO2 1 72.38 47546 1095.3
## + size 2 387.58 47231 1096.0
## + speed 2 270.20 47349 1096.5
## - log(PO4) 1 1222.87 48842 1096.6
## + season 3 313.62 47305 1098.3
## - log(Chla) 1 2305.65 49924 1100.9
##
## Step: AIC=1092.79
## a1 ~ log(PO4) + log(Chla) + log(oPO4) + log(Cl)
##
## Df Sum of Sq RSS AIC
## <none> 46953 1092.8
## - log(PO4) 1 489.24 47442 1092.8
## - log(Cl) 1 666.13 47619 1093.6
## + NO3 1 131.83 46821 1094.2
## + mxPH 1 121.71 46831 1094.3
## + mnO2 1 113.95 46839 1094.3
## + log(NH4) 1 100.32 46852 1094.4
## + size 2 516.47 46436 1094.6
## - log(oPO4) 1 928.90 47882 1094.7
## + speed 2 290.67 46662 1095.6
## - log(Chla) 1 1563.11 48516 1097.3
## + season 3 267.76 46685 1097.7
summary(lm1.a1)
##
## Call:
## lm(formula = a1 ~ log(PO4) + log(Chla) + log(oPO4) + log(Cl),
## data = algae.clean[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.220 -8.673 -1.831 4.074 70.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.177 4.634 12.554 <2e-16 ***
## log(PO4) -3.568 2.516 -1.418 0.1578
## log(Chla) -2.578 1.017 -2.535 0.0120 *
## log(oPO4) -3.821 1.956 -1.954 0.0521 .
## log(Cl) -2.502 1.512 -1.655 0.0996 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.6 on 193 degrees of freedom
## Multiple R-squared: 0.4806, Adjusted R-squared: 0.4699
## F-statistic: 44.65 on 4 and 193 DF, p-value: < 2.2e-16
Next, I checked the validation of linear assumptions for this model with plot() function.
par(mfrow=c(2,2))
plot(lm1.a1)
par(mfrow=c(1,1))
Based on the results above, it’s obvious to conclude that the model does not meet t he linear assumption. Therefore, I tried the regression tree model next.
Regression tree models have different assumption from that of linear regression model, meaning that it allows missing values, so here, I reloaded and used the original algae dataset.
library(rpart)
library(rpart.plot)
data(algae)
algae <- algae[-manyNAs(algae),]
# build the default tree model
rt1.a1 <- rpart(a1 ~ ., data = algae[1:12])
prp(rt1.a1, type = 1, extra = 101, box.col = "orange",
split.box.col = "grey", split.font = 1, varlen = -10)
printcp(rt1.a1)
##
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[1:12])
##
## Variables actually used in tree construction:
## [1] Cl mnO2 mxPH NO3 oPO4 PO4
##
## Root node error: 90401/198 = 456.57
##
## n= 198
##
## CP nsplit rel error xerror xstd
## 1 0.405740 0 1.00000 1.00623 0.13049
## 2 0.071885 1 0.59426 0.66957 0.11105
## 3 0.030887 2 0.52237 0.65524 0.11161
## 4 0.030408 3 0.49149 0.69088 0.11090
## 5 0.027872 4 0.46108 0.71609 0.11305
## 6 0.027754 5 0.43321 0.71397 0.11310
## 7 0.018124 6 0.40545 0.70066 0.11047
## 8 0.016344 7 0.38733 0.72927 0.11260
## 9 0.010000 9 0.35464 0.72830 0.11753
# build the most complex model
rt2.a1 <- rpart(a1 ~ ., data = algae[1:12], cp = 0.0000001)
printcp(rt2.a1) # cp == 0.07188523
##
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[1:12], cp = 1e-07)
##
## Variables actually used in tree construction:
## [1] Chla Cl mnO2 mxPH NH4 NO3 oPO4 PO4 speed
##
## Root node error: 90401/198 = 456.57
##
## n= 198
##
## CP nsplit rel error xerror xstd
## 1 0.40573990 0 1.00000 1.01237 0.13121
## 2 0.07188523 1 0.59426 0.67898 0.11124
## 3 0.03088731 2 0.52237 0.68326 0.11343
## 4 0.03040753 3 0.49149 0.68953 0.11475
## 5 0.02787181 4 0.46108 0.69376 0.11592
## 6 0.02775354 5 0.43321 0.73555 0.12101
## 7 0.01812406 6 0.40545 0.74231 0.11481
## 8 0.01634372 7 0.38733 0.75003 0.11464
## 9 0.00462036 9 0.35464 0.75414 0.11598
## 10 0.00348211 10 0.35002 0.77137 0.11285
## 11 0.00295807 11 0.34654 0.78065 0.11771
## 12 0.00088108 12 0.34358 0.77859 0.11761
## 13 0.00072255 13 0.34270 0.77581 0.11773
## 14 0.00012213 14 0.34198 0.77709 0.11803
## 15 0.00012010 15 0.34186 0.77727 0.11802
## 16 0.00000010 16 0.34174 0.77692 0.11787
# prune the tree with 1SE
rt3.a1 <- rpart(a1 ~ ., data = algae[1:12], cp = 0.07188523)
prp(rt3.a1, type = 1, extra = 101, box.col = "orange",
split.box.col = "grey", split.font = 1, varlen = -10)
# alternative: rpartXse: integrates the tree growth and tree post-pruning in a single function call.
# The post-pruning phase is essentially the 1-SE rule
rt.a1 <- rpartXse(a1 ~ ., data = algae[, 1:12])
prp(rt.a1, type = 1, extra = 101, box.col = "orange",
split.box.col = "grey", split.font = 1, varlen = -10)
library(randomForest)
rf.a1 <- randomForest(a1 ~ ., data = algae.clean[,1:12], ntree = 500, mtry = 4,
nodesize = 5, importance = TRUE)
varImpPlot(rf.a1, type = 1) # check the importance of variables
First, perform in-sample prediction of the two models. Looking at the plots below, we could observe that the models have rather poor performance in several cases. In the ideal scenario that they make correct predictions for all cases, all the points in the plots should lie on the red lines.
lm1.a1.pred <- predict(lm1.a1, algae.clean)
rt.a1.pred <- predict(rt.a1, algae)
rf.a1.pred <- predict(rf.a1, algae.clean)
df <- data.frame(lm1.a1 = lm1.a1.pred,
rt.a1 = rt.a1.pred,
rf.a1 = rf.a1.pred,
true.a1 = algae$a1)
ggplot(df, aes(x = lm1.a1, y = true.a1)) +
geom_point() + geom_abline(slope = 1, intercept = 0, color = "red") +
ggtitle("Linear Model with tranformation")
ggplot(df, aes(x = rt.a1, y = true.a1)) +
geom_point() + geom_abline(slope = 1, intercept = 0, color = "red") +
ggtitle("Regression Tree")
ggplot(df, aes(x = rf.a1, y = true.a1)) +
geom_point() + geom_abline(slope = 1, intercept = 0, color = "red") +
ggtitle("Random Forest")
Also, based on the plot with the predictions of the linear model, it’s obvious that this model predicts negative algae frequencies for some cases. However, according to the prefessional knowledge, it’s impossible to have negative frequency values. Therefore, I transformed all negative prediction values to zero:
mod.lm1.a1.pred <- ifelse(lm1.a1.pred < 0, 0, lm1.a1.pred)
For this case, I will use RMSE as the criteria of model selection, basically, the lower RMSE is, the better the model is.
# in-sample error
library(tidyr)
df1 <- data.frame(mod.lm1.a1 = mod.lm1.a1.pred,
rt.a1 = rt.a1.pred,
rf.a1 = rf.a1.pred,
true.a1 = algae$a1)
df1 %>%
gather(key = type, value = value, - true.a1) %>%
mutate(residual = true.a1 - value) %>%
group_by(type) %>%
summarise(rmse = sqrt(mean(residual^2)))
## # A tibble: 3 x 2
## type rmse
## <chr> <dbl>
## 1 mod.lm1.a1 15.3
## 2 rf.a1 7.13
## 3 rt.a1 15.1
Since 3 models have different assumption, as mentioned above, here, I will do cross-validation separately.
library(performanceEstimation)
res.lm <- performanceEstimation(
PredTask(a1 ~ log(PO4) + log(Chla) + log(oPO4) + log(Cl), algae.clean[, 1:12], "a1"),
Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
EstimationTask(metrics = "rmse",method = CV(nReps=5,nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
summary(res.lm)
##
## == Summary of a Cross Validation Performance Estimation Experiment ==
##
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
##
## * Predictive Tasks :: a1
## * Workflows :: lm
##
## -> Task: a1
## *Workflow: lm
## rmse
## avg 15.373901
## std 3.834642
## med 14.428018
## iqr 4.983312
## min 8.943522
## max 26.937622
## invalid 0.000000
plot(res.lm)
res.rt <- performanceEstimation(
PredTask(a1 ~ ., algae[, 1:12], "a1"),
workflowVariants(learner = "rpartXse",learner.pars = list(se=c(0,0.5,1))),
EstimationTask(metrics = "rmse", method = CV(nReps=5,nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
summary(res.rt)
##
## == Summary of a Cross Validation Performance Estimation Experiment ==
##
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
##
## * Predictive Tasks :: a1
## * Workflows :: rpartXse.v1, rpartXse.v2, rpartXse.v3
##
## -> Task: a1
## *Workflow: rpartXse.v1
## rmse
## avg 16.186311
## std 4.762697
## med 15.614493
## iqr 6.775280
## min 8.560563
## max 28.572168
## invalid 0.000000
##
## *Workflow: rpartXse.v2
## rmse
## avg 16.831864
## std 4.522871
## med 16.363788
## iqr 6.111277
## min 8.129696
## max 28.864045
## invalid 0.000000
##
## *Workflow: rpartXse.v3
## rmse
## avg 17.213449
## std 4.373961
## med 16.596553
## iqr 5.566633
## min 8.910874
## max 28.128565
## invalid 0.000000
getWorkflow("rpartXse.v1", res.rt)
## Workflow Object:
## Workflow ID :: rpartXse.v1
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> se=0
## learner -> rpartXse
plot(res.rt)
res.rf <- performanceEstimation(
PredTask(a1 ~ ., algae.clean[, 1:12], "a1"),
workflowVariants(learner = "randomForest", learner.pars = list(ntree = c(200, 500, 700))),
EstimationTask(metrics = "rmse", method = CV(nReps=5,nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
summary(res.rf)
##
## == Summary of a Cross Validation Performance Estimation Experiment ==
##
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
##
## * Predictive Tasks :: a1
## * Workflows :: randomForest.v1, randomForest.v2, randomForest.v3
##
## -> Task: a1
## *Workflow: randomForest.v1
## rmse
## avg 15.306264
## std 4.158935
## med 14.065973
## iqr 5.317952
## min 9.071738
## max 26.785578
## invalid 0.000000
##
## *Workflow: randomForest.v2
## rmse
## avg 15.333838
## std 4.150589
## med 13.997902
## iqr 5.214072
## min 8.861183
## max 27.128481
## invalid 0.000000
##
## *Workflow: randomForest.v3
## rmse
## avg 15.329250
## std 4.165849
## med 14.018964
## iqr 5.330604
## min 8.813906
## max 27.224756
## invalid 0.000000
plot(res.rf)
DS.lm <- sapply(names(algae.clean)[12:18],
function(x, names.attrs) {
f <- as.formula(paste(x, "~ log(PO4) + log(Chla) + log(oPO4) + log(Cl)"))
PredTask(f, algae.clean[, c(names.attrs, x)], x, copy = T)
},
names(algae.clean)[1:11])
res.lm.all <- performanceEstimation(
DS.lm,
Workflow(learner="lm", pre="knnImp", post="onlyPos"),
EstimationTask(metrics="rmse" ,method=CV(nReps=5, nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a2
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a3
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a4
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a5
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a6
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a7
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
plot(res.lm.all)
summary(res.lm.all)
##
## == Summary of a Cross Validation Performance Estimation Experiment ==
##
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
##
## * Predictive Tasks :: a1, a2, a3, a4, a5, a6, a7
## * Workflows :: lm
##
## -> Task: a1
## *Workflow: lm
## rmse
## avg 15.373901
## std 3.834642
## med 14.428018
## iqr 4.983312
## min 8.943522
## max 26.937622
## invalid 0.000000
##
## -> Task: a2
## *Workflow: lm
## rmse
## avg 9.836443
## std 3.715329
## med 8.704967
## iqr 3.256512
## min 5.501596
## max 19.634340
## invalid 0.000000
##
## -> Task: a3
## *Workflow: lm
## rmse
## avg 6.696303
## std 2.254685
## med 6.577072
## iqr 2.927882
## min 3.229518
## max 11.826706
## invalid 0.000000
##
## -> Task: a4
## *Workflow: lm
## rmse
## avg 3.469810
## std 2.444415
## med 2.516132
## iqr 1.519727
## min 1.277727
## max 10.326709
## invalid 0.000000
##
## -> Task: a5
## *Workflow: lm
## rmse
## avg 6.804237
## std 2.503848
## med 6.196962
## iqr 3.557455
## min 3.777818
## max 13.386939
## invalid 0.000000
##
## -> Task: a6
## *Workflow: lm
## rmse
## avg 10.849035
## std 4.596002
## med 9.874834
## iqr 5.171821
## min 5.128981
## max 24.676705
## invalid 0.000000
##
## -> Task: a7
## *Workflow: lm
## rmse
## avg 4.728331
## std 2.056581
## med 4.200264
## iqr 3.989409
## min 1.700101
## max 8.768580
## invalid 0.000000
DS.rt <- sapply(names(algae)[12:18],
function(x, names.attrs) {
f <- as.formula(paste(x, "~ ."))
PredTask(f, algae[, c(names.attrs, x)], x, copy = T)
},
names(algae)[1:11])
res.rt.all <- performanceEstimation(
DS.rt,
workflowVariants(learner="rpartXse", learner.pars=list(se=c(0,0.5,1))),
EstimationTask(metrics="rmse" ,method=CV(nReps=5, nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a2
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a3
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a4
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a5
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a6
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a7
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
plot(res.rt.all)
DS.rf <- sapply(names(algae.clean)[12:18],
function(x, names.attrs) {
f <- as.formula(paste(x, "~ ."))
PredTask(f, algae.clean[, c(names.attrs, x)], x, copy = T)
},
names(algae.clean)[1:11])
res.rf.all <- performanceEstimation(
DS.rf,
workflowVariants(learner="randomForest", pre = "knnImp", learner.pars=list(ntree =c(200, 500, 700))),
EstimationTask(metrics="rmse" ,method=CV(nReps=5, nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a2
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a3
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a4
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a5
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a6
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a7
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
plot(res.rf.all)
First, we could directly compare the smallest RMSE for different models with different parameters.
topPerformers(res.rf.all)
## $a1
## Workflow Estimate
## rmse randomForest.v1 15.306
##
## $a2
## Workflow Estimate
## rmse randomForest.v1 9.307
##
## $a3
## Workflow Estimate
## rmse randomForest.v3 6.433
##
## $a4
## Workflow Estimate
## rmse randomForest.v3 3.268
##
## $a5
## Workflow Estimate
## rmse randomForest.v3 6.162
##
## $a6
## Workflow Estimate
## rmse randomForest.v2 10.247
##
## $a7
## Workflow Estimate
## rmse randomForest.v3 4.747
topPerformers(res.rt.all)
## $a1
## Workflow Estimate
## rmse rpartXse.v1 16.186
##
## $a2
## Workflow Estimate
## rmse rpartXse.v3 10.784
##
## $a3
## Workflow Estimate
## rmse rpartXse.v2 6.652
##
## $a4
## Workflow Estimate
## rmse rpartXse.v2 3.469
##
## $a5
## Workflow Estimate
## rmse rpartXse.v3 7.117
##
## $a6
## Workflow Estimate
## rmse rpartXse.v2 10.906
##
## $a7
## Workflow Estimate
## rmse rpartXse.v3 4.823
summary(res.lm.all)
##
## == Summary of a Cross Validation Performance Estimation Experiment ==
##
## Task for estimating rmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
##
## * Predictive Tasks :: a1, a2, a3, a4, a5, a6, a7
## * Workflows :: lm
##
## -> Task: a1
## *Workflow: lm
## rmse
## avg 15.373901
## std 3.834642
## med 14.428018
## iqr 4.983312
## min 8.943522
## max 26.937622
## invalid 0.000000
##
## -> Task: a2
## *Workflow: lm
## rmse
## avg 9.836443
## std 3.715329
## med 8.704967
## iqr 3.256512
## min 5.501596
## max 19.634340
## invalid 0.000000
##
## -> Task: a3
## *Workflow: lm
## rmse
## avg 6.696303
## std 2.254685
## med 6.577072
## iqr 2.927882
## min 3.229518
## max 11.826706
## invalid 0.000000
##
## -> Task: a4
## *Workflow: lm
## rmse
## avg 3.469810
## std 2.444415
## med 2.516132
## iqr 1.519727
## min 1.277727
## max 10.326709
## invalid 0.000000
##
## -> Task: a5
## *Workflow: lm
## rmse
## avg 6.804237
## std 2.503848
## med 6.196962
## iqr 3.557455
## min 3.777818
## max 13.386939
## invalid 0.000000
##
## -> Task: a6
## *Workflow: lm
## rmse
## avg 10.849035
## std 4.596002
## med 9.874834
## iqr 5.171821
## min 5.128981
## max 24.676705
## invalid 0.000000
##
## -> Task: a7
## *Workflow: lm
## rmse
## avg 4.728331
## std 2.056581
## med 4.200264
## iqr 3.989409
## min 1.700101
## max 8.768580
## invalid 0.000000
However, this method does not provide significance level. Therefore, I performed the following series of A/B tests to check the statistical validity of certain hypotheses concerning the observed differences among the performance of the different workflows. (Here, only take random forest model as an example)
To be specific, use the Friedman test to check if we can reject the null hypothesis that all workflows perform equally on a set of predictive tasks, and if this is rejected then proceed with a post-hoc test; If the goal is to compare all workflows against each other we use the post-hoc Nemenyi test, whilst if the objective is to compare a series of workflows against some baseline we use the post-hoc Bonferroni-Dunn test.
p <- pairedComparisons(res.rf.all, baseline = "randomForest.v3")
p$rmse$F.test # overall test
## $chi
## [1] 2.571429
##
## $FF
## [1] 1.35
##
## $critVal
## [1] 0.3574087
##
## $rejNull
## [1] TRUE
p$rmse$Nemenyi.test # compared among groups
## $critDif
## [1] 1.252761
##
## $rkDifs
## randomForest.v1 randomForest.v2 randomForest.v3
## randomForest.v1 0.0000000 0.4285714 0.8571429
## randomForest.v2 0.4285714 0.0000000 0.4285714
## randomForest.v3 0.8571429 0.4285714 0.0000000
##
## $signifDifs
## randomForest.v1 randomForest.v2 randomForest.v3
## randomForest.v1 FALSE FALSE FALSE
## randomForest.v2 FALSE FALSE FALSE
## randomForest.v3 FALSE FALSE FALSE
p$rmse$BonferroniDunn.test # compared with baseline
## $critDif
## [1] 1.19808
##
## $baseline
## [1] "randomForest.v3"
##
## $rkDifs
## randomForest.v1 randomForest.v2
## 0.8571429 0.4285714
##
## $signifDifs
## randomForest.v1 randomForest.v2
## FALSE FALSE
CDdiagram.BD(p)
Based on the out-of-sample prediction result, the model selection for different dependent variables are as below:
full.test.algae <- cbind(test.algae, algae.sols)
summary(full.test.algae)
## season size speed mxPH mnO2
## autumn:40 large :38 high :58 Min. :5.900 Min. : 1.800
## spring:31 medium:52 low :25 1st Qu.:7.800 1st Qu.: 8.275
## summer:41 small :50 medium:57 Median :8.030 Median : 9.400
## winter:28 Mean :7.977 Mean : 9.212
## 3rd Qu.:8.340 3rd Qu.:10.800
## Max. :9.130 Max. :13.200
## NA's :1
## Cl NO3 NH4 oPO4
## Min. : 0.50 Min. : 0.000 Min. : 5.00 Min. : 1.00
## 1st Qu.: 11.01 1st Qu.: 0.987 1st Qu.: 37.33 1st Qu.: 11.75
## Median : 32.18 Median : 2.174 Median : 119.72 Median : 34.39
## Mean : 40.93 Mean : 2.892 Mean : 429.93 Mean : 72.39
## 3rd Qu.: 57.32 3rd Qu.: 4.035 3rd Qu.: 285.42 3rd Qu.: 84.72
## Max. :271.50 Max. :12.130 Max. :11160.60 Max. :1435.00
## NA's :6
## PO4 Chla a1 a2
## Min. : 2.00 Min. : 0.40 Min. : 0.000 Min. : 0.000
## 1st Qu.: 32.71 1st Qu.: 2.30 1st Qu.: 1.475 1st Qu.: 0.000
## Median : 89.17 Median : 4.50 Median : 7.400 Median : 2.250
## Mean : 134.93 Mean :11.08 Mean :16.385 Mean : 6.833
## 3rd Qu.: 177.85 3rd Qu.:16.70 3rd Qu.:27.050 3rd Qu.: 8.725
## Max. :1690.00 Max. :63.50 Max. :83.000 Max. :55.400
## NA's :5 NA's :11
## a3 a4 a5 a6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.000 Median : 0.000 Median : 2.300 Median : 0.500
## Mean : 3.326 Mean : 1.549 Mean : 6.160 Mean : 7.051
## 3rd Qu.: 3.650 3rd Qu.: 2.200 3rd Qu.: 8.825 3rd Qu.: 7.850
## Max. :24.300 Max. :19.600 Max. :61.100 Max. :70.700
##
## a7
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 1.794
## 3rd Qu.: 1.900
## Max. :30.800
##
wfs <- sapply(taskNames(res.rf.all),
function(x) topPerformer(res.rf.all, metric = "rmse", task = x))
for (i in 1:7) {
res <- runWorkflow(wfs[[i]],
as.formula(paste(names(wfs)[i], "~ .")),
algae.clean[,c(1:11, 11+i)],
full.test.algae[,c(1:11, 11+i)])
full.test.algae[,18 + i] <- res$preds
names(full.test.algae)[18+i] <- paste0("pred.a", i)
}
# calculate RMSE
RMSE.df <- data.frame(algae = paste0("a", 1:7),
RMSE = rep(0, 7))
for (i in 1:7) {
RMSE.df$RMSE[i] <- sqrt(mean((full.test.algae[, 11+i] - full.test.algae[, 18+i])^2))
}
RMSE.df
## algae RMSE
## 1 a1 14.195969
## 2 a2 9.704109
## 3 a3 5.097277
## 4 a4 2.522578
## 5 a5 8.283076
## 6 a6 11.980764
## 7 a7 4.544353