Predicting Algae bloom

summary of descriptive statistics

library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DMwR2)
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
data("algae")
algae <- as.tibble(algae)
str(algae)
summary(algae)

mxph histogram and qqplot

Based on qqplot, some observations can’t satisfy normality assumption

library(ggplot2)
theme_set(
  theme_minimal() +
    theme(legend.position = "top")
  )
p1 <- algae %>% ggplot() + aes(x=mxPH) + geom_histogram(aes(y=..density..)) + geom_density(color="red") + geom_rug() +
 ggtitle("The Histogram of mxPH (maximum pH)") + xlab("") + ylab("") 

p1

library(car)
qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")

## [1] 56 57

oPO4 boxplot

algae %>% ggplot(aes(x = factor(0),y = oPO4)) + geom_boxplot() + geom_rug() + geom_hline(aes(yintercept=mean(algae$oPO4, na.rm = TRUE)), linetype=2,colour="red") + ylab("Orthophosphate (oPO4)") + xlab("") + scale_x_discrete(breaks=NULL)

ordering nominal factors and algal a1 boxplot

library(forcats)
 algae <- mutate(algae, size=fct_relevel(size,c("small","medium","large")),
   speed=fct_relevel(speed,c("low","medium","high")),
   season=fct_relevel(season,c("spring","summer","autumn","winter")))

ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() + xlab("River Size") + ylab("Algal A1")

Missing Values identification

 filter(algae, !complete.cases(algae) )
 apply(algae, 1, function(x) sum(is.na(x)))
 manyNAs(algae, 0.2) # row locations with more than 20 percent unknown values
algae <- algae[-c(62,199),] # remove those rows


apply(algae, 2, function(x) sum(is.na(x))) # cl has 8 NA, and chla has 10 NA

filling unknown values

# correlation matrix

library(corrplot)
## corrplot 0.84 loaded
cm <- cor(algae[, 4:18], use = "complete.obs")
corrplot(cm, type="upper",tl.pos="d")
corrplot(cm, add=TRUE, type="lower", method="number",
          diag=FALSE, tl.pos="n", cl.pos="n")

fill the unknown of oPO4 using regression and others using knnimputation

lm(PO4~oPO4,data = algae) # PO4 = 1.293(oPO4) + 42.897
algae$PO4[28] <- 1.293*algae$oPO4[28] + 42.897
algae <- knnImputation(algae,k=10)

Predictive models

Multiple regression and decision tree
library(performanceEstimation)
DSs <- sapply(names(algae)[12:18], function(x,names.attrs) {
                                      f <- as.formula(paste(x, "~ ."))
                                      PredTask(f, algae[,c(names.attrs,x)], x,                                               copy=TRUE)
                                   },
              names(algae)[1:11])
res.all <- performanceEstimation(
 DSs,
 c(Workflow(learner="lm", pre="knnImp", post="onlyPos"),
 workflowVariants(learner="rpartXse", learner.pars=list(se=c(0,0.5,1)))),
 EstimationTask(metrics="nmse" ,method=CV(nReps=5, nFolds=10)))
plot(res.all)

topPerformers(res.all)
## $a1
##      Workflow Estimate
## nmse       lm     0.69
## 
## $a2
##      Workflow Estimate
## nmse       lm    0.987
## 
## $a3
##         Workflow Estimate
## nmse rpartXse.v3        1
## 
## $a4
##         Workflow Estimate
## nmse rpartXse.v2        1
## 
## $a5
##      Workflow Estimate
## nmse       lm    0.959
## 
## $a6
##      Workflow Estimate
## nmse       lm    0.872
## 
## $a7
##         Workflow Estimate
## nmse rpartXse.v2        1

ensemble model(Random forest)

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
res.all <- performanceEstimation(DSs,c(Workflow(learner="lm",                      pre="knnImp",post="onlyPos"),
    workflowVariants(learner="rpartXse",
    learner.pars=list(se=c(0,0.5,1))),
    workflowVariants(learner="randomForest", pre="knnImp",
    learner.pars=list(ntree=c(200,500,700)))),
    EstimationTask(metrics="nmse",method=CV(nReps=5,nFolds=10))) 

friedman test to check significance of results of all models

bonferroni-dun test to test significant of baseline random forest

rankWorkflows(res.all, top=3)
## $a1
## $a1$nmse
##          Workflow  Estimate
## 1 randomForest.v1 0.5294489
## 2 randomForest.v2 0.5303601
## 3 randomForest.v3 0.5309474
## 
## 
## $a2
## $a2$nmse
##          Workflow  Estimate
## 1 randomForest.v2 0.7821793
## 2 randomForest.v3 0.7829396
## 3 randomForest.v1 0.7874964
## 
## 
## $a3
## $a3$nmse
##          Workflow  Estimate
## 1 randomForest.v2 0.9819369
## 2 randomForest.v3 0.9842245
## 3 randomForest.v1 0.9906091
## 
## 
## $a4
## $a4$nmse
##          Workflow Estimate
## 1     rpartXse.v2 1.000000
## 2     rpartXse.v3 1.000000
## 3 randomForest.v2 1.023628
## 
## 
## $a5
## $a5$nmse
##          Workflow  Estimate
## 1 randomForest.v3 0.8175258
## 2 randomForest.v2 0.8190284
## 3 randomForest.v1 0.8217082
## 
## 
## $a6
## $a6$nmse
##          Workflow  Estimate
## 1              lm 0.8719455
## 2 randomForest.v2 0.9152018
## 3 randomForest.v3 0.9166955
## 
## 
## $a7
## $a7$nmse
##          Workflow Estimate
## 1     rpartXse.v2 1.000000
## 2     rpartXse.v3 1.000000
## 3 randomForest.v2 1.039846
p <- pairedComparisons(res.all,baseline="randomForest.v3")
p$nmse$F.test
## $chi
## [1] 18.62755
## 
## $FF
## [1] 4.781925
## 
## $critVal
## [1] 0.6524015
## 
## $rejNull
## [1] TRUE
p$nmse$BonferroniDunn.test
## $critDif
## [1] 3.046397
## 
## $baseline
## [1] "randomForest.v3"
## 
## $rkDifs
##              lm     rpartXse.v1     rpartXse.v2     rpartXse.v3 randomForest.v1 
##       1.7142857       3.5714286       1.6428571       1.5000000       0.4285714 
## randomForest.v2 
##       0.8571429 
## 
## $signifDifs
##              lm     rpartXse.v1     rpartXse.v2     rpartXse.v3 randomForest.v1 
##           FALSE            TRUE           FALSE           FALSE           FALSE 
## randomForest.v2 
##           FALSE

CD diagram

CDdiagram.BD(p)

# predicting on test sample

wfs <- sapply(taskNames(res.all),
  function(t) topPerformer(res.all,metric="nmse",task=t))
wfs[["a1"]]
## Workflow Object:
##  Workflow ID       ::  randomForest.v1 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> ntree=200 
##       learner  -> randomForest 
##       pre  -> knnImp
full.test.algae <- cbind(test.algae, algae.sols)
pts <- array(dim = c(140,7,2),
     dimnames = list(1:140, paste0("a",1:7), c("trues","preds")))
 for(i in 1:7) {
     res <- runWorkflow(wfs[[i]],
     as.formula(paste(names(wfs)[i],"~.")),
     algae[,c(1:11,11+i)],
     full.test.algae[,c(1:11,11+i)])
     pts[,i,"trues"] <- res$trues
     pts[,i,"preds"] <- res$preds
  }

Test NMSE

avg.preds <- apply(algae[,12:18], 2, mean)
  apply((pts[,,"trues"] - pts[,,"preds"])^2, 2 ,sum) /
 apply( (scale(pts[,,"trues"], avg.preds, FALSE))^2, 2, sum)
##        a1        a2        a3        a4        a5        a6        a7 
## 0.4860505 0.8837355 0.8328648 1.0000000 0.7531270 0.8337472 1.0000000