Predicting Algae bloom

summary of descriptive statistics

library(tidyverse)

## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(DMwR2)

## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

data("algae")
algae <- as.tibble(algae)
str(algae)
summary(algae)

mxph histogram and qqplot

Based on qqplot, some observations can’t satisfy normality assumption

library(ggplot2)
theme_set(
  theme_minimal() +
    theme(legend.position = "top")
  )
p1 <- algae %>% ggplot() + aes(x=mxPH) + geom_histogram(aes(y=..density..)) + geom_density(color="red") + geom_rug() +
 ggtitle("The Histogram of mxPH (maximum pH)") + xlab("") + ylab("") 

p1

library(car)
qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")

## [1] 56 57

oPO4 boxplot

algae %>% ggplot(aes(x = factor(0),y = oPO4)) + geom_boxplot() + geom_rug() + geom_hline(aes(yintercept=mean(algae$oPO4, na.rm = TRUE)), linetype=2,colour="red") + ylab("Orthophosphate (oPO4)") + xlab("") + scale_x_discrete(breaks=NULL)

ordering nominal factors and algal a1 boxplot

library(forcats)
 algae <- mutate(algae, size=fct_relevel(size,c("small","medium","large")),
   speed=fct_relevel(speed,c("low","medium","high")),
   season=fct_relevel(season,c("spring","summer","autumn","winter")))

ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() + xlab("River Size") + ylab("Algal A1")

Missing Values identification

 filter(algae, !complete.cases(algae) )
 apply(algae, 1, function(x) sum(is.na(x)))
 manyNAs(algae, 0.2) # row locations with more than 20 percent unknown values
algae <- algae[-c(62,199),] # remove those rows


apply(algae, 2, function(x) sum(is.na(x))) # cl has 8 NA, and chla has 10 NA

filling unknown values

# correlation matrix

library(corrplot)

## corrplot 0.84 loaded

cm <- cor(algae[, 4:18], use = "complete.obs")
corrplot(cm, type="upper",tl.pos="d")
corrplot(cm, add=TRUE, type="lower", method="number",
          diag=FALSE, tl.pos="n", cl.pos="n")

fill the unknown of oPO4 using regression and others using knnimputation

lm(PO4~oPO4,data = algae) # PO4 = 1.293(oPO4) + 42.897
algae$PO4[28] <- 1.293*algae$oPO4[28] + 42.897
algae <- knnImputation(algae,k=10)

Predictive models

Multiple regression and decision tree

library(performanceEstimation)
DSs <- sapply(names(algae)[12:18], function(x,names.attrs) {
                                      f <- as.formula(paste(x, "~ ."))
                                      PredTask(f, algae[,c(names.attrs,x)], x,                                               copy=TRUE)
                                   },
              names(algae)[1:11])
res.all <- performanceEstimation(
 DSs,
 c(Workflow(learner="lm", pre="knnImp", post="onlyPos"),
 workflowVariants(learner="rpartXse", learner.pars=list(se=c(0,0.5,1)))),
 EstimationTask(metrics="nmse" ,method=CV(nReps=5, nFolds=10)))

plot(res.all)

topPerformers(res.all)

## $a1
##      Workflow Estimate
## nmse       lm     0.69
## 
## $a2
##      Workflow Estimate
## nmse       lm    0.987
## 
## $a3
##         Workflow Estimate
## nmse rpartXse.v3        1
## 
## $a4
##         Workflow Estimate
## nmse rpartXse.v2        1
## 
## $a5
##      Workflow Estimate
## nmse       lm    0.959
## 
## $a6
##      Workflow Estimate
## nmse       lm    0.872
## 
## $a7
##         Workflow Estimate
## nmse rpartXse.v2        1

ensemble model(Random forest)

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

res.all <- performanceEstimation(DSs,c(Workflow(learner="lm",                      pre="knnImp",post="onlyPos"),
    workflowVariants(learner="rpartXse",
    learner.pars=list(se=c(0,0.5,1))),
    workflowVariants(learner="randomForest", pre="knnImp",
    learner.pars=list(ntree=c(200,500,700)))),
    EstimationTask(metrics="nmse",method=CV(nReps=5,nFolds=10)))

friedman test to check significance of results of all models

bonferroni-dun test to test significant of baseline random forest

rankWorkflows(res.all, top=3)

## $a1
## $a1$nmse
##          Workflow  Estimate
## 1 randomForest.v1 0.5294489
## 2 randomForest.v2 0.5303601
## 3 randomForest.v3 0.5309474
## 
## 
## $a2
## $a2$nmse
##          Workflow  Estimate
## 1 randomForest.v2 0.7821793
## 2 randomForest.v3 0.7829396
## 3 randomForest.v1 0.7874964
## 
## 
## $a3
## $a3$nmse
##          Workflow  Estimate
## 1 randomForest.v2 0.9819369
## 2 randomForest.v3 0.9842245
## 3 randomForest.v1 0.9906091
## 
## 
## $a4
## $a4$nmse
##          Workflow Estimate
## 1     rpartXse.v2 1.000000
## 2     rpartXse.v3 1.000000
## 3 randomForest.v2 1.023628
## 
## 
## $a5
## $a5$nmse
##          Workflow  Estimate
## 1 randomForest.v3 0.8175258
## 2 randomForest.v2 0.8190284
## 3 randomForest.v1 0.8217082
## 
## 
## $a6
## $a6$nmse
##          Workflow  Estimate
## 1              lm 0.8719455
## 2 randomForest.v2 0.9152018
## 3 randomForest.v3 0.9166955
## 
## 
## $a7
## $a7$nmse
##          Workflow Estimate
## 1     rpartXse.v2 1.000000
## 2     rpartXse.v3 1.000000
## 3 randomForest.v2 1.039846

p <- pairedComparisons(res.all,baseline="randomForest.v3")
p$nmse$F.test

## $chi
## [1] 18.62755
## 
## $FF
## [1] 4.781925
## 
## $critVal
## [1] 0.6524015
## 
## $rejNull
## [1] TRUE

p$nmse$BonferroniDunn.test

## $critDif
## [1] 3.046397
## 
## $baseline
## [1] "randomForest.v3"
## 
## $rkDifs
##              lm     rpartXse.v1     rpartXse.v2     rpartXse.v3 randomForest.v1 
##       1.7142857       3.5714286       1.6428571       1.5000000       0.4285714 
## randomForest.v2 
##       0.8571429 
## 
## $signifDifs
##              lm     rpartXse.v1     rpartXse.v2     rpartXse.v3 randomForest.v1 
##           FALSE            TRUE           FALSE           FALSE           FALSE 
## randomForest.v2 
##           FALSE

CD diagram

CDdiagram.BD(p)

# predicting on test sample

wfs <- sapply(taskNames(res.all),
  function(t) topPerformer(res.all,metric="nmse",task=t))
wfs[["a1"]]

## Workflow Object:
##  Workflow ID       ::  randomForest.v1 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> ntree=200 
##       learner  -> randomForest 
##       pre  -> knnImp

full.test.algae <- cbind(test.algae, algae.sols)
pts <- array(dim = c(140,7,2),
     dimnames = list(1:140, paste0("a",1:7), c("trues","preds")))
 for(i in 1:7) {
     res <- runWorkflow(wfs[[i]],
     as.formula(paste(names(wfs)[i],"~.")),
     algae[,c(1:11,11+i)],
     full.test.algae[,c(1:11,11+i)])
     pts[,i,"trues"] <- res$trues
     pts[,i,"preds"] <- res$preds
  }

Test NMSE

avg.preds <- apply(algae[,12:18], 2, mean)
  apply((pts[,,"trues"] - pts[,,"preds"])^2, 2 ,sum) /
 apply( (scale(pts[,,"trues"], avg.preds, FALSE))^2, 2, sum)

##        a1        a2        a3        a4        a5        a6        a7 
## 0.4860505 0.8837355 0.8328648 1.0000000 0.7531270 0.8337472 1.0000000

algae

soroush

2/9/2020

Predicting Algae bloom

mxph histogram and qqplot

oPO4 boxplot

ordering nominal factors and algal a1 boxplot

Missing Values identification

filling unknown values

fill the unknown of oPO4 using regression and others using knnimputation

Predictive models

Multiple regression and decision tree

ensemble model(Random forest)

friedman test to check significance of results of all models

bonferroni-dun test to test significant of baseline random forest

CD diagram

Test NMSE