Predicting Algae Blooms

Loading the Data into R

# Load libraries including the DMwR2

library(DMwR2)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(grid)
library(forcats)
library(corrplot)

## corrplot 0.95 loaded

# Load the algae dataset from the DMwR2 package and print the dataset

data(algae, package="DMwR2")
algae

## # A tibble: 200 × 18
##    season size  speed   mxPH  mnO2    Cl    NO3   NH4  oPO4   PO4  Chla    a1
##    <fct>  <fct> <fct>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 winter small medium  8      9.8  60.8  6.24  578   105   170   50      0  
##  2 spring small medium  8.35   8    57.8  1.29  370   429.  559.   1.3    1.4
##  3 autumn small medium  8.1   11.4  40.0  5.33  347.  126.  187.  15.6    3.3
##  4 spring small medium  8.07   4.8  77.4  2.30   98.2  61.2 139.   1.4    3.1
##  5 autumn small medium  8.06   9    55.4 10.4   234.   58.2  97.6 10.5    9.2
##  6 winter small high    8.25  13.1  65.8  9.25  430    18.2  56.7 28.4   15.1
##  7 summer small high    8.15  10.3  73.2  1.54  110    61.2 112.   3.2    2.4
##  8 autumn small high    8.05  10.6  59.1  4.99  206.   44.7  77.4  6.9   18.2
##  9 winter small medium  8.7    3.4  22.0  0.886 103.   36.3  71    5.54  25.4
## 10 winter small high    7.93   9.9   8    1.39    5.8  27.2  46.6  0.8   17  
## # ℹ 190 more rows
## # ℹ 6 more variables: a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## #   a7 <dbl>

# load the data as a tibble (data frame)

tibble::as_tibble(algae)

## # A tibble: 200 × 18
##    season size  speed   mxPH  mnO2    Cl    NO3   NH4  oPO4   PO4  Chla    a1
##    <fct>  <fct> <fct>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 winter small medium  8      9.8  60.8  6.24  578   105   170   50      0  
##  2 spring small medium  8.35   8    57.8  1.29  370   429.  559.   1.3    1.4
##  3 autumn small medium  8.1   11.4  40.0  5.33  347.  126.  187.  15.6    3.3
##  4 spring small medium  8.07   4.8  77.4  2.30   98.2  61.2 139.   1.4    3.1
##  5 autumn small medium  8.06   9    55.4 10.4   234.   58.2  97.6 10.5    9.2
##  6 winter small high    8.25  13.1  65.8  9.25  430    18.2  56.7 28.4   15.1
##  7 summer small high    8.15  10.3  73.2  1.54  110    61.2 112.   3.2    2.4
##  8 autumn small high    8.05  10.6  59.1  4.99  206.   44.7  77.4  6.9   18.2
##  9 winter small medium  8.7    3.4  22.0  0.886 103.   36.3  71    5.54  25.4
## 10 winter small high    7.93   9.9   8    1.39    5.8  27.2  46.6  0.8   17  
## # ℹ 190 more rows
## # ℹ 6 more variables: a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## #   a7 <dbl>

# Use summary to give the basic statistics of the dataset including counts, min, max, median, mean, quartile and NAs

summary(algae)

##     season       size       speed         mxPH            mnO2       
##  autumn:40   large :45   high  :84   Min.   :5.600   Min.   : 1.500  
##  spring:53   medium:84   low   :33   1st Qu.:7.700   1st Qu.: 7.725  
##  summer:45   small :71   medium:83   Median :8.060   Median : 9.800  
##  winter:62                           Mean   :8.012   Mean   : 9.118  
##                                      3rd Qu.:8.400   3rd Qu.:10.800  
##                                      Max.   :9.700   Max.   :13.400  
##                                      NA's   :1       NA's   :2       
##        Cl               NO3              NH4                oPO4       
##  Min.   :  0.222   Min.   : 0.050   Min.   :    5.00   Min.   :  1.00  
##  1st Qu.: 10.981   1st Qu.: 1.296   1st Qu.:   38.33   1st Qu.: 15.70  
##  Median : 32.730   Median : 2.675   Median :  103.17   Median : 40.15  
##  Mean   : 43.636   Mean   : 3.282   Mean   :  501.30   Mean   : 73.59  
##  3rd Qu.: 57.824   3rd Qu.: 4.446   3rd Qu.:  226.95   3rd Qu.: 99.33  
##  Max.   :391.500   Max.   :45.650   Max.   :24064.00   Max.   :564.60  
##  NA's   :10        NA's   :2        NA's   :2          NA's   :2       
##       PO4              Chla               a1              a2        
##  Min.   :  1.00   Min.   :  0.200   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 41.38   1st Qu.:  2.000   1st Qu.: 1.50   1st Qu.: 0.000  
##  Median :103.29   Median :  5.475   Median : 6.95   Median : 3.000  
##  Mean   :137.88   Mean   : 13.971   Mean   :16.92   Mean   : 7.458  
##  3rd Qu.:213.75   3rd Qu.: 18.308   3rd Qu.:24.80   3rd Qu.:11.375  
##  Max.   :771.60   Max.   :110.456   Max.   :89.80   Max.   :72.600  
##  NA's   :2        NA's   :12                                        
##        a3               a4               a5               a6        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 1.550   Median : 0.000   Median : 1.900   Median : 0.000  
##  Mean   : 4.309   Mean   : 1.992   Mean   : 5.064   Mean   : 5.964  
##  3rd Qu.: 4.925   3rd Qu.: 2.400   3rd Qu.: 7.500   3rd Qu.: 6.925  
##  Max.   :42.800   Max.   :44.600   Max.   :44.400   Max.   :77.600  
##                                                                     
##        a7        
##  Min.   : 0.000  
##  1st Qu.: 0.000  
##  Median : 1.000  
##  Mean   : 2.495  
##  3rd Qu.: 2.400  
##  Max.   :31.600  
##

# Produce a histogram of the maximum pH (mxPH) scaled to density

ggplot(algae,aes(x=mxPH)) + geom_histogram(aes(y=..density..))

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Produce a histogram with a smooth density curve and rug plot which shows the individual points & a normal QQ plot

ggplot(algae,aes(x=mxPH)) + 
    geom_histogram(aes(y=..density..)) + 
    geom_density(color="red") + geom_rug() + 
    ggtitle("The Histogram of mxPH (maximum pH)") + 
    xlab("") + ylab("")

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")

## [1] 56 57

# Produce the enhanced histogram and QQ plot to be in the same frame

gh <- ggplot(algae,aes(x=mxPH)) + geom_histogram(aes(y=..density..)) + geom_density(color="red") + geom_rug() + ggtitle("The Histogram of mxPH (maximum pH)") + xlab("") + ylab("")
par(mfrow=c(1,2))
vpL <- viewport(height=unit(1, "npc"), width=unit(0.5, "npc"), 
                           just="left", 
                           y=0.5, x=0)
qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH')

## [1] 56 57

print(gh,vp=vpL)

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")

## [1] 56 57

# Shows a boxplot of orthophosphate with rug plot and red dotted line mean

ggplot(algae,aes(x=factor(0),y=oPO4)) + 
    geom_boxplot() + geom_rug() + 
    geom_hline(aes(yintercept=mean(algae$oPO4, na.rm = TRUE)),linetype=2,colour="red") +
    ylab("Orthophosphate (oPO4)") + xlab("") + scale_x_discrete(breaks=NULL)

## Warning: Use of `algae$oPO4` is discouraged.
## ℹ Use `oPO4` instead.

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Scatter plot of NH4 (ammonium) with mean, mean plus the standard deviation, and the median
# Helps identify the outlier plot point

plot(algae$NH4, xlab = "")
abline(h = mean(algae$NH4, na.rm = T), lty = 1)
abline(h = mean(algae$NH4, na.rm = T) + sd(algae$NH4, na.rm = T), lty = 2)
abline(h = median(algae$NH4, na.rm = T), lty = 3)
identify(algae$NH4)

## integer(0)

# clean scatter plot of NH4, this helps identify the outlier plot point 

plot(algae$NH4, xlab = "")
clickedRows <- identify(algae$NH4)

algae[clickedRows, ]

## # A tibble: 0 × 18
## # ℹ 18 variables: season <fct>, size <fct>, speed <fct>, mxPH <dbl>,
## #   mnO2 <dbl>, Cl <dbl>, NO3 <dbl>, NH4 <dbl>, oPO4 <dbl>, PO4 <dbl>,
## #   Chla <dbl>, a1 <dbl>, a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## #   a7 <dbl>

# using the filter function to find all the NH4 values over 19000 and identify the outlier

filter(algae, NH4 > 19000)

## # A tibble: 1 × 18
##   season size  speed  mxPH  mnO2    Cl   NO3   NH4  oPO4   PO4  Chla    a1    a2
##   <fct>  <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 autumn medi… high    7.3  11.8  44.2  45.6 24064    44    34  53.1   2.2     0
## # ℹ 5 more variables: a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>, a7 <dbl>

# produce a boxplot that compares algae A1 by river size

ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() +
    xlab("River Size") + ylab("Algal A1")

# reorder the factors in a more meaningful way

algae <- mutate(algae,
                size=fct_relevel(size,c("small","medium","large")),
                speed=fct_relevel(speed,c("low","medium","high")),
                season=fct_relevel(season,c("spring","summer","autumn","winter")))

# produce boxplot of reordered factors

ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() +
    xlab("River Size") + ylab("Algal A1")

# produce a violin plot of algal A1 with the individual points

ggplot(algae,aes(x=size,y=a1)) + geom_violin() + geom_jitter() + xlab("River Size") + ylab("Algal A1")

# produce a faceted grid of of Algal A3 with quartile cut minimum O2 levels (with NAs filtered out) according to season

data2graph <- filter(algae,!is.na(mnO2)) %>%
    mutate(minO2=cut(mnO2, 
                     quantile(mnO2,c(0,0.25,0.5,0.75,1)), 
                     include.lowest=TRUE))
ggplot(data2graph,aes(x=a3,y=season,col=season)) + geom_point() + facet_wrap(~ minO2) + guides(color=FALSE)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Removing the Observations with Unknown Values

# identifies rows that are NOT complete across the dataframe

filter(algae, !complete.cases(algae) )

## # A tibble: 16 × 18
##    season size   speed  mxPH  mnO2     Cl    NO3   NH4   oPO4    PO4  Chla    a1
##    <fct>  <fct>  <fct> <dbl> <dbl>  <dbl>  <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl>
##  1 autumn small  high   6.8   11.1  9      0.63     20   4     NA      2.7  30.3
##  2 spring small  high   8     NA    1.45   0.81     10   2.5    3      0.3  75.8
##  3 winter small  low   NA     12.6  9      0.23     10   5      6      1.1  35.5
##  4 winter small  high   6.6   10.8 NA      3.24     10   1      6.5   NA    24.3
##  5 spring small  medi…  5.6   11.8 NA      2.22      5   1      1     NA    82.7
##  6 autumn small  medi…  5.7   10.8 NA      2.55     10   1      4     NA    16.8
##  7 spring small  high   6.6    9.5 NA      1.32     20   1      6     NA    46.8
##  8 summer small  high   6.6   10.8 NA      2.64     10   2     11     NA    46.9
##  9 autumn small  medi…  6.6   11.3 NA      4.17     10   1      6     NA    47.1
## 10 spring small  medi…  6.5   10.4 NA      5.97     10   2     14     NA    66.9
## 11 summer small  medi…  6.4   NA   NA     NA        NA  NA     14     NA    19.4
## 12 autumn small  high   7.83  11.7  4.08   1.33     18   3.33   6.67  NA    14.4
## 13 winter medium high   9.7   10.8  0.222  0.406    10  22.4   10.1   NA    41  
## 14 spring large  low    9      5.8 NA      0.9     142 102    186     68.0   1.7
## 15 winter large  high   8     10.9  9.06   0.825    40  21.1   56.1   NA    16.8
## 16 winter large  medi…  8      7.6 NA     NA        NA  NA     NA     NA     0  
## # ℹ 6 more variables: a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## #   a7 <dbl>

# count the number of NAs in each row of dataframe

apply(algae, 1, function(x) sum(is.na(x)))

##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
##  [38] 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 2 2 2 2 2 6 1 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0

# identify rows with more than 20% NAs

data(algae, package="DMwR2")
manyNAs(algae, 0.2)

## [1]  62 199

# remove the rows with more than 20% NA

algae <- algae[-manyNAs(algae), ]

Filling in the Unknowns with the Most Frequent Values

# use the mean of the max pH (mxPH) column to fill in unknown values in mxPH column

data(algae, package="DMwR2")
algae[48, "mxPH"] <- mean(algae$mxPH, na.rm = TRUE)

# use the median of the chlorophyll (Chla) column to fill in unknown values in the Chla column

data(algae, package="DMwR2")
algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = TRUE)

# fill in all unknown values using statistics of centrality (median for numerical columns and mode for nominal variables)

data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
algae <- centralImputation(algae)

Filling in the Unknown Values by Exploring Correlations

# obtains variable correlation with cor() & complete.obs tells program to disregard the NAs

data(algae, package="DMwR2")
cor(algae[, 4:18], use = "complete.obs")

##             mxPH        mnO2          Cl         NO3         NH4         oPO4
## mxPH  1.00000000 -0.10269374  0.14709539 -0.17213024 -0.15429757  0.090229085
## mnO2 -0.10269374  1.00000000 -0.26324536  0.11790769 -0.07826816 -0.393752688
## Cl    0.14709539 -0.26324536  1.00000000  0.21095831  0.06598336  0.379255958
## NO3  -0.17213024  0.11790769  0.21095831  1.00000000  0.72467766  0.133014517
## NH4  -0.15429757 -0.07826816  0.06598336  0.72467766  1.00000000  0.219311206
## oPO4  0.09022909 -0.39375269  0.37925596  0.13301452  0.21931121  1.000000000
## PO4   0.10132957 -0.46396073  0.44519118  0.15702971  0.19939575  0.911964602
## Chla  0.43182377 -0.13121671  0.14295776  0.14549290  0.09120406  0.106914784
## a1   -0.16262986  0.24998372 -0.35923946 -0.24723921 -0.12360578 -0.394574479
## a2    0.33501740 -0.06848199  0.07845402  0.01997079 -0.03790296  0.123811068
## a3   -0.02716034 -0.23522831  0.07653027 -0.09182236 -0.11290467  0.005704557
## a4   -0.18435348 -0.37982999  0.14147281 -0.01448875  0.27452000  0.382481433
## a5   -0.10731230  0.21001174  0.14534877  0.21213579  0.01544458  0.122027482
## a6   -0.17273795  0.18862656  0.16904394  0.54404455  0.40119275  0.003340366
## a7   -0.17027088 -0.10455106 -0.04494524  0.07505030 -0.02539279  0.026150420
##              PO4        Chla          a1           a2           a3          a4
## mxPH  0.10132957  0.43182377 -0.16262986  0.335017401 -0.027160336 -0.18435348
## mnO2 -0.46396073 -0.13121671  0.24998372 -0.068481989 -0.235228307 -0.37982999
## Cl    0.44519118  0.14295776 -0.35923946  0.078454019  0.076530269  0.14147281
## NO3   0.15702971  0.14549290 -0.24723921  0.019970786 -0.091822358 -0.01448875
## NH4   0.19939575  0.09120406 -0.12360578 -0.037902958 -0.112904666  0.27452000
## oPO4  0.91196460  0.10691478 -0.39457448  0.123811068  0.005704557  0.38248143
## PO4   1.00000000  0.24849223 -0.45816781  0.132667891  0.032193981  0.40883951
## Chla  0.24849223  1.00000000 -0.26601088  0.366724647 -0.063301128 -0.08600540
## a1   -0.45816781 -0.26601088  1.00000000 -0.262665485 -0.108177581 -0.09338072
## a2    0.13266789  0.36672465 -0.26266549  1.000000000  0.009759915 -0.17628704
## a3    0.03219398 -0.06330113 -0.10817758  0.009759915  1.000000000  0.03336910
## a4    0.40883951 -0.08600540 -0.09338072 -0.176287038  0.033369102  1.00000000
## a5    0.15548900 -0.07342837 -0.26972709 -0.186758940 -0.141610948 -0.10131827
## a6    0.05320294  0.01032550 -0.26156402 -0.133518480 -0.196900051 -0.08488426
## a7    0.07978353  0.01760782 -0.19306384  0.036206205  0.039060248  0.07114638
##               a5           a6          a7
## mxPH -0.10731230 -0.172737947 -0.17027088
## mnO2  0.21001174  0.188626555 -0.10455106
## Cl    0.14534877  0.169043945 -0.04494524
## NO3   0.21213579  0.544044553  0.07505030
## NH4   0.01544458  0.401192749 -0.02539279
## oPO4  0.12202748  0.003340366  0.02615042
## PO4   0.15548900  0.053202942  0.07978353
## Chla -0.07342837  0.010325497  0.01760782
## a1   -0.26972709 -0.261564023 -0.19306384
## a2   -0.18675894 -0.133518480  0.03620621
## a3   -0.14161095 -0.196900051  0.03906025
## a4   -0.10131827 -0.084884259  0.07114638
## a5    1.00000000  0.388608955 -0.05149346
## a6    0.38860896  1.000000000 -0.03033428
## a7   -0.05149346 -0.030334277  1.00000000

#symnum() produces correlation matrix with symbolic representation where legend shows which symbol represents the strength of correlation (e.g. orthophosphate is about 90% correlated with phosphate)

symnum(cor(algae[,4:18],use="complete.obs"))

##      mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1                                         
## mnO2    1                                      
## Cl         1                                   
## NO3           1                                
## NH4           ,  1                             
## oPO4    .  .        1                          
## PO4     .  .        * 1                        
## Chla .                  1                      
## a1         .        . .    1                   
## a2   .                  .     1                
## a3                               1             
## a4      .           . .             1          
## a5                                     1       
## a6            .  .                     .  1    
## a7                                           1 
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1

# produces a colorful correlation matrix

cm <- cor(algae[,4:18],use="complete.obs")
corrplot(cm,type="upper",tl.pos="d",tl.cex=0.75)
corrplot(cm,add=TRUE, type="lower", method="number",tl.cex=0.75, diag=FALSE,tl.pos="n", cl.pos="n")

# obtain information for linear model Y = β0 + β1X1 +...+βnXn -> PO4 = 42.897 + 1.293 x oPO4

data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
lm(PO4 ~ oPO4, data = algae)

## 
## Call:
## lm(formula = PO4 ~ oPO4, data = algae)
## 
## Coefficients:
## (Intercept)         oPO4  
##      42.897        1.293

# fill in the unknown values of PO4 column using our lm formula

algae[28, "PO4"] <- 42.897 + 1.293 * algae[28, "oPO4"]

# remove the manyNAs rows and use our function that fills in PO4 value given the oPO4 value

data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
fillPO4 <- function(oP) ifelse(is.na(oP),NA,42.897 + 1.293 * oP)
algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4), "oPO4"], fillPO4)

# reorder data in meaningful way then produce histogram of mxPH for the four seasons

algae <- mutate(algae,
                size=fct_relevel(size,c("small","medium","large")),
                speed=fct_relevel(speed,c("low","medium","high")),
                season=fct_relevel(season,c("spring","summer","autumn","winter")))
ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) + facet_wrap(~ season)

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# produces histogram of mxPH by river size and speed

ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) + 
    facet_wrap(size ~ speed)

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# produces multifaceted scatterplot that breaks up the facets according to speed and looks at mxPH with size on the y-axis. geom_jitter height=0.4 moves points 0.4 units to prevent overlapping points when points share same size value

ggplot(algae, aes(x=mxPH, y=size, color=size)) + geom_point() + facet_wrap(~speed) + geom_jitter(height = 0.4)

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Filling in the Unknown Values by Exploring Similarities between Cases

# prepare for KNN imputation by reloading the dataset and removing the manyNA rows

data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]

# fills in a missing value using the k-nearest neighbor (KNN) by using the 10 most similar rows based on the other variables

algae <- knnImputation(algae, k = 10)

# fills in a missing value using the k-nearest neighbor (KNN) by looking at the 10 most similar rows based on the other variables and fills in the missing value using the median of the neighbors

data(algae, package="DMwR2")
algae <- knnImputation(algae, k = 10, meth = "median")

Multiple Linear Regression

# prepare the data for linear regression by reloadng data, removing the manyNA rows. Create a clean.algae data frame by imputing data using knn so there are no NAs

data(algae, package="DMwR2")
algae <-  algae[-manyNAs(algae), ]
clean.algae <- knnImputation(algae, k = 10)

# obtain linear regression model for predicting frequency of algal 1 where '.' means using all predictors to predict a1 (i.e. a1 as a function of all variables)

lm.a1 <- lm(a1 ~ ., data = clean.algae[, 1:12])

# obtains summary of linear model (shows coefficients, significance, etc.)

summary(lm.a1)

## 
## Call:
## lm(formula = a1 ~ ., data = clean.algae[, 1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.679 -11.893  -2.567   7.410  62.190 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  42.942055  24.010879   1.788  0.07537 . 
## seasonspring  3.726978   4.137741   0.901  0.36892   
## seasonsummer  0.747597   4.020711   0.186  0.85270   
## seasonwinter  3.692955   3.865391   0.955  0.34065   
## sizemedium    3.263728   3.802051   0.858  0.39179   
## sizesmall     9.682140   4.179971   2.316  0.02166 * 
## speedlow      3.922084   4.706315   0.833  0.40573   
## speedmedium   0.246764   3.241874   0.076  0.93941   
## mxPH         -3.589118   2.703528  -1.328  0.18598   
## mnO2          1.052636   0.705018   1.493  0.13715   
## Cl           -0.040172   0.033661  -1.193  0.23426   
## NO3          -1.511235   0.551339  -2.741  0.00674 **
## NH4           0.001634   0.001003   1.628  0.10516   
## oPO4         -0.005435   0.039884  -0.136  0.89177   
## PO4          -0.052241   0.030755  -1.699  0.09109 . 
## Chla         -0.088022   0.079998  -1.100  0.27265   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.65 on 182 degrees of freedom
## Multiple R-squared:  0.3731, Adjusted R-squared:  0.3215 
## F-statistic: 7.223 on 15 and 182 DF,  p-value: 2.444e-12

NOTE: adjusted R-squared (proportion of variance explained by the model) is 32.15%

# simplify the linear model by using anova() for model variance analysis

anova(lm.a1)

## Analysis of Variance Table
## 
## Response: a1
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## season      3     85    28.2  0.0905 0.9651944    
## size        2  11401  5700.7 18.3088  5.69e-08 ***
## speed       2   3934  1967.2  6.3179 0.0022244 ** 
## mxPH        1   1329  1328.8  4.2677 0.0402613 *  
## mnO2        1   2287  2286.8  7.3444 0.0073705 ** 
## Cl          1   4304  4304.3 13.8239 0.0002671 ***
## NO3         1   3418  3418.5 10.9789 0.0011118 ** 
## NH4         1    404   403.6  1.2963 0.2563847    
## oPO4        1   4788  4788.0 15.3774 0.0001246 ***
## PO4         1   1406  1405.6  4.5142 0.0349635 *  
## Chla        1    377   377.0  1.2107 0.2726544    
## Residuals 182  56668   311.4                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# remove season as it least contributes to reduction of fitting error of model (largest p-value in anova)

lm2.a1 <- update(lm.a1, . ~ . - season)

# obtain summary of the linear model with season removed

summary(lm2.a1)

## 
## Call:
## lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + 
##     oPO4 + PO4 + Chla, data = clean.algae[, 1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.460 -11.953  -3.044   7.444  63.730 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 44.9532874 23.2378377   1.934  0.05458 . 
## sizemedium   3.3092102  3.7825221   0.875  0.38278   
## sizesmall   10.2730961  4.1223163   2.492  0.01358 * 
## speedlow     3.0546270  4.6108069   0.662  0.50848   
## speedmedium -0.2976867  3.1818585  -0.094  0.92556   
## mxPH        -3.2684281  2.6576592  -1.230  0.22033   
## mnO2         0.8011759  0.6589644   1.216  0.22561   
## Cl          -0.0381881  0.0333791  -1.144  0.25407   
## NO3         -1.5334300  0.5476550  -2.800  0.00565 **
## NH4          0.0015777  0.0009951   1.586  0.11456   
## oPO4        -0.0062392  0.0395086  -0.158  0.87469   
## PO4         -0.0509543  0.0305189  -1.670  0.09669 . 
## Chla        -0.0841371  0.0794459  -1.059  0.29096   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.57 on 185 degrees of freedom
## Multiple R-squared:  0.3682, Adjusted R-squared:  0.3272 
## F-statistic: 8.984 on 12 and 185 DF,  p-value: 1.762e-13

NOTE: fit has improved only slightly to ~32.7%

#compare models lm.a1 and lm2.a1 with anova function. Look at F-test to assess significance of differences

anova(lm.a1,lm2.a1)

## Analysis of Variance Table
## 
## Model 1: a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + 
##     PO4 + Chla
## Model 2: a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + 
##     Chla
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    182 56668                           
## 2    185 57116 -3   -447.62 0.4792 0.6971

only about 30% confidence that they are different

# obtain linear model that results from applying backwards elimination method to lm.a1

final.lm <- step(lm.a1)

## Start:  AIC=1152.03
## a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + 
##     PO4 + Chla
## 
##          Df Sum of Sq   RSS    AIC
## - season  3    447.62 57116 1147.6
## - speed   2    269.60 56938 1149.0
## - oPO4    1      5.78 56674 1150.0
## - Chla    1    376.96 57045 1151.3
## - Cl      1    443.46 57112 1151.6
## - mxPH    1    548.76 57217 1151.9
## <none>                56668 1152.0
## - mnO2    1    694.11 57363 1152.4
## - NH4     1    825.67 57494 1152.9
## - PO4     1    898.42 57567 1153.1
## - size    2   1857.16 58526 1154.4
## - NO3     1   2339.36 59008 1158.0
## 
## Step:  AIC=1147.59
## a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + 
##     Chla
## 
##         Df Sum of Sq   RSS    AIC
## - speed  2    210.64 57327 1144.3
## - oPO4   1      7.70 57124 1145.6
## - Chla   1    346.27 57462 1146.8
## - Cl     1    404.10 57520 1147.0
## - mnO2   1    456.37 57572 1147.2
## - mxPH   1    466.95 57583 1147.2
## <none>               57116 1147.6
## - NH4    1    776.11 57892 1148.3
## - PO4    1    860.62 57977 1148.5
## - size   2   2175.59 59292 1151.0
## - NO3    1   2420.47 59537 1153.8
## 
## Step:  AIC=1144.31
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## - oPO4  1     16.29 57343 1142.4
## - Chla  1    223.29 57550 1143.1
## - mnO2  1    413.77 57740 1143.7
## - Cl    1    472.70 57799 1143.9
## - mxPH  1    483.56 57810 1144.0
## <none>              57327 1144.3
## - NH4   1    720.19 58047 1144.8
## - PO4   1    809.30 58136 1145.1
## - size  2   2060.95 59388 1147.3
## - NO3   1   2379.75 59706 1150.4
## 
## Step:  AIC=1142.37
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## - Chla  1     207.7 57551 1141.1
## - mnO2  1     402.6 57746 1141.8
## - Cl    1     470.7 57814 1142.0
## - mxPH  1     519.7 57863 1142.2
## <none>              57343 1142.4
## - NH4   1     704.4 58047 1142.8
## - size  2    2050.3 59393 1145.3
## - NO3   1    2370.4 59713 1148.4
## - PO4   1    5818.4 63161 1159.5
## 
## Step:  AIC=1141.09
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4
## 
##        Df Sum of Sq   RSS    AIC
## - mnO2  1     435.3 57986 1140.6
## - Cl    1     438.1 57989 1140.6
## <none>              57551 1141.1
## - NH4   1     746.9 58298 1141.6
## - mxPH  1     833.1 58384 1141.9
## - size  2    2217.5 59768 1144.6
## - NO3   1    2667.1 60218 1148.1
## - PO4   1    6309.7 63860 1159.7
## 
## Step:  AIC=1140.58
## a1 ~ size + mxPH + Cl + NO3 + NH4 + PO4
## 
##        Df Sum of Sq   RSS    AIC
## - NH4   1     531.0 58517 1140.4
## - Cl    1     584.9 58571 1140.6
## <none>              57986 1140.6
## - mxPH  1     819.1 58805 1141.4
## - size  2    2478.2 60464 1144.9
## - NO3   1    2251.4 60237 1146.1
## - PO4   1    9097.9 67084 1167.4
## 
## Step:  AIC=1140.38
## a1 ~ size + mxPH + Cl + NO3 + PO4
## 
##        Df Sum of Sq   RSS    AIC
## <none>              58517 1140.4
## - mxPH  1     784.1 59301 1141.0
## - Cl    1     835.6 59353 1141.2
## - NO3   1    1987.9 60505 1145.0
## - size  2    2664.3 61181 1145.2
## - PO4   1    8575.8 67093 1165.5

# obtain summary of the final model

summary(final.lm)

## 
## Call:
## lm(formula = a1 ~ size + mxPH + Cl + NO3 + PO4, data = clean.algae[, 
##     1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.874 -12.732  -3.741   8.424  62.926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 57.28555   20.96132   2.733  0.00687 ** 
## sizemedium   2.80050    3.40190   0.823  0.41141    
## sizesmall   10.40636    3.82243   2.722  0.00708 ** 
## mxPH        -3.97076    2.48204  -1.600  0.11130    
## Cl          -0.05227    0.03165  -1.651  0.10028    
## NO3         -0.89529    0.35148  -2.547  0.01165 *  
## PO4         -0.05911    0.01117  -5.291 3.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.5 on 191 degrees of freedom
## Multiple R-squared:  0.3527, Adjusted R-squared:  0.3324 
## F-statistic: 17.35 on 6 and 191 DF,  p-value: 5.554e-16

NOTE: proportion explained by the model still low. Sign that linearity assumptions of model not adequate for the domain

Regression Trees

#rpart library used for regression trees. Prepare the data by reloadng data, removing the manyNA rows. Do not need imputation as model can handle missing values. rpart() function builds decision tree model

library(rpart)
data(algae, package="DMwR2") 
algae <- algae[-manyNAs(algae), ]
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])

# shows content of rt.a1 object. Note that not all variables appear as the model selects the more relevant variables

rt.a1

## n= 198 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 198 90401.290 16.996460  
##    2) PO4>=43.818 147 31279.120  8.979592  
##      4) Cl>=7.8065 140 21622.830  7.492857  
##        8) oPO4>=51.118 84  3441.149  3.846429 *
##        9) oPO4< 51.118 56 15389.430 12.962500  
##         18) mnO2>=10.05 24  1248.673  6.716667 *
##         19) mnO2< 10.05 32 12502.320 17.646870  
##           38) NO3>=3.1875 9   257.080  7.866667 *
##           39) NO3< 3.1875 23 11047.500 21.473910  
##             78) mnO2< 8 13  2919.549 13.807690 *
##             79) mnO2>=8 10  6370.704 31.440000 *
##      5) Cl< 7.8065 7  3157.769 38.714290 *
##    3) PO4< 43.818 51 22442.760 40.103920  
##      6) mxPH< 7.87 28 11452.770 33.450000  
##       12) mxPH>=7.045 18  5146.169 26.394440 *
##       13) mxPH< 7.045 10  3797.645 46.150000 *
##      7) mxPH>=7.87 23  8241.110 48.204350  
##       14) PO4>=15.177 12  3047.517 38.183330 *
##       15) PO4< 15.177 11  2673.945 59.136360 *

# use use prp() function from rpart.plot to visualize the decision tree

library(rpart.plot)
prp(rt.a1,extra=101,box.col="orange",split.box.col="grey")

# prints the complexity parameter table which shows how tree's accuracy changes as it's simplified to pick best tree size and prevent overfitting

printcp(rt.a1)

## 
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
## 
## Variables actually used in tree construction:
## [1] Cl   mnO2 mxPH NO3  oPO4 PO4 
## 
## Root node error: 90401/198 = 456.57
## 
## n= 198 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.405740      0   1.00000 1.02289 0.13176
## 2 0.071885      1   0.59426 0.73783 0.11484
## 3 0.030887      2   0.52237 0.70321 0.11273
## 4 0.030408      3   0.49149 0.70910 0.11532
## 5 0.027872      4   0.46108 0.70175 0.11520
## 6 0.027754      5   0.43321 0.68693 0.11569
## 7 0.018124      6   0.40545 0.65404 0.10605
## 8 0.016344      7   0.38733 0.68903 0.10884
## 9 0.010000      9   0.35464 0.70651 0.11143

# can use 1-SE rule: cross-validation error estimates (xerror) + standard deviation (xstd). e.g. Smallest tree with error less than 0.59452+0.10083=0.69535 is 2. If we want this tree over the one suggested by program, can prune rt.a1 to cp = 0.08

rt2.a1 <- prune(rt.a1, cp = 0.08)
rt2.a1

## n= 198 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 198 90401.29 16.996460  
##   2) PO4>=43.818 147 31279.12  8.979592 *
##   3) PO4< 43.818 51 22442.76 40.103920 *

# rpartXse automates the previous process 

(rt.a1 <- rpartXse(a1 ~ ., data = algae[, 1:12]))

## n= 198 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 198 90401.29 16.996460  
##   2) PO4>=43.818 147 31279.12  8.979592 *
##   3) PO4< 43.818 51 22442.76 40.103920 *

# interactive manual pruning tree, indicates number of nodes at which you want to prune tree, prune tree with snip.rpart() function

first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
snip.rpart(first.tree, c(4, 7))

## n= 198 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 198 90401.290 16.996460  
##    2) PO4>=43.818 147 31279.120  8.979592  
##      4) Cl>=7.8065 140 21622.830  7.492857 *
##      5) Cl< 7.8065 7  3157.769 38.714290 *
##    3) PO4< 43.818 51 22442.760 40.103920  
##      6) mxPH< 7.87 28 11452.770 33.450000  
##       12) mxPH>=7.045 18  5146.169 26.394440 *
##       13) mxPH< 7.045 10  3797.645 46.150000 *
##      7) mxPH>=7.87 23  8241.110 48.204350 *

# produces way to prune tree graphically

plot(first.tree)
text(first.tree)
snip.rpart(first.tree)

## n= 198 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 198 90401.290 16.996460  
##    2) PO4>=43.818 147 31279.120  8.979592  
##      4) Cl>=7.8065 140 21622.830  7.492857  
##        8) oPO4>=51.118 84  3441.149  3.846429 *
##        9) oPO4< 51.118 56 15389.430 12.962500  
##         18) mnO2>=10.05 24  1248.673  6.716667 *
##         19) mnO2< 10.05 32 12502.320 17.646870  
##           38) NO3>=3.1875 9   257.080  7.866667 *
##           39) NO3< 3.1875 23 11047.500 21.473910  
##             78) mnO2< 8 13  2919.549 13.807690 *
##             79) mnO2>=8 10  6370.704 31.440000 *
##      5) Cl< 7.8065 7  3157.769 38.714290 *
##    3) PO4< 43.818 51 22442.760 40.103920  
##      6) mxPH< 7.87 28 11452.770 33.450000  
##       12) mxPH>=7.045 18  5146.169 26.394440 *
##       13) mxPH< 7.045 10  3797.645 46.150000 *
##      7) mxPH>=7.87 23  8241.110 48.204350  
##       14) PO4>=15.177 12  3047.517 38.183330 *
##       15) PO4< 15.177 11  2673.945 59.136360 *

Model Evaluation and Selection

# generate predicted values using predict() function 

lm.predictions.a1 <- predict(final.lm, clean.algae)
rt.predictions.a1 <- predict(rt.a1, algae)

# calculate the mean absolute error for linear model and regression tree model

(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[["a1"]])))

## [1] 13.10681

(mae.a1.rt <- mean(abs(rt.predictions.a1 - algae[["a1"]])))

## [1] 11.61717

# calculate the mean squared error for both models

(mse.a1.lm <- mean((lm.predictions.a1 - algae[["a1"]])^2))

## [1] 295.5407

(mse.a1.rt <- mean((rt.predictions.a1 - algae[["a1"]])^2))

## [1] 271.3226

# calculate the normalized mean squared error for both models

(nmse.a1.lm <- mean((lm.predictions.a1-algae[['a1']])^2)/
               mean((mean(algae[['a1']])-algae[['a1']])^2))

## [1] 0.6473034

(nmse.a1.rt <- mean((rt.predictions.a1-algae[['a1']])^2)/
               mean((mean(algae[['a1']])-algae[['a1']])^2))

## [1] 0.5942601

# visualize the predictions with errors scatterplot

dg <- data.frame(lm.a1=lm.predictions.a1,
                 rt.a1=rt.predictions.a1,
                 true.a1=algae[["a1"]])
ggplot(dg,aes(x=lm.a1,y=true.a1)) +
    geom_point() + geom_abline(slope=1,intercept=0,color="red") +
    ggtitle("Linear Model")

ggplot(dg,aes(x=rt.a1,y=true.a1)) +
    geom_point() + geom_abline(slope=1,intercept=0,color="red") +
    ggtitle("Regression Tree")

# use identify to check sample number where bad prediction is made (poor performance)

plot(lm.predictions.a1,algae[['a1']],main="Linear Model",
     xlab="Predictions",ylab="True Values")
abline(0,1,col="red")
algae[identify(lm.predictions.a1,algae[['a1']]),]

## # A tibble: 0 × 18
## # ℹ 18 variables: season <fct>, size <fct>, speed <fct>, mxPH <dbl>,
## #   mnO2 <dbl>, Cl <dbl>, NO3 <dbl>, NH4 <dbl>, oPO4 <dbl>, PO4 <dbl>,
## #   Chla <dbl>, a1 <dbl>, a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## #   a7 <dbl>

# use knowledge that algae frequency can't be less than zero to make minimum value 0 and thus improve linear model

sensible.lm.predictions.a1 <- ifelse(lm.predictions.a1 < 0, 0, lm.predictions.a1)
(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[["a1"]])))

## [1] 13.10681

(smae.a1.lm <- mean(abs(sensible.lm.predictions.a1 - algae[["a1"]])))

## [1] 12.48276

# carry out performance estimation experiment using cross validation. This is used to compare the linear model with three different pruning levels of regression tree on dataset. Uses 5 repetitions of 10-fold cross validation process. Using normalized mean squared error for evaluation here.

library(performanceEstimation)
res <- performanceEstimation(
    PredTask(a1 ~ ., algae[, 1:12], "a1"),
    c(Workflow(learner="lm",pre="knnImp",post="onlyPos"),
      workflowVariants(learner="rpartXse",learner.pars=list(se=c(0,0.5,1)))),
    EstimationTask(metrics="nmse",method=CV(nReps=5,nFolds=10))
)

## 
## 
## ##### PERFORMANCE ESTIMATION USING  CROSS VALIDATION  #####
## 
## ** PREDICTIVE TASK :: a1
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************

# obtain summary of the results of comparison

summary(res)

## 
## == Summary of a  Cross Validation Performance Estimation Experiment ==
## 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## 
## * Predictive Tasks ::  a1
## * Workflows  ::  lm, rpartXse.v1, rpartXse.v2, rpartXse.v3 
## 
## -> Task:  a1
##   *Workflow: lm 
##              nmse
## avg     0.6899530
## std     0.1538314
## med     0.6693373
## iqr     0.1831528
## min     0.4062558
## max     1.0801633
## invalid 0.0000000
## 
##   *Workflow: rpartXse.v1 
##              nmse
## avg     0.5972440
## std     0.2537998
## med     0.5202799
## iqr     0.3615993
## min     0.2906485
## max     1.4125642
## invalid 0.0000000
## 
##   *Workflow: rpartXse.v2 
##              nmse
## avg     0.6303191
## std     0.2793589
## med     0.5843170
## iqr     0.3081521
## min     0.2712879
## max     1.2908643
## invalid 0.0000000
## 
##   *Workflow: rpartXse.v3 
##              nmse
## avg     0.6708279
## std     0.2697628
## med     0.6256652
## iqr     0.3189446
## min     0.2712879
## max     1.2908643
## invalid 0.0000000

# box plot visualization of results

plot(res)

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
##   Please report the issue at
##   <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# get specific parameter settings corresponding to labels

getWorkflow("rpartXse.v1", res)

## Workflow Object:
##  Workflow ID       ::  rpartXse.v1 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> se=0 
##       learner  -> rpartXse

# comparison experiment for all seven algae

DSs <- sapply(names(algae)[12:18],
          function(x,names.attrs) { 
            f <- as.formula(paste(x, "~ ."))
            PredTask(f, algae[,c(names.attrs,x)], x, copy=TRUE) 
          },
          names(algae)[1:11])
res.all <- performanceEstimation(
    DSs,
    c(Workflow(learner="lm", pre="knnImp", post="onlyPos"),
      workflowVariants(learner="rpartXse", learner.pars=list(se=c(0,0.5,1)))),
    EstimationTask(metrics="nmse" ,method=CV(nReps=5, nFolds=10)))

## 
## 
## ##### PERFORMANCE ESTIMATION USING  CROSS VALIDATION  #####
## 
## ** PREDICTIVE TASK :: a1
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a2
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a3
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a4
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a5
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a6
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a7
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************

# plot of cross-validation results for all algae

plot(res.all)

# produces best model for each problem

topPerformers(res.all)

## $a1
##         Workflow Estimate
## nmse rpartXse.v1    0.597
## 
## $a2
##      Workflow Estimate
## nmse       lm    0.973
## 
## $a3
##         Workflow Estimate
## nmse rpartXse.v3        1
## 
## $a4
##         Workflow Estimate
## nmse rpartXse.v2        1
## 
## $a5
##      Workflow Estimate
## nmse       lm    0.906
## 
## $a6
##      Workflow Estimate
## nmse       lm    0.866
## 
## $a7
##         Workflow Estimate
## nmse rpartXse.v2        1

# use library randomForest (an ensemble approach) and repeat the cross-validation experiment using random forest model, linear model, and regression tree model

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

res.all <- performanceEstimation(
    DSs,
    c(Workflow(learner="lm", pre="knnImp",post="onlyPos"),
      workflowVariants(learner="rpartXse",
                       learner.pars=list(se=c(0,0.5,1))),
      workflowVariants(learner="randomForest", pre="knnImp",
                       learner.pars=list(ntree=c(200,500,700)))),
    EstimationTask(metrics="nmse",method=CV(nReps=5,nFolds=10)))

## 
## 
## ##### PERFORMANCE ESTIMATION USING  CROSS VALIDATION  #####
## 
## ** PREDICTIVE TASK :: a1
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a2
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a3
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a4
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a5
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a6
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a7
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: randomForest.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************

# Get top 3 models for each algae

rankWorkflows(res.all, top=3)

## $a1
## $a1$nmse
##          Workflow  Estimate
## 1 randomForest.v1 0.5491748
## 2 randomForest.v3 0.5512905
## 3 randomForest.v2 0.5514432
## 
## 
## $a2
## $a2$nmse
##          Workflow  Estimate
## 1 randomForest.v2 0.7800659
## 2 randomForest.v1 0.7813307
## 3 randomForest.v3 0.7816703
## 
## 
## $a3
## $a3$nmse
##          Workflow  Estimate
## 1 randomForest.v3 0.9768068
## 2 randomForest.v2 0.9770564
## 3 randomForest.v1 0.9898435
## 
## 
## $a4
## $a4$nmse
##          Workflow  Estimate
## 1 randomForest.v3 0.9529889
## 2 randomForest.v2 0.9572795
## 3 randomForest.v1 0.9709013
## 
## 
## $a5
## $a5$nmse
##          Workflow  Estimate
## 1 randomForest.v3 0.7868151
## 2 randomForest.v2 0.7873769
## 3 randomForest.v1 0.7888624
## 
## 
## $a6
## $a6$nmse
##          Workflow  Estimate
## 1              lm 0.8664797
## 2 randomForest.v2 0.8983741
## 3 randomForest.v3 0.8986129
## 
## 
## $a7
## $a7$nmse
##          Workflow Estimate
## 1     rpartXse.v2  1.00000
## 2     rpartXse.v3  1.00000
## 3 randomForest.v3  1.01476

# pairedComparison() function shows whether the diﬀerence between the scores of these best models and the remaining alternatives is statistically significant. Baseline workflow is random forest. Use Bonferroni-Dunn test to compare other workflows to baseline.

p <- pairedComparisons(res.all,baseline="randomForest.v3")
p$nmse$F.test

## $chi
## [1] 23.44898
## 
## $FF
## [1] 7.584158
## 
## $critVal
## [1] 0.6524015
## 
## $rejNull
## [1] TRUE

p$nmse$BonferroniDunn.test

## $critDif
## [1] 3.046397
## 
## $baseline
## [1] "randomForest.v3"
## 
## $rkDifs
##              lm     rpartXse.v1     rpartXse.v2     rpartXse.v3 randomForest.v1 
##       2.8571429       4.4285714       2.8571429       2.5714286       1.0000000 
## randomForest.v2 
##       0.2857143 
## 
## $signifDifs
##              lm     rpartXse.v1     rpartXse.v2     rpartXse.v3 randomForest.v1 
##           FALSE            TRUE           FALSE           FALSE           FALSE 
## randomForest.v2 
##           FALSE

# produces a Bonferroni-Dunn CD diagram

CDdiagram.BD(p)

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
##   Please report the issue at
##   <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
##   Please report the issue at
##   <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Predictions for the Seven Algae

# obtain the best model for each of the algae by using the top performer for each of the algae. taskNames() - obtain vector with names of algae prediction tasks. topPerformer() - best workflow for each prediction task name.

wfs <- sapply(taskNames(res.all),
              function(t) topPerformer(res.all,metric="nmse",task=t))
wfs[["a1"]]

## Workflow Object:
##  Workflow ID       ::  randomForest.v1 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> ntree=200 
##       learner  -> randomForest 
##       pre  -> knnImp

wfs[["a7"]]

## Workflow Object:
##  Workflow ID       ::  rpartXse.v2 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> se=0.5 
##       learner  -> rpartXse

# obtain matrix with prediction of best workflows by creating array where 140 rows is test samples, 7 columns are algae, and 2 layers are true values (trues) and prediction values (preds)

full.test.algae <- cbind(test.algae, algae.sols)
pts <- array(dim = c(140,7,2),
             dimnames = list(1:140, paste0("a",1:7), c("trues","preds")))
for(i in 1:7) { #Loop to train model/predict test data
    res <- runWorkflow(wfs[[i]], #best workflow for each algae
                       as.formula(paste(names(wfs)[i],"~.")),
                       algae[,c(1:11,11+i)],
                       full.test.algae[,c(1:11,11+i)])
    pts[,i,"trues"] <- res$trues #trues and preds stored in pts
    pts[,i,"preds"] <- res$preds
}

# shows the prediction and true values for algae “a1” and “a3” on the first 3 test cases by successively applying the best workflows for each of the 7 algae using function runWorkflow()

pts[1:3,c("a1","a3"),]

## , , trues
## 
##    a1  a3
## 1 1.2 1.9
## 2 1.2 0.0
## 3 7.0 6.5
## 
## , , preds
## 
##          a1       a3
## 1  3.063175 4.656507
## 2 11.023367 3.482419
## 3 13.902800 6.822738

# calculate NMSE scores of models on the seven algae by taking the model error over the baseline error. scale() used to normalize data set.

avg.preds <- apply(algae[,12:18], 2, mean)
apply((pts[,,"trues"] - pts[,,"preds"])^2, 2 ,sum) /
    apply( (scale(pts[,,"trues"], avg.preds, FALSE))^2, 2, sum)

##        a1        a2        a3        a4        a5        a6        a7 
## 0.4705232 0.8644970 0.7812299 0.7141314 0.7122073 0.8340645 1.0000000

Predicting Algae Blooms

2026-04-11

Loading the Data into R

Removing the Observations with Unknown Values

Filling in the Unknowns with the Most Frequent Values

Filling in the Unknown Values by Exploring Correlations

Filling in the Unknown Values by Exploring Similarities between Cases

Multiple Linear Regression

NOTE: adjusted R-squared (proportion of variance explained by the model) is 32.15%

NOTE: fit has improved only slightly to ~32.7%

only about 30% confidence that they are different

NOTE: proportion explained by the model still low. Sign that linearity assumptions of model not adequate for the domain

Regression Trees

Model Evaluation and Selection

Predictions for the Seven Algae