Loading the Data into R
# Load libraries including the DMwR2
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(grid)
library(forcats)
library(corrplot)
## corrplot 0.95 loaded
# Load the algae dataset from the DMwR2 package and print the dataset
data(algae, package="DMwR2")
algae
## # A tibble: 200 × 18
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla a1
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 winter small medium 8 9.8 60.8 6.24 578 105 170 50 0
## 2 spring small medium 8.35 8 57.8 1.29 370 429. 559. 1.3 1.4
## 3 autumn small medium 8.1 11.4 40.0 5.33 347. 126. 187. 15.6 3.3
## 4 spring small medium 8.07 4.8 77.4 2.30 98.2 61.2 139. 1.4 3.1
## 5 autumn small medium 8.06 9 55.4 10.4 234. 58.2 97.6 10.5 9.2
## 6 winter small high 8.25 13.1 65.8 9.25 430 18.2 56.7 28.4 15.1
## 7 summer small high 8.15 10.3 73.2 1.54 110 61.2 112. 3.2 2.4
## 8 autumn small high 8.05 10.6 59.1 4.99 206. 44.7 77.4 6.9 18.2
## 9 winter small medium 8.7 3.4 22.0 0.886 103. 36.3 71 5.54 25.4
## 10 winter small high 7.93 9.9 8 1.39 5.8 27.2 46.6 0.8 17
## # ℹ 190 more rows
## # ℹ 6 more variables: a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## # a7 <dbl>
# load the data as a tibble (data frame)
tibble::as_tibble(algae)
## # A tibble: 200 × 18
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla a1
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 winter small medium 8 9.8 60.8 6.24 578 105 170 50 0
## 2 spring small medium 8.35 8 57.8 1.29 370 429. 559. 1.3 1.4
## 3 autumn small medium 8.1 11.4 40.0 5.33 347. 126. 187. 15.6 3.3
## 4 spring small medium 8.07 4.8 77.4 2.30 98.2 61.2 139. 1.4 3.1
## 5 autumn small medium 8.06 9 55.4 10.4 234. 58.2 97.6 10.5 9.2
## 6 winter small high 8.25 13.1 65.8 9.25 430 18.2 56.7 28.4 15.1
## 7 summer small high 8.15 10.3 73.2 1.54 110 61.2 112. 3.2 2.4
## 8 autumn small high 8.05 10.6 59.1 4.99 206. 44.7 77.4 6.9 18.2
## 9 winter small medium 8.7 3.4 22.0 0.886 103. 36.3 71 5.54 25.4
## 10 winter small high 7.93 9.9 8 1.39 5.8 27.2 46.6 0.8 17
## # ℹ 190 more rows
## # ℹ 6 more variables: a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## # a7 <dbl>
# Use summary to give the basic statistics of the dataset including counts, min, max, median, mean, quartile and NAs
summary(algae)
## season size speed mxPH mnO2
## autumn:40 large :45 high :84 Min. :5.600 Min. : 1.500
## spring:53 medium:84 low :33 1st Qu.:7.700 1st Qu.: 7.725
## summer:45 small :71 medium:83 Median :8.060 Median : 9.800
## winter:62 Mean :8.012 Mean : 9.118
## 3rd Qu.:8.400 3rd Qu.:10.800
## Max. :9.700 Max. :13.400
## NA's :1 NA's :2
## Cl NO3 NH4 oPO4
## Min. : 0.222 Min. : 0.050 Min. : 5.00 Min. : 1.00
## 1st Qu.: 10.981 1st Qu.: 1.296 1st Qu.: 38.33 1st Qu.: 15.70
## Median : 32.730 Median : 2.675 Median : 103.17 Median : 40.15
## Mean : 43.636 Mean : 3.282 Mean : 501.30 Mean : 73.59
## 3rd Qu.: 57.824 3rd Qu.: 4.446 3rd Qu.: 226.95 3rd Qu.: 99.33
## Max. :391.500 Max. :45.650 Max. :24064.00 Max. :564.60
## NA's :10 NA's :2 NA's :2 NA's :2
## PO4 Chla a1 a2
## Min. : 1.00 Min. : 0.200 Min. : 0.00 Min. : 0.000
## 1st Qu.: 41.38 1st Qu.: 2.000 1st Qu.: 1.50 1st Qu.: 0.000
## Median :103.29 Median : 5.475 Median : 6.95 Median : 3.000
## Mean :137.88 Mean : 13.971 Mean :16.92 Mean : 7.458
## 3rd Qu.:213.75 3rd Qu.: 18.308 3rd Qu.:24.80 3rd Qu.:11.375
## Max. :771.60 Max. :110.456 Max. :89.80 Max. :72.600
## NA's :2 NA's :12
## a3 a4 a5 a6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.550 Median : 0.000 Median : 1.900 Median : 0.000
## Mean : 4.309 Mean : 1.992 Mean : 5.064 Mean : 5.964
## 3rd Qu.: 4.925 3rd Qu.: 2.400 3rd Qu.: 7.500 3rd Qu.: 6.925
## Max. :42.800 Max. :44.600 Max. :44.400 Max. :77.600
##
## a7
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 2.495
## 3rd Qu.: 2.400
## Max. :31.600
##
# Produce a histogram of the maximum pH (mxPH) scaled to density
ggplot(algae,aes(x=mxPH)) + geom_histogram(aes(y=..density..))
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Produce a histogram with a smooth density curve and rug plot which shows the individual points & a normal QQ plot
ggplot(algae,aes(x=mxPH)) +
geom_histogram(aes(y=..density..)) +
geom_density(color="red") + geom_rug() +
ggtitle("The Histogram of mxPH (maximum pH)") +
xlab("") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")

## [1] 56 57
# Produce the enhanced histogram and QQ plot to be in the same frame
gh <- ggplot(algae,aes(x=mxPH)) + geom_histogram(aes(y=..density..)) + geom_density(color="red") + geom_rug() + ggtitle("The Histogram of mxPH (maximum pH)") + xlab("") + ylab("")
par(mfrow=c(1,2))
vpL <- viewport(height=unit(1, "npc"), width=unit(0.5, "npc"),
just="left",
y=0.5, x=0)
qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH')
## [1] 56 57
print(gh,vp=vpL)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).
qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")

## [1] 56 57
# Shows a boxplot of orthophosphate with rug plot and red dotted line mean
ggplot(algae,aes(x=factor(0),y=oPO4)) +
geom_boxplot() + geom_rug() +
geom_hline(aes(yintercept=mean(algae$oPO4, na.rm = TRUE)),linetype=2,colour="red") +
ylab("Orthophosphate (oPO4)") + xlab("") + scale_x_discrete(breaks=NULL)
## Warning: Use of `algae$oPO4` is discouraged.
## ℹ Use `oPO4` instead.
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Scatter plot of NH4 (ammonium) with mean, mean plus the standard deviation, and the median
# Helps identify the outlier plot point
plot(algae$NH4, xlab = "")
abline(h = mean(algae$NH4, na.rm = T), lty = 1)
abline(h = mean(algae$NH4, na.rm = T) + sd(algae$NH4, na.rm = T), lty = 2)
abline(h = median(algae$NH4, na.rm = T), lty = 3)
identify(algae$NH4)

## integer(0)
# clean scatter plot of NH4, this helps identify the outlier plot point
plot(algae$NH4, xlab = "")
clickedRows <- identify(algae$NH4)

algae[clickedRows, ]
## # A tibble: 0 × 18
## # ℹ 18 variables: season <fct>, size <fct>, speed <fct>, mxPH <dbl>,
## # mnO2 <dbl>, Cl <dbl>, NO3 <dbl>, NH4 <dbl>, oPO4 <dbl>, PO4 <dbl>,
## # Chla <dbl>, a1 <dbl>, a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## # a7 <dbl>
# using the filter function to find all the NH4 values over 19000 and identify the outlier
filter(algae, NH4 > 19000)
## # A tibble: 1 × 18
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla a1 a2
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 autumn medi… high 7.3 11.8 44.2 45.6 24064 44 34 53.1 2.2 0
## # ℹ 5 more variables: a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>, a7 <dbl>
# produce a boxplot that compares algae A1 by river size
ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() +
xlab("River Size") + ylab("Algal A1")

# reorder the factors in a more meaningful way
algae <- mutate(algae,
size=fct_relevel(size,c("small","medium","large")),
speed=fct_relevel(speed,c("low","medium","high")),
season=fct_relevel(season,c("spring","summer","autumn","winter")))
# produce boxplot of reordered factors
ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() +
xlab("River Size") + ylab("Algal A1")

# produce a violin plot of algal A1 with the individual points
ggplot(algae,aes(x=size,y=a1)) + geom_violin() + geom_jitter() + xlab("River Size") + ylab("Algal A1")

# produce a faceted grid of of Algal A3 with quartile cut minimum O2 levels (with NAs filtered out) according to season
data2graph <- filter(algae,!is.na(mnO2)) %>%
mutate(minO2=cut(mnO2,
quantile(mnO2,c(0,0.25,0.5,0.75,1)),
include.lowest=TRUE))
ggplot(data2graph,aes(x=a3,y=season,col=season)) + geom_point() + facet_wrap(~ minO2) + guides(color=FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Removing the Observations with Unknown Values
# identifies rows that are NOT complete across the dataframe
filter(algae, !complete.cases(algae) )
## # A tibble: 16 × 18
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla a1
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 autumn small high 6.8 11.1 9 0.63 20 4 NA 2.7 30.3
## 2 spring small high 8 NA 1.45 0.81 10 2.5 3 0.3 75.8
## 3 winter small low NA 12.6 9 0.23 10 5 6 1.1 35.5
## 4 winter small high 6.6 10.8 NA 3.24 10 1 6.5 NA 24.3
## 5 spring small medi… 5.6 11.8 NA 2.22 5 1 1 NA 82.7
## 6 autumn small medi… 5.7 10.8 NA 2.55 10 1 4 NA 16.8
## 7 spring small high 6.6 9.5 NA 1.32 20 1 6 NA 46.8
## 8 summer small high 6.6 10.8 NA 2.64 10 2 11 NA 46.9
## 9 autumn small medi… 6.6 11.3 NA 4.17 10 1 6 NA 47.1
## 10 spring small medi… 6.5 10.4 NA 5.97 10 2 14 NA 66.9
## 11 summer small medi… 6.4 NA NA NA NA NA 14 NA 19.4
## 12 autumn small high 7.83 11.7 4.08 1.33 18 3.33 6.67 NA 14.4
## 13 winter medium high 9.7 10.8 0.222 0.406 10 22.4 10.1 NA 41
## 14 spring large low 9 5.8 NA 0.9 142 102 186 68.0 1.7
## 15 winter large high 8 10.9 9.06 0.825 40 21.1 56.1 NA 16.8
## 16 winter large medi… 8 7.6 NA NA NA NA NA NA 0
## # ℹ 6 more variables: a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## # a7 <dbl>
# count the number of NAs in each row of dataframe
apply(algae, 1, function(x) sum(is.na(x)))
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## [38] 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 2 2 2 2 2 6 1 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0
# identify rows with more than 20% NAs
data(algae, package="DMwR2")
manyNAs(algae, 0.2)
## [1] 62 199
# remove the rows with more than 20% NA
algae <- algae[-manyNAs(algae), ]
Filling in the Unknowns with the Most Frequent Values
# use the mean of the max pH (mxPH) column to fill in unknown values in mxPH column
data(algae, package="DMwR2")
algae[48, "mxPH"] <- mean(algae$mxPH, na.rm = TRUE)
# use the median of the chlorophyll (Chla) column to fill in unknown values in the Chla column
data(algae, package="DMwR2")
algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = TRUE)
# fill in all unknown values using statistics of centrality (median for numerical columns and mode for nominal variables)
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
algae <- centralImputation(algae)
Filling in the Unknown Values by Exploring Correlations
# obtains variable correlation with cor() & complete.obs tells program to disregard the NAs
data(algae, package="DMwR2")
cor(algae[, 4:18], use = "complete.obs")
## mxPH mnO2 Cl NO3 NH4 oPO4
## mxPH 1.00000000 -0.10269374 0.14709539 -0.17213024 -0.15429757 0.090229085
## mnO2 -0.10269374 1.00000000 -0.26324536 0.11790769 -0.07826816 -0.393752688
## Cl 0.14709539 -0.26324536 1.00000000 0.21095831 0.06598336 0.379255958
## NO3 -0.17213024 0.11790769 0.21095831 1.00000000 0.72467766 0.133014517
## NH4 -0.15429757 -0.07826816 0.06598336 0.72467766 1.00000000 0.219311206
## oPO4 0.09022909 -0.39375269 0.37925596 0.13301452 0.21931121 1.000000000
## PO4 0.10132957 -0.46396073 0.44519118 0.15702971 0.19939575 0.911964602
## Chla 0.43182377 -0.13121671 0.14295776 0.14549290 0.09120406 0.106914784
## a1 -0.16262986 0.24998372 -0.35923946 -0.24723921 -0.12360578 -0.394574479
## a2 0.33501740 -0.06848199 0.07845402 0.01997079 -0.03790296 0.123811068
## a3 -0.02716034 -0.23522831 0.07653027 -0.09182236 -0.11290467 0.005704557
## a4 -0.18435348 -0.37982999 0.14147281 -0.01448875 0.27452000 0.382481433
## a5 -0.10731230 0.21001174 0.14534877 0.21213579 0.01544458 0.122027482
## a6 -0.17273795 0.18862656 0.16904394 0.54404455 0.40119275 0.003340366
## a7 -0.17027088 -0.10455106 -0.04494524 0.07505030 -0.02539279 0.026150420
## PO4 Chla a1 a2 a3 a4
## mxPH 0.10132957 0.43182377 -0.16262986 0.335017401 -0.027160336 -0.18435348
## mnO2 -0.46396073 -0.13121671 0.24998372 -0.068481989 -0.235228307 -0.37982999
## Cl 0.44519118 0.14295776 -0.35923946 0.078454019 0.076530269 0.14147281
## NO3 0.15702971 0.14549290 -0.24723921 0.019970786 -0.091822358 -0.01448875
## NH4 0.19939575 0.09120406 -0.12360578 -0.037902958 -0.112904666 0.27452000
## oPO4 0.91196460 0.10691478 -0.39457448 0.123811068 0.005704557 0.38248143
## PO4 1.00000000 0.24849223 -0.45816781 0.132667891 0.032193981 0.40883951
## Chla 0.24849223 1.00000000 -0.26601088 0.366724647 -0.063301128 -0.08600540
## a1 -0.45816781 -0.26601088 1.00000000 -0.262665485 -0.108177581 -0.09338072
## a2 0.13266789 0.36672465 -0.26266549 1.000000000 0.009759915 -0.17628704
## a3 0.03219398 -0.06330113 -0.10817758 0.009759915 1.000000000 0.03336910
## a4 0.40883951 -0.08600540 -0.09338072 -0.176287038 0.033369102 1.00000000
## a5 0.15548900 -0.07342837 -0.26972709 -0.186758940 -0.141610948 -0.10131827
## a6 0.05320294 0.01032550 -0.26156402 -0.133518480 -0.196900051 -0.08488426
## a7 0.07978353 0.01760782 -0.19306384 0.036206205 0.039060248 0.07114638
## a5 a6 a7
## mxPH -0.10731230 -0.172737947 -0.17027088
## mnO2 0.21001174 0.188626555 -0.10455106
## Cl 0.14534877 0.169043945 -0.04494524
## NO3 0.21213579 0.544044553 0.07505030
## NH4 0.01544458 0.401192749 -0.02539279
## oPO4 0.12202748 0.003340366 0.02615042
## PO4 0.15548900 0.053202942 0.07978353
## Chla -0.07342837 0.010325497 0.01760782
## a1 -0.26972709 -0.261564023 -0.19306384
## a2 -0.18675894 -0.133518480 0.03620621
## a3 -0.14161095 -0.196900051 0.03906025
## a4 -0.10131827 -0.084884259 0.07114638
## a5 1.00000000 0.388608955 -0.05149346
## a6 0.38860896 1.000000000 -0.03033428
## a7 -0.05149346 -0.030334277 1.00000000
#symnum() produces correlation matrix with symbolic representation where legend shows which symbol represents the strength of correlation (e.g. orthophosphate is about 90% correlated with phosphate)
symnum(cor(algae[,4:18],use="complete.obs"))
## mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1
## mnO2 1
## Cl 1
## NO3 1
## NH4 , 1
## oPO4 . . 1
## PO4 . . * 1
## Chla . 1
## a1 . . . 1
## a2 . . 1
## a3 1
## a4 . . . 1
## a5 1
## a6 . . . 1
## a7 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
# produces a colorful correlation matrix
cm <- cor(algae[,4:18],use="complete.obs")
corrplot(cm,type="upper",tl.pos="d",tl.cex=0.75)
corrplot(cm,add=TRUE, type="lower", method="number",tl.cex=0.75, diag=FALSE,tl.pos="n", cl.pos="n")

# obtain information for linear model Y = β0 + β1X1 +...+βnXn -> PO4 = 42.897 + 1.293 x oPO4
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
lm(PO4 ~ oPO4, data = algae)
##
## Call:
## lm(formula = PO4 ~ oPO4, data = algae)
##
## Coefficients:
## (Intercept) oPO4
## 42.897 1.293
# fill in the unknown values of PO4 column using our lm formula
algae[28, "PO4"] <- 42.897 + 1.293 * algae[28, "oPO4"]
# remove the manyNAs rows and use our function that fills in PO4 value given the oPO4 value
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
fillPO4 <- function(oP) ifelse(is.na(oP),NA,42.897 + 1.293 * oP)
algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4), "oPO4"], fillPO4)
# reorder data in meaningful way then produce histogram of mxPH for the four seasons
algae <- mutate(algae,
size=fct_relevel(size,c("small","medium","large")),
speed=fct_relevel(speed,c("low","medium","high")),
season=fct_relevel(season,c("spring","summer","autumn","winter")))
ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) + facet_wrap(~ season)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# produces histogram of mxPH by river size and speed
ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) +
facet_wrap(size ~ speed)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# produces multifaceted scatterplot that breaks up the facets according to speed and looks at mxPH with size on the y-axis. geom_jitter height=0.4 moves points 0.4 units to prevent overlapping points when points share same size value
ggplot(algae, aes(x=mxPH, y=size, color=size)) + geom_point() + facet_wrap(~speed) + geom_jitter(height = 0.4)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Filling in the Unknown Values by Exploring Similarities between
Cases
# prepare for KNN imputation by reloading the dataset and removing the manyNA rows
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
# fills in a missing value using the k-nearest neighbor (KNN) by using the 10 most similar rows based on the other variables
algae <- knnImputation(algae, k = 10)
# fills in a missing value using the k-nearest neighbor (KNN) by looking at the 10 most similar rows based on the other variables and fills in the missing value using the median of the neighbors
data(algae, package="DMwR2")
algae <- knnImputation(algae, k = 10, meth = "median")
Multiple Linear Regression
# prepare the data for linear regression by reloadng data, removing the manyNA rows. Create a clean.algae data frame by imputing data using knn so there are no NAs
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
clean.algae <- knnImputation(algae, k = 10)
# obtain linear regression model for predicting frequency of algal 1 where '.' means using all predictors to predict a1 (i.e. a1 as a function of all variables)
lm.a1 <- lm(a1 ~ ., data = clean.algae[, 1:12])
# obtains summary of linear model (shows coefficients, significance, etc.)
summary(lm.a1)
##
## Call:
## lm(formula = a1 ~ ., data = clean.algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.679 -11.893 -2.567 7.410 62.190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.942055 24.010879 1.788 0.07537 .
## seasonspring 3.726978 4.137741 0.901 0.36892
## seasonsummer 0.747597 4.020711 0.186 0.85270
## seasonwinter 3.692955 3.865391 0.955 0.34065
## sizemedium 3.263728 3.802051 0.858 0.39179
## sizesmall 9.682140 4.179971 2.316 0.02166 *
## speedlow 3.922084 4.706315 0.833 0.40573
## speedmedium 0.246764 3.241874 0.076 0.93941
## mxPH -3.589118 2.703528 -1.328 0.18598
## mnO2 1.052636 0.705018 1.493 0.13715
## Cl -0.040172 0.033661 -1.193 0.23426
## NO3 -1.511235 0.551339 -2.741 0.00674 **
## NH4 0.001634 0.001003 1.628 0.10516
## oPO4 -0.005435 0.039884 -0.136 0.89177
## PO4 -0.052241 0.030755 -1.699 0.09109 .
## Chla -0.088022 0.079998 -1.100 0.27265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 182 degrees of freedom
## Multiple R-squared: 0.3731, Adjusted R-squared: 0.3215
## F-statistic: 7.223 on 15 and 182 DF, p-value: 2.444e-12
NOTE: adjusted R-squared (proportion of variance explained by the
model) is 32.15%
# simplify the linear model by using anova() for model variance analysis
anova(lm.a1)
## Analysis of Variance Table
##
## Response: a1
## Df Sum Sq Mean Sq F value Pr(>F)
## season 3 85 28.2 0.0905 0.9651944
## size 2 11401 5700.7 18.3088 5.69e-08 ***
## speed 2 3934 1967.2 6.3179 0.0022244 **
## mxPH 1 1329 1328.8 4.2677 0.0402613 *
## mnO2 1 2287 2286.8 7.3444 0.0073705 **
## Cl 1 4304 4304.3 13.8239 0.0002671 ***
## NO3 1 3418 3418.5 10.9789 0.0011118 **
## NH4 1 404 403.6 1.2963 0.2563847
## oPO4 1 4788 4788.0 15.3774 0.0001246 ***
## PO4 1 1406 1405.6 4.5142 0.0349635 *
## Chla 1 377 377.0 1.2107 0.2726544
## Residuals 182 56668 311.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# remove season as it least contributes to reduction of fitting error of model (largest p-value in anova)
lm2.a1 <- update(lm.a1, . ~ . - season)
# obtain summary of the linear model with season removed
summary(lm2.a1)
##
## Call:
## lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 +
## oPO4 + PO4 + Chla, data = clean.algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.460 -11.953 -3.044 7.444 63.730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.9532874 23.2378377 1.934 0.05458 .
## sizemedium 3.3092102 3.7825221 0.875 0.38278
## sizesmall 10.2730961 4.1223163 2.492 0.01358 *
## speedlow 3.0546270 4.6108069 0.662 0.50848
## speedmedium -0.2976867 3.1818585 -0.094 0.92556
## mxPH -3.2684281 2.6576592 -1.230 0.22033
## mnO2 0.8011759 0.6589644 1.216 0.22561
## Cl -0.0381881 0.0333791 -1.144 0.25407
## NO3 -1.5334300 0.5476550 -2.800 0.00565 **
## NH4 0.0015777 0.0009951 1.586 0.11456
## oPO4 -0.0062392 0.0395086 -0.158 0.87469
## PO4 -0.0509543 0.0305189 -1.670 0.09669 .
## Chla -0.0841371 0.0794459 -1.059 0.29096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.57 on 185 degrees of freedom
## Multiple R-squared: 0.3682, Adjusted R-squared: 0.3272
## F-statistic: 8.984 on 12 and 185 DF, p-value: 1.762e-13
NOTE: fit has improved only slightly to ~32.7%
#compare models lm.a1 and lm2.a1 with anova function. Look at F-test to assess significance of differences
anova(lm.a1,lm2.a1)
## Analysis of Variance Table
##
## Model 1: a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
## PO4 + Chla
## Model 2: a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
## Chla
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 182 56668
## 2 185 57116 -3 -447.62 0.4792 0.6971
only about 30% confidence that they are different
# obtain linear model that results from applying backwards elimination method to lm.a1
final.lm <- step(lm.a1)
## Start: AIC=1152.03
## a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
## PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - season 3 447.62 57116 1147.6
## - speed 2 269.60 56938 1149.0
## - oPO4 1 5.78 56674 1150.0
## - Chla 1 376.96 57045 1151.3
## - Cl 1 443.46 57112 1151.6
## - mxPH 1 548.76 57217 1151.9
## <none> 56668 1152.0
## - mnO2 1 694.11 57363 1152.4
## - NH4 1 825.67 57494 1152.9
## - PO4 1 898.42 57567 1153.1
## - size 2 1857.16 58526 1154.4
## - NO3 1 2339.36 59008 1158.0
##
## Step: AIC=1147.59
## a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
## Chla
##
## Df Sum of Sq RSS AIC
## - speed 2 210.64 57327 1144.3
## - oPO4 1 7.70 57124 1145.6
## - Chla 1 346.27 57462 1146.8
## - Cl 1 404.10 57520 1147.0
## - mnO2 1 456.37 57572 1147.2
## - mxPH 1 466.95 57583 1147.2
## <none> 57116 1147.6
## - NH4 1 776.11 57892 1148.3
## - PO4 1 860.62 57977 1148.5
## - size 2 2175.59 59292 1151.0
## - NO3 1 2420.47 59537 1153.8
##
## Step: AIC=1144.31
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - oPO4 1 16.29 57343 1142.4
## - Chla 1 223.29 57550 1143.1
## - mnO2 1 413.77 57740 1143.7
## - Cl 1 472.70 57799 1143.9
## - mxPH 1 483.56 57810 1144.0
## <none> 57327 1144.3
## - NH4 1 720.19 58047 1144.8
## - PO4 1 809.30 58136 1145.1
## - size 2 2060.95 59388 1147.3
## - NO3 1 2379.75 59706 1150.4
##
## Step: AIC=1142.37
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - Chla 1 207.7 57551 1141.1
## - mnO2 1 402.6 57746 1141.8
## - Cl 1 470.7 57814 1142.0
## - mxPH 1 519.7 57863 1142.2
## <none> 57343 1142.4
## - NH4 1 704.4 58047 1142.8
## - size 2 2050.3 59393 1145.3
## - NO3 1 2370.4 59713 1148.4
## - PO4 1 5818.4 63161 1159.5
##
## Step: AIC=1141.09
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4
##
## Df Sum of Sq RSS AIC
## - mnO2 1 435.3 57986 1140.6
## - Cl 1 438.1 57989 1140.6
## <none> 57551 1141.1
## - NH4 1 746.9 58298 1141.6
## - mxPH 1 833.1 58384 1141.9
## - size 2 2217.5 59768 1144.6
## - NO3 1 2667.1 60218 1148.1
## - PO4 1 6309.7 63860 1159.7
##
## Step: AIC=1140.58
## a1 ~ size + mxPH + Cl + NO3 + NH4 + PO4
##
## Df Sum of Sq RSS AIC
## - NH4 1 531.0 58517 1140.4
## - Cl 1 584.9 58571 1140.6
## <none> 57986 1140.6
## - mxPH 1 819.1 58805 1141.4
## - size 2 2478.2 60464 1144.9
## - NO3 1 2251.4 60237 1146.1
## - PO4 1 9097.9 67084 1167.4
##
## Step: AIC=1140.38
## a1 ~ size + mxPH + Cl + NO3 + PO4
##
## Df Sum of Sq RSS AIC
## <none> 58517 1140.4
## - mxPH 1 784.1 59301 1141.0
## - Cl 1 835.6 59353 1141.2
## - NO3 1 1987.9 60505 1145.0
## - size 2 2664.3 61181 1145.2
## - PO4 1 8575.8 67093 1165.5
# obtain summary of the final model
summary(final.lm)
##
## Call:
## lm(formula = a1 ~ size + mxPH + Cl + NO3 + PO4, data = clean.algae[,
## 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.874 -12.732 -3.741 8.424 62.926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.28555 20.96132 2.733 0.00687 **
## sizemedium 2.80050 3.40190 0.823 0.41141
## sizesmall 10.40636 3.82243 2.722 0.00708 **
## mxPH -3.97076 2.48204 -1.600 0.11130
## Cl -0.05227 0.03165 -1.651 0.10028
## NO3 -0.89529 0.35148 -2.547 0.01165 *
## PO4 -0.05911 0.01117 -5.291 3.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.5 on 191 degrees of freedom
## Multiple R-squared: 0.3527, Adjusted R-squared: 0.3324
## F-statistic: 17.35 on 6 and 191 DF, p-value: 5.554e-16
NOTE: proportion explained by the model still low. Sign that
linearity assumptions of model not adequate for the domain
Regression Trees
#rpart library used for regression trees. Prepare the data by reloadng data, removing the manyNA rows. Do not need imputation as model can handle missing values. rpart() function builds decision tree model
library(rpart)
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])
# shows content of rt.a1 object. Note that not all variables appear as the model selects the more relevant variables
rt.a1
## n= 198
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 198 90401.290 16.996460
## 2) PO4>=43.818 147 31279.120 8.979592
## 4) Cl>=7.8065 140 21622.830 7.492857
## 8) oPO4>=51.118 84 3441.149 3.846429 *
## 9) oPO4< 51.118 56 15389.430 12.962500
## 18) mnO2>=10.05 24 1248.673 6.716667 *
## 19) mnO2< 10.05 32 12502.320 17.646870
## 38) NO3>=3.1875 9 257.080 7.866667 *
## 39) NO3< 3.1875 23 11047.500 21.473910
## 78) mnO2< 8 13 2919.549 13.807690 *
## 79) mnO2>=8 10 6370.704 31.440000 *
## 5) Cl< 7.8065 7 3157.769 38.714290 *
## 3) PO4< 43.818 51 22442.760 40.103920
## 6) mxPH< 7.87 28 11452.770 33.450000
## 12) mxPH>=7.045 18 5146.169 26.394440 *
## 13) mxPH< 7.045 10 3797.645 46.150000 *
## 7) mxPH>=7.87 23 8241.110 48.204350
## 14) PO4>=15.177 12 3047.517 38.183330 *
## 15) PO4< 15.177 11 2673.945 59.136360 *
# use use prp() function from rpart.plot to visualize the decision tree
library(rpart.plot)
prp(rt.a1,extra=101,box.col="orange",split.box.col="grey")

# prints the complexity parameter table which shows how tree's accuracy changes as it's simplified to pick best tree size and prevent overfitting
printcp(rt.a1)
##
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
##
## Variables actually used in tree construction:
## [1] Cl mnO2 mxPH NO3 oPO4 PO4
##
## Root node error: 90401/198 = 456.57
##
## n= 198
##
## CP nsplit rel error xerror xstd
## 1 0.405740 0 1.00000 1.02289 0.13176
## 2 0.071885 1 0.59426 0.73783 0.11484
## 3 0.030887 2 0.52237 0.70321 0.11273
## 4 0.030408 3 0.49149 0.70910 0.11532
## 5 0.027872 4 0.46108 0.70175 0.11520
## 6 0.027754 5 0.43321 0.68693 0.11569
## 7 0.018124 6 0.40545 0.65404 0.10605
## 8 0.016344 7 0.38733 0.68903 0.10884
## 9 0.010000 9 0.35464 0.70651 0.11143
# can use 1-SE rule: cross-validation error estimates (xerror) + standard deviation (xstd). e.g. Smallest tree with error less than 0.59452+0.10083=0.69535 is 2. If we want this tree over the one suggested by program, can prune rt.a1 to cp = 0.08
rt2.a1 <- prune(rt.a1, cp = 0.08)
rt2.a1
## n= 198
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 198 90401.29 16.996460
## 2) PO4>=43.818 147 31279.12 8.979592 *
## 3) PO4< 43.818 51 22442.76 40.103920 *
# rpartXse automates the previous process
(rt.a1 <- rpartXse(a1 ~ ., data = algae[, 1:12]))
## n= 198
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 198 90401.29 16.996460
## 2) PO4>=43.818 147 31279.12 8.979592 *
## 3) PO4< 43.818 51 22442.76 40.103920 *
# interactive manual pruning tree, indicates number of nodes at which you want to prune tree, prune tree with snip.rpart() function
first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
snip.rpart(first.tree, c(4, 7))
## n= 198
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 198 90401.290 16.996460
## 2) PO4>=43.818 147 31279.120 8.979592
## 4) Cl>=7.8065 140 21622.830 7.492857 *
## 5) Cl< 7.8065 7 3157.769 38.714290 *
## 3) PO4< 43.818 51 22442.760 40.103920
## 6) mxPH< 7.87 28 11452.770 33.450000
## 12) mxPH>=7.045 18 5146.169 26.394440 *
## 13) mxPH< 7.045 10 3797.645 46.150000 *
## 7) mxPH>=7.87 23 8241.110 48.204350 *
# produces way to prune tree graphically
plot(first.tree)
text(first.tree)
snip.rpart(first.tree)

## n= 198
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 198 90401.290 16.996460
## 2) PO4>=43.818 147 31279.120 8.979592
## 4) Cl>=7.8065 140 21622.830 7.492857
## 8) oPO4>=51.118 84 3441.149 3.846429 *
## 9) oPO4< 51.118 56 15389.430 12.962500
## 18) mnO2>=10.05 24 1248.673 6.716667 *
## 19) mnO2< 10.05 32 12502.320 17.646870
## 38) NO3>=3.1875 9 257.080 7.866667 *
## 39) NO3< 3.1875 23 11047.500 21.473910
## 78) mnO2< 8 13 2919.549 13.807690 *
## 79) mnO2>=8 10 6370.704 31.440000 *
## 5) Cl< 7.8065 7 3157.769 38.714290 *
## 3) PO4< 43.818 51 22442.760 40.103920
## 6) mxPH< 7.87 28 11452.770 33.450000
## 12) mxPH>=7.045 18 5146.169 26.394440 *
## 13) mxPH< 7.045 10 3797.645 46.150000 *
## 7) mxPH>=7.87 23 8241.110 48.204350
## 14) PO4>=15.177 12 3047.517 38.183330 *
## 15) PO4< 15.177 11 2673.945 59.136360 *
Model Evaluation and Selection
# generate predicted values using predict() function
lm.predictions.a1 <- predict(final.lm, clean.algae)
rt.predictions.a1 <- predict(rt.a1, algae)
# calculate the mean absolute error for linear model and regression tree model
(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[["a1"]])))
## [1] 13.10681
(mae.a1.rt <- mean(abs(rt.predictions.a1 - algae[["a1"]])))
## [1] 11.61717
# calculate the mean squared error for both models
(mse.a1.lm <- mean((lm.predictions.a1 - algae[["a1"]])^2))
## [1] 295.5407
(mse.a1.rt <- mean((rt.predictions.a1 - algae[["a1"]])^2))
## [1] 271.3226
# calculate the normalized mean squared error for both models
(nmse.a1.lm <- mean((lm.predictions.a1-algae[['a1']])^2)/
mean((mean(algae[['a1']])-algae[['a1']])^2))
## [1] 0.6473034
(nmse.a1.rt <- mean((rt.predictions.a1-algae[['a1']])^2)/
mean((mean(algae[['a1']])-algae[['a1']])^2))
## [1] 0.5942601
# visualize the predictions with errors scatterplot
dg <- data.frame(lm.a1=lm.predictions.a1,
rt.a1=rt.predictions.a1,
true.a1=algae[["a1"]])
ggplot(dg,aes(x=lm.a1,y=true.a1)) +
geom_point() + geom_abline(slope=1,intercept=0,color="red") +
ggtitle("Linear Model")

ggplot(dg,aes(x=rt.a1,y=true.a1)) +
geom_point() + geom_abline(slope=1,intercept=0,color="red") +
ggtitle("Regression Tree")

# use identify to check sample number where bad prediction is made (poor performance)
plot(lm.predictions.a1,algae[['a1']],main="Linear Model",
xlab="Predictions",ylab="True Values")
abline(0,1,col="red")
algae[identify(lm.predictions.a1,algae[['a1']]),]

## # A tibble: 0 × 18
## # ℹ 18 variables: season <fct>, size <fct>, speed <fct>, mxPH <dbl>,
## # mnO2 <dbl>, Cl <dbl>, NO3 <dbl>, NH4 <dbl>, oPO4 <dbl>, PO4 <dbl>,
## # Chla <dbl>, a1 <dbl>, a2 <dbl>, a3 <dbl>, a4 <dbl>, a5 <dbl>, a6 <dbl>,
## # a7 <dbl>
# use knowledge that algae frequency can't be less than zero to make minimum value 0 and thus improve linear model
sensible.lm.predictions.a1 <- ifelse(lm.predictions.a1 < 0, 0, lm.predictions.a1)
(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[["a1"]])))
## [1] 13.10681
(smae.a1.lm <- mean(abs(sensible.lm.predictions.a1 - algae[["a1"]])))
## [1] 12.48276
# carry out performance estimation experiment using cross validation. This is used to compare the linear model with three different pruning levels of regression tree on dataset. Uses 5 repetitions of 10-fold cross validation process. Using normalized mean squared error for evaluation here.
library(performanceEstimation)
res <- performanceEstimation(
PredTask(a1 ~ ., algae[, 1:12], "a1"),
c(Workflow(learner="lm",pre="knnImp",post="onlyPos"),
workflowVariants(learner="rpartXse",learner.pars=list(se=c(0,0.5,1)))),
EstimationTask(metrics="nmse",method=CV(nReps=5,nFolds=10))
)
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
# obtain summary of the results of comparison
summary(res)
##
## == Summary of a Cross Validation Performance Estimation Experiment ==
##
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
##
## * Predictive Tasks :: a1
## * Workflows :: lm, rpartXse.v1, rpartXse.v2, rpartXse.v3
##
## -> Task: a1
## *Workflow: lm
## nmse
## avg 0.6899530
## std 0.1538314
## med 0.6693373
## iqr 0.1831528
## min 0.4062558
## max 1.0801633
## invalid 0.0000000
##
## *Workflow: rpartXse.v1
## nmse
## avg 0.5972440
## std 0.2537998
## med 0.5202799
## iqr 0.3615993
## min 0.2906485
## max 1.4125642
## invalid 0.0000000
##
## *Workflow: rpartXse.v2
## nmse
## avg 0.6303191
## std 0.2793589
## med 0.5843170
## iqr 0.3081521
## min 0.2712879
## max 1.2908643
## invalid 0.0000000
##
## *Workflow: rpartXse.v3
## nmse
## avg 0.6708279
## std 0.2697628
## med 0.6256652
## iqr 0.3189446
## min 0.2712879
## max 1.2908643
## invalid 0.0000000
# box plot visualization of results
plot(res)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
## Please report the issue at
## <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# get specific parameter settings corresponding to labels
getWorkflow("rpartXse.v1", res)
## Workflow Object:
## Workflow ID :: rpartXse.v1
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> se=0
## learner -> rpartXse
# comparison experiment for all seven algae
DSs <- sapply(names(algae)[12:18],
function(x,names.attrs) {
f <- as.formula(paste(x, "~ ."))
PredTask(f, algae[,c(names.attrs,x)], x, copy=TRUE)
},
names(algae)[1:11])
res.all <- performanceEstimation(
DSs,
c(Workflow(learner="lm", pre="knnImp", post="onlyPos"),
workflowVariants(learner="rpartXse", learner.pars=list(se=c(0,0.5,1)))),
EstimationTask(metrics="nmse" ,method=CV(nReps=5, nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a2
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a3
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a4
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a5
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a6
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a7
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
# plot of cross-validation results for all algae
plot(res.all)

# produces best model for each problem
topPerformers(res.all)
## $a1
## Workflow Estimate
## nmse rpartXse.v1 0.597
##
## $a2
## Workflow Estimate
## nmse lm 0.973
##
## $a3
## Workflow Estimate
## nmse rpartXse.v3 1
##
## $a4
## Workflow Estimate
## nmse rpartXse.v2 1
##
## $a5
## Workflow Estimate
## nmse lm 0.906
##
## $a6
## Workflow Estimate
## nmse lm 0.866
##
## $a7
## Workflow Estimate
## nmse rpartXse.v2 1
# use library randomForest (an ensemble approach) and repeat the cross-validation experiment using random forest model, linear model, and regression tree model
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
res.all <- performanceEstimation(
DSs,
c(Workflow(learner="lm", pre="knnImp",post="onlyPos"),
workflowVariants(learner="rpartXse",
learner.pars=list(se=c(0,0.5,1))),
workflowVariants(learner="randomForest", pre="knnImp",
learner.pars=list(ntree=c(200,500,700)))),
EstimationTask(metrics="nmse",method=CV(nReps=5,nFolds=10)))
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a2
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a3
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a4
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a5
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a6
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a7
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: randomForest.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
# Get top 3 models for each algae
rankWorkflows(res.all, top=3)
## $a1
## $a1$nmse
## Workflow Estimate
## 1 randomForest.v1 0.5491748
## 2 randomForest.v3 0.5512905
## 3 randomForest.v2 0.5514432
##
##
## $a2
## $a2$nmse
## Workflow Estimate
## 1 randomForest.v2 0.7800659
## 2 randomForest.v1 0.7813307
## 3 randomForest.v3 0.7816703
##
##
## $a3
## $a3$nmse
## Workflow Estimate
## 1 randomForest.v3 0.9768068
## 2 randomForest.v2 0.9770564
## 3 randomForest.v1 0.9898435
##
##
## $a4
## $a4$nmse
## Workflow Estimate
## 1 randomForest.v3 0.9529889
## 2 randomForest.v2 0.9572795
## 3 randomForest.v1 0.9709013
##
##
## $a5
## $a5$nmse
## Workflow Estimate
## 1 randomForest.v3 0.7868151
## 2 randomForest.v2 0.7873769
## 3 randomForest.v1 0.7888624
##
##
## $a6
## $a6$nmse
## Workflow Estimate
## 1 lm 0.8664797
## 2 randomForest.v2 0.8983741
## 3 randomForest.v3 0.8986129
##
##
## $a7
## $a7$nmse
## Workflow Estimate
## 1 rpartXse.v2 1.00000
## 2 rpartXse.v3 1.00000
## 3 randomForest.v3 1.01476
# pairedComparison() function shows whether the difference between the scores of these best models and the remaining alternatives is statistically significant. Baseline workflow is random forest. Use Bonferroni-Dunn test to compare other workflows to baseline.
p <- pairedComparisons(res.all,baseline="randomForest.v3")
p$nmse$F.test
## $chi
## [1] 23.44898
##
## $FF
## [1] 7.584158
##
## $critVal
## [1] 0.6524015
##
## $rejNull
## [1] TRUE
p$nmse$BonferroniDunn.test
## $critDif
## [1] 3.046397
##
## $baseline
## [1] "randomForest.v3"
##
## $rkDifs
## lm rpartXse.v1 rpartXse.v2 rpartXse.v3 randomForest.v1
## 2.8571429 4.4285714 2.8571429 2.5714286 1.0000000
## randomForest.v2
## 0.2857143
##
## $signifDifs
## lm rpartXse.v1 rpartXse.v2 rpartXse.v3 randomForest.v1
## FALSE TRUE FALSE FALSE FALSE
## randomForest.v2
## FALSE
# produces a Bonferroni-Dunn CD diagram
CDdiagram.BD(p)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
## Please report the issue at
## <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the performanceEstimation package.
## Please report the issue at
## <https://github.com/ltorgo/performanceEstimation/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Predictions for the Seven Algae
# obtain the best model for each of the algae by using the top performer for each of the algae. taskNames() - obtain vector with names of algae prediction tasks. topPerformer() - best workflow for each prediction task name.
wfs <- sapply(taskNames(res.all),
function(t) topPerformer(res.all,metric="nmse",task=t))
wfs[["a1"]]
## Workflow Object:
## Workflow ID :: randomForest.v1
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> ntree=200
## learner -> randomForest
## pre -> knnImp
wfs[["a7"]]
## Workflow Object:
## Workflow ID :: rpartXse.v2
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> se=0.5
## learner -> rpartXse
# obtain matrix with prediction of best workflows by creating array where 140 rows is test samples, 7 columns are algae, and 2 layers are true values (trues) and prediction values (preds)
full.test.algae <- cbind(test.algae, algae.sols)
pts <- array(dim = c(140,7,2),
dimnames = list(1:140, paste0("a",1:7), c("trues","preds")))
for(i in 1:7) { #Loop to train model/predict test data
res <- runWorkflow(wfs[[i]], #best workflow for each algae
as.formula(paste(names(wfs)[i],"~.")),
algae[,c(1:11,11+i)],
full.test.algae[,c(1:11,11+i)])
pts[,i,"trues"] <- res$trues #trues and preds stored in pts
pts[,i,"preds"] <- res$preds
}
# shows the prediction and true values for algae “a1” and “a3” on the first 3 test cases by successively applying the best workflows for each of the 7 algae using function runWorkflow()
pts[1:3,c("a1","a3"),]
## , , trues
##
## a1 a3
## 1 1.2 1.9
## 2 1.2 0.0
## 3 7.0 6.5
##
## , , preds
##
## a1 a3
## 1 3.063175 4.656507
## 2 11.023367 3.482419
## 3 13.902800 6.822738
# calculate NMSE scores of models on the seven algae by taking the model error over the baseline error. scale() used to normalize data set.
avg.preds <- apply(algae[,12:18], 2, mean)
apply((pts[,,"trues"] - pts[,,"preds"])^2, 2 ,sum) /
apply( (scale(pts[,,"trues"], avg.preds, FALSE))^2, 2, sum)
## a1 a2 a3 a4 a5 a6 a7
## 0.4705232 0.8644970 0.7812299 0.7141314 0.7122073 0.8340645 1.0000000