#install.packages('DMwR2')
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Load the `algae` data set into the workspace and show its first six rows.
data("algae")
head(algae, n = 6L)
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Descriptive statistics for every column except season, size and speed
# (the three columns that are treated as factors further down).
algae %>%
  select(-c(season, size, speed)) %>%
  summary()
## mxPH mnO2 Cl NO3
## Min. :5.600 Min. : 1.500 Min. : 0.222 Min. : 0.050
## 1st Qu.:7.700 1st Qu.: 7.725 1st Qu.: 10.981 1st Qu.: 1.296
## Median :8.060 Median : 9.800 Median : 32.730 Median : 2.675
## Mean :8.012 Mean : 9.118 Mean : 43.636 Mean : 3.282
## 3rd Qu.:8.400 3rd Qu.:10.800 3rd Qu.: 57.824 3rd Qu.: 4.446
## Max. :9.700 Max. :13.400 Max. :391.500 Max. :45.650
## NA's :1 NA's :2 NA's :10 NA's :2
## NH4 oPO4 PO4 Chla
## Min. : 5.00 Min. : 1.00 Min. : 1.00 Min. : 0.200
## 1st Qu.: 38.33 1st Qu.: 15.70 1st Qu.: 41.38 1st Qu.: 2.000
## Median : 103.17 Median : 40.15 Median :103.29 Median : 5.475
## Mean : 501.30 Mean : 73.59 Mean :137.88 Mean : 13.971
## 3rd Qu.: 226.95 3rd Qu.: 99.33 3rd Qu.:213.75 3rd Qu.: 18.308
## Max. :24064.00 Max. :564.60 Max. :771.60 Max. :110.456
## NA's :2 NA's :2 NA's :2 NA's :12
## a1 a2 a3 a4
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.50 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 6.95 Median : 3.000 Median : 1.550 Median : 0.000
## Mean :16.92 Mean : 7.458 Mean : 4.309 Mean : 1.992
## 3rd Qu.:24.80 3rd Qu.:11.375 3rd Qu.: 4.925 3rd Qu.: 2.400
## Max. :89.80 Max. :72.600 Max. :42.800 Max. :44.600
##
## a5 a6 a7
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.900 Median : 0.000 Median : 1.000
## Mean : 5.064 Mean : 5.964 Mean : 2.495
## 3rd Qu.: 7.500 3rd Qu.: 6.925 3rd Qu.: 2.400
## Max. :44.400 Max. :77.600 Max. :31.600
##
library('ggplot2')
# Histogram of mxPH on the density scale, overlaid with a density curve (red)
# and a rug marking the individual observations. Axis labels are left blank.
# `after_stat(density)` replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0 and raised a lifecycle warning.
ggplot(algae, aes(x = mxPH)) +
  geom_histogram(aes(y = after_stat(density))) +
  geom_density(color = "red") +
  geom_rug() +
  ggtitle("The Histogram of mxPH (maximum pH)") +
  xlab("") +
  ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 1 rows containing non-finite values (`stat_density()`).
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
# Normal Q-Q plot of mxPH (car::qqPlot); the y-axis label is left empty.
qqPlot(algae[["mxPH"]], ylab = "", main = "Normal QQ plot of maximum pH")
## [1] 56 57
# Distribution of a1 within each river size: violins plus the jittered
# individual observations.
ggplot(algae, aes(x = size, y = a1)) +
  geom_violin() +
  geom_jitter() +
  labs(x = "River Size", y = "Algal A1")

# mxPH against river size, one panel per value of speed. Both the raw points
# and a vertically jittered copy (height = 0.4) are drawn.
ggplot(algae, aes(x = mxPH, y = size, color = size)) +
  geom_point() +
  facet_wrap(~speed) +
  geom_jitter(height = 0.4)
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).
library(ggplot2)
library(forcats)
# Reorder the levels of the three factor columns so that they follow their
# natural order (small < medium < large, low < medium < high, spring..winter).
algae <- algae %>%
  mutate(
    size = fct_relevel(size, "small", "medium", "large"),
    speed = fct_relevel(speed, "low", "medium", "high"),
    season = fct_relevel(season, "spring", "summer", "autumn", "winter")
  )
# Symbolic view of the correlation matrix of columns 4 to 18, computed on the
# rows that have no missing value in any of those columns.
algae[, 4:18] %>%
  cor(use = "complete.obs") %>%
  symnum()
## mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1
## mnO2 1
## Cl 1
## NO3 1
## NH4 , 1
## oPO4 . . 1
## PO4 . . * 1
## Chla . 1
## a1 . . . 1
## a2 . . 1
## a3 1
## a4 . . . 1
## a5 1
## a6 . . . 1
## a7 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
# Fill in the unknown values of each water sample using its ten most similar
# cases (k = 10 nearest neighbours).
algae <- knnImputation(algae, k = 10)
# Alternatives kept for reference: median-based kNN imputation, or simply
# dropping the incomplete rows.
#algae <- knnImputation(algae, k = 10, meth = "median")
#algae <- na.omit(algae)
# Number of unknown values left in each column.
algae %>%
  is.na() %>%
  colSums()
## season size speed mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla
## 0 0 0 0 0 0 0 0 0 0 0
## a1 a2 a3 a4 a5 a6 a7
## 0 0 0 0 0 0 0
# Keep the rows with a known mnO2 and bin mnO2 into four intervals whose break
# points are its quartiles (include.lowest = TRUE keeps the minimum value in
# the first interval).
data2graph <- algae %>%
  filter(!is.na(mnO2)) %>%
  mutate(
    minO2 = cut(
      mnO2,
      quantile(mnO2, c(0, 0.25, 0.5, 0.75, 1)),
      include.lowest = TRUE
    )
  )

# a3 against season, one panel per mnO2 interval. The colour legend is hidden
# with guides(color = "none"); the former `FALSE` value has been deprecated
# since ggplot2 3.3.4 and raised a lifecycle warning.
ggplot(data2graph, aes(x = a3, y = season, color = season)) +
  geom_point() +
  facet_wrap(~ minO2) +
  guides(color = "none")
library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of columns 4 to 18, drawn in two passes on the same
# device: the upper triangle first (variable labels requested on the
# diagonal via tl.pos = "d"), then the lower triangle added as numbers with no
# diagonal, no text labels and no colour legend.
cm <- algae[, 4:18] %>% cor(use = "complete.obs")
corrplot(cm, type = "upper", tl.pos = "d")
corrplot(
  cm,
  method = "number",
  type = "lower",
  add = TRUE,
  diag = FALSE,
  tl.pos = "n",
  cl.pos = "n"
)
# Linear regression of a1 on every other column of algae[, 1:12]
# (the season/size/speed factors plus the numeric predictors), followed by
# the coefficient table and goodness-of-fit summary.
lm.a1 <- lm(a1 ~ ., data = algae[, 1:12])
summary(lm.a1)
##
## Call:
## lm(formula = a1 ~ ., data = algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.289 -11.984 -2.587 7.158 62.485
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.130043 21.494984 2.611 0.00976 **
## seasonsummer -3.360656 3.715667 -0.904 0.36694
## seasonautumn -3.635550 4.124706 -0.881 0.37925
## seasonwinter -0.201292 3.366027 -0.060 0.95238
## sizemedium -6.382976 3.437924 -1.857 0.06496 .
## sizelarge -9.774352 4.121995 -2.371 0.01876 *
## speedmedium -3.932265 4.079343 -0.964 0.33634
## speedhigh -3.735161 4.669264 -0.800 0.42477
## mxPH -3.064221 2.649865 -1.156 0.24903
## mnO2 1.041171 0.702533 1.482 0.14004
## Cl -0.041356 0.033538 -1.233 0.21911
## NO3 -1.487189 0.549263 -2.708 0.00742 **
## NH4 0.001623 0.001000 1.623 0.10627
## oPO4 -0.006952 0.039738 -0.175 0.86132
## PO4 -0.050319 0.030608 -1.644 0.10188
## Chla -0.095163 0.079524 -1.197 0.23298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.6 on 184 degrees of freedom
## Multiple R-squared: 0.3718, Adjusted R-squared: 0.3205
## F-statistic: 7.259 on 15 and 184 DF, p-value: 1.962e-12
# Analysis-of-variance table of the full linear model.
anova(lm.a1)
# season is the variable that contributes least to the reduction of the
# fitting error of the model, so the model is refitted without it.
# (This note used to be a bare line of prose, which is a syntax error in R.)
lm2.a1 <- update(lm.a1, . ~ . - season)
summary(lm2.a1)
##
## Call:
## lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 +
## oPO4 + PO4 + Chla, data = algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.177 -11.627 -3.127 7.473 64.140
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.382e+01 2.123e+01 2.535 0.01205 *
## sizemedium -6.959e+00 3.382e+00 -2.058 0.04102 *
## sizelarge -1.034e+01 4.073e+00 -2.539 0.01192 *
## speedmedium -3.585e+00 4.050e+00 -0.885 0.37720
## speedhigh -2.857e+00 4.575e+00 -0.624 0.53316
## mxPH -2.721e+00 2.604e+00 -1.045 0.29738
## mnO2 7.899e-01 6.566e-01 1.203 0.23047
## Cl -3.897e-02 3.327e-02 -1.171 0.24301
## NO3 -1.508e+00 5.457e-01 -2.764 0.00629 **
## NH4 1.573e-03 9.922e-04 1.586 0.11453
## oPO4 -8.039e-03 3.936e-02 -0.204 0.83838
## PO4 -4.898e-02 3.038e-02 -1.613 0.10853
## Chla -9.085e-02 7.903e-02 -1.150 0.25179
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.53 on 187 degrees of freedom
## Multiple R-squared: 0.3666, Adjusted R-squared: 0.326
## F-statistic: 9.02 on 12 and 187 DF, p-value: 1.447e-13
# Compare the full model against the one fitted without season.
anova(lm.a1,lm2.a1)
# Stepwise model selection starting from the full model; the trace printed
# below shows one term being dropped per step according to AIC.
final.lm <- step(lm.a1)
## Start: AIC=1162.42
## a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
## PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - season 3 464.97 57444 1158.0
## - speed 2 292.88 57272 1159.5
## - oPO4 1 9.48 56988 1160.5
## - mxPH 1 414.08 57393 1161.9
## - Chla 1 443.44 57422 1162.0
## - Cl 1 470.87 57450 1162.1
## <none> 56979 1162.4
## - mnO2 1 680.15 57659 1162.8
## - NH4 1 815.83 57795 1163.3
## - PO4 1 836.97 57816 1163.3
## - size 2 1907.16 58886 1165.0
## - NO3 1 2270.22 59249 1168.2
##
## Step: AIC=1158.05
## a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
## Chla
##
## Df Sum of Sq RSS AIC
## - speed 2 241.35 57685 1154.9
## - oPO4 1 12.82 57457 1156.1
## - mxPH 1 335.44 57779 1157.2
## - Chla 1 405.94 57850 1157.5
## - Cl 1 421.38 57865 1157.5
## - mnO2 1 444.62 57888 1157.6
## <none> 57444 1158.0
## - NH4 1 772.24 58216 1158.7
## - PO4 1 798.75 58243 1158.8
## - size 2 2221.90 59666 1161.6
## - NO3 1 2346.19 59790 1164.1
##
## Step: AIC=1154.89
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - oPO4 1 22.84 57708 1153.0
## - Chla 1 270.76 57956 1153.8
## - mxPH 1 339.76 58025 1154.1
## - mnO2 1 417.35 58102 1154.3
## - Cl 1 517.03 58202 1154.7
## <none> 57685 1154.9
## - NH4 1 716.31 58401 1155.4
## - PO4 1 759.91 58445 1155.5
## - size 2 2140.20 59825 1158.2
## - NO3 1 2306.11 59991 1160.7
##
## Step: AIC=1152.97
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - Chla 1 248.2 57956 1151.8
## - mxPH 1 372.0 58080 1152.2
## - mnO2 1 403.5 58111 1152.4
## - Cl 1 514.6 58223 1152.7
## <none> 57708 1153.0
## - NH4 1 695.7 58404 1153.4
## - size 2 2118.8 59827 1156.2
## - NO3 1 2294.6 60003 1158.8
## - PO4 1 5730.8 63439 1169.9
##
## Step: AIC=1151.83
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4
##
## Df Sum of Sq RSS AIC
## - mnO2 1 438.2 58394 1151.3
## - Cl 1 476.7 58433 1151.5
## <none> 57956 1151.8
## - mxPH 1 651.8 58608 1152.1
## - NH4 1 741.2 58697 1152.4
## - size 2 2308.6 60265 1155.6
## - NO3 1 2602.3 60559 1158.6
## - PO4 1 6243.5 64200 1170.3
##
## Step: AIC=1151.33
## a1 ~ size + mxPH + Cl + NO3 + NH4 + PO4
##
## Df Sum of Sq RSS AIC
## - NH4 1 525.6 58920 1151.1
## <none> 58394 1151.3
## - Cl 1 632.1 59027 1151.5
## - mxPH 1 636.9 59031 1151.5
## - size 2 2581.8 60976 1156.0
## - NO3 1 2186.5 60581 1156.7
## - PO4 1 9020.9 67415 1178.1
##
## Step: AIC=1151.12
## a1 ~ size + mxPH + Cl + NO3 + PO4
##
## Df Sum of Sq RSS AIC
## <none> 58920 1151.1
## - mxPH 1 605.6 59526 1151.2
## - Cl 1 892.4 59812 1152.1
## - NO3 1 1911.2 60831 1155.5
## - size 2 2764.8 61685 1156.3
## - PO4 1 8504.1 67424 1176.1
# Coefficients and fit statistics of the model retained by step().
summary(final.lm)
##
## Call:
## lm(formula = a1 ~ size + mxPH + Cl + NO3 + PO4, data = algae[,
## 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.577 -12.515 -3.712 8.596 63.203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.24832 18.79098 3.366 0.000921 ***
## sizemedium -7.56654 3.13688 -2.412 0.016796 *
## sizelarge -10.66720 3.78481 -2.818 0.005329 **
## mxPH -3.43045 2.43565 -1.408 0.160612
## Cl -0.05395 0.03156 -1.710 0.088926 .
## NO3 -0.87713 0.35056 -2.502 0.013178 *
## PO4 -0.05885 0.01115 -5.278 3.5e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.47 on 193 degrees of freedom
## Multiple R-squared: 0.3503, Adjusted R-squared: 0.3302
## F-statistic: 17.35 on 6 and 193 DF, p-value: 5.135e-16
# Regression tree ----
# (This heading used to be a bare line of prose, which is a syntax error in R.)
library(rpart)
# Grow a regression tree for a1 from the same columns used for the linear
# model, then print its node listing.
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])
rt.a1
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90694.880 16.923500
## 2) Cl>=7.307559 166 37804.490 10.588550
## 4) PO4>=43.818 143 21819.470 7.477622
## 8) oPO4>=51.118 85 3455.770 3.801176 *
## 9) oPO4< 51.118 58 15531.110 12.865520
## 18) mnO2>=10.05 25 1373.070 7.172000 *
## 19) mnO2< 10.05 33 12733.700 17.178790
## 38) oPO4< 24.917 8 382.800 6.750000 *
## 39) oPO4>=24.917 25 11202.390 20.516000
## 78) mnO2< 9.4 16 3486.958 14.762500 *
## 79) mnO2>=9.4 9 6244.202 30.744440 *
## 5) PO4< 43.818 23 5996.589 29.930430 *
## 3) Cl< 7.307559 34 13703.120 47.852940
## 6) NH4>=14.875 20 7564.538 41.010000
## 12) Cl< 4.7875 11 3860.756 31.718180 *
## 13) Cl>=4.7875 9 1593.300 52.366670 *
## 7) NH4< 14.875 14 3864.189 57.628570 *
library(rpart.plot)
# Draw the tree (orange leaf boxes, grey split boxes, extra = 101 node
# annotations) and print its complexity-parameter table.
prp(rt.a1, extra = 101, split.box.col = "grey", box.col = "orange")
printcp(rt.a1)
##
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
##
## Variables actually used in tree construction:
## [1] Cl mnO2 NH4 oPO4 PO4
##
## Root node error: 90695/200 = 453.47
##
## n= 200
##
## CP nsplit rel error xerror xstd
## 1 0.432078 0 1.00000 1.00758 0.13046
## 2 0.110132 1 0.56792 0.65764 0.11204
## 3 0.031232 2 0.45779 0.56736 0.10853
## 4 0.025077 3 0.42656 0.57120 0.10756
## 5 0.023270 4 0.40148 0.60422 0.11039
## 6 0.015705 5 0.37821 0.60786 0.10323
## 7 0.014443 6 0.36251 0.61610 0.10398
## 8 0.010000 8 0.33362 0.61785 0.10718
# Prune the tree back at complexity parameter 0.08 and print the result.
(rt2.a1 <- prune(rt.a1, cp = 0.08))
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90694.880 16.923500
## 2) Cl>=7.307559 166 37804.490 10.588550
## 4) PO4>=43.818 143 21819.470 7.477622 *
## 5) PO4< 43.818 23 5996.589 29.930430 *
## 3) Cl< 7.307559 34 13703.120 47.852940 *
# Refit with rpartXse, which grows and prunes the tree in one call.
# Note that this overwrites the earlier rt.a1.
rt.a1 <- rpartXse(a1 ~ ., data = algae[, 1:12])
rt.a1
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90694.880 16.923500
## 2) Cl>=7.307559 166 37804.490 10.588550
## 4) PO4>=43.818 143 21819.470 7.477622 *
## 5) PO4< 43.818 23 5996.589 29.930430 *
## 3) Cl< 7.307559 34 13703.120 47.852940 *
# Grow an unpruned tree again and show it with the branches below nodes 4 and
# 7 snipped off. The snipped tree is only printed; first.tree itself is left
# untouched because the result is not assigned.
first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
snip.rpart(first.tree, toss = c(4, 7))
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90694.880 16.923500
## 2) Cl>=7.307559 166 37804.490 10.588550
## 4) PO4>=43.818 143 21819.470 7.477622 *
## 5) PO4< 43.818 23 5996.589 29.930430 *
## 3) Cl< 7.307559 34 13703.120 47.852940
## 6) NH4>=14.875 20 7564.538 41.010000
## 12) Cl< 4.7875 11 3860.756 31.718180 *
## 13) Cl>=4.7875 9 1593.300 52.366670 *
## 7) NH4< 14.875 14 3864.189 57.628570 *
# Draw the tree and label it; the plot must exist before the next call.
plot(first.tree)
text(first.tree)
# snip.rpart() without node numbers is meant for picking the nodes to snip on
# the plot above. In the recorded run below nothing was selected, so the full
# tree is printed unchanged.
snip.rpart(first.tree)
## n= 200
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 200 90694.880 16.923500
## 2) Cl>=7.307559 166 37804.490 10.588550
## 4) PO4>=43.818 143 21819.470 7.477622
## 8) oPO4>=51.118 85 3455.770 3.801176 *
## 9) oPO4< 51.118 58 15531.110 12.865520
## 18) mnO2>=10.05 25 1373.070 7.172000 *
## 19) mnO2< 10.05 33 12733.700 17.178790
## 38) oPO4< 24.917 8 382.800 6.750000 *
## 39) oPO4>=24.917 25 11202.390 20.516000
## 78) mnO2< 9.4 16 3486.958 14.762500 *
## 79) mnO2>=9.4 9 6244.202 30.744440 *
## 5) PO4< 43.818 23 5996.589 29.930430 *
## 3) Cl< 7.307559 34 13703.120 47.852940
## 6) NH4>=14.875 20 7564.538 41.010000
## 12) Cl< 4.7875 11 3860.756 31.718180 *
## 13) Cl>=4.7875 9 1593.300 52.366670 *
## 7) NH4< 14.875 14 3864.189 57.628570 *
# Predictions of the selected linear model and of the tree for the same data
# the models were fitted on.
lm.predictions.a1 <- predict(final.lm, newdata = algae)
rt.predictions.a1 <- predict(rt.a1, newdata = algae)
# Mean absolute error of each model.
(mae.a1.lm <- mean(abs(algae[["a1"]] - lm.predictions.a1)))
## [1] 13.0787
(mae.a1.rt <- mean(abs(algae[["a1"]] - rt.predictions.a1)))
## [1] 9.704579
# Mean squared error of each model.
(mse.a1.lm <- mean((algae[["a1"]] - lm.predictions.a1)^2))
## [1] 294.6003
(mse.a1.rt <- mean((algae[["a1"]] - rt.predictions.a1)^2))
## [1] 207.5959
# Normalised MSE: the model's MSE divided by the MSE obtained when always
# predicting the mean of a1.
(nmse.a1.lm <- mse.a1.lm / mean((algae[["a1"]] - mean(algae[["a1"]]))^2))
## [1] 0.6496514
(nmse.a1.rt <- mse.a1.rt / mean((algae[["a1"]] - mean(algae[["a1"]]))^2))
## [1] 0.4577897
# Gather both sets of predictions next to the observed a1 values.
dg <- data.frame(
  lm.a1 = lm.predictions.a1,
  rt.a1 = rt.predictions.a1,
  true.a1 = algae[["a1"]]
)

# Observed against predicted values for each model; the red line y = x marks
# where perfect predictions would fall.
ggplot(dg, aes(x = lm.a1, y = true.a1)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Linear Model")
ggplot(dg, aes(x = rt.a1, y = true.a1)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Regression Tree")
# Negative predictions from the linear model are replaced by 0 before scoring.
sensible.lm.predictions.a1 <- pmax(lm.predictions.a1, 0)
# MAE of the raw predictions, recomputed for comparison ...
(mae.a1.lm <- mean(abs(algae[["a1"]] - lm.predictions.a1)))
## [1] 13.0787
# ... and MAE after clipping at zero.
(smae.a1.lm <- mean(abs(algae[["a1"]] - sensible.lm.predictions.a1)))
## [1] 12.45408
#install.packages('performanceEstimation')
library(performanceEstimation)
# Estimate the NMSE on the a1 task with 5 repetitions of 10-fold
# cross-validation for four workflows: a linear model (with "knnImp"
# pre-processing and "onlyPos" post-processing) and three rpartXse variants
# with se = 0, 0.5 and 1.
res <- performanceEstimation(
  PredTask(a1 ~ ., algae[, 1:12], "a1"),
  c(
    Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
    workflowVariants(
      learner = "rpartXse",
      learner.pars = list(se = c(0, 0.5, 1))
    )
  ),
  EstimationTask(metrics = "nmse", method = CV(nReps = 5, nFolds = 10))
)
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
# Per-workflow summary statistics of the cross-validated NMSE scores.
summary(res)
##
## == Summary of a Cross Validation Performance Estimation Experiment ==
##
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
##
## * Predictive Tasks :: a1
## * Workflows :: lm, rpartXse.v1, rpartXse.v2, rpartXse.v3
##
## -> Task: a1
## *Workflow: lm
## nmse
## avg 0.7148520
## std 0.1927418
## med 0.6898176
## iqr 0.1848591
## min 0.3826330
## max 1.4959328
## invalid 0.0000000
##
## *Workflow: rpartXse.v1
## nmse
## avg 0.5724882
## std 0.2631758
## med 0.5080657
## iqr 0.3081629
## min 0.1871173
## max 1.2187694
## invalid 0.0000000
##
## *Workflow: rpartXse.v2
## nmse
## avg 0.6091709
## std 0.2676449
## med 0.5828456
## iqr 0.3326727
## min 0.1871173
## max 1.1999532
## invalid 0.0000000
##
## *Workflow: rpartXse.v3
## nmse
## avg 0.6481759
## std 0.2536899
## med 0.6247864
## iqr 0.3500925
## min 0.2053678
## max 1.1999532
## invalid 0.0000000
# Plot the cross-validation results for the a1 task.
plot(res)
# Show the parameter settings behind the workflow with ID "rpartXse.v1".
getWorkflow("rpartXse.v1", res)
## Workflow Object:
## Workflow ID :: rpartXse.v1
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> se=0
## learner -> rpartXse
# One prediction task per target column (columns 12 to 18). Each task has the
# formula `<target> ~ .` and a copy of the data restricted to the eleven
# predictor columns plus that target.
DSs <- sapply(
  names(algae)[12:18],
  function(target, predictors) {
    task.formula <- as.formula(paste(target, "~ ."))
    PredTask(task.formula, algae[, c(predictors, target)], target, copy = TRUE)
  },
  names(algae)[1:11]
)
# Run the same four workflows (linear model and three rpartXse variants) on
# all the tasks in DSs, again scoring NMSE with 5 x 10-fold cross-validation.
res.all <- performanceEstimation(
  DSs,
  c(
    Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
    workflowVariants(
      learner = "rpartXse",
      learner.pars = list(se = c(0, 0.5, 1))
    )
  ),
  EstimationTask(metrics = "nmse", method = CV(nReps = 5, nFolds = 10))
)
##
##
## ##### PERFORMANCE ESTIMATION USING CROSS VALIDATION #####
##
## ** PREDICTIVE TASK :: a1
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a2
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a3
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a4
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a5
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a6
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ** PREDICTIVE TASK :: a7
##
## ++ MODEL/WORKFLOW :: lm
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v1
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v2
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
##
##
## ++ MODEL/WORKFLOW :: rpartXse.v3
## Task for estimating nmse using
## 5 x 10 - Fold Cross Validation
## Run with seed = 1234
## Iteration :**************************************************
# Plot the cross-validation results of every workflow on every task.
plot(res.all)
# For each task, list the workflow with the best estimated NMSE.
topPerformers(res.all)
## $a1
## Workflow Estimate
## nmse rpartXse.v1 0.572
##
## $a2
## Workflow Estimate
## nmse lm 0.96
##
## $a3
## Workflow Estimate
## nmse rpartXse.v2 1
##
## $a4
## Workflow Estimate
## nmse rpartXse.v2 1
##
## $a5
## Workflow Estimate
## nmse lm 0.996
##
## $a6
## Workflow Estimate
## nmse lm 0.916
##
## $a7
## Workflow Estimate
## nmse rpartXse.v2 1
# Collect, for each task in res.all, the workflow object with the best NMSE;
# the result is indexed by task name.
wfs <- sapply(
  taskNames(res.all),
  function(task.id) topPerformer(res.all, metric = "nmse", task = task.id)
)
# Inspect the workflow selected for a1.
wfs[["a1"]]
## Workflow Object:
## Workflow ID :: rpartXse.v1
## Workflow Function :: standardWF
## Parameter values:
## learner.pars -> se=0
## learner -> rpartXse
# Evaluate the selected workflow of each task on the separate test set.
# NOTE(review): test.algae and algae.sols are not created in this script; they
# are presumably the test predictors and their true target values shipped with
# DMwR2, laid out in the same column order as `algae` -- confirm.
full.test.algae <- cbind(test.algae, algae.sols)

# Sizes are taken from the data instead of being hard-coded (140 cases, 7
# targets in the original run).
target.names <- names(wfs)
n.test <- nrow(full.test.algae)

# pts[case, target, "trues" | "preds"] stores the true and predicted values.
pts <- array(
  dim = c(n.test, length(target.names), 2),
  dimnames = list(seq_len(n.test), target.names, c("trues", "preds"))
)

for (i in seq_along(wfs)) {
  # Columns 1:11 are the predictors; column 11 + i is the i-th target.
  # The result goes into wf.res so that the cross-validation object `res`
  # created earlier is no longer overwritten.
  wf.res <- runWorkflow(
    wfs[[i]],
    as.formula(paste(target.names[i], "~.")),
    algae[, c(1:11, 11 + i)],
    full.test.algae[, c(1:11, 11 + i)]
  )
  pts[, i, "trues"] <- wf.res$trues
  pts[, i, "preds"] <- wf.res$preds
}

# Baseline predictor: the mean of each target in the training data.
avg.preds <- colMeans(algae[, 12:18])

# NMSE per target on the test set: squared error of the workflow divided by
# the squared error of always predicting the training mean.
colSums((pts[, , "trues"] - pts[, , "preds"])^2) /
  colSums(scale(pts[, , "trues"], avg.preds, FALSE)^2)
## a1 a2 a3 a4 a5 a6 a7
## 0.5545185 1.0688121 1.0000000 1.0000000 0.8437873 0.8314581 1.0000000