#install.packages('DMwR2')
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Load the `algae` data set (presumably the one provided by DMwR2, attached
# above) and preview its first six rows.
data("algae")
head(algae, n = 6L)
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Summary statistics for every column except the three factors
# (season, size, speed).
algae %>%
  select(-c(season, size, speed)) %>%
  summary()
##       mxPH            mnO2              Cl               NO3        
##  Min.   :5.600   Min.   : 1.500   Min.   :  0.222   Min.   : 0.050  
##  1st Qu.:7.700   1st Qu.: 7.725   1st Qu.: 10.981   1st Qu.: 1.296  
##  Median :8.060   Median : 9.800   Median : 32.730   Median : 2.675  
##  Mean   :8.012   Mean   : 9.118   Mean   : 43.636   Mean   : 3.282  
##  3rd Qu.:8.400   3rd Qu.:10.800   3rd Qu.: 57.824   3rd Qu.: 4.446  
##  Max.   :9.700   Max.   :13.400   Max.   :391.500   Max.   :45.650  
##  NA's   :1       NA's   :2        NA's   :10        NA's   :2       
##       NH4                oPO4             PO4              Chla        
##  Min.   :    5.00   Min.   :  1.00   Min.   :  1.00   Min.   :  0.200  
##  1st Qu.:   38.33   1st Qu.: 15.70   1st Qu.: 41.38   1st Qu.:  2.000  
##  Median :  103.17   Median : 40.15   Median :103.29   Median :  5.475  
##  Mean   :  501.30   Mean   : 73.59   Mean   :137.88   Mean   : 13.971  
##  3rd Qu.:  226.95   3rd Qu.: 99.33   3rd Qu.:213.75   3rd Qu.: 18.308  
##  Max.   :24064.00   Max.   :564.60   Max.   :771.60   Max.   :110.456  
##  NA's   :2          NA's   :2        NA's   :2        NA's   :12       
##        a1              a2               a3               a4        
##  Min.   : 0.00   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 1.50   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 6.95   Median : 3.000   Median : 1.550   Median : 0.000  
##  Mean   :16.92   Mean   : 7.458   Mean   : 4.309   Mean   : 1.992  
##  3rd Qu.:24.80   3rd Qu.:11.375   3rd Qu.: 4.925   3rd Qu.: 2.400  
##  Max.   :89.80   Max.   :72.600   Max.   :42.800   Max.   :44.600  
##                                                                    
##        a5               a6               a7        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 1.900   Median : 0.000   Median : 1.000  
##  Mean   : 5.064   Mean   : 5.964   Mean   : 2.495  
##  3rd Qu.: 7.500   3rd Qu.: 6.925   3rd Qu.: 2.400  
##  Max.   :44.400   Max.   :77.600   Max.   :31.600  
## 
library('ggplot2')
# Histogram of mxPH on the density scale, overlaid with a kernel density
# estimate (red) and a rug marking the individual observations.
# `after_stat(density)` replaces the `..density..` notation that ggplot2 3.4.0
# deprecated; `bins = 30` spells out the value stat_bin() was already falling
# back to, so the bars are unchanged and the "Pick better value" message
# goes away.
ggplot(algae, aes(x = mxPH)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  geom_density(color = "red") +
  geom_rug() +
  ggtitle("The Histogram of mxPH (maximum pH)") +
  xlab("") +
  ylab("")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 1 rows containing non-finite values (`stat_density()`).

library(car) 
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
# Normal quantile-quantile plot of mxPH (car::qqPlot); the y-axis label is
# deliberately left empty.
qqPlot(
  algae$mxPH,
  ylab = "",
  main = "Normal QQ plot of maximum pH"
)

## [1] 56 57
 # Distribution of alga a1 per river size: violins with the raw observations
 # jittered on top.
ggplot(algae, aes(x = size, y = a1)) +
  geom_violin() +
  geom_jitter() +
  labs(x = "River Size", y = "Algal A1")

# mxPH against river size, one panel per river speed.
# Only the jittered layer is drawn: the original stacked geom_point() and
# geom_jitter(), which plotted every observation twice (once at its exact
# position and once displaced) and emitted the missing-value warning twice.
ggplot(algae, aes(x = mxPH, y = size, color = size)) +
  geom_jitter(height = 0.4) +
  facet_wrap(~speed)
## Warning: Removed 1 rows containing missing values (`geom_point()`).
## Removed 1 rows containing missing values (`geom_point()`).

library(ggplot2)
library(forcats) 
# Put the levels of the three factors into their natural order
# (small < medium < large, low < medium < high, spring .. winter) so that
# plots and model output list them in that order.
algae <- algae %>%
  mutate(
    size = fct_relevel(size, c("small", "medium", "large")),
    speed = fct_relevel(speed, c("low", "medium", "high")),
    season = fct_relevel(season, c("spring", "summer", "autumn", "winter"))
  )
# Symbolic view of the correlation matrix of columns 4:18 (the numeric
# columns), computed on rows without missing values.
algae[, 4:18] %>%
  cor(use = "complete.obs") %>%
  symnum()
##      mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1                                         
## mnO2    1                                      
## Cl         1                                   
## NO3           1                                
## NH4           ,  1                             
## oPO4    .  .        1                          
## PO4     .  .        * 1                        
## Chla .                  1                      
## a1         .        . .    1                   
## a2   .                  .     1                
## a3                               1             
## a4      .           . .             1          
## a5                                     1       
## a6            .  .                     .  1    
## a7                                           1 
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
# Fill every unknown value from the ten most similar water samples
# (DMwR2::knnImputation with k = 10).
algae <- knnImputation(algae, k = 10)
# Alternatives that were tried and left disabled:
#   algae <- knnImputation(algae, k = 10, meth = "median")
#   algae <- na.omit(algae)

# Check that no column still holds a missing value.
algae %>%
  is.na() %>%
  colSums()
## season   size  speed   mxPH   mnO2     Cl    NO3    NH4   oPO4    PO4   Chla 
##      0      0      0      0      0      0      0      0      0      0      0 
##     a1     a2     a3     a4     a5     a6     a7 
##      0      0      0      0      0      0      0
# Bin mnO2 into its four quartile intervals (`include.lowest = TRUE` keeps the
# minimum inside the first bin) and store the bin as `minO2`.
data2graph <- algae %>%
  filter(!is.na(mnO2)) %>%
  mutate(
    minO2 = cut(
      mnO2,
      quantile(mnO2, c(0, 0.25, 0.5, 0.75, 1)),
      include.lowest = TRUE
    )
  )

# Alga a3 per season, one panel per mnO2 quartile. The colour legend is
# suppressed because the y axis already shows the season;
# `guides(color = "none")` replaces `color = FALSE`, which ggplot2 deprecated
# in 3.3.4.
ggplot(data2graph, aes(x = a3, y = season, color = season)) +
  geom_point() +
  facet_wrap(~minO2) +
  guides(color = "none")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(corrplot) 
## corrplot 0.92 loaded
# Correlation matrix of columns 4:18, shown as one combined figure:
# glyphs in the upper triangle with variable names on the diagonal, and the
# numeric coefficients added to the lower triangle of the same plot.
cm <- algae[, 4:18] %>% cor(use = "complete.obs")
corrplot(cm, type = "upper", tl.pos = "d")
corrplot(
  cm,
  add = TRUE,
  type = "lower",
  method = "number",
  diag = FALSE,
  tl.pos = "n",
  cl.pos = "n"
)

# Linear regression of a1 on all other columns of algae[, 1:12] (the three
# factors and the eight chemical measurements). The call is kept verbatim
# because it is stored in the fit and re-used by update() and step() below.
lm.a1 <- lm(a1 ~ ., data = algae[, 1:12])
summary(lm.a1)
## 
## Call:
## lm(formula = a1 ~ ., data = algae[, 1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.289 -11.984  -2.587   7.158  62.485 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  56.130043  21.494984   2.611  0.00976 **
## seasonsummer -3.360656   3.715667  -0.904  0.36694   
## seasonautumn -3.635550   4.124706  -0.881  0.37925   
## seasonwinter -0.201292   3.366027  -0.060  0.95238   
## sizemedium   -6.382976   3.437924  -1.857  0.06496 . 
## sizelarge    -9.774352   4.121995  -2.371  0.01876 * 
## speedmedium  -3.932265   4.079343  -0.964  0.33634   
## speedhigh    -3.735161   4.669264  -0.800  0.42477   
## mxPH         -3.064221   2.649865  -1.156  0.24903   
## mnO2          1.041171   0.702533   1.482  0.14004   
## Cl           -0.041356   0.033538  -1.233  0.21911   
## NO3          -1.487189   0.549263  -2.708  0.00742 **
## NH4           0.001623   0.001000   1.623  0.10627   
## oPO4         -0.006952   0.039738  -0.175  0.86132   
## PO4          -0.050319   0.030608  -1.644  0.10188   
## Chla         -0.095163   0.079524  -1.197  0.23298   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.6 on 184 degrees of freedom
## Multiple R-squared:  0.3718, Adjusted R-squared:  0.3205 
## F-statistic: 7.259 on 15 and 184 DF,  p-value: 1.962e-12
# Analysis-of-variance table for the full linear model.
anova(lm.a1)

# `season` is the variable that contributes least to reducing the model's fitting error.

 # Refit the linear model with `season` removed and inspect the new fit.
 lm2.a1 <- update(lm.a1, . ~ . - season)
summary(lm2.a1)
## 
## Call:
## lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + 
##     oPO4 + PO4 + Chla, data = algae[, 1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.177 -11.627  -3.127   7.473  64.140 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  5.382e+01  2.123e+01   2.535  0.01205 * 
## sizemedium  -6.959e+00  3.382e+00  -2.058  0.04102 * 
## sizelarge   -1.034e+01  4.073e+00  -2.539  0.01192 * 
## speedmedium -3.585e+00  4.050e+00  -0.885  0.37720   
## speedhigh   -2.857e+00  4.575e+00  -0.624  0.53316   
## mxPH        -2.721e+00  2.604e+00  -1.045  0.29738   
## mnO2         7.899e-01  6.566e-01   1.203  0.23047   
## Cl          -3.897e-02  3.327e-02  -1.171  0.24301   
## NO3         -1.508e+00  5.457e-01  -2.764  0.00629 **
## NH4          1.573e-03  9.922e-04   1.586  0.11453   
## oPO4        -8.039e-03  3.936e-02  -0.204  0.83838   
## PO4         -4.898e-02  3.038e-02  -1.613  0.10853   
## Chla        -9.085e-02  7.903e-02  -1.150  0.25179   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.53 on 187 degrees of freedom
## Multiple R-squared:  0.3666, Adjusted R-squared:  0.326 
## F-statistic:  9.02 on 12 and 187 DF,  p-value: 1.447e-13
 # Compare the full model with the fit that omits `season`.
anova(lm.a1, lm2.a1)

# Simplify the full model with step(); the trace it prints shows one term
# being dropped per step while the AIC decreases.
final.lm <- step(lm.a1)
## Start:  AIC=1162.42
## a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + 
##     PO4 + Chla
## 
##          Df Sum of Sq   RSS    AIC
## - season  3    464.97 57444 1158.0
## - speed   2    292.88 57272 1159.5
## - oPO4    1      9.48 56988 1160.5
## - mxPH    1    414.08 57393 1161.9
## - Chla    1    443.44 57422 1162.0
## - Cl      1    470.87 57450 1162.1
## <none>                56979 1162.4
## - mnO2    1    680.15 57659 1162.8
## - NH4     1    815.83 57795 1163.3
## - PO4     1    836.97 57816 1163.3
## - size    2   1907.16 58886 1165.0
## - NO3     1   2270.22 59249 1168.2
## 
## Step:  AIC=1158.05
## a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + 
##     Chla
## 
##         Df Sum of Sq   RSS    AIC
## - speed  2    241.35 57685 1154.9
## - oPO4   1     12.82 57457 1156.1
## - mxPH   1    335.44 57779 1157.2
## - Chla   1    405.94 57850 1157.5
## - Cl     1    421.38 57865 1157.5
## - mnO2   1    444.62 57888 1157.6
## <none>               57444 1158.0
## - NH4    1    772.24 58216 1158.7
## - PO4    1    798.75 58243 1158.8
## - size   2   2221.90 59666 1161.6
## - NO3    1   2346.19 59790 1164.1
## 
## Step:  AIC=1154.89
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## - oPO4  1     22.84 57708 1153.0
## - Chla  1    270.76 57956 1153.8
## - mxPH  1    339.76 58025 1154.1
## - mnO2  1    417.35 58102 1154.3
## - Cl    1    517.03 58202 1154.7
## <none>              57685 1154.9
## - NH4   1    716.31 58401 1155.4
## - PO4   1    759.91 58445 1155.5
## - size  2   2140.20 59825 1158.2
## - NO3   1   2306.11 59991 1160.7
## 
## Step:  AIC=1152.97
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4 + Chla
## 
##        Df Sum of Sq   RSS    AIC
## - Chla  1     248.2 57956 1151.8
## - mxPH  1     372.0 58080 1152.2
## - mnO2  1     403.5 58111 1152.4
## - Cl    1     514.6 58223 1152.7
## <none>              57708 1153.0
## - NH4   1     695.7 58404 1153.4
## - size  2    2118.8 59827 1156.2
## - NO3   1    2294.6 60003 1158.8
## - PO4   1    5730.8 63439 1169.9
## 
## Step:  AIC=1151.83
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4
## 
##        Df Sum of Sq   RSS    AIC
## - mnO2  1     438.2 58394 1151.3
## - Cl    1     476.7 58433 1151.5
## <none>              57956 1151.8
## - mxPH  1     651.8 58608 1152.1
## - NH4   1     741.2 58697 1152.4
## - size  2    2308.6 60265 1155.6
## - NO3   1    2602.3 60559 1158.6
## - PO4   1    6243.5 64200 1170.3
## 
## Step:  AIC=1151.33
## a1 ~ size + mxPH + Cl + NO3 + NH4 + PO4
## 
##        Df Sum of Sq   RSS    AIC
## - NH4   1     525.6 58920 1151.1
## <none>              58394 1151.3
## - Cl    1     632.1 59027 1151.5
## - mxPH  1     636.9 59031 1151.5
## - size  2    2581.8 60976 1156.0
## - NO3   1    2186.5 60581 1156.7
## - PO4   1    9020.9 67415 1178.1
## 
## Step:  AIC=1151.12
## a1 ~ size + mxPH + Cl + NO3 + PO4
## 
##        Df Sum of Sq   RSS    AIC
## <none>              58920 1151.1
## - mxPH  1     605.6 59526 1151.2
## - Cl    1     892.4 59812 1152.1
## - NO3   1    1911.2 60831 1155.5
## - size  2    2764.8 61685 1156.3
## - PO4   1    8504.1 67424 1176.1
 # Coefficients and fit statistics of the model selected by step().
 summary(final.lm)
## 
## Call:
## lm(formula = a1 ~ size + mxPH + Cl + NO3 + PO4, data = algae[, 
##     1:12])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.577 -12.515  -3.712   8.596  63.203 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  63.24832   18.79098   3.366 0.000921 ***
## sizemedium   -7.56654    3.13688  -2.412 0.016796 *  
## sizelarge   -10.66720    3.78481  -2.818 0.005329 ** 
## mxPH         -3.43045    2.43565  -1.408 0.160612    
## Cl           -0.05395    0.03156  -1.710 0.088926 .  
## NO3          -0.87713    0.35056  -2.502 0.013178 *  
## PO4          -0.05885    0.01115  -5.278  3.5e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.47 on 193 degrees of freedom
## Multiple R-squared:  0.3503, Adjusted R-squared:  0.3302 
## F-statistic: 17.35 on 6 and 193 DF,  p-value: 5.135e-16

# Regression tree

library(rpart)
# Regression tree for a1 on the same 11 predictors used for the linear model;
# printing the object lists its nodes. The call is kept verbatim because it is
# stored in the tree and echoed by printcp().
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])
rt.a1 
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 200 90694.880 16.923500  
##    2) Cl>=7.307559 166 37804.490 10.588550  
##      4) PO4>=43.818 143 21819.470  7.477622  
##        8) oPO4>=51.118 85  3455.770  3.801176 *
##        9) oPO4< 51.118 58 15531.110 12.865520  
##         18) mnO2>=10.05 25  1373.070  7.172000 *
##         19) mnO2< 10.05 33 12733.700 17.178790  
##           38) oPO4< 24.917 8   382.800  6.750000 *
##           39) oPO4>=24.917 25 11202.390 20.516000  
##             78) mnO2< 9.4 16  3486.958 14.762500 *
##             79) mnO2>=9.4 9  6244.202 30.744440 *
##      5) PO4< 43.818 23  5996.589 29.930430 *
##    3) Cl< 7.307559 34 13703.120 47.852940  
##      6) NH4>=14.875 20  7564.538 41.010000  
##       12) Cl< 4.7875 11  3860.756 31.718180 *
##       13) Cl>=4.7875 9  1593.300 52.366670 *
##      7) NH4< 14.875 14  3864.189 57.628570 *
library(rpart.plot) 
# Draw the tree with rpart.plot::prp: orange leaf boxes, grey split boxes.
# See prp()'s `extra` argument for the node annotations selected by 101.
prp(
  rt.a1,
  extra = 101,
  box.col = "orange",
  split.box.col = "grey"
)

# Complexity-parameter table of the tree (CP, number of splits, relative and
# cross-validated error), used to choose where to prune.
printcp(rt.a1)
## 
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
## 
## Variables actually used in tree construction:
## [1] Cl   mnO2 NH4  oPO4 PO4 
## 
## Root node error: 90695/200 = 453.47
## 
## n= 200 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.432078      0   1.00000 1.00758 0.13046
## 2 0.110132      1   0.56792 0.65764 0.11204
## 3 0.031232      2   0.45779 0.56736 0.10853
## 4 0.025077      3   0.42656 0.57120 0.10756
## 5 0.023270      4   0.40148 0.60422 0.11039
## 6 0.015705      5   0.37821 0.60786 0.10323
## 7 0.014443      6   0.36251 0.61610 0.10398
## 8 0.010000      8   0.33362 0.61785 0.10718
# Prune the tree at cp = 0.08 and print the result (the parentheses
# auto-print the assigned value).
(rt2.a1 <- prune(rt.a1, cp = 0.08))
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 200 90694.880 16.923500  
##   2) Cl>=7.307559 166 37804.490 10.588550  
##     4) PO4>=43.818 143 21819.470  7.477622 *
##     5) PO4< 43.818 23  5996.589 29.930430 *
##   3) Cl< 7.307559 34 13703.120 47.852940 *
 # Grow and prune in a single step with DMwR2::rpartXse; this replaces the
 # unpruned `rt.a1` fitted earlier. The result is then printed.
rt.a1 <- rpartXse(a1 ~ ., data = algae[, 1:12])
rt.a1
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 200 90694.880 16.923500  
##   2) Cl>=7.307559 166 37804.490 10.588550  
##     4) PO4>=43.818 143 21819.470  7.477622 *
##     5) PO4< 43.818 23  5996.589 29.930430 *
##   3) Cl< 7.307559 34 13703.120 47.852940 *
# Fit a fresh unpruned tree, then show it with the sub-trees below nodes 4
# and 7 snipped off. The snipped tree is only printed, not stored.
first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
snip.rpart(first.tree, toss = c(4, 7))
## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 200 90694.880 16.923500  
##    2) Cl>=7.307559 166 37804.490 10.588550  
##      4) PO4>=43.818 143 21819.470  7.477622 *
##      5) PO4< 43.818 23  5996.589 29.930430 *
##    3) Cl< 7.307559 34 13703.120 47.852940  
##      6) NH4>=14.875 20  7564.538 41.010000  
##       12) Cl< 4.7875 11  3860.756 31.718180 *
##       13) Cl>=4.7875 9  1593.300 52.366670 *
##      7) NH4< 14.875 14  3864.189 57.628570 *
# Draw the unpruned tree with base graphics and label its nodes.
plot(first.tree) 
text(first.tree)
# snip.rpart() is called here without a `toss` argument.
# NOTE(review): rpart documents this form as interactive (nodes are picked
# with the mouse on the plot drawn above) -- confirm that is intended; in the
# recorded output below the tree comes back unchanged.
snip.rpart(first.tree)

## n= 200 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 200 90694.880 16.923500  
##    2) Cl>=7.307559 166 37804.490 10.588550  
##      4) PO4>=43.818 143 21819.470  7.477622  
##        8) oPO4>=51.118 85  3455.770  3.801176 *
##        9) oPO4< 51.118 58 15531.110 12.865520  
##         18) mnO2>=10.05 25  1373.070  7.172000 *
##         19) mnO2< 10.05 33 12733.700 17.178790  
##           38) oPO4< 24.917 8   382.800  6.750000 *
##           39) oPO4>=24.917 25 11202.390 20.516000  
##             78) mnO2< 9.4 16  3486.958 14.762500 *
##             79) mnO2>=9.4 9  6244.202 30.744440 *
##      5) PO4< 43.818 23  5996.589 29.930430 *
##    3) Cl< 7.307559 34 13703.120 47.852940  
##      6) NH4>=14.875 20  7564.538 41.010000  
##       12) Cl< 4.7875 11  3860.756 31.718180 *
##       13) Cl>=4.7875 9  1593.300 52.366670 *
##      7) NH4< 14.875 14  3864.189 57.628570 *
# Predictions of the selected linear model and of the pruned tree for the
# same samples the models were fitted on (`algae`).
lm.predictions.a1 <- predict(final.lm, algae)
rt.predictions.a1 <- predict(rt.a1, algae)

# Mean absolute error.
(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[["a1"]])))
## [1] 13.0787
(mae.a1.rt <- mean(abs(rt.predictions.a1 - algae[["a1"]])))
## [1] 9.704579

# Mean squared error.
(mse.a1.lm <- mean((lm.predictions.a1 - algae[["a1"]])^2))
## [1] 294.6003
(mse.a1.rt <- mean((rt.predictions.a1 - algae[["a1"]])^2))
## [1] 207.5959

# Normalised MSE: the model's MSE divided by the MSE of always predicting the
# mean of a1 (values below 1 beat that baseline).
# The stray `+` that followed each `/` was a console continuation prompt
# pasted into the code; it only parsed because R read it as a unary plus.
(nmse.a1.lm <- mean((lm.predictions.a1 - algae[["a1"]])^2) /
  mean((mean(algae[["a1"]]) - algae[["a1"]])^2))
## [1] 0.6496514
(nmse.a1.rt <- mean((rt.predictions.a1 - algae[["a1"]])^2) /
  mean((mean(algae[["a1"]]) - algae[["a1"]])^2))
## [1] 0.4577897
# Collect both models' predictions next to the observed a1 values.
dg <- data.frame(
  lm.a1 = lm.predictions.a1,
  rt.a1 = rt.predictions.a1,
  true.a1 = algae[["a1"]]
)

# Predicted vs. observed for the linear model; the red line is y = x, i.e.
# where perfect predictions would fall.
ggplot(dg, aes(x = lm.a1, y = true.a1)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Linear Model")

# The same plot for the regression tree.
ggplot(dg, aes(x = rt.a1, y = true.a1)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Regression Tree")

# Negative predictions are replaced by 0 before the MAE is recomputed.
sensible.lm.predictions.a1 <- pmax(lm.predictions.a1, 0)

# MAE of the raw linear-model predictions ...
(mae.a1.lm <- mean(abs(lm.predictions.a1 - algae[["a1"]])))
## [1] 13.0787
# ... and of the clamped ones.
(smae.a1.lm <- mean(abs(sensible.lm.predictions.a1 - algae[["a1"]])))
## [1] 12.45408
#install.packages('performanceEstimation')
library(performanceEstimation)
# Estimate NMSE for a1 by 5 repetitions of 10-fold cross-validation for:
#   * a linear model with kNN imputation before fitting and the "onlyPos"
#     post-processing step applied to its predictions, and
#   * three rpartXse variants (se = 0, 0.5, 1).
res <- performanceEstimation(
  PredTask(a1 ~ ., algae[, 1:12], "a1"),
  c(
    Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
    workflowVariants(
      learner = "rpartXse",
      learner.pars = list(se = c(0, 0.5, 1))
    )
  ),
  EstimationTask(
    metrics = "nmse",
    method = CV(nReps = 5, nFolds = 10)
  )
)
## 
## 
## ##### PERFORMANCE ESTIMATION USING  CROSS VALIDATION  #####
## 
## ** PREDICTIVE TASK :: a1
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
# Per-workflow summary statistics of the cross-validated NMSE for a1.
summary(res)
## 
## == Summary of a  Cross Validation Performance Estimation Experiment ==
## 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## 
## * Predictive Tasks ::  a1
## * Workflows  ::  lm, rpartXse.v1, rpartXse.v2, rpartXse.v3 
## 
## -> Task:  a1
##   *Workflow: lm 
##              nmse
## avg     0.7148520
## std     0.1927418
## med     0.6898176
## iqr     0.1848591
## min     0.3826330
## max     1.4959328
## invalid 0.0000000
## 
##   *Workflow: rpartXse.v1 
##              nmse
## avg     0.5724882
## std     0.2631758
## med     0.5080657
## iqr     0.3081629
## min     0.1871173
## max     1.2187694
## invalid 0.0000000
## 
##   *Workflow: rpartXse.v2 
##              nmse
## avg     0.6091709
## std     0.2676449
## med     0.5828456
## iqr     0.3326727
## min     0.1871173
## max     1.1999532
## invalid 0.0000000
## 
##   *Workflow: rpartXse.v3 
##              nmse
## avg     0.6481759
## std     0.2536899
## med     0.6247864
## iqr     0.3500925
## min     0.2053678
## max     1.1999532
## invalid 0.0000000
# Plot the cross-validation results of the a1 experiment.
plot(res)

# Show the parameter settings behind the workflow ID "rpartXse.v1".
getWorkflow("rpartXse.v1", res)
## Workflow Object:
##  Workflow ID       ::  rpartXse.v1 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> se=0 
##       learner  -> rpartXse
# One prediction task per alga (columns 12:18): each task models that alga
# from the 11 predictor columns and gets its own copy of the data.
# The list is named after the target columns.
DSs <- lapply(
  setNames(nm = names(algae)[12:18]),
  function(x, names.attrs) {
    PredTask(
      as.formula(paste(x, "~ .")),
      algae[, c(names.attrs, x)],
      x,
      copy = TRUE
    )
  },
  names.attrs = names(algae)[1:11]
)
# Run the same four workflows (linear model plus three rpartXse variants) on
# all seven tasks, again estimating NMSE with 5 x 10-fold cross-validation.
res.all <- performanceEstimation(
  DSs,
  c(
    Workflow(learner = "lm", pre = "knnImp", post = "onlyPos"),
    workflowVariants(
      learner = "rpartXse",
      learner.pars = list(se = c(0, 0.5, 1))
    )
  ),
  EstimationTask(
    metrics = "nmse",
    method = CV(nReps = 5, nFolds = 10)
  )
)
## 
## 
## ##### PERFORMANCE ESTIMATION USING  CROSS VALIDATION  #####
## 
## ** PREDICTIVE TASK :: a1
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a2
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a3
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a4
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a5
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a6
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ** PREDICTIVE TASK :: a7
## 
## ++ MODEL/WORKFLOW :: lm 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v1 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v2 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
## 
## 
## ++ MODEL/WORKFLOW :: rpartXse.v3 
## Task for estimating  nmse  using
##  5 x 10 - Fold Cross Validation
##   Run with seed =  1234 
## Iteration :**************************************************
# Plot the cross-validation results for all seven tasks.
plot(res.all)

# Best workflow and its estimated NMSE for each task.
topPerformers(res.all)
## $a1
##         Workflow Estimate
## nmse rpartXse.v1    0.572
## 
## $a2
##      Workflow Estimate
## nmse       lm     0.96
## 
## $a3
##         Workflow Estimate
## nmse rpartXse.v2        1
## 
## $a4
##         Workflow Estimate
## nmse rpartXse.v2        1
## 
## $a5
##      Workflow Estimate
## nmse       lm    0.996
## 
## $a6
##      Workflow Estimate
## nmse       lm    0.916
## 
## $a7
##         Workflow Estimate
## nmse rpartXse.v2        1
# For every task, fetch the workflow object with the best NMSE; the result is
# indexed by task name.
wfs <- sapply(
  taskNames(res.all),
  function(task.name) {
    topPerformer(res.all, metric = "nmse", task = task.name)
  }
)
# Inspect the winner for a1.
wfs[["a1"]]
## Workflow Object:
##  Workflow ID       ::  rpartXse.v1 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> se=0 
##       learner  -> rpartXse
# Test samples joined with their true alga frequencies.
# NOTE(review): `test.algae` and `algae.sols` are not created anywhere in the
# visible code -- presumably they come with the DMwR2 data; confirm.
full.test.algae <- cbind(test.algae, algae.sols)

# pts[sample, alga, "trues" | "preds"] holds, for every test sample and every
# alga, the observed value and the prediction of that alga's best workflow.
# The dimensions come from the data and from `wfs` rather than the literals
# 140 and 7.
pts <- array(
  dim = c(nrow(full.test.algae), length(wfs), 2),
  dimnames = list(
    seq_len(nrow(full.test.algae)),
    names(wfs),
    c("trues", "preds")
  )
)

for (i in seq_along(wfs)) {
  # Train workflow i on the full training set and predict the test set.
  # Columns 1:11 are the predictors and column 11 + i is the i-th alga.
  # The result is kept in `wf.res` so the loop no longer overwrites `res`,
  # which holds the cross-validation results of the a1 experiment.
  wf.res <- runWorkflow(
    wfs[[i]],
    as.formula(paste(names(wfs)[i], "~.")),
    algae[, c(1:11, 11 + i)],
    full.test.algae[, c(1:11, 11 + i)]
  )
  pts[, i, "trues"] <- wf.res$trues
  pts[, i, "preds"] <- wf.res$preds
}
# Baseline prediction for each alga: its mean over the training data.
# vapply() iterates over the data-frame columns directly instead of routing
# the data frame through apply()'s matrix coercion.
avg.preds <- vapply(algae[, 12:18], mean, numeric(1))

# Test-set NMSE per alga: squared error of the selected workflow divided by
# the squared error of the training-mean baseline. scale() with
# `scale = FALSE` just subtracts `avg.preds` column by column.
# The stray `+` after `/` (a pasted console continuation prompt, parsed as a
# unary plus) has been removed.
apply((pts[, , "trues"] - pts[, , "preds"])^2, 2, sum) /
  apply(scale(pts[, , "trues"], center = avg.preds, scale = FALSE)^2, 2, sum)
##        a1        a2        a3        a4        a5        a6        a7 
## 0.5545185 1.0688121 1.0000000 1.0000000 0.8437873 0.8314581 1.0000000