Algae Bloom

Objective

To predict the frequency occurrence of several harmful algae in water samples.
Note: Algae are a type of plant-like living things that can make food from sunlight by photosynthesis.

Motivation and Background

About the dataset

There are two main datasets for this problem:
1. Contains data for 200 water samples. Each observation contains information on 11 variables. 3 of these variables describe season of the year as well as size and speed of the river. The other 8 variables are values of different chemical parameters measured in water samples.  
2. Contains information on 140 extra observations. Can be regarded as a kind of test set. The main goal of this study is to predict the frequencies of the seven algae for these 140 water samples.

Loading the Dataset and Viewing the Data

library(DMwR2)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
head(algae)

Understanding the Data

A fist idea of the statistical properties of the data can be obtained through a summary of its descriptive statistics

summary(algae)
    season       size       speed         mxPH            mnO2              Cl               NO3              NH4                oPO4       
 autumn:40   large :45   high  :84   Min.   :5.600   Min.   : 1.500   Min.   :  0.222   Min.   : 0.050   Min.   :    5.00   Min.   :  1.00  
 spring:53   medium:84   low   :33   1st Qu.:7.700   1st Qu.: 7.725   1st Qu.: 10.981   1st Qu.: 1.296   1st Qu.:   38.33   1st Qu.: 15.70  
 summer:45   small :71   medium:83   Median :8.060   Median : 9.800   Median : 32.730   Median : 2.675   Median :  103.17   Median : 40.15  
 winter:62                           Mean   :8.012   Mean   : 9.118   Mean   : 43.636   Mean   : 3.282   Mean   :  501.30   Mean   : 73.59  
                                     3rd Qu.:8.400   3rd Qu.:10.800   3rd Qu.: 57.824   3rd Qu.: 4.446   3rd Qu.:  226.95   3rd Qu.: 99.33  
                                     Max.   :9.700   Max.   :13.400   Max.   :391.500   Max.   :45.650   Max.   :24064.00   Max.   :564.60  
                                     NA's   :1       NA's   :2        NA's   :10        NA's   :2        NA's   :2          NA's   :2       
      PO4              Chla               a1              a2               a3               a4               a5               a6        
 Min.   :  1.00   Min.   :  0.200   Min.   : 0.00   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
 1st Qu.: 41.38   1st Qu.:  2.000   1st Qu.: 1.50   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
 Median :103.29   Median :  5.475   Median : 6.95   Median : 3.000   Median : 1.550   Median : 0.000   Median : 1.900   Median : 0.000  
 Mean   :137.88   Mean   : 13.971   Mean   :16.92   Mean   : 7.458   Mean   : 4.309   Mean   : 1.992   Mean   : 5.064   Mean   : 5.964  
 3rd Qu.:213.75   3rd Qu.: 18.308   3rd Qu.:24.80   3rd Qu.:11.375   3rd Qu.: 4.925   3rd Qu.: 2.400   3rd Qu.: 7.500   3rd Qu.: 6.925  
 Max.   :771.60   Max.   :110.456   Max.   :89.80   Max.   :72.600   Max.   :42.800   Max.   :44.600   Max.   :44.400   Max.   :77.600  
 NA's   :2        NA's   :12                                                                                                            
       a7        
 Min.   : 0.000  
 1st Qu.: 0.000  
 Median : 1.000  
 Mean   : 2.495  
 3rd Qu.: 2.400  
 Max.   :31.600  
                 
hist(algae$mxPH, prob=T)

library(ggplot2)
ggplot(algae,aes(x=mxPH)) + geom_histogram(aes(y=..density..))

ggplot(algae,aes(x=mxPH)) + 
    geom_histogram(aes(y=..density..)) + 
    geom_density(color="red") + geom_rug() + 
    ggtitle("The Histogram of mxPH (maximum pH)") + 
    xlab("") + ylab("")


qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")
[1] 56 57

ggplot(algae,aes(x=factor(0),y=oPO4)) + 
    geom_boxplot() + geom_rug() + 
    geom_hline(aes(yintercept=mean(algae$oPO4, na.rm = TRUE)),
               linetype=2,colour="red") +
    ylab("Orthophosphate (oPO4)") + xlab("") + scale_x_discrete(breaks=NULL)

plot(algae$NH4, xlab = "")
abline(h = mean(algae$NH4, na.rm = T), lty = 1)
abline(h = mean(algae$NH4, na.rm = T) + sd(algae$NH4, na.rm = T), lty = 2)
abline(h = median(algae$NH4, na.rm = T), lty = 3)
identify(algae$NH4)
integer(0)

library(dplyr)

Attaching package: ‘dplyr’

The following object is masked from ‘package:car’:

    recode

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
filter(algae, NH4 > 19000)

##
ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() +
    xlab("River Size") + ylab("Algal A1")

ggplot(algae,aes(x=size,y=a1)) + 
    geom_violin() + geom_jitter() + xlab("River Size") + ylab("Algal A1")

data2graph <- filter(algae,!is.na(mnO2)) %>%
    mutate(minO2=cut(mnO2, quantile(mnO2,c(0,0.25,.5,.75,1)), include.lowest=TRUE))
ggplot(data2graph,aes(x=a3,y=season, color=season)) + geom_point() + 
    facet_wrap(~ minO2) + 
    guides(color=FALSE)

data(algae)  


#### sub-section:  Removing the Observations with Unknown Values

##
filter(algae, !complete.cases(algae) )

##
algae <- na.omit(algae)
data(algae, package="DMwR2") # only necessary if you executed the above na.omit()
algae <- algae[-c(62, 199), ]

##
apply(algae, 1, function(x) sum(is.na(x)))
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 2 2 2 2 2 1 0 0 0 0 0 0 0
 [70] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[139] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
data(algae, package="DMwR2")
manyNAs(algae, 0.2)
[1]  62 199
##
algae <- algae[-manyNAs(algae), ]


#### sub-section:  Filling in the Unknowns with the Most Frequent Values

##
algae[48, "mxPH"] <- mean(algae$mxPH, na.rm = TRUE)

##
algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = TRUE)
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
algae <- centralImputation(algae)


#### sub-section:  Filling in the Unknown Values by Exploring Correlations

##
cor(algae[, 4:18], use = "complete.obs")
            mxPH        mnO2          Cl         NO3         NH4        oPO4         PO4        Chla          a1          a2          a3
mxPH  1.00000000 -0.16749178  0.13285681 -0.13103951 -0.09360612  0.15850785  0.18033494  0.39121495 -0.26823725  0.32584814  0.03077250
mnO2 -0.16749178  1.00000000 -0.27873229  0.09837676 -0.08780541 -0.41655069 -0.48772564 -0.16678069  0.28389830 -0.09935631 -0.25155437
Cl    0.13285681 -0.27873229  1.00000000  0.22504071  0.07407466  0.39230733  0.45652107  0.15082753 -0.36078101  0.08949837  0.09429722
NO3  -0.13103951  0.09837676  0.22504071  1.00000000  0.72144352  0.14458782  0.16931401  0.14290962 -0.24121109  0.02368832 -0.07621407
NH4  -0.09360612 -0.08780541  0.07407466  0.72144352  1.00000000  0.22723723  0.20844445  0.09375115 -0.13265601 -0.02968344 -0.10143974
oPO4  0.15850785 -0.41655069  0.39230733  0.14458782  0.22723723  1.00000000  0.91387767  0.12941615 -0.41735761  0.14768993  0.03362906
PO4   0.18033494 -0.48772564  0.45652107  0.16931401  0.20844445  0.91387767  1.00000000  0.26758873 -0.48730097  0.16246963  0.06587312
Chla  0.39121495 -0.16678069  0.15082753  0.14290962  0.09375115  0.12941615  0.26758873  1.00000000 -0.28380049  0.38192280 -0.04975884
a1   -0.26823725  0.28389830 -0.36078101 -0.24121109 -0.13265601 -0.41735761 -0.48730097 -0.28380049  1.00000000 -0.29251967 -0.14695028
a2    0.32584814 -0.09935631  0.08949837  0.02368832 -0.02968344  0.14768993  0.16246963  0.38192280 -0.29251967  1.00000000  0.03031095
a3    0.03077250 -0.25155437  0.09429722 -0.07621407 -0.10143974  0.03362906  0.06587312 -0.04975884 -0.14695028  0.03031095  1.00000000
a4   -0.24876290 -0.31513753  0.12045912 -0.02578257  0.22822914  0.29574585  0.30462623 -0.08364618 -0.03892441 -0.17168171  0.01218370
a5   -0.01697947  0.17008979  0.16514900  0.22359794  0.02745909  0.15147500  0.19111521 -0.05945318 -0.29503346 -0.16186215 -0.11111997
a6   -0.08388657  0.15864906  0.18369968  0.54640569  0.40571045  0.02876159  0.08316987  0.01815732 -0.27602608 -0.11613061 -0.17283566
a7   -0.08726106 -0.12117098 -0.02793640  0.08509789 -0.01672691  0.04849832  0.10671057  0.02405581 -0.21142489  0.04749242  0.05618729
              a4          a5          a6          a7
mxPH -0.24876290 -0.01697947 -0.08388657 -0.08726106
mnO2 -0.31513753  0.17008979  0.15864906 -0.12117098
Cl    0.12045912  0.16514900  0.18369968 -0.02793640
NO3  -0.02578257  0.22359794  0.54640569  0.08509789
NH4   0.22822914  0.02745909  0.40571045 -0.01672691
oPO4  0.29574585  0.15147500  0.02876159  0.04849832
PO4   0.30462623  0.19111521  0.08316987  0.10671057
Chla -0.08364618 -0.05945318  0.01815732  0.02405581
a1   -0.03892441 -0.29503346 -0.27602608 -0.21142489
a2   -0.17168171 -0.16186215 -0.11613061  0.04749242
a3    0.01218370 -0.11111997 -0.17283566  0.05618729
a4    1.00000000 -0.11006558 -0.09074936  0.04362334
a5   -0.11006558  1.00000000  0.40360881 -0.02686306
a6   -0.09074936  0.40360881  1.00000000 -0.01244488
a7    0.04362334 -0.02686306 -0.01244488  1.00000000
##
symnum(cor(algae[,4:18],use="complete.obs"))
     mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
mxPH 1                                         
mnO2    1                                      
Cl         1                                   
NO3           1                                
NH4           ,  1                             
oPO4    .  .        1                          
PO4     .  .        * 1                        
Chla .                  1                      
a1         .        . .    1                   
a2   .                  .     1                
a3                               1             
a4      .             .             1          
a5                                     1       
a6            .  .                     .  1    
a7                                           1 
attr(,"legend")
[1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1
library(corrplot)
corrplot 0.84 loaded
cm <- cor(algae[,4:18], use="complete.obs")
corrplot(cm, type="upper", tl.pos="d")
corrplot(cm, add=TRUE, type="lower", method="number", 
         diag=FALSE, tl.pos="n", cl.pos="n")

data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
lm(PO4 ~ oPO4, data = algae)

Call:
lm(formula = PO4 ~ oPO4, data = algae)

Coefficients:
(Intercept)         oPO4  
     42.897        1.293  
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
fillPO4 <- function(oP) ifelse(is.na(oP),NA,42.897 + 1.293 * oP)
algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4), "oPO4"], fillPO4)
algae <- mutate(algae,
                size=fct_relevel(size,c("small","medium","large")),
                speed=fct_relevel(speed,c("low","medium","high")),
                season=fct_relevel(season,c("spring","summer","autumn","winter")))
ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) + facet_wrap(~ season)

ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) + 
    facet_wrap(size ~ speed)

ggplot(algae, aes(x=mxPH, y=size, color=size)) + geom_point() + facet_wrap(~speed) + geom_jitter(height = 0.4)

data(algae, package="DMwR2")
algae <-  algae[-manyNAs(algae), ]
clean.algae <- knnImputation(algae, k = 10)
##
lm.a1 <- lm(a1 ~ ., data = clean.algae[, 1:12])

##
summary(lm.a1)

Call:
lm(formula = a1 ~ ., data = clean.algae[, 1:12])

Residuals:
    Min      1Q  Median      3Q     Max 
-37.679 -11.893  -2.567   7.410  62.190 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept)  42.942055  24.010879   1.788  0.07537 . 
seasonspring  3.726978   4.137741   0.901  0.36892   
seasonsummer  0.747597   4.020711   0.186  0.85270   
seasonwinter  3.692955   3.865391   0.955  0.34065   
sizemedium    3.263728   3.802051   0.858  0.39179   
sizesmall     9.682140   4.179971   2.316  0.02166 * 
speedlow      3.922084   4.706315   0.833  0.40573   
speedmedium   0.246764   3.241874   0.076  0.93941   
mxPH         -3.589118   2.703528  -1.328  0.18598   
mnO2          1.052636   0.705018   1.493  0.13715   
Cl           -0.040172   0.033661  -1.193  0.23426   
NO3          -1.511235   0.551339  -2.741  0.00674 **
NH4           0.001634   0.001003   1.628  0.10516   
oPO4         -0.005435   0.039884  -0.136  0.89177   
PO4          -0.052241   0.030755  -1.699  0.09109 . 
Chla         -0.088022   0.079998  -1.100  0.27265   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 17.65 on 182 degrees of freedom
Multiple R-squared:  0.3731,    Adjusted R-squared:  0.3215 
F-statistic: 7.223 on 15 and 182 DF,  p-value: 2.444e-12
##
anova(lm.a1)
Analysis of Variance Table

Response: a1
           Df Sum Sq Mean Sq F value    Pr(>F)    
season      3     85    28.2  0.0905 0.9651944    
size        2  11401  5700.7 18.3088  5.69e-08 ***
speed       2   3934  1967.2  6.3179 0.0022244 ** 
mxPH        1   1329  1328.8  4.2677 0.0402613 *  
mnO2        1   2287  2286.8  7.3444 0.0073705 ** 
Cl          1   4304  4304.3 13.8239 0.0002671 ***
NO3         1   3418  3418.5 10.9789 0.0011118 ** 
NH4         1    404   403.6  1.2963 0.2563847    
oPO4        1   4788  4788.0 15.3774 0.0001246 ***
PO4         1   1406  1405.6  4.5142 0.0349635 *  
Chla        1    377   377.0  1.2107 0.2726544    
Residuals 182  56668   311.4                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
lm2.a1 <- update(lm.a1, . ~ . - season)

##
summary(lm2.a1)

Call:
lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + 
    oPO4 + PO4 + Chla, data = clean.algae[, 1:12])

Residuals:
    Min      1Q  Median      3Q     Max 
-36.460 -11.953  -3.044   7.444  63.730 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept) 44.9532874 23.2378377   1.934  0.05458 . 
sizemedium   3.3092102  3.7825221   0.875  0.38278   
sizesmall   10.2730961  4.1223163   2.492  0.01358 * 
speedlow     3.0546270  4.6108069   0.662  0.50848   
speedmedium -0.2976867  3.1818585  -0.094  0.92556   
mxPH        -3.2684281  2.6576592  -1.230  0.22033   
mnO2         0.8011759  0.6589644   1.216  0.22561   
Cl          -0.0381881  0.0333791  -1.144  0.25407   
NO3         -1.5334300  0.5476550  -2.800  0.00565 **
NH4          0.0015777  0.0009951   1.586  0.11456   
oPO4        -0.0062392  0.0395086  -0.158  0.87469   
PO4         -0.0509543  0.0305189  -1.670  0.09669 . 
Chla        -0.0841371  0.0794459  -1.059  0.29096   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 17.57 on 185 degrees of freedom
Multiple R-squared:  0.3682,    Adjusted R-squared:  0.3272 
F-statistic: 8.984 on 12 and 185 DF,  p-value: 1.762e-13
##
anova(lm.a1,lm2.a1)
Analysis of Variance Table

Model 1: a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + 
    PO4 + Chla
Model 2: a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + 
    Chla
  Res.Df   RSS Df Sum of Sq      F Pr(>F)
1    182 56668                           
2    185 57116 -3   -447.62 0.4792 0.6971
##
final.lm <- step(lm.a1)
Start:  AIC=1152.03
a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + 
    PO4 + Chla

         Df Sum of Sq   RSS    AIC
- season  3    447.62 57116 1147.6
- speed   2    269.60 56938 1149.0
- oPO4    1      5.78 56674 1150.0
- Chla    1    376.96 57045 1151.3
- Cl      1    443.46 57112 1151.6
- mxPH    1    548.76 57217 1151.9
<none>                56668 1152.0
- mnO2    1    694.11 57363 1152.4
- NH4     1    825.67 57494 1152.9
- PO4     1    898.42 57567 1153.1
- size    2   1857.16 58526 1154.4
- NO3     1   2339.36 59008 1158.0

Step:  AIC=1147.59
a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + 
    Chla

        Df Sum of Sq   RSS    AIC
- speed  2    210.64 57327 1144.3
- oPO4   1      7.70 57124 1145.6
- Chla   1    346.27 57462 1146.8
- Cl     1    404.10 57520 1147.0
- mnO2   1    456.37 57572 1147.2
- mxPH   1    466.95 57583 1147.2
<none>               57116 1147.6
- NH4    1    776.11 57892 1148.3
- PO4    1    860.62 57977 1148.5
- size   2   2175.59 59292 1151.0
- NO3    1   2420.47 59537 1153.8

Step:  AIC=1144.31
a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla

       Df Sum of Sq   RSS    AIC
- oPO4  1     16.29 57343 1142.4
- Chla  1    223.29 57550 1143.1
- mnO2  1    413.77 57740 1143.7
- Cl    1    472.70 57799 1143.9
- mxPH  1    483.56 57810 1144.0
<none>              57327 1144.3
- NH4   1    720.19 58047 1144.8
- PO4   1    809.30 58136 1145.1
- size  2   2060.95 59388 1147.3
- NO3   1   2379.75 59706 1150.4

Step:  AIC=1142.37
a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4 + Chla

       Df Sum of Sq   RSS    AIC
- Chla  1     207.7 57551 1141.1
- mnO2  1     402.6 57746 1141.8
- Cl    1     470.7 57814 1142.0
- mxPH  1     519.7 57863 1142.2
<none>              57343 1142.4
- NH4   1     704.4 58047 1142.8
- size  2    2050.3 59393 1145.3
- NO3   1    2370.4 59713 1148.4
- PO4   1    5818.4 63161 1159.5

Step:  AIC=1141.09
a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4

       Df Sum of Sq   RSS    AIC
- mnO2  1     435.3 57986 1140.6
- Cl    1     438.1 57989 1140.6
<none>              57551 1141.1
- NH4   1     746.9 58298 1141.6
- mxPH  1     833.1 58384 1141.9
- size  2    2217.5 59768 1144.6
- NO3   1    2667.1 60218 1148.1
- PO4   1    6309.7 63860 1159.7

Step:  AIC=1140.58
a1 ~ size + mxPH + Cl + NO3 + NH4 + PO4

       Df Sum of Sq   RSS    AIC
- NH4   1     531.0 58517 1140.4
- Cl    1     584.9 58571 1140.6
<none>              57986 1140.6
- mxPH  1     819.1 58805 1141.4
- size  2    2478.2 60464 1144.9
- NO3   1    2251.4 60237 1146.1
- PO4   1    9097.9 67084 1167.4

Step:  AIC=1140.38
a1 ~ size + mxPH + Cl + NO3 + PO4

       Df Sum of Sq   RSS    AIC
<none>              58517 1140.4
- mxPH  1     784.1 59301 1141.0
- Cl    1     835.6 59353 1141.2
- NO3   1    1987.9 60505 1145.0
- size  2    2664.3 61181 1145.2
- PO4   1    8575.8 67093 1165.5
##
summary(final.lm)

Call:
lm(formula = a1 ~ size + mxPH + Cl + NO3 + PO4, data = clean.algae[, 
    1:12])

Residuals:
    Min      1Q  Median      3Q     Max 
-28.874 -12.732  -3.741   8.424  62.926 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 57.28555   20.96132   2.733  0.00687 ** 
sizemedium   2.80050    3.40190   0.823  0.41141    
sizesmall   10.40636    3.82243   2.722  0.00708 ** 
mxPH        -3.97076    2.48204  -1.600  0.11130    
Cl          -0.05227    0.03165  -1.651  0.10028    
NO3         -0.89529    0.35148  -2.547  0.01165 *  
PO4         -0.05911    0.01117  -5.291 3.32e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 17.5 on 191 degrees of freedom
Multiple R-squared:  0.3527,    Adjusted R-squared:  0.3324 
F-statistic: 17.35 on 6 and 191 DF,  p-value: 5.554e-16
library(rpart)
data(algae, package="DMwR2") 
algae <- algae[-manyNAs(algae), ]
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])

##
rt.a1
n= 198 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 198 90401.290 16.996460  
   2) PO4>=43.818 147 31279.120  8.979592  
     4) Cl>=7.8065 140 21622.830  7.492857  
       8) oPO4>=51.118 84  3441.149  3.846429 *
       9) oPO4< 51.118 56 15389.430 12.962500  
        18) mnO2>=10.05 24  1248.673  6.716667 *
        19) mnO2< 10.05 32 12502.320 17.646870  
          38) NO3>=3.1875 9   257.080  7.866667 *
          39) NO3< 3.1875 23 11047.500 21.473910  
            78) mnO2< 8 13  2919.549 13.807690 *
            79) mnO2>=8 10  6370.704 31.440000 *
     5) Cl< 7.8065 7  3157.769 38.714290 *
   3) PO4< 43.818 51 22442.760 40.103920  
     6) mxPH< 7.87 28 11452.770 33.450000  
      12) mxPH>=7.045 18  5146.169 26.394440 *
      13) mxPH< 7.045 10  3797.645 46.150000 *
     7) mxPH>=7.87 23  8241.110 48.204350  
      14) PO4>=15.177 12  3047.517 38.183330 *
      15) PO4< 15.177 11  2673.945 59.136360 *
##
first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
snip.rpart(first.tree, c(4, 7))
n= 198 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 198 90401.290 16.996460  
   2) PO4>=43.818 147 31279.120  8.979592  
     4) Cl>=7.8065 140 21622.830  7.492857 *
     5) Cl< 7.8065 7  3157.769 38.714290 *
   3) PO4< 43.818 51 22442.760 40.103920  
     6) mxPH< 7.87 28 11452.770 33.450000  
      12) mxPH>=7.045 18  5146.169 26.394440 *
      13) mxPH< 7.045 10  3797.645 46.150000 *
     7) mxPH>=7.87 23  8241.110 48.204350 *
##
plot(first.tree)
text(first.tree)
snip.rpart(first.tree)
n= 198 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 198 90401.290 16.996460  
   2) PO4>=43.818 147 31279.120  8.979592  
     4) Cl>=7.8065 140 21622.830  7.492857  
       8) oPO4>=51.118 84  3441.149  3.846429 *
       9) oPO4< 51.118 56 15389.430 12.962500  
        18) mnO2>=10.05 24  1248.673  6.716667 *
        19) mnO2< 10.05 32 12502.320 17.646870  
          38) NO3>=3.1875 9   257.080  7.866667 *
          39) NO3< 3.1875 23 11047.500 21.473910  
            78) mnO2< 8 13  2919.549 13.807690 *
            79) mnO2>=8 10  6370.704 31.440000 *
     5) Cl< 7.8065 7  3157.769 38.714290 *
   3) PO4< 43.818 51 22442.760 40.103920  
     6) mxPH< 7.87 28 11452.770 33.450000  
      12) mxPH>=7.045 18  5146.169 26.394440 *
      13) mxPH< 7.045 10  3797.645 46.150000 *
     7) mxPH>=7.87 23  8241.110 48.204350  
      14) PO4>=15.177 12  3047.517 38.183330 *
      15) PO4< 15.177 11  2673.945 59.136360 *

LS0tCnRpdGxlOiAiUHJlZGljdGluZyBBbGdhZSBCbG9vbXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KIVtBbGdhZSBCbG9vbV0oYWxnYWUuanBlZykgXAoKCiMjIyMgT2JqZWN0aXZlClRvIHByZWRpY3QgdGhlIGZyZXF1ZW5jeSBvY2N1cnJlbmNlIG9mIHNldmVyYWwgaGFybWZ1bCBhbGdhZSBpbiB3YXRlciBzYW1wbGVzLiBcCipOb3RlOiBBbGdhZSBhcmUgYSB0eXBlIG9mIHBsYW50LWxpa2UgbGl2aW5nIHRoaW5ncyB0aGF0IGNhbiBtYWtlIGZvb2QgZnJvbSBzdW5saWdodCBieSBwaG90b3N5bnRoZXNpcy4qCgojIyMjIE1vdGl2YXRpb24gYW5kIEJhY2tncm91bmQKCiogSGlnaCBjb25jZW50cmF0aW9uIG9mIGNlcnRhaW4gaGFybWZ1bCBhbGdhZSBwb3NlIGEgdGhyZWF0IHRvIHRoZSByaXZlciBsaWZlZm9ybXMgYW5kIHdhdGVyIHF1YWxpdHkuIAoqIEVhcmx5IGZvcmVjYXN0IG9mIGFsZ2FlIGNhbiBpbXByb3ZlIHRoZSBxdWFsaXR5IG9mIHJpdmVycwoqIFRvIHNvbHZlIHRoaXMgaXNzdWUsIHNldmVyYWwgd2F0ZXIgc2FtcGxlcyB3ZXJlIGNvbGxlY3RlZCBpbiBFdXJvcGVhbiByaXZlcnMgb3ZlciBhIHBlcmlvZCBvZiAxIHllYXIgYW5kIGNoZW1pY2FsIG1vbml0b3Jpbmcgd2FzIHBlcmZvcm1lZC4KKiBBIG1vdGl2YXRpb24gYmVoaW5kIHRoaXMgcHJvY2VzcyBpcyB0aGF0IGl0IChjaGVtaWNhbCBtb25pdG9yaW5nKSBpcyBjaGVhcGVyIGFuZCBtb3JlIGF1dG9tYXRlZCB0aGFuIGJpb2xvZ2ljYWwgbW9uaXRvcmluZyBvZiB0aGUgc2FtcGxlcy4gCiogVGhlcmVmb3JlLCB0aGUgdHdvIG1haW4gKipnb2FscyoqIG9mIHRoaXMgc3R1ZHkgYXJlIHRvOgogICsgb2J0YWluIG1vZGVscyB0aGF0IGFyZSBhYmxlIHRvIGFjY3VyYXRlbHkgcHJlZGljdCB0aGUgYWxnYWUgZnJlcXVlbmNpZXMgYmFzZWQgb24gY2hlbWljYWwgcHJvcGVydGllcyB3b3VsZCBmYWNpbGl0YXRlIHRoZSBjcmVhdGlvbiBvZiBjaGVhcCBhbmQgICAgICAgIGF1dG9tYXRlZCBzeXN0ZW1zIGZvciBtb25pdG9yaW5nIGhhcm1mdWwgYWxnYWUgYmxvb21zLgogICsgV2Ugd291bGQgYWxzbyBsaWtlIHRvIGJldHRlciB1bmRlcnN0YW5kIGZhY3RvcnMgdGhhdCBpbmZsdWVuY2UgYWxnYWUgZnJlcXVlbmNpZXMuIFdoaWNoIG1lYW5zIHVuZGVyc3RhbmRpbmcgaG93IHRoZXNlIGZyZXF1ZW5jaWVzIGFyZSByZWxhdGVkIHRvICAgICAgIGNlcnRhaW4gY2hlbWljYWwgYXR0cmlidXRlcyBvZiB3YXRlciBzYW1wbGUgKGxpa2Ugc2Vhc29uIG9mIHllYXIsIHR5cGUgb2Ygcml2ZXIsIGV0Yy4pCgojIyMjIEFib3V0IHRoZSBkYXRhc2V0ClRoZXJlIGFyZSB0d28gbWFpbiBkYXRhc2V0cyBmb3IgdGhpcyBwcm9ibGVtOiBcCjEuIENvbnRhaW5zIGRhdGEgZm9yIDIwMCB3YXRlciBzYW1wbGVzLiBFYWNoIG9ic2VydmF0aW9uIGNvbnRhaW5zIGluZm9ybWF0aW9uIG9uIDExIHZhcmlhYmxlcy4gMyBvZiB0aGVzZSB2YXJpYWJsZXMgZGVzY3JpYmUgc2Vhc29uIG9mIHRoZSB5ZWFyIGFzIHdlbGwgICAgYXMgc2l6ZSBhbmQgc3BlZWQgb2YgdGhlIHJpdmVyLiBUaGUgb3RoZXIgOCB2YXJpYWJsZXMgYXJlIHZhbHVlcyBvZiBkaWZmZXJlbnQgY2hlbWljYWwgcGFyYW1ldGVycyBtZWFzdXJlZCBpbiB3YXRlciBzYW1wbGVzLiBcIFwKMi4gQ29udGFpbnMgaW5mb3JtYXRpb24gb24gMTQwIGV4dHJhIG9ic2VydmF0aW9ucy4gQ2FuIGJlIHJlZ2FyZGVkIGFzIGEga2luZCBvZiB0ZXN0IHNldC4gKlRoZSBtYWluIGdvYWwgb2YgdGhpcyBzdHVkeSBpcyB0byBwcmVkaWN0IHRoZSBmcmVxdWVuY2llcyBvZiAgICB0aGUgc2V2ZW4gYWxnYWUgZm9yIHRoZXNlIDE0MCB3YXRlciBzYW1wbGVzLioKCiMjIyMgTG9hZGluZyB0aGUgRGF0YXNldCBhbmQgVmlld2luZyB0aGUgRGF0YQpgYGB7cn0KbGlicmFyeShETXdSMikKaGVhZChhbGdhZSkKYGBgCgojIyMjIFVuZGVyc3RhbmRpbmcgdGhlIERhdGEKCkEgZmlzdCBpZGVhIG9mIHRoZSBzdGF0aXN0aWNhbCBwcm9wZXJ0aWVzIG9mIHRoZSBkYXRhIGNhbiBiZSBvYnRhaW5lZCB0aHJvdWdoIGEgc3VtbWFyeSBvZiBpdHMgZGVzY3JpcHRpdmUgc3RhdGlzdGljcwoKYGBge3J9CnN1bW1hcnkoYWxnYWUpCmBgYAoKYGBge3J9Cmhpc3QoYWxnYWUkbXhQSCwgcHJvYj1UKQpgYGAKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKZ2dwbG90KGFsZ2FlLGFlcyh4PW14UEgpKSArIGdlb21faGlzdG9ncmFtKGFlcyh5PS4uZGVuc2l0eS4uKSkKYGBgCgoKYGBge3J9CmdncGxvdChhbGdhZSxhZXMoeD1teFBIKSkgKyAKICAgIGdlb21faGlzdG9ncmFtKGFlcyh5PS4uZGVuc2l0eS4uKSkgKyAKICAgIGdlb21fZGVuc2l0eShjb2xvcj0icmVkIikgKyBnZW9tX3J1ZygpICsgCiAgICBnZ3RpdGxlKCJUaGUgSGlzdG9ncmFtIG9mIG14UEggKG1heGltdW0gcEgpIikgKyAKICAgIHhsYWIoIiIpICsgeWxhYigiIikKCnFxUGxvdChhbGdhZSRteFBILG1haW49J05vcm1hbCBRUSBwbG90IG9mIG1heGltdW0gcEgnLHlsYWI9IiIpCmBgYApgYGB7cn0KZ2dwbG90KGFsZ2FlLGFlcyh4PWZhY3RvcigwKSx5PW9QTzQpKSArIAogICAgZ2VvbV9ib3hwbG90KCkgKyBnZW9tX3J1ZygpICsgCiAgICBnZW9tX2hsaW5lKGFlcyh5aW50ZXJjZXB0PW1lYW4oYWxnYWUkb1BPNCwgbmEucm0gPSBUUlVFKSksCiAgICAgICAgICAgICAgIGxpbmV0eXBlPTIsY29sb3VyPSJyZWQiKSArCiAgICB5bGFiKCJPcnRob3Bob3NwaGF0ZSAob1BPNCkiKSArIHhsYWIoIiIpICsgc2NhbGVfeF9kaXNjcmV0ZShicmVha3M9TlVMTCkKYGBgCgpgYGB7cn0KcGxvdChhbGdhZSROSDQsIHhsYWIgPSAiIikKYWJsaW5lKGggPSBtZWFuKGFsZ2FlJE5INCwgbmEucm0gPSBUKSwgbHR5ID0gMSkKYWJsaW5lKGggPSBtZWFuKGFsZ2FlJE5INCwgbmEucm0gPSBUKSArIHNkKGFsZ2FlJE5INCwgbmEucm0gPSBUKSwgbHR5ID0gMikKYWJsaW5lKGggPSBtZWRpYW4oYWxnYWUkTkg0LCBuYS5ybSA9IFQpLCBsdHkgPSAzKQppZGVudGlmeShhbGdhZSROSDQpCmBgYApgYGB7cn0KbGlicmFyeShkcGx5cikKZmlsdGVyKGFsZ2FlLCBOSDQgPiAxOTAwMCkKCiMjCmdncGxvdChhbGdhZSxhZXMoeD1zaXplLHk9YTEpKSArIGdlb21fYm94cGxvdCgpICsKICAgIHhsYWIoIlJpdmVyIFNpemUiKSArIHlsYWIoIkFsZ2FsIEExIikKCmBgYApgYGB7cn0KZ2dwbG90KGFsZ2FlLGFlcyh4PXNpemUseT1hMSkpICsgCiAgICBnZW9tX3Zpb2xpbigpICsgZ2VvbV9qaXR0ZXIoKSArIHhsYWIoIlJpdmVyIFNpemUiKSArIHlsYWIoIkFsZ2FsIEExIikKYGBgCmBgYHtyfQpkYXRhMmdyYXBoIDwtIGZpbHRlcihhbGdhZSwhaXMubmEobW5PMikpICU+JQogICAgbXV0YXRlKG1pbk8yPWN1dChtbk8yLCBxdWFudGlsZShtbk8yLGMoMCwwLjI1LC41LC43NSwxKSksIGluY2x1ZGUubG93ZXN0PVRSVUUpKQpnZ3Bsb3QoZGF0YTJncmFwaCxhZXMoeD1hMyx5PXNlYXNvbiwgY29sb3I9c2Vhc29uKSkgKyBnZW9tX3BvaW50KCkgKyAKICAgIGZhY2V0X3dyYXAofiBtaW5PMikgKyAKICAgIGd1aWRlcyhjb2xvcj1GQUxTRSkKYGBgCmBgYHtyfQpkYXRhKGFsZ2FlKSAgCgoKIyMjIyBzdWItc2VjdGlvbjogIFJlbW92aW5nIHRoZSBPYnNlcnZhdGlvbnMgd2l0aCBVbmtub3duIFZhbHVlcwoKIyMKZmlsdGVyKGFsZ2FlLCAhY29tcGxldGUuY2FzZXMoYWxnYWUpICkKCiMjCmFsZ2FlIDwtIG5hLm9taXQoYWxnYWUpCmBgYApgYGB7cn0KZGF0YShhbGdhZSwgcGFja2FnZT0iRE13UjIiKSAjIG9ubHkgbmVjZXNzYXJ5IGlmIHlvdSBleGVjdXRlZCB0aGUgYWJvdmUgbmEub21pdCgpCmFsZ2FlIDwtIGFsZ2FlWy1jKDYyLCAxOTkpLCBdCgojIwphcHBseShhbGdhZSwgMSwgZnVuY3Rpb24oeCkgc3VtKGlzLm5hKHgpKSkKYGBgCgpgYGB7cn0KIyMKZGF0YShhbGdhZSwgcGFja2FnZT0iRE13UjIiKQptYW55TkFzKGFsZ2FlLCAwLjIpCgojIwphbGdhZSA8LSBhbGdhZVstbWFueU5BcyhhbGdhZSksIF0KCgojIyMjIHN1Yi1zZWN0aW9uOiAgRmlsbGluZyBpbiB0aGUgVW5rbm93bnMgd2l0aCB0aGUgTW9zdCBGcmVxdWVudCBWYWx1ZXMKCiMjCmFsZ2FlWzQ4LCAibXhQSCJdIDwtIG1lYW4oYWxnYWUkbXhQSCwgbmEucm0gPSBUUlVFKQoKIyMKYWxnYWVbaXMubmEoYWxnYWUkQ2hsYSksICJDaGxhIl0gPC0gbWVkaWFuKGFsZ2FlJENobGEsIG5hLnJtID0gVFJVRSkKYGBgCgpgYGB7cn0KZGF0YShhbGdhZSwgcGFja2FnZT0iRE13UjIiKQphbGdhZSA8LSBhbGdhZVstbWFueU5BcyhhbGdhZSksIF0KYWxnYWUgPC0gY2VudHJhbEltcHV0YXRpb24oYWxnYWUpCgoKIyMjIyBzdWItc2VjdGlvbjogIEZpbGxpbmcgaW4gdGhlIFVua25vd24gVmFsdWVzIGJ5IEV4cGxvcmluZyBDb3JyZWxhdGlvbnMKCiMjCmNvcihhbGdhZVssIDQ6MThdLCB1c2UgPSAiY29tcGxldGUub2JzIikKCiMjCnN5bW51bShjb3IoYWxnYWVbLDQ6MThdLHVzZT0iY29tcGxldGUub2JzIikpCmBgYApgYGB7cn0KbGlicmFyeShjb3JycGxvdCkKY20gPC0gY29yKGFsZ2FlWyw0OjE4XSwgdXNlPSJjb21wbGV0ZS5vYnMiKQpjb3JycGxvdChjbSwgdHlwZT0idXBwZXIiLCB0bC5wb3M9ImQiKQpjb3JycGxvdChjbSwgYWRkPVRSVUUsIHR5cGU9Imxvd2VyIiwgbWV0aG9kPSJudW1iZXIiLCAKICAgICAgICAgZGlhZz1GQUxTRSwgdGwucG9zPSJuIiwgY2wucG9zPSJuIikKYGBgCgpgYGB7cn0KZGF0YShhbGdhZSwgcGFja2FnZT0iRE13UjIiKQphbGdhZSA8LSBhbGdhZVstbWFueU5BcyhhbGdhZSksIF0KbG0oUE80IH4gb1BPNCwgZGF0YSA9IGFsZ2FlKQoKIyMKYWxnYWVbMjgsICJQTzQiXSA8LSA0Mi44OTcgKyAxLjI5MyAqIGFsZ2FlWzI4LCAib1BPNCJdCmBgYApgYGB7cn0KZGF0YShhbGdhZSwgcGFja2FnZT0iRE13UjIiKQphbGdhZSA8LSBhbGdhZVstbWFueU5BcyhhbGdhZSksIF0KZmlsbFBPNCA8LSBmdW5jdGlvbihvUCkgaWZlbHNlKGlzLm5hKG9QKSxOQSw0Mi44OTcgKyAxLjI5MyAqIG9QKQphbGdhZVtpcy5uYShhbGdhZSRQTzQpLCAiUE80Il0gPC0gc2FwcGx5KGFsZ2FlW2lzLm5hKGFsZ2FlJFBPNCksICJvUE80Il0sIGZpbGxQTzQpCmBgYAoKYGBge3J9CmFsZ2FlIDwtIG11dGF0ZShhbGdhZSwKICAgICAgICAgICAgICAgIHNpemU9ZmN0X3JlbGV2ZWwoc2l6ZSxjKCJzbWFsbCIsIm1lZGl1bSIsImxhcmdlIikpLAogICAgICAgICAgICAgICAgc3BlZWQ9ZmN0X3JlbGV2ZWwoc3BlZWQsYygibG93IiwibWVkaXVtIiwiaGlnaCIpKSwKICAgICAgICAgICAgICAgIHNlYXNvbj1mY3RfcmVsZXZlbChzZWFzb24sYygic3ByaW5nIiwic3VtbWVyIiwiYXV0dW1uIiwid2ludGVyIikpKQpnZ3Bsb3QoYWxnYWUsIGFlcyh4PW14UEgpKSArIGdlb21faGlzdG9ncmFtKGJpbndpZHRoPTAuNSkgKyBmYWNldF93cmFwKH4gc2Vhc29uKQpgYGAKYGBge3J9CmdncGxvdChhbGdhZSwgYWVzKHg9bXhQSCkpICsgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGg9MC41KSArIAogICAgZmFjZXRfd3JhcChzaXplIH4gc3BlZWQpCmBgYApgYGB7cn0KZ2dwbG90KGFsZ2FlLCBhZXMoeD1teFBILCB5PXNpemUsIGNvbG9yPXNpemUpKSArIGdlb21fcG9pbnQoKSArIGZhY2V0X3dyYXAofnNwZWVkKSArIGdlb21faml0dGVyKGhlaWdodCA9IDAuNCkKYGBgCmBgYHtyfQpkYXRhKGFsZ2FlLCBwYWNrYWdlPSJETXdSMiIpCmFsZ2FlIDwtICBhbGdhZVstbWFueU5BcyhhbGdhZSksIF0KY2xlYW4uYWxnYWUgPC0ga25uSW1wdXRhdGlvbihhbGdhZSwgayA9IDEwKQpgYGAKCmBgYHtyfQojIwpsbS5hMSA8LSBsbShhMSB+IC4sIGRhdGEgPSBjbGVhbi5hbGdhZVssIDE6MTJdKQoKIyMKc3VtbWFyeShsbS5hMSkKCmBgYAoKYGBge3J9CiMjCmFub3ZhKGxtLmExKQpgYGAKCmBgYHtyfQpsbTIuYTEgPC0gdXBkYXRlKGxtLmExLCAuIH4gLiAtIHNlYXNvbikKCiMjCnN1bW1hcnkobG0yLmExKQoKYGBgCgpgYGB7cn0KIyMKYW5vdmEobG0uYTEsbG0yLmExKQoKIyMKZmluYWwubG0gPC0gc3RlcChsbS5hMSkKCiMjCnN1bW1hcnkoZmluYWwubG0pCmBgYApgYGB7cn0KbGlicmFyeShycGFydCkKZGF0YShhbGdhZSwgcGFja2FnZT0iRE13UjIiKSAKYWxnYWUgPC0gYWxnYWVbLW1hbnlOQXMoYWxnYWUpLCBdCnJ0LmExIDwtIHJwYXJ0KGExIH4gLiwgZGF0YSA9IGFsZ2FlWywgMToxMl0pCgojIwpydC5hMQpgYGAKCmBgYHtyfQojIwpmaXJzdC50cmVlIDwtIHJwYXJ0KGExIH4gLiwgZGF0YSA9IGFsZ2FlWywgMToxMl0pCnNuaXAucnBhcnQoZmlyc3QudHJlZSwgYyg0LCA3KSkKCmBgYAoKYGBge3J9CiMjCnBsb3QoZmlyc3QudHJlZSkKdGV4dChmaXJzdC50cmVlKQpzbmlwLnJwYXJ0KGZpcnN0LnRyZWUpCmBgYAoK