To predict the frequency occurrence of several harmful algae in water samples.
Note: Algae are a type of plant-like living things that can make food from sunlight by photosynthesis.
There are two main datasets for this problem:
1. Contains data for 200 water samples. Each observation contains information on 11 variables. 3 of these variables describe season of the year as well as size and speed of the river. The other 8 variables are values of different chemical parameters measured in water samples. Â
2. Contains information on 140 extra observations. Can be regarded as a kind of test set. The main goal of this study is to predict the frequencies of the seven algae for these 140 water samples.
library(DMwR2)
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
head(algae)
A fist idea of the statistical properties of the data can be obtained through a summary of its descriptive statistics
summary(algae)
season size speed mxPH mnO2 Cl NO3 NH4 oPO4
autumn:40 large :45 high :84 Min. :5.600 Min. : 1.500 Min. : 0.222 Min. : 0.050 Min. : 5.00 Min. : 1.00
spring:53 medium:84 low :33 1st Qu.:7.700 1st Qu.: 7.725 1st Qu.: 10.981 1st Qu.: 1.296 1st Qu.: 38.33 1st Qu.: 15.70
summer:45 small :71 medium:83 Median :8.060 Median : 9.800 Median : 32.730 Median : 2.675 Median : 103.17 Median : 40.15
winter:62 Mean :8.012 Mean : 9.118 Mean : 43.636 Mean : 3.282 Mean : 501.30 Mean : 73.59
3rd Qu.:8.400 3rd Qu.:10.800 3rd Qu.: 57.824 3rd Qu.: 4.446 3rd Qu.: 226.95 3rd Qu.: 99.33
Max. :9.700 Max. :13.400 Max. :391.500 Max. :45.650 Max. :24064.00 Max. :564.60
NA's :1 NA's :2 NA's :10 NA's :2 NA's :2 NA's :2
PO4 Chla a1 a2 a3 a4 a5 a6
Min. : 1.00 Min. : 0.200 Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 41.38 1st Qu.: 2.000 1st Qu.: 1.50 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
Median :103.29 Median : 5.475 Median : 6.95 Median : 3.000 Median : 1.550 Median : 0.000 Median : 1.900 Median : 0.000
Mean :137.88 Mean : 13.971 Mean :16.92 Mean : 7.458 Mean : 4.309 Mean : 1.992 Mean : 5.064 Mean : 5.964
3rd Qu.:213.75 3rd Qu.: 18.308 3rd Qu.:24.80 3rd Qu.:11.375 3rd Qu.: 4.925 3rd Qu.: 2.400 3rd Qu.: 7.500 3rd Qu.: 6.925
Max. :771.60 Max. :110.456 Max. :89.80 Max. :72.600 Max. :42.800 Max. :44.600 Max. :44.400 Max. :77.600
NA's :2 NA's :12
a7
Min. : 0.000
1st Qu.: 0.000
Median : 1.000
Mean : 2.495
3rd Qu.: 2.400
Max. :31.600
hist(algae$mxPH, prob=T)
library(ggplot2)
ggplot(algae,aes(x=mxPH)) + geom_histogram(aes(y=..density..))
ggplot(algae,aes(x=mxPH)) +
geom_histogram(aes(y=..density..)) +
geom_density(color="red") + geom_rug() +
ggtitle("The Histogram of mxPH (maximum pH)") +
xlab("") + ylab("")
qqPlot(algae$mxPH,main='Normal QQ plot of maximum pH',ylab="")
[1] 56 57
ggplot(algae,aes(x=factor(0),y=oPO4)) +
geom_boxplot() + geom_rug() +
geom_hline(aes(yintercept=mean(algae$oPO4, na.rm = TRUE)),
linetype=2,colour="red") +
ylab("Orthophosphate (oPO4)") + xlab("") + scale_x_discrete(breaks=NULL)
plot(algae$NH4, xlab = "")
abline(h = mean(algae$NH4, na.rm = T), lty = 1)
abline(h = mean(algae$NH4, na.rm = T) + sd(algae$NH4, na.rm = T), lty = 2)
abline(h = median(algae$NH4, na.rm = T), lty = 3)
identify(algae$NH4)
integer(0)
library(dplyr)
Attaching package: ‘dplyr’
The following object is masked from ‘package:car’:
recode
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
filter(algae, NH4 > 19000)
##
ggplot(algae,aes(x=size,y=a1)) + geom_boxplot() +
xlab("River Size") + ylab("Algal A1")
ggplot(algae,aes(x=size,y=a1)) +
geom_violin() + geom_jitter() + xlab("River Size") + ylab("Algal A1")
data2graph <- filter(algae,!is.na(mnO2)) %>%
mutate(minO2=cut(mnO2, quantile(mnO2,c(0,0.25,.5,.75,1)), include.lowest=TRUE))
ggplot(data2graph,aes(x=a3,y=season, color=season)) + geom_point() +
facet_wrap(~ minO2) +
guides(color=FALSE)
data(algae)
#### sub-section: Removing the Observations with Unknown Values
##
filter(algae, !complete.cases(algae) )
##
algae <- na.omit(algae)
data(algae, package="DMwR2") # only necessary if you executed the above na.omit()
algae <- algae[-c(62, 199), ]
##
apply(algae, 1, function(x) sum(is.na(x)))
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 2 2 2 2 2 1 0 0 0 0 0 0 0
[70] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[139] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
data(algae, package="DMwR2")
manyNAs(algae, 0.2)
[1] 62 199
##
algae <- algae[-manyNAs(algae), ]
#### sub-section: Filling in the Unknowns with the Most Frequent Values
##
algae[48, "mxPH"] <- mean(algae$mxPH, na.rm = TRUE)
##
algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = TRUE)
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
algae <- centralImputation(algae)
#### sub-section: Filling in the Unknown Values by Exploring Correlations
##
cor(algae[, 4:18], use = "complete.obs")
mxPH mnO2 Cl NO3 NH4 oPO4 PO4 Chla a1 a2 a3
mxPH 1.00000000 -0.16749178 0.13285681 -0.13103951 -0.09360612 0.15850785 0.18033494 0.39121495 -0.26823725 0.32584814 0.03077250
mnO2 -0.16749178 1.00000000 -0.27873229 0.09837676 -0.08780541 -0.41655069 -0.48772564 -0.16678069 0.28389830 -0.09935631 -0.25155437
Cl 0.13285681 -0.27873229 1.00000000 0.22504071 0.07407466 0.39230733 0.45652107 0.15082753 -0.36078101 0.08949837 0.09429722
NO3 -0.13103951 0.09837676 0.22504071 1.00000000 0.72144352 0.14458782 0.16931401 0.14290962 -0.24121109 0.02368832 -0.07621407
NH4 -0.09360612 -0.08780541 0.07407466 0.72144352 1.00000000 0.22723723 0.20844445 0.09375115 -0.13265601 -0.02968344 -0.10143974
oPO4 0.15850785 -0.41655069 0.39230733 0.14458782 0.22723723 1.00000000 0.91387767 0.12941615 -0.41735761 0.14768993 0.03362906
PO4 0.18033494 -0.48772564 0.45652107 0.16931401 0.20844445 0.91387767 1.00000000 0.26758873 -0.48730097 0.16246963 0.06587312
Chla 0.39121495 -0.16678069 0.15082753 0.14290962 0.09375115 0.12941615 0.26758873 1.00000000 -0.28380049 0.38192280 -0.04975884
a1 -0.26823725 0.28389830 -0.36078101 -0.24121109 -0.13265601 -0.41735761 -0.48730097 -0.28380049 1.00000000 -0.29251967 -0.14695028
a2 0.32584814 -0.09935631 0.08949837 0.02368832 -0.02968344 0.14768993 0.16246963 0.38192280 -0.29251967 1.00000000 0.03031095
a3 0.03077250 -0.25155437 0.09429722 -0.07621407 -0.10143974 0.03362906 0.06587312 -0.04975884 -0.14695028 0.03031095 1.00000000
a4 -0.24876290 -0.31513753 0.12045912 -0.02578257 0.22822914 0.29574585 0.30462623 -0.08364618 -0.03892441 -0.17168171 0.01218370
a5 -0.01697947 0.17008979 0.16514900 0.22359794 0.02745909 0.15147500 0.19111521 -0.05945318 -0.29503346 -0.16186215 -0.11111997
a6 -0.08388657 0.15864906 0.18369968 0.54640569 0.40571045 0.02876159 0.08316987 0.01815732 -0.27602608 -0.11613061 -0.17283566
a7 -0.08726106 -0.12117098 -0.02793640 0.08509789 -0.01672691 0.04849832 0.10671057 0.02405581 -0.21142489 0.04749242 0.05618729
a4 a5 a6 a7
mxPH -0.24876290 -0.01697947 -0.08388657 -0.08726106
mnO2 -0.31513753 0.17008979 0.15864906 -0.12117098
Cl 0.12045912 0.16514900 0.18369968 -0.02793640
NO3 -0.02578257 0.22359794 0.54640569 0.08509789
NH4 0.22822914 0.02745909 0.40571045 -0.01672691
oPO4 0.29574585 0.15147500 0.02876159 0.04849832
PO4 0.30462623 0.19111521 0.08316987 0.10671057
Chla -0.08364618 -0.05945318 0.01815732 0.02405581
a1 -0.03892441 -0.29503346 -0.27602608 -0.21142489
a2 -0.17168171 -0.16186215 -0.11613061 0.04749242
a3 0.01218370 -0.11111997 -0.17283566 0.05618729
a4 1.00000000 -0.11006558 -0.09074936 0.04362334
a5 -0.11006558 1.00000000 0.40360881 -0.02686306
a6 -0.09074936 0.40360881 1.00000000 -0.01244488
a7 0.04362334 -0.02686306 -0.01244488 1.00000000
##
symnum(cor(algae[,4:18],use="complete.obs"))
mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
mxPH 1
mnO2 1
Cl 1
NO3 1
NH4 , 1
oPO4 . . 1
PO4 . . * 1
Chla . 1
a1 . . . 1
a2 . . 1
a3 1
a4 . . 1
a5 1
a6 . . . 1
a7 1
attr(,"legend")
[1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1
library(corrplot)
corrplot 0.84 loaded
cm <- cor(algae[,4:18], use="complete.obs")
corrplot(cm, type="upper", tl.pos="d")
corrplot(cm, add=TRUE, type="lower", method="number",
diag=FALSE, tl.pos="n", cl.pos="n")
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
lm(PO4 ~ oPO4, data = algae)
Call:
lm(formula = PO4 ~ oPO4, data = algae)
Coefficients:
(Intercept) oPO4
42.897 1.293
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
fillPO4 <- function(oP) ifelse(is.na(oP),NA,42.897 + 1.293 * oP)
algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4), "oPO4"], fillPO4)
algae <- mutate(algae,
size=fct_relevel(size,c("small","medium","large")),
speed=fct_relevel(speed,c("low","medium","high")),
season=fct_relevel(season,c("spring","summer","autumn","winter")))
ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) + facet_wrap(~ season)
ggplot(algae, aes(x=mxPH)) + geom_histogram(binwidth=0.5) +
facet_wrap(size ~ speed)
ggplot(algae, aes(x=mxPH, y=size, color=size)) + geom_point() + facet_wrap(~speed) + geom_jitter(height = 0.4)
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
clean.algae <- knnImputation(algae, k = 10)
##
lm.a1 <- lm(a1 ~ ., data = clean.algae[, 1:12])
##
summary(lm.a1)
Call:
lm(formula = a1 ~ ., data = clean.algae[, 1:12])
Residuals:
Min 1Q Median 3Q Max
-37.679 -11.893 -2.567 7.410 62.190
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 42.942055 24.010879 1.788 0.07537 .
seasonspring 3.726978 4.137741 0.901 0.36892
seasonsummer 0.747597 4.020711 0.186 0.85270
seasonwinter 3.692955 3.865391 0.955 0.34065
sizemedium 3.263728 3.802051 0.858 0.39179
sizesmall 9.682140 4.179971 2.316 0.02166 *
speedlow 3.922084 4.706315 0.833 0.40573
speedmedium 0.246764 3.241874 0.076 0.93941
mxPH -3.589118 2.703528 -1.328 0.18598
mnO2 1.052636 0.705018 1.493 0.13715
Cl -0.040172 0.033661 -1.193 0.23426
NO3 -1.511235 0.551339 -2.741 0.00674 **
NH4 0.001634 0.001003 1.628 0.10516
oPO4 -0.005435 0.039884 -0.136 0.89177
PO4 -0.052241 0.030755 -1.699 0.09109 .
Chla -0.088022 0.079998 -1.100 0.27265
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 17.65 on 182 degrees of freedom
Multiple R-squared: 0.3731, Adjusted R-squared: 0.3215
F-statistic: 7.223 on 15 and 182 DF, p-value: 2.444e-12
##
anova(lm.a1)
Analysis of Variance Table
Response: a1
Df Sum Sq Mean Sq F value Pr(>F)
season 3 85 28.2 0.0905 0.9651944
size 2 11401 5700.7 18.3088 5.69e-08 ***
speed 2 3934 1967.2 6.3179 0.0022244 **
mxPH 1 1329 1328.8 4.2677 0.0402613 *
mnO2 1 2287 2286.8 7.3444 0.0073705 **
Cl 1 4304 4304.3 13.8239 0.0002671 ***
NO3 1 3418 3418.5 10.9789 0.0011118 **
NH4 1 404 403.6 1.2963 0.2563847
oPO4 1 4788 4788.0 15.3774 0.0001246 ***
PO4 1 1406 1405.6 4.5142 0.0349635 *
Chla 1 377 377.0 1.2107 0.2726544
Residuals 182 56668 311.4
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
lm2.a1 <- update(lm.a1, . ~ . - season)
##
summary(lm2.a1)
Call:
lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 +
oPO4 + PO4 + Chla, data = clean.algae[, 1:12])
Residuals:
Min 1Q Median 3Q Max
-36.460 -11.953 -3.044 7.444 63.730
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 44.9532874 23.2378377 1.934 0.05458 .
sizemedium 3.3092102 3.7825221 0.875 0.38278
sizesmall 10.2730961 4.1223163 2.492 0.01358 *
speedlow 3.0546270 4.6108069 0.662 0.50848
speedmedium -0.2976867 3.1818585 -0.094 0.92556
mxPH -3.2684281 2.6576592 -1.230 0.22033
mnO2 0.8011759 0.6589644 1.216 0.22561
Cl -0.0381881 0.0333791 -1.144 0.25407
NO3 -1.5334300 0.5476550 -2.800 0.00565 **
NH4 0.0015777 0.0009951 1.586 0.11456
oPO4 -0.0062392 0.0395086 -0.158 0.87469
PO4 -0.0509543 0.0305189 -1.670 0.09669 .
Chla -0.0841371 0.0794459 -1.059 0.29096
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 17.57 on 185 degrees of freedom
Multiple R-squared: 0.3682, Adjusted R-squared: 0.3272
F-statistic: 8.984 on 12 and 185 DF, p-value: 1.762e-13
##
anova(lm.a1,lm2.a1)
Analysis of Variance Table
Model 1: a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
PO4 + Chla
Model 2: a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
Chla
Res.Df RSS Df Sum of Sq F Pr(>F)
1 182 56668
2 185 57116 -3 -447.62 0.4792 0.6971
##
final.lm <- step(lm.a1)
Start: AIC=1152.03
a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
PO4 + Chla
Df Sum of Sq RSS AIC
- season 3 447.62 57116 1147.6
- speed 2 269.60 56938 1149.0
- oPO4 1 5.78 56674 1150.0
- Chla 1 376.96 57045 1151.3
- Cl 1 443.46 57112 1151.6
- mxPH 1 548.76 57217 1151.9
<none> 56668 1152.0
- mnO2 1 694.11 57363 1152.4
- NH4 1 825.67 57494 1152.9
- PO4 1 898.42 57567 1153.1
- size 2 1857.16 58526 1154.4
- NO3 1 2339.36 59008 1158.0
Step: AIC=1147.59
a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
Chla
Df Sum of Sq RSS AIC
- speed 2 210.64 57327 1144.3
- oPO4 1 7.70 57124 1145.6
- Chla 1 346.27 57462 1146.8
- Cl 1 404.10 57520 1147.0
- mnO2 1 456.37 57572 1147.2
- mxPH 1 466.95 57583 1147.2
<none> 57116 1147.6
- NH4 1 776.11 57892 1148.3
- PO4 1 860.62 57977 1148.5
- size 2 2175.59 59292 1151.0
- NO3 1 2420.47 59537 1153.8
Step: AIC=1144.31
a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
Df Sum of Sq RSS AIC
- oPO4 1 16.29 57343 1142.4
- Chla 1 223.29 57550 1143.1
- mnO2 1 413.77 57740 1143.7
- Cl 1 472.70 57799 1143.9
- mxPH 1 483.56 57810 1144.0
<none> 57327 1144.3
- NH4 1 720.19 58047 1144.8
- PO4 1 809.30 58136 1145.1
- size 2 2060.95 59388 1147.3
- NO3 1 2379.75 59706 1150.4
Step: AIC=1142.37
a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4 + Chla
Df Sum of Sq RSS AIC
- Chla 1 207.7 57551 1141.1
- mnO2 1 402.6 57746 1141.8
- Cl 1 470.7 57814 1142.0
- mxPH 1 519.7 57863 1142.2
<none> 57343 1142.4
- NH4 1 704.4 58047 1142.8
- size 2 2050.3 59393 1145.3
- NO3 1 2370.4 59713 1148.4
- PO4 1 5818.4 63161 1159.5
Step: AIC=1141.09
a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4
Df Sum of Sq RSS AIC
- mnO2 1 435.3 57986 1140.6
- Cl 1 438.1 57989 1140.6
<none> 57551 1141.1
- NH4 1 746.9 58298 1141.6
- mxPH 1 833.1 58384 1141.9
- size 2 2217.5 59768 1144.6
- NO3 1 2667.1 60218 1148.1
- PO4 1 6309.7 63860 1159.7
Step: AIC=1140.58
a1 ~ size + mxPH + Cl + NO3 + NH4 + PO4
Df Sum of Sq RSS AIC
- NH4 1 531.0 58517 1140.4
- Cl 1 584.9 58571 1140.6
<none> 57986 1140.6
- mxPH 1 819.1 58805 1141.4
- size 2 2478.2 60464 1144.9
- NO3 1 2251.4 60237 1146.1
- PO4 1 9097.9 67084 1167.4
Step: AIC=1140.38
a1 ~ size + mxPH + Cl + NO3 + PO4
Df Sum of Sq RSS AIC
<none> 58517 1140.4
- mxPH 1 784.1 59301 1141.0
- Cl 1 835.6 59353 1141.2
- NO3 1 1987.9 60505 1145.0
- size 2 2664.3 61181 1145.2
- PO4 1 8575.8 67093 1165.5
##
summary(final.lm)
Call:
lm(formula = a1 ~ size + mxPH + Cl + NO3 + PO4, data = clean.algae[,
1:12])
Residuals:
Min 1Q Median 3Q Max
-28.874 -12.732 -3.741 8.424 62.926
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 57.28555 20.96132 2.733 0.00687 **
sizemedium 2.80050 3.40190 0.823 0.41141
sizesmall 10.40636 3.82243 2.722 0.00708 **
mxPH -3.97076 2.48204 -1.600 0.11130
Cl -0.05227 0.03165 -1.651 0.10028
NO3 -0.89529 0.35148 -2.547 0.01165 *
PO4 -0.05911 0.01117 -5.291 3.32e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 17.5 on 191 degrees of freedom
Multiple R-squared: 0.3527, Adjusted R-squared: 0.3324
F-statistic: 17.35 on 6 and 191 DF, p-value: 5.554e-16
library(rpart)
data(algae, package="DMwR2")
algae <- algae[-manyNAs(algae), ]
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])
##
rt.a1
n= 198
node), split, n, deviance, yval
* denotes terminal node
1) root 198 90401.290 16.996460
2) PO4>=43.818 147 31279.120 8.979592
4) Cl>=7.8065 140 21622.830 7.492857
8) oPO4>=51.118 84 3441.149 3.846429 *
9) oPO4< 51.118 56 15389.430 12.962500
18) mnO2>=10.05 24 1248.673 6.716667 *
19) mnO2< 10.05 32 12502.320 17.646870
38) NO3>=3.1875 9 257.080 7.866667 *
39) NO3< 3.1875 23 11047.500 21.473910
78) mnO2< 8 13 2919.549 13.807690 *
79) mnO2>=8 10 6370.704 31.440000 *
5) Cl< 7.8065 7 3157.769 38.714290 *
3) PO4< 43.818 51 22442.760 40.103920
6) mxPH< 7.87 28 11452.770 33.450000
12) mxPH>=7.045 18 5146.169 26.394440 *
13) mxPH< 7.045 10 3797.645 46.150000 *
7) mxPH>=7.87 23 8241.110 48.204350
14) PO4>=15.177 12 3047.517 38.183330 *
15) PO4< 15.177 11 2673.945 59.136360 *
##
first.tree <- rpart(a1 ~ ., data = algae[, 1:12])
snip.rpart(first.tree, c(4, 7))
n= 198
node), split, n, deviance, yval
* denotes terminal node
1) root 198 90401.290 16.996460
2) PO4>=43.818 147 31279.120 8.979592
4) Cl>=7.8065 140 21622.830 7.492857 *
5) Cl< 7.8065 7 3157.769 38.714290 *
3) PO4< 43.818 51 22442.760 40.103920
6) mxPH< 7.87 28 11452.770 33.450000
12) mxPH>=7.045 18 5146.169 26.394440 *
13) mxPH< 7.045 10 3797.645 46.150000 *
7) mxPH>=7.87 23 8241.110 48.204350 *
##
plot(first.tree)
text(first.tree)
snip.rpart(first.tree)
n= 198
node), split, n, deviance, yval
* denotes terminal node
1) root 198 90401.290 16.996460
2) PO4>=43.818 147 31279.120 8.979592
4) Cl>=7.8065 140 21622.830 7.492857
8) oPO4>=51.118 84 3441.149 3.846429 *
9) oPO4< 51.118 56 15389.430 12.962500
18) mnO2>=10.05 24 1248.673 6.716667 *
19) mnO2< 10.05 32 12502.320 17.646870
38) NO3>=3.1875 9 257.080 7.866667 *
39) NO3< 3.1875 23 11047.500 21.473910
78) mnO2< 8 13 2919.549 13.807690 *
79) mnO2>=8 10 6370.704 31.440000 *
5) Cl< 7.8065 7 3157.769 38.714290 *
3) PO4< 43.818 51 22442.760 40.103920
6) mxPH< 7.87 28 11452.770 33.450000
12) mxPH>=7.045 18 5146.169 26.394440 *
13) mxPH< 7.045 10 3797.645 46.150000 *
7) mxPH>=7.87 23 8241.110 48.204350
14) PO4>=15.177 12 3047.517 38.183330 *
15) PO4< 15.177 11 2673.945 59.136360 *