Predicting Algae Blooms
The objective of this analysis is to predict the frequency of several harmful algae in water samples.
Each variable in the dataset
Nominal variables — Season: season of the year when the water samples were collected; Size: size of the river; Speed: speed of the river.
Numerical variables — MaxPH: maximum pH level; MnO2: minimum oxygen level; Cl: mean value of chloride; NO3: mean value of nitrates; NH4: mean value of ammonium; oP04: mean value of orthophosphate; P04: mean value of total phosphate; Chla: mean value of chlorophyll.
a1 - a7, seven harmful algae frequencies
Reading our Dataset as a table, give our table column names
# Read the raw sample file: no header row, comma as the field separator,
# short rows padded with blanks (fill), and the token "XXXXXXX" treated as NA.
algae <- read.table(
  file = "~/Data Science Master Program/Spring 2022/Data Analytics I/analysis.txt",
  header = FALSE,
  sep = ",",
  fill = TRUE,
  na.strings = "XXXXXXX"
)

# read.table() already returns a data.frame, so only the column names are set:
# three nominal descriptors, eight chemical measurements, seven algae columns.
names(algae) <- c(
  "season", "size", "speed",
  "mxPH", "mn02", "Cl", "N03", "NH4", "oP04", "P04", "Chla",
  "a1", "a2", "a3", "a4", "a5", "a6", "a7"
)

head(algae)
## season size speed mxPH mn02 Cl N03 NH4 oP04 P04 Chla
## 1 winter small_ medium 8.00 9.8 60.800 6.23800 578.000 105.000 170.000 50.0
## 2 spring small_ medium 8.35 8.0 57.750 1.28800 370.000 428.750 558.750 1.3
## 3 autumn small_ medium 8.10 11.4 40.020 5.33000 346.667 125.667 187.057 15.6
## 4 spring small_ medium 8.07 4.8 77.364 2.30200 98.182 61.182 138.700 1.4
## 5 autumn small_ medium 8.06 9.0 55.350 10.41600 233.700 58.222 97.580 10.5
## 6 winter small_ high__ 8.25 13.1 65.750 9.24800 430.000 18.250 56.667 28.4
## a1 a2 a3 a4 a5 a6 a7
## 1 0.0 0.0 0.0 0.0 34.2 8.3 0.0
## 2 1.4 7.6 4.8 1.9 6.7 0.0 2.1
## 3 3.3 53.6 1.9 0.0 0.0 0.0 9.7
## 4 3.1 41.0 18.9 0.0 1.4 0.0 1.4
## 5 9.2 2.9 7.5 0.0 7.5 4.1 1.0
## 6 15.1 14.6 1.4 0.0 22.5 12.6 2.9
Data Visualization and Data Summary Converting any character variables to numerical where applicable
sapply(algae, class)
## season size speed mxPH mn02 Cl
## "character" "character" "character" "numeric" "numeric" "numeric"
## N03 NH4 oP04 P04 Chla a1
## "character" "numeric" "numeric" "numeric" "numeric" "numeric"
## a2 a3 a4 a5 a6 a7
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# Coerce both columns to numeric in one pass. Any entry that cannot be parsed
# as a number becomes NA, which R reports with a coercion warning.
algae[c("mxPH", "N03")] <- lapply(
  algae[c("mxPH", "N03")],
  function(column) as.numeric(as.character(column))
)
## Warning: NAs introduced by coercion
sapply(algae, class)
## season size speed mxPH mn02 Cl
## "character" "character" "character" "numeric" "numeric" "numeric"
## N03 NH4 oP04 P04 Chla a1
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## a2 a3 a4 a5 a6 a7
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
summary(algae)
## season size speed mxPH
## Length:200 Length:200 Length:200 Min. :5.600
## Class :character Class :character Class :character 1st Qu.:7.700
## Mode :character Mode :character Mode :character Median :8.060
## Mean :8.012
## 3rd Qu.:8.400
## Max. :9.700
## NA's :1
## mn02 Cl N03 NH4
## Min. : 1.500 Min. : 0.222 Min. : 0.102 Min. : 5.00
## 1st Qu.: 7.725 1st Qu.: 10.981 1st Qu.: 1.287 1st Qu.: 35.62
## Median : 9.800 Median : 32.730 Median : 2.550 Median : 99.67
## Mean : 9.118 Mean : 43.636 Mean : 3.012 Mean :154.45
## 3rd Qu.:10.800 3rd Qu.: 57.824 3rd Qu.: 4.188 3rd Qu.:203.73
## Max. :13.400 Max. :391.500 Max. :10.416 Max. :931.83
## NA's :2 NA's :10 NA's :19 NA's :2
## oP04 P04 Chla a1
## Min. : 1.00 Min. : 0.90 Min. : 0.00 Min. : 0.000
## 1st Qu.: 16.00 1st Qu.: 19.39 1st Qu.: 2.00 1st Qu.: 1.475
## Median : 41.40 Median : 84.50 Median : 5.20 Median : 7.400
## Mean : 83.33 Mean :111.55 Mean : 13.54 Mean :16.863
## 3rd Qu.:102.25 3rd Qu.:182.16 3rd Qu.: 18.30 3rd Qu.:24.075
## Max. :771.60 Max. :558.75 Max. :110.46 Max. :89.800
## NA's :2 NA's :2 NA's :12
## a2 a3 a4 a5
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 2.100 Median : 1.750 Median : 0.000 Median : 1.90
## Mean : 6.934 Mean : 4.729 Mean : 1.885 Mean : 5.63
## 3rd Qu.: 9.075 3rd Qu.: 6.150 3rd Qu.: 2.225 3rd Qu.: 7.70
## Max. :72.600 Max. :44.600 Max. :35.600 Max. :77.60
##
## a6 a7
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000
## Mean : 5.199 Mean : 2.506
## 3rd Qu.: 6.725 3rd Qu.: 2.400
## Max. :52.500 Max. :31.600
## NA's :17
Histogram of all numerical data to see distributions of each variable
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 4.0.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Keep only the numeric columns (chemical measurements and the seven algae
# frequencies), then draw one histogram per column.
algae_numeric_columns <- algae[
  c("mxPH", "mn02", "Cl", "N03", "NH4", "oP04", "P04", "Chla",
    "a1", "a2", "a3", "a4", "a5", "a6", "a7")
]
hist.data.frame(algae_numeric_columns)
QQ Norm plots for each variable
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.5
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
par(mfrow = c(3,5))
qqPlot(algae$mxPH)
## [1] 56 57
qqPlot(algae$mn02)
## [1] 69 70
qqPlot(algae$Cl)
## [1] 134 175
qqPlot(algae$N03)
## [1] 5 140
qqPlot(algae$NH4)
## [1] 108 144
qqPlot(algae$oP04)
## [1] 20 89
qqPlot(algae$P04)
## [1] 2 119
qqPlot(algae$Chla)
## [1] 128 98
qqPlot(algae$a1)
## [1] 49 118
qqPlot(algae$a2)
## [1] 137 3
qqPlot(algae$a3)
## [1] 20 67
qqPlot(algae$a4)
## [1] 89 58
qqPlot(algae$a5)
## [1] 153 133
qqPlot(algae$a6)
## [1] 170 100
qqPlot(algae$a7)
## [1] 145 84
A Conditioned Box Plot & Violin Plot for Algal a1
library(ggplot2)

# Distribution of algal a1 conditioned on river size: box plot first, then a
# violin plot with the individual observations jittered on top.
ggplot(algae, aes(x = size, y = a1)) +
  geom_boxplot() +
  xlab("River Size") +
  ylab("Algal A1")

ggplot(algae, aes(x = size, y = a1)) +
  geom_violin(aes(fill = size)) +
  geom_jitter() +
  xlab("River Size") +
  ylab("Algal A1")

# NOTE(review): par(mfrow = ...) only affects base graphics; it does not lay
# out the ggplot2 figures in this chunk. Kept so later base plots are unchanged.
par(mfrow = c(2, 2))

# Bin mn02 into quartiles. The probability vector has to end at 1 (the
# maximum). Ending at 0.1 left every value above the 75th percentile outside
# all bins, so those rows got NA and were silently dropped by na.omit() below.
data2graph <- filter(algae, !is.na(mn02)) %>%
  mutate(
    min02 = cut(
      mn02,
      breaks = quantile(mn02, c(0, 0.25, 0.5, 0.75, 1)),
      include.lowest = TRUE
    )
  )
data2graph <- na.omit(data2graph)

# a3 by season, one panel per mn02 quartile. The colour legend repeats the
# y axis, so it is hidden ("none" replaces the deprecated FALSE).
ggplot(data2graph, aes(x = a3, y = season, color = season)) +
  geom_point() +
  facet_wrap(~min02) +
  guides(color = "none")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Unknown Values
Removing anything with NA Values
algae_no_na <- na.omit(algae)
manyNAs() is a function that returns the row numbers of the rows with too many missing values; with its default threshold, these are the rows in which more than 20% of the columns are NA. In the code below, we remove those rows.
library(DMwR2)
## Warning: package 'DMwR2' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'DMwR2'
## The following object is masked _by_ '.GlobalEnv':
##
## algae
# Remove the rows that manyNAs() flags as having too many missing columns.
# Guard against an empty result: `algae[-integer(0), ]` selects zero rows,
# which would wipe the whole data set instead of leaving it untouched.
sparse_rows <- manyNAs(algae)
if (length(sparse_rows) > 0) {
  algae <- algae[-sparse_rows, ]
}
Filling Unknowns with the Most Frequent Values. mxPH is approximately normally distributed, therefore using the mean as the replacement for its missing value is a good suggestion.
# Fill every missing mxPH with the mean of the observed values. Selecting by
# is.na() instead of the fixed row position 48 keeps this correct if the row
# order changes and never overwrites a value that is actually present.
algae[is.na(algae$mxPH), "mxPH"] <- mean(algae$mxPH, na.rm = TRUE)
In Chla, we will use the median value because our distribution for this variable is skewed
# Replace each missing Chla with the median of the observed Chla values.
algae$Chla[is.na(algae$Chla)] <- median(algae$Chla, na.rm = TRUE)
Automate the entire process: centralImputation() fills all unknowns in the dataset, using the median for numeric columns and the most frequent value for nominal columns.
# Start again from the copy of the data shipped with DMwR2, drop the rows
# flagged by manyNAs(), then fill the remaining gaps with centralImputation().
data(algae, package = "DMwR2")
sparse_rows <- manyNAs(algae)
if (length(sparse_rows) > 0) {
  # Negative indexing with an empty vector would drop every row.
  algae <- algae[-sparse_rows, ]
}
algae <- centralImputation(algae)
Filling in the Unknown Values by Exploring Correlations. cor() creates a correlation matrix that gives us the relationships between variables. We use columns 4-18 to exclude the nominal columns, which are the first three columns of the algae dataset; use = 'complete.obs' drops any observations (rows) that contain NA values.
cor(algae[,4:18], use = 'complete.obs')
## mxPH mnO2 Cl NO3 NH4 oPO4
## mxPH 1.00000000 -0.16749178 0.13285681 -0.13103951 -0.09360612 0.15850785
## mnO2 -0.16749178 1.00000000 -0.27873229 0.09837676 -0.08780541 -0.41655069
## Cl 0.13285681 -0.27873229 1.00000000 0.22504071 0.07407466 0.39230733
## NO3 -0.13103951 0.09837676 0.22504071 1.00000000 0.72144352 0.14458782
## NH4 -0.09360612 -0.08780541 0.07407466 0.72144352 1.00000000 0.22723723
## oPO4 0.15850785 -0.41655069 0.39230733 0.14458782 0.22723723 1.00000000
## PO4 0.18033494 -0.48772564 0.45652107 0.16931401 0.20844445 0.91387767
## Chla 0.39121495 -0.16678069 0.15082753 0.14290962 0.09375115 0.12941615
## a1 -0.26823725 0.28389830 -0.36078101 -0.24121109 -0.13265601 -0.41735761
## a2 0.32584814 -0.09935631 0.08949837 0.02368832 -0.02968344 0.14768993
## a3 0.03077250 -0.25155437 0.09429722 -0.07621407 -0.10143974 0.03362906
## a4 -0.24876290 -0.31513753 0.12045912 -0.02578257 0.22822914 0.29574585
## a5 -0.01697947 0.17008979 0.16514900 0.22359794 0.02745909 0.15147500
## a6 -0.08388657 0.15864906 0.18369968 0.54640569 0.40571045 0.02876159
## a7 -0.08726106 -0.12117098 -0.02793640 0.08509789 -0.01672691 0.04849832
## PO4 Chla a1 a2 a3 a4
## mxPH 0.18033494 0.39121495 -0.26823725 0.32584814 0.03077250 -0.24876290
## mnO2 -0.48772564 -0.16678069 0.28389830 -0.09935631 -0.25155437 -0.31513753
## Cl 0.45652107 0.15082753 -0.36078101 0.08949837 0.09429722 0.12045912
## NO3 0.16931401 0.14290962 -0.24121109 0.02368832 -0.07621407 -0.02578257
## NH4 0.20844445 0.09375115 -0.13265601 -0.02968344 -0.10143974 0.22822914
## oPO4 0.91387767 0.12941615 -0.41735761 0.14768993 0.03362906 0.29574585
## PO4 1.00000000 0.26758873 -0.48730097 0.16246963 0.06587312 0.30462623
## Chla 0.26758873 1.00000000 -0.28380049 0.38192280 -0.04975884 -0.08364618
## a1 -0.48730097 -0.28380049 1.00000000 -0.29251967 -0.14695028 -0.03892441
## a2 0.16246963 0.38192280 -0.29251967 1.00000000 0.03031095 -0.17168171
## a3 0.06587312 -0.04975884 -0.14695028 0.03031095 1.00000000 0.01218370
## a4 0.30462623 -0.08364618 -0.03892441 -0.17168171 0.01218370 1.00000000
## a5 0.19111521 -0.05945318 -0.29503346 -0.16186215 -0.11111997 -0.11006558
## a6 0.08316987 0.01815732 -0.27602608 -0.11613061 -0.17283566 -0.09074936
## a7 0.10671057 0.02405581 -0.21142489 0.04749242 0.05618729 0.04362334
## a5 a6 a7
## mxPH -0.01697947 -0.08388657 -0.08726106
## mnO2 0.17008979 0.15864906 -0.12117098
## Cl 0.16514900 0.18369968 -0.02793640
## NO3 0.22359794 0.54640569 0.08509789
## NH4 0.02745909 0.40571045 -0.01672691
## oPO4 0.15147500 0.02876159 0.04849832
## PO4 0.19111521 0.08316987 0.10671057
## Chla -0.05945318 0.01815732 0.02405581
## a1 -0.29503346 -0.27602608 -0.21142489
## a2 -0.16186215 -0.11613061 0.04749242
## a3 -0.11111997 -0.17283566 0.05618729
## a4 -0.11006558 -0.09074936 0.04362334
## a5 1.00000000 0.40360881 -0.02686306
## a6 0.40360881 1.00000000 -0.01244488
## a7 -0.02686306 -0.01244488 1.00000000
To make this more legible Will give you symbolic representation to see correlation values
symnum(cor(algae[,4:18], use = 'complete.obs'))
## mP mO Cl NO NH o P Ch a1 a2 a3 a4 a5 a6 a7
## mxPH 1
## mnO2 1
## Cl 1
## NO3 1
## NH4 , 1
## oPO4 . . 1
## PO4 . . * 1
## Chla . 1
## a1 . . . 1
## a2 . . 1
## a3 1
## a4 . . 1
## a5 1
## a6 . . . 1
## a7 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
library(corrplot)
## corrplot 0.92 loaded
cm <- cor(algae[,4:18], use = 'complete.obs')
corrplot(cm, type ='upper', tl.pos='d')
corrplot(cm, add=TRUE, type='lower', method='number', diag=FALSE, tl.pos='n', cl.pos='n')
Fitting a linear regression of PO4 on oPO4
# Reload the packaged data and drop the rows flagged by manyNAs() before
# fitting PO4 as a linear function of oPO4.
data(algae, package = "DMwR2")
sparse_rows <- manyNAs(algae)
if (length(sparse_rows) > 0) {
  # Negative indexing with an empty vector would drop every row.
  algae <- algae[-sparse_rows, ]
}
lm(PO4 ~ oPO4, data = algae)
##
## Call:
## lm(formula = PO4 ~ oPO4, data = algae)
##
## Coefficients:
## (Intercept) oPO4
## 42.897 1.293
# Reload the packaged data and drop the rows flagged by manyNAs().
data(algae, package = "DMwR2")
sparse_rows <- manyNAs(algae)
if (length(sparse_rows) > 0) {
  # Negative indexing with an empty vector would drop every row.
  algae <- algae[-sparse_rows, ]
}
# Predict PO4 from oPO4 with a straight-line model.
#
# Arguments:
#   oP         numeric vector of oPO4 values; NA entries are allowed.
#   intercept  model intercept; defaults to the rounded lm() estimate 42.897.
#   slope      model slope; defaults to the rounded lm() estimate 1.293.
#
# Returns a numeric vector the same length as `oP`, with NA wherever `oP` is
# missing. The result is always numeric, including for empty or all-NA input.
fillPO4 <- function(oP, intercept = 42.897, slope = 1.293) {
  predicted <- intercept + slope * oP
  predicted[is.na(oP)] <- NA_real_
  predicted
}
# Fill each missing PO4 from the oPO4 value on the same row. fillPO4() is
# already vectorised, so it is called once on the whole vector; sapply() would
# return an empty list rather than an empty vector when no PO4 is missing.
po4_missing <- is.na(algae$PO4)
algae[po4_missing, "PO4"] <- fillPO4(algae[po4_missing, "oPO4"])
Conditioned Histograms
library(ggplot2)
library(forcats)
## Warning: package 'forcats' was built under R version 4.0.5
# Reorder the factor levels of the three nominal columns so that they follow
# their natural order instead of the alphabetical default.
algae$size <- fct_relevel(algae$size, c('small', 'medium', 'large'))
algae$speed <- fct_relevel(algae$speed, c('low', 'medium', 'high'))
algae$season <- fct_relevel(algae$season, c('spring', 'summer', 'autumn', 'winter'))

# Histogram of mxPH, one panel per season.
mxph_hist <- geom_histogram(binwidth = 0.5)
ggplot(algae, aes(x = mxPH)) +
  mxph_hist +
  facet_wrap(~ season)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
Filling in Unknown Values by Exploring Similarities between Cases
# Reload the packaged data, drop the rows flagged by manyNAs(), then fill the
# remaining missing values with knnImputation() using k = 10.
data(algae, package = "DMwR2")
sparse_rows <- manyNAs(algae)
if (length(sparse_rows) > 0) {
  # Negative indexing with an empty vector would drop every row.
  algae <- algae[-sparse_rows, ]
}
algae <- knnImputation(algae, k = 10)
First Regression Model
# Rebuild a clean data set: reload, drop the rows flagged by manyNAs(), and
# impute the remaining gaps with knnImputation() (k = 10).
data(algae, package = "DMwR2")
sparse_rows <- manyNAs(algae)
if (length(sparse_rows) > 0) {
  # Negative indexing with an empty vector would drop every row.
  algae <- algae[-sparse_rows, ]
}
clean.algae <- knnImputation(algae, k = 10)

# Columns 1:12 hold the predictors (season .. Chla) plus the response a1, so
# `a1 ~ .` regresses a1 on every predictor and ignores a2..a7.
lm.a1 <- lm(a1 ~ ., data = clean.algae[, 1:12])
summary(lm.a1)
##
## Call:
## lm(formula = a1 ~ ., data = clean.algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.679 -11.893 -2.567 7.410 62.190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.942055 24.010879 1.788 0.07537 .
## seasonspring 3.726978 4.137741 0.901 0.36892
## seasonsummer 0.747597 4.020711 0.186 0.85270
## seasonwinter 3.692955 3.865391 0.955 0.34065
## sizemedium 3.263728 3.802051 0.858 0.39179
## sizesmall 9.682140 4.179971 2.316 0.02166 *
## speedlow 3.922084 4.706315 0.833 0.40573
## speedmedium 0.246764 3.241874 0.076 0.93941
## mxPH -3.589118 2.703528 -1.328 0.18598
## mnO2 1.052636 0.705018 1.493 0.13715
## Cl -0.040172 0.033661 -1.193 0.23426
## NO3 -1.511235 0.551339 -2.741 0.00674 **
## NH4 0.001634 0.001003 1.628 0.10516
## oPO4 -0.005435 0.039884 -0.136 0.89177
## PO4 -0.052241 0.030755 -1.699 0.09109 .
## Chla -0.088022 0.079998 -1.100 0.27265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.65 on 182 degrees of freedom
## Multiple R-squared: 0.3731, Adjusted R-squared: 0.3215
## F-statistic: 7.223 on 15 and 182 DF, p-value: 2.444e-12
anova(lm.a1)
## Analysis of Variance Table
##
## Response: a1
## Df Sum Sq Mean Sq F value Pr(>F)
## season 3 85 28.2 0.0905 0.9651944
## size 2 11401 5700.7 18.3088 5.69e-08 ***
## speed 2 3934 1967.2 6.3179 0.0022244 **
## mxPH 1 1329 1328.8 4.2677 0.0402613 *
## mnO2 1 2287 2286.8 7.3444 0.0073705 **
## Cl 1 4304 4304.3 13.8239 0.0002671 ***
## NO3 1 3418 3418.5 10.9789 0.0011118 **
## NH4 1 404 403.6 1.2963 0.2563847
## oPO4 1 4788 4788.0 15.3774 0.0001246 ***
## PO4 1 1406 1405.6 4.5142 0.0349635 *
## Chla 1 377 377.0 1.2107 0.2726544
## Residuals 182 56668 311.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Regression 2
lm2.a1 <- update(lm.a1, .~ . - season)
summary(lm2.a1)
##
## Call:
## lm(formula = a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 +
## oPO4 + PO4 + Chla, data = clean.algae[, 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.460 -11.953 -3.044 7.444 63.730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.9532874 23.2378377 1.934 0.05458 .
## sizemedium 3.3092102 3.7825221 0.875 0.38278
## sizesmall 10.2730961 4.1223163 2.492 0.01358 *
## speedlow 3.0546270 4.6108069 0.662 0.50848
## speedmedium -0.2976867 3.1818585 -0.094 0.92556
## mxPH -3.2684281 2.6576592 -1.230 0.22033
## mnO2 0.8011759 0.6589644 1.216 0.22561
## Cl -0.0381881 0.0333791 -1.144 0.25407
## NO3 -1.5334300 0.5476550 -2.800 0.00565 **
## NH4 0.0015777 0.0009951 1.586 0.11456
## oPO4 -0.0062392 0.0395086 -0.158 0.87469
## PO4 -0.0509543 0.0305189 -1.670 0.09669 .
## Chla -0.0841371 0.0794459 -1.059 0.29096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.57 on 185 degrees of freedom
## Multiple R-squared: 0.3682, Adjusted R-squared: 0.3272
## F-statistic: 8.984 on 12 and 185 DF, p-value: 1.762e-13
anova(lm.a1, lm2.a1)
## Analysis of Variance Table
##
## Model 1: a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
## PO4 + Chla
## Model 2: a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
## Chla
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 182 56668
## 2 185 57116 -3 -447.62 0.4792 0.6971
final.lm <- step(lm.a1)
## Start: AIC=1152.03
## a1 ~ season + size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 +
## PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - season 3 447.62 57116 1147.6
## - speed 2 269.60 56938 1149.0
## - oPO4 1 5.78 56674 1150.0
## - Chla 1 376.96 57045 1151.3
## - Cl 1 443.46 57112 1151.6
## - mxPH 1 548.76 57217 1151.9
## <none> 56668 1152.0
## - mnO2 1 694.11 57363 1152.4
## - NH4 1 825.67 57494 1152.9
## - PO4 1 898.42 57567 1153.1
## - size 2 1857.16 58526 1154.4
## - NO3 1 2339.36 59008 1158.0
##
## Step: AIC=1147.59
## a1 ~ size + speed + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 +
## Chla
##
## Df Sum of Sq RSS AIC
## - speed 2 210.64 57327 1144.3
## - oPO4 1 7.70 57124 1145.6
## - Chla 1 346.27 57462 1146.8
## - Cl 1 404.10 57520 1147.0
## - mnO2 1 456.37 57572 1147.2
## - mxPH 1 466.95 57583 1147.2
## <none> 57116 1147.6
## - NH4 1 776.11 57892 1148.3
## - PO4 1 860.62 57977 1148.5
## - size 2 2175.59 59292 1151.0
## - NO3 1 2420.47 59537 1153.8
##
## Step: AIC=1144.31
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + oPO4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - oPO4 1 16.29 57343 1142.4
## - Chla 1 223.29 57550 1143.1
## - mnO2 1 413.77 57740 1143.7
## - Cl 1 472.70 57799 1143.9
## - mxPH 1 483.56 57810 1144.0
## <none> 57327 1144.3
## - NH4 1 720.19 58047 1144.8
## - PO4 1 809.30 58136 1145.1
## - size 2 2060.95 59388 1147.3
## - NO3 1 2379.75 59706 1150.4
##
## Step: AIC=1142.37
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4 + Chla
##
## Df Sum of Sq RSS AIC
## - Chla 1 207.7 57551 1141.1
## - mnO2 1 402.6 57746 1141.8
## - Cl 1 470.7 57814 1142.0
## - mxPH 1 519.7 57863 1142.2
## <none> 57343 1142.4
## - NH4 1 704.4 58047 1142.8
## - size 2 2050.3 59393 1145.3
## - NO3 1 2370.4 59713 1148.4
## - PO4 1 5818.4 63161 1159.5
##
## Step: AIC=1141.09
## a1 ~ size + mxPH + mnO2 + Cl + NO3 + NH4 + PO4
##
## Df Sum of Sq RSS AIC
## - mnO2 1 435.3 57986 1140.6
## - Cl 1 438.1 57989 1140.6
## <none> 57551 1141.1
## - NH4 1 746.9 58298 1141.6
## - mxPH 1 833.1 58384 1141.9
## - size 2 2217.5 59768 1144.6
## - NO3 1 2667.1 60218 1148.1
## - PO4 1 6309.7 63860 1159.7
##
## Step: AIC=1140.58
## a1 ~ size + mxPH + Cl + NO3 + NH4 + PO4
##
## Df Sum of Sq RSS AIC
## - NH4 1 531.0 58517 1140.4
## - Cl 1 584.9 58571 1140.6
## <none> 57986 1140.6
## - mxPH 1 819.1 58805 1141.4
## - size 2 2478.2 60464 1144.9
## - NO3 1 2251.4 60237 1146.1
## - PO4 1 9097.9 67084 1167.4
##
## Step: AIC=1140.38
## a1 ~ size + mxPH + Cl + NO3 + PO4
##
## Df Sum of Sq RSS AIC
## <none> 58517 1140.4
## - mxPH 1 784.1 59301 1141.0
## - Cl 1 835.6 59353 1141.2
## - NO3 1 1987.9 60505 1145.0
## - size 2 2664.3 61181 1145.2
## - PO4 1 8575.8 67093 1165.5
summary(final.lm)
##
## Call:
## lm(formula = a1 ~ size + mxPH + Cl + NO3 + PO4, data = clean.algae[,
## 1:12])
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.874 -12.732 -3.741 8.424 62.926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.28555 20.96132 2.733 0.00687 **
## sizemedium 2.80050 3.40190 0.823 0.41141
## sizesmall 10.40636 3.82243 2.722 0.00708 **
## mxPH -3.97076 2.48204 -1.600 0.11130
## Cl -0.05227 0.03165 -1.651 0.10028
## NO3 -0.89529 0.35148 -2.547 0.01165 *
## PO4 -0.05911 0.01117 -5.291 3.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.5 on 191 degrees of freedom
## Multiple R-squared: 0.3527, Adjusted R-squared: 0.3324
## F-statistic: 17.35 on 6 and 191 DF, p-value: 5.554e-16
Regression Tree
library(rpart)

# Regression tree for a1. The packaged data is reloaded and only the rows
# flagged by manyNAs() are dropped; no imputation is applied before fitting.
data(algae, package = "DMwR2")
sparse_rows <- manyNAs(algae)
if (length(sparse_rows) > 0) {
  # Negative indexing with an empty vector would drop every row.
  algae <- algae[-sparse_rows, ]
}

# Columns 1:12 hold the predictors plus the response a1.
rt.a1 <- rpart(a1 ~ ., data = algae[, 1:12])
rt.a1
## n= 198
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 198 90401.290 16.996460
## 2) PO4>=43.818 147 31279.120 8.979592
## 4) Cl>=7.8065 140 21622.830 7.492857
## 8) oPO4>=51.118 84 3441.149 3.846429 *
## 9) oPO4< 51.118 56 15389.430 12.962500
## 18) mnO2>=10.05 24 1248.673 6.716667 *
## 19) mnO2< 10.05 32 12502.320 17.646870
## 38) NO3>=3.1875 9 257.080 7.866667 *
## 39) NO3< 3.1875 23 11047.500 21.473910
## 78) mnO2< 8 13 2919.549 13.807690 *
## 79) mnO2>=8 10 6370.704 31.440000 *
## 5) Cl< 7.8065 7 3157.769 38.714290 *
## 3) PO4< 43.818 51 22442.760 40.103920
## 6) mxPH< 7.87 28 11452.770 33.450000
## 12) mxPH>=7.045 18 5146.169 26.394440 *
## 13) mxPH< 7.045 10 3797.645 46.150000 *
## 7) mxPH>=7.87 23 8241.110 48.204350
## 14) PO4>=15.177 12 3047.517 38.183330 *
## 15) PO4< 15.177 11 2673.945 59.136360 *
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
prp(rt.a1, extra=101,box.col='orange', split.box.col = 'grey')
printcp(rt.a1)
##
## Regression tree:
## rpart(formula = a1 ~ ., data = algae[, 1:12])
##
## Variables actually used in tree construction:
## [1] Cl mnO2 mxPH NO3 oPO4 PO4
##
## Root node error: 90401/198 = 456.57
##
## n= 198
##
## CP nsplit rel error xerror xstd
## 1 0.405740 0 1.00000 1.00474 0.12975
## 2 0.071885 1 0.59426 0.71614 0.11403
## 3 0.030887 2 0.52237 0.71595 0.11587
## 4 0.030408 3 0.49149 0.66589 0.10731
## 5 0.027872 4 0.46108 0.65438 0.10559
## 6 0.027754 5 0.43321 0.65438 0.10559
## 7 0.018124 6 0.40545 0.66824 0.10774
## 8 0.016344 7 0.38733 0.70031 0.10799
## 9 0.010000 9 0.35464 0.72093 0.11035