##### LOADING AND PREPPING DATA #####
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'readr' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'forcats' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.1
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(class)
## Warning: package 'class' was built under R version 4.3.1
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.1
water_raw <- read.csv("C:\\Users\\abhil\\Downloads\\archive (6)\\water_potability.csv")
water_raw$ Potability <- as.factor(water_raw$Potability)
# removing instances with NA for any variables
water_clean <- water_raw[rowSums(is.na(water_raw))==0,]
set.seed(489)
testing_ratio <- .25
water_clean_test <- water_clean[sample(c(1:nrow(water_clean)),
size = ceiling(nrow(water_clean)*testing_ratio)),]
water_clean_mod <- initial_split(anti_join(water_clean,water_clean_test), prop = .75)
## Joining with `by = join_by(ph, Hardness, Solids, Chloramines, Sulfate,
## Conductivity, Organic_carbon, Trihalomethanes, Turbidity, Potability)`
#train
water_clean_train <- training(water_clean_mod)
#tune
water_clean_tune <- testing(water_clean_mod)
summary(water_clean_train)
## ph Hardness Solids Chloramines
## Min. : 0.2275 Min. : 73.49 Min. : 320.9 Min. : 1.920
## 1st Qu.: 6.0256 1st Qu.:176.80 1st Qu.:15547.8 1st Qu.: 6.103
## Median : 6.9898 Median :197.56 Median :20769.5 Median : 7.121
## Mean : 7.0518 Mean :196.15 Mean :21784.5 Mean : 7.128
## 3rd Qu.: 8.0238 3rd Qu.:216.75 3rd Qu.:27067.5 3rd Qu.: 8.115
## Max. :14.0000 Max. :317.34 Max. :56351.4 Max. :13.127
## Sulfate Conductivity Organic_carbon Trihalomethanes
## Min. :182.4 Min. :201.6 Min. : 2.20 Min. : 8.577
## 1st Qu.:306.5 1st Qu.:364.7 1st Qu.:12.12 1st Qu.: 56.037
## Median :332.8 Median :422.0 Median :14.32 Median : 66.376
## Mean :333.4 Mean :426.5 Mean :14.35 Mean : 66.668
## 3rd Qu.:361.0 3rd Qu.:482.5 3rd Qu.:16.68 3rd Qu.: 77.697
## Max. :475.7 Max. :753.3 Max. :24.76 Max. :120.030
## Turbidity Potability
## Min. :1.450 0:672
## 1st Qu.:3.442 1:459
## Median :3.965
## Mean :3.974
## 3rd Qu.:4.511
## Max. :6.495
dim(water_clean_train)
## [1] 1131 10
plot(water_clean_train)
# taking a look at the histograms with facet wrapping
melt(water_clean_train) %>% ggplot(aes(value)) +
geom_histogram() +
facet_wrap(~ variable, scales="free")
## Using Potability as id variables
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# multiple violin with facet wrapping
melt(water_clean_train) %>% ggplot(aes(variable,value)) +
geom_boxplot(alpha = .5, fill = "purple") +
geom_violin(alpha = .5, fill = "yellow") +
facet_wrap( ~ variable, scales="free")
## Using Potability as id variables
# this shows us conducitivty and solids are skewed whereas the others #
seem more bell-like, perhaps they are related?, lets plot the cor
map
cor_mat <- water_clean_train %>% select(-Potability) %>% cor()
melt(cor_mat) %>% ggplot(aes(x=Var1, y=Var2, fill=value, na.rm = TRUE)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red")
# outliers - taking a look at particularly unusual observations
outlier_lst <- list()
for (n in 1:(ncol(water_clean_train) -1)){
# upper outliers
lower <- which(water_clean_train[,n] > mean(water_clean_train[,n]) + 2.8*sd(water_clean_train[,n]))
# lower outliers
upper <- which(water_clean_train[,n] < mean(water_clean_train[,n]) - 2.8*sd(water_clean_train[,n]))
outliers <- c(lower, upper)
outlier_lst[[colnames(water_clean_train)[n]]] <- outliers
}
print(outlier_lst)
## $ph
## [1] 244 265 785 857 6 569 624 771 1034 1079
##
## $Hardness
## [1] 84 87 266 565 629 814 840 857 1079
##
## $Solids
## [1] 276 564 589 771 813 820 861 1063
##
## $Chloramines
## [1] 60 483 1093 64 505 843 1089
##
## $Sulfate
## [1] 242 355 708 993 605 774 828 1093 1128
##
## $Conductivity
## [1] 387 487 524 886 955
##
## $Organic_carbon
## [1] 68 299 866 1114
##
## $Trihalomethanes
## [1] 336 548 645 740 895 186 371 403 408 763
##
## $Turbidity
## [1] 581 714 901 1049 130 300 933
Reduce(intersect, list(outlier_lst[[1]], outlier_lst[[2]],
outlier_lst[[3]], outlier_lst[[4]],
outlier_lst[[5]], outlier_lst[[6]],
outlier_lst[[7]], outlier_lst[[8]],
outlier_lst[[9]]))
## integer(0)
table(c(outlier_lst[[1]], outlier_lst[[2]],
outlier_lst[[3]], outlier_lst[[4]],
outlier_lst[[5]], outlier_lst[[6]],
outlier_lst[[7]], outlier_lst[[8]],
outlier_lst[[9]]))
##
## 6 60 64 68 84 87 130 186 242 244 265 266 276 299 300 336
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 355 371 387 403 408 483 487 505 524 548 564 565 569 581 589 605
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 624 629 645 708 714 740 763 771 774 785 813 814 820 828 840 843
## 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
## 857 861 866 886 895 901 933 955 993 1034 1049 1063 1079 1089 1093 1114
## 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1
## 1128
## 1
melt(water_clean_train) %>% ggplot(aes(value, fill = Potability)) +
geom_histogram(alpha = .5) +
facet_wrap(~ variable, scales="free")
## Using Potability as id variables
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# difference is probably due to count differences in potability/non
potability, let’s confirm through density plots…
melt(water_clean_train) %>% ggplot(aes(value, fill = Potability)) +
geom_density(alpha = .5) +
facet_wrap(~ variable, scales="free")
## Using Potability as id variables
# distributions via violin plots and boxplots broken down by
potability
melt(water_clean_train) %>% ggplot(aes(variable,value, fill = Potability)) +
geom_boxplot(alpha = .5) +
geom_violin(alpha = .5) +
facet_wrap( ~ variable, scales="free")
## Using Potability as id variables
outlier_lst_potable <- list()
water_clean_train_potable <- water_clean_train %>% filter(Potability == 1)
for (n in 1:(ncol(water_clean_train_potable) -1)){
# upper outliers
lower <- which(water_clean_train_potable[,n] > mean(water_clean_train_potable[,n]) + 2.8*sd(water_clean_train_potable[,n]))
# lower outliers
upper <- which(water_clean_train_potable[,n] < mean(water_clean_train_potable[,n]) - 2.8*sd(water_clean_train_potable[,n]))
outliers <- c(lower, upper)
outlier_lst_potable[[colnames(water_clean_train_potable)[n]]] <- outliers
}
print(outlier_lst_potable)
## $ph
## [1] 39 99 349 394 419 440
##
## $Hardness
## [1] 36 39 100 221 254 349
##
## $Solids
## [1] 220 430
##
## $Chloramines
## [1] 28 445 346
##
## $Sulfate
## [1] 403 341 445
##
## $Conductivity
## [1] 390
##
## $Organic_carbon
## [1] 159 453
##
## $Trihalomethanes
## [1] 263 75 156
##
## $Turbidity
## [1] 369 424 114
outlier_lst_non_potable <- list()
water_clean_train_non_potable <- water_clean_train %>% filter(Potability == 0)
for (n in 1:(ncol(water_clean_train_non_potable) -1)){
# upper outliers
lower <- which(water_clean_train_non_potable[,n] > mean(water_clean_train_non_potable[,n]) + 2.8*sd(water_clean_train_non_potable[,n]))
# lower outliers
upper <- which(water_clean_train_non_potable[,n] < mean(water_clean_train_non_potable[,n]) - 2.8*sd(water_clean_train_non_potable[,n]))
outliers <- c(lower, upper)
outlier_lst_non_potable[[colnames(water_clean_train_non_potable)[n]]] <- outliers
}
print(outlier_lst_non_potable)
log_model <- glm(Potability ~ ., family = "binomial", data = water_clean_train)
summary(log_model)
##
## Call:
## glm(formula = Potability ~ ., family = "binomial", data = water_clean_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.143e-01 9.790e-01 -0.730 0.466
## ph 3.911e-02 3.906e-02 1.001 0.317
## Hardness 4.195e-04 1.857e-03 0.226 0.821
## Solids 3.851e-06 7.149e-06 0.539 0.590
## Chloramines 2.456e-02 3.861e-02 0.636 0.525
## Sulfate 2.072e-04 1.485e-03 0.140 0.889
## Conductivity -1.086e-03 7.520e-04 -1.444 0.149
## Organic_carbon -2.057e-02 1.812e-02 -1.135 0.256
## Trihalomethanes 2.569e-04 3.713e-03 0.069 0.945
## Turbidity 9.714e-02 7.807e-02 1.244 0.213
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1527.5 on 1130 degrees of freedom
## Residual deviance: 1521.0 on 1121 degrees of freedom
## AIC: 1541
##
## Number of Fisher Scoring iterations: 4
log_probs <- plogis(predict(log_model, water_clean_tune))
log_preds <- as_tibble(log_probs) %>%
mutate(Potability = ifelse(value>.5, 1, 0))
log_preds <- log_preds$Potability %>% factor()
confusionMatrix(log_preds, water_clean_tune$Potability, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 229 144
## 1 1 3
##
## Accuracy : 0.6154
## 95% CI : (0.5642, 0.6647)
## No Information Rate : 0.6101
## P-Value [Acc > NIR] : 0.4386
##
## Kappa : 0.0195
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.020408
## Specificity : 0.995652
## Pos Pred Value : 0.750000
## Neg Pred Value : 0.613941
## Prevalence : 0.389920
## Detection Rate : 0.007958
## Detection Prevalence : 0.010610
## Balanced Accuracy : 0.508030
##
## 'Positive' Class : 1
##
log_probs <- plogis(predict(log_model, water_clean_test))
log_preds <- as_tibble(log_probs) %>%
mutate(Potability = ifelse(value>.5, 1, 0))
log_preds <- log_preds$Potability %>% factor()
confusionMatrix(log_preds, water_clean_test$Potability, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 295 203
## 1 3 2
##
## Accuracy : 0.5905
## 95% CI : (0.5461, 0.6338)
## No Information Rate : 0.5924
## P-Value [Acc > NIR] : 0.5552
##
## Kappa : -4e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.009756
## Specificity : 0.989933
## Pos Pred Value : 0.400000
## Neg Pred Value : 0.592369
## Prevalence : 0.407555
## Detection Rate : 0.003976
## Detection Prevalence : 0.009940
## Balanced Accuracy : 0.499844
##
## 'Positive' Class : 1
##
tree_model <- rpart(Potability ~ ., method = "class", data = water_clean_train)
tree_preds <- predict(tree_model, water_clean_tune, type = "class")
confusionMatrix(tree_preds, water_clean_tune$Potability, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 185 84
## 1 45 63
##
## Accuracy : 0.6578
## 95% CI : (0.6075, 0.7056)
## No Information Rate : 0.6101
## P-Value [Acc > NIR] : 0.0315460
##
## Kappa : 0.2446
##
## Mcnemar's Test P-Value : 0.0008207
##
## Sensitivity : 0.4286
## Specificity : 0.8043
## Pos Pred Value : 0.5833
## Neg Pred Value : 0.6877
## Prevalence : 0.3899
## Detection Rate : 0.1671
## Detection Prevalence : 0.2865
## Balanced Accuracy : 0.6165
##
## 'Positive' Class : 1
##
tree_preds <- predict(tree_model, water_clean_test, type = "class")
confusionMatrix(tree_preds, water_clean_test$Potability, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 242 130
## 1 56 75
##
## Accuracy : 0.6302
## 95% CI : (0.5864, 0.6725)
## No Information Rate : 0.5924
## P-Value [Acc > NIR] : 0.04605
##
## Kappa : 0.1886
##
## Mcnemar's Test P-Value : 8.669e-08
##
## Sensitivity : 0.3659
## Specificity : 0.8121
## Pos Pred Value : 0.5725
## Neg Pred Value : 0.6505
## Prevalence : 0.4076
## Detection Rate : 0.1491
## Detection Prevalence : 0.2604
## Balanced Accuracy : 0.5890
##
## 'Positive' Class : 1
##
forest_model <- randomForest(Potability ~ ., data= water_clean_train, ntree= 1000)
forest_preds <- predict(forest_model, water_clean_tune, type = "class")
confusionMatrix(forest_preds, water_clean_tune$Potability, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 198 84
## 1 32 63
##
## Accuracy : 0.6923
## 95% CI : (0.643, 0.7386)
## No Information Rate : 0.6101
## P-Value [Acc > NIR] : 0.0005462
##
## Kappa : 0.3092
##
## Mcnemar's Test P-Value : 2.188e-06
##
## Sensitivity : 0.4286
## Specificity : 0.8609
## Pos Pred Value : 0.6632
## Neg Pred Value : 0.7021
## Prevalence : 0.3899
## Detection Rate : 0.1671
## Detection Prevalence : 0.2520
## Balanced Accuracy : 0.6447
##
## 'Positive' Class : 1
##
forest_preds <- predict(forest_model, water_clean_test, type = "class")
confusionMatrix(forest_preds, water_clean_test$Potability, positive='1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 242 131
## 1 56 74
##
## Accuracy : 0.6282
## 95% CI : (0.5843, 0.6706)
## No Information Rate : 0.5924
## P-Value [Acc > NIR] : 0.05562
##
## Kappa : 0.1835
##
## Mcnemar's Test P-Value : 6.253e-08
##
## Sensitivity : 0.3610
## Specificity : 0.8121
## Pos Pred Value : 0.5692
## Neg Pred Value : 0.6488
## Prevalence : 0.4076
## Detection Rate : 0.1471
## Detection Prevalence : 0.2584
## Balanced Accuracy : 0.5865
##
## 'Positive' Class : 1
##