Water Potability

Data Cleaning and Preprocessing

setting WD

setwd("C:/Users/Emily/Dropbox/My PC (DESKTOP-39IBD6A)/Desktop/Brit School/2021 Fall/DApplications Project")
water = read.csv("water_potability.csv")

library(tidyverse)
library(gridExtra)
str(water)
## 'data.frame':    3276 obs. of  10 variables:
##  $ ph             : num  NA 3.72 8.1 8.32 9.09 ...
##  $ Hardness       : num  205 129 224 214 181 ...
##  $ Solids         : num  20791 18630 19910 22018 17979 ...
##  $ Chloramines    : num  7.3 6.64 9.28 8.06 6.55 ...
##  $ Sulfate        : num  369 NA NA 357 310 ...
##  $ Conductivity   : num  564 593 419 363 398 ...
##  $ Organic_carbon : num  10.4 15.2 16.9 18.4 11.6 ...
##  $ Trihalomethanes: num  87 56.3 66.4 100.3 32 ...
##  $ Turbidity      : num  2.96 4.5 3.06 4.63 4.08 ...
##  $ Potability     : int  0 0 0 0 0 0 0 0 0 0 ...

Searching for NaNs

anyNA(water)
## [1] TRUE
colSums(water == "NA")
##              ph        Hardness          Solids     Chloramines         Sulfate 
##              NA               0               0               0              NA 
##    Conductivity  Organic_carbon Trihalomethanes       Turbidity      Potability 
##               0               0              NA               0               0

Ph Trihalomethanes and Sulfate have missing values

summary(water$ph)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   6.093   7.037   7.081   8.062  14.000     491

Ph has 491 NAs which is 14.99% of the data in that feature

summary(water$Sulfate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   129.0   307.7   333.1   333.8   360.0   481.0     781

sulfate has 781 NAs which is 23.84% of the data in that feature

summary(water$Trihalomethanes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.738  55.845  66.622  66.396  77.337 124.000     162

Trihalomethanes has 162 NaNs which is 04.95%

Going to omit the NA’s and save it as a new dataset called “cleanwater”

cleanwater <- na.omit(water)
anyNA(cleanwater)
## [1] FALSE

Exploratory Analysis

Comparing histogram of variables with missing features

grid.arrange(
  ggplot(data = water, aes( ph)) + 
    geom_histogram() +
    labs(title = "Original Data"),
    ggplot(data = cleanwater, aes( ph)) + 
    geom_histogram() +
    labs(title = "Data with NAs Omitted"),
    ggplot(data = water, aes( Sulfate)) + 
    geom_histogram() +
    labs(title = "Original Data"),
    ggplot(data = cleanwater, aes( Sulfate)) + 
    geom_histogram() +
    labs(title = "Data with NAs Omitted"),
    ggplot(data = water, aes( Trihalomethanes)) + 
    geom_histogram() +
    labs(title = "Original Data"),
    ggplot(data = cleanwater, aes( Trihalomethanes)) + 
    geom_histogram() +
    labs(title = "Data with NAs Omitted"),
ncol = 2,
nrow = 3
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 491 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 781 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 162 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

hist(water$Hardness)

hist(water$Solids)

hist(water$Chloramines)

hist(water$Conductivity)

hist(water$Organic_carbon)

hist(water$Trihalomethanes)

hist(water$Turbidity)

# and what we want to predict, y
hist(water$Potability)

Comparing the distributions of each of the variables between potable water and non potable water

# create a function to make life easier to plot all variables
var_density = function(data, var){
  data %>%
    mutate(
      Potability = as.factor(Potability)
    ) %>%
    ggplot(aes_string(x = var, fill = "Potability")) +
    geom_density(alpha = 0.5) 
}


grid.arrange(
  var_density(cleanwater, "ph"),
  var_density(cleanwater, "Hardness"),
  var_density(cleanwater, "Solids"),
  var_density(cleanwater, "Chloramines"),
  var_density(cleanwater, "Sulfate"),
  var_density(cleanwater, "Conductivity"),
  var_density(cleanwater, "Organic_carbon"),
  var_density(cleanwater, "Trihalomethanes"),
  var_density(cleanwater, "Turbidity"),
  ncol = 3, 
  nrow = 3
) 

Water potability and pH show that most potable water is near a pH of 7, which is neutral and is what would be expected for drinkability. Non potable water has a wider bell shape, showing that there is a higher frequency of either more extreme pH, high (basic) or low (acidic).

Water hardness and potability show was is visually a similar curve with the exception that non potable water high a higher density at the top of the curve. This may be due to the fact that there are more occorisions of non potable water in this data set as compared to potable.

Solids and water potability show a lower amount of solids corresponds with a higher likelihood of water being potable.

Chloramines appear to be okay in potable water up until 7, then the graph tapers down.

Sulfate shows a wider curve for potable water that may also suggest that it is due to the balance between potable and non potable water seen in the data

Box Plot variables & Potability

    # Box Plot
grid.arrange(
  ggplot(data = cleanwater, aes(x = as.character(Potability), y = Sulfate)) +
    geom_boxplot(),
  ggplot(data = cleanwater, aes(x =  as.character(Potability), y = ph)) +
    geom_boxplot(),
   ggplot(data = cleanwater, aes(x =  as.character(Potability), y = Hardness)) +
    geom_boxplot(),
   ggplot(data = cleanwater, aes(x =  as.character(Potability), y = Solids)) +
    geom_boxplot(),
   ggplot(data = cleanwater, aes(x =  as.character(Potability), y = Chloramines)) +
    geom_boxplot(),
   ggplot(data = cleanwater, aes(x =  as.character(Potability), y = Conductivity)) +
    geom_boxplot(),
   ggplot(data = cleanwater, aes(x =  as.character(Potability), y = Organic_carbon)) +
    geom_boxplot(),
   ggplot(data = cleanwater, aes(x =  as.character(Potability), y = Trihalomethanes)) +
    geom_boxplot(),
   ggplot(data = cleanwater, aes(x =  as.character(Potability), y = Turbidity)) +
    geom_boxplot(),
  ncol = 3, 
  nrow = 3
)

Based on the above box plots I want to look at the difference in means for Trubidity as they appear to potentially be different.

“Note: you must be very careful with the issue of multiple testing (also referred as multiplicity) which can arise when you perform multiple t-tests. In short, when a large number of statistical tests are performed, some will have p-values less than 0.05 purely by chance, even if all null hypotheses are in fact really true. This is known as multiplicity or multiple testing.” -https://statsandr.com/blog/how-to-do-a-t-test-or-anova-for-many-variables-at-once-in-r-and-communicate-the-results-in-a-better-way/

t.test(Turbidity ~ Potability, data = cleanwater, alternative = "two.sided", var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  Turbidity by Potability
## t = -1.0169, df = 2009, p-value = 0.3093
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.10563897  0.03349347
## sample estimates:
## mean in group 0 mean in group 1 
##        3.955181        3.991254

However Trubidity does not appear to be significantly different between potable and non potable water

Relationship of features and Potability

library(ggplot2)
library(plotly)

Lets take a look at correlation, if any, among the features

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
corrplot.mixed(cor(cleanwater), upper = "square")

There does not appear to be high correlation among the features.

Checking for multicollinearity

#VIF and removal
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
glm.fit = 
  glm(
  Potability ~ ph + Hardness + Solids + Chloramines + Conductivity + Organic_carbon + Trihalomethanes + Turbidity + Sulfate ,
  data = cleanwater, 
  family = binomial
  )

vif(glm.fit)
##              ph        Hardness          Solids     Chloramines    Conductivity 
##        1.021693        1.030639        1.043358        1.005863        1.001757 
##  Organic_carbon Trihalomethanes       Turbidity         Sulfate 
##        1.002687        1.002304        1.003486        1.044548

All features are below a VIF of 10, so we’ll proceed with these features for modeling.

Pre Processing for Model

cleanwater$Potability <-as.factor(cleanwater$Potability)

Test Train Split

tr_ind = sample(nrow(cleanwater), 0.7*nrow(cleanwater), replace = FALSE)
water.train = cleanwater[tr_ind,]
water.test = cleanwater[-tr_ind,]

#Model SVM

library(e1071)
model1 = Potability~ .

tune svm

tune <- tune.svm(model1, data = water.train, gamma =seq(.01, 0.1, by = .01), cost = seq(0.1,1, by = 0.1))

tune params

tune$best.parameters
##    gamma cost
## 70   0.1  0.7
library(dplyr)
head(tune$performances)
##   gamma cost     error dispersion
## 1  0.01  0.1 0.4022796 0.03585994
## 2  0.02  0.1 0.4022796 0.03585994
## 3  0.03  0.1 0.4022796 0.03585994
## 4  0.04  0.1 0.4022796 0.03585994
## 5  0.05  0.1 0.4022796 0.03585994
## 6  0.06  0.1 0.4022796 0.03585994
min(tune$performances$error)
## [1] 0.3084245
tune$performances %>%
  filter(error == min(tune$performances$error))
##   gamma cost     error dispersion
## 1   0.1  0.7 0.3084245 0.03627312
mysvm <- svm(formula=model1, data = water.train, gamma = tune$best.parameters$gamma, cost = tune$best.parameters$cost)
summary(mysvm)
## 
## Call:
## svm(formula = model1, data = water.train, gamma = tune$best.parameters$gamma, 
##     cost = tune$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  0.7 
## 
## Number of Support Vectors:  1091
## 
##  ( 528 563 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svmpredict <- predict(mysvm, water.test, type = "response")


#Confusion matrix
table(pred = svmpredict, true=water.test$Potability)
##     true
## pred   0   1
##    0 331 171
##    1  28  74
library(caret)
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
svm_conf_matrix = confusionMatrix(water.test$Potability, svmpredict)

svm_conf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 331  28
##          1 171  74
##                                           
##                Accuracy : 0.6705          
##                  95% CI : (0.6315, 0.7079)
##     No Information Rate : 0.8311          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2469          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6594          
##             Specificity : 0.7255          
##          Pos Pred Value : 0.9220          
##          Neg Pred Value : 0.3020          
##              Prevalence : 0.8311          
##          Detection Rate : 0.5480          
##    Detection Prevalence : 0.5944          
##       Balanced Accuracy : 0.6924          
##                                           
##        'Positive' Class : 0               
## 
svm_conf_matrix$overall["Accuracy"]
##  Accuracy 
## 0.6705298

logistic regression

model3 = glm(Potability ~.,family="binomial",data=water.train)

summary(model3)
## 
## Call:
## glm(formula = Potability ~ ., family = "binomial", data = water.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2429  -1.0182  -0.9492   1.3188   1.6281  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -1.048e+00  9.102e-01  -1.152   0.2494  
## ph               3.854e-02  3.511e-02   1.098   0.2723  
## Hardness        -1.498e-03  1.774e-03  -0.845   0.3984  
## Solids           1.510e-05  6.683e-06   2.260   0.0238 *
## Chloramines      6.641e-02  3.477e-02   1.910   0.0561 .
## Sulfate         -5.109e-04  1.355e-03  -0.377   0.7061  
## Conductivity     3.355e-05  6.802e-04   0.049   0.9607  
## Organic_carbon  -1.163e-02  1.671e-02  -0.696   0.4864  
## Trihalomethanes  8.400e-04  3.389e-03   0.248   0.8042  
## Turbidity        3.331e-02  6.939e-02   0.480   0.6312  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1896.4  on 1406  degrees of freedom
## Residual deviance: 1885.3  on 1397  degrees of freedom
## AIC: 1905.3
## 
## Number of Fisher Scoring iterations: 4

Interestingly enough, none of the variables come back significant for alpha = 0.05.

model3.pred <- predict(model3,newdata=water.test,type='response')
model3.pred <- ifelse(model3.pred > 0.5,1,0)
model3.misClasificError <- mean(model3.pred != water.test$Potability)
print(paste('Accuracy',1-model3.misClasificError))
## [1] "Accuracy 0.597682119205298"

Conclusion

The SVM model out performed the logistic regression model in terms of accuracy 0.5976821 vs 0.6705298 to predict water Potability.