Objective

Access to safe drinking water is essential to health, a basic human right, and a component of effective policy for health protection. It is an important health and development issue at the national, regional, and local levels. In some regions, it has been shown that investments in water supply and sanitation can yield a net economic benefit, since the reductions in adverse health effects and health care costs outweigh the costs of undertaking the interventions. The objective of this analysis is to build classification models that predict whether a water sample is potable based on its measured properties.

Data Preparation

#Library loading
library(dplyr)
library(e1071)
library(rsample)
library(caret)
library(ROCR)
library(partykit)
library(rsample)
library(randomForest)
library(rpart)
# Load the water potability data set from the CSV file into a data frame
water <- read.csv(file = "water_potability.csv", header = TRUE)
water

Data Wrangling

# Inspect the structure of the raw data: column names, types and first values
water %>%
  glimpse()
#> Rows: 3,276
#> Columns: 10
#> $ ph              <dbl> NA, 3.716080, 8.099124, 8.316766, 9.092223, 5.584087, ~
#> $ Hardness        <dbl> 204.8905, 129.4229, 224.2363, 214.3734, 181.1015, 188.~
#> $ Solids          <dbl> 20791.32, 18630.06, 19909.54, 22018.42, 17978.99, 2874~
#> $ Chloramines     <dbl> 7.300212, 6.635246, 9.275884, 8.059332, 6.546600, 7.54~
#> $ Sulfate         <dbl> 368.5164, NA, NA, 356.8861, 310.1357, 326.6784, 393.66~
#> $ Conductivity    <dbl> 564.3087, 592.8854, 418.6062, 363.2665, 398.4108, 280.~
#> $ Organic_carbon  <dbl> 10.379783, 15.180013, 16.868637, 18.436524, 11.558279,~
#> $ Trihalomethanes <dbl> 86.99097, 56.32908, 66.42009, 100.34167, 31.99799, 54.~
#> $ Turbidity       <dbl> 2.963135, 4.500656, 3.055934, 4.628771, 4.075075, 2.55~
#> $ Potability      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
# Count the missing values in each column
water %>%
  is.na() %>%
  colSums()
#>              ph        Hardness          Solids     Chloramines         Sulfate 
#>             491               0               0               0             781 
#>    Conductivity  Organic_carbon Trihalomethanes       Turbidity      Potability 
#>               0               0             162               0               0
# Fill NA values with the column mean of the same Potability class, make sure
# every predictor is numeric, and convert the Potability column to a factor.
#
# NOTE(review): the means are computed within each Potability class and on the
# full data set (before the train/test split), so the imputed values carry
# information from the target and from the rows later used for testing. The
# behaviour is kept so the results recorded below stay comparable, but
# consider imputing with statistics learned from the training split only.
water_clean <- water %>%
  group_by(Potability) %>%
  # Grouping columns are excluded from across(), so everything() selects the
  # predictors only; the mean is therefore taken per class and per column.
  mutate(across(
    everything(),
    ~ coalesce(as.numeric(.x), mean(.x, na.rm = TRUE))
  )) %>%
  # Drop the grouping so it does not silently carry over into water_train,
  # water_test and every later operation on this table.
  ungroup() %>%
  mutate(Potability = as.factor(Potability))

water_clean

Exploratory Data Analysis

# Share of each class (0 = not potable, 1 = potable) in the target variable
prop.table(
  table(water_clean[["Potability"]])
)
#> 
#>         0         1 
#> 0.6098901 0.3901099

We can treat the target variable as reasonably balanced: about 61% of the samples are class 0 (not potable) and about 39% are class 1 (potable).

Cross-Validation

# Split the data into a training set (80%) and a test set (20%)
# The RNG kind and the seed are fixed first so the drawn row indices are reproducible
RNGkind(sample.kind = "Rounding")
set.seed(1234)

# Row numbers drawn (without replacement) for the training set
index <- sample.int(
  n = nrow(water_clean),
  size = nrow(water_clean) * 0.8
)

# Rows that were drawn go to training; all remaining rows go to testing
water_test <- water_clean[-index, ]
water_train <- water_clean[index, ]

Naive-Bayes Model

Model Fitting

# Fit a Naive Bayes classifier for Potability using all other columns of the
# training set as predictors (no Laplace smoothing), then print the model
water_naive <- naiveBayes(
  Potability ~ .,
  data = water_train,
  laplace = 0
)
water_naive
#> 
#> Naive Bayes Classifier for Discrete Predictors
#> 
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#> 
#> A-priori probabilities:
#> Y
#>         0         1 
#> 0.6064885 0.3935115 
#> 
#> Conditional probabilities:
#>    ph
#> Y       [,1]     [,2]
#>   0 7.075419 1.544355
#>   1 7.059120 1.353061
#> 
#>    Hardness
#> Y       [,1]     [,2]
#>   0 196.9079 30.89728
#>   1 195.9732 35.60567
#> 
#>    Solids
#> Y       [,1]     [,2]
#>   0 21842.25 8519.572
#>   1 22320.76 8919.007
#> 
#>    Chloramines
#> Y       [,1]     [,2]
#>   0 7.056088 1.478515
#>   1 7.188880 1.713786
#> 
#>    Sulfate
#> Y       [,1]     [,2]
#>   0 334.6017 31.67072
#>   1 331.8996 40.85486
#> 
#>    Conductivity
#> Y       [,1]     [,2]
#>   0 426.1444 78.47286
#>   1 426.0730 81.50378
#> 
#>    Organic_carbon
#> Y       [,1]     [,2]
#>   0 14.39037 3.318138
#>   1 14.18224 3.197548
#> 
#>    Trihalomethanes
#> Y       [,1]     [,2]
#>   0 66.50815 15.57059
#>   1 66.49571 15.77251
#> 
#>    Turbidity
#> Y       [,1]      [,2]
#>   0 3.964457 0.7797357
#>   1 3.958181 0.7827792

Model Evaluation

# Predict the class of every test row with the Naive Bayes model and keep the
# result in a new `pred` column of the test set
water_test$pred <- predict(
  object = water_naive,
  newdata = water_test,
  type = "class"
)
head(water_test, n = 6L)

# Confusion matrix of predicted vs. actual class, with "1" as the positive class
confusionMatrix(
  data = water_test$pred,
  reference = water_test$Potability,
  positive = "1"
)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   0   1
#>          0 352 181
#>          1  57  66
#>                                           
#>                Accuracy : 0.6372          
#>                  95% CI : (0.5991, 0.6741)
#>     No Information Rate : 0.6235          
#>     P-Value [Acc > NIR] : 0.2473          
#>                                           
#>                   Kappa : 0.142           
#>                                           
#>  Mcnemar's Test P-Value : 1.55e-15        
#>                                           
#>             Sensitivity : 0.2672          
#>             Specificity : 0.8606          
#>          Pos Pred Value : 0.5366          
#>          Neg Pred Value : 0.6604          
#>              Prevalence : 0.3765          
#>          Detection Rate : 0.1006          
#>    Detection Prevalence : 0.1875          
#>       Balanced Accuracy : 0.5639          
#>                                           
#>        'Positive' Class : 1               
#> 

Decision Tree

# Fit a classification tree for Potability on the training set
water_tree <- rpart(
  formula = Potability ~ .,
  data = water_train,
  method = "class"
)

# Predict the class of every test row and compare it with the actual class,
# with "1" as the positive class
water_tree_preds <- predict(water_tree, newdata = water_test, type = "class")
confusionMatrix(
  data = water_tree_preds,
  reference = water_test$Potability,
  positive = "1"
)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   0   1
#>          0 360 104
#>          1  49 143
#>                                           
#>                Accuracy : 0.7668          
#>                  95% CI : (0.7325, 0.7986)
#>     No Information Rate : 0.6235          
#>     P-Value [Acc > NIR] : 3.153e-15       
#>                                           
#>                   Kappa : 0.4803          
#>                                           
#>  Mcnemar's Test P-Value : 1.268e-05       
#>                                           
#>             Sensitivity : 0.5789          
#>             Specificity : 0.8802          
#>          Pos Pred Value : 0.7448          
#>          Neg Pred Value : 0.7759          
#>              Prevalence : 0.3765          
#>          Detection Rate : 0.2180          
#>    Detection Prevalence : 0.2927          
#>       Balanced Accuracy : 0.7296          
#>                                           
#>        'Positive' Class : 1               
#> 

Random Forest

# Fit a random forest of 1000 trees for Potability on the training set
water_forest <- randomForest(
  Potability ~ .,
  data = water_train,
  ntree = 1000
)

# Predict the class of every test row and compare it with the actual class,
# with "1" as the positive class
water_forest_preds <- predict(water_forest, newdata = water_test, type = "class")
confusionMatrix(
  data = water_forest_preds,
  reference = water_test$Potability,
  positive = "1"
)
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   0   1
#>          0 354  82
#>          1  55 165
#>                                          
#>                Accuracy : 0.7912         
#>                  95% CI : (0.758, 0.8217)
#>     No Information Rate : 0.6235         
#>     P-Value [Acc > NIR] : < 2e-16        
#>                                          
#>                   Kappa : 0.5453         
#>                                          
#>  Mcnemar's Test P-Value : 0.02633        
#>                                          
#>             Sensitivity : 0.6680         
#>             Specificity : 0.8655         
#>          Pos Pred Value : 0.7500         
#>          Neg Pred Value : 0.8119         
#>              Prevalence : 0.3765         
#>          Detection Rate : 0.2515         
#>    Detection Prevalence : 0.3354         
#>       Balanced Accuracy : 0.7668         
#>                                          
#>        'Positive' Class : 1              
#> 

Conclusion

In my opinion, predicting non-drinkable water as drinkable (a false positive) is the most crucial error to avoid. Therefore, specificity is the most important metric, because a high specificity means many true negatives and few false positives. Comparing the models above, all of them perform almost the same on this metric, with a specificity above 85%.