##### LOADING AND PREPPING DATA #####
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'readr' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'forcats' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.1
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(class)
## Warning: package 'class' was built under R version 4.3.1
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.1
water_raw <- read.csv("C:\\Users\\abhil\\Downloads\\archive (6)\\water_potability.csv")
water_raw$Potability <- as.factor(water_raw$Potability)

# removing rows with an NA in any variable
water_clean <- water_raw[rowSums(is.na(water_raw))==0,] 
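An equivalent, more idiomatic one-liner with tidyr (already attached via the tidyverse) would be:

water_clean <- drop_na(water_raw)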

Training/testing split: .25 of the data is held out for testing; the remaining .75 is split again, with .75 of it for training and .25 for tuning.

set.seed(489)
testing_ratio <- .25

water_clean_test <- water_clean[sample(1:nrow(water_clean),
                                       size = ceiling(nrow(water_clean) * testing_ratio)), ]

water_clean_mod <- initial_split(anti_join(water_clean,water_clean_test), prop = .75)
## Joining with `by = join_by(ph, Hardness, Solids, Chloramines, Sulfate,
## Conductivity, Organic_carbon, Trihalomethanes, Turbidity, Potability)`
#train
water_clean_train <- training(water_clean_mod)
#tune
water_clean_tune <- testing(water_clean_mod)
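
As a quick sanity check (a small sketch, not part of the original output), the partition sizes should come out near 25% test, 56% train, and 19% tune; note that anti_join also drops exact-duplicate rows, so these are approximate:

# approximate proportions of each split relative to the cleaned data
nrow(water_clean_test) / nrow(water_clean)    # ~ .25
nrow(water_clean_train) / nrow(water_clean)   # ~ .75 * .75 = .5625
nrow(water_clean_tune) / nrow(water_clean)    # ~ .75 * .25 = .1875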
OVERALL EXPLORATORY DATA ANALYSIS

Taking a quick look at the variables and their distributions.

summary(water_clean_train)
##        ph             Hardness          Solids         Chloramines    
##  Min.   : 0.2275   Min.   : 73.49   Min.   :  320.9   Min.   : 1.920  
##  1st Qu.: 6.0256   1st Qu.:176.80   1st Qu.:15547.8   1st Qu.: 6.103  
##  Median : 6.9898   Median :197.56   Median :20769.5   Median : 7.121  
##  Mean   : 7.0518   Mean   :196.15   Mean   :21784.5   Mean   : 7.128  
##  3rd Qu.: 8.0238   3rd Qu.:216.75   3rd Qu.:27067.5   3rd Qu.: 8.115  
##  Max.   :14.0000   Max.   :317.34   Max.   :56351.4   Max.   :13.127  
##     Sulfate       Conductivity   Organic_carbon  Trihalomethanes  
##  Min.   :182.4   Min.   :201.6   Min.   : 2.20   Min.   :  8.577  
##  1st Qu.:306.5   1st Qu.:364.7   1st Qu.:12.12   1st Qu.: 56.037  
##  Median :332.8   Median :422.0   Median :14.32   Median : 66.376  
##  Mean   :333.4   Mean   :426.5   Mean   :14.35   Mean   : 66.668  
##  3rd Qu.:361.0   3rd Qu.:482.5   3rd Qu.:16.68   3rd Qu.: 77.697  
##  Max.   :475.7   Max.   :753.3   Max.   :24.76   Max.   :120.030  
##    Turbidity     Potability
##  Min.   :1.450   0:672     
##  1st Qu.:3.442   1:459     
##  Median :3.965             
##  Mean   :3.974             
##  3rd Qu.:4.511             
##  Max.   :6.495
dim(water_clean_train)
## [1] 1131   10
plot(water_clean_train)

# taking a look at the histograms with facet wrapping

melt(water_clean_train) %>% ggplot(aes(value)) + 
  geom_histogram() + 
  facet_wrap(~ variable, scales="free")
## Using Potability as id variables
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# boxplots overlaid with violin plots, facet-wrapped by variable

melt(water_clean_train) %>% ggplot(aes(variable,value)) + 
  geom_boxplot(alpha = .5, fill = "purple") + 
  geom_violin(alpha = .5, fill = "yellow") + 
  facet_wrap( ~ variable, scales="free")
## Using Potability as id variables

# this shows us Conductivity and Solids are skewed whereas the others
# seem more bell-like. Perhaps some variables are related? Let's plot the correlation map.

cor_mat <- water_clean_train %>% select(-Potability) %>% cor() 

Nope, no association is immediately apparent:

melt(cor_mat) %>% ggplot(aes(x = Var1, y = Var2, fill = value)) + 
  geom_tile() + 
  scale_fill_gradient(low = "white", high = "red")

# outliers - taking a look at particularly unusual observations

We will tag outliers with a simple standard-deviation rule (anything more than 2.8 standard deviations from the mean), which is reasonable given that most of these distributions look bell-shaped.

outlier_lst <- list()
for (n in 1:(ncol(water_clean_train) - 1)){
 
  # upper outliers: more than 2.8 standard deviations above the mean
 
  upper <- which(water_clean_train[,n] > mean(water_clean_train[,n]) + 2.8*sd(water_clean_train[,n]))
   
  # lower outliers: more than 2.8 standard deviations below the mean
   
  lower <- which(water_clean_train[,n] < mean(water_clean_train[,n]) - 2.8*sd(water_clean_train[,n]))
  outliers <- c(upper, lower)
  outlier_lst[[colnames(water_clean_train)[n]]] <- outliers
}

print(outlier_lst)
## $ph
##  [1]  244  265  785  857    6  569  624  771 1034 1079
## 
## $Hardness
## [1]   84   87  266  565  629  814  840  857 1079
## 
## $Solids
## [1]  276  564  589  771  813  820  861 1063
## 
## $Chloramines
## [1]   60  483 1093   64  505  843 1089
## 
## $Sulfate
## [1]  242  355  708  993  605  774  828 1093 1128
## 
## $Conductivity
## [1] 387 487 524 886 955
## 
## $Organic_carbon
## [1]   68  299  866 1114
## 
## $Trihalomethanes
##  [1] 336 548 645 740 895 186 371 403 408 763
## 
## $Turbidity
## [1]  581  714  901 1049  130  300  933

Finding the observations that are outliers in every variable:

Reduce(intersect, outlier_lst)
## integer(0)

None! So let's count how many variables each flagged observation is an outlier in:

table(unlist(outlier_lst))
## 
##    6   60   64   68   84   87  130  186  242  244  265  266  276  299  300  336 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  355  371  387  403  408  483  487  505  524  548  564  565  569  581  589  605 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  624  629  645  708  714  740  763  771  774  785  813  814  820  828  840  843 
##    1    1    1    1    1    1    1    2    1    1    1    1    1    1    1    1 
##  857  861  866  886  895  901  933  955  993 1034 1049 1063 1079 1089 1093 1114 
##    2    1    1    1    1    1    1    1    1    1    1    1    2    1    2    1 
## 1128 
##    1
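
From the table, rows 771, 857, 1079, and 1093 are each flagged in two variables; to pull those out programmatically:

# rows flagged as outliers in more than one variable
as.integer(names(which(table(unlist(outlier_lst)) > 1)))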
POINTED EXPLORATORY DATA ANALYSIS

Taking a look at how the distributions differ between potable and non-potable samples.

overlaid histograms

melt(water_clean_train) %>% ggplot(aes(value, fill = Potability)) + 
  geom_histogram(alpha = .5) + 
  facet_wrap(~ variable, scales="free")
## Using Potability as id variables
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# the difference is probably due to the count imbalance between potable and
# non-potable samples; let's confirm with density plots, which normalize counts

overlaid densities

melt(water_clean_train) %>% ggplot(aes(value, fill = Potability)) + 
  geom_density(alpha = .5) + 
  facet_wrap(~ variable, scales="free")
## Using Potability as id variables

# distributions via violin plots and boxplots broken down by potability

melt(water_clean_train) %>% ggplot(aes(variable,value, fill = Potability)) + 
  geom_boxplot(alpha = .5) + 
  geom_violin(alpha = .5) + 
  facet_wrap( ~ variable, scales="free")
## Using Potability as id variables

outlier_lst_potable <- list()
water_clean_train_potable <- water_clean_train %>% filter(Potability == 1)
for (n in 1:(ncol(water_clean_train_potable) - 1)){
  # upper outliers: more than 2.8 standard deviations above the mean
  
  upper <- which(water_clean_train_potable[,n] > mean(water_clean_train_potable[,n]) + 2.8*sd(water_clean_train_potable[,n]))

  # lower outliers: more than 2.8 standard deviations below the mean
  
  lower <- which(water_clean_train_potable[,n] < mean(water_clean_train_potable[,n]) - 2.8*sd(water_clean_train_potable[,n]))
  outliers <- c(upper, lower)
  outlier_lst_potable[[colnames(water_clean_train_potable)[n]]] <- outliers
}

print(outlier_lst_potable)
## $ph
## [1]  39  99 349 394 419 440
## 
## $Hardness
## [1]  36  39 100 221 254 349
## 
## $Solids
## [1] 220 430
## 
## $Chloramines
## [1]  28 445 346
## 
## $Sulfate
## [1] 403 341 445
## 
## $Conductivity
## [1] 390
## 
## $Organic_carbon
## [1] 159 453
## 
## $Trihalomethanes
## [1] 263  75 156
## 
## $Turbidity
## [1] 369 424 114
outlier_lst_non_potable <- list()
water_clean_train_non_potable <- water_clean_train %>% filter(Potability == 0)
for (n in 1:(ncol(water_clean_train_non_potable) - 1)){
  # upper outliers: more than 2.8 standard deviations above the mean
  upper <- which(water_clean_train_non_potable[,n] > mean(water_clean_train_non_potable[,n]) + 2.8*sd(water_clean_train_non_potable[,n]))

  # lower outliers: more than 2.8 standard deviations below the mean

  lower <- which(water_clean_train_non_potable[,n] < mean(water_clean_train_non_potable[,n]) - 2.8*sd(water_clean_train_non_potable[,n]))
  outliers <- c(upper, lower)
  outlier_lst_non_potable[[colnames(water_clean_train_non_potable)[n]]] <- outliers
}

print(outlier_lst_non_potable)

The spread of outliers does not look very informative, so we will set outlier handling aside for now in favor of building models; we can then return to see what effect these supposed outliers have on the models we build.

We would need a principled reason to remove the outliers, which we do not quite have; instead, we can test their effect on the eventual models and note it.
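
When we do return to it, a minimal sketch of that check (hypothetical, not run here) would be to refit on the training data minus every flagged row and compare tuning-set metrics:

# drop every row flagged as an outlier in at least one variable
flagged <- unique(unlist(outlier_lst))
water_train_no_out <- water_clean_train[-flagged, ]
# refit any of the models below on the reduced data, e.g.:
# log_model_no_out <- glm(Potability ~ ., family = "binomial",
#                         data = water_train_no_out)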

MODEL BUILDING

I am following best practice and doing EDA only on the training split, and will build the models off of that knowledge (or lack thereof). In short, applying EDA to the test data leaks information into modeling decisions and should be avoided.

We will test three models: logistic regression, a decision tree, and a random forest.

Logistic Regression

log_model <- glm(Potability ~ ., family = "binomial", data = water_clean_train)
summary(log_model)
## 
## Call:
## glm(formula = Potability ~ ., family = "binomial", data = water_clean_train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)     -7.143e-01  9.790e-01  -0.730    0.466
## ph               3.911e-02  3.906e-02   1.001    0.317
## Hardness         4.195e-04  1.857e-03   0.226    0.821
## Solids           3.851e-06  7.149e-06   0.539    0.590
## Chloramines      2.456e-02  3.861e-02   0.636    0.525
## Sulfate          2.072e-04  1.485e-03   0.140    0.889
## Conductivity    -1.086e-03  7.520e-04  -1.444    0.149
## Organic_carbon  -2.057e-02  1.812e-02  -1.135    0.256
## Trihalomethanes  2.569e-04  3.713e-03   0.069    0.945
## Turbidity        9.714e-02  7.807e-02   1.244    0.213
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1527.5  on 1130  degrees of freedom
## Residual deviance: 1521.0  on 1121  degrees of freedom
## AIC: 1541
## 
## Number of Fisher Scoring iterations: 4

tuning performance

predict() on a glm returns log-odds by default, so plogis() converts them into the raw probability of being a 1

log_probs <- plogis(predict(log_model, water_clean_tune))
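
Equivalently, predict() can return the probabilities directly by setting type = "response", skipping the manual plogis() step:

log_probs <- predict(log_model, water_clean_tune, type = "response")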

converting probs to classes

log_preds <- factor(ifelse(log_probs > .5, 1, 0))
confusionMatrix(log_preds, water_clean_tune$Potability, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 229 144
##          1   1   3
##                                           
##                Accuracy : 0.6154          
##                  95% CI : (0.5642, 0.6647)
##     No Information Rate : 0.6101          
##     P-Value [Acc > NIR] : 0.4386          
##                                           
##                   Kappa : 0.0195          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.020408        
##             Specificity : 0.995652        
##          Pos Pred Value : 0.750000        
##          Neg Pred Value : 0.613941        
##              Prevalence : 0.389920        
##          Detection Rate : 0.007958        
##    Detection Prevalence : 0.010610        
##       Balanced Accuracy : 0.508030        
##                                           
##        'Positive' Class : 1               
## 

testing performance

again converting the log-odds to raw probabilities of being a 1

log_probs <- plogis(predict(log_model, water_clean_test))

converting probs to classes

log_preds <- factor(ifelse(log_probs > .5, 1, 0))
confusionMatrix(log_preds, water_clean_test$Potability, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 295 203
##          1   3   2
##                                           
##                Accuracy : 0.5905          
##                  95% CI : (0.5461, 0.6338)
##     No Information Rate : 0.5924          
##     P-Value [Acc > NIR] : 0.5552          
##                                           
##                   Kappa : -4e-04          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.009756        
##             Specificity : 0.989933        
##          Pos Pred Value : 0.400000        
##          Neg Pred Value : 0.592369        
##              Prevalence : 0.407555        
##          Detection Rate : 0.003976        
##    Detection Prevalence : 0.009940        
##       Balanced Accuracy : 0.499844        
##                                           
##        'Positive' Class : 1               
## 

Decision Tree

tree_model <- rpart(Potability ~ ., method = "class", data = water_clean_train)
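
The tree above uses rpart's defaults. If we wanted to control tree size, a minimal sketch (not run here) would prune on the complexity parameter using rpart's built-in cross-validation:

printcp(tree_model)  # cross-validated error at each complexity parameter
# prune to the cp with the lowest cross-validated error
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
tree_pruned <- prune(tree_model, cp = best_cp)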

tuning performance

tree_preds <- predict(tree_model, water_clean_tune, type = "class")
confusionMatrix(tree_preds, water_clean_tune$Potability, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 185  84
##          1  45  63
##                                           
##                Accuracy : 0.6578          
##                  95% CI : (0.6075, 0.7056)
##     No Information Rate : 0.6101          
##     P-Value [Acc > NIR] : 0.0315460       
##                                           
##                   Kappa : 0.2446          
##                                           
##  Mcnemar's Test P-Value : 0.0008207       
##                                           
##             Sensitivity : 0.4286          
##             Specificity : 0.8043          
##          Pos Pred Value : 0.5833          
##          Neg Pred Value : 0.6877          
##              Prevalence : 0.3899          
##          Detection Rate : 0.1671          
##    Detection Prevalence : 0.2865          
##       Balanced Accuracy : 0.6165          
##                                           
##        'Positive' Class : 1               
## 

testing performance

tree_preds <- predict(tree_model, water_clean_test, type = "class")
confusionMatrix(tree_preds, water_clean_test$Potability, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 242 130
##          1  56  75
##                                           
##                Accuracy : 0.6302          
##                  95% CI : (0.5864, 0.6725)
##     No Information Rate : 0.5924          
##     P-Value [Acc > NIR] : 0.04605         
##                                           
##                   Kappa : 0.1886          
##                                           
##  Mcnemar's Test P-Value : 8.669e-08       
##                                           
##             Sensitivity : 0.3659          
##             Specificity : 0.8121          
##          Pos Pred Value : 0.5725          
##          Neg Pred Value : 0.6505          
##              Prevalence : 0.4076          
##          Detection Rate : 0.1491          
##    Detection Prevalence : 0.2604          
##       Balanced Accuracy : 0.5890          
##                                           
##        'Positive' Class : 1               
## 

Random Forest

forest_model <- randomForest(Potability ~ ., data= water_clean_train, ntree= 1000)
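
We keep randomForest's default mtry here; if we wanted to tune it against out-of-bag error with the package's built-in helper, a sketch (not run here) would be:

# grid-search mtry by OOB error; doBest = TRUE refits with the best value
forest_tuned <- tuneRF(x = select(water_clean_train, -Potability),
                       y = water_clean_train$Potability,
                       ntreeTry = 500, stepFactor = 1.5,
                       improve = 0.01, doBest = TRUE)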

tuning performance

forest_preds <- predict(forest_model, water_clean_tune, type = "class")
confusionMatrix(forest_preds, water_clean_tune$Potability, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 198  84
##          1  32  63
##                                          
##                Accuracy : 0.6923         
##                  95% CI : (0.643, 0.7386)
##     No Information Rate : 0.6101         
##     P-Value [Acc > NIR] : 0.0005462      
##                                          
##                   Kappa : 0.3092         
##                                          
##  Mcnemar's Test P-Value : 2.188e-06      
##                                          
##             Sensitivity : 0.4286         
##             Specificity : 0.8609         
##          Pos Pred Value : 0.6632         
##          Neg Pred Value : 0.7021         
##              Prevalence : 0.3899         
##          Detection Rate : 0.1671         
##    Detection Prevalence : 0.2520         
##       Balanced Accuracy : 0.6447         
##                                          
##        'Positive' Class : 1              
## 

testing performance

forest_preds <- predict(forest_model, water_clean_test, type = "class")
confusionMatrix(forest_preds, water_clean_test$Potability, positive='1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 242 131
##          1  56  74
##                                           
##                Accuracy : 0.6282          
##                  95% CI : (0.5843, 0.6706)
##     No Information Rate : 0.5924          
##     P-Value [Acc > NIR] : 0.05562         
##                                           
##                   Kappa : 0.1835          
##                                           
##  Mcnemar's Test P-Value : 6.253e-08       
##                                           
##             Sensitivity : 0.3610          
##             Specificity : 0.8121          
##          Pos Pred Value : 0.5692          
##          Neg Pred Value : 0.6488          
##              Prevalence : 0.4076          
##          Detection Rate : 0.1471          
##    Detection Prevalence : 0.2584          
##       Balanced Accuracy : 0.5865          
##                                           
##        'Positive' Class : 1               
## 

Finally, we could also compare the models with ROC curves and AUC.
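
A minimal sketch for the random forest on the test split, assuming the pROC package is installed (it is not loaded above); the same pattern works for the other two models:

library(pROC)
# class-1 probabilities rather than hard class predictions
forest_probs <- predict(forest_model, water_clean_test, type = "prob")[, "1"]
roc_obj <- roc(response = water_clean_test$Potability, predictor = forest_probs)
auc(roc_obj)   # area under the ROC curve
plot(roc_obj)  # the ROC curve itself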