getwd()
## [1] "C:/Users/LENOVO/Desktop/BTL XSTK"
setwd("C:/Users/LENOVO/Desktop/BTL XSTK")
dirty.data <- read.csv("C:/Users/LENOVO/Desktop/BTL XSTK/Dirty_data/water_potability.csv")
View(dirty.data)
str(dirty.data)
## 'data.frame':    3276 obs. of  10 variables:
##  $ ph             : num  NA 3.72 8.1 8.32 9.09 ...
##  $ Hardness       : num  205 129 224 214 181 ...
##  $ Solids         : num  20791 18630 19910 22018 17979 ...
##  $ Chloramines    : num  7.3 6.64 9.28 8.06 6.55 ...
##  $ Sulfate        : num  369 NA NA 357 310 ...
##  $ Conductivity   : num  564 593 419 363 398 ...
##  $ Organic_carbon : num  10.4 15.2 16.9 18.4 11.6 ...
##  $ Trihalomethanes: num  87 56.3 66.4 100.3 32 ...
##  $ Turbidity      : num  2.96 4.5 3.06 4.63 4.08 ...
##  $ Potability     : int  0 0 0 0 0 0 0 0 0 0 ...
colnames(dirty.data) <- c("ph","har","TDS","chl","sul","EC","TOC","THMs","NTU","P")
str(dirty.data)
## 'data.frame':    3276 obs. of  10 variables:
##  $ ph  : num  NA 3.72 8.1 8.32 9.09 ...
##  $ har : num  205 129 224 214 181 ...
##  $ TDS : num  20791 18630 19910 22018 17979 ...
##  $ chl : num  7.3 6.64 9.28 8.06 6.55 ...
##  $ sul : num  369 NA NA 357 310 ...
##  $ EC  : num  564 593 419 363 398 ...
##  $ TOC : num  10.4 15.2 16.9 18.4 11.6 ...
##  $ THMs: num  87 56.3 66.4 100.3 32 ...
##  $ NTU : num  2.96 4.5 3.06 4.63 4.08 ...
##  $ P   : int  0 0 0 0 0 0 0 0 0 0 ...
apply(is.na(dirty.data),2,sum)
##   ph  har  TDS  chl  sul   EC  TOC THMs  NTU    P 
##  491    0    0    0  781    0    0  162    0    0
listMissingColumns <- colnames(dirty.data)[apply(dirty.data, 2, anyNA)]
medianMissing <- apply(dirty.data[,colnames(dirty.data) %in% listMissingColumns],
                2, median, na.rm = TRUE)
medianMissing
##         ph        sul       THMs 
##   7.036752 333.073546  66.622485
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
clean.data <- dirty.data %>% mutate(    
              ph = ifelse(is.na(ph), medianMissing[1], ph),
              sul = ifelse(is.na(sul), medianMissing[2], sul),
              THMs = ifelse(is.na(THMs), medianMissing[3], THMs))
apply(is.na(clean.data),2,sum)
##   ph  har  TDS  chl  sul   EC  TOC THMs  NTU    P 
##    0    0    0    0    0    0    0    0    0    0
View(clean.data)

clean.data$P <- as.factor(clean.data$P)
summary(clean.data)
##        ph              har              TDS               chl        
##  Min.   : 0.000   Min.   : 47.43   Min.   :  320.9   Min.   : 0.352  
##  1st Qu.: 6.278   1st Qu.:176.85   1st Qu.:15666.7   1st Qu.: 6.127  
##  Median : 7.037   Median :196.97   Median :20927.8   Median : 7.130  
##  Mean   : 7.074   Mean   :196.37   Mean   :22014.1   Mean   : 7.122  
##  3rd Qu.: 7.870   3rd Qu.:216.67   3rd Qu.:27332.8   3rd Qu.: 8.115  
##  Max.   :14.000   Max.   :323.12   Max.   :61227.2   Max.   :13.127  
##       sul              EC             TOC             THMs        
##  Min.   :129.0   Min.   :181.5   Min.   : 2.20   Min.   :  0.738  
##  1st Qu.:317.1   1st Qu.:365.7   1st Qu.:12.07   1st Qu.: 56.648  
##  Median :333.1   Median :421.9   Median :14.22   Median : 66.622  
##  Mean   :333.6   Mean   :426.2   Mean   :14.28   Mean   : 66.407  
##  3rd Qu.:350.4   3rd Qu.:481.8   3rd Qu.:16.56   3rd Qu.: 76.667  
##  Max.   :481.0   Max.   :753.3   Max.   :28.30   Max.   :124.000  
##       NTU        P       
##  Min.   :1.450   0:1998  
##  1st Qu.:3.440   1:1278  
##  Median :3.955           
##  Mean   :3.967           
##  3rd Qu.:4.500           
##  Max.   :6.739
bar.P <- barplot(table(clean.data$P),main = "Thống kê số mẫu theo khả năng uống được",xlab = "Khả năng", ylab = "Số mẫu", col = c(1,2), ylim = c(0,2500))

hist(clean.data$ph, main = "Histogram của pH", xlab = "Độ pH", ylab = "Tần số", labels = T, col = 5, xlim = c(min(clean.data$ph),max(clean.data$ph)), ylim = c(0,1300))

hist(clean.data$har, main = "Histogram của độ cứng", xlab = "Độ cứng, mg/L", ylab = "Tần số", labels = T, col = 5, xlim = c(min(clean.data$har),max(clean.data$har)), ylim = c(0,900))

hist(clean.data$TDS, main = "Histogram của Solids (TDS)", xlab = "Solids, ppm", ylab = "Tần số", labels = T, col = 5, ylim = c(0,800))

hist(clean.data$chl, main = "Histogram của Chloramines", xlab = "Chloramines, ppm", ylab = "Tần số", labels = T, col = 5, xlim = c(min(clean.data$chl),max(clean.data$chl)),ylim = c(0,900))

hist(clean.data$sul, main = "Histogram của Sulfate", xlab = "Sulfate, mg/L", ylab = "Tần số", labels = T, col = 5,xlim = c(min(clean.data$sul),max(clean.data$sul)), ylim = c(0,1400))

hist(clean.data$EC, main = "Histogram của Tính dẫn điện", xlab = "Tính dẫn điện, micro S/cm", ylab = "Tần số", labels = T, col = 5,xlim = c(min(clean.data$EC),max(clean.data$EC)), ylim = c(0,800))

hist(clean.data$TOC, main = "Histogram của TOC", xlab = "TOC, ppm", ylab = "Tần số", labels = T, col = 5,xlim = c(min(clean.data$TOC),max(clean.data$TOC)), ylim = c(0,800))

hist(clean.data$THMs, main = "Histogram của THMs", xlab = "THMs, micro g/L", ylab = "Tần số", labels = T, col = 5,xlim = c(min(clean.data$THMs),max(clean.data$THMs)), ylim = c(0,1000))

hist(clean.data$NTU, main = "Histogram của Độ đục", xlab = "Độ đục, NTU", ylab = "Tần số", labels = T, col = 5,xlim = c(min(clean.data$NTU),max(clean.data$NTU)), ylim = c(0,900))

boxplot(clean.data$ph ~ clean.data$P, main = "Boxplot khả năng uống được theo pH", xlab = "Khả năng", ylab = "Độ pH", col = c(3,4))

boxplot(clean.data$har ~ clean.data$P, main = "Boxplot khả năng uống được theo độ cứng", xlab = "Khả năng", ylab = "Độ cứng, mg/L", col = c(3,4))

boxplot(clean.data$TDS ~ clean.data$P, main = "Boxplot khả năng uống được theo TDS", xlab = "Khả năng", ylab = "TDS, ppm", col = c(3,4))

boxplot(clean.data$chl ~ clean.data$P, main = "Boxplot khả năng uống được theo Chloramines", xlab = "Khả năng", ylab = "Chloramines, ppm", col = c(3,4))

boxplot(clean.data$sul ~ clean.data$P, main = "Boxplot khả năng uống được theo Sulfate", xlab = "Khả năng", ylab = "Sulfate, mg/L", col = c(3,4))

boxplot(clean.data$EC ~ clean.data$P, main = "Boxplot khả năng uống được theo độ dẫn điện", xlab = "Khả năng", ylab = "Độ dẫn điện, micro S/cm", col = c(3,4))

boxplot(clean.data$TOC ~ clean.data$P, main = "Boxplot khả năng uống được theo TOC", xlab = "Khả năng", ylab = "TOC, ppm", col = c(3,4))

boxplot(clean.data$THMs ~ clean.data$P, main = "Boxplot khả năng uống được theo THMs", xlab = "Khả năng", ylab = "THMs, micro g/L", col = c(3,4))

boxplot(clean.data$NTU ~ clean.data$P, main = "Boxplot khả năng uống được theo Độ đục", xlab = "Khả năng", ylab = "Độ đục, NTU", col = c(3,4))

round(cor(clean.data[,1:9]),3)
##          ph    har    TDS    chl    sul     EC    TOC   THMs    NTU
## ph    1.000  0.076 -0.082 -0.032  0.014  0.017  0.040  0.003 -0.036
## har   0.076  1.000 -0.047 -0.030 -0.093 -0.024  0.004 -0.013 -0.014
## TDS  -0.082 -0.047  1.000 -0.070 -0.150  0.014  0.010 -0.009  0.020
## chl  -0.032 -0.030 -0.070  1.000  0.024 -0.020 -0.013  0.017  0.002
## sul   0.014 -0.093 -0.150  0.024  1.000 -0.014  0.027 -0.026 -0.010
## EC    0.017 -0.024  0.014 -0.020 -0.014  1.000  0.021  0.001  0.006
## TOC   0.040  0.004  0.010 -0.013  0.027  0.021  1.000 -0.013 -0.027
## THMs  0.003 -0.013 -0.009  0.017 -0.026  0.001 -0.013  1.000 -0.021
## NTU  -0.036 -0.014  0.020  0.002 -0.010  0.006 -0.027 -0.021  1.000
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(clean.data[,1:9]), method = "number")

library(caTools)
set.seed(6)
samp = sample.split(clean.data$P, SplitRatio = 0.65)
xd = subset(clean.data, samp == TRUE)
kd = subset(clean.data, samp == FALSE)
build <- glm(P ~., data = xd, family = binomial)
mh <- step(build)
## Start:  AIC=2855.73
## P ~ ph + har + TDS + chl + sul + EC + TOC + THMs + NTU
## 
##        Df Deviance    AIC
## - ph    1   2835.7 2853.7
## - NTU   1   2835.9 2853.9
## - sul   1   2835.9 2853.9
## - THMs  1   2836.0 2854.0
## - har   1   2836.3 2854.3
## - EC    1   2836.6 2854.6
## <none>      2835.7 2855.7
## - TOC   1   2839.1 2857.1
## - chl   1   2839.1 2857.1
## - TDS   1   2840.6 2858.6
## 
## Step:  AIC=2853.73
## P ~ har + TDS + chl + sul + EC + TOC + THMs + NTU
## 
##        Df Deviance    AIC
## - NTU   1   2835.9 2851.9
## - sul   1   2835.9 2851.9
## - THMs  1   2836.0 2852.0
## - har   1   2836.3 2852.3
## - EC    1   2836.6 2852.6
## <none>      2835.7 2853.7
## - TOC   1   2839.1 2855.1
## - chl   1   2839.1 2855.1
## - TDS   1   2840.6 2856.6
## 
## Step:  AIC=2851.88
## P ~ har + TDS + chl + sul + EC + TOC + THMs
## 
##        Df Deviance    AIC
## - sul   1   2836.0 2850.0
## - THMs  1   2836.2 2850.2
## - har   1   2836.5 2850.5
## - EC    1   2836.7 2850.7
## <none>      2835.9 2851.9
## - chl   1   2839.2 2853.2
## - TOC   1   2839.2 2853.2
## - TDS   1   2840.8 2854.8
## 
## Step:  AIC=2850.03
## P ~ har + TDS + chl + EC + TOC + THMs
## 
##        Df Deviance    AIC
## - THMs  1   2836.3 2848.3
## - har   1   2836.6 2848.6
## - EC    1   2836.9 2848.9
## <none>      2836.0 2850.0
## - chl   1   2839.3 2851.3
## - TOC   1   2839.4 2851.4
## - TDS   1   2841.3 2853.3
## 
## Step:  AIC=2848.32
## P ~ har + TDS + chl + EC + TOC
## 
##        Df Deviance    AIC
## - har   1   2836.9 2846.9
## - EC    1   2837.2 2847.2
## <none>      2836.3 2848.3
## - chl   1   2839.6 2849.6
## - TOC   1   2839.7 2849.7
## - TDS   1   2841.7 2851.7
## 
## Step:  AIC=2846.86
## P ~ TDS + chl + EC + TOC
## 
##        Df Deviance    AIC
## - EC    1   2837.8 2845.8
## <none>      2836.9 2846.9
## - chl   1   2840.2 2848.2
## - TOC   1   2840.2 2848.2
## - TDS   1   2842.3 2850.3
## 
## Step:  AIC=2845.76
## P ~ TDS + chl + TOC
## 
##        Df Deviance    AIC
## <none>      2837.8 2845.8
## - chl   1   2841.0 2847.0
## - TOC   1   2841.1 2847.1
## - TDS   1   2843.2 2849.2
summary(mh)
## 
## Call:
## glm(formula = P ~ TDS + chl + TOC, family = binomial, data = xd)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.298e-01  3.073e-01  -2.375   0.0176 *
## TDS          1.190e-05  5.122e-06   2.322   0.0202 *
## chl          5.119e-02  2.837e-02   1.804   0.0712 .
## TOC         -2.412e-02  1.323e-02  -1.823   0.0682 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2849.1  on 2129  degrees of freedom
## Residual deviance: 2837.8  on 2126  degrees of freedom
## AIC: 2845.8
## 
## Number of Fisher Scoring iterations: 4
confint(mh, level = 0.95)
## Waiting for profiling to be done...
##                     2.5 %        97.5 %
## (Intercept) -1.334026e+00 -0.1287788356
## TDS          1.856755e-06  0.0000219461
## chl         -4.348774e-03  0.1069342458
## TOC         -5.010180e-02  0.0017729512
exp(cbind(OR = coef(mh), confint(mh, level = 0.95)))
## Waiting for profiling to be done...
##                    OR     2.5 %    97.5 %
## (Intercept) 0.4820069 0.2634147 0.8791684
## TDS         1.0000119 1.0000019 1.0000219
## chl         1.0525225 0.9956607 1.1128611
## TOC         0.9761690 0.9511326 1.0017745
pred <- predict(mh, newdata = kd, type = "response",)
kd$pred <- round(pred)
head(kd, 10)
##           ph      har      TDS       chl      sul       EC       TOC     THMs
## 2   3.716080 129.4229 18630.06  6.635246 333.0735 592.8854 15.180013 56.32908
## 5   9.092223 181.1015 17978.99  6.546600 310.1357 398.4108 11.558279 31.99799
## 6   5.584087 188.3133 28748.69  7.544869 326.6784 280.4679  8.399735 54.91786
## 7  10.223862 248.0717 28749.72  7.513408 393.6634 283.6516 13.789695 84.60356
## 8   8.635849 203.3615 13672.09  4.563009 303.3098 474.6076 12.363817 62.79831
## 12  7.974522 218.6933 18767.66  8.110385 333.0735 364.0982 14.525746 76.48591
## 15  7.496232 205.3450 28388.00  5.072558 333.0735 444.6454 13.228311 70.30021
## 18  9.181560 273.8138 24041.33  6.904990 398.3505 477.9746 13.387341 71.45736
## 20  7.371050 214.4966 25630.32  4.432669 335.7544 469.9146 12.509164 62.79728
## 21  7.036752 227.4350 22305.57 10.333918 333.0735 554.8201 16.331693 45.38282
##         NTU P pred
## 2  4.500656 0    0
## 5  4.075075 0    0
## 6  2.559708 0    0
## 7  2.672989 0    0
## 8  4.401425 0    0
## 12 4.011718 0    0
## 15 4.777382 0    0
## 18 4.503661 0    0
## 20 2.560299 0    0
## 21 4.133423 0    0
quansat <- table(kd$P)
dudoan <- table(kd$pred)
rbind(quansat, dudoan)
##            0   1
## quansat  699 447
## dudoan  1139   7
table(kd$P, kd$pred)
##    
##       0   1
##   0 696   3
##   1 443   4
dcx <- (table(kd$P, kd$pred)[1,1]+table(kd$P, kd$pred)[2,2])/nrow(kd)
dcx
## [1] 0.6108202
dn <- table(kd$P, kd$pred)[2,2]/(table(kd$P, kd$pred)[2,1]+table(kd$P, kd$pred)[2,2])
dn
## [1] 0.008948546
ddh <- table(kd$P, kd$pred)[1,1]/(table(kd$P, kd$pred)[1,1]+table(kd$P, kd$pred)[1,2])
ddh
## [1] 0.9957082
library(ROCR)
ROCpred <- prediction(pred, kd$P)
ROCperf <- performance(ROCpred, "tpr", "fpr")
plot(ROCperf)

library(Metrics)
auc(kd$P, kd$pred)
## [1] 0.5023284
anv.TDS <- aov(clean.data$TDS ~ clean.data$P)
summary(anv.TDS)
##                Df    Sum Sq   Mean Sq F value Pr(>F)  
## clean.data$P    1 2.867e+08 286711019   3.732 0.0535 .
## Residuals    3274 2.515e+11  76823747                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anv.chl <- aov(clean.data$chl ~ clean.data$P)
summary(anv.chl)
##                Df Sum Sq Mean Sq F value Pr(>F)
## clean.data$P    1      5   4.641   1.852  0.174
## Residuals    3274   8203   2.506
anv.TOC <- aov(clean.data$TOC ~ clean.data$P)
summary(anv.TOC)
##                Df Sum Sq Mean Sq F value Pr(>F)  
## clean.data$P    1     32   32.26    2.95  0.086 .
## Residuals    3274  35809   10.94                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
clean.data$ph <- as.factor(clean.data$ph)

anv2w.TDS <- aov(clean.data$TDS ~ clean.data$P + clean.data$ph)
summary(anv2w.TDS)
##                 Df    Sum Sq   Mean Sq F value Pr(>F)  
## clean.data$P     1 2.867e+08 286711019   3.506 0.0617 .
## clean.data$ph 2784 2.115e+11  75952877   0.929 0.8623  
## Residuals      490 4.007e+10  81771710                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anv2w.chl <- aov(clean.data$chl ~ clean.data$P + clean.data$ph)
summary(anv2w.chl)
##                 Df Sum Sq Mean Sq F value Pr(>F)
## clean.data$P     1      5   4.641   1.946  0.164
## clean.data$ph 2784   7034   2.527   1.059  0.209
## Residuals      490   1169   2.385
anv2w.TOC <- aov(clean.data$TOC ~ clean.data$P + clean.data$ph)
summary(anv2w.TOC)
##                 Df Sum Sq Mean Sq F value Pr(>F)  
## clean.data$P     1     32   32.26   2.916 0.0883 .
## clean.data$ph 2784  30389   10.92   0.987 0.5825  
## Residuals      490   5420   11.06                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1