#loading libraries

library(readr)
library(ggplot2)
library(shiny)
library(shinydashboard)
## 
## Attaching package: 'shinydashboard'
## The following object is masked from 'package:graphics':
## 
##     box
library(shinythemes)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(viridisLite)
library(RColorBrewer)# for color palettes
library(caret)
## Loading required package: lattice
library(rpart)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(C50)
## Warning: package 'C50' was built under R version 4.5.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(scales) 
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(DT)
## 
## Attaching package: 'DT'
## The following objects are masked from 'package:shiny':
## 
##     dataTableOutput, renderDataTable

##objectives The aim of this project is to look at various ways of accurately predicting the measurable minerals used in the treatment of drinking water. Drinking-water samples were tested and recorded for 10 minerals.

“Calcium.CA” “magnesium.Mg” “Sodium.Na” “Potassium”
“Sulfates” “Chlorides” “Nitrates” “Nitrites” “dry.residues” “Bicarbonates”

##Task We developed a classification model to identify the ideal mineral composition for water treatment using both WHO standards and data-driven approaches. While WHO guidelines served as the foundational benchmark, the decision tree and random forest models provided refined thresholds tailored to the dataset.

We shall compare the records with WHO standards to determine whether each mineral element is present at the recommended amount or in excess in our drinking water. We shall then train our models to help us easily predict the recommended amount of each mineral.

#we note that Collection, treatment, storage and distribution of drinking-water involve deliberate additions of numerous chemicals to improve the safety and quality of the finished drinking-water for consumers (direct additives). In addition, water is in constant contact with pipes, valves, taps and tank surfaces, all of which have the potential to impart additional chemicals to the water (indirect additives).

#loading of dataset

water <- read.csv("Bottled water brands.csv")

#checking of dataset structures

names(water)
##  [1] "Brand"        "Calcium.CA"   "magnesium.Mg" "Sodium.Na"    "Potassium"   
##  [6] "Sulfates"     "Chlorides"    "Nitrates"     "Nitrites"     "dry.residues"
## [11] "Fluor"        "Bicarbonates" "Silicas"      "Province"
head(water)
##        Brand Calcium.CA magnesium.Mg Sodium.Na Potassium Sulfates Chlorides
## 1       arwa     120.00        23.00      56.0       1.0      104       100
## 2      Aures     100.00        59.00      71.0       3.0      348        86
## 3      AYRIS      65.60         6.80      28.5       1.9       75        37
## 4   BESBASSA      54.16         2.64       5.0       2.0        4        10
## 5    Bouglez       4.60         3.75      29.0       1.0       10        30
## 6 El kantara      90.00        37.00      36.0       3.0      162        59
##   Nitrates Nitrites dry.residues Fluor Bicarbonates Silicas Province
## 1    46.50     0.01          450    NA       256.00      NA    SETIF
## 2     1.08     0.01          770    NA       224.00      NA   BISKRA
## 3     2.70     0.01          276    NA       234.24      NA         
## 4     9.00     0.01          206    NA       164.70      NA   GUELMA
## 5     9.00     0.06          140    NA           NA      NA   BEJAIA
## 6     9.60     0.01          636    NA       247.00      NA   BISKRA
str(water)
## 'data.frame':    20 obs. of  14 variables:
##  $ Brand       : chr  "arwa" "Aures" "AYRIS" "BESBASSA" ...
##  $ Calcium.CA  : num  120 100 65.6 54.2 4.6 ...
##  $ magnesium.Mg: num  23 59 6.8 2.64 3.75 37 37 24 5 31 ...
##  $ Sodium.Na   : num  56 71 28.5 5 29 36 29 15.8 3.1 68 ...
##  $ Potassium   : num  1 3 1.9 2 1 3 2 2.1 0.4 4 ...
##  $ Sulfates    : num  104 348 75 4 10 162 95 68 3 153 ...
##  $ Chlorides   : num  100 86 37 10 30 59 40 72 7 84 ...
##  $ Nitrates    : num  46.5 1.08 2.7 9 9 9.6 4.5 15 5.94 8.9 ...
##  $ Nitrites    : num  0.01 0.01 0.01 0.01 0.06 0.01 0.01 0.02 0.03 0.02 ...
##  $ dry.residues: int  450 770 276 206 140 636 564 380 178 725 ...
##  $ Fluor       : num  NA NA NA NA NA NA NA NA 0.11 1.05 ...
##  $ Bicarbonates: num  256 224 234 165 NA ...
##  $ Silicas     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Province    : chr  "SETIF" "BISKRA" "" "GUELMA" ...

##The dataset has 14 recorded variables and 20 observations (rows). Of the 14 variables, 2 (Province and Brand) are categorical, while the other observed variables are numerical.

#descriptive statistics

summary(water)
##     Brand             Calcium.CA      magnesium.Mg     Sodium.Na    
##  Length:20          Min.   :  4.60   Min.   : 2.64   Min.   : 3.10  
##  Class :character   1st Qu.: 62.95   1st Qu.:14.38   1st Qu.:15.20  
##  Mode  :character   Median : 79.50   Median :25.25   Median :29.50  
##                     Mean   : 78.29   Mean   :25.46   Mean   :34.34  
##                     3rd Qu.: 94.50   3rd Qu.:37.00   3rd Qu.:56.50  
##                     Max.   :136.00   Max.   :59.00   Max.   :71.00  
##                                                                     
##    Potassium        Sulfates        Chlorides         Nitrates    
##  Min.   :0.400   Min.   :  3.00   Min.   :  7.00   Min.   : 1.08  
##  1st Qu.:1.000   1st Qu.: 35.10   1st Qu.: 28.93   1st Qu.: 4.05  
##  Median :2.000   Median : 75.00   Median : 43.50   Median : 8.95  
##  Mean   :1.873   Mean   : 96.49   Mean   : 46.45   Mean   :10.42  
##  3rd Qu.:2.025   3rd Qu.:153.75   3rd Qu.: 62.25   3rd Qu.:12.75  
##  Max.   :4.650   Max.   :348.00   Max.   :100.00   Max.   :46.50  
##                                                                   
##     Nitrites       dry.residues       Fluor         Bicarbonates  
##  Min.   :0.0000   Min.   :140.0   Min.   :0.1100   Min.   :164.7  
##  1st Qu.:0.0075   1st Qu.:282.8   1st Qu.:0.2200   1st Qu.:219.5  
##  Median :0.0100   Median :445.5   Median :0.3300   Median :258.5  
##  Mean   :0.0120   Mean   :468.4   Mean   :0.4967   Mean   :265.9  
##  3rd Qu.:0.0100   3rd Qu.:642.0   3rd Qu.:0.6900   3rd Qu.:272.5  
##  Max.   :0.0600   Max.   :953.0   Max.   :1.0500   Max.   :458.0  
##                                   NA's   :17       NA's   :2      
##     Silicas         Province        
##  Min.   : 2.330   Length:20         
##  1st Qu.: 7.165   Class :character  
##  Median :12.000   Mode  :character  
##  Mean   : 8.810                     
##  3rd Qu.:12.050                     
##  Max.   :12.100                     
##  NA's   :17

##Based on the statistical summary of the dataset, dry residues (Total Dissolved Solids: represents overall mineral content) emerged as the most influential parameter due to its wide variability (140-953) across samples.

##Sulfates (very high variability, 3-348; affects taste and health at high levels) and calcium also showed significant variation, indicating their strong contribution to overall water composition. However, from a health perspective, nitrates remain the most critical parameter, as elevated levels pose potential health risks despite lower variability: the maximum of 46.50 is very close to the WHO guideline value for nitrate (50 mg/L). Calcium content is important in water treatment as it contributes to hardness.

#checking outliers

# Side-by-side boxplots of the ten mineral measurements, used to eyeball
# outliers in each column.
boxplot(
  water[c(
    "Calcium.CA", "magnesium.Mg", "Sodium.Na", "Potassium", "Sulfates",
    "Chlorides", "Nitrates", "Nitrites", "dry.residues", "Bicarbonates"
  )],
  ylab = "Concentration",
  las = 2,  # draw axis labels perpendicular to the axis so the names fit
  col = "lightblue",
  main = "Boxplot of Water Minerals"
)

#Data Preprocessing and Transformation The dataset was checked for missing values, inconsistencies, and duplicates. Cleaning ensures that the analysis results are accurate.

#identify missing values

colSums(is.na(water))
##        Brand   Calcium.CA magnesium.Mg    Sodium.Na    Potassium     Sulfates 
##            0            0            0            0            0            0 
##    Chlorides     Nitrates     Nitrites dry.residues        Fluor Bicarbonates 
##            0            0            0            0           17            2 
##      Silicas     Province 
##           17            0

##The Silicas, Fluor, and Bicarbonates variables were found to contain missing values.

#This shows % how many missing values each variable has

library(naniar)
## Warning: package 'naniar' was built under R version 4.5.2
vis_miss(water)

#Some variables show moderate missing values. For instance, Bicarbonates has 10% missing values, which requires imputation. Checking the missing-value percentage:

# Percentage of missing entries in each column of the raw data.
missing_percent <- vapply(
  water,
  function(column) sum(is.na(column)) / nrow(water) * 100,
  numeric(1)
)
missing_percent
##        Brand   Calcium.CA magnesium.Mg    Sodium.Na    Potassium     Sulfates 
##            0            0            0            0            0            0 
##    Chlorides     Nitrates     Nitrites dry.residues        Fluor Bicarbonates 
##            0            0            0            0           85           10 
##      Silicas     Province 
##           85            0

##Variables with more than 80% missing values were removed from the dataset to avoid unreliable imputations and ensure data quality.

#removal of the Fluor and Silicas variables since both variables have more than 80% missing values

# Discard the two columns that are 85% missing (Fluor and Silicas).
water_clean <- select(water, -c(Fluor, Silicas))
water_clean
##                 Brand Calcium.CA magnesium.Mg Sodium.Na Potassium Sulfates
## 1                arwa     120.00        23.00      56.0      1.00    104.0
## 2               Aures     100.00        59.00      71.0      3.00    348.0
## 3               AYRIS      65.60         6.80      28.5      1.90     75.0
## 4            BESBASSA      54.16         2.64       5.0      2.00      4.0
## 5             Bouglez       4.60         3.75      29.0      1.00     10.0
## 6          El kantara      90.00        37.00      36.0      3.00    162.0
## 7             GUEDILA      78.00        37.00      29.0      2.00     95.0
## 8                IFRI      99.00        24.00      15.8      2.10     68.0
## 9      Lalla Khedidja      49.00         5.00       3.1      0.40      3.0
## 10 Manbaa Al ghezlane      93.00        31.00      68.0      4.00    153.0
## 11          Mansourah      85.00        37.00      30.0      1.00     53.0
## 12            MEDJANA     136.00        42.00      62.0      2.00    211.0
## 13            Messaid      79.00        27.00      50.0      2.00    156.0
## 14               NOUA      84.00        28.00      23.0      0.90     82.0
## 15              OUWIS     106.00        25.00      60.0      2.00    177.0
## 16            Ovitale      80.00        14.00      30.0      1.00     75.0
## 17          Pure Life      55.00        17.00      12.0      0.50     33.0
## 18              Saida      68.00        50.00      58.0      2.00     65.0
## 19            youkous      77.40        14.50      13.4      4.65     35.8
## 20             ANINOS      42.08        25.51       7.0      1.00     20.0
##    Chlorides Nitrates Nitrites dry.residues Bicarbonates           Province
## 1     100.00    46.50     0.01          450       256.00              SETIF
## 2      86.00     1.08     0.01          770       224.00             BISKRA
## 3      37.00     2.70     0.01          276       234.24                   
## 4      10.00     9.00     0.01          206       164.70             GUELMA
## 5      30.00     9.00     0.06          140           NA             BEJAIA
## 6      59.00     9.60     0.01          636       247.00             BISKRA
## 7      40.00     4.50     0.01          564           NA             BISKRA
## 8      72.00    15.00     0.02          380       265.00             BEJAIA
## 9       7.00     5.94     0.03          178       168.00         TIZI-OUZOU
## 10     84.00     8.90     0.02          725       326.00             BISKRA
## 11     48.00    12.00     0.00          660       362.00            TLEMCEN
## 12     47.00     1.80     0.01          953       458.00 BORDJ BOU ARRERIDJ
## 13     40.00     2.30     0.01          611       275.00             DJELFA
## 14     36.00    25.41     0.01          441       265.00          KHENCHELA
## 15     48.59    18.30     0.01          724       261.00 BORDJ BOU ARRERIDJ
## 16     50.00     5.10     0.00          360       214.00             BEJAIA
## 17     15.00     4.60     0.00          372       210.00              BLIDA
## 18     81.00    15.00     0.00          478       376.00              SAIDA
## 19     25.70     2.00     0.00          285       218.00            TEBESSA
## 20     12.76     9.61     0.01          160       262.30              SETIF

#imputation of Bicarbonates: since it has outliers and its distribution is skewed (not normal), we use the median

# Inspect the Bicarbonates distribution (boxplot, then histogram) before
# choosing the statistic used to impute its missing values.
boxplot(water_clean$Bicarbonates)

hist(water_clean$Bicarbonates)

# Histograms of the magnesium and dry-residue columns.
hist(water_clean$magnesium.Mg)

hist(water_clean$dry.residues)

#imputation using median

# Fill the missing Bicarbonates entries with the median of the observed values.
water_clean$Bicarbonates <- replace(
  water_clean$Bicarbonates,
  is.na(water_clean$Bicarbonates),
  median(water_clean$Bicarbonates, na.rm = TRUE)
)

#Since the goal is to predict ideal water based solely on chemical composition, Province and Brand would contribute nothing to our model; rather, they would distort how the model sees the data.

# Keep every column except the two identifier columns (Brand, Province).
water_clean <- water_clean[
  ,
  setdiff(names(water_clean), c("Brand", "Province"))
]
water_clean
##    Calcium.CA magnesium.Mg Sodium.Na Potassium Sulfates Chlorides Nitrates
## 1      120.00        23.00      56.0      1.00    104.0    100.00    46.50
## 2      100.00        59.00      71.0      3.00    348.0     86.00     1.08
## 3       65.60         6.80      28.5      1.90     75.0     37.00     2.70
## 4       54.16         2.64       5.0      2.00      4.0     10.00     9.00
## 5        4.60         3.75      29.0      1.00     10.0     30.00     9.00
## 6       90.00        37.00      36.0      3.00    162.0     59.00     9.60
## 7       78.00        37.00      29.0      2.00     95.0     40.00     4.50
## 8       99.00        24.00      15.8      2.10     68.0     72.00    15.00
## 9       49.00         5.00       3.1      0.40      3.0      7.00     5.94
## 10      93.00        31.00      68.0      4.00    153.0     84.00     8.90
## 11      85.00        37.00      30.0      1.00     53.0     48.00    12.00
## 12     136.00        42.00      62.0      2.00    211.0     47.00     1.80
## 13      79.00        27.00      50.0      2.00    156.0     40.00     2.30
## 14      84.00        28.00      23.0      0.90     82.0     36.00    25.41
## 15     106.00        25.00      60.0      2.00    177.0     48.59    18.30
## 16      80.00        14.00      30.0      1.00     75.0     50.00     5.10
## 17      55.00        17.00      12.0      0.50     33.0     15.00     4.60
## 18      68.00        50.00      58.0      2.00     65.0     81.00    15.00
## 19      77.40        14.50      13.4      4.65     35.8     25.70     2.00
## 20      42.08        25.51       7.0      1.00     20.0     12.76     9.61
##    Nitrites dry.residues Bicarbonates
## 1      0.01          450       256.00
## 2      0.01          770       224.00
## 3      0.01          276       234.24
## 4      0.01          206       164.70
## 5      0.06          140       258.50
## 6      0.01          636       247.00
## 7      0.01          564       258.50
## 8      0.02          380       265.00
## 9      0.03          178       168.00
## 10     0.02          725       326.00
## 11     0.00          660       362.00
## 12     0.01          953       458.00
## 13     0.01          611       275.00
## 14     0.01          441       265.00
## 15     0.01          724       261.00
## 16     0.00          360       214.00
## 17     0.00          372       210.00
## 18     0.00          478       376.00
## 19     0.00          285       218.00
## 20     0.01          160       262.30
sum(duplicated(water))
## [1] 0

#no duplicated rows were found

#descriptive statistics

library(psych)
## Warning: package 'psych' was built under R version 4.5.2
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describe(water_clean[,1:10])
##              vars  n   mean     sd median trimmed    mad    min    max  range
## Calcium.CA      1 20  78.29  29.17  79.50   78.95  24.76   4.60 136.00 131.40
## magnesium.Mg    2 20  25.46  15.44  25.26   24.61  17.05   2.64  59.00  56.36
## Sodium.Na       3 20  34.34  22.12  29.50   33.73  28.17   3.10  71.00  67.90
## Potassium       4 20   1.87   1.12   2.00    1.74   1.48   0.40   4.65   4.25
## Sulfates        5 20  96.49  84.95  75.00   85.24  71.91   3.00 348.00 345.00
## Chlorides       6 20  46.45  27.01  43.50   45.38  24.69   7.00 100.00  93.00
## Nitrates        7 20  10.42  10.55   8.95    8.35   7.78   1.08  46.50  45.42
## Nitrites        8 20   0.01   0.01   0.01    0.01   0.00   0.00   0.06   0.06
## dry.residues    9 20 468.45 230.34 445.50  459.12 266.87 140.00 953.00 813.00
## Bicarbonates   10 20 265.16  70.05 258.50  258.53  43.56 164.70 458.00 293.30
##               skew kurtosis    se
## Calcium.CA   -0.39     0.30  6.52
## magnesium.Mg  0.29    -0.73  3.45
## Sodium.Na     0.22    -1.43  4.95
## Potassium     0.86     0.05  0.25
## Sulfates      1.25     1.35 18.99
## Chlorides     0.34    -1.00  6.04
## Nitrates      2.01     4.18  2.36
## Nitrites      2.16     5.05  0.00
## dry.residues  0.27    -1.05 51.51
## Bicarbonates  1.05     0.84 15.66
boxplot(water_clean[ ,1:10],
        las = 2,
        col = "lightgreen",
        main = "Mineral Composition Distribution")

#High nitrates in water may be health sensitive.

water[order(water$Nitrates, decreasing = TRUE), ]
##                 Brand Calcium.CA magnesium.Mg Sodium.Na Potassium Sulfates
## 1                arwa     120.00        23.00      56.0      1.00    104.0
## 14               NOUA      84.00        28.00      23.0      0.90     82.0
## 15              OUWIS     106.00        25.00      60.0      2.00    177.0
## 8                IFRI      99.00        24.00      15.8      2.10     68.0
## 18              Saida      68.00        50.00      58.0      2.00     65.0
## 11          Mansourah      85.00        37.00      30.0      1.00     53.0
## 20             ANINOS      42.08        25.51       7.0      1.00     20.0
## 6          El kantara      90.00        37.00      36.0      3.00    162.0
## 4            BESBASSA      54.16         2.64       5.0      2.00      4.0
## 5             Bouglez       4.60         3.75      29.0      1.00     10.0
## 10 Manbaa Al ghezlane      93.00        31.00      68.0      4.00    153.0
## 9      Lalla Khedidja      49.00         5.00       3.1      0.40      3.0
## 16            Ovitale      80.00        14.00      30.0      1.00     75.0
## 17          Pure Life      55.00        17.00      12.0      0.50     33.0
## 7             GUEDILA      78.00        37.00      29.0      2.00     95.0
## 3               AYRIS      65.60         6.80      28.5      1.90     75.0
## 13            Messaid      79.00        27.00      50.0      2.00    156.0
## 19            youkous      77.40        14.50      13.4      4.65     35.8
## 12            MEDJANA     136.00        42.00      62.0      2.00    211.0
## 2               Aures     100.00        59.00      71.0      3.00    348.0
##    Chlorides Nitrates Nitrites dry.residues Fluor Bicarbonates Silicas
## 1     100.00    46.50     0.01          450    NA       256.00      NA
## 14     36.00    25.41     0.01          441    NA       265.00      NA
## 15     48.59    18.30     0.01          724    NA       261.00      NA
## 8      72.00    15.00     0.02          380    NA       265.00      NA
## 18     81.00    15.00     0.00          478    NA       376.00      NA
## 11     48.00    12.00     0.00          660    NA       362.00   12.10
## 20     12.76     9.61     0.01          160    NA       262.30      NA
## 6      59.00     9.60     0.01          636    NA       247.00      NA
## 4      10.00     9.00     0.01          206    NA       164.70      NA
## 5      30.00     9.00     0.06          140    NA           NA      NA
## 10     84.00     8.90     0.02          725  1.05       326.00      NA
## 9       7.00     5.94     0.03          178  0.11       168.00      NA
## 16     50.00     5.10     0.00          360    NA       214.00      NA
## 17     15.00     4.60     0.00          372    NA       210.00   12.00
## 7      40.00     4.50     0.01          564    NA           NA      NA
## 3      37.00     2.70     0.01          276    NA       234.24      NA
## 13     40.00     2.30     0.01          611    NA       275.00      NA
## 19     25.70     2.00     0.00          285    NA       218.00    2.33
## 12     47.00     1.80     0.01          953  0.33       458.00      NA
## 2      86.00     1.08     0.01          770    NA       224.00      NA
##              Province
## 1               SETIF
## 14          KHENCHELA
## 15 BORDJ BOU ARRERIDJ
## 8              BEJAIA
## 18              SAIDA
## 11            TLEMCEN
## 20              SETIF
## 6              BISKRA
## 4              GUELMA
## 5              BEJAIA
## 10             BISKRA
## 9          TIZI-OUZOU
## 16             BEJAIA
## 17              BLIDA
## 7              BISKRA
## 3                    
## 13             DJELFA
## 19            TEBESSA
## 12 BORDJ BOU ARRERIDJ
## 2              BISKRA

#The goal is to create a model that will predict the ideal water-treatment value for each variable

Note: Nitrates and nitrites are the most critical parameters due to their direct impact on human health. ##A feature-engineering step was carried out in order to classify the mineral measurements used in water treatment. We used WHO thresholds to create an ideal class:

library(dplyr)

# Rule-based target label: a sample is "Ideal" only when every mineral lies
# inside its allowed range; failing any single rule makes it "Not_Ideal".
# The result is stored as a factor.
water_clean$ideal <- factor(
  with(
    water_clean,
    ifelse(
      Calcium.CA >= 50 & Calcium.CA <= 200 &
        magnesium.Mg >= 10 & magnesium.Mg <= 50 &
        Sodium.Na < 50 &
        Potassium < 10 &
        Bicarbonates >= 100 & Bicarbonates <= 400 &
        Sulfates <= 250 &
        Chlorides <= 100 &
        Nitrates < 45 &
        Nitrites < 0.05 &
        dry.residues <= 500,
      "Ideal",
      "Not_Ideal"
    )
  )
)

##Classification was performed using rule-based logic where each observation was evaluated against the WHO threshold conditions. Observations satisfying all conditions were labeled as ‘Ideal’, while others were classified as ‘Not Ideal’.

##Ideal water is not just about “low contamination”; it is about a balanced mineral composition plus safe limits according to the WHO.

table(water_clean$ideal)
## 
##     Ideal Not_Ideal 
##         5        15
 #Filter only the Ideal rows

# Subset of samples whose label is "Ideal".
ideal_table <- filter(water_clean, ideal == "Ideal")

# Preview the first rows of that subset
head(ideal_table)
##   Calcium.CA magnesium.Mg Sodium.Na Potassium Sulfates Chlorides Nitrates
## 1       99.0         24.0      15.8      2.10     68.0      72.0    15.00
## 2       84.0         28.0      23.0      0.90     82.0      36.0    25.41
## 3       80.0         14.0      30.0      1.00     75.0      50.0     5.10
## 4       55.0         17.0      12.0      0.50     33.0      15.0     4.60
## 5       77.4         14.5      13.4      4.65     35.8      25.7     2.00
##   Nitrites dry.residues Bicarbonates ideal
## 1     0.02          380          265 Ideal
## 2     0.01          441          265 Ideal
## 3     0.00          360          214 Ideal
## 4     0.00          372          210 Ideal
## 5     0.00          285          218 Ideal
prop.table(table(water_clean$ideal))
## 
##     Ideal Not_Ideal 
##      0.25      0.75

##The dataset shows that 25% of the water samples are classified as ‘Ideal’, while 75% are ‘Not Ideal’, indicating that a majority of samples do not meet the defined mineral standards.

library(ggplot2)

# Bar chart of the class counts, with each bar annotated by its count.
ggplot(water_clean, aes(x = ideal, fill = ideal)) +
  geom_bar() +
  # after_stat(count) replaces the `..count..` notation, which was deprecated
  # in ggplot2 3.4.0 and raised a warning on every render.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  theme_classic() +
  labs(title = "Distribution of Ideal vs Not Ideal Water",
       x = "Class",
       y = "Count")

# Class counts and their percentage share of all samples.
freq <- table(water_clean$ideal)
percent <- prop.table(freq) * 100

# as.vector() strips the table class before building the data frame. Passing
# the table itself makes data.frame() expand it into two columns
# (Percentage.Var1 and Percentage.Freq) instead of one numeric column.
data.frame(
  Class = names(freq),
  Count = as.vector(freq),
  Percentage = as.vector(round(percent, 2))
)
##       Class Count Percentage
## 1     Ideal     5         25
## 2 Not_Ideal    15         75

#correlation

corVa <- water_clean
corVa
##    Calcium.CA magnesium.Mg Sodium.Na Potassium Sulfates Chlorides Nitrates
## 1      120.00        23.00      56.0      1.00    104.0    100.00    46.50
## 2      100.00        59.00      71.0      3.00    348.0     86.00     1.08
## 3       65.60         6.80      28.5      1.90     75.0     37.00     2.70
## 4       54.16         2.64       5.0      2.00      4.0     10.00     9.00
## 5        4.60         3.75      29.0      1.00     10.0     30.00     9.00
## 6       90.00        37.00      36.0      3.00    162.0     59.00     9.60
## 7       78.00        37.00      29.0      2.00     95.0     40.00     4.50
## 8       99.00        24.00      15.8      2.10     68.0     72.00    15.00
## 9       49.00         5.00       3.1      0.40      3.0      7.00     5.94
## 10      93.00        31.00      68.0      4.00    153.0     84.00     8.90
## 11      85.00        37.00      30.0      1.00     53.0     48.00    12.00
## 12     136.00        42.00      62.0      2.00    211.0     47.00     1.80
## 13      79.00        27.00      50.0      2.00    156.0     40.00     2.30
## 14      84.00        28.00      23.0      0.90     82.0     36.00    25.41
## 15     106.00        25.00      60.0      2.00    177.0     48.59    18.30
## 16      80.00        14.00      30.0      1.00     75.0     50.00     5.10
## 17      55.00        17.00      12.0      0.50     33.0     15.00     4.60
## 18      68.00        50.00      58.0      2.00     65.0     81.00    15.00
## 19      77.40        14.50      13.4      4.65     35.8     25.70     2.00
## 20      42.08        25.51       7.0      1.00     20.0     12.76     9.61
##    Nitrites dry.residues Bicarbonates     ideal
## 1      0.01          450       256.00 Not_Ideal
## 2      0.01          770       224.00 Not_Ideal
## 3      0.01          276       234.24 Not_Ideal
## 4      0.01          206       164.70 Not_Ideal
## 5      0.06          140       258.50 Not_Ideal
## 6      0.01          636       247.00 Not_Ideal
## 7      0.01          564       258.50 Not_Ideal
## 8      0.02          380       265.00     Ideal
## 9      0.03          178       168.00 Not_Ideal
## 10     0.02          725       326.00 Not_Ideal
## 11     0.00          660       362.00 Not_Ideal
## 12     0.01          953       458.00 Not_Ideal
## 13     0.01          611       275.00 Not_Ideal
## 14     0.01          441       265.00     Ideal
## 15     0.01          724       261.00 Not_Ideal
## 16     0.00          360       214.00     Ideal
## 17     0.00          372       210.00     Ideal
## 18     0.00          478       376.00 Not_Ideal
## 19     0.00          285       218.00     Ideal
## 20     0.01          160       262.30 Not_Ideal
library(caret)
# One-hot encode the class label so it can take part in a numeric correlation
# matrix. fullRank = FALSE keeps an indicator column for every level of
# `ideal` rather than dropping one as a reference level.
dummy_vari <- predict(
  dummyVars(~ ., data = corVa, fullRank = FALSE),
  newdata = corVa
)

# Encoded matrix -> data frame
water_dummified <- as.data.frame(dummy_vari)

# View first rows
head(water_dummified)
##   Calcium.CA magnesium.Mg Sodium.Na Potassium Sulfates Chlorides Nitrates
## 1     120.00        23.00      56.0       1.0      104       100    46.50
## 2     100.00        59.00      71.0       3.0      348        86     1.08
## 3      65.60         6.80      28.5       1.9       75        37     2.70
## 4      54.16         2.64       5.0       2.0        4        10     9.00
## 5       4.60         3.75      29.0       1.0       10        30     9.00
## 6      90.00        37.00      36.0       3.0      162        59     9.60
##   Nitrites dry.residues Bicarbonates ideal.Ideal ideal.Not_Ideal
## 1     0.01          450       256.00           0               1
## 2     0.01          770       224.00           0               1
## 3     0.01          276       234.24           0               1
## 4     0.01          206       164.70           0               1
## 5     0.06          140       258.50           0               1
## 6     0.01          636       247.00           0               1
 # Pairwise correlations (cor() defaults to Pearson) between all columns of
 # the encoded data, including the two class-indicator columns.
 cor_matrix <- cor(water_dummified)
# Visualise the correlation matrix.
corrplot(cor_matrix)

#selecting predictors using varimport(ideal~classification)*

library(randomForest)
# Random forest on all predictors, fitted to rank variable importance for the
# `ideal` label. The RNG is seeded first: bootstrap sampling and the choice of
# candidate split variables are random, so without a seed the OOB error and
# the importance ranking change from run to run.
set.seed(123)
# Make sure the response is a factor so randomForest() fits a classifier.
water_clean$ideal <- as.factor(water_clean$ideal)
importance_modeL_ideal <- randomForest(
  ideal ~ .,
  data = water_clean,
  importance = TRUE
)
importance_modeL_ideal
## 
## Call:
##  randomForest(formula = ideal ~ ., data = water_clean, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 15%
## Confusion matrix:
##           Ideal Not_Ideal class.error
## Ideal         2         3         0.6
## Not_Ideal     0        15         0.0
varImpPlot(importance_modeL_ideal)

#Modelling: we are going to use variables that show more variability and are correlated; closely related variables will be dropped.

#models used for predictive analysis Due to the small sample size (n = 20), splitting the dataset into separate training and testing sets would lead to unreliable and unstable model estimates. To address this limitation, Leave-One-Out Cross Validation (LOOCV) was employed.

LOOCV ensures that each observation is used once as a validation sample while the remaining observations are used for training. This approach maximizes data utilization and provides a robust estimate of model performance.

Furthermore, LOOCV reduces bias associated with random sampling and is particularly suitable for small datasets, making it an appropriate choice for this study.

# Resampling configuration passed to the train() calls.
fitControl <- trainControl(
  method = "LOOCV",                   # leave-one-out cross-validation
  sampling = "up",                    # up-sample to offset the 25% / 75% class split
  classProbs = TRUE,                  # class probabilities, required for ROC
  summaryFunction = twoClassSummary,  # report ROC, Sens and Spec
  savePredictions = "final"           # keep held-out predictions of the final model
)

#model training for ideal water predictions

# Random forest on the three selected predictors, tuned over `mtry` under the
# LOOCV scheme in fitControl.
# Both the forest and the up-sampling done inside each resample are random, so
# the RNG is seeded first (same seed as the SVM fit below). Without it the
# reported ROC / Sens / Spec change from run to run.
set.seed(123)
ideal_rfFit1 <- train(ideal ~ dry.residues + Sodium.Na + Sulfates, data = water_clean,
                method = "rf",
                trControl = fitControl,
                # NOTE(review): `verbose` is a gbm() option. It is only forwarded
                # through `...` here and presumably has no effect on
                # randomForest() - confirm and consider removing.
                verbose = FALSE)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
ideal_rfFit1
## Random Forest 
## 
## 20 samples
##  3 predictor
##  2 classes: 'Ideal', 'Not_Ideal' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 19, 19, 19, 19, 19, 19, ... 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens  Spec     
##   2     0.8666667  0.4   0.8666667
##   3     0.7933333  0.4   0.8666667
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Persist the tuned random-forest fit to disk.
saveRDS(object = ideal_rfFit1, file = "ideal_rfFit1.rds")
# To reuse the stored fit instead of re-training:
# ideal_rfFit1 <- readRDS("ideal_rfFit1.rds")
# ideal_rfFit1
# C5.0 on the same three predictors, tuned over trials / model / winnow under
# the LOOCV scheme in fitControl.
# The up-sampling done inside each resample draws rows at random, so the RNG is
# seeded first (same seed as the SVM fit below) to make the resampling results
# reproducible.
set.seed(123)
ideal_C5.0Fit1 <- train(ideal ~ dry.residues + Sodium.Na + Sulfates, data = water_clean,
                 method = "C5.0",
                 trControl = fitControl,
                 # NOTE(review): `verbose` is forwarded through `...` to the
                 # underlying fit; it is presumably ignored by C5.0() -
                 # confirm and consider removing.
                 verbose = FALSE)


ideal_C5.0Fit1
## C5.0 
## 
## 20 samples
##  3 predictor
##  2 classes: 'Ideal', 'Not_Ideal' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 19, 19, 19, 19, 19, 19, ... 
## Addtional sampling using up-sampling
## 
## Resampling results across tuning parameters:
## 
##   trials  model  winnow  ROC        Sens  Spec     
##    1      rules  FALSE   0.7666667  0.6   0.9333333
##    1      rules   TRUE   0.7800000  0.6   0.8000000
##    1      tree   FALSE   0.6000000  0.6   0.9333333
##    1      tree    TRUE   0.7466667  0.6   0.7333333
##   10      rules  FALSE   0.7666667  0.6   0.9333333
##   10      rules   TRUE   0.7933333  0.6   0.8000000
##   10      tree   FALSE   0.6000000  0.6   0.9333333
##   10      tree    TRUE   0.7466667  0.6   0.7333333
##   20      rules  FALSE   0.7666667  0.6   0.9333333
##   20      rules   TRUE   0.7933333  0.6   0.8000000
##   20      tree   FALSE   0.6000000  0.6   0.9333333
##   20      tree    TRUE   0.7466667  0.6   0.7333333
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = rules and
##  winnow = TRUE.
# Persist the tuned C5.0 fit to disk.
saveRDS(object = ideal_C5.0Fit1, file = "ideal_C5.0Fit1.rds")
# To reuse the stored fit instead of re-training:
# ideal_C5.0Fit1 <- readRDS("ideal_C5.0Fit1.rds")
# ideal_C5.0Fit1
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
# Radial-basis-function SVM on the same three predictors. Predictors are
# centred and scaled before fitting; resampling follows fitControl.
set.seed(123)
svm_model <- train(
  ideal ~ dry.residues + Sodium.Na + Sulfates,
  data = water_clean,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = fitControl
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
svm_model
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 20 samples
##  3 predictor
##  2 classes: 'Ideal', 'Not_Ideal' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 19, 19, 19, 19, 19, 19, ... 
## Addtional sampling using up-sampling prior to pre-processing
## 
## Resampling results across tuning parameters:
## 
##   C     ROC        Sens  Spec     
##   0.25  0.9600000  0.8   0.9333333
##   0.50  0.9466667  1.0   0.9333333
##   1.00  0.9600000  1.0   0.9333333
## 
## Tuning parameter 'sigma' was held constant at a value of 0.5388966
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.5388966 and C = 0.25.
# Persist the tuned SVM fit to disk.
saveRDS(object = svm_model, file = "svm_model.RDS")

#selecting the best model using confusion matrix

# Cross-tabulate the random forest's saved hold-out predictions against the
# observed classes.
rf_confusion_ideal <- confusionMatrix(
  data = ideal_rfFit1$pred$pred,
  reference = ideal_rfFit1$pred$obs
)
rf_confusion_ideal
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Ideal Not_Ideal
##   Ideal         2         2
##   Not_Ideal     3        13
##                                          
##                Accuracy : 0.75           
##                  95% CI : (0.509, 0.9134)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : 0.6172         
##                                          
##                   Kappa : 0.2857         
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.4000         
##             Specificity : 0.8667         
##          Pos Pred Value : 0.5000         
##          Neg Pred Value : 0.8125         
##              Prevalence : 0.2500         
##          Detection Rate : 0.1000         
##    Detection Prevalence : 0.2000         
##       Balanced Accuracy : 0.6333         
##                                          
##        'Positive' Class : Ideal          
## 

#for svm

 # Cross-tabulate the SVM's saved hold-out predictions against the observed classes.
svm_confusion_ideal <- confusionMatrix(
  data = svm_model$pred$pred,
  reference = svm_model$pred$obs
)
svm_confusion_ideal
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Ideal Not_Ideal
##   Ideal         4         1
##   Not_Ideal     1        14
##                                          
##                Accuracy : 0.9            
##                  95% CI : (0.683, 0.9877)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : 0.09126        
##                                          
##                   Kappa : 0.7333         
##                                          
##  Mcnemar's Test P-Value : 1.00000        
##                                          
##             Sensitivity : 0.8000         
##             Specificity : 0.9333         
##          Pos Pred Value : 0.8000         
##          Neg Pred Value : 0.9333         
##              Prevalence : 0.2500         
##          Detection Rate : 0.2000         
##    Detection Prevalence : 0.2500         
##       Balanced Accuracy : 0.8667         
##                                          
##        'Positive' Class : Ideal          
## 
 # Cross-tabulate the C5.0 model's saved hold-out predictions against the observed classes.
C5.0_confusion_ideal <- confusionMatrix(
  data = ideal_C5.0Fit1$pred$pred,
  reference = ideal_C5.0Fit1$pred$obs
)
C5.0_confusion_ideal
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Ideal Not_Ideal
##   Ideal         3         3
##   Not_Ideal     2        12
##                                          
##                Accuracy : 0.75           
##                  95% CI : (0.509, 0.9134)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : 0.6172         
##                                          
##                   Kappa : 0.375          
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.6000         
##             Specificity : 0.8000         
##          Pos Pred Value : 0.5000         
##          Neg Pred Value : 0.8571         
##              Prevalence : 0.2500         
##          Detection Rate : 0.1500         
##    Detection Prevalence : 0.3000         
##       Balanced Accuracy : 0.7000         
##                                          
##        'Positive' Class : Ideal          
## 

##To robustly evaluate model performance, ROC (Receiver Operating Characteristic) curves and the corresponding AUC (Area Under the Curve) were used. Unlike accuracy, which can be misleading for imbalanced datasets, ROC/AUC evaluates the model’s ability to correctly distinguish between “Ideal” and “Not Ideal” water samples across all possible thresholds. An AUC close to 1 indicates that the model reliably separates the two classes, ensuring high-quality predictions for water safety assessment.

##svm best model The SVM with RBF kernel achieved outstanding discrimination (ROC = 0.96) on a small, imbalanced dataset.

LOOCV ensured each sample was validated once, while up-sampling prevented bias toward the majority class (“Not_Ideal”).

Pre-processing (centering and scaling) allowed SVM to handle the different scales of water mineral concentrations.

#learning

 # Shuffle three birth years and store them as a factor (one level per distinct year).
yob.f <- factor(sample(c(1969, 1979, 1989)))
yob.f
## [1] 1979 1969 1989
## Levels: 1969 1979 1989

#the above shows that levels are created when a factor is used

as.character(yob.f)
## [1] "1979" "1969" "1989"
# Character column in which some entries carry a "<" or ">" qualifier.
df <- data.frame(val = c("0.7", "0.8", "<0.9", ">9"))
df
##    val
## 1  0.7
## 2  0.8
## 3 <0.9
## 4   >9
# gsub = global substitution: strip every "<" and ">" so only the number remains.
df[["val"]] <- gsub(pattern = "[<>]", replacement = "", x = df[["val"]])
df[["val"]]
## [1] "0.7" "0.8" "0.9" "9"

#cut function

# Draw 100 integers from 1..100 with replacement.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned,
# which would silently change the meaning of `replace = T`.
x2 <- sample(1:100, 100, replace = TRUE)
x2
##   [1] 18 71 96 74 99 46  9 35 51 34 73 29  8 90 10 25 25 45 58 73 81  2 53 74 72
##  [26] 17 97 49 20 84 60 87  5  5 36  4 45 61 18 84 64 28 70 66 51  5 45 31 82 18
##  [51] 83 43  1  2 32 92 25 59 66 56 50 42  5 80 87 32 31 11 53 38 94 48 53 72 46
##  [76] 10 39  4 89 99 13  1 90 68 85 35 82 80 71 12 12 97  2 32 82 43 48 23 36 55
 # Bin x2 into five intervals; the result is a factor of interval labels.
x.f <- cut(x = x2, breaks = 5)
x.f
##   [1] (0.902,20.6] (59.8,79.4]  (79.4,99.1]  (59.8,79.4]  (79.4,99.1] 
##   [6] (40.2,59.8]  (0.902,20.6] (20.6,40.2]  (40.2,59.8]  (20.6,40.2] 
##  [11] (59.8,79.4]  (20.6,40.2]  (0.902,20.6] (79.4,99.1]  (0.902,20.6]
##  [16] (20.6,40.2]  (20.6,40.2]  (40.2,59.8]  (40.2,59.8]  (59.8,79.4] 
##  [21] (79.4,99.1]  (0.902,20.6] (40.2,59.8]  (59.8,79.4]  (59.8,79.4] 
##  [26] (0.902,20.6] (79.4,99.1]  (40.2,59.8]  (0.902,20.6] (79.4,99.1] 
##  [31] (59.8,79.4]  (79.4,99.1]  (0.902,20.6] (0.902,20.6] (20.6,40.2] 
##  [36] (0.902,20.6] (40.2,59.8]  (59.8,79.4]  (0.902,20.6] (79.4,99.1] 
##  [41] (59.8,79.4]  (20.6,40.2]  (59.8,79.4]  (59.8,79.4]  (40.2,59.8] 
##  [46] (0.902,20.6] (40.2,59.8]  (20.6,40.2]  (79.4,99.1]  (0.902,20.6]
##  [51] (79.4,99.1]  (40.2,59.8]  (0.902,20.6] (0.902,20.6] (20.6,40.2] 
##  [56] (79.4,99.1]  (20.6,40.2]  (40.2,59.8]  (59.8,79.4]  (40.2,59.8] 
##  [61] (40.2,59.8]  (40.2,59.8]  (0.902,20.6] (79.4,99.1]  (79.4,99.1] 
##  [66] (20.6,40.2]  (20.6,40.2]  (0.902,20.6] (40.2,59.8]  (20.6,40.2] 
##  [71] (79.4,99.1]  (40.2,59.8]  (40.2,59.8]  (59.8,79.4]  (40.2,59.8] 
##  [76] (0.902,20.6] (20.6,40.2]  (0.902,20.6] (79.4,99.1]  (79.4,99.1] 
##  [81] (0.902,20.6] (0.902,20.6] (79.4,99.1]  (59.8,79.4]  (79.4,99.1] 
##  [86] (20.6,40.2]  (79.4,99.1]  (79.4,99.1]  (59.8,79.4]  (0.902,20.6]
##  [91] (0.902,20.6] (79.4,99.1]  (0.902,20.6] (20.6,40.2]  (79.4,99.1] 
##  [96] (40.2,59.8]  (40.2,59.8]  (20.6,40.2]  (20.6,40.2]  (40.2,59.8] 
## Levels: (0.902,20.6] (20.6,40.2] (40.2,59.8] (59.8,79.4] (79.4,99.1]
 # Ten draws from 1..3 with replacement and equal probabilities.
# NOTE(review): this variable shares its name with base::class(); the name is
# kept because later statements refer to it.
class <- sample(x = 1:3, size = 10, replace = TRUE)
class
##  [1] 3 1 1 2 1 2 3 1 1 2
factor(class)
##  [1] 3 1 1 2 1 2 3 1 1 2
## Levels: 1 2 3
# Factor over 1..3 with descriptive labels; levels follow the label order.
x <- factor(x = 1:3, labels = c("one", "two", "three"))
x
## [1] one   two   three
## Levels: one two three
# Move "three" to the front of the levels; the values themselves are unchanged.
x <- relevel(x, ref = "three")
x
## [1] one   two   three
## Levels: three one two
# Demo data frame with 200 rows: a random integer column and a standard-normal
# column.
# Both columns are generated with 200 elements. With only 100 normal draws,
# data.frame() silently recycled `y`, so rows 101-200 repeated the `y` values
# of rows 1-100.
df3 <- data.frame(
  value_num = sample(1:100, 200, replace = TRUE),
  y = rnorm(200)
)
# 60 draws from a normal distribution with mean 50 and standard deviation 10.
x1 <- rnorm(n = 60, mean = 50, sd = 10)
x1
##  [1] 43.34585 47.46364 47.30031 42.85406 52.83274 50.70518 44.26060 43.23044
##  [9] 44.00817 50.13505 34.32899 34.82501 54.01598 56.80856 56.48229 68.78374
## [17] 44.01702 54.86706 49.76897 50.85361 48.39445 41.96102 51.87112 29.77434
## [25] 31.18366 46.33102 29.85822 39.16262 38.66190 44.66467 50.71369 56.47624
## [33] 62.02501 52.70866 49.75501 46.66562 64.25000 51.51802 48.94630 63.92529
## [41] 35.74470 53.40375 43.39064 49.22643 50.85866 41.89007 53.86176 38.18711
## [49] 59.35007 53.43198 58.48201 56.08472 42.90635 39.11939 45.23212 43.69523
## [57] 29.41838 42.22125 31.48071 41.54655