WQIndex.R

Peter Prevos — Feb 1, 2014, 2:18 PM

#Data analysis of water quality index survey

#Preliminaries
library(psych)
options(width = 100) #Widen console output to 100 characters
#Read survey results, skipping the first line of the export
survey <- read.csv("/home/peter/Water_Quality_Index.csv", skip = 1)
#Drop columns 1-11 and 44-45 (redundant Qualtrics information)
survey <- survey[, -c(1:11, 44:45)]
#Name the remaining 32 variables, grouped by how they are analysed below
names(survey) <- c(
  "Experience", "Bachelor", "Master", "Doctorate", #1:4 education
  "less5", "less10", "less20", "more20", #5:8 experience
  "research", "design", "operations", "management", #9:12 function type
  "Australia", "USA", "Europe", "Asia", #13:16 regional experience
  "Catchment", "Treatment", "Network", "Perception", "Regulation", #17:21 index factors
  "eColi", "coliform", "turbidity", "colour", "pH", #22:29 network subfactors
  "aluminium", "byproducts", "chlorine",
  "comments", "lat", "long" #30:32 free text and respondent coordinates
)

#Demographics
dim(survey)[1] #Number of responses (rows in the survey data frame)
[1] 36
#More than one answer was possible on each parameter, so the column totals
#below may add up to more than the number of responses
#Education level of respondents (columns 1:4); TRUE is spelled out because T is
#an ordinary variable that can be reassigned
colSums(survey[, 1:4], na.rm = TRUE)
Experience   Bachelor     Master  Doctorate 
         5         17          8          7 
#Experience of respondents (columns 5:8), ignoring missing answers
colSums(survey[, 5:8], na.rm = TRUE)
 less5 less10 less20 more20 
     1     12      8     15 
#Function type of respondents (columns 9:12), ignoring missing answers
colSums(survey[, 9:12], na.rm = TRUE)
  research     design operations management 
        12          9         13         24 
#Regional experience of respondents (columns 13:16), ignoring missing answers
colSums(survey[, 13:16], na.rm = TRUE)
Australia       USA    Europe      Asia 
       30         8         3         4 
library(maps) #Load world map library
par(mar = rep(1, 4)) #Graphics parameters: one-line margin on all four sides
#Draw world map; FALSE is spelled out because F is a reassignable variable
map(interior = FALSE)
title("Respondent locations")
#Mark respondent locations (longitude on x, latitude on y) as filled red dots
points(survey$long, survey$lat, col = "red", pch = 16, cex = 1.5)

plot of chunk unnamed-chunk-1

#Graphics parameters for the plots that follow
par(
  mar = c(7, 4, 1, 1), #Margins in lines: bottom, left, top, right
  mgp = c(5, 1, 0) #Margin lines for axis title, axis labels and axis line
)

#Main Water Quality Index
index <- survey[17:21] #Data frame of the five main factors (Catchment to Regulation)
describe(index) #Descriptive statistics per factor
           var  n  mean    sd median trimmed   mad min max range  skew kurtosis   se
Catchment    1 36 60.11 23.43     60   61.10 28.91   9 100    91 -0.50    -0.45 3.91
Treatment    2 36 86.50 16.04     90   89.13 14.83  30 100    70 -1.76     3.50 2.67
Network      3 36 74.17 20.74     80   76.37 14.83  10 100    90 -1.08     0.87 3.46
Perception   4 36 52.94 25.14     60   54.53 25.95   0  95    95 -0.57    -0.59 4.19
Regulation   5 36 74.53 27.32     89   78.03 16.31   3 100    97 -0.92    -0.27 4.55
#Boxplot of responses per main factor, with axis labels perpendicular to the axes
boxplot(index, col = "darkblue", las = 2, xlab = "Factor")

plot of chunk unnamed-chunk-1

#Index matrix: one row per main factor; columns hold the mean, the mean-based
#weight, the median and the median-based weight
WQindex <- matrix(
  nrow = 5, ncol = 4,
  dimnames = list(names(survey)[17:21], c("mean", "index", "median", "index"))
)
WQindex[, 1] <- round(colMeans(survey[, 17:21]), 1) #Factor means (one decimal)
WQindex[, 3] <- apply(index, 2, median) #Factor medians
#Normalise factors (total 100): each factor's unrounded mean (or median) is
#scaled by 100 over the sum of the values stored in column 1 (or 3)
WQindex[, 2] <- round(100 / sum(WQindex[, 1]) * vapply(survey[17:21], mean, numeric(1)))
WQindex[, 4] <- round(100 / sum(WQindex[, 3]) * vapply(survey[17:21], median, numeric(1)))
#Display water quality index based on means and on medians, with a Sum row;
#rounding means the weight columns need not total exactly 100
addmargins(WQindex, 1)
            mean index median index
Catchment   60.1    17     60    16
Treatment   86.5    25     90    24
Network     74.2    21     80    21
Perception  52.9    15     60    16
Regulation  74.5    21     89    23
Sum        348.2    99    379   100

#Network Factor
network <- survey[22:29] #Data frame of the eight network subfactors (eColi to chlorine)
describe(network) #Descriptive statistics per subfactor
           var  n  mean    sd median trimmed   mad min max range  skew kurtosis   se
eColi        1 36 86.33 20.92   98.5   90.23  2.22  21 100    79 -1.58     1.48 3.49
coliform     2 36 62.31 27.67   60.5   63.43 30.39   7 100    93 -0.34    -1.12 4.61
turbidity    3 36 68.69 25.25   77.5   70.13 31.88  11 100    89 -0.37    -0.99 4.21
colour       4 36 54.78 25.26   49.5   54.77 15.57   3 100    97  0.12    -0.74 4.21
pH           5 36 55.06 25.11   50.0   55.63 22.98   2  98    96 -0.12    -0.86 4.19
aluminium    6 36 45.75 28.69   40.0   44.67 29.65   1 100    99  0.36    -0.88 4.78
byproducts   7 36 58.14 22.96   60.0   59.27 29.65   8  98    90 -0.35    -0.58 3.83
chlorine     8 36 80.08 25.14   90.0   84.23 14.83   8 100    92 -1.46     1.33 4.19
#Boxplot of responses per network subfactor
boxplot(network, col = "darkgreen", las = 2, xlab = "Subfactor", ylab = "Score")

plot of chunk unnamed-chunk-1

#Factor matrix: one row per network subfactor; columns hold the mean, the
#mean-based weight, the median and the median-based weight
NWindex <- matrix(
  nrow = 8, ncol = 4,
  dimnames = list(names(survey)[22:29], c("mean", "index", "median", "index"))
)
NWindex[, 1] <- round(colMeans(network), 1) #Subfactor means (one decimal)
NWindex[, 3] <- apply(network, 2, median) #Subfactor medians
#Normalise subfactors (total 100): each subfactor's unrounded mean (or median)
#is scaled by 100 over the sum of the values stored in column 1 (or 3)
NWindex[, 2] <- round(100 / sum(NWindex[, 1]) * vapply(survey[22:29], mean, numeric(1)))
NWindex[, 4] <- round(100 / sum(NWindex[, 3]) * vapply(survey[22:29], median, numeric(1)))
#Display network factor based on means and on medians, with a Sum row;
#rounding means the weight columns need not total exactly 100
addmargins(NWindex, 1)
            mean index median index
eColi       86.3    17   98.5    19
coliform    62.3    12   60.5    12
turbidity   68.7    13   77.5    15
colour      54.8    11   49.5     9
pH          55.1    11   50.0    10
aluminium   45.8     9   40.0     8
byproducts  58.1    11   60.0    11
chlorine    80.1    16   90.0    17
Sum        511.2   100  526.0   101

#Analysis
#Parallel analysis over columns 17:29 (the five main factors plus the eight
#network subfactors); the nfact element of the result is passed to factanal below
factors <- fa.parallel(survey[,17:29]) #Parallel analysis
Loading required package: MASS

Attaching package: 'MASS'

The following object is masked _by_ '.GlobalEnv':

    survey

plot of chunk unnamed-chunk-1

Parallel analysis suggests that the number of factors =  1  and the number of components =  1 
#Factor analysis of the same 13 variables, with the number of factors taken
#from the parallel analysis result (factors$nfact)
factanal(survey[,17:29], factors=factors$nfact)

Call:
factanal(x = survey[, 17:29], factors = factors$nfact)

Uniquenesses:
 Catchment  Treatment    Network Perception Regulation      eColi   coliform  turbidity     colour 
     0.569      0.676      0.416      0.783      0.747      0.617      0.590      0.324      0.304 
        pH  aluminium byproducts   chlorine 
     0.417      0.311      0.359      0.584 

Loadings:
           Factor1
Catchment  0.657  
Treatment  0.569  
Network    0.764  
Perception 0.466  
Regulation 0.503  
eColi      0.619  
coliform   0.640  
turbidity  0.822  
colour     0.835  
pH         0.763  
aluminium  0.830  
byproducts 0.801  
chlorine   0.645  

               Factor1
SS loadings      6.304
Proportion Var   0.485

Test of the hypothesis that 1 factor is sufficient.
The chi square statistic is 91.89 on 65 degrees of freedom.
The p-value is 0.0157 

#Display comments
with(survey, comments[comments != ""]) #Show only the non-empty respondent comments
 [1] Off course it's system specific, i.e. while some potable water supply systems rely heavily on catchment protection, others rely on the treatment, therefore, it's crucial when developing the index that this is taken in account, however, as there is no indication of the type of the system the assessment is being done on, the results will be ambiguous or inaccurate! I think in line with the above questions, some information on the water supply system the assessment was done on will be of great value. / Regards                                                                                                                                                                                                                                                                                                              
 [2] Do not forget COD and BOD criteria                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
 [3] The survey should state the source water for drinking. Is it groundwater or surface water? This would have made the ranking more relevant. Chemistry and treatment of groundwater and surface water, and their inherent water quality risks, are quite different.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
 [4] This was a difficult survey to fill in accurately, and the importance will depend upon the system you have.  For a protected catchment with minimal treatment then catchment manaement is your highest priority while for a poor quality water source with advanced treatment then the treatment process is your main protection.  Hence the relative waitings can vary from site to site, and it depends wht is the biggest risk prevention measure you have identifed for your system, and what water quality variables might be an indicator for when this departs from the desired setting.                                                                                                                                                                                                                                               
 [5] Not sure how this will help seems a bit simplistic                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
 [6] I'm not sure if the acceptability of water quality should based entirely on E.coli.  While these factors may indicate direct faecal contamination, they do not account for non-faecal contamination. There should be adequate evidence from several indicators that the water quality has not changed post-treatment.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
 [7] in looking at Water Quality, the focus on water safety sometimes gets clouded by issues  associated with customer aesthetic opinion.  A well run treatment plant that has appropriate barriers for the source water risks, that includes on-line or real time control actions, is essential to providing safe drinking water.  Once water enters the distribution system there are limited means to re-treat (apart from secondary chlorine dosing) so ensuring the integrity of the network is essential.  Low or no chlorine residual in a network puts the water supply and customers at risk.  A shift in focus from end of pipe sampling to pre-emptive mapping and control of chlorine residual in networks needs to occur into the future, together with continued improvements in treatment plant "validation" and real time controls.
 [8] Regulatory compliance was scored relatively low on the Main Index Factors graph because Regulatory Compliance should end up being by-product of the doing all the other factors well. /  / David Sheehan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
 [9] It is difficult to comment on the Network Protection Factor without knowing how it is intended to be used? By whom? to assess what?                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[10] TTo some extent the relative weightings depends on the parameter value. (eg is turbidity 0.5, 1.0 or 5 NTU or?) /  / Do the relative weightings add up to 100? It seems they dont. It may have been better to include a calculator to automatically calculate the weightings (out of 100) as you enter values.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
11 Levels:  ... TTo some extent the relative weightings depends on the parameter value. (eg is turbidity 0.5, 1.0 or 5 NTU or?) /  / Do the relative weightings add up to 100? It seems they dont. It may have been better to include a calculator to automatically calculate the weightings (out of 100) as you enter values.

#Create png file for website: both boxplots side by side in a 1024x768 image
#The argument is spelled filename in full rather than relying on partial matching
png(filename = "wqindex.png", width = 1024, height = 768)
#One row, two columns; enlarged text for readability
par(mar = c(6, 3, 3, 1), mfrow = c(1, 2), cex.lab = 2, cex.axis = 1, cex = 1.5)
#Boxplot of main factor responses (y-axis label spelling corrected)
boxplot(index, main = "Factor", ylab = "relative value", las = 2, col = "darkblue")
#Boxplot of network subfactor responses
boxplot(network, main = "Subfactor (Network)", ylab = "Score", las = 2, col = "darkgreen")
dev.off() #Close the png device so the file is written
pdf 
  2