BOD AND TSS DATA EVALUATION

#set working directory
#setwd("G:/Transportation/NJ_TRANSIT/PeterSpinney/")
setwd("C:/Users/ywang/Dropbox/R/BOD_Analysis/")

#load data
Data <- read.csv("Data.csv")
Data

##       Date BODRawWW BODScreenedRaw250 BODFilteredRaw BODEffluent
## 1 12/15/15      727               779            537         580
## 2 12/16/15     1079              1136            811        1044
## 3 12/17/15     1080              1149            696        1131
## 4 01/19/16      886               867            660         906
## 5 01/20/16     1085              1105            980         950
## 6 01/21/16     1069              1016            727        1197
##   BODFilteredEffluent lint50 lint100 lint150 LintPercentSoild TSSRawWW
## 1                 455    213     281     274            23.76      155
## 2                 822   1346    1095    1635            15.97      222
## 3                 530    517     160     339            25.81      258
## 4                 628    246     216     378            14.30      168
## 5                 605    295     278     283             7.14      233
## 6                 568    188     403     416             9.36      215
##   TSSScreenedRaw250 TSSEffluent phraw pheffluent
## 1               150         162 11.65      10.02
## 2               189         258 11.52       9.84
## 3               194         230 11.70      10.00
## 4               146         253 11.16       9.89
## 5               254         210 11.31      10.21
## 6               218         208  9.91       9.86

#obtain the summary statistics 
sta<-summary(Data)
sta

##        Date      BODRawWW      BODScreenedRaw250 BODFilteredRaw 
##  01/19/16:1   Min.   : 727.0   Min.   : 779.0    Min.   :537.0  
##  01/20/16:1   1st Qu.: 931.8   1st Qu.: 904.2    1st Qu.:669.0  
##  01/21/16:1   Median :1074.0   Median :1060.5    Median :711.5  
##  12/15/15:1   Mean   : 987.7   Mean   :1008.7    Mean   :735.2  
##  12/16/15:1   3rd Qu.:1079.8   3rd Qu.:1128.2    3rd Qu.:790.0  
##  12/17/15:1   Max.   :1085.0   Max.   :1149.0    Max.   :980.0  
##   BODEffluent   BODFilteredEffluent     lint50          lint100      
##  Min.   : 580   Min.   :455.0       Min.   : 188.0   Min.   : 160.0  
##  1st Qu.: 917   1st Qu.:539.5       1st Qu.: 221.2   1st Qu.: 231.5  
##  Median : 997   Median :586.5       Median : 270.5   Median : 279.5  
##  Mean   : 968   Mean   :601.3       Mean   : 467.5   Mean   : 405.5  
##  3rd Qu.:1109   3rd Qu.:622.2       3rd Qu.: 461.5   3rd Qu.: 372.5  
##  Max.   :1197   Max.   :822.0       Max.   :1346.0   Max.   :1095.0  
##     lint150       LintPercentSoild    TSSRawWW     TSSScreenedRaw250
##  Min.   : 274.0   Min.   : 7.14    Min.   :155.0   Min.   :146.0    
##  1st Qu.: 297.0   1st Qu.:10.60    1st Qu.:179.8   1st Qu.:159.8    
##  Median : 358.5   Median :15.13    Median :218.5   Median :191.5    
##  Mean   : 554.2   Mean   :16.06    Mean   :208.5   Mean   :191.8    
##  3rd Qu.: 406.5   3rd Qu.:21.81    3rd Qu.:230.2   3rd Qu.:212.0    
##  Max.   :1635.0   Max.   :25.81    Max.   :258.0   Max.   :254.0    
##   TSSEffluent        phraw         pheffluent    
##  Min.   :162.0   Min.   : 9.91   Min.   : 9.840  
##  1st Qu.:208.5   1st Qu.:11.20   1st Qu.: 9.867  
##  Median :220.0   Median :11.41   Median : 9.945  
##  Mean   :220.2   Mean   :11.21   Mean   : 9.970  
##  3rd Qu.:247.2   3rd Qu.:11.62   3rd Qu.:10.015  
##  Max.   :258.0   Max.   :11.70   Max.   :10.210

write.csv(sta,"sta.csv")
# performed paired two-sample t-test to evaluate whether significant difference exist between two variables
#BOD results before and after screening (250 mesh) of raw wastewater
#p-value is 0.3351, which is higher than 0.05,so there is no significant difference for BOD between screened Raw WW vs. raw WW.
t.test(Data$BODRawWW,Data$BODScreenedRaw250,paired=TRUE)

## 
##  Paired t-test
## 
## data:  Data$BODRawWW and Data$BODScreenedRaw250
## t = -1.0662, df = 5, p-value = 0.3351
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -71.63024  29.63024
## sample estimates:
## mean of the differences 
##                     -21

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$BODRawWW~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(600,1200),xlab="Date",ylab="BOD(mg/L)",cex=1.5,lwd=2,main="BOD Results Between Raw and 250 Mesh Screened Waste Water")
lines(Data$BODScreenedRaw250~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Raw WW", "250 Screened Raw WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")

#BOD results before and after filtering of raw wastewater
#p-value is 0.001752, which is lower than 0.05,so there is significant difference for BOD between filtered Raw WW vs. raw WW.
t.test(Data$BODRawWW,Data$BODFilteredRaw,paired=TRUE)

## 
##  Paired t-test
## 
## data:  Data$BODRawWW and Data$BODFilteredRaw
## t = 6.0703, df = 5, p-value = 0.001752
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  145.5734 359.4266
## sample estimates:
## mean of the differences 
##                   252.5

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$BODRawWW~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(400,1200),xlab="Date",ylab="BOD(mg/L)",cex=1.5,lwd=2,main="BOD Results Between Unfiltered and Filtered Raw Waste Water")
lines(Data$BODFilteredRaw~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Raw WW", "Filtered Raw WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")

#BOD results before and after filtering of effluent
#p-value is 0.007235, which is lower than 0.05,so there is significant difference for BOD between filtered effluent vs. raw effluent.
t.test(Data$BODEffluent,Data$BODFilteredEffluent,paired=TRUE)

## 
##  Paired t-test
## 
## data:  Data$BODEffluent and Data$BODFilteredEffluent
## t = 4.368, df = 5, p-value = 0.007235
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  150.8841 582.4493
## sample estimates:
## mean of the differences 
##                366.6667

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$BODEffluent~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(400,1200),xlab="Date",ylab="BOD(mg/L)",cex=1.5,lwd=2,main="BOD Results Between Unfiltered and Filtered Effluent Waste Water")
lines(Data$BODFilteredEffluent~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Effluent WW", "Filtered Effluent WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")

#TSS results before and after the screening (250 mesh) of raw wastewater
#p-value is 0.2307, which is higher than 0.05,so there is no significant difference for TSS between screened raw WW  vs. raw WW.
t.test(Data$TSSRawWW,Data$TSSScreenedRaw250,paired=TRUE)

## 
##  Paired t-test
## 
## data:  Data$TSSRawWW and Data$TSSScreenedRaw250
## t = 1.3643, df = 5, p-value = 0.2307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -14.73701  48.07035
## sample estimates:
## mean of the differences 
##                16.66667

par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$TSSRawWW~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(0,300),xlab="Date",ylab="TSS(mg/L)",cex=1.5,lwd=2,main="TSS Results Between Raw and 250 Mesh Screened Waste Water")
lines(Data$TSSScreenedRaw250~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Raw WW", "250 Screened Raw WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")

Test whether BOD is affected by using different mesh size for screening of Raw WW.

BOD <- read.csv("BOD.csv")
BOD

##        Date          Type  BOD
## 1  01/19/16 RawUnscreened  886
## 2  01/20/16 RawUnscreened 1085
## 3  01/21/16 RawUnscreened 1069
## 4  01/19/16        Raw200  895
## 5  01/20/16        Raw200  926
## 6  01/21/16        Raw200 1249
## 7  01/19/16        Raw250  867
## 8  01/20/16        Raw250 1105
## 9  01/21/16        Raw250 1016
## 10 01/19/16        Raw325  835
## 11 01/20/16        Raw325 1033
## 12 01/21/16        Raw325 1146

#obtain summary statistics
#install.packages("psych")
library(psych)

## Warning: package 'psych' was built under R version 3.2.4

bod.mat <- describeBy(BOD$BOD,BOD$Type,mat=TRUE)
bod.mat

##    item        group1 vars n     mean       sd median  trimmed      mad
## 11    1        Raw200    1 3 1023.333 196.0468    926 1023.333  45.9606
## 12    2        Raw250    1 3  996.000 120.2539   1016  996.000 131.9514
## 13    3        Raw325    1 3 1004.667 157.4241   1033 1004.667 167.5338
## 14    4 RawUnscreened    1 3 1013.333 110.5637   1069 1013.333  23.7216
##    min  max range       skew  kurtosis        se
## 11 895 1249   354  0.3741015 -2.333333 113.18765
## 12 867 1105   238 -0.1617144 -2.333333  69.42862
## 13 835 1146   311 -0.1741508 -2.333333  90.88882
## 14 886 1085   199 -0.3758519 -2.333333  63.83399

write.csv(bod.mat,"bodmat.csv")
anova(lm(BOD ~ Type, BOD))

## Analysis of Variance Table
## 
## Response: BOD
##           Df Sum Sq Mean Sq F value Pr(>F)
## Type       3   1235   411.6  0.0183 0.9963
## Residuals  8 179804 22475.5

#the above analysis is the same as using
summary(aov(BOD ~ Type, data = BOD))

##             Df Sum Sq Mean Sq F value Pr(>F)
## Type         3   1235     412   0.018  0.996
## Residuals    8 179804   22475

#F value: This is the ratio produced by dividing the Mean Square for the Model by the Mean Square for the Error.  It tests how well the model as a whole (adjusted for the mean) accounts for the dependent variable's behavior.  This represents a test of the null hypothesis that all parameters except the intercept are zero.  If the F value is greater than or equal to the critical value of F (Fcritical), it indicates there are significant differences among population means.  If F value is less than Fcritical, it indicates there are no significant differences among population means.  Fcritical is a function of the degrees of freedom of numerator and denominator and the significance level (??).  The numerator degrees of freedom equals the number of groups minus 1 and the denominator degrees of freedom equals the number of observations minus the number of groups.
#Pr(>F): This is the significance probability associated with the  statistic.  If Pr(>F) is less than or equal to the significance level (??) of 0.05, it indicates there are significant differences among population means.  If Pr(>F) is greater than the significance level (??) of 0.05, it indicates there are no significant differences among population means.
#here, The numerator degrees of freedom is 3 and the denominator degrees of freedom is 8.  
#p-value is higher than 0.05, so there is no effect on BOD by using different mesh size screen.
#Plot comparing different mesh sizes
par(mfrow=c(1,1),mar=c(5,5,2,1))
boxplot(BOD$BOD~BOD$Type,ylab="BOD (mg/L)")

Test the relationship between TSS and BOD based on monthly average data in 2007-2016.

df <- read.csv("Basedata.csv")
head(df)

##       Date BOD TSS
## 1 1/1/2007 279 174
## 2 2/1/2007 355 141
## 3 3/1/2007 249 152
## 4 4/1/2007 359  97
## 5 5/1/2007 505  40
## 6 6/1/2007 626  54

#time series plot for TSS and BOD
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(df$TSS~as.Date(as.character(df$Date),"%m/%d/%Y"),type="b",col="blue",ylim=c(10,10000),xlab="Date",ylab="Concentration (mg/L)",cex=1.2,lwd=2,main="Monthly Average Concentrations",log="y",pch=19)
lines(df$BOD~as.Date(as.character(df$Date),"%m/%d/%Y"),type="b",col="red",cex=1.2,lwd=2, pch=21)
legend("topright",as.vector(c("BOD","TSS")),cex=1.2,ncol=1,text.col=c("red","blue"),bty="n")

#qqplot 
par(mfrow=c(1,1),mar=c(5,5,2,1))
qqplot(df$TSS,df$BOD,xlab="TSS(mg/L)",ylab="BOD (mg/L)")

#the plot indicates three outliers need to be removed 
par(mfrow=c(1,1),mar=c(5,5,2,1))
df1<-df[c(-53,-54,-56),] #remove the May,June and Aug 2011 value
plot(df1$TSS~as.Date(as.character(df1$Date),"%m/%d/%Y"),type="b",col="blue",ylim=c(10,10000),xlab="Date",ylab="Concentration (mg/L)",cex=1.2,lwd=2,main="Monthly Average Concentrations",log="y",pch=19)
lines(df1$BOD~as.Date(as.character(df1$Date),"%m/%d/%Y"),type="b",col="red",cex=1.2,lwd=2, pch=21)
legend("topright",as.vector(c("BOD","TSS")),cex=1.2,ncol=1,text.col=c("red","blue"),bty="n")

par(mfrow=c(1,1),mar=c(5,5,2,1))
qqplot(df1$TSS,df1$BOD,xlab="TSS(mg/L)",ylab="BOD (mg/L)")

#It appears that BOD and TSS are correlated. Correlation test.
cor.test(df1$BOD, df1$TSS)

## 
##  Pearson's product-moment correlation
## 
## data:  df1$BOD and df1$TSS
## t = 7.5025, df = 105, p-value = 2.114e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4515367 0.7018866
## sample estimates:
##       cor 
## 0.5907511

#install.packages("Hmisc")
#library("Hmisc")
#rcorr(as.matrix(df1[c(2,3)]))
#install.packages("psych")
library("psych")
pairs.panels(df1[c(2,3)], gap = 0)

#Perform a linear fit test.
fit<-lm(BOD~TSS,df1)
anova(fit)

## Analysis of Variance Table
## 
## Response: BOD
##            Df   Sum Sq Mean Sq F value    Pr(>F)    
## TSS         1  9124209 9124209  56.287 2.114e-11 ***
## Residuals 105 17020636  162101                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

modsum = summary(fit)
modsum

## 
## Call:
## lm(formula = BOD ~ TSS, data = df1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -966.38 -303.01    1.09  238.74 1337.24 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 641.9035    69.4529   9.242 3.07e-15 ***
## TSS           1.5186     0.2024   7.502 2.11e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 402.6 on 105 degrees of freedom
## Multiple R-squared:  0.349,  Adjusted R-squared:  0.3428 
## F-statistic: 56.29 on 1 and 105 DF,  p-value: 2.114e-11

#plot to test the linear model; perform a linear fit and plot
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit, las = 1)

par(opar)
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(BOD~TSS,df1,main="Monthly Average Concentrations",xlab="TSS(mg/L)",ylab="BOD (mg/L)",xlim=c(0,1500),ylim=c(0,3000))
abline(fit,col="red",lwd=1.5)
#Puttig 95% confidence interval
newx<-seq(0,1500)
prd<-predict(fit,newdata=data.frame(TSS=newx),interval = c("confidence"), 
level = 0.95,type="response")
lines(newx,prd[,2],col="blue",lty=2)
lines(newx,prd[,3],col="blue",lty=2)
#putting linear equation, adjusted r2 and p-value on the plot
lm_coef <- round(coef(fit), 2) # extract coefficients
text(1200,1700,bquote(BOD == .(lm_coef[2]) * TSS + .(lm_coef[1])))
r2 = modsum$adj.r.squared
my.p = modsum$coefficients[2,4]
mylabel = bquote(italic(R)^2 == .(format(r2, digits = 3)))
text(1200,1450, labels = mylabel)
plabel = bquote(italic(p) == .(format(my.p, digits = 3)))
text(1200,1200, plabel)

Test whether the fit can be improved by log-transformation of the data .

#logfit
fitlog<-lm(log(df1$BOD)~log(df1$TSS))
anova(fitlog)

## Analysis of Variance Table
## 
## Response: log(df1$BOD)
##               Df  Sum Sq Mean Sq F value    Pr(>F)    
## log(df1$TSS)   1  9.1419  9.1419   49.36 2.208e-10 ***
## Residuals    105 19.4471  0.1852                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

modsumlog = summary(fitlog)
modsumlog

## 
## Call:
## lm(formula = log(df1$BOD) ~ log(df1$TSS))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.13829 -0.23501  0.09733  0.25549  1.04020 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.28879    0.36818  11.649  < 2e-16 ***
## log(df1$TSS)  0.47114    0.06706   7.026 2.21e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4304 on 105 degrees of freedom
## Multiple R-squared:  0.3198, Adjusted R-squared:  0.3133 
## F-statistic: 49.36 on 1 and 105 DF,  p-value: 2.208e-10

#the log model did not show improvement based on R2 and p value.