#set working directory
#setwd("G:/Transportation/NJ_TRANSIT/PeterSpinney/")
setwd("C:/Users/ywang/Dropbox/R/BOD_Analysis/")
#load data
Data <- read.csv("Data.csv")
Data
## Date BODRawWW BODScreenedRaw250 BODFilteredRaw BODEffluent
## 1 12/15/15 727 779 537 580
## 2 12/16/15 1079 1136 811 1044
## 3 12/17/15 1080 1149 696 1131
## 4 01/19/16 886 867 660 906
## 5 01/20/16 1085 1105 980 950
## 6 01/21/16 1069 1016 727 1197
## BODFilteredEffluent lint50 lint100 lint150 LintPercentSoild TSSRawWW
## 1 455 213 281 274 23.76 155
## 2 822 1346 1095 1635 15.97 222
## 3 530 517 160 339 25.81 258
## 4 628 246 216 378 14.30 168
## 5 605 295 278 283 7.14 233
## 6 568 188 403 416 9.36 215
## TSSScreenedRaw250 TSSEffluent phraw pheffluent
## 1 150 162 11.65 10.02
## 2 189 258 11.52 9.84
## 3 194 230 11.70 10.00
## 4 146 253 11.16 9.89
## 5 254 210 11.31 10.21
## 6 218 208 9.91 9.86
#obtain the summary statistics
sta<-summary(Data)
sta
## Date BODRawWW BODScreenedRaw250 BODFilteredRaw
## 01/19/16:1 Min. : 727.0 Min. : 779.0 Min. :537.0
## 01/20/16:1 1st Qu.: 931.8 1st Qu.: 904.2 1st Qu.:669.0
## 01/21/16:1 Median :1074.0 Median :1060.5 Median :711.5
## 12/15/15:1 Mean : 987.7 Mean :1008.7 Mean :735.2
## 12/16/15:1 3rd Qu.:1079.8 3rd Qu.:1128.2 3rd Qu.:790.0
## 12/17/15:1 Max. :1085.0 Max. :1149.0 Max. :980.0
## BODEffluent BODFilteredEffluent lint50 lint100
## Min. : 580 Min. :455.0 Min. : 188.0 Min. : 160.0
## 1st Qu.: 917 1st Qu.:539.5 1st Qu.: 221.2 1st Qu.: 231.5
## Median : 997 Median :586.5 Median : 270.5 Median : 279.5
## Mean : 968 Mean :601.3 Mean : 467.5 Mean : 405.5
## 3rd Qu.:1109 3rd Qu.:622.2 3rd Qu.: 461.5 3rd Qu.: 372.5
## Max. :1197 Max. :822.0 Max. :1346.0 Max. :1095.0
## lint150 LintPercentSoild TSSRawWW TSSScreenedRaw250
## Min. : 274.0 Min. : 7.14 Min. :155.0 Min. :146.0
## 1st Qu.: 297.0 1st Qu.:10.60 1st Qu.:179.8 1st Qu.:159.8
## Median : 358.5 Median :15.13 Median :218.5 Median :191.5
## Mean : 554.2 Mean :16.06 Mean :208.5 Mean :191.8
## 3rd Qu.: 406.5 3rd Qu.:21.81 3rd Qu.:230.2 3rd Qu.:212.0
## Max. :1635.0 Max. :25.81 Max. :258.0 Max. :254.0
## TSSEffluent phraw pheffluent
## Min. :162.0 Min. : 9.91 Min. : 9.840
## 1st Qu.:208.5 1st Qu.:11.20 1st Qu.: 9.867
## Median :220.0 Median :11.41 Median : 9.945
## Mean :220.2 Mean :11.21 Mean : 9.970
## 3rd Qu.:247.2 3rd Qu.:11.62 3rd Qu.:10.015
## Max. :258.0 Max. :11.70 Max. :10.210
write.csv(sta,"sta.csv")
# performed paired two-sample t-test to evaluate whether significant difference exist between two variables
#BOD results before and after screening (250 mesh) of raw wastewater
#p-value is 0.3351, which is higher than 0.05,so there is no significant difference for BOD between screened Raw WW vs. raw WW.
t.test(Data$BODRawWW,Data$BODScreenedRaw250,paired=TRUE)
##
## Paired t-test
##
## data: Data$BODRawWW and Data$BODScreenedRaw250
## t = -1.0662, df = 5, p-value = 0.3351
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -71.63024 29.63024
## sample estimates:
## mean of the differences
## -21
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$BODRawWW~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(600,1200),xlab="Date",ylab="BOD(mg/L)",cex=1.5,lwd=2,main="BOD Results Between Raw and 250 Mesh Screened Waste Water")
lines(Data$BODScreenedRaw250~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Raw WW", "250 Screened Raw WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")
#BOD results before and after filtering of raw wastewater
#p-value is 0.001752, which is lower than 0.05,so there is significant difference for BOD between filtered Raw WW vs. raw WW.
t.test(Data$BODRawWW,Data$BODFilteredRaw,paired=TRUE)
##
## Paired t-test
##
## data: Data$BODRawWW and Data$BODFilteredRaw
## t = 6.0703, df = 5, p-value = 0.001752
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 145.5734 359.4266
## sample estimates:
## mean of the differences
## 252.5
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$BODRawWW~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(400,1200),xlab="Date",ylab="BOD(mg/L)",cex=1.5,lwd=2,main="BOD Results Between Unfiltered and Filtered Raw Waste Water")
lines(Data$BODFilteredRaw~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Raw WW", "Filtered Raw WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")
#BOD results before and after filtering of effluent
#p-value is 0.007235, which is lower than 0.05,so there is significant difference for BOD between filtered effluent vs. raw effluent.
t.test(Data$BODEffluent,Data$BODFilteredEffluent,paired=TRUE)
##
## Paired t-test
##
## data: Data$BODEffluent and Data$BODFilteredEffluent
## t = 4.368, df = 5, p-value = 0.007235
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 150.8841 582.4493
## sample estimates:
## mean of the differences
## 366.6667
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$BODEffluent~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(400,1200),xlab="Date",ylab="BOD(mg/L)",cex=1.5,lwd=2,main="BOD Results Between Unfiltered and Filtered Effluent Waste Water")
lines(Data$BODFilteredEffluent~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Effluent WW", "Filtered Effluent WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")
#TSS results before and after the screening (250 mesh) of raw wastewater
#p-value is 0.2307, which is higher than 0.05,so there is no significant difference for TSS between screened raw WW vs. raw WW.
t.test(Data$TSSRawWW,Data$TSSScreenedRaw250,paired=TRUE)
##
## Paired t-test
##
## data: Data$TSSRawWW and Data$TSSScreenedRaw250
## t = 1.3643, df = 5, p-value = 0.2307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -14.73701 48.07035
## sample estimates:
## mean of the differences
## 16.66667
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(Data$TSSRawWW~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="blue",ylim=c(0,300),xlab="Date",ylab="TSS(mg/L)",cex=1.5,lwd=2,main="TSS Results Between Raw and 250 Mesh Screened Waste Water")
lines(Data$TSSScreenedRaw250~as.Date(as.character(Data$Date),"%m/%d/%y"),type="b",col="red",cex=1.5,lwd=2)
legend("top",as.vector(c("Raw WW", "250 Screened Raw WW")),cex=1.2,ncol=1,text.col=c("blue","red"),bty="n")
Test whether BOD is affected by using different mesh size for screening of Raw WW.
BOD <- read.csv("BOD.csv")
BOD
## Date Type BOD
## 1 01/19/16 RawUnscreened 886
## 2 01/20/16 RawUnscreened 1085
## 3 01/21/16 RawUnscreened 1069
## 4 01/19/16 Raw200 895
## 5 01/20/16 Raw200 926
## 6 01/21/16 Raw200 1249
## 7 01/19/16 Raw250 867
## 8 01/20/16 Raw250 1105
## 9 01/21/16 Raw250 1016
## 10 01/19/16 Raw325 835
## 11 01/20/16 Raw325 1033
## 12 01/21/16 Raw325 1146
#obtain summary statistics
#install.packages("psych")
library(psych)
## Warning: package 'psych' was built under R version 3.2.4
bod.mat <- describeBy(BOD$BOD,BOD$Type,mat=TRUE)
bod.mat
## item group1 vars n mean sd median trimmed mad
## 11 1 Raw200 1 3 1023.333 196.0468 926 1023.333 45.9606
## 12 2 Raw250 1 3 996.000 120.2539 1016 996.000 131.9514
## 13 3 Raw325 1 3 1004.667 157.4241 1033 1004.667 167.5338
## 14 4 RawUnscreened 1 3 1013.333 110.5637 1069 1013.333 23.7216
## min max range skew kurtosis se
## 11 895 1249 354 0.3741015 -2.333333 113.18765
## 12 867 1105 238 -0.1617144 -2.333333 69.42862
## 13 835 1146 311 -0.1741508 -2.333333 90.88882
## 14 886 1085 199 -0.3758519 -2.333333 63.83399
write.csv(bod.mat,"bodmat.csv")
anova(lm(BOD ~ Type, BOD))
## Analysis of Variance Table
##
## Response: BOD
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 3 1235 411.6 0.0183 0.9963
## Residuals 8 179804 22475.5
#the above analysis is the same as using
summary(aov(BOD ~ Type, data = BOD))
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 3 1235 412 0.018 0.996
## Residuals 8 179804 22475
#F value: This is the ratio produced by dividing the Mean Square for the Model by the Mean Square for the Error. It tests how well the model as a whole (adjusted for the mean) accounts for the dependent variable's behavior. This represents a test of the null hypothesis that all parameters except the intercept are zero. If the F value is greater than or equal to the critical value of F (Fcritical), it indicates there are significant differences among population means. If F value is less than Fcritical, it indicates there are no significant differences among population means. Fcritical is a function of the degrees of freedom of numerator and denominator and the significance level (??). The numerator degrees of freedom equals the number of groups minus 1 and the denominator degrees of freedom equals the number of observations minus the number of groups.
#Pr(>F): This is the significance probability associated with the statistic. If Pr(>F) is less than or equal to the significance level (??) of 0.05, it indicates there are significant differences among population means. If Pr(>F) is greater than the significance level (??) of 0.05, it indicates there are no significant differences among population means.
#here, The numerator degrees of freedom is 3 and the denominator degrees of freedom is 8.
#p-value is higher than 0.05, so there is no effect on BOD by using different mesh size screen.
#Plot comparing different mesh sizes
par(mfrow=c(1,1),mar=c(5,5,2,1))
boxplot(BOD$BOD~BOD$Type,ylab="BOD (mg/L)")
Test the relationship between TSS and BOD based on monthly average data in 2007-2016.
df <- read.csv("Basedata.csv")
head(df)
## Date BOD TSS
## 1 1/1/2007 279 174
## 2 2/1/2007 355 141
## 3 3/1/2007 249 152
## 4 4/1/2007 359 97
## 5 5/1/2007 505 40
## 6 6/1/2007 626 54
#time series plot for TSS and BOD
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(df$TSS~as.Date(as.character(df$Date),"%m/%d/%Y"),type="b",col="blue",ylim=c(10,10000),xlab="Date",ylab="Concentration (mg/L)",cex=1.2,lwd=2,main="Monthly Average Concentrations",log="y",pch=19)
lines(df$BOD~as.Date(as.character(df$Date),"%m/%d/%Y"),type="b",col="red",cex=1.2,lwd=2, pch=21)
legend("topright",as.vector(c("BOD","TSS")),cex=1.2,ncol=1,text.col=c("red","blue"),bty="n")
#qqplot
par(mfrow=c(1,1),mar=c(5,5,2,1))
qqplot(df$TSS,df$BOD,xlab="TSS(mg/L)",ylab="BOD (mg/L)")
#the plot indicates three outliers need to be removed
par(mfrow=c(1,1),mar=c(5,5,2,1))
df1<-df[c(-53,-54,-56),] #remove the May,June and Aug 2011 value
plot(df1$TSS~as.Date(as.character(df1$Date),"%m/%d/%Y"),type="b",col="blue",ylim=c(10,10000),xlab="Date",ylab="Concentration (mg/L)",cex=1.2,lwd=2,main="Monthly Average Concentrations",log="y",pch=19)
lines(df1$BOD~as.Date(as.character(df1$Date),"%m/%d/%Y"),type="b",col="red",cex=1.2,lwd=2, pch=21)
legend("topright",as.vector(c("BOD","TSS")),cex=1.2,ncol=1,text.col=c("red","blue"),bty="n")
par(mfrow=c(1,1),mar=c(5,5,2,1))
qqplot(df1$TSS,df1$BOD,xlab="TSS(mg/L)",ylab="BOD (mg/L)")
#It appears that BOD and TSS are correlated. Correlation test.
cor.test(df1$BOD, df1$TSS)
##
## Pearson's product-moment correlation
##
## data: df1$BOD and df1$TSS
## t = 7.5025, df = 105, p-value = 2.114e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4515367 0.7018866
## sample estimates:
## cor
## 0.5907511
#install.packages("Hmisc")
#library("Hmisc")
#rcorr(as.matrix(df1[c(2,3)]))
#install.packages("psych")
library("psych")
pairs.panels(df1[c(2,3)], gap = 0)
#Perform a linear fit test.
fit<-lm(BOD~TSS,df1)
anova(fit)
## Analysis of Variance Table
##
## Response: BOD
## Df Sum Sq Mean Sq F value Pr(>F)
## TSS 1 9124209 9124209 56.287 2.114e-11 ***
## Residuals 105 17020636 162101
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsum = summary(fit)
modsum
##
## Call:
## lm(formula = BOD ~ TSS, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -966.38 -303.01 1.09 238.74 1337.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 641.9035 69.4529 9.242 3.07e-15 ***
## TSS 1.5186 0.2024 7.502 2.11e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 402.6 on 105 degrees of freedom
## Multiple R-squared: 0.349, Adjusted R-squared: 0.3428
## F-statistic: 56.29 on 1 and 105 DF, p-value: 2.114e-11
#plot to test the linear model; perform a linear fit and plot
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit, las = 1)
par(opar)
par(mfrow=c(1,1),mar=c(5,5,2,1))
plot(BOD~TSS,df1,main="Monthly Average Concentrations",xlab="TSS(mg/L)",ylab="BOD (mg/L)",xlim=c(0,1500),ylim=c(0,3000))
abline(fit,col="red",lwd=1.5)
#Puttig 95% confidence interval
newx<-seq(0,1500)
prd<-predict(fit,newdata=data.frame(TSS=newx),interval = c("confidence"),
level = 0.95,type="response")
lines(newx,prd[,2],col="blue",lty=2)
lines(newx,prd[,3],col="blue",lty=2)
#putting linear equation, adjusted r2 and p-value on the plot
lm_coef <- round(coef(fit), 2) # extract coefficients
text(1200,1700,bquote(BOD == .(lm_coef[2]) * TSS + .(lm_coef[1])))
r2 = modsum$adj.r.squared
my.p = modsum$coefficients[2,4]
mylabel = bquote(italic(R)^2 == .(format(r2, digits = 3)))
text(1200,1450, labels = mylabel)
plabel = bquote(italic(p) == .(format(my.p, digits = 3)))
text(1200,1200, plabel)
Test whether the fit can be improved by log-transformation of the data .
#logfit
fitlog<-lm(log(df1$BOD)~log(df1$TSS))
anova(fitlog)
## Analysis of Variance Table
##
## Response: log(df1$BOD)
## Df Sum Sq Mean Sq F value Pr(>F)
## log(df1$TSS) 1 9.1419 9.1419 49.36 2.208e-10 ***
## Residuals 105 19.4471 0.1852
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modsumlog = summary(fitlog)
modsumlog
##
## Call:
## lm(formula = log(df1$BOD) ~ log(df1$TSS))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.13829 -0.23501 0.09733 0.25549 1.04020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.28879 0.36818 11.649 < 2e-16 ***
## log(df1$TSS) 0.47114 0.06706 7.026 2.21e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4304 on 105 degrees of freedom
## Multiple R-squared: 0.3198, Adjusted R-squared: 0.3133
## F-statistic: 49.36 on 1 and 105 DF, p-value: 2.208e-10
#the log model did not show improvement based on R2 and p value.