Read the data
# Download the drinking-water quality monitoring data set from GitHub.
# stringsAsFactors = TRUE pins the pre-R-4.0 default: the recorded summary()
# output lists per-level counts for the text columns, i.e. they were read as
# factors. Without this, R >= 4.0 reads them as plain character vectors.
mydata <- read.csv(
  url("https://raw.githubusercontent.com/JennierJ/CUNY_DATA_608/master/Final_Project/Drinking_Water_Quality_Distribution_Monitoring_Data.csv"),
  stringsAsFactors = TRUE
)

# View() opens a spreadsheet-style viewer, which only makes sense when a
# person is at the console; skip it for Rscript / knitting runs.
if (interactive()) {
  View(mydata)
}
Explore the data
# Peek at the first six rows to check column names and value formats.
head(x = mydata, n = 6L)
## Sample.Number Sample.Date Sample.Time Sample.Site Sample.class
## 1 NA 01/01/2015 12:19 1S07 Operational
## 2 NA 01/01/2015 11:15 1S04 Operational
## 3 NA 01/01/2015 10:09 1S03A Operational
## 4 NA 01/01/2015 10:41 1S03B Operational
## 5 NA 01/01/2015 09:38 11550 Compliance
## 6 NA 01/01/2015 08:41 13850 Compliance
## Location
## 1 SS - Shaft 7 of City Tunnel No. 1 - W/S Sedgwick Ave OPP W 167th St (Tun 1)
## 2 SS - Shaft 4 of City Tunnel No.1 - IFO 2780 Reservoir Ave, E/S Reservoir Ave,\n1st SS N/O Strong Street, at the intersection of Reservoir & Goulden Aves.
## 3 SS - Shaft 3A of City Tunnel No. 2 - IFO 823 S/S E 233rd St, W/O Bronxwood Ave
## 4 SS - Shaft 3B of City Tunnel No. 3 - Mosholu Ave, W/O Jerome Ave
## 5 SS - IFO 1058 S/S E Gun Hill Rd, 1st SS E/O Hone Ave, 12 inch
## 6 SS - IFO 1778 E/S Jerome Ave, 1st SS S/O E 176th St, 12 inch
## Residual.Free.Chlorine..mg.L. Turbidity..NTU. Fluoride..mg.L.
## 1 0.58 0.96 0.79
## 2 0.71 0.94 0.80
## 3 0.79 0.93 0.79
## 4 0.77 0.93 0.80
## 5 0.74 0.95 NA
## 6 0.59 1.08 NA
## Coliform..Quanti.Tray...MPN..100mL. E.coli.Quanti.Tray...MPN.100mL.
## 1 <1 <1
## 2 <1 <1
## 3 <1 <1
## 4 <1 <1
## 5 <1 <1
## 6 <1 <1
# Per-column overview: quantiles and NA counts for numeric columns, level
# counts for factor columns.
summary(object = mydata)
## Sample.Number Sample.Date Sample.Time Sample.Site
## Min. : 28 08/05/2015: 63 09:46 : 230 27000 : 1056
## 1st Qu.:18194 07/31/2015: 60 10:10 : 221 20900 : 1045
## Median :28968 09/08/2016: 60 09:30 : 216 1S03B : 1035
## Mean :30893 01/26/2016: 57 10:50 : 215 1S07 : 1032
## 3rd Qu.:44979 07/29/2016: 57 09:11 : 210 1S03A : 1027
## Max. :70207 01/27/2016: 55 09:37 : 210 50300 : 960
## NA's :8864 (Other) :45677 (Other):44727 (Other):39874
## Sample.class
## Compliance :28160
## Operational :17465
## Resample_Compliance : 403
## Resample_Operational: 1
##
##
##
## Location
## SS - Shaft 3B of City Tunnel No. 3 - Mosholu Ave, W/O Jerome Ave : 1065
## SS - Shaft 7 of City Tunnel No. 1 - W/S Sedgwick Ave OPP W 167th St (Tun 1) : 1062
## SS - Shaft 3A of City Tunnel No. 2 - IFO 823 S/S E 233rd St, W/O Bronxwood Ave: 1057
## SS - IFO 177 S/S Sands St, btw Gold & Bridge Sts, 30 inch : 1041
## SS - IFO 383 W/S Hooper St, btw S 1st & S 2nd Sts, 72 inch : 995
## SS - W/S Clove Rd, 1st S/O Genesee St, 72" : 960
## (Other) :39849
## Residual.Free.Chlorine..mg.L. Turbidity..NTU. Fluoride..mg.L.
## Min. :-9.9900 Min. : 0.0700 Min. :0.03
## 1st Qu.: 0.4600 1st Qu.: 0.6400 1st Qu.:0.68
## Median : 0.6000 Median : 0.7600 Median :0.70
## Mean : 0.5942 Mean : 0.7589 Mean :0.71
## 3rd Qu.: 0.7300 3rd Qu.: 0.8800 3rd Qu.:0.73
## Max. : 2.2000 Max. :33.8000 Max. :0.89
## NA's :3 NA's :155 NA's :40120
## Coliform..Quanti.Tray...MPN..100mL. E.coli.Quanti.Tray...MPN.100mL.
## <1 :44531 - : 1
## <1 : 1291 <1 : 1291
## 1 : 69 <1 :44734
## >200.5 : 29 1 : 3
## 2 : 21
## 3.1 : 16
## (Other): 72
Preliminary plots
# Pick a single day: 01/01/2015.
mydata_01_01_2015 <- subset(mydata, Sample.Date == "01/01/2015")

# The data viewer is only useful in an interactive session.
if (interactive()) {
  View(mydata_01_01_2015)
}

# Sample.Time holds "HH:MM" text (a factor or a character vector, depending on
# how the file was read), which plot() cannot place on a continuous axis.
# Parse date + time into POSIXct so both plots below get a real time axis.
sample_time_01_01_2015 <- as.POSIXct(
  paste(
    as.character(mydata_01_01_2015$Sample.Date),
    as.character(mydata_01_01_2015$Sample.Time)
  ),
  format = "%m/%d/%Y %H:%M",
  # A fixed zone keeps parsing deterministic across machines; only the clock
  # times are used for plotting.
  tz = "UTC"
)

# How does residual free chlorine vary during the day on 01/01/2015?
plot(
  sample_time_01_01_2015,
  mydata_01_01_2015$Residual.Free.Chlorine..mg.L.,
  xlab = "Sample time",
  ylab = "Residual free chlorine (mg/L)"
)

# How does turbidity vary during the day on 01/01/2015?
plot(
  sample_time_01_01_2015,
  mydata_01_01_2015$Turbidity..NTU.,
  xlab = "Sample time",
  ylab = "Turbidity (NTU)"
)

# Fit a simple linear regression of residual free chlorine on turbidity for
# the 01/01/2015 samples.
# Columns are referenced by name through `data =` instead of as
# mydata_01_01_2015$... vectors: the fitted coefficients are the same, but the
# model stores clean term names (the slope is labelled `Turbidity..NTU.` in
# printed output) and can be used with predict(newdata = ...).
liner <- lm(
  Residual.Free.Chlorine..mg.L. ~ Turbidity..NTU.,
  data = mydata_01_01_2015
)
liner
##
## Call:
## lm(formula = mydata_01_01_2015$Residual.Free.Chlorine..mg.L. ~
## mydata_01_01_2015$Turbidity..NTU.)
##
## Coefficients:
## (Intercept) mydata_01_01_2015$Turbidity..NTU.
## 0.09721 0.51610
# Assess the fit: coefficient t-tests, residual standard error, R-squared and
# the overall F-statistic.
summary(object = liner)
##
## Call:
## lm(formula = mydata_01_01_2015$Residual.Free.Chlorine..mg.L. ~
## mydata_01_01_2015$Turbidity..NTU.)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21558 -0.08090 -0.01687 0.05346 0.24120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09721 0.26051 0.373 0.7114
## mydata_01_01_2015$Turbidity..NTU. 0.51610 0.27221 1.896 0.0667 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1281 on 33 degrees of freedom
## Multiple R-squared: 0.09823, Adjusted R-squared: 0.07091
## F-statistic: 3.595 on 1 and 33 DF, p-value: 0.06675
# Residual analysis
model_fitted <- fitted(liner)
model_resid <- resid(liner)

# Residuals against fitted values. The axis labels are spelled out so they
# match the labels plot() derives by default from the call expressions.
plot(model_fitted, model_resid, xlab = "fitted(liner)", ylab = "resid(liner)")

# Normal Q-Q plot of the residuals, with its reference line.
qqnorm(model_resid)
qqline(model_resid)
