Read the data
# Load the drinking-water monitoring data directly from the project repository.
#
# stringsAsFactors is set explicitly because the exploration below relies on
# text columns being factors (the recorded summary() output lists level counts
# for Sample.Date, Sample.Time, Sample.Site, ...). R >= 4.0 reads text columns
# as character by default, which would change that output.
mydata <- read.csv(
  url("https://raw.githubusercontent.com/JennierJ/CUNY_DATA_608/master/Final_Project/Drinking_Water_Quality_Distribution_Monitoring_Data.csv"),
  stringsAsFactors = TRUE
)

# View() opens a data viewer window; skip it when the script is run
# non-interactively so the run does not depend on a viewer being available.
if (interactive()) {
  View(mydata)
}
Explore the data
# Print the first six rows to check the column names and value formats.
head(mydata, n = 6L)
##   Sample.Number Sample.Date Sample.Time Sample.Site Sample.class
## 1            NA  01/01/2015       12:19        1S07  Operational
## 2            NA  01/01/2015       11:15        1S04  Operational
## 3            NA  01/01/2015       10:09       1S03A  Operational
## 4            NA  01/01/2015       10:41       1S03B  Operational
## 5            NA  01/01/2015       09:38       11550   Compliance
## 6            NA  01/01/2015       08:41       13850   Compliance
##                                                                                                                                                    Location
## 1                                                                               SS - Shaft 7 of City Tunnel No. 1 - W/S Sedgwick Ave OPP W 167th St (Tun 1)
## 2 SS - Shaft 4 of City Tunnel No.1 - IFO 2780 Reservoir Ave, E/S Reservoir Ave,\n1st SS N/O Strong Street, at the intersection of Reservoir & Goulden Aves.
## 3                                                                            SS - Shaft 3A of City Tunnel No. 2 - IFO 823 S/S E 233rd St, W/O Bronxwood Ave
## 4                                                                                          SS - Shaft 3B of City Tunnel No. 3 - Mosholu Ave, W/O Jerome Ave
## 5                                                                                             SS - IFO 1058 S/S E Gun Hill Rd, 1st SS E/O Hone Ave, 12 inch
## 6                                                                                              SS - IFO 1778 E/S Jerome Ave, 1st SS S/O E 176th St, 12 inch
##   Residual.Free.Chlorine..mg.L. Turbidity..NTU. Fluoride..mg.L.
## 1                          0.58            0.96            0.79
## 2                          0.71            0.94            0.80
## 3                          0.79            0.93            0.79
## 4                          0.77            0.93            0.80
## 5                          0.74            0.95              NA
## 6                          0.59            1.08              NA
##   Coliform..Quanti.Tray...MPN..100mL. E.coli.Quanti.Tray...MPN.100mL.
## 1                                 <1                              <1 
## 2                                 <1                              <1 
## 3                                 <1                              <1 
## 4                                 <1                              <1 
## 5                                 <1                              <1 
## 6                                 <1                              <1
# Per-column summary: quantiles and NA counts for numeric columns, level
# counts for factor columns.
# NOTE(review): the recorded output below shows a minimum residual free
# chlorine of -9.99, which looks like a placeholder value rather than a
# measurement -- confirm before using this column in a model.
summary(object = mydata)
##  Sample.Number       Sample.Date     Sample.Time     Sample.Site   
##  Min.   :   28   08/05/2015:   63   09:46  :  230   27000  : 1056  
##  1st Qu.:18194   07/31/2015:   60   10:10  :  221   20900  : 1045  
##  Median :28968   09/08/2016:   60   09:30  :  216    1S03B : 1035  
##  Mean   :30893   01/26/2016:   57   10:50  :  215    1S07  : 1032  
##  3rd Qu.:44979   07/29/2016:   57   09:11  :  210    1S03A : 1027  
##  Max.   :70207   01/27/2016:   55   09:37  :  210   50300  :  960  
##  NA's   :8864    (Other)   :45677   (Other):44727   (Other):39874  
##                Sample.class  
##  Compliance          :28160  
##  Operational         :17465  
##  Resample_Compliance :  403  
##  Resample_Operational:    1  
##                              
##                              
##                              
##                                                                            Location    
##  SS - Shaft 3B of City Tunnel No. 3 - Mosholu Ave, W/O Jerome Ave              : 1065  
##  SS - Shaft 7 of City Tunnel No. 1 - W/S Sedgwick Ave OPP W 167th St (Tun 1)   : 1062  
##  SS - Shaft 3A of City Tunnel No. 2 - IFO 823 S/S E 233rd St, W/O Bronxwood Ave: 1057  
##  SS - IFO 177 S/S Sands St, btw Gold & Bridge Sts, 30 inch                     : 1041  
##  SS - IFO 383 W/S Hooper St, btw S 1st & S 2nd Sts, 72 inch                    :  995  
##  SS - W/S Clove Rd, 1st S/O Genesee St, 72"                                    :  960  
##  (Other)                                                                       :39849  
##  Residual.Free.Chlorine..mg.L. Turbidity..NTU.   Fluoride..mg.L.
##  Min.   :-9.9900               Min.   : 0.0700   Min.   :0.03   
##  1st Qu.: 0.4600               1st Qu.: 0.6400   1st Qu.:0.68   
##  Median : 0.6000               Median : 0.7600   Median :0.70   
##  Mean   : 0.5942               Mean   : 0.7589   Mean   :0.71   
##  3rd Qu.: 0.7300               3rd Qu.: 0.8800   3rd Qu.:0.73   
##  Max.   : 2.2000               Max.   :33.8000   Max.   :0.89   
##  NA's   :3                     NA's   :155       NA's   :40120  
##  Coliform..Quanti.Tray...MPN..100mL. E.coli.Quanti.Tray...MPN.100mL.
##  <1     :44531                       -  :    1                      
##  <1     : 1291                       <1 : 1291                      
##  1      :   69                       <1 :44734                      
##  >200.5 :   29                       1  :    3                      
##  2      :   21                                                      
##  3.1    :   16                                                      
##  (Other):   72
Preliminary plots
# Pick a single day (01/01/2015) to look at variation within the day.
mydata_01_01_2015 <- subset(mydata, Sample.Date == "01/01/2015")

# View() opens a data viewer window; only do that in interactive sessions.
if (interactive()) {
  View(mydata_01_01_2015)
}

# Parse the sampling timestamps so time is drawn on a continuous axis.
# Plotting Sample.Time directly treats it as a category (and, as a factor, it
# still carries every level from the full data set), which hides the actual
# spacing between samples. paste() converts factor or character input alike.
# tz is fixed to "UTC" only so the recorded wall-clock values are plotted
# as-is; the time zone of the source data is not stated in this file.
sample_time_01_01_2015 <- as.POSIXct(
  paste(mydata_01_01_2015$Sample.Date, mydata_01_01_2015$Sample.Time),
  format = "%m/%d/%Y %H:%M",
  tz = "UTC"
)

# How does residual free chlorine vary during the day on 01/01/2015?
plot(
  sample_time_01_01_2015,
  mydata_01_01_2015$Residual.Free.Chlorine..mg.L.,
  xlab = "Sample time",
  ylab = "Residual free chlorine (mg/L)"
)

# How does turbidity vary during the day on 01/01/2015?
plot(
  sample_time_01_01_2015,
  mydata_01_01_2015$Turbidity..NTU.,
  xlab = "Sample time",
  ylab = "Turbidity (NTU)"
)

# Fit a simple linear regression of residual free chlorine on turbidity for
# the 01/01/2015 samples, then print the fitted coefficients.
# NOTE(review): the formula refers to the columns through `$` instead of using
# `data =`; it is left in that form so the printed call and coefficient names
# keep matching the recorded output that follows.
liner <- lm(
  formula = mydata_01_01_2015$Residual.Free.Chlorine..mg.L. ~
    mydata_01_01_2015$Turbidity..NTU.
)
liner
## 
## Call:
## lm(formula = mydata_01_01_2015$Residual.Free.Chlorine..mg.L. ~ 
##     mydata_01_01_2015$Turbidity..NTU.)
## 
## Coefficients:
##                       (Intercept)  mydata_01_01_2015$Turbidity..NTU.  
##                           0.09721                            0.51610
# Assess the fit: residual quantiles, coefficient standard errors and
# p-values, R-squared and the overall F-test.
summary(object = liner)
## 
## Call:
## lm(formula = mydata_01_01_2015$Residual.Free.Chlorine..mg.L. ~ 
##     mydata_01_01_2015$Turbidity..NTU.)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.21558 -0.08090 -0.01687  0.05346  0.24120 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                        0.09721    0.26051   0.373   0.7114  
## mydata_01_01_2015$Turbidity..NTU.  0.51610    0.27221   1.896   0.0667 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1281 on 33 degrees of freedom
## Multiple R-squared:  0.09823,    Adjusted R-squared:  0.07091 
## F-statistic: 3.595 on 1 and 33 DF,  p-value: 0.06675
# Residual analysis

# Residuals against fitted values.
plot(x = fitted(liner), y = resid(liner))

# Normal Q-Q plot of the residuals, with the reference line added on top.
qqnorm(y = resid(liner))
qqline(y = resid(liner))