Intro to correlation and regression

library(leaps)
## Warning: package 'leaps' was built under R version 2.15.3
library(HH)
## Warning: package 'HH' was built under R version 2.15.3
## Loading required package: lattice
## Loading required package: grid
## Loading required package: multcomp
## Warning: package 'multcomp' was built under R version 2.15.3
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: splines
## Loading required package: RColorBrewer
## Loading required package: latticeExtra
## Warning: package 'latticeExtra' was built under R version 2.15.3
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 2.15.3
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 2.15.3
## Attaching package: 'reshape'
## The following object(s) are masked from 'package:plyr':
## 
## rename, round_any
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 2.15.3
rain <- read.csv(choose.files())
names(rain)
## [1] "Year"    "APMAM"   "APSAB"   "APSLAKE" "OPBPC"   "OPRC"    "OPSLAKE"
## [8] "BSAAM"
# 1.which weather station is the best predictor of runoff?
cor(rain$BSAAM, rain$APMAM, use = "complete.obs")
## [1] 0.2386
cor(rain$BSAAM, rain$APSAB, use = "complete.obs")
## [1] 0.1833
cor(rain$BSAAM, rain$APSLAKE, use = "complete.obs")
## [1] 0.2493
cor(rain$BSAAM, rain$OPBPC, use = "complete.obs")
## [1] 0.8857
cor(rain$BSAAM, rain$OPRC, use = "complete.obs")
## [1] 0.9196
cor(rain$BSAAM, rain$OPSLAKE, use = "complete.obs")
## [1] 0.9384
# I will choose OPSLAKE station
Rain1 <- lm(BSAAM ~ OPSLAKE, data = rain)
summary(Rain1)  #Adj.r2=0.878
## 
## Call:
## lm(formula = BSAAM ~ OPSLAKE, data = rain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17604  -5338    332   3411  20876 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    27015       3219    8.39  1.9e-10 ***
## OPSLAKE         3752        216   17.39  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 8920 on 41 degrees of freedom
## Multiple R-squared: 0.881,   Adjusted R-squared: 0.878 
## F-statistic:  303 on 1 and 41 DF,  p-value: <2e-16
shapiro.test(Rain1$resid)  #residuals are normally distributed
## 
##  Shapiro-Wilk normality test
## 
## data:  Rain1$resid 
## W = 0.9677, p-value = 0.2639
# 2.If you had to choose all but one weather stations, and you wanted to
# estimate runoff effectively, which station would you keep open and why
Rain2 <- regsubsets(BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB + APMAM, 
    data = rain)
plot(Rain2)

plot of chunk unnamed-chunk-3

# the five best predictors-no APMAM
Rain3 <- lm(BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB, data = rain)
summary(Rain3)
## 
## Call:
## lm(formula = BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB, 
##     data = rain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12696  -4933  -1396   4187  18550 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15930.8     3972.5    4.01  0.00028 ***
## OPSLAKE       2212.6      740.3    2.99  0.00495 ** 
## OPRC          1915.7      631.5    3.03  0.00440 ** 
## OPBPC           68.9      453.5    0.15  0.88000    
## APSLAKE       2263.9     1269.4    1.78  0.08271 .  
## APSAB         -673.4     1419.0   -0.47  0.63787    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 7450 on 37 degrees of freedom
## Multiple R-squared: 0.925,   Adjusted R-squared: 0.915 
## F-statistic:   91 on 5 and 37 DF,  p-value: <2e-16
# but OPBPC and APSAB are not significant
# 3.if you could keep open two stations which two would you keep open?
Rain4 <- regsubsets(BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB + APMAM, 
    nvmax = 2, data = rain)
plot(Rain4)

plot of chunk unnamed-chunk-4

# so two best predictors are OPSLAKE and APSLAKE
Rain5 <- lm(BSAAM ~ OPSLAKE + APSLAKE, data = rain)
summary(Rain5)
## 
## Call:
## lm(formula = BSAAM ~ OPSLAKE + APSLAKE, data = rain)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13336  -5893   -172   4220  19500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    19145       3812    5.02  1.1e-05 ***
## OPSLAKE         3690        196   18.83  < 2e-16 ***
## APSLAKE         1769        554    3.19   0.0027 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 8060 on 40 degrees of freedom
## Multiple R-squared: 0.905,   Adjusted R-squared:  0.9 
## F-statistic:  190 on 2 and 40 DF,  p-value: <2e-16
shapiro.test(Rain5$resid)  #residuals are normally distributed
## 
##  Shapiro-Wilk normality test
## 
## data:  Rain5$resid 
## W = 0.9724, p-value = 0.3806
plot(resid(Rain5), predict(Rain5))

plot of chunk unnamed-chunk-4

plot(resid(Rain5), rain$BSAAM)

plot of chunk unnamed-chunk-4

qqnorm(Rain5$resid)
qqline(Rain5$resid)

plot of chunk unnamed-chunk-4