library(leaps)
## Warning: package 'leaps' was built under R version 2.15.3
library(HH)
## Warning: package 'HH' was built under R version 2.15.3
## Loading required package: lattice
## Loading required package: grid
## Loading required package: multcomp
## Warning: package 'multcomp' was built under R version 2.15.3
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: splines
## Loading required package: RColorBrewer
## Loading required package: latticeExtra
## Warning: package 'latticeExtra' was built under R version 2.15.3
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 2.15.3
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 2.15.3
## Attaching package: 'reshape'
## The following object(s) are masked from 'package:plyr':
##
## rename, round_any
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 2.15.3
rain <- read.csv(choose.files())
names(rain)
## [1] "Year" "APMAM" "APSAB" "APSLAKE" "OPBPC" "OPRC" "OPSLAKE"
## [8] "BSAAM"
# 1.which weather station is the best predictor of runoff?
cor(rain$BSAAM, rain$APMAM, use = "complete.obs")
## [1] 0.2386
cor(rain$BSAAM, rain$APSAB, use = "complete.obs")
## [1] 0.1833
cor(rain$BSAAM, rain$APSLAKE, use = "complete.obs")
## [1] 0.2493
cor(rain$BSAAM, rain$OPBPC, use = "complete.obs")
## [1] 0.8857
cor(rain$BSAAM, rain$OPRC, use = "complete.obs")
## [1] 0.9196
cor(rain$BSAAM, rain$OPSLAKE, use = "complete.obs")
## [1] 0.9384
# I will choose OPSLAKE station
Rain1 <- lm(BSAAM ~ OPSLAKE, data = rain)
summary(Rain1) #Adj.r2=0.878
##
## Call:
## lm(formula = BSAAM ~ OPSLAKE, data = rain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17604 -5338 332 3411 20876
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27015 3219 8.39 1.9e-10 ***
## OPSLAKE 3752 216 17.39 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8920 on 41 degrees of freedom
## Multiple R-squared: 0.881, Adjusted R-squared: 0.878
## F-statistic: 303 on 1 and 41 DF, p-value: <2e-16
shapiro.test(Rain1$resid) #residuals are normally distributed
##
## Shapiro-Wilk normality test
##
## data: Rain1$resid
## W = 0.9677, p-value = 0.2639
# 2.If you had to choose all but one weather stations, and you wanted to
# estimate runoff effectively, which station would you keep open and why
Rain2 <- regsubsets(BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB + APMAM,
data = rain)
plot(Rain2)
# the five best predictors-no APMAM
Rain3 <- lm(BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB, data = rain)
summary(Rain3)
##
## Call:
## lm(formula = BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB,
## data = rain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12696 -4933 -1396 4187 18550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15930.8 3972.5 4.01 0.00028 ***
## OPSLAKE 2212.6 740.3 2.99 0.00495 **
## OPRC 1915.7 631.5 3.03 0.00440 **
## OPBPC 68.9 453.5 0.15 0.88000
## APSLAKE 2263.9 1269.4 1.78 0.08271 .
## APSAB -673.4 1419.0 -0.47 0.63787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7450 on 37 degrees of freedom
## Multiple R-squared: 0.925, Adjusted R-squared: 0.915
## F-statistic: 91 on 5 and 37 DF, p-value: <2e-16
# but OPBPC and APSAB are not significant
# 3.if you could keep open two stations which two would you keep open?
Rain4 <- regsubsets(BSAAM ~ OPSLAKE + OPRC + OPBPC + APSLAKE + APSAB + APMAM,
nvmax = 2, data = rain)
plot(Rain4)
# so two best predictors are OPSLAKE and APSLAKE
Rain5 <- lm(BSAAM ~ OPSLAKE + APSLAKE, data = rain)
summary(Rain5)
##
## Call:
## lm(formula = BSAAM ~ OPSLAKE + APSLAKE, data = rain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13336 -5893 -172 4220 19500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19145 3812 5.02 1.1e-05 ***
## OPSLAKE 3690 196 18.83 < 2e-16 ***
## APSLAKE 1769 554 3.19 0.0027 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8060 on 40 degrees of freedom
## Multiple R-squared: 0.905, Adjusted R-squared: 0.9
## F-statistic: 190 on 2 and 40 DF, p-value: <2e-16
shapiro.test(Rain5$resid) #residuals are normally distributed
##
## Shapiro-Wilk normality test
##
## data: Rain5$resid
## W = 0.9724, p-value = 0.3806
plot(resid(Rain5), predict(Rain5))
plot(resid(Rain5), rain$BSAAM)
qqnorm(Rain5$resid)
qqline(Rain5$resid)