Problem Set # 5

Luis Enrique Ramos

date()
## [1] "Sun Nov 18 13:43:51 2012"

Due Date: November 29, 2012
Total Points: 50

The file Georgia.zip contains ESRI shape files in a folder called GeorgiaEduc. The dbf contains the percentage of Georgia county residents with a bachelor’s degree along with other countywide information. Let the dependent variable be PctBach (percent of population with a bachelor's degree) and the explanatory variables be TotPop90, PctRural, PctEld, PctFB, PctPov, PctBlack.

a. Download the zip file, unzip it and use the readShapeSpatial() function from the maptools package to get the data into R. Hint: After unzipping the shape files are in the directory Georgia. (10)

tmp = download.file("http://myweb.fsu.edu/jelsner/Georgia.zip", "Georgia.zip", 
    mode = "wb")
unzip("Georgia.zip")
suppressMessages(require(maptools))
Bachelors = readShapeSpatial("Georgia/GeorgiaEduc")

b. Start with a multiple regression model using all six explanatory variables listed above. Create a final model by removing variables that are not significant in explaining percentage of bachelor degrees. (10)

model1 = lm(PctBach ~ TotPop90 + PctRural + PctEld + PctFB + PctPov + PctBlack, 
    data = Bachelors)
summary(model1)
## 
## Call:
## lm(formula = PctBach ~ TotPop90 + PctRural + PctEld + PctFB + 
##     PctPov + PctBlack, data = Bachelors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.369 -2.052 -0.111  1.175 19.894 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.50e+01   1.60e+00    9.37  < 2e-16 ***
## TotPop90     2.33e-05   4.55e-06    5.11  8.7e-07 ***
## PctRural    -4.27e-02   1.30e-02   -3.28   0.0013 ** 
## PctEld      -7.88e-02   1.15e-01   -0.69   0.4935    
## PctFB        1.25e+00   2.98e-01    4.18  4.6e-05 ***
## PctPov      -1.55e-01   6.60e-02   -2.35   0.0201 *  
## PctBlack     2.08e-02   2.37e-02    0.88   0.3809    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.35 on 167 degrees of freedom
## Multiple R-squared: 0.649,   Adjusted R-squared: 0.637 
## F-statistic: 51.6 on 6 and 167 DF,  p-value: <2e-16
step(model1)
## Start:  AIC=427.2
## PctBach ~ TotPop90 + PctRural + PctEld + PctFB + PctPov + PctBlack
## 
##            Df Sum of Sq  RSS AIC
## - PctEld    1       5.3 1876 426
## - PctBlack  1       8.6 1879 426
## <none>                  1870 427
## - PctPov    1      61.7 1932 431
## - PctRural  1     120.5 1991 436
## - PctFB     1     195.9 2066 443
## - TotPop90  1     292.4 2163 450
## 
## Step:  AIC=425.7
## PctBach ~ TotPop90 + PctRural + PctFB + PctPov + PctBlack
## 
##            Df Sum of Sq  RSS AIC
## - PctBlack  1      10.9 1886 425
## <none>                  1876 426
## - PctPov    1     100.1 1976 433
## - PctRural  1     137.1 2013 436
## - PctFB     1     228.9 2104 444
## - TotPop90  1     287.2 2163 448
## 
## Step:  AIC=424.7
## PctBach ~ TotPop90 + PctRural + PctFB + PctPov
## 
##            Df Sum of Sq  RSS AIC
## <none>                  1886 425
## - PctPov    1       137 2024 435
## - PctRural  1       152 2038 436
## - PctFB     1       228 2114 443
## - TotPop90  1       320 2206 450
## 
## Call:
## lm(formula = PctBach ~ TotPop90 + PctRural + PctFB + PctPov, 
##     data = Bachelors)
## 
## Coefficients:
## (Intercept)     TotPop90     PctRural        PctFB       PctPov  
##    1.44e+01     2.37e-05    -4.64e-02     1.30e+00    -1.31e-01
modelFinal = lm(formula = PctBach ~ TotPop90 + PctRural + PctFB + PctPov, data = Bachelors)
summary(modelFinal)
## 
## Call:
## lm(formula = PctBach ~ TotPop90 + PctRural + PctFB + PctPov, 
##     data = Bachelors)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.165 -2.066 -0.087  1.311 19.584 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.44e+01   1.42e+00   10.14  < 2e-16 ***
## TotPop90     2.37e-05   4.43e-06    5.35  2.8e-07 ***
## PctRural    -4.64e-02   1.26e-02   -3.69  0.00030 ***
## PctFB        1.30e+00   2.87e-01    4.52  1.2e-05 ***
## PctPov      -1.31e-01   3.73e-02   -3.51  0.00058 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.34 on 169 degrees of freedom
## Multiple R-squared: 0.646,   Adjusted R-squared: 0.638 
## F-statistic: 77.2 on 4 and 169 DF,  p-value: <2e-16
scatter.smooth(modelFinal$fitted, modelFinal$residuals)

plot of chunk linearity/variance

require(sm)
## Loading required package: sm
## Package `sm', version 2.2-4.1 Copyright (C) 1997, 2000, 2005, 2007, 2008,
## A.W.Bowman & A.Azzalini Type help(sm) for summary information
res = residuals(modelFinal)
sm.density(res, xlab = "Model Residuals", model = "Normal")

plot of chunk normality

qqnorm(modelFinal$residuals)
qqline(modelFinal$residuals)

plot of chunk normality2

There is some evidence against normality and constant variance. Residuals do not exactly follow a normal distribution. Remediation measures seems necessary.

c. Use the significant explanatory variables and create a geographic regression model using a fixed bandwidth. Plot a choropleth map of the predictions from the model. (10)

suppressMessages(require(spgwr))
Bachelors.bw = gwr.sel(PctBach ~ TotPop90 + PctRural + PctFB + PctPov, data = Bachelors)
## Bandwidth: 241605 CV score: 2012 
## Bandwidth: 390534 CV score: 2052 
## Bandwidth: 149561 CV score: 1995 
## Bandwidth: 92675 CV score: 2100 
## Bandwidth: 184719 CV score: 1993 
## Bandwidth: 173020 CV score: 1991 
## Bandwidth: 170165 CV score: 1991 
## Bandwidth: 167827 CV score: 1991 
## Bandwidth: 168455 CV score: 1991 
## Bandwidth: 168480 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168476 CV score: 1991 
## Bandwidth: 168475 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991
Bachelors.gwr = gwr(PctBach ~ TotPop90 + PctRural + PctFB + PctPov, data = Bachelors, 
    coords = cbind(Bachelors$X_COORD, Bachelors$Y_COORD), bandwidth = Bachelors.bw)
## Warning: data is Spatial* object, ignoring coords argument
Bachelors.gwr
## Call:
## gwr(formula = PctBach ~ TotPop90 + PctRural + PctFB + PctPov, 
##     data = Bachelors, coords = cbind(Bachelors$X_COORD, Bachelors$Y_COORD), 
##     bandwidth = Bachelors.bw)
## Kernel function: gwr.Gauss 
## Fixed bandwidth: 168474 
## Summary of GWR coefficient estimates:
##                   Min.   1st Qu.    Median   3rd Qu.      Max. Global
## X.Intercept.  1.12e+01  1.33e+01  1.45e+01  1.52e+01  1.58e+01  14.39
## TotPop90      1.35e-05  1.83e-05  2.20e-05  2.62e-05  3.37e-05   0.00
## PctRural     -6.24e-02 -5.14e-02 -4.46e-02 -3.89e-02 -2.58e-02  -0.05
## PctFB         4.54e-01  9.40e-01  1.43e+00  2.00e+00  2.52e+00   1.30
## PctPov       -1.46e-01 -1.39e-01 -1.34e-01 -1.21e-01 -9.26e-02  -0.13
suppressMessages(require(akima))
suppressMessages(require(fields))
grd = interp(Bachelors$X_COORD, Bachelors$Y_COORD, Bachelors$PctBach)
str(grd)
## List of 3
##  $ x: num [1:40] 635964 646829 657695 668560 679425 ...
##  $ y: num [1:40] 3394900 3407150 3419399 3431649 3443899 ...
##  $ z: num [1:40, 1:40] NA NA NA NA NA NA NA NA NA NA ...
image.plot(grd, legend.lab = "Percentage with Bachelors")
points(Bachelors$X_COORD, Bachelors$Y_COORD, cex = 1 + Bachelors$PctBach/max(Bachelors$PctBach))

plot of chunk  spatial plot map

plot(Bachelors)

plot of chunk  choropleth map

brks = round(quantile(Bachelors$PctBach, probs = seq(0, 1, 0.2)), digits = 2)
ints = findInterval(Bachelors$PctBach, brks, all.inside = TRUE)
cls = rev(heat.colors(5))
plot(Bachelors, col = cls[ints])
legend(x = "topleft", legend = leglabs(brks), cex = 0.9, fill = cls)
title(main = "Percentage Population with Bachelors Degree - County Level")

plot of chunk  choropleth map

d. Plot a choropleth map of the percent poverty coefficient. (10)

brks = round(quantile(Bachelors$PctPov, probs = seq(0, 1, 0.2)), digits = 2)
ints = findInterval(Bachelors$PctPov, brks, all.inside = TRUE)
cls = rev(topo.colors(5))
plot(Bachelors, col = cls[ints])
legend(x = "topleft", legend = leglabs(brks), cex = 0.9, fill = cls)
title(main = "Percent Poverty by County Level")

plot of chunk unnamed-chunk-4

e. Plot a choropleth map of the R squared value. (10).

predictVal = slot(Bachelors.gwr$SDF, "data")
brks = cut(predictVal$localR2, 6)
ints = as.integer(brks)
cls = terrain.colors(6)
plot(Bachelors, col = cls[ints])
legend(x = "topleft", legend = levels(brks), fill = cls, bty = "n", title = "Local R Squared", 
    horiz = FALSE, cex = 0.9)

plot of chunk  R^2 map