Problem Set # 5

Guang Xing

date()
## [1] "Sat Nov 24 22:00:44 2012"

Due Date: November 29, 2012
Total Points: 50

The file Georgia.zip contains ESRI shape files in a folder called GeorgiaEduc. The dbf contains the percentage of Georgia county residents with a bachelor degree along with other countywide information. Let the dependent variable be PctBach (percent of population with a bachelor's degree) and the explanatory variables be TotPop90, PctRural, PctEld, PctFB, PctPov, PctBlack.

a. Download the zip file, unzip it and use the readShapeSpatial() function from the maptools package to get the data into R. Hint: After unzipping the shape files are in the directory Georgia. (10)

require(maptools)
## Loading required package: maptools
## Warning: package 'maptools' was built under R version 2.15.2
## Loading required package: foreign
## Loading required package: sp
## Warning: package 'sp' was built under R version 2.15.2
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
tmp = download.file("http://myweb.fsu.edu/jelsner/Georgia.zip", "Georgia.zip", 
    mode = "wb")
unzip("Georgia.zip")
GE = readShapeSpatial("Georgia/GeorgiaEduc")
slotNames(GE)
## [1] "data"        "polygons"    "plotOrder"   "bbox"        "proj4string"
str(GE, max.level = 2)
## Formal class 'SpatialPolygonsDataFrame' [package "sp"] with 5 slots
##   ..@ data       :'data.frame':  174 obs. of  25 variables:
##   .. ..- attr(*, "data_types")= chr [1:25] "N" "F" "F" "N" ...
##   ..@ polygons   :List of 174
##   .. .. [list output truncated]
##   ..@ plotOrder  : int [1:174] 162 18 33 89 25 17 54 21 164 135 ...
##   ..@ bbox       : num [1:2, 1:2] 627306 3368056 1082188 3879805
##   .. ..- attr(*, "dimnames")=List of 2
##   ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slots
head(GE@data)
##   FID_1      AREA PERIMETER G_UTM_ G_UTM_ID            AREANAME AREAKEY
## 0   130 1.331e+09    207205    132      133  GA, Appling County   13001
## 1   155 8.929e+08    154640    157      158 GA, Atkinson County   13003
## 2   146 7.434e+08    130431    148      146    GA, Bacon County   13005
## 3   156 9.054e+08    185737    158      155    GA, Baker County   13007
## 4    74 6.942e+08    151347     76       79  GA, Baldwin County   13009
## 5    22 6.065e+08    103518     24       23    GA, Banks County   13011
##   X_COORD Y_COORD KEY_VAL FID_2 AreaKey_1 Latitude Longitud TotPop90
## 0  941397 3521760       0     0     13001    31.75   -82.29    15744
## 1  895553 3471920       0     1     13003    31.29   -82.87     6213
## 2  930946 3502790       0     2     13005    31.56   -82.45     9566
## 3  745399 3474760       0     3     13007    31.33   -84.45     3615
## 4  849431 3665550   13009     4     13009    33.07   -83.25    39530
## 5  819317 3807620   13011     5     13011    34.35   -83.50    10308
##   PctRural PctBach PctEld PctFB PctPov PctBlack  ID      X       Y
## 0     75.6     8.2  11.43  0.64   19.9    20.76 133 941397 3521764
## 1    100.0     6.4  11.77  1.58   26.0    26.86 158 895553 3471916
## 2     61.7     6.6  11.11  0.27   24.1    15.42 146 930946 3502787
## 3    100.0     9.4  13.17  0.11   24.8    51.67 155 745399 3474765
## 4     42.7    13.3   8.64  1.43   17.5    42.39  79 849431 3665553
## 5    100.0     6.4  11.37  0.34   15.1     3.49  23 819317 3807616
##   Distance
## 0        0
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0

b. Start with a multiple regression model using all six explanatory variables listed above. Create a final model by removing variables that are not significant in explaining percentage of bachelor degrees. (10)

model = lm(PctBach ~ TotPop90 + PctRural + PctEld + PctFB + PctPov + PctBlack, 
    data = GE)
summary(model)
## 
## Call:
## lm(formula = PctBach ~ TotPop90 + PctRural + PctEld + PctFB + 
##     PctPov + PctBlack, data = GE)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.369 -2.052 -0.111  1.175 19.894 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.50e+01   1.60e+00    9.37  < 2e-16 ***
## TotPop90     2.33e-05   4.55e-06    5.11  8.7e-07 ***
## PctRural    -4.27e-02   1.30e-02   -3.28   0.0013 ** 
## PctEld      -7.88e-02   1.15e-01   -0.69   0.4935    
## PctFB        1.25e+00   2.98e-01    4.18  4.6e-05 ***
## PctPov      -1.55e-01   6.60e-02   -2.35   0.0201 *  
## PctBlack     2.08e-02   2.37e-02    0.88   0.3809    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.35 on 167 degrees of freedom
## Multiple R-squared: 0.649,   Adjusted R-squared: 0.637 
## F-statistic: 51.6 on 6 and 167 DF,  p-value: <2e-16 
## 
drop1(model)
## Single term deletions
## 
## Model:
## PctBach ~ TotPop90 + PctRural + PctEld + PctFB + PctPov + PctBlack
##          Df Sum of Sq  RSS AIC
## <none>                1870 427
## TotPop90  1     292.4 2163 450
## PctRural  1     120.5 1991 436
## PctEld    1       5.3 1876 426
## PctFB     1     195.9 2066 443
## PctPov    1      61.7 1932 431
## PctBlack  1       8.6 1879 426
model1 = lm(PctBach ~ TotPop90 + PctRural + PctFB + PctPov + PctBlack, data = GE)
summary(model1)
## 
## Call:
## lm(formula = PctBach ~ TotPop90 + PctRural + PctFB + PctPov + 
##     PctBlack, data = GE)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.330 -1.995 -0.191  1.352 20.085 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.45e+01   1.43e+00   10.19  < 2e-16 ***
## TotPop90     2.29e-05   4.51e-06    5.07  1.0e-06 ***
## PctRural    -4.46e-02   1.27e-02   -3.50  0.00059 ***
## PctFB        1.30e+00   2.87e-01    4.53  1.1e-05 ***
## PctPov      -1.76e-01   5.87e-02   -2.99  0.00317 ** 
## PctBlack     2.31e-02   2.34e-02    0.99  0.32409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.34 on 168 degrees of freedom
## Multiple R-squared: 0.648,   Adjusted R-squared: 0.638 
## F-statistic:   62 on 5 and 168 DF,  p-value: <2e-16 
## 
drop1(model1)
## Single term deletions
## 
## Model:
## PctBach ~ TotPop90 + PctRural + PctFB + PctPov + PctBlack
##          Df Sum of Sq  RSS AIC
## <none>                1876 426
## TotPop90  1     287.2 2163 448
## PctRural  1     137.1 2013 436
## PctFB     1     228.9 2104 444
## PctPov    1     100.1 1976 433
## PctBlack  1      10.9 1886 425
model2 = lm(PctBach ~ TotPop90 + PctRural + PctFB + PctPov, data = GE)
summary(model2)
## 
## Call:
## lm(formula = PctBach ~ TotPop90 + PctRural + PctFB + PctPov, 
##     data = GE)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.165 -2.066 -0.087  1.311 19.584 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.44e+01   1.42e+00   10.14  < 2e-16 ***
## TotPop90     2.37e-05   4.43e-06    5.35  2.8e-07 ***
## PctRural    -4.64e-02   1.26e-02   -3.69  0.00030 ***
## PctFB        1.30e+00   2.87e-01    4.52  1.2e-05 ***
## PctPov      -1.31e-01   3.73e-02   -3.51  0.00058 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 3.34 on 169 degrees of freedom
## Multiple R-squared: 0.646,   Adjusted R-squared: 0.638 
## F-statistic: 77.2 on 4 and 169 DF,  p-value: <2e-16 
## 
drop1(model2)
## Single term deletions
## 
## Model:
## PctBach ~ TotPop90 + PctRural + PctFB + PctPov
##          Df Sum of Sq  RSS AIC
## <none>                1886 425
## TotPop90  1       320 2206 450
## PctRural  1       152 2038 436
## PctFB     1       228 2114 443
## PctPov    1       137 2024 435

From summaries, PctEld and PctBlack should be removed. So the final model is model2, which includes four explanatory variables of TotPop90,PctRural,PctFB and PctPov.

c. Use the significant explanatory variables and create a geographic regression model using a fixed bandwidth. Plot a choropleth map of the predictions from the model. (10)

require(spgwr)
## Loading required package: spgwr
## Warning: package 'spgwr' was built under R version 2.15.2
## NOTE: This package does not constitute approval of GWR as a method of
## spatial analysis; see example(gwr)
bw = gwr.sel(PctBach ~ TotPop90 + PctRural + PctFB + PctPov, data = GE)
## Bandwidth: 241605 CV score: 2012 
## Bandwidth: 390534 CV score: 2052 
## Bandwidth: 149561 CV score: 1995 
## Bandwidth: 92675 CV score: 2100 
## Bandwidth: 184719 CV score: 1993 
## Bandwidth: 173020 CV score: 1991 
## Bandwidth: 170165 CV score: 1991 
## Bandwidth: 167827 CV score: 1991 
## Bandwidth: 168455 CV score: 1991 
## Bandwidth: 168480 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
## Bandwidth: 168474 CV score: 1991 
modelgwr = gwr(PctBach ~ TotPop90 + PctRural + PctFB + PctPov, data = GE, bandwidth = bw)
df = slot(modelgwr$SDF, "data")
brks = round(quantile(df$pred, probs = seq(0, 1, 0.2)), digits = 2)
ints = findInterval(df$pred, brks, all.inside = TRUE)
cls = rev(heat.colors(5))
plot(GE, col = cls[ints])
legend(x = "topleft", legend = leglabs(brks), fill = cls, bty = "n", horiz = FALSE, 
    cex = 0.8)
title(main = "Predicted (gwr) percent of population with a bachelor's degree")

plot of chunk unnamed-chunk-3

d. Plot a choropleth map of the percent poverty coefficient. (10)

brks = cut(df$PctPov, 5)
ints = as.integer(brks)
cls = rev(terrain.colors(5))
plot(GE, col = cls[ints])
legend(x = "topleft", legend = levels(brks), fill = cls, bty = "n", title = "Percent Poverty Coefficient", 
    horiz = FALSE, cex = 0.8)

plot of chunk unnamed-chunk-4

e. Plot a choropleth map of the R squared value. (10).

brks = cut(df$localR2, 5)
ints = as.integer(brks)
cls = rev(terrain.colors(5))
plot(GE, col = cls[ints])
legend(x = "topleft", legend = levels(brks), fill = cls, bty = "n", title = "Local R Squared", 
    horiz = FALSE, cex = 0.8)

plot of chunk unnamed-chunk-5