# In Class Exercise -- Maps & Multiple Regression
## INSTALL LIBRARIES FOR SPATIAL DATA
install.packages("sp")
## Installing package(s) into 'C:/Program Files/RStudio/R/library' (as 'lib'
## is unspecified)
## Warning: 'lib = "C:/Program Files/RStudio/R/library"' is not writable
## Error: unable to install packages
library(sp)
library(maptools)
## Loading required package: foreign
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)
# READ DATA
USA <- readShapePoly("C:/Users/Hallie/Desktop/Spring 2013 Courses/Quantitative Methods/Data/USA copy.shp")
# Plot Data IT'S A MAP
plot(USA)
# Remove counties with no votes
USA <- USA[USA$Total > 0, ]
# List the pieces of the file
slotNames(USA)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
# summarize the file
summary(USA)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -124.73 -66.97
## y 24.96 49.37
## Is projected: NA
## proj4string : [NA]
## Data attributes:
## SP_ID NAME STATE_NAME STATE_FIPS
## 0 : 1 Washington: 32 Texas : 254 48 : 254
## 1 : 1 Jefferson : 26 Georgia : 159 13 : 159
## 10 : 1 Franklin : 25 Virginia: 134 51 : 134
## 100 : 1 Jackson : 24 Kentucky: 120 21 : 120
## 1000 : 1 Lincoln : 24 Missouri: 115 29 : 115
## 1001 : 1 Madison : 20 Kansas : 105 20 : 105
## (Other):3102 (Other) :2957 (Other) :2221 (Other):2221
## CNTY_FIPS FIPS AREA FIPS_num
## 001 : 48 01001 : 1 Min. : 2 Min. : 1001
## 003 : 48 01003 : 1 1st Qu.: 435 1st Qu.:19046
## 005 : 48 01005 : 1 Median : 622 Median :29214
## 009 : 47 01007 : 1 Mean : 966 Mean :30686
## 007 : 46 01009 : 1 3rd Qu.: 931 3rd Qu.:46010
## 011 : 46 01011 : 1 Max. :20175 Max. :56045
## (Other):2825 (Other):3102
## Bush Kerry County_F Nader
## Min. : 65 Min. : 12 Min. : 1001 Min. : 0
## 1st Qu.: 2941 1st Qu.: 1782 1st Qu.:19046 1st Qu.: 0
## Median : 6364 Median : 4041 Median :29214 Median : 14
## Mean : 19073 Mean : 17957 Mean :30686 Mean : 145
## 3rd Qu.: 15924 3rd Qu.: 10434 3rd Qu.:46010 3rd Qu.: 67
## Max. :954764 Max. :1670341 Max. :56045 Max. :13251
##
## Total Bush_pct Kerry_pct Nader_pct
## Min. : 77 Min. : 9.31 Min. : 7.17 Min. :0.000
## 1st Qu.: 4831 1st Qu.:52.73 1st Qu.:30.23 1st Qu.:0.000
## Median : 10416 Median :61.17 Median :38.49 Median :0.303
## Mean : 37176 Mean :60.66 Mean :38.94 Mean :0.401
## 3rd Qu.: 26599 3rd Qu.:69.37 3rd Qu.:46.79 3rd Qu.:0.633
## Max. :2625105 Max. :92.83 Max. :90.05 Max. :4.467
##
## MDratio pcturban pctfemhh pcincome
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 37.3 1st Qu.: 0.0 1st Qu.: 9.6 1st Qu.:15474
## Median : 65.6 Median : 33.5 Median :12.2 Median :17450
## Mean : 93.1 Mean : 35.3 Mean :13.0 Mean :17805
## 3rd Qu.: 117.6 3rd Qu.: 56.5 3rd Qu.:15.4 3rd Qu.:19818
## Max. :2189.5 Max. :100.0 Max. :41.1 Max. :58096
##
## pctpoor pctcoled unemploy homevalu
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.:11.1 1st Qu.: 9.0 1st Qu.: 3.90 1st Qu.: 35900
## Median :15.1 Median :11.6 Median : 5.30 Median : 44400
## Mean :16.5 Mean :13.1 Mean : 5.88 Mean : 52066
## 3rd Qu.:20.4 3rd Qu.:15.3 3rd Qu.: 7.20 3rd Qu.: 58600
## Max. :63.1 Max. :53.4 Max. :37.90 Max. :500001
##
## popdens Obese Noins HISP_LAT
## Min. : 0 Min. :0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 15 1st Qu.:0.320 1st Qu.:0.100 1st Qu.: 0.90
## Median : 39 Median :0.340 Median :0.120 Median : 1.80
## Mean : 194 Mean :0.335 Mean :0.129 Mean : 6.19
## 3rd Qu.: 93 3rd Qu.:0.360 3rd Qu.:0.150 3rd Qu.: 5.10
## Max. :53801 Max. :0.630 Max. :0.410 Max. :97.50
##
## MEDAGE2000 PEROVER65
## Min. : 0.0 Min. : 0.0
## 1st Qu.:35.2 1st Qu.:12.1
## Median :37.4 Median :14.4
## Mean :37.4 Mean :14.8
## 3rd Qu.:39.8 3rd Qu.:17.1
## Max. :54.3 Max. :34.7
##
# summarize the data
summary(USA@data)
## SP_ID NAME STATE_NAME STATE_FIPS
## 0 : 1 Washington: 32 Texas : 254 48 : 254
## 1 : 1 Jefferson : 26 Georgia : 159 13 : 159
## 10 : 1 Franklin : 25 Virginia: 134 51 : 134
## 100 : 1 Jackson : 24 Kentucky: 120 21 : 120
## 1000 : 1 Lincoln : 24 Missouri: 115 29 : 115
## 1001 : 1 Madison : 20 Kansas : 105 20 : 105
## (Other):3102 (Other) :2957 (Other) :2221 (Other):2221
## CNTY_FIPS FIPS AREA FIPS_num
## 001 : 48 01001 : 1 Min. : 2 Min. : 1001
## 003 : 48 01003 : 1 1st Qu.: 435 1st Qu.:19046
## 005 : 48 01005 : 1 Median : 622 Median :29214
## 009 : 47 01007 : 1 Mean : 966 Mean :30686
## 007 : 46 01009 : 1 3rd Qu.: 931 3rd Qu.:46010
## 011 : 46 01011 : 1 Max. :20175 Max. :56045
## (Other):2825 (Other):3102
## Bush Kerry County_F Nader
## Min. : 65 Min. : 12 Min. : 1001 Min. : 0
## 1st Qu.: 2941 1st Qu.: 1782 1st Qu.:19046 1st Qu.: 0
## Median : 6364 Median : 4041 Median :29214 Median : 14
## Mean : 19073 Mean : 17957 Mean :30686 Mean : 145
## 3rd Qu.: 15924 3rd Qu.: 10434 3rd Qu.:46010 3rd Qu.: 67
## Max. :954764 Max. :1670341 Max. :56045 Max. :13251
##
## Total Bush_pct Kerry_pct Nader_pct
## Min. : 77 Min. : 9.31 Min. : 7.17 Min. :0.000
## 1st Qu.: 4831 1st Qu.:52.73 1st Qu.:30.23 1st Qu.:0.000
## Median : 10416 Median :61.17 Median :38.49 Median :0.303
## Mean : 37176 Mean :60.66 Mean :38.94 Mean :0.401
## 3rd Qu.: 26599 3rd Qu.:69.37 3rd Qu.:46.79 3rd Qu.:0.633
## Max. :2625105 Max. :92.83 Max. :90.05 Max. :4.467
##
## MDratio pcturban pctfemhh pcincome
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 37.3 1st Qu.: 0.0 1st Qu.: 9.6 1st Qu.:15474
## Median : 65.6 Median : 33.5 Median :12.2 Median :17450
## Mean : 93.1 Mean : 35.3 Mean :13.0 Mean :17805
## 3rd Qu.: 117.6 3rd Qu.: 56.5 3rd Qu.:15.4 3rd Qu.:19818
## Max. :2189.5 Max. :100.0 Max. :41.1 Max. :58096
##
## pctpoor pctcoled unemploy homevalu
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.:11.1 1st Qu.: 9.0 1st Qu.: 3.90 1st Qu.: 35900
## Median :15.1 Median :11.6 Median : 5.30 Median : 44400
## Mean :16.5 Mean :13.1 Mean : 5.88 Mean : 52066
## 3rd Qu.:20.4 3rd Qu.:15.3 3rd Qu.: 7.20 3rd Qu.: 58600
## Max. :63.1 Max. :53.4 Max. :37.90 Max. :500001
##
## popdens Obese Noins HISP_LAT
## Min. : 0 Min. :0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 15 1st Qu.:0.320 1st Qu.:0.100 1st Qu.: 0.90
## Median : 39 Median :0.340 Median :0.120 Median : 1.80
## Mean : 194 Mean :0.335 Mean :0.129 Mean : 6.19
## 3rd Qu.: 93 3rd Qu.:0.360 3rd Qu.:0.150 3rd Qu.: 5.10
## Max. :53801 Max. :0.630 Max. :0.410 Max. :97.50
##
## MEDAGE2000 PEROVER65
## Min. : 0.0 Min. : 0.0
## 1st Qu.:35.2 1st Qu.:12.1
## Median :37.4 Median :14.4
## Mean :37.4 Mean :14.8
## 3rd Qu.:39.8 3rd Qu.:17.1
## Max. :54.3 Max. :34.7
##
#################################################
##Making Maps in R ##uses package “lattice” and function spplot() ##uses “classInt” package
display.brewer.all()
# let's make a 7 color 'spectral' palette
pal7 <- brewer.pal(7, "Spectral")
# to see the colors
display.brewer.pal(7, "Spectral")
# create a column that holds the percent of all votes that went to G.W.
# Bush in 2004
USA$BushPct <- USA$Bush/USA$Total
# create categories
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")
cats7
## style: quantile
## [0.09308,0.4746) [0.4746,0.5415) [0.5415,0.5907) [0.5907,0.6336)
## 444 444 444 444
## [0.6336,0.6801) [0.6801,0.7421) [0.7421,0.9283]
## 444 444 444
# output shows the range for BushPct within each category each group
# should have about 440 counties
SevenColors <- findColours(cats7, pal7)
# draw map using specified data and colors
plot(USA, col = SevenColors)
# Create a new column to hold the standardized percent bush
USA$BushPctZ <- (USA$BushPct - mean(USA$BushPct, na.rm = TRUE))/sd(USA$BushPct,
na.rm = TRUE)
# Create new categories with the standardized column and map
pal7 <- brewer.pal(7, "Spectral")
cats7 <- classIntervals(USA$BushPctZ, n = 7, style = "quantile")
SevenColors <- findColours(cats7, pal7)
plot(USA, col = SevenColors)
#################################
lm1 <- lm(BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT + MEDAGE2000, USA)
summary(lm1)
##
## Call:
## lm(formula = BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT +
## MEDAGE2000, data = USA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5687 -0.0733 0.0102 0.0808 0.2708
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.82e-01 2.43e-02 32.23 <2e-16 ***
## pcturban -1.32e-05 8.88e-05 -0.15 0.882
## pctfemhh -1.35e-02 5.35e-04 -25.19 <2e-16 ***
## pctpoor 2.97e-03 3.55e-04 8.36 <2e-16 ***
## HISP_LAT -2.91e-04 1.87e-04 -1.56 0.119
## MEDAGE2000 -1.25e-03 5.65e-04 -2.21 0.027 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.11 on 3102 degrees of freedom
## Multiple R-squared: 0.242, Adjusted R-squared: 0.241
## F-statistic: 198 on 5 and 3102 DF, p-value: <2e-16
# getting the residuals of the model lm1
lm1.resid <- resid(lm1)
plot(lm1.resid ~ USA$BushPct)
# these residuals look REALLY bad...let's try again
lmBush <- lm(BushPct ~ pctfemhh + homevalu, data = USA)
summary(lmBush)
##
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu, data = USA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6082 -0.0731 0.0089 0.0779 0.3558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.77e-01 5.60e-03 138.9 <2e-16 ***
## pctfemhh -1.01e-02 3.61e-04 -27.9 <2e-16 ***
## homevalu -7.60e-07 6.01e-08 -12.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.109 on 3105 degrees of freedom
## Multiple R-squared: 0.256, Adjusted R-squared: 0.255
## F-statistic: 533 on 2 and 3105 DF, p-value: <2e-16
lmBush.resid <- resid(lmBush)
plot(lmBush.resid ~ USA$BushPct)
# these residuals look bad STILL ... why? Are we missing a predictive
# variable?
## add a new column to USA to hold the residuals
USA$resid <- resid(lmBush)
pal3 <- brewer.pal(3, "Spectral")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")
ThreeColors <- findColours(cats3, pal3)
plot(USA, col = ThreeColors)
## add a new column to USA to hold the residuals
USA$resid <- resid(lmBush)
pal3 <- brewer.pal(3, "Spectral")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")
ThreeColors <- findColours(cats3, pal3)
plot(USA, col = ThreeColors)
# definitely a spatial trend in this map -- indicating there is something
# wrong with the model -- missing predictive variable?
names(USA)
## [1] "SP_ID" "NAME" "STATE_NAME" "STATE_FIPS" "CNTY_FIPS"
## [6] "FIPS" "AREA" "FIPS_num" "Bush" "Kerry"
## [11] "County_F" "Nader" "Total" "Bush_pct" "Kerry_pct"
## [16] "Nader_pct" "MDratio" "pcturban" "pctfemhh" "pcincome"
## [21] "pctpoor" "pctcoled" "unemploy" "homevalu" "popdens"
## [26] "Obese" "Noins" "HISP_LAT" "MEDAGE2000" "PEROVER65"
## [31] "BushPct" "BushPctZ" "resid"
lm3 <- lm(USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$MEDAGE2000 + USA$PEROVER65 +
USA$pcincome + USA$HIST_LAT + USA$unemploy)
## Error: invalid type (NULL) for variable 'USA$HIST_LAT'
summary(lm3)
## Error: error in evaluating the argument 'object' in selecting a method for
## function 'summary': Error: object 'lm3' not found
lm4 <- lm(USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 + USA$pcincome +
USA$unemploy)
summary(lm4)
##
## Call:
## lm(formula = USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 +
## USA$pcincome + USA$unemploy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5601 -0.0731 0.0080 0.0775 0.4037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.63e-01 1.30e-02 58.84 < 2e-16 ***
## USA$pctfemhh -1.01e-02 4.18e-04 -24.24 < 2e-16 ***
## USA$homevalu -1.42e-06 8.79e-08 -16.10 < 2e-16 ***
## USA$PEROVER65 -2.58e-03 5.28e-04 -4.88 1.1e-06 ***
## USA$pcincome 5.53e-06 6.04e-07 9.15 < 2e-16 ***
## USA$unemploy -1.95e-03 7.11e-04 -2.74 0.0061 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.107 on 3102 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.284
## F-statistic: 247 on 5 and 3102 DF, p-value: <2e-16
lm4.resid <- resid(lm4)
plot(lm4.resid ~ USA$BushPct)
lm5 <- lm(USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 + USA$pcincome)
summary(lm5)
##
## Call:
## lm(formula = USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 +
## USA$pcincome)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5494 -0.0738 0.0078 0.0772 0.4077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.55e-01 1.26e-02 59.72 < 2e-16 ***
## USA$pctfemhh -1.06e-02 3.71e-04 -28.69 < 2e-16 ***
## USA$homevalu -1.46e-06 8.68e-08 -16.76 < 2e-16 ***
## USA$PEROVER65 -2.77e-03 5.24e-04 -5.29 1.3e-07 ***
## USA$pcincome 6.00e-06 5.80e-07 10.35 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.107 on 3103 degrees of freedom
## Multiple R-squared: 0.283, Adjusted R-squared: 0.282
## F-statistic: 306 on 4 and 3103 DF, p-value: <2e-16
lm5.resid <- resid(lm5)
plot(lm5.resid ~ USA$BushPct)
## residuals look bad for all regression models predictive capability is
## best for lm4, so will use that model