Maps and Multiple Regression

Understaind spatial trends in regression

Creating maps and plotting spatial data is a very useful visualization method. This exercise overviews the basics of creating maps in R using the ColorBrewer library.

Load the dataset

# Load required libraries install.packages('sp')#Do this once per computer
library(sp)
library(maptools)
## Loading required package: foreign
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: TRUE
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)

# Read the shapefile and do some data cleaning...
USA <- readShapePoly("E:/Quant/inClassExercises/Maps and Multiple Regression/USA.shp")
plot(USA)

plot of chunk unnamed-chunk-1

USA <- USA[USA$Total > 0, ]  #This rewrites the object, removing counties with no votes submitted

slotNames(USA)
## [1] "data"        "polygons"    "plotOrder"   "bbox"        "proj4string"
summary(USA)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
##       min    max
## x -124.73 -66.97
## y   24.96  49.37
## Is projected: NA 
## proj4string : [NA]
## Data attributes:
##      SP_ID              NAME         STATE_NAME     STATE_FIPS  
##  0      :   1   Washington:  32   Texas   : 254   48     : 254  
##  1      :   1   Jefferson :  26   Georgia : 159   13     : 159  
##  10     :   1   Franklin  :  25   Virginia: 134   51     : 134  
##  100    :   1   Jackson   :  24   Kentucky: 120   21     : 120  
##  1000   :   1   Lincoln   :  24   Missouri: 115   29     : 115  
##  1001   :   1   Madison   :  20   Kansas  : 105   20     : 105  
##  (Other):3102   (Other)   :2957   (Other) :2221   (Other):2221  
##    CNTY_FIPS         FIPS           AREA          FIPS_num    
##  001    :  48   01001  :   1   Min.   :    2   Min.   : 1001  
##  003    :  48   01003  :   1   1st Qu.:  435   1st Qu.:19046  
##  005    :  48   01005  :   1   Median :  622   Median :29214  
##  009    :  47   01007  :   1   Mean   :  966   Mean   :30686  
##  007    :  46   01009  :   1   3rd Qu.:  931   3rd Qu.:46010  
##  011    :  46   01011  :   1   Max.   :20175   Max.   :56045  
##  (Other):2825   (Other):3102                                  
##       Bush            Kerry            County_F         Nader      
##  Min.   :    65   Min.   :     12   Min.   : 1001   Min.   :    0  
##  1st Qu.:  2941   1st Qu.:   1782   1st Qu.:19046   1st Qu.:    0  
##  Median :  6364   Median :   4041   Median :29214   Median :   14  
##  Mean   : 19073   Mean   :  17957   Mean   :30686   Mean   :  145  
##  3rd Qu.: 15924   3rd Qu.:  10434   3rd Qu.:46010   3rd Qu.:   67  
##  Max.   :954764   Max.   :1670341   Max.   :56045   Max.   :13251  
##                                                                    
##      Total            Bush_pct       Kerry_pct       Nader_pct    
##  Min.   :     77   Min.   : 9.31   Min.   : 7.17   Min.   :0.000  
##  1st Qu.:   4831   1st Qu.:52.73   1st Qu.:30.23   1st Qu.:0.000  
##  Median :  10416   Median :61.17   Median :38.49   Median :0.303  
##  Mean   :  37176   Mean   :60.66   Mean   :38.94   Mean   :0.401  
##  3rd Qu.:  26599   3rd Qu.:69.37   3rd Qu.:46.79   3rd Qu.:0.633  
##  Max.   :2625105   Max.   :92.83   Max.   :90.05   Max.   :4.467  
##                                                                   
##     MDratio          pcturban        pctfemhh       pcincome    
##  Min.   :   0.0   Min.   :  0.0   Min.   : 0.0   Min.   :    0  
##  1st Qu.:  37.3   1st Qu.:  0.0   1st Qu.: 9.6   1st Qu.:15474  
##  Median :  65.6   Median : 33.5   Median :12.2   Median :17450  
##  Mean   :  93.1   Mean   : 35.3   Mean   :13.0   Mean   :17805  
##  3rd Qu.: 117.6   3rd Qu.: 56.5   3rd Qu.:15.4   3rd Qu.:19818  
##  Max.   :2189.5   Max.   :100.0   Max.   :41.1   Max.   :58096  
##                                                                 
##     pctpoor        pctcoled       unemploy        homevalu     
##  Min.   : 0.0   Min.   : 0.0   Min.   : 0.00   Min.   :     0  
##  1st Qu.:11.1   1st Qu.: 9.0   1st Qu.: 3.90   1st Qu.: 35900  
##  Median :15.1   Median :11.6   Median : 5.30   Median : 44400  
##  Mean   :16.5   Mean   :13.1   Mean   : 5.88   Mean   : 52066  
##  3rd Qu.:20.4   3rd Qu.:15.3   3rd Qu.: 7.20   3rd Qu.: 58600  
##  Max.   :63.1   Max.   :53.4   Max.   :37.90   Max.   :500001  
##                                                                
##     popdens          Obese           Noins          HISP_LAT    
##  Min.   :    0   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:   15   1st Qu.:0.320   1st Qu.:0.100   1st Qu.: 0.90  
##  Median :   39   Median :0.340   Median :0.120   Median : 1.80  
##  Mean   :  194   Mean   :0.335   Mean   :0.129   Mean   : 6.19  
##  3rd Qu.:   93   3rd Qu.:0.360   3rd Qu.:0.150   3rd Qu.: 5.10  
##  Max.   :53801   Max.   :0.630   Max.   :0.410   Max.   :97.50  
##                                                                 
##    MEDAGE2000     PEROVER65   
##  Min.   : 0.0   Min.   : 0.0  
##  1st Qu.:35.2   1st Qu.:12.1  
##  Median :37.4   Median :14.4  
##  Mean   :37.4   Mean   :14.8  
##  3rd Qu.:39.8   3rd Qu.:17.1  
##  Max.   :54.3   Max.   :34.7  
## 
# summary(USA@data)#same result as summary(USA)

Start examining the colorbrewer library:

# Create color palates
display.brewer.all()  #This brings up all of the color palettes in the library

plot of chunk unnamed-chunk-2

Use color brewer library to help select a color scheme and apply it to a map

pal7 <- brewer.pal(7, "Reds")
display.brewer.pal(7, "Reds")

plot of chunk unnamed-chunk-3



# Start looking at votes
USA$BushPct <- USA$Bush/USA$Total
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")  #breaks the BushPct field into 7 classes based on the quantile classing scheme.
cats7  #This gives the quantile ranges for the data that we just created
## style: quantile
## [0.09308,0.4746)  [0.4746,0.5415)  [0.5415,0.5907)  [0.5907,0.6336) 
##              444              444              444              444 
##  [0.6336,0.6801)  [0.6801,0.7421)  [0.7421,0.9283] 
##              444              444              444
SevenColors <- findColours(cats7, pal7)  #This matches the colors we chose from color brewer and applies it to the classes we just made.

# Map the data using the scheme we just defined
plot(USA, col = SevenColors)

plot of chunk unnamed-chunk-3

Make another map to show deviances from the mean

USA$BushPctZ <- (USA$BushPct - mean(USA$BushPct, na.rm = T))/sd(USA$BushPct, 
    na.rm = T)
newPal7 <- brewer.pal(7, "RdYlBu")
cats7_2 <- classIntervals(USA$BushPctZ, n = 7, style = "quantile")
SevenColors2 <- findColours(cats7_2, newPal7)
plot(USA, col = SevenColors2)

plot of chunk unnamed-chunk-4


# In later exercises, we will build on this to include titles and legends.

Create a linear model to predict the election outcome.

lm1 <- lm(BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT + MEDAGE2000, USA)
summary(lm1)
## 
## Call:
## lm(formula = BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT + 
##     MEDAGE2000, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5687 -0.0733  0.0102  0.0808  0.2708 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.82e-01   2.43e-02   32.23   <2e-16 ***
## pcturban    -1.32e-05   8.88e-05   -0.15    0.882    
## pctfemhh    -1.35e-02   5.35e-04  -25.19   <2e-16 ***
## pctpoor      2.97e-03   3.55e-04    8.36   <2e-16 ***
## HISP_LAT    -2.91e-04   1.87e-04   -1.56    0.119    
## MEDAGE2000  -1.25e-03   5.65e-04   -2.21    0.027 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.11 on 3102 degrees of freedom
## Multiple R-squared: 0.242,   Adjusted R-squared: 0.241 
## F-statistic:  198 on 5 and 3102 DF,  p-value: <2e-16

lm1.resid <- resid(lm1)
plot(lm1.resid ~ USA$BushPct)  #This is horrible.  There is a clear trend in the residuals...

plot of chunk unnamed-chunk-5




lmBush <- lm(BushPct ~ pctfemhh + homevalu, data = USA)
summary(lmBush)
## 
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6082 -0.0731  0.0089  0.0779  0.3558 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.77e-01   5.60e-03   138.9   <2e-16 ***
## pctfemhh    -1.01e-02   3.61e-04   -27.9   <2e-16 ***
## homevalu    -7.60e-07   6.01e-08   -12.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.109 on 3105 degrees of freedom
## Multiple R-squared: 0.256,   Adjusted R-squared: 0.255 
## F-statistic:  533 on 2 and 3105 DF,  p-value: <2e-16
lmBush.resid <- resid(lmBush)
plot(lmBush.resid ~ USA$BushPct)  #Residuals still look very bad; not randomly and normally distributed around 0

plot of chunk unnamed-chunk-5


# Explore the data through mapping the residuals.  Is there a spatial
# trend that we aren't taking into account?
USA$resid <- resid(lmBush)  #Add a new field to the shapefile so we can map the regression residuals
pal3 <- brewer.pal(3, "RdYlBu")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")
ThreeColors <- findColours(cats3, pal3)
plot(USA, col = ThreeColors)

plot of chunk unnamed-chunk-5