Multiple Regression & Maps

# In Class Exercise -- Maps & Multiple Regression

## INSTALL LIBRARIES FOR SPATIAL DATA
install.packages("sp")
## Installing package(s) into 'C:/Program Files/RStudio/R/library' (as 'lib'
## is unspecified)
## Warning: 'lib = "C:/Program Files/RStudio/R/library"' is not writable
## Error: unable to install packages
library(sp)
library(maptools)
## Loading required package: foreign
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)

# READ DATA
USA <- readShapePoly("C:/Users/Hallie/Desktop/Spring 2013 Courses/Quantitative Methods/Data/USA copy.shp")

# Plot Data IT'S A MAP
plot(USA)

plot of chunk unnamed-chunk-1


# Remove counties with no votes
USA <- USA[USA$Total > 0, ]

# List the pieces of the file
slotNames(USA)
## [1] "data"        "polygons"    "plotOrder"   "bbox"        "proj4string"
# summarize the file
summary(USA)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
##       min    max
## x -124.73 -66.97
## y   24.96  49.37
## Is projected: NA 
## proj4string : [NA]
## Data attributes:
##      SP_ID              NAME         STATE_NAME     STATE_FIPS  
##  0      :   1   Washington:  32   Texas   : 254   48     : 254  
##  1      :   1   Jefferson :  26   Georgia : 159   13     : 159  
##  10     :   1   Franklin  :  25   Virginia: 134   51     : 134  
##  100    :   1   Jackson   :  24   Kentucky: 120   21     : 120  
##  1000   :   1   Lincoln   :  24   Missouri: 115   29     : 115  
##  1001   :   1   Madison   :  20   Kansas  : 105   20     : 105  
##  (Other):3102   (Other)   :2957   (Other) :2221   (Other):2221  
##    CNTY_FIPS         FIPS           AREA          FIPS_num    
##  001    :  48   01001  :   1   Min.   :    2   Min.   : 1001  
##  003    :  48   01003  :   1   1st Qu.:  435   1st Qu.:19046  
##  005    :  48   01005  :   1   Median :  622   Median :29214  
##  009    :  47   01007  :   1   Mean   :  966   Mean   :30686  
##  007    :  46   01009  :   1   3rd Qu.:  931   3rd Qu.:46010  
##  011    :  46   01011  :   1   Max.   :20175   Max.   :56045  
##  (Other):2825   (Other):3102                                  
##       Bush            Kerry            County_F         Nader      
##  Min.   :    65   Min.   :     12   Min.   : 1001   Min.   :    0  
##  1st Qu.:  2941   1st Qu.:   1782   1st Qu.:19046   1st Qu.:    0  
##  Median :  6364   Median :   4041   Median :29214   Median :   14  
##  Mean   : 19073   Mean   :  17957   Mean   :30686   Mean   :  145  
##  3rd Qu.: 15924   3rd Qu.:  10434   3rd Qu.:46010   3rd Qu.:   67  
##  Max.   :954764   Max.   :1670341   Max.   :56045   Max.   :13251  
##                                                                    
##      Total            Bush_pct       Kerry_pct       Nader_pct    
##  Min.   :     77   Min.   : 9.31   Min.   : 7.17   Min.   :0.000  
##  1st Qu.:   4831   1st Qu.:52.73   1st Qu.:30.23   1st Qu.:0.000  
##  Median :  10416   Median :61.17   Median :38.49   Median :0.303  
##  Mean   :  37176   Mean   :60.66   Mean   :38.94   Mean   :0.401  
##  3rd Qu.:  26599   3rd Qu.:69.37   3rd Qu.:46.79   3rd Qu.:0.633  
##  Max.   :2625105   Max.   :92.83   Max.   :90.05   Max.   :4.467  
##                                                                   
##     MDratio          pcturban        pctfemhh       pcincome    
##  Min.   :   0.0   Min.   :  0.0   Min.   : 0.0   Min.   :    0  
##  1st Qu.:  37.3   1st Qu.:  0.0   1st Qu.: 9.6   1st Qu.:15474  
##  Median :  65.6   Median : 33.5   Median :12.2   Median :17450  
##  Mean   :  93.1   Mean   : 35.3   Mean   :13.0   Mean   :17805  
##  3rd Qu.: 117.6   3rd Qu.: 56.5   3rd Qu.:15.4   3rd Qu.:19818  
##  Max.   :2189.5   Max.   :100.0   Max.   :41.1   Max.   :58096  
##                                                                 
##     pctpoor        pctcoled       unemploy        homevalu     
##  Min.   : 0.0   Min.   : 0.0   Min.   : 0.00   Min.   :     0  
##  1st Qu.:11.1   1st Qu.: 9.0   1st Qu.: 3.90   1st Qu.: 35900  
##  Median :15.1   Median :11.6   Median : 5.30   Median : 44400  
##  Mean   :16.5   Mean   :13.1   Mean   : 5.88   Mean   : 52066  
##  3rd Qu.:20.4   3rd Qu.:15.3   3rd Qu.: 7.20   3rd Qu.: 58600  
##  Max.   :63.1   Max.   :53.4   Max.   :37.90   Max.   :500001  
##                                                                
##     popdens          Obese           Noins          HISP_LAT    
##  Min.   :    0   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:   15   1st Qu.:0.320   1st Qu.:0.100   1st Qu.: 0.90  
##  Median :   39   Median :0.340   Median :0.120   Median : 1.80  
##  Mean   :  194   Mean   :0.335   Mean   :0.129   Mean   : 6.19  
##  3rd Qu.:   93   3rd Qu.:0.360   3rd Qu.:0.150   3rd Qu.: 5.10  
##  Max.   :53801   Max.   :0.630   Max.   :0.410   Max.   :97.50  
##                                                                 
##    MEDAGE2000     PEROVER65   
##  Min.   : 0.0   Min.   : 0.0  
##  1st Qu.:35.2   1st Qu.:12.1  
##  Median :37.4   Median :14.4  
##  Mean   :37.4   Mean   :14.8  
##  3rd Qu.:39.8   3rd Qu.:17.1  
##  Max.   :54.3   Max.   :34.7  
## 
# summarize the data
summary(USA@data)
##      SP_ID              NAME         STATE_NAME     STATE_FIPS  
##  0      :   1   Washington:  32   Texas   : 254   48     : 254  
##  1      :   1   Jefferson :  26   Georgia : 159   13     : 159  
##  10     :   1   Franklin  :  25   Virginia: 134   51     : 134  
##  100    :   1   Jackson   :  24   Kentucky: 120   21     : 120  
##  1000   :   1   Lincoln   :  24   Missouri: 115   29     : 115  
##  1001   :   1   Madison   :  20   Kansas  : 105   20     : 105  
##  (Other):3102   (Other)   :2957   (Other) :2221   (Other):2221  
##    CNTY_FIPS         FIPS           AREA          FIPS_num    
##  001    :  48   01001  :   1   Min.   :    2   Min.   : 1001  
##  003    :  48   01003  :   1   1st Qu.:  435   1st Qu.:19046  
##  005    :  48   01005  :   1   Median :  622   Median :29214  
##  009    :  47   01007  :   1   Mean   :  966   Mean   :30686  
##  007    :  46   01009  :   1   3rd Qu.:  931   3rd Qu.:46010  
##  011    :  46   01011  :   1   Max.   :20175   Max.   :56045  
##  (Other):2825   (Other):3102                                  
##       Bush            Kerry            County_F         Nader      
##  Min.   :    65   Min.   :     12   Min.   : 1001   Min.   :    0  
##  1st Qu.:  2941   1st Qu.:   1782   1st Qu.:19046   1st Qu.:    0  
##  Median :  6364   Median :   4041   Median :29214   Median :   14  
##  Mean   : 19073   Mean   :  17957   Mean   :30686   Mean   :  145  
##  3rd Qu.: 15924   3rd Qu.:  10434   3rd Qu.:46010   3rd Qu.:   67  
##  Max.   :954764   Max.   :1670341   Max.   :56045   Max.   :13251  
##                                                                    
##      Total            Bush_pct       Kerry_pct       Nader_pct    
##  Min.   :     77   Min.   : 9.31   Min.   : 7.17   Min.   :0.000  
##  1st Qu.:   4831   1st Qu.:52.73   1st Qu.:30.23   1st Qu.:0.000  
##  Median :  10416   Median :61.17   Median :38.49   Median :0.303  
##  Mean   :  37176   Mean   :60.66   Mean   :38.94   Mean   :0.401  
##  3rd Qu.:  26599   3rd Qu.:69.37   3rd Qu.:46.79   3rd Qu.:0.633  
##  Max.   :2625105   Max.   :92.83   Max.   :90.05   Max.   :4.467  
##                                                                   
##     MDratio          pcturban        pctfemhh       pcincome    
##  Min.   :   0.0   Min.   :  0.0   Min.   : 0.0   Min.   :    0  
##  1st Qu.:  37.3   1st Qu.:  0.0   1st Qu.: 9.6   1st Qu.:15474  
##  Median :  65.6   Median : 33.5   Median :12.2   Median :17450  
##  Mean   :  93.1   Mean   : 35.3   Mean   :13.0   Mean   :17805  
##  3rd Qu.: 117.6   3rd Qu.: 56.5   3rd Qu.:15.4   3rd Qu.:19818  
##  Max.   :2189.5   Max.   :100.0   Max.   :41.1   Max.   :58096  
##                                                                 
##     pctpoor        pctcoled       unemploy        homevalu     
##  Min.   : 0.0   Min.   : 0.0   Min.   : 0.00   Min.   :     0  
##  1st Qu.:11.1   1st Qu.: 9.0   1st Qu.: 3.90   1st Qu.: 35900  
##  Median :15.1   Median :11.6   Median : 5.30   Median : 44400  
##  Mean   :16.5   Mean   :13.1   Mean   : 5.88   Mean   : 52066  
##  3rd Qu.:20.4   3rd Qu.:15.3   3rd Qu.: 7.20   3rd Qu.: 58600  
##  Max.   :63.1   Max.   :53.4   Max.   :37.90   Max.   :500001  
##                                                                
##     popdens          Obese           Noins          HISP_LAT    
##  Min.   :    0   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:   15   1st Qu.:0.320   1st Qu.:0.100   1st Qu.: 0.90  
##  Median :   39   Median :0.340   Median :0.120   Median : 1.80  
##  Mean   :  194   Mean   :0.335   Mean   :0.129   Mean   : 6.19  
##  3rd Qu.:   93   3rd Qu.:0.360   3rd Qu.:0.150   3rd Qu.: 5.10  
##  Max.   :53801   Max.   :0.630   Max.   :0.410   Max.   :97.50  
##                                                                 
##    MEDAGE2000     PEROVER65   
##  Min.   : 0.0   Min.   : 0.0  
##  1st Qu.:35.2   1st Qu.:12.1  
##  Median :37.4   Median :14.4  
##  Mean   :37.4   Mean   :14.8  
##  3rd Qu.:39.8   3rd Qu.:17.1  
##  Max.   :54.3   Max.   :34.7  
## 

#################################################

##Making Maps in R ##uses package “lattice” and function spplot() ##uses “classInt” package

display.brewer.all()

plot of chunk unnamed-chunk-2


# let's make a 7 color 'spectral' palette
pal7 <- brewer.pal(7, "Spectral")
# to see the colors
display.brewer.pal(7, "Spectral")

plot of chunk unnamed-chunk-2


# create a column that holds the percent of all votes that went to G.W.
# Bush in 2004
USA$BushPct <- USA$Bush/USA$Total
# create categories
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")
cats7
## style: quantile
## [0.09308,0.4746)  [0.4746,0.5415)  [0.5415,0.5907)  [0.5907,0.6336) 
##              444              444              444              444 
##  [0.6336,0.6801)  [0.6801,0.7421)  [0.7421,0.9283] 
##              444              444              444
# output shows the range for BushPct within each category each group
# should have about 440 counties
SevenColors <- findColours(cats7, pal7)
# draw map using specified data and colors
plot(USA, col = SevenColors)

plot of chunk unnamed-chunk-2


# Create a new column to hold the standardized percent bush
USA$BushPctZ <- (USA$BushPct - mean(USA$BushPct, na.rm = TRUE))/sd(USA$BushPct, 
    na.rm = TRUE)
# Create new categories with the standardized column and map
pal7 <- brewer.pal(7, "Spectral")
cats7 <- classIntervals(USA$BushPctZ, n = 7, style = "quantile")

SevenColors <- findColours(cats7, pal7)
plot(USA, col = SevenColors)

#################################

Multiple Regression

lm1 <- lm(BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT + MEDAGE2000, USA)
summary(lm1)
## 
## Call:
## lm(formula = BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT + 
##     MEDAGE2000, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5687 -0.0733  0.0102  0.0808  0.2708 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.82e-01   2.43e-02   32.23   <2e-16 ***
## pcturban    -1.32e-05   8.88e-05   -0.15    0.882    
## pctfemhh    -1.35e-02   5.35e-04  -25.19   <2e-16 ***
## pctpoor      2.97e-03   3.55e-04    8.36   <2e-16 ***
## HISP_LAT    -2.91e-04   1.87e-04   -1.56    0.119    
## MEDAGE2000  -1.25e-03   5.65e-04   -2.21    0.027 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.11 on 3102 degrees of freedom
## Multiple R-squared: 0.242,   Adjusted R-squared: 0.241 
## F-statistic:  198 on 5 and 3102 DF,  p-value: <2e-16

# getting the residuals of the model lm1
lm1.resid <- resid(lm1)
plot(lm1.resid ~ USA$BushPct)

plot of chunk unnamed-chunk-3

# these residuals look REALLY bad...let's try again

lmBush <- lm(BushPct ~ pctfemhh + homevalu, data = USA)
summary(lmBush)
## 
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6082 -0.0731  0.0089  0.0779  0.3558 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.77e-01   5.60e-03   138.9   <2e-16 ***
## pctfemhh    -1.01e-02   3.61e-04   -27.9   <2e-16 ***
## homevalu    -7.60e-07   6.01e-08   -12.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.109 on 3105 degrees of freedom
## Multiple R-squared: 0.256,   Adjusted R-squared: 0.255 
## F-statistic:  533 on 2 and 3105 DF,  p-value: <2e-16

lmBush.resid <- resid(lmBush)
plot(lmBush.resid ~ USA$BushPct)

plot of chunk unnamed-chunk-3

# these residuals look bad STILL ... why? Are we missing a predictive
# variable?

## add a new column to USA to hold the residuals

USA$resid <- resid(lmBush)

pal3 <- brewer.pal(3, "Spectral")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")

ThreeColors <- findColours(cats3, pal3)

plot(USA, col = ThreeColors)

plot of chunk unnamed-chunk-3


## add a new column to USA to hold the residuals

USA$resid <- resid(lmBush)

pal3 <- brewer.pal(3, "Spectral")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")

ThreeColors <- findColours(cats3, pal3)

plot(USA, col = ThreeColors)
# definitely a spatial trend in this map -- indicating there is something
# wrong with the model -- missing predictive variable?

names(USA)
##  [1] "SP_ID"      "NAME"       "STATE_NAME" "STATE_FIPS" "CNTY_FIPS" 
##  [6] "FIPS"       "AREA"       "FIPS_num"   "Bush"       "Kerry"     
## [11] "County_F"   "Nader"      "Total"      "Bush_pct"   "Kerry_pct" 
## [16] "Nader_pct"  "MDratio"    "pcturban"   "pctfemhh"   "pcincome"  
## [21] "pctpoor"    "pctcoled"   "unemploy"   "homevalu"   "popdens"   
## [26] "Obese"      "Noins"      "HISP_LAT"   "MEDAGE2000" "PEROVER65" 
## [31] "BushPct"    "BushPctZ"   "resid"

lm3 <- lm(USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$MEDAGE2000 + USA$PEROVER65 + 
    USA$pcincome + USA$HIST_LAT + USA$unemploy)
## Error: invalid type (NULL) for variable 'USA$HIST_LAT'
summary(lm3)
## Error: error in evaluating the argument 'object' in selecting a method for
## function 'summary': Error: object 'lm3' not found

lm4 <- lm(USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 + USA$pcincome + 
    USA$unemploy)
summary(lm4)
## 
## Call:
## lm(formula = USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 + 
##     USA$pcincome + USA$unemploy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5601 -0.0731  0.0080  0.0775  0.4037 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.63e-01   1.30e-02   58.84  < 2e-16 ***
## USA$pctfemhh  -1.01e-02   4.18e-04  -24.24  < 2e-16 ***
## USA$homevalu  -1.42e-06   8.79e-08  -16.10  < 2e-16 ***
## USA$PEROVER65 -2.58e-03   5.28e-04   -4.88  1.1e-06 ***
## USA$pcincome   5.53e-06   6.04e-07    9.15  < 2e-16 ***
## USA$unemploy  -1.95e-03   7.11e-04   -2.74   0.0061 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.107 on 3102 degrees of freedom
## Multiple R-squared: 0.285,   Adjusted R-squared: 0.284 
## F-statistic:  247 on 5 and 3102 DF,  p-value: <2e-16

lm4.resid <- resid(lm4)
plot(lm4.resid ~ USA$BushPct)

plot of chunk unnamed-chunk-3


lm5 <- lm(USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 + USA$pcincome)
summary(lm5)
## 
## Call:
## lm(formula = USA$BushPct ~ USA$pctfemhh + USA$homevalu + USA$PEROVER65 + 
##     USA$pcincome)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5494 -0.0738  0.0078  0.0772  0.4077 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.55e-01   1.26e-02   59.72  < 2e-16 ***
## USA$pctfemhh  -1.06e-02   3.71e-04  -28.69  < 2e-16 ***
## USA$homevalu  -1.46e-06   8.68e-08  -16.76  < 2e-16 ***
## USA$PEROVER65 -2.77e-03   5.24e-04   -5.29  1.3e-07 ***
## USA$pcincome   6.00e-06   5.80e-07   10.35  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.107 on 3103 degrees of freedom
## Multiple R-squared: 0.283,   Adjusted R-squared: 0.282 
## F-statistic:  306 on 4 and 3103 DF,  p-value: <2e-16

lm5.resid <- resid(lm5)
plot(lm5.resid ~ USA$BushPct)

plot of chunk unnamed-chunk-3


## residuals look bad for all regression models predictive capability is
## best for lm4, so will use that model