Creating maps and plotting spatial data is a very useful visualization method. This exercise overviews the basics of creating maps in R using the ColorBrewer library.
Load the dataset
# Load required libraries install.packages('sp')#Do this once per computer
library(sp)
library(maptools)
## Loading required package: foreign
## Loading required package: grid
## Loading required package: lattice
## Checking rgeos availability: TRUE
library(classInt)
## Loading required package: class
## Loading required package: e1071
library(RColorBrewer)
# Read the shapefile and do some data cleaning...
USA <- readShapePoly("E:/Quant/inClassExercises/Maps and Multiple Regression/USA.shp")
plot(USA)
USA <- USA[USA$Total > 0, ] #This rewrites the object, removing counties with no votes submitted
slotNames(USA)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
summary(USA)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -124.73 -66.97
## y 24.96 49.37
## Is projected: NA
## proj4string : [NA]
## Data attributes:
## SP_ID NAME STATE_NAME STATE_FIPS
## 0 : 1 Washington: 32 Texas : 254 48 : 254
## 1 : 1 Jefferson : 26 Georgia : 159 13 : 159
## 10 : 1 Franklin : 25 Virginia: 134 51 : 134
## 100 : 1 Jackson : 24 Kentucky: 120 21 : 120
## 1000 : 1 Lincoln : 24 Missouri: 115 29 : 115
## 1001 : 1 Madison : 20 Kansas : 105 20 : 105
## (Other):3102 (Other) :2957 (Other) :2221 (Other):2221
## CNTY_FIPS FIPS AREA FIPS_num
## 001 : 48 01001 : 1 Min. : 2 Min. : 1001
## 003 : 48 01003 : 1 1st Qu.: 435 1st Qu.:19046
## 005 : 48 01005 : 1 Median : 622 Median :29214
## 009 : 47 01007 : 1 Mean : 966 Mean :30686
## 007 : 46 01009 : 1 3rd Qu.: 931 3rd Qu.:46010
## 011 : 46 01011 : 1 Max. :20175 Max. :56045
## (Other):2825 (Other):3102
## Bush Kerry County_F Nader
## Min. : 65 Min. : 12 Min. : 1001 Min. : 0
## 1st Qu.: 2941 1st Qu.: 1782 1st Qu.:19046 1st Qu.: 0
## Median : 6364 Median : 4041 Median :29214 Median : 14
## Mean : 19073 Mean : 17957 Mean :30686 Mean : 145
## 3rd Qu.: 15924 3rd Qu.: 10434 3rd Qu.:46010 3rd Qu.: 67
## Max. :954764 Max. :1670341 Max. :56045 Max. :13251
##
## Total Bush_pct Kerry_pct Nader_pct
## Min. : 77 Min. : 9.31 Min. : 7.17 Min. :0.000
## 1st Qu.: 4831 1st Qu.:52.73 1st Qu.:30.23 1st Qu.:0.000
## Median : 10416 Median :61.17 Median :38.49 Median :0.303
## Mean : 37176 Mean :60.66 Mean :38.94 Mean :0.401
## 3rd Qu.: 26599 3rd Qu.:69.37 3rd Qu.:46.79 3rd Qu.:0.633
## Max. :2625105 Max. :92.83 Max. :90.05 Max. :4.467
##
## MDratio pcturban pctfemhh pcincome
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 37.3 1st Qu.: 0.0 1st Qu.: 9.6 1st Qu.:15474
## Median : 65.6 Median : 33.5 Median :12.2 Median :17450
## Mean : 93.1 Mean : 35.3 Mean :13.0 Mean :17805
## 3rd Qu.: 117.6 3rd Qu.: 56.5 3rd Qu.:15.4 3rd Qu.:19818
## Max. :2189.5 Max. :100.0 Max. :41.1 Max. :58096
##
## pctpoor pctcoled unemploy homevalu
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.:11.1 1st Qu.: 9.0 1st Qu.: 3.90 1st Qu.: 35900
## Median :15.1 Median :11.6 Median : 5.30 Median : 44400
## Mean :16.5 Mean :13.1 Mean : 5.88 Mean : 52066
## 3rd Qu.:20.4 3rd Qu.:15.3 3rd Qu.: 7.20 3rd Qu.: 58600
## Max. :63.1 Max. :53.4 Max. :37.90 Max. :500001
##
## popdens Obese Noins HISP_LAT
## Min. : 0 Min. :0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 15 1st Qu.:0.320 1st Qu.:0.100 1st Qu.: 0.90
## Median : 39 Median :0.340 Median :0.120 Median : 1.80
## Mean : 194 Mean :0.335 Mean :0.129 Mean : 6.19
## 3rd Qu.: 93 3rd Qu.:0.360 3rd Qu.:0.150 3rd Qu.: 5.10
## Max. :53801 Max. :0.630 Max. :0.410 Max. :97.50
##
## MEDAGE2000 PEROVER65
## Min. : 0.0 Min. : 0.0
## 1st Qu.:35.2 1st Qu.:12.1
## Median :37.4 Median :14.4
## Mean :37.4 Mean :14.8
## 3rd Qu.:39.8 3rd Qu.:17.1
## Max. :54.3 Max. :34.7
##
# summary(USA@data)#same result as summary(USA)
Start examining the colorbrewer library:
# Create color palates
display.brewer.all() #This brings up all of the color palettes in the library
Use color brewer library to help select a color scheme and apply it to a map
pal7 <- brewer.pal(7, "Reds")
display.brewer.pal(7, "Reds")
# Start looking at votes
USA$BushPct <- USA$Bush/USA$Total
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile") #breaks the BushPct field into 7 classes based on the quantile classing scheme.
cats7 #This gives the quantile ranges for the data that we just created
## style: quantile
## [0.09308,0.4746) [0.4746,0.5415) [0.5415,0.5907) [0.5907,0.6336)
## 444 444 444 444
## [0.6336,0.6801) [0.6801,0.7421) [0.7421,0.9283]
## 444 444 444
SevenColors <- findColours(cats7, pal7) #This matches the colors we chose from color brewer and applies it to the classes we just made.
# Map the data using the scheme we just defined
plot(USA, col = SevenColors)
Make another map to show deviances from the mean
USA$BushPctZ <- (USA$BushPct - mean(USA$BushPct, na.rm = T))/sd(USA$BushPct,
na.rm = T)
newPal7 <- brewer.pal(7, "RdYlBu")
cats7_2 <- classIntervals(USA$BushPctZ, n = 7, style = "quantile")
SevenColors2 <- findColours(cats7_2, newPal7)
plot(USA, col = SevenColors2)
# In later exercises, we will build on this to include titles and legends.
Create a linear model to predict the election outcome.
lm1 <- lm(BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT + MEDAGE2000, USA)
summary(lm1)
##
## Call:
## lm(formula = BushPct ~ pcturban + pctfemhh + pctpoor + HISP_LAT +
## MEDAGE2000, data = USA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5687 -0.0733 0.0102 0.0808 0.2708
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.82e-01 2.43e-02 32.23 <2e-16 ***
## pcturban -1.32e-05 8.88e-05 -0.15 0.882
## pctfemhh -1.35e-02 5.35e-04 -25.19 <2e-16 ***
## pctpoor 2.97e-03 3.55e-04 8.36 <2e-16 ***
## HISP_LAT -2.91e-04 1.87e-04 -1.56 0.119
## MEDAGE2000 -1.25e-03 5.65e-04 -2.21 0.027 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.11 on 3102 degrees of freedom
## Multiple R-squared: 0.242, Adjusted R-squared: 0.241
## F-statistic: 198 on 5 and 3102 DF, p-value: <2e-16
lm1.resid <- resid(lm1)
plot(lm1.resid ~ USA$BushPct) #This is horrible. There is a clear trend in the residuals...
lmBush <- lm(BushPct ~ pctfemhh + homevalu, data = USA)
summary(lmBush)
##
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu, data = USA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6082 -0.0731 0.0089 0.0779 0.3558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.77e-01 5.60e-03 138.9 <2e-16 ***
## pctfemhh -1.01e-02 3.61e-04 -27.9 <2e-16 ***
## homevalu -7.60e-07 6.01e-08 -12.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.109 on 3105 degrees of freedom
## Multiple R-squared: 0.256, Adjusted R-squared: 0.255
## F-statistic: 533 on 2 and 3105 DF, p-value: <2e-16
lmBush.resid <- resid(lmBush)
plot(lmBush.resid ~ USA$BushPct) #Residuals still look very bad; not randomly and normally distributed around 0
# Explore the data through mapping the residuals. Is there a spatial
# trend that we aren't taking into account?
USA$resid <- resid(lmBush) #Add a new field to the shapefile so we can map the regression residuals
pal3 <- brewer.pal(3, "RdYlBu")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")
ThreeColors <- findColours(cats3, pal3)
plot(USA, col = ThreeColors)