Exercise from Lecture 4 – Alex Crawford

GEOG 5023: Quantitative Methods In Geography

Load the Data and Plot

First need to install packages

# install.packages('sp')
library(sp)
library(maptools)

## Loading required package: foreign

## Loading required package: grid

## Loading required package: lattice

## Checking rgeos availability: FALSE Note: when rgeos is not available,
## polygon geometry computations in maptools depend on gpclib, which has a
## restricted licence. It is disabled by default; to enable gpclib, type
## gpclibPermit()

library(classInt)

## Loading required package: class

## Loading required package: e1071

library(RColorBrewer)

Reading a Poly Shapefile

# This is loaded at a spatial polygon data frame, which means I can
# display it as a map.
USA <- readShapePoly("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.shp")
plot(USA)

plot of chunk unnamed-chunk-2

Cleaning & Reading the Data

Remove counties with no votes using bracket notation.

USA <- USA[USA$Total > 0, ]

List the pieces of the shapefile using slotNames()

# This can be useful especially if there are a lot of fields for the data.
slotNames(USA)

## [1] "data"        "polygons"    "plotOrder"   "bbox"        "proj4string"

Summarizing data

summary(USA)

## Object of class SpatialPolygonsDataFrame
## Coordinates:
##       min    max
## x -124.73 -66.97
## y   24.96  49.37
## Is projected: NA 
## proj4string : [NA]
## Data attributes:
##      SP_ID              NAME         STATE_NAME     STATE_FIPS  
##  0      :   1   Washington:  32   Texas   : 254   48     : 254  
##  1      :   1   Jefferson :  26   Georgia : 159   13     : 159  
##  10     :   1   Franklin  :  25   Virginia: 134   51     : 134  
##  100    :   1   Jackson   :  24   Kentucky: 120   21     : 120  
##  1000   :   1   Lincoln   :  24   Missouri: 115   29     : 115  
##  1001   :   1   Madison   :  20   Kansas  : 105   20     : 105  
##  (Other):3102   (Other)   :2957   (Other) :2221   (Other):2221  
##    CNTY_FIPS         FIPS           AREA          FIPS_num    
##  001    :  48   01001  :   1   Min.   :    2   Min.   : 1001  
##  003    :  48   01003  :   1   1st Qu.:  435   1st Qu.:19046  
##  005    :  48   01005  :   1   Median :  622   Median :29214  
##  009    :  47   01007  :   1   Mean   :  966   Mean   :30686  
##  007    :  46   01009  :   1   3rd Qu.:  931   3rd Qu.:46010  
##  011    :  46   01011  :   1   Max.   :20175   Max.   :56045  
##  (Other):2825   (Other):3102                                  
##       Bush            Kerry            County_F         Nader      
##  Min.   :    65   Min.   :     12   Min.   : 1001   Min.   :    0  
##  1st Qu.:  2941   1st Qu.:   1782   1st Qu.:19046   1st Qu.:    0  
##  Median :  6364   Median :   4041   Median :29214   Median :   14  
##  Mean   : 19073   Mean   :  17957   Mean   :30686   Mean   :  145  
##  3rd Qu.: 15924   3rd Qu.:  10434   3rd Qu.:46010   3rd Qu.:   67  
##  Max.   :954764   Max.   :1670341   Max.   :56045   Max.   :13251  
##                                                                    
##      Total            Bush_pct       Kerry_pct       Nader_pct    
##  Min.   :     77   Min.   : 9.31   Min.   : 7.17   Min.   :0.000  
##  1st Qu.:   4831   1st Qu.:52.73   1st Qu.:30.23   1st Qu.:0.000  
##  Median :  10416   Median :61.17   Median :38.49   Median :0.303  
##  Mean   :  37176   Mean   :60.66   Mean   :38.94   Mean   :0.401  
##  3rd Qu.:  26599   3rd Qu.:69.37   3rd Qu.:46.79   3rd Qu.:0.633  
##  Max.   :2625105   Max.   :92.83   Max.   :90.05   Max.   :4.467  
##                                                                   
##     MDratio          pcturban        pctfemhh       pcincome    
##  Min.   :   0.0   Min.   :  0.0   Min.   : 0.0   Min.   :    0  
##  1st Qu.:  37.3   1st Qu.:  0.0   1st Qu.: 9.6   1st Qu.:15474  
##  Median :  65.6   Median : 33.5   Median :12.2   Median :17450  
##  Mean   :  93.1   Mean   : 35.3   Mean   :13.0   Mean   :17805  
##  3rd Qu.: 117.6   3rd Qu.: 56.5   3rd Qu.:15.4   3rd Qu.:19818  
##  Max.   :2189.5   Max.   :100.0   Max.   :41.1   Max.   :58096  
##                                                                 
##     pctpoor        pctcoled       unemploy        homevalu     
##  Min.   : 0.0   Min.   : 0.0   Min.   : 0.00   Min.   :     0  
##  1st Qu.:11.1   1st Qu.: 9.0   1st Qu.: 3.90   1st Qu.: 35900  
##  Median :15.1   Median :11.7   Median : 5.30   Median : 44400  
##  Mean   :16.5   Mean   :13.1   Mean   : 5.88   Mean   : 52066  
##  3rd Qu.:20.4   3rd Qu.:15.3   3rd Qu.: 7.20   3rd Qu.: 58600  
##  Max.   :63.1   Max.   :53.4   Max.   :37.90   Max.   :500001  
##                                                                
##     popdens          Obese           Noins          HISP_LAT    
##  Min.   :    0   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:   15   1st Qu.:0.320   1st Qu.:0.100   1st Qu.: 0.90  
##  Median :   39   Median :0.340   Median :0.120   Median : 1.80  
##  Mean   :  194   Mean   :0.335   Mean   :0.129   Mean   : 6.19  
##  3rd Qu.:   93   3rd Qu.:0.360   3rd Qu.:0.150   3rd Qu.: 5.10  
##  Max.   :53801   Max.   :0.630   Max.   :0.410   Max.   :97.50  
##                                                                 
##    MEDAGE2000     PEROVER65   
##  Min.   : 0.0   Min.   : 0.0  
##  1st Qu.:35.2   1st Qu.:12.1  
##  Median :37.4   Median :14.4  
##  Mean   :37.4   Mean   :14.8  
##  3rd Qu.:39.8   3rd Qu.:17.1  
##  Max.   :54.3   Max.   :34.7  
##

# In general, use the @data to see just data.
summary(USA@data)

##      SP_ID              NAME         STATE_NAME     STATE_FIPS  
##  0      :   1   Washington:  32   Texas   : 254   48     : 254  
##  1      :   1   Jefferson :  26   Georgia : 159   13     : 159  
##  10     :   1   Franklin  :  25   Virginia: 134   51     : 134  
##  100    :   1   Jackson   :  24   Kentucky: 120   21     : 120  
##  1000   :   1   Lincoln   :  24   Missouri: 115   29     : 115  
##  1001   :   1   Madison   :  20   Kansas  : 105   20     : 105  
##  (Other):3102   (Other)   :2957   (Other) :2221   (Other):2221  
##    CNTY_FIPS         FIPS           AREA          FIPS_num    
##  001    :  48   01001  :   1   Min.   :    2   Min.   : 1001  
##  003    :  48   01003  :   1   1st Qu.:  435   1st Qu.:19046  
##  005    :  48   01005  :   1   Median :  622   Median :29214  
##  009    :  47   01007  :   1   Mean   :  966   Mean   :30686  
##  007    :  46   01009  :   1   3rd Qu.:  931   3rd Qu.:46010  
##  011    :  46   01011  :   1   Max.   :20175   Max.   :56045  
##  (Other):2825   (Other):3102                                  
##       Bush            Kerry            County_F         Nader      
##  Min.   :    65   Min.   :     12   Min.   : 1001   Min.   :    0  
##  1st Qu.:  2941   1st Qu.:   1782   1st Qu.:19046   1st Qu.:    0  
##  Median :  6364   Median :   4041   Median :29214   Median :   14  
##  Mean   : 19073   Mean   :  17957   Mean   :30686   Mean   :  145  
##  3rd Qu.: 15924   3rd Qu.:  10434   3rd Qu.:46010   3rd Qu.:   67  
##  Max.   :954764   Max.   :1670341   Max.   :56045   Max.   :13251  
##                                                                    
##      Total            Bush_pct       Kerry_pct       Nader_pct    
##  Min.   :     77   Min.   : 9.31   Min.   : 7.17   Min.   :0.000  
##  1st Qu.:   4831   1st Qu.:52.73   1st Qu.:30.23   1st Qu.:0.000  
##  Median :  10416   Median :61.17   Median :38.49   Median :0.303  
##  Mean   :  37176   Mean   :60.66   Mean   :38.94   Mean   :0.401  
##  3rd Qu.:  26599   3rd Qu.:69.37   3rd Qu.:46.79   3rd Qu.:0.633  
##  Max.   :2625105   Max.   :92.83   Max.   :90.05   Max.   :4.467  
##                                                                   
##     MDratio          pcturban        pctfemhh       pcincome    
##  Min.   :   0.0   Min.   :  0.0   Min.   : 0.0   Min.   :    0  
##  1st Qu.:  37.3   1st Qu.:  0.0   1st Qu.: 9.6   1st Qu.:15474  
##  Median :  65.6   Median : 33.5   Median :12.2   Median :17450  
##  Mean   :  93.1   Mean   : 35.3   Mean   :13.0   Mean   :17805  
##  3rd Qu.: 117.6   3rd Qu.: 56.5   3rd Qu.:15.4   3rd Qu.:19818  
##  Max.   :2189.5   Max.   :100.0   Max.   :41.1   Max.   :58096  
##                                                                 
##     pctpoor        pctcoled       unemploy        homevalu     
##  Min.   : 0.0   Min.   : 0.0   Min.   : 0.00   Min.   :     0  
##  1st Qu.:11.1   1st Qu.: 9.0   1st Qu.: 3.90   1st Qu.: 35900  
##  Median :15.1   Median :11.7   Median : 5.30   Median : 44400  
##  Mean   :16.5   Mean   :13.1   Mean   : 5.88   Mean   : 52066  
##  3rd Qu.:20.4   3rd Qu.:15.3   3rd Qu.: 7.20   3rd Qu.: 58600  
##  Max.   :63.1   Max.   :53.4   Max.   :37.90   Max.   :500001  
##                                                                
##     popdens          Obese           Noins          HISP_LAT    
##  Min.   :    0   Min.   :0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:   15   1st Qu.:0.320   1st Qu.:0.100   1st Qu.: 0.90  
##  Median :   39   Median :0.340   Median :0.120   Median : 1.80  
##  Mean   :  194   Mean   :0.335   Mean   :0.129   Mean   : 6.19  
##  3rd Qu.:   93   3rd Qu.:0.360   3rd Qu.:0.150   3rd Qu.: 5.10  
##  Max.   :53801   Max.   :0.630   Max.   :0.410   Max.   :97.50  
##                                                                 
##    MEDAGE2000     PEROVER65   
##  Min.   : 0.0   Min.   : 0.0  
##  1st Qu.:35.2   1st Qu.:12.1  
##  Median :37.4   Median :14.4  
##  Mean   :37.4   Mean   :14.8  
##  3rd Qu.:39.8   3rd Qu.:17.1  
##  Max.   :54.3   Max.   :34.7  
##

# The only difference for the summary function is that if I use @data, it
# adds the coordinates and projection.

Making Maps!

Color Brewer Palettes

# Color Brewer links with the shapefile to make thematic maps.
display.brewer.all()  # Displays all the colors available

plot of chunk unnamed-chunk-6

# Use brewer.pal(#,'Type') to create a palette
pal7 <- brewer.pal(7, "Spectral")  # Makes a 7-color spectral palette
display.brewer.pal(7, "Spectral")  # This displays the colors

plot of chunk unnamed-chunk-6

pal7  # Returns esoteric list of color codes.  Not helpful

## [1] "#D53E4F" "#FC8D59" "#FEE08B" "#FFFFBF" "#E6F598" "#99D594" "#3288BD"

Classifying Data using classIntervals() and findColours()

# SS: Create a column that holds the percent of all votes that went to
# G.W. Bush in 2004.
USA$BushPct <- USA$Bush/USA$Total
# Create the classification intervals using classIntervals(DATA, n=
# #OFCLASSES, style='STYLE') Styles are the basics like
# 'quantile','equal','fixed','sd','jenks', and apparently 'pretty'
cats7 <- classIntervals(USA$BushPct, n = 7, style = "quantile")
cats7  # Returns the bins for each classification.

## style: quantile
## [0.09308,0.4746)  [0.4746,0.5415)  [0.5415,0.5907)  [0.5907,0.6336) 
##              444              444              444              444 
##  [0.6336,0.6801)  [0.6801,0.7421)  [0.7421,0.9283] 
##              444              444              444

# Link the color pallette to the categories using
# findColours(CATEGORIES,PALETTE)
SevenColors <- findColours(cats7, pal7)
# Draw a map using specificed data and colors
plot(USA, col = SevenColors, main = "2004 Voting Percentage for Bush")  # The col= argument provides the coloring scheme.

plot of chunk unnamed-chunk-7

Mapping Deviations from the Mean

# Create a new column to hold the standardized percent bush (Z Scores) by
# subtracting the mean % and dividing by the std dev.
USA$BushPctZ <- (USA$BushPct - mean(USA$BushPct, na.rm = TRUE))/sd(USA$BushPct, 
    na.rm = TRUE)
# Crete a new palette and new categories with the standardized column and
# map.
pal7 <- brewer.pal(7, "Spectral")
cats7 <- classIntervals(USA$BushPctZ, n = 7, style = "quantile")
SevenColors <- findColours(cats7, pal7)
plot(USA, col = SevenColors, main = "2004 Voting Percentage for Bush")

plot of chunk unnamed-chunk-8

Multiple Regression

Null Hypothesis: The model does not explain the variation in perctage vote for George Bush better than the mean percentage.
Note that this is the null hypothesis for every regression model performed in this section. There is also an additional null hypothesis in the ANOVA for each independent variable. In that case, the null is always that the coefficient for that variable is 0, making RSS = MSS.
Trial 1

# The first linear model uses the percentage of the county population that
# is urban and the percentage that is 'poor', the percentage of households
# that are female-headed, and the median age in 2000.
lm1 <- lm(BushPct ~ pcturban + pctfemhh + pctpoor + MEDAGE2000, USA)
summary(lm1)  # The adj. R^2 is 0.2402

## 
## Call:
## lm(formula = BushPct ~ pcturban + pctfemhh + pctpoor + MEDAGE2000, 
##     data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5688 -0.0732  0.0108  0.0807  0.2666 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.76e-01   2.40e-02   32.33   <2e-16 ***
## pcturban    -6.05e-05   8.35e-05   -0.72    0.469    
## pctfemhh    -1.32e-02   5.10e-04  -25.95   <2e-16 ***
## pctpoor      2.76e-03   3.29e-04    8.38   <2e-16 ***
## MEDAGE2000  -1.10e-03   5.57e-04   -1.98    0.048 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.11 on 3103 degrees of freedom
## Multiple R-squared: 0.241,   Adjusted R-squared: 0.24 
## F-statistic:  247 on 4 and 3103 DF,  p-value: <2e-16

Reaction: I'm not surprised that poorer people prefer policies of Democrats, so are socially and fiscally liberal,and the for female-headed households, that may represent socially progressive counties. The Republican Party sells itself as a party of tradition. (Both of these have negative coefficients, indicating that higher percentages of those variables are associated with fewer votes for George Bush.) Median age shows that counties with a higher median age are less likely to vote for George Bush,which honestly is the opposite of what I would expect, based on the “tradtition” argument. Percent urban is not significant. I know in states like Pennsylvania and Ohio, the cities tend to be vote more democratic, but then again, a lot of counties have one midsized city and a lot of countryside, so that may mask the urban-rural split.

Observing Residuals

# Residuals and predicted values can be extracted using resid() and
# predict(), respectively, or they can be recalled using MODEL$residuals
# and MODEL$fitted.values.
lm1.resid <- resid(lm1)  # Extracting Residuals
lm1.predicted <- predict(lm1)  # Extracting Predicted Values
# We plot the residuals against the predicted y values to check if they're
# homoskedastic.  We also want to see a mean of zero and a linear fit to
# the residuals.  These residuals show us that we have many problems.
# Spatial dependence is perhaps the biggest variable omitted from the
# model.

plot(lm1.resid ~ lm1.predicted, main = "Model 1 Residuals", xlab = "Fitted Values", 
    ylab = "Residuals")  # Plot vs. the fitting y values

plot of chunk unnamed-chunk-11

Trial 2

# This model uses the percentage of households headed by females and home
# value.
lm2 <- lm(BushPct ~ pctfemhh + homevalu, data = USA)
summary(lm2)  # R2 is 0.255, a little better

## 
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6082 -0.0731  0.0089  0.0779  0.3558 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.77e-01   5.60e-03   138.9   <2e-16 ***
## pctfemhh    -1.01e-02   3.61e-04   -27.9   <2e-16 ***
## homevalu    -7.60e-07   6.01e-08   -12.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.109 on 3105 degrees of freedom
## Multiple R-squared: 0.256,   Adjusted R-squared: 0.255 
## F-statistic:  533 on 2 and 3105 DF,  p-value: <2e-16

Observing Residuals

lm2.resid <- resid(lm2)  # Extracting Residuals
lm2.predicted <- predict(lm2)  # Extracting Predicted Values

plot(lm2.resid ~ lm2.predicted, main = "Model 2 Residuals", xlab = "Fitted Values", 
    ylab = "Residuals")  # Plot vs. the fitting y variables

plot of chunk unnamed-chunk-14

Plotting Residuals on a Map

# SS: Since the residuals have something strange going on, we may have an
# omitted variable.  SS: Add a new column to USA to hold the residuals
USA$resid <- resid(lm2)
pal3 <- brewer.pal(3, "Spectral")
cats3 <- classIntervals(USA$resid, n = 3, style = "quantile")
ThreeColors <- findColours(cats3, pal3)
# The upper midwest, northwest coast, and New England are all over
# predicted (negative residuals).  The southern Great Plains are under
# predicted (positive residuals).
plot(USA, col = ThreeColors)

plot of chunk unnamed-chunk-15

Trying to Find a Better Model

Hereafter, the goal of this exercise is to improve on the model above by explaining more of the variation is county vote percentage for George Bush in 2004 and to improve the distribution of the residuals for the model.
Trial 3

# This model adds obesity as a proxy for location.  The percent obseity is
# a little higher in the South than other regions, and I know the South is
# generally under-predicted in previous models.
lm3 <- lm(BushPct ~ pctfemhh + homevalu + Obese, data = USA)
summary(lm3)  # The adjusted R^2 is 0.2831, the best so far.

## 
## Call:
## lm(formula = BushPct ~ pctfemhh + homevalu + Obese, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4962 -0.0770  0.0073  0.0790  0.3871 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.65e-01   1.15e-02    57.8   <2e-16 ***
## pctfemhh    -1.11e-02   3.65e-04   -30.3   <2e-16 ***
## homevalu    -7.99e-07   5.90e-08   -13.5   <2e-16 ***
## Obese        3.79e-01   3.42e-02    11.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.107 on 3104 degrees of freedom
## Multiple R-squared: 0.284,   Adjusted R-squared: 0.283 
## F-statistic:  410 on 3 and 3104 DF,  p-value: <2e-16

# But the residuals are still a blob.

plot(lm3$residuals ~ lm3$fitted.values, main = "Model 3 Residuals", xlab = "Fitted Values", 
    ylab = "Residuals")

plot of chunk unnamed-chunk-17

Trial 4

# Instead of home value, what about the percentage of the population that
# is Latino/Hispanic?  Theoretically, immigration issues would push them
# Democratic.
lm4 <- lm(BushPct ~ pctfemhh + HISP_LAT + Obese, data = USA)
summary(lm4)  # Adjsuted R^2 is 0.2423. No improvement here.

## 
## Call:
## lm(formula = BushPct ~ pctfemhh + HISP_LAT + Obese, data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4674 -0.0772  0.0083  0.0819  0.2620 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.635623   0.011881    53.5   <2e-16 ***
## pctfemhh    -0.011757   0.000373   -31.5   <2e-16 ***
## HISP_LAT     0.000398   0.000166     2.4    0.016 *  
## Obese        0.362561   0.035380    10.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.11 on 3104 degrees of freedom
## Multiple R-squared: 0.243,   Adjusted R-squared: 0.242 
## F-statistic:  332 on 3 and 3104 DF,  p-value: <2e-16

Trial 5

# I've decided to take a closer look at a single variable.

plot(BushPct ~ pctfemhh, data = USA, main = "BushPct v. femhh", xlab = "% Households Led by Female", 
    ylab = "% Voting for Bush")  # femhh shows a better correlation than most variables.

plot of chunk unnamed-chunk-20

# Check the residuals.
lm5 <- lm(BushPct ~ pctfemhh, data = USA)
# Use par(mfrow=c(#rows,#columns)) to make an array of plots.  The
# residuals show that it is not a linear relation -- a quadratic
# relationship would be better.

par(mfrow = c(2, 2))
plot(lm5, main = "Model 5 Plots")

plot of chunk unnamed-chunk-22

Trial 6

# That in find, I'll improve on lm3 by including a quadratic:
lm6 <- lm(BushPct ~ pctfemhh + I(pctfemhh^2) + Obese + homevalu, data = USA)
summary(lm6)  # 0.2957 is the best so far.  Woohoo!

## 
## Call:
## lm(formula = BushPct ~ pctfemhh + I(pctfemhh^2) + Obese + homevalu, 
##     data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4693 -0.0755  0.0071  0.0765  0.4674 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.38e-01   1.20e-02   53.41  < 2e-16 ***
## pctfemhh      -1.94e-03   1.26e-03   -1.54     0.12    
## I(pctfemhh^2) -2.75e-04   3.66e-05   -7.52  7.4e-14 ***
## Obese          2.82e-01   3.62e-02    7.80  8.2e-15 ***
## homevalu      -8.88e-07   5.97e-08  -14.88  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.106 on 3103 degrees of freedom
## Multiple R-squared: 0.297,   Adjusted R-squared: 0.296 
## F-statistic:  327 on 4 and 3103 DF,  p-value: <2e-16

# But the residuals are WEIRD.  They're less blobbed for sure, but now the
# mean is +0.6?  Gosh!

par(mfrow = c(1, 1))
plot(lm6$residuals, lm6$fitted.values, main = "Model 6 Residuals", xlab = "Fitted Values", 
    ylab = "Residuals")

plot of chunk unnamed-chunk-24

Trial 7

# Now let's look at college education.  College educated people are known
# for being less traditional, perhaps less likely to vote for the socially
# conservative party.

par(mfrow = c(1, 2))
plot(BushPct ~ pctcoled, data = USA, main = "BushPct v. pctcoled", xlab = "% College Educated", 
    ylab = "% Voting for Bush")  # Another negative correlation.
plot(pctcoled ~ pctfemhh, data = USA, main = "pctcoled v. pctfemhh", xlab = "% Households Led by Females", 
    ylab = "% College Educated")  # And not visually autocorrelated with female households.

plot of chunk unnamed-chunk-26

lm7 <- lm(BushPct ~ pctcoled, data = USA)
# This variable also seems to be better explained with a quadratic.

par(mfrow = c(1, 1))
plot(lm7$residuals ~ lm7$fitted.values, main = "Model 7 Residuals", xlab = "Fitted Values", 
    ylab = "Residuals")

plot of chunk unnamed-chunk-28

Trial 8

lm8 <- lm(BushPct ~ pctfemhh + I(pctfemhh^2) + Obese + homevalu + pctcoled + 
    I(pctcoled^2), data = USA)
summary(lm8)  # I'm over 30% for adj. R^2, so I'm pretty happy for this data set.

## 
## Call:
## lm(formula = BushPct ~ pctfemhh + I(pctfemhh^2) + Obese + homevalu + 
##     pctcoled + I(pctcoled^2), data = USA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4425 -0.0730  0.0058  0.0736  0.5377 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.11e-01   1.21e-02   50.34  < 2e-16 ***
## pctfemhh      -3.55e-03   1.25e-03   -2.85   0.0045 ** 
## I(pctfemhh^2) -2.28e-04   3.61e-05   -6.32  3.0e-10 ***
## Obese          1.79e-01   3.69e-02    4.85  1.3e-06 ***
## homevalu      -6.55e-07   7.79e-08   -8.41  < 2e-16 ***
## pctcoled       9.26e-03   9.91e-04    9.34  < 2e-16 ***
## I(pctcoled^2) -2.83e-04   2.52e-05  -11.21  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.104 on 3101 degrees of freedom
## Multiple R-squared: 0.325,   Adjusted R-squared: 0.324 
## F-statistic:  249 on 6 and 3101 DF,  p-value: <2e-16

# The residuals are also the best I've seen, although still atrocious.  At
# least they're back to a mean around 0!  Final Count: 0.3241.

plot(lm8$residuals ~ lm8$fitted.values, main = "Model 8 Residuals", xlab = "Fitted Values", 
    ylab = "Residuals")

plot of chunk unnamed-chunk-30