Exercise 5b - Alex Crawford

GEOG 5023: Quantitative Methods In Geography

Read dbf File

library(leaps)
library(foreign)
USA <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.dbf")
names(USA)
##  [1] "SP_ID"      "NAME"       "STATE_NAME" "STATE_FIPS" "CNTY_FIPS" 
##  [6] "FIPS"       "AREA"       "FIPS_num"   "Bush"       "Kerry"     
## [11] "County_F"   "Nader"      "Total"      "Bush_pct"   "Kerry_pct" 
## [16] "Nader_pct"  "MDratio"    "pcturban"   "pctfemhh"   "pcincome"  
## [21] "pctpoor"    "pctcoled"   "unemploy"   "homevalu"   "popdens"   
## [26] "Obese"      "Noins"      "HISP_LAT"   "MEDAGE2000" "PEROVER65"
Elect04 <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/2004_Election_Counties.dbf")
names(Elect04)
##  [1] "NAME"       "STATE_NAME" "STATE_FIPS" "CNTY_FIPS"  "FIPS"      
##  [6] "AREA"       "FIPS_num"   "Bush"       "Kerry"      "County_F"  
## [11] "Nader"      "Total"      "Bush_pct"   "Kerry_pct"  "Nader_pct" 
## [16] "MDratio"    "hosp"       "pcthisp"    "pcturban"   "urbrural"  
## [21] "pctfemhh"   "pcincome"   "pctpoor"    "pctlt9ed"   "pcthsed"   
## [26] "pctcoled"   "unemploy"   "pctwhtcl"   "homevalu"   "rent"      
## [31] "popdens"    "crowded"    "ginirev"    "SmokecurM"  "SmokevrM"  
## [36] "SmokecurF"  "SmokevrF"   "Obese"      "Noins"      "XYLENES__M"
## [41] "TOLUENE"    "TETRACHLOR" "STYRENE"    "NICKEL_COM" "METHYLENE_"
## [46] "MERCURY_CO" "LEAD_COMPO" "BENZENE__I" "ARSENIC_CO" "POP2000"   
## [51] "POP00SQMIL" "MALE2000"   "FEMALE2000" "MAL2FEM"    "UNDER18"   
## [56] "AIAN"       "ASIA"       "BLACK"      "NHPI"       "WHITE"     
## [61] "AIAN_MORE"  "ASIA_MORE"  "BLK_MORE"   "NHPI_MORE"  "WHT_MORE"  
## [66] "HISP_LAT"   "CH19902000" "MEDAGE2000" "PEROVER65"

Use regsubsets

# regsubsets uses a search algorithm to find the best models for any
# number of variables.  It literally runs through every possibility and
# spits out the best model (or models) based on ...  nbest= selects the
# number of models to save for each # of variables.  nvmax= selects the
# maximum number of variables to use for a model.  really.big= allows it
# to run a lot of values
library(leaps)
mod1 <- regsubsets(Bush_pct ~ ., data = USA[, c(14, 16:30)], nbest = 1, nvmax = 2)  # For the smaller data set
mod2 <- regsubsets(Bush_pct ~ ., data = Elect04[, c(13, 15:51, 53:69)], nbest = 1, 
    nvmax = 8, really.big = T)  # For the larger data set
# Be careful of auto-correlation -- it might freeze resubsets, and it at
# least gives a warning message nvmax=8 is about as complicated as my old
# computer can handle.
summary(mod2)
## Subset selection object
## Call: regsubsets.formula(Bush_pct ~ ., data = Elect04[, c(13, 15:51, 
##     53:69)], nbest = 1, nvmax = 8, really.big = T)
## 54 Variables  (and intercept)
##            Forced in Forced out
## Nader_pct      FALSE      FALSE
## MDratio        FALSE      FALSE
## hosp           FALSE      FALSE
## pcthisp        FALSE      FALSE
## pcturban       FALSE      FALSE
## urbrural       FALSE      FALSE
## pctfemhh       FALSE      FALSE
## pcincome       FALSE      FALSE
## pctpoor        FALSE      FALSE
## pctlt9ed       FALSE      FALSE
## pcthsed        FALSE      FALSE
## pctcoled       FALSE      FALSE
## unemploy       FALSE      FALSE
## pctwhtcl       FALSE      FALSE
## homevalu       FALSE      FALSE
## rent           FALSE      FALSE
## popdens        FALSE      FALSE
## crowded        FALSE      FALSE
## ginirev        FALSE      FALSE
## SmokecurM      FALSE      FALSE
## SmokevrM       FALSE      FALSE
## SmokecurF      FALSE      FALSE
## SmokevrF       FALSE      FALSE
## Obese          FALSE      FALSE
## Noins          FALSE      FALSE
## XYLENES__M     FALSE      FALSE
## TOLUENE        FALSE      FALSE
## TETRACHLOR     FALSE      FALSE
## STYRENE        FALSE      FALSE
## NICKEL_COM     FALSE      FALSE
## METHYLENE_     FALSE      FALSE
## MERCURY_CO     FALSE      FALSE
## LEAD_COMPO     FALSE      FALSE
## BENZENE__I     FALSE      FALSE
## ARSENIC_CO     FALSE      FALSE
## POP2000        FALSE      FALSE
## POP00SQMIL     FALSE      FALSE
## FEMALE2000     FALSE      FALSE
## MAL2FEM        FALSE      FALSE
## UNDER18        FALSE      FALSE
## AIAN           FALSE      FALSE
## ASIA           FALSE      FALSE
## BLACK          FALSE      FALSE
## NHPI           FALSE      FALSE
## WHITE          FALSE      FALSE
## AIAN_MORE      FALSE      FALSE
## ASIA_MORE      FALSE      FALSE
## BLK_MORE       FALSE      FALSE
## NHPI_MORE      FALSE      FALSE
## WHT_MORE       FALSE      FALSE
## HISP_LAT       FALSE      FALSE
## CH19902000     FALSE      FALSE
## MEDAGE2000     FALSE      FALSE
## PEROVER65      FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          Nader_pct MDratio hosp pcthisp pcturban urbrural pctfemhh
## 1  ( 1 ) " "       " "     " "  " "     " "      " "      "*"     
## 2  ( 1 ) " "       " "     " "  " "     " "      " "      "*"     
## 3  ( 1 ) " "       " "     " "  " "     " "      " "      "*"     
## 4  ( 1 ) "*"       " "     " "  " "     " "      " "      "*"     
## 5  ( 1 ) " "       " "     " "  " "     " "      " "      "*"     
## 6  ( 1 ) "*"       " "     " "  " "     " "      " "      "*"     
## 7  ( 1 ) "*"       " "     " "  " "     " "      " "      "*"     
## 8  ( 1 ) "*"       " "     " "  " "     " "      " "      "*"     
##          pcincome pctpoor pctlt9ed pcthsed pctcoled unemploy pctwhtcl
## 1  ( 1 ) " "      " "     " "      " "     " "      " "      " "     
## 2  ( 1 ) " "      " "     " "      " "     " "      " "      " "     
## 3  ( 1 ) " "      " "     " "      " "     " "      " "      " "     
## 4  ( 1 ) " "      " "     " "      " "     " "      " "      " "     
## 5  ( 1 ) " "      " "     " "      " "     "*"      "*"      " "     
## 6  ( 1 ) " "      " "     " "      " "     " "      "*"      " "     
## 7  ( 1 ) " "      " "     " "      " "     " "      "*"      " "     
## 8  ( 1 ) " "      " "     " "      " "     "*"      "*"      " "     
##          homevalu rent popdens crowded ginirev SmokecurM SmokevrM
## 1  ( 1 ) " "      " "  " "     " "     " "     " "       " "     
## 2  ( 1 ) " "      " "  " "     " "     "*"     " "       " "     
## 3  ( 1 ) " "      " "  " "     " "     "*"     " "       " "     
## 4  ( 1 ) " "      " "  " "     " "     "*"     " "       " "     
## 5  ( 1 ) " "      " "  " "     " "     "*"     " "       " "     
## 6  ( 1 ) " "      " "  " "     " "     "*"     " "       " "     
## 7  ( 1 ) " "      " "  " "     " "     "*"     " "       " "     
## 8  ( 1 ) " "      " "  " "     " "     "*"     " "       " "     
##          SmokecurF SmokevrF Obese Noins XYLENES__M TOLUENE TETRACHLOR
## 1  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
## 2  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
## 3  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
## 4  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
## 5  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
## 6  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
## 7  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
## 8  ( 1 ) " "       " "      " "   " "   " "        " "     " "       
##          STYRENE NICKEL_COM METHYLENE_ MERCURY_CO LEAD_COMPO BENZENE__I
## 1  ( 1 ) " "     " "        " "        " "        " "        " "       
## 2  ( 1 ) " "     " "        " "        " "        " "        " "       
## 3  ( 1 ) " "     " "        " "        " "        " "        " "       
## 4  ( 1 ) " "     " "        " "        " "        " "        " "       
## 5  ( 1 ) " "     " "        " "        " "        " "        " "       
## 6  ( 1 ) " "     " "        " "        " "        " "        " "       
## 7  ( 1 ) " "     " "        " "        " "        " "        " "       
## 8  ( 1 ) " "     " "        " "        " "        " "        " "       
##          ARSENIC_CO POP2000 POP00SQMIL FEMALE2000 MAL2FEM UNDER18 AIAN
## 1  ( 1 ) " "        " "     " "        " "        " "     " "     " " 
## 2  ( 1 ) " "        " "     " "        " "        " "     " "     " " 
## 3  ( 1 ) " "        " "     " "        " "        " "     "*"     " " 
## 4  ( 1 ) " "        " "     " "        " "        " "     "*"     " " 
## 5  ( 1 ) " "        " "     " "        " "        " "     "*"     " " 
## 6  ( 1 ) " "        " "     " "        " "        " "     "*"     " " 
## 7  ( 1 ) " "        " "     " "        " "        " "     "*"     "*" 
## 8  ( 1 ) " "        " "     " "        " "        " "     "*"     "*" 
##          ASIA BLACK NHPI WHITE AIAN_MORE ASIA_MORE BLK_MORE NHPI_MORE
## 1  ( 1 ) " "  " "   " "  " "   " "       " "       " "      " "      
## 2  ( 1 ) " "  " "   " "  " "   " "       " "       " "      " "      
## 3  ( 1 ) " "  " "   " "  " "   " "       " "       " "      " "      
## 4  ( 1 ) " "  " "   " "  " "   " "       " "       " "      " "      
## 5  ( 1 ) " "  " "   " "  " "   " "       " "       " "      " "      
## 6  ( 1 ) " "  " "   " "  " "   " "       "*"       " "      " "      
## 7  ( 1 ) " "  " "   " "  " "   " "       "*"       " "      " "      
## 8  ( 1 ) "*"  " "   " "  " "   " "       " "       " "      " "      
##          WHT_MORE HISP_LAT CH19902000 MEDAGE2000 PEROVER65
## 1  ( 1 ) " "      " "      " "        " "        " "      
## 2  ( 1 ) " "      " "      " "        " "        " "      
## 3  ( 1 ) " "      " "      " "        " "        " "      
## 4  ( 1 ) " "      " "      " "        " "        " "      
## 5  ( 1 ) " "      " "      " "        " "        " "      
## 6  ( 1 ) " "      " "      " "        " "        " "      
## 7  ( 1 ) " "      " "      " "        " "        " "      
## 8  ( 1 ) " "      " "      " "        " "        " "
names(mod2)
##  [1] "np"        "nrbar"     "d"         "rbar"      "thetab"   
##  [6] "first"     "last"      "vorder"    "tol"       "rss"      
## [11] "bound"     "nvmax"     "ress"      "ir"        "nbest"    
## [16] "lopt"      "il"        "ier"       "xnames"    "method"   
## [21] "force.in"  "force.out" "sserr"     "intercept" "lindep"   
## [26] "nullrss"   "nn"        "call"

Plotting Resubsets Results

# Save results summary to a object
mod2_sum <- summary(mod2)
# Plot regsubsets results scree plots to identify the best model.  Two
# useful variables within the summary object are 'adjr2' and 'bic', which
# are both measures of model fit that include a penalty for including
# additional independent variables.  Higher adj. r2 and lower bic indicate
# a better model fit.
par(mfrow = c(1, 2))
plot(1:8, y = mod2_sum$adjr2, type = "o", xlab = "Number of Parameters", ylab = "Adj. r^2", 
    main = "Adj. r^2 by Model Size")
plot(1:8, y = mod2_sum$bic, type = "o", xlab = "Number of Parameters", ylab = "BIC", 
    main = "BIC by Model Size")

plot of chunk unnamed-chunk-3


# Alternatively, results can be visualized by plotting the original
# regsubsets object (as opposed to the summary object).  This plots bic
# and the number of variables.  The bonus over the previous method is that
# it also tells you which variables to take for each model size.
par(mfrow = c(1, 1))
plot(mod2, main = "Regsubsets Results by BIC")

plot of chunk unnamed-chunk-3

Examining the best 8-var Model

m8a <- lm(Bush_pct ~ ASIA + AIAN + UNDER18 + ginirev + unemploy + pctcoled + 
    pctfemhh + Nader_pct, data = Elect04)
summary(m8a)  # We can explain 46% of the Total SS with this model.
## 
## Call:
## lm(formula = Bush_pct ~ ASIA + AIAN + UNDER18 + ginirev + unemploy + 
##     pctcoled + pctfemhh + Nader_pct, data = Elect04)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53.26  -6.08   0.71   6.78  31.04 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31.8011     1.7382   18.30  < 2e-16 ***
## ASIA         -1.0155     0.1286   -7.90  4.0e-15 ***
## AIAN         -0.2264     0.0286   -7.93  3.1e-15 ***
## UNDER18       1.0180     0.0549   18.55  < 2e-16 ***
## ginirev      77.7389     3.2919   23.61  < 2e-16 ***
## unemploy     -0.8912     0.0645  -13.81  < 2e-16 ***
## pctcoled     -0.2522     0.0331   -7.62  3.5e-14 ***
## pctfemhh     -1.3787     0.0393  -35.04  < 2e-16 ***
## Nader_pct    -4.1939     0.3602  -11.64  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 9.37 on 3102 degrees of freedom
## Multiple R-squared: 0.462,   Adjusted R-squared: 0.46 
## F-statistic:  333 on 8 and 3102 DF,  p-value: <2e-16
# Every varaible shows up significant, and the F-stat is golden.

## Examining Residuals
plot(m8a$fitted.values, m8a$residuals, main = "Residual Plot for Model m8a", 
    xlab = "Fitted Values (% Votes for Bush)", ylab = "Residuals", cex = 0.6)

plot of chunk unnamed-chunk-4

# A little bias for overprediction, perhaps, but not too bad.
hist(m8a$residuals, main = "Residuals for Model m8a", xlab = "Residuals", col = "cyan")

plot of chunk unnamed-chunk-4

# There is a larger righthand tail than lefthand tail, but not too shabby.
qqnorm(m8a$residuals, main = "QQ-Plot Residuals for Model m8a")
qqline(m8a$residuals)

plot of chunk unnamed-chunk-4

# And it was looking so good...  Yeah, not quite normal.
shapiro.test(m8a$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  m8a$residuals 
## W = 0.9869, p-value = 2.45e-16
# That's the last nail in the coffin.  The data being in the thousands may
# have something to do with it, but it's REALLY significant.

## Test for Multicollinearity
library(car)
## Loading required package: MASS
## Loading required package: nnet
m8a_vif <- vif(m8a)
summary(m8a_vif)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.13    1.17    1.45    1.42    1.65    1.69
# Mean VIF = 1.418 Range if 1.127 to 1.686 So there isn't much concern
# here for multicollinearity.