Read dbf File
library(leaps)
library(foreign)
USA <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/USAcopy.dbf")
names(USA)
## [1] "SP_ID" "NAME" "STATE_NAME" "STATE_FIPS" "CNTY_FIPS"
## [6] "FIPS" "AREA" "FIPS_num" "Bush" "Kerry"
## [11] "County_F" "Nader" "Total" "Bush_pct" "Kerry_pct"
## [16] "Nader_pct" "MDratio" "pcturban" "pctfemhh" "pcincome"
## [21] "pctpoor" "pctcoled" "unemploy" "homevalu" "popdens"
## [26] "Obese" "Noins" "HISP_LAT" "MEDAGE2000" "PEROVER65"
Elect04 <- read.dbf("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/2004_Election_Counties.dbf")
names(Elect04)
## [1] "NAME" "STATE_NAME" "STATE_FIPS" "CNTY_FIPS" "FIPS"
## [6] "AREA" "FIPS_num" "Bush" "Kerry" "County_F"
## [11] "Nader" "Total" "Bush_pct" "Kerry_pct" "Nader_pct"
## [16] "MDratio" "hosp" "pcthisp" "pcturban" "urbrural"
## [21] "pctfemhh" "pcincome" "pctpoor" "pctlt9ed" "pcthsed"
## [26] "pctcoled" "unemploy" "pctwhtcl" "homevalu" "rent"
## [31] "popdens" "crowded" "ginirev" "SmokecurM" "SmokevrM"
## [36] "SmokecurF" "SmokevrF" "Obese" "Noins" "XYLENES__M"
## [41] "TOLUENE" "TETRACHLOR" "STYRENE" "NICKEL_COM" "METHYLENE_"
## [46] "MERCURY_CO" "LEAD_COMPO" "BENZENE__I" "ARSENIC_CO" "POP2000"
## [51] "POP00SQMIL" "MALE2000" "FEMALE2000" "MAL2FEM" "UNDER18"
## [56] "AIAN" "ASIA" "BLACK" "NHPI" "WHITE"
## [61] "AIAN_MORE" "ASIA_MORE" "BLK_MORE" "NHPI_MORE" "WHT_MORE"
## [66] "HISP_LAT" "CH19902000" "MEDAGE2000" "PEROVER65"
Use regsubsets
# regsubsets uses a search algorithm to find the best models for any
# number of variables. It literally runs through every possibility and
# spits out the best model (or models) based on ... nbest= selects the
# number of models to save for each # of variables. nvmax= selects the
# maximum number of variables to use for a model. really.big= allows it
# to run a lot of values
library(leaps)
mod1 <- regsubsets(Bush_pct ~ ., data = USA[, c(14, 16:30)], nbest = 1, nvmax = 2) # For the smaller data set
mod2 <- regsubsets(Bush_pct ~ ., data = Elect04[, c(13, 15:51, 53:69)], nbest = 1,
nvmax = 8, really.big = T) # For the larger data set
# Be careful of auto-correlation -- it might freeze resubsets, and it at
# least gives a warning message nvmax=8 is about as complicated as my old
# computer can handle.
summary(mod2)
## Subset selection object
## Call: regsubsets.formula(Bush_pct ~ ., data = Elect04[, c(13, 15:51,
## 53:69)], nbest = 1, nvmax = 8, really.big = T)
## 54 Variables (and intercept)
## Forced in Forced out
## Nader_pct FALSE FALSE
## MDratio FALSE FALSE
## hosp FALSE FALSE
## pcthisp FALSE FALSE
## pcturban FALSE FALSE
## urbrural FALSE FALSE
## pctfemhh FALSE FALSE
## pcincome FALSE FALSE
## pctpoor FALSE FALSE
## pctlt9ed FALSE FALSE
## pcthsed FALSE FALSE
## pctcoled FALSE FALSE
## unemploy FALSE FALSE
## pctwhtcl FALSE FALSE
## homevalu FALSE FALSE
## rent FALSE FALSE
## popdens FALSE FALSE
## crowded FALSE FALSE
## ginirev FALSE FALSE
## SmokecurM FALSE FALSE
## SmokevrM FALSE FALSE
## SmokecurF FALSE FALSE
## SmokevrF FALSE FALSE
## Obese FALSE FALSE
## Noins FALSE FALSE
## XYLENES__M FALSE FALSE
## TOLUENE FALSE FALSE
## TETRACHLOR FALSE FALSE
## STYRENE FALSE FALSE
## NICKEL_COM FALSE FALSE
## METHYLENE_ FALSE FALSE
## MERCURY_CO FALSE FALSE
## LEAD_COMPO FALSE FALSE
## BENZENE__I FALSE FALSE
## ARSENIC_CO FALSE FALSE
## POP2000 FALSE FALSE
## POP00SQMIL FALSE FALSE
## FEMALE2000 FALSE FALSE
## MAL2FEM FALSE FALSE
## UNDER18 FALSE FALSE
## AIAN FALSE FALSE
## ASIA FALSE FALSE
## BLACK FALSE FALSE
## NHPI FALSE FALSE
## WHITE FALSE FALSE
## AIAN_MORE FALSE FALSE
## ASIA_MORE FALSE FALSE
## BLK_MORE FALSE FALSE
## NHPI_MORE FALSE FALSE
## WHT_MORE FALSE FALSE
## HISP_LAT FALSE FALSE
## CH19902000 FALSE FALSE
## MEDAGE2000 FALSE FALSE
## PEROVER65 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## Nader_pct MDratio hosp pcthisp pcturban urbrural pctfemhh
## 1 ( 1 ) " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " " " "*"
## 4 ( 1 ) "*" " " " " " " " " " " "*"
## 5 ( 1 ) " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" " " " " " " " " " " "*"
## pcincome pctpoor pctlt9ed pcthsed pctcoled unemploy pctwhtcl
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " "*" "*" " "
## 6 ( 1 ) " " " " " " " " " " "*" " "
## 7 ( 1 ) " " " " " " " " " " "*" " "
## 8 ( 1 ) " " " " " " " " "*" "*" " "
## homevalu rent popdens crowded ginirev SmokecurM SmokevrM
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " " " "
## 3 ( 1 ) " " " " " " " " "*" " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " "
## 5 ( 1 ) " " " " " " " " "*" " " " "
## 6 ( 1 ) " " " " " " " " "*" " " " "
## 7 ( 1 ) " " " " " " " " "*" " " " "
## 8 ( 1 ) " " " " " " " " "*" " " " "
## SmokecurF SmokevrF Obese Noins XYLENES__M TOLUENE TETRACHLOR
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " " " "
## STYRENE NICKEL_COM METHYLENE_ MERCURY_CO LEAD_COMPO BENZENE__I
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " "
## ARSENIC_CO POP2000 POP00SQMIL FEMALE2000 MAL2FEM UNDER18 AIAN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " "*" " "
## 4 ( 1 ) " " " " " " " " " " "*" " "
## 5 ( 1 ) " " " " " " " " " " "*" " "
## 6 ( 1 ) " " " " " " " " " " "*" " "
## 7 ( 1 ) " " " " " " " " " " "*" "*"
## 8 ( 1 ) " " " " " " " " " " "*" "*"
## ASIA BLACK NHPI WHITE AIAN_MORE ASIA_MORE BLK_MORE NHPI_MORE
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " "*" " " " "
## 7 ( 1 ) " " " " " " " " " " "*" " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " " " "
## WHT_MORE HISP_LAT CH19902000 MEDAGE2000 PEROVER65
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " "
names(mod2)
## [1] "np" "nrbar" "d" "rbar" "thetab"
## [6] "first" "last" "vorder" "tol" "rss"
## [11] "bound" "nvmax" "ress" "ir" "nbest"
## [16] "lopt" "il" "ier" "xnames" "method"
## [21] "force.in" "force.out" "sserr" "intercept" "lindep"
## [26] "nullrss" "nn" "call"
Plotting Resubsets Results
# Save results summary to a object
mod2_sum <- summary(mod2)
# Plot regsubsets results scree plots to identify the best model. Two
# useful variables within the summary object are 'adjr2' and 'bic', which
# are both measures of model fit that include a penalty for including
# additional independent variables. Higher adj. r2 and lower bic indicate
# a better model fit.
par(mfrow = c(1, 2))
plot(1:8, y = mod2_sum$adjr2, type = "o", xlab = "Number of Parameters", ylab = "Adj. r^2",
main = "Adj. r^2 by Model Size")
plot(1:8, y = mod2_sum$bic, type = "o", xlab = "Number of Parameters", ylab = "BIC",
main = "BIC by Model Size")
# Alternatively, results can be visualized by plotting the original
# regsubsets object (as opposed to the summary object). This plots bic
# and the number of variables. The bonus over the previous method is that
# it also tells you which variables to take for each model size.
par(mfrow = c(1, 1))
plot(mod2, main = "Regsubsets Results by BIC")
Examining the best 8-var Model
m8a <- lm(Bush_pct ~ ASIA + AIAN + UNDER18 + ginirev + unemploy + pctcoled +
pctfemhh + Nader_pct, data = Elect04)
summary(m8a) # We can explain 46% of the Total SS with this model.
##
## Call:
## lm(formula = Bush_pct ~ ASIA + AIAN + UNDER18 + ginirev + unemploy +
## pctcoled + pctfemhh + Nader_pct, data = Elect04)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.26 -6.08 0.71 6.78 31.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.8011 1.7382 18.30 < 2e-16 ***
## ASIA -1.0155 0.1286 -7.90 4.0e-15 ***
## AIAN -0.2264 0.0286 -7.93 3.1e-15 ***
## UNDER18 1.0180 0.0549 18.55 < 2e-16 ***
## ginirev 77.7389 3.2919 23.61 < 2e-16 ***
## unemploy -0.8912 0.0645 -13.81 < 2e-16 ***
## pctcoled -0.2522 0.0331 -7.62 3.5e-14 ***
## pctfemhh -1.3787 0.0393 -35.04 < 2e-16 ***
## Nader_pct -4.1939 0.3602 -11.64 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.37 on 3102 degrees of freedom
## Multiple R-squared: 0.462, Adjusted R-squared: 0.46
## F-statistic: 333 on 8 and 3102 DF, p-value: <2e-16
# Every varaible shows up significant, and the F-stat is golden.
## Examining Residuals
plot(m8a$fitted.values, m8a$residuals, main = "Residual Plot for Model m8a",
xlab = "Fitted Values (% Votes for Bush)", ylab = "Residuals", cex = 0.6)
# A little bias for overprediction, perhaps, but not too bad.
hist(m8a$residuals, main = "Residuals for Model m8a", xlab = "Residuals", col = "cyan")
# There is a larger righthand tail than lefthand tail, but not too shabby.
qqnorm(m8a$residuals, main = "QQ-Plot Residuals for Model m8a")
qqline(m8a$residuals)
# And it was looking so good... Yeah, not quite normal.
shapiro.test(m8a$residuals)
##
## Shapiro-Wilk normality test
##
## data: m8a$residuals
## W = 0.9869, p-value = 2.45e-16
# That's the last nail in the coffin. The data being in the thousands may
# have something to do with it, but it's REALLY significant.
## Test for Multicollinearity
library(car)
## Loading required package: MASS
## Loading required package: nnet
m8a_vif <- vif(m8a)
summary(m8a_vif)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.13 1.17 1.45 1.42 1.65 1.69
# Mean VIF = 1.418 Range if 1.127 to 1.686 So there isn't much concern
# here for multicollinearity.