Install and load required packages as well as load the data

#install.packages("tidyverse")
library(tidyverse)
#install.packages("readr")
library(readr)
#install.packages("leaps")
library(leaps)
#install.packages("rpart")
library(rpart)
#install.packages("rpart.plot")
library(rpart.plot)
OR_Data <- read_csv("OR_Data.csv")

Exploratory data analysis

# histogram of SNAP benefit usage, colored by whether a tract is Urban
ggplot(OR_Data, aes(x = TractSNAP, fill= as.factor(Urban)))+
  geom_histogram(position="dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# scatterplot of SNAP benefit usage as a function of low income population
ggplot(OR_Data, aes(x = TractLOWI, y = TractSNAP))+
  geom_point()

# histogram of low income population, colored by whether a tract is considered low income
ggplot(OR_Data, aes(TractLOWI,fill=as.factor(LowIncomeTracts)))+
  geom_histogram(position="dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# side by side boxplots comparing SNAP benefit usage for urban and non-urban tracts
ggplot(OR_Data, aes(y = TractSNAP, fill = as.factor(Urban)))+
  geom_boxplot()

# side by side boxplots comparing SNAP benefit usage for tracts with and without low vehicle access
ggplot(OR_Data, aes(y = TractSNAP, fill = as.factor(HUNVFlag)))+
  geom_boxplot()

simple linear regression - 1 numerical explanatory variable

mod1 <-lm(TractSNAP~TractLOWI, data = OR_Data)
summary(mod1)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI, data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -725.51  -58.50  -10.91   49.46  764.93 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.074361   7.734538  -0.656    0.512    
## TractLOWI    0.208255   0.004197  49.624   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115.1 on 824 degrees of freedom
## Multiple R-squared:  0.7493, Adjusted R-squared:  0.749 
## F-statistic:  2463 on 1 and 824 DF,  p-value: < 2.2e-16
ggplot(OR_Data, aes(TractLOWI, TractSNAP))+
  geom_point()+
  geom_abline(intercept = mod1$coefficients[1], slope = mod1$coefficients[2], col = "blue")

anova(mod1)
## Analysis of Variance Table
## 
## Response: TractSNAP
##            Df   Sum Sq  Mean Sq F value    Pr(>F)    
## TractLOWI   1 32649335 32649335  2462.6 < 2.2e-16 ***
## Residuals 824 10924709    13258                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod1)

linear regression - 1 categorical explanatory variable

1. Using Urban as the categorical explanatory variable

mod2a <- lm(TractSNAP~as.factor(Urban), data = OR_Data)
anova(mod2a)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                   Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Urban)   1  1338603 1338603  26.116 3.999e-07 ***
## Residuals        824 42235441   51257                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(OR_Data, aes(as.factor(Urban), TractSNAP, fill = as.factor(Urban)))+
  geom_boxplot()

2. Using HUNVFlag as the categorical explanatory variable

mod2b <- lm(TractSNAP~as.factor(HUNVFlag), data = OR_Data)
anova(mod2b)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                      Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(HUNVFlag)   1  5542228 5542228  120.08 < 2.2e-16 ***
## Residuals           824 38031816   46155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(OR_Data, aes(as.factor(HUNVFlag), TractSNAP, fill = as.factor(HUNVFlag)))+
  geom_boxplot()

multiple linear regression - 1 numerical and 1 categorical explanatory variable, parallel lines

1. Using Urban as the categorical explanatory variable

mod3a <- lm(TractSNAP~TractLOWI + as.factor(Urban), data = OR_Data)
summary(mod3a)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(Urban), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -723.23  -57.35  -10.11   50.14  762.04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -15.465079   9.449001  -1.637   0.1021    
## TractLOWI           0.206921   0.004248  48.712   <2e-16 ***
## as.factor(Urban)1  17.286935   9.059351   1.908   0.0567 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115 on 823 degrees of freedom
## Multiple R-squared:  0.7504, Adjusted R-squared:  0.7498 
## F-statistic:  1237 on 2 and 823 DF,  p-value: < 2.2e-16
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(Urban)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod3a$coefficients[1], slope = mod3a$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod3a$coefficients[1] + mod3a$coefficients[3], slope = mod3a$coefficients[2], color = "blue", lwd = 1)

2. Using HUNVFlag as the categorical explanatory variable

mod3b <- lm(TractSNAP~TractLOWI + as.factor(HUNVFlag), data = OR_Data)
summary(mod3b)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -682.86  -60.14   -8.70   51.09  782.79 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.776483   7.654497  -0.493    0.622    
## TractLOWI             0.201866   0.004394  45.937  < 2e-16 ***
## as.factor(HUNVFlag)1 48.005295  10.854084   4.423 1.11e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.9 on 823 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.7545 
## F-statistic:  1269 on 2 and 823 DF,  p-value: < 2.2e-16
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod3b$coefficients[1], slope = mod3b$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod3b$coefficients[1] + mod3b$coefficients[3], slope = mod3b$coefficients[2], color = "blue", lwd = 1)

multiple linear regression - 1 numerical and 1 categorical explanatory variable, unrelated lines

1. Using Urban as the categorical explanatory variable

mod4a <- lm(TractSNAP~TractLOWI + as.factor(Urban) +
             TractLOWI*as.factor(Urban), data = OR_Data)
summary(mod4a)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(Urban) + TractLOWI * 
##     as.factor(Urban), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -722.39  -57.46  -10.29   50.19  762.25 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.645e+01  1.532e+01  -1.074    0.283    
## TractLOWI                    2.077e-01  1.006e-02  20.648   <2e-16 ***
## as.factor(Urban)1            1.854e+01  1.785e+01   1.039    0.299    
## TractLOWI:as.factor(Urban)1 -9.056e-04  1.110e-02  -0.082    0.935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115 on 822 degrees of freedom
## Multiple R-squared:  0.7504, Adjusted R-squared:  0.7495 
## F-statistic: 823.7 on 3 and 822 DF,  p-value: < 2.2e-16
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(Urban)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod4a$coefficients[1], slope = mod4a$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod4a$coefficients[1] + mod4a$coefficients[3], slope = mod4a$coefficients[2] + mod4a$coefficients[4], color = "blue", lwd = 1)

2. Using HUNVFlag as the categorical explanatory variable

mod4b <- lm(TractSNAP~TractLOWI + as.factor(HUNVFlag) +
             TractLOWI*as.factor(HUNVFlag), data = OR_Data)
summary(mod4b)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag) + TractLOWI * 
##     as.factor(HUNVFlag), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -693.58  -60.00   -7.82   50.65  779.70 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -6.586162   8.503727  -0.775  0.43886    
## TractLOWI                       0.203833   0.005102  39.948  < 2e-16 ***
## as.factor(HUNVFlag)1           63.498511  23.112703   2.747  0.00614 ** 
## TractLOWI:as.factor(HUNVFlag)1 -0.007629   0.010048  -0.759  0.44788    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.9 on 822 degrees of freedom
## Multiple R-squared:  0.7553, Adjusted R-squared:  0.7544 
## F-statistic: 845.6 on 3 and 822 DF,  p-value: < 2.2e-16
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod4b$coefficients[1], slope = mod4b$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod4b$coefficients[1] + mod4b$coefficients[3], slope = mod4b$coefficients[2] + mod4b$coefficients[4], color = "blue", lwd = 1)

Calculating MSE’s for the simple model, the parallel lines model with HUNVFlag, and tthe unrelated lines model with HUNVFlag:

anova(mod1)
## Analysis of Variance Table
## 
## Response: TractSNAP
##            Df   Sum Sq  Mean Sq F value    Pr(>F)    
## TractLOWI   1 32649335 32649335  2462.6 < 2.2e-16 ***
## Residuals 824 10924709    13258                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod3b)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                      Df   Sum Sq  Mean Sq  F value    Pr(>F)    
## TractLOWI             1 32649335 32649335 2518.059 < 2.2e-16 ***
## as.factor(HUNVFlag)   1   253630   253630   19.561 1.105e-05 ***
## Residuals           823 10671080    12966                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod4b)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                                Df   Sum Sq  Mean Sq   F value    Pr(>F)    
## TractLOWI                       1 32649335 32649335 2516.7630 < 2.2e-16 ***
## as.factor(HUNVFlag)             1   253630   253630   19.5510 1.111e-05 ***
## TractLOWI:as.factor(HUNVFlag)   1     7480     7480    0.5766    0.4479    
## Residuals                     822 10663600    12973                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Tree for low income tract

set.seed(1)
OR_Data<-OR_Data[,-c(2:3)] # removing county and state from the dataset, so that the tree will work
Y_N_LowInc <- ifelse(OR_Data$LowIncomeTracts == 0, "No", "Yes")
OR_Data <- data.frame(OR_Data, Y_N_LowInc)
train_data <- sample(1:nrow(OR_Data), nrow(OR_Data)/2)

tree.lowInc <- rpart(Y_N_LowInc~.-LowIncomeTracts, 
                     data = OR_Data, 
                     subset = train_data, 
                     method = "class")

prp(tree.lowInc, faclen = 0, cex = 0.7, extra = 1, space = .5)

OR.test <- OR_Data[-train_data, "Y_N_LowInc"]
tree.pred <- predict(tree.lowInc, newdata = OR_Data, type = "class")
test.pred <- tree.pred[-train_data]
cm<- table(test.pred, OR.test)
cm
##          OR.test
## test.pred  No Yes
##       No  254   5
##       Yes   3 151
err_rate <- 1-sum(diag(cm))/sum(cm)
err_rate
## [1] 0.01937046
summary(tree.lowInc)
## Call:
## rpart(formula = Y_N_LowInc ~ . - LowIncomeTracts, data = OR_Data, 
##     subset = train_data, method = "class")
##   n= 413 
## 
##           CP nsplit  rel error    xerror       xstd
## 1 0.71069182      0 1.00000000 1.0000000 0.06219325
## 2 0.14465409      1 0.28930818 0.3081761 0.04133106
## 3 0.04402516      2 0.14465409 0.1635220 0.03104345
## 4 0.01000000      4 0.05660377 0.0754717 0.02146795
## 
## Variable importance
## MedianFamilyIncome        PovertyRate          TractSNAP          TractLOWI 
##                 32                 20                 10                  9 
##  LILATracts_1And10         TractBlack          TractAIAN         TractAsian 
##                  8                  4                  4                  3 
##          TractHUNV       TractOMultir         TractNHOPI      TractHispanic 
##                  2                  2                  2                  1 
##           PCTGQTRS           NUMGQTRS          TractKids 
##                  1                  1                  1 
## 
## Node number 1: 413 observations,    complexity param=0.7106918
##   predicted class=No   expected loss=0.3849879  P(node) =1
##     class counts:   254   159
##    probabilities: 0.615 0.385 
##   left son=2 (300 obs) right son=3 (113 obs)
##   Primary splits:
##       MedianFamilyIncome < 58234.5 to the right, improve=116.16020, (2 missing)
##       PovertyRate        < 17.65   to the left,  improve= 92.43622, (0 missing)
##       TractSNAP          < 402.5   to the left,  improve= 60.68571, (0 missing)
##       TractLOWI          < 1643.5  to the left,  improve= 58.94830, (0 missing)
##       LILATracts_1And10  < 0.5     to the left,  improve= 43.03390, (0 missing)
##   Surrogate splits:
##       PovertyRate       < 17.55   to the left,  agree=0.832, adj=0.378, (2 split)
##       TractSNAP         < 532.5   to the left,  agree=0.820, adj=0.333, (0 split)
##       TractLOWI         < 2117.5  to the left,  agree=0.810, adj=0.297, (0 split)
##       LILATracts_1And10 < 0.5     to the left,  agree=0.805, adj=0.279, (0 split)
##       TractAIAN         < 106.5   to the left,  agree=0.764, adj=0.126, (0 split)
## 
## Node number 2: 300 observations,    complexity param=0.1446541
##   predicted class=No   expected loss=0.1533333  P(node) =0.7263923
##     class counts:   254    46
##    probabilities: 0.847 0.153 
##   left son=4 (277 obs) right son=5 (23 obs)
##   Primary splits:
##       PovertyRate        < 19.9    to the left,  improve=35.71283, (0 missing)
##       MedianFamilyIncome < 68175.5 to the right, improve=18.04391, (0 missing)
##       LILATracts_1And10  < 0.5     to the left,  improve=13.30227, (0 missing)
##       TractSNAP          < 352.5   to the left,  improve=12.42014, (0 missing)
##       TractLOWI          < 2606.5  to the left,  improve=11.43370, (0 missing)
##   Surrogate splits:
##       PCTGQTRS  < 32.315  to the left,  agree=0.933, adj=0.130, (0 split)
##       TractHUNV < 584.5   to the left,  agree=0.933, adj=0.130, (0 split)
##       NUMGQTRS  < 871.5   to the left,  agree=0.930, adj=0.087, (0 split)
##       TractKids < 78.5    to the right, agree=0.930, adj=0.087, (0 split)
##       OHU2010   < 604.5   to the right, agree=0.927, adj=0.043, (0 split)
## 
## Node number 3: 113 observations
##   predicted class=Yes  expected loss=0  P(node) =0.2736077
##     class counts:     0   113
##    probabilities: 0.000 1.000 
## 
## Node number 4: 277 observations,    complexity param=0.04402516
##   predicted class=No   expected loss=0.08303249  P(node) =0.6707022
##     class counts:   254    23
##    probabilities: 0.917 0.083 
##   left son=8 (207 obs) right son=9 (70 obs)
##   Primary splits:
##       MedianFamilyIncome < 68175.5 to the right, improve=11.294790, (0 missing)
##       TractLOWI          < 2729    to the left,  improve= 6.279248, (0 missing)
##       TractSNAP          < 352.5   to the left,  improve= 5.242500, (0 missing)
##       TractOMultir       < 779     to the left,  improve= 5.081397, (0 missing)
##       TractBlack         < 264     to the left,  improve= 4.479913, (0 missing)
##   Surrogate splits:
##       TractLOWI     < 2113.5  to the left,  agree=0.801, adj=0.214, (0 split)
##       TractOMultir  < 779     to the left,  agree=0.783, adj=0.143, (0 split)
##       TractSNAP     < 351.5   to the left,  agree=0.783, adj=0.143, (0 split)
##       TractHispanic < 904.5   to the left,  agree=0.776, adj=0.114, (0 split)
##       TractAsian    < 14.5    to the right, agree=0.773, adj=0.100, (0 split)
## 
## Node number 5: 23 observations
##   predicted class=Yes  expected loss=0  P(node) =0.05569007
##     class counts:     0    23
##    probabilities: 0.000 1.000 
## 
## Node number 8: 207 observations
##   predicted class=No   expected loss=0  P(node) =0.5012107
##     class counts:   207     0
##    probabilities: 1.000 0.000 
## 
## Node number 9: 70 observations,    complexity param=0.04402516
##   predicted class=No   expected loss=0.3285714  P(node) =0.1694915
##     class counts:    47    23
##    probabilities: 0.671 0.329 
##   left son=18 (50 obs) right son=19 (20 obs)
##   Primary splits:
##       TractBlack   < 58      to the left,  improve=15.225710, (0 missing)
##       TractAsian   < 85.5    to the left,  improve=12.370640, (0 missing)
##       TractNHOPI   < 15      to the left,  improve= 8.752847, (0 missing)
##       TractOMultir < 174     to the left,  improve= 8.396825, (0 missing)
##       laaian1share < 0.085   to the right, improve= 8.062035, (8 missing)
##   Surrogate splits:
##       TractAsian    < 154.5   to the left,  agree=0.929, adj=0.75, (0 split)
##       TractNHOPI    < 23.5    to the left,  agree=0.843, adj=0.45, (0 split)
##       TractOMultir  < 496     to the left,  agree=0.814, adj=0.35, (0 split)
##       TractHispanic < 590.5   to the left,  agree=0.786, adj=0.25, (0 split)
##       TractHUNV     < 255     to the left,  agree=0.786, adj=0.25, (0 split)
## 
## Node number 18: 50 observations
##   predicted class=No   expected loss=0.12  P(node) =0.1210654
##     class counts:    44     6
##    probabilities: 0.880 0.120 
## 
## Node number 19: 20 observations
##   predicted class=Yes  expected loss=0.15  P(node) =0.04842615
##     class counts:     3    17
##    probabilities: 0.150 0.850

Using the above tree on data for the entire U.S. excluding Oregon, as it was the train case

US_Data <- read.csv("US_Data.csv")
US_Data <- US_Data[,-c(2:3)] # removing county and state
Y_N_LowInc <- ifelse(US_Data$LowIncomeTracts == 0, "No", "Yes")
US_Data <- data.frame(US_Data, Y_N_LowInc)
train <- c(54109:54934)
US.test <- US_Data[-train, "Y_N_LowInc"]
tree.pred2 <- predict(tree.lowInc, newdata = US_Data, type = "class")
test.pred2 <- tree.pred2[-train]
cm2 <- table(test.pred2, US.test)
cm2
##           US.test
## test.pred2    No   Yes
##        No  36365  2409
##        Yes  5368 27563
err_rate2 <- 1-sum(diag(cm2))/sum(cm2)
err_rate2
## [1] 0.1084583

training/testing a tree for the entire U.S.

train_data <- sample(1:nrow(US_Data), nrow(US_Data)/2)

tree.lowInc2 <- rpart(Y_N_LowInc~.-LowIncomeTracts, 
                     data = US_Data, 
                     subset = train_data, 
                     method = "class")

prp(tree.lowInc2, faclen = 0, cex = 0.7, extra = 1, space = .5)

US.test2 <- US_Data[-train_data, "Y_N_LowInc"]
tree.pred3 <- predict(tree.lowInc2, newdata = US_Data, type = "class")
test.pred3 <- tree.pred3[-train_data]

cm3<- table(test.pred3, US.test2)
cm3
##           US.test2
## test.pred3    No   Yes
##        No  20522  1480
##        Yes   623 13641
err_rate3 <- 1-sum(diag(cm))/sum(cm)
err_rate3
## [1] 0.01937046