Install and load required packages as well as load the data
#install.packages("tidyverse")
library(tidyverse)
#install.packages("readr")
library(readr)
#install.packages("leaps")
library(leaps)
#install.packages("rpart")
library(rpart)
#install.packages("rpart.plot")
library(rpart.plot)
OR_Data <- read_csv("OR_Data.csv")
Exploratory data analysis
# histogram of SNAP benefit usage, colored by whether a tract is Urban
ggplot(OR_Data, aes(x = TractSNAP, fill= as.factor(Urban)))+
geom_histogram(position="dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# scatterplot of SNAP benefit usage as a function of low income population
ggplot(OR_Data, aes(x = TractLOWI, y = TractSNAP))+
geom_point()

# histogram of low income population, colored by whether a tract is considered low income
ggplot(OR_Data, aes(TractLOWI,fill=as.factor(LowIncomeTracts)))+
geom_histogram(position="dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# side by side boxplots comparing SNAP benefit usage for urban and non-urban tracts
ggplot(OR_Data, aes(y = TractSNAP, fill = as.factor(Urban)))+
geom_boxplot()

# side by side boxplots comparing SNAP benefit usage for tracts with and without low vehicle access
ggplot(OR_Data, aes(y = TractSNAP, fill = as.factor(HUNVFlag)))+
geom_boxplot()

linear regression - 1 categorical explanatory variable
1. Using Urban as the categorical explanatory variable
mod2a <- lm(TractSNAP~as.factor(Urban), data = OR_Data)
anova(mod2a)
## Analysis of Variance Table
##
## Response: TractSNAP
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Urban) 1 1338603 1338603 26.116 3.999e-07 ***
## Residuals 824 42235441 51257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(OR_Data, aes(as.factor(Urban), TractSNAP, fill = as.factor(Urban)))+
geom_boxplot()

2. Using HUNVFlag as the categorical explanatory variable
mod2b <- lm(TractSNAP~as.factor(HUNVFlag), data = OR_Data)
anova(mod2b)
## Analysis of Variance Table
##
## Response: TractSNAP
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(HUNVFlag) 1 5542228 5542228 120.08 < 2.2e-16 ***
## Residuals 824 38031816 46155
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(OR_Data, aes(as.factor(HUNVFlag), TractSNAP, fill = as.factor(HUNVFlag)))+
geom_boxplot()

multiple linear regression - 1 numerical and 1 categorical explanatory variable, parallel lines
1. Using Urban as the categorical explanatory variable
mod3a <- lm(TractSNAP~TractLOWI + as.factor(Urban), data = OR_Data)
summary(mod3a)
##
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(Urban), data = OR_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -723.23 -57.35 -10.11 50.14 762.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.465079 9.449001 -1.637 0.1021
## TractLOWI 0.206921 0.004248 48.712 <2e-16 ***
## as.factor(Urban)1 17.286935 9.059351 1.908 0.0567 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 115 on 823 degrees of freedom
## Multiple R-squared: 0.7504, Adjusted R-squared: 0.7498
## F-statistic: 1237 on 2 and 823 DF, p-value: < 2.2e-16
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(Urban)))+
geom_point(alpha = 0.5)+
geom_abline(intercept = mod3a$coefficients[1], slope = mod3a$coefficients[2], color = "red", lwd = 1)+
geom_abline(intercept = mod3a$coefficients[1] + mod3a$coefficients[3], slope = mod3a$coefficients[2], color = "blue", lwd = 1)

2. Using HUNVFlag as the categorical explanatory variable
mod3b <- lm(TractSNAP~TractLOWI + as.factor(HUNVFlag), data = OR_Data)
summary(mod3b)
##
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag), data = OR_Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -682.86 -60.14 -8.70 51.09 782.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.776483 7.654497 -0.493 0.622
## TractLOWI 0.201866 0.004394 45.937 < 2e-16 ***
## as.factor(HUNVFlag)1 48.005295 10.854084 4.423 1.11e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 113.9 on 823 degrees of freedom
## Multiple R-squared: 0.7551, Adjusted R-squared: 0.7545
## F-statistic: 1269 on 2 and 823 DF, p-value: < 2.2e-16
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag)))+
geom_point(alpha = 0.5)+
geom_abline(intercept = mod3b$coefficients[1], slope = mod3b$coefficients[2], color = "red", lwd = 1)+
geom_abline(intercept = mod3b$coefficients[1] + mod3b$coefficients[3], slope = mod3b$coefficients[2], color = "blue", lwd = 1)

Tree for low income tract
set.seed(1)
OR_Data<-OR_Data[,-c(2:3)] # removing county and state from the dataset, so that the tree will work
Y_N_LowInc <- ifelse(OR_Data$LowIncomeTracts == 0, "No", "Yes")
OR_Data <- data.frame(OR_Data, Y_N_LowInc)
train_data <- sample(1:nrow(OR_Data), nrow(OR_Data)/2)
tree.lowInc <- rpart(Y_N_LowInc~.-LowIncomeTracts,
data = OR_Data,
subset = train_data,
method = "class")
prp(tree.lowInc, faclen = 0, cex = 0.7, extra = 1, space = .5)

OR.test <- OR_Data[-train_data, "Y_N_LowInc"]
tree.pred <- predict(tree.lowInc, newdata = OR_Data, type = "class")
test.pred <- tree.pred[-train_data]
cm<- table(test.pred, OR.test)
cm
## OR.test
## test.pred No Yes
## No 254 5
## Yes 3 151
err_rate <- 1-sum(diag(cm))/sum(cm)
err_rate
## [1] 0.01937046
summary(tree.lowInc)
## Call:
## rpart(formula = Y_N_LowInc ~ . - LowIncomeTracts, data = OR_Data,
## subset = train_data, method = "class")
## n= 413
##
## CP nsplit rel error xerror xstd
## 1 0.71069182 0 1.00000000 1.0000000 0.06219325
## 2 0.14465409 1 0.28930818 0.3081761 0.04133106
## 3 0.04402516 2 0.14465409 0.1635220 0.03104345
## 4 0.01000000 4 0.05660377 0.0754717 0.02146795
##
## Variable importance
## MedianFamilyIncome PovertyRate TractSNAP TractLOWI
## 32 20 10 9
## LILATracts_1And10 TractBlack TractAIAN TractAsian
## 8 4 4 3
## TractHUNV TractOMultir TractNHOPI TractHispanic
## 2 2 2 1
## PCTGQTRS NUMGQTRS TractKids
## 1 1 1
##
## Node number 1: 413 observations, complexity param=0.7106918
## predicted class=No expected loss=0.3849879 P(node) =1
## class counts: 254 159
## probabilities: 0.615 0.385
## left son=2 (300 obs) right son=3 (113 obs)
## Primary splits:
## MedianFamilyIncome < 58234.5 to the right, improve=116.16020, (2 missing)
## PovertyRate < 17.65 to the left, improve= 92.43622, (0 missing)
## TractSNAP < 402.5 to the left, improve= 60.68571, (0 missing)
## TractLOWI < 1643.5 to the left, improve= 58.94830, (0 missing)
## LILATracts_1And10 < 0.5 to the left, improve= 43.03390, (0 missing)
## Surrogate splits:
## PovertyRate < 17.55 to the left, agree=0.832, adj=0.378, (2 split)
## TractSNAP < 532.5 to the left, agree=0.820, adj=0.333, (0 split)
## TractLOWI < 2117.5 to the left, agree=0.810, adj=0.297, (0 split)
## LILATracts_1And10 < 0.5 to the left, agree=0.805, adj=0.279, (0 split)
## TractAIAN < 106.5 to the left, agree=0.764, adj=0.126, (0 split)
##
## Node number 2: 300 observations, complexity param=0.1446541
## predicted class=No expected loss=0.1533333 P(node) =0.7263923
## class counts: 254 46
## probabilities: 0.847 0.153
## left son=4 (277 obs) right son=5 (23 obs)
## Primary splits:
## PovertyRate < 19.9 to the left, improve=35.71283, (0 missing)
## MedianFamilyIncome < 68175.5 to the right, improve=18.04391, (0 missing)
## LILATracts_1And10 < 0.5 to the left, improve=13.30227, (0 missing)
## TractSNAP < 352.5 to the left, improve=12.42014, (0 missing)
## TractLOWI < 2606.5 to the left, improve=11.43370, (0 missing)
## Surrogate splits:
## PCTGQTRS < 32.315 to the left, agree=0.933, adj=0.130, (0 split)
## TractHUNV < 584.5 to the left, agree=0.933, adj=0.130, (0 split)
## NUMGQTRS < 871.5 to the left, agree=0.930, adj=0.087, (0 split)
## TractKids < 78.5 to the right, agree=0.930, adj=0.087, (0 split)
## OHU2010 < 604.5 to the right, agree=0.927, adj=0.043, (0 split)
##
## Node number 3: 113 observations
## predicted class=Yes expected loss=0 P(node) =0.2736077
## class counts: 0 113
## probabilities: 0.000 1.000
##
## Node number 4: 277 observations, complexity param=0.04402516
## predicted class=No expected loss=0.08303249 P(node) =0.6707022
## class counts: 254 23
## probabilities: 0.917 0.083
## left son=8 (207 obs) right son=9 (70 obs)
## Primary splits:
## MedianFamilyIncome < 68175.5 to the right, improve=11.294790, (0 missing)
## TractLOWI < 2729 to the left, improve= 6.279248, (0 missing)
## TractSNAP < 352.5 to the left, improve= 5.242500, (0 missing)
## TractOMultir < 779 to the left, improve= 5.081397, (0 missing)
## TractBlack < 264 to the left, improve= 4.479913, (0 missing)
## Surrogate splits:
## TractLOWI < 2113.5 to the left, agree=0.801, adj=0.214, (0 split)
## TractOMultir < 779 to the left, agree=0.783, adj=0.143, (0 split)
## TractSNAP < 351.5 to the left, agree=0.783, adj=0.143, (0 split)
## TractHispanic < 904.5 to the left, agree=0.776, adj=0.114, (0 split)
## TractAsian < 14.5 to the right, agree=0.773, adj=0.100, (0 split)
##
## Node number 5: 23 observations
## predicted class=Yes expected loss=0 P(node) =0.05569007
## class counts: 0 23
## probabilities: 0.000 1.000
##
## Node number 8: 207 observations
## predicted class=No expected loss=0 P(node) =0.5012107
## class counts: 207 0
## probabilities: 1.000 0.000
##
## Node number 9: 70 observations, complexity param=0.04402516
## predicted class=No expected loss=0.3285714 P(node) =0.1694915
## class counts: 47 23
## probabilities: 0.671 0.329
## left son=18 (50 obs) right son=19 (20 obs)
## Primary splits:
## TractBlack < 58 to the left, improve=15.225710, (0 missing)
## TractAsian < 85.5 to the left, improve=12.370640, (0 missing)
## TractNHOPI < 15 to the left, improve= 8.752847, (0 missing)
## TractOMultir < 174 to the left, improve= 8.396825, (0 missing)
## laaian1share < 0.085 to the right, improve= 8.062035, (8 missing)
## Surrogate splits:
## TractAsian < 154.5 to the left, agree=0.929, adj=0.75, (0 split)
## TractNHOPI < 23.5 to the left, agree=0.843, adj=0.45, (0 split)
## TractOMultir < 496 to the left, agree=0.814, adj=0.35, (0 split)
## TractHispanic < 590.5 to the left, agree=0.786, adj=0.25, (0 split)
## TractHUNV < 255 to the left, agree=0.786, adj=0.25, (0 split)
##
## Node number 18: 50 observations
## predicted class=No expected loss=0.12 P(node) =0.1210654
## class counts: 44 6
## probabilities: 0.880 0.120
##
## Node number 19: 20 observations
## predicted class=Yes expected loss=0.15 P(node) =0.04842615
## class counts: 3 17
## probabilities: 0.150 0.850
Using the above tree on data for the entire U.S. excluding Oregon, as it was the train case
US_Data <- read.csv("US_Data.csv")
US_Data <- US_Data[,-c(2:3)] # removing county and state
Y_N_LowInc <- ifelse(US_Data$LowIncomeTracts == 0, "No", "Yes")
US_Data <- data.frame(US_Data, Y_N_LowInc)
train <- c(54109:54934)
US.test <- US_Data[-train, "Y_N_LowInc"]
tree.pred2 <- predict(tree.lowInc, newdata = US_Data, type = "class")
test.pred2 <- tree.pred2[-train]
cm2 <- table(test.pred2, US.test)
cm2
## US.test
## test.pred2 No Yes
## No 36365 2409
## Yes 5368 27563
err_rate2 <- 1-sum(diag(cm2))/sum(cm2)
err_rate2
## [1] 0.1084583
training/testing a tree for the entire U.S.
train_data <- sample(1:nrow(US_Data), nrow(US_Data)/2)
tree.lowInc2 <- rpart(Y_N_LowInc~.-LowIncomeTracts,
data = US_Data,
subset = train_data,
method = "class")
prp(tree.lowInc2, faclen = 0, cex = 0.7, extra = 1, space = .5)

US.test2 <- US_Data[-train_data, "Y_N_LowInc"]
tree.pred3 <- predict(tree.lowInc2, newdata = US_Data, type = "class")
test.pred3 <- tree.pred3[-train_data]
cm3<- table(test.pred3, US.test2)
cm3
## US.test2
## test.pred3 No Yes
## No 20522 1480
## Yes 623 13641
err_rate3 <- 1-sum(diag(cm))/sum(cm)
err_rate3
## [1] 0.01937046