Final Project Code

Install and load required packages as well as load the data

#install.packages("tidyverse")
library(tidyverse)
#install.packages("readr")
library(readr)
#install.packages("leaps")
library(leaps)
#install.packages("rpart")
library(rpart)
#install.packages("rpart.plot")
library(rpart.plot)
OR_Data <- read_csv("OR_Data.csv")

Exploratory data analysis

# histogram of SNAP benefit usage, colored by whether a tract is Urban
ggplot(OR_Data, aes(x = TractSNAP, fill= as.factor(Urban)))+
  geom_histogram(position="dodge")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# scatterplot of SNAP benefit usage as a function of low income population
ggplot(OR_Data, aes(x = TractLOWI, y = TractSNAP))+
  geom_point()

# histogram of low income population, colored by whether a tract is considered low income
ggplot(OR_Data, aes(TractLOWI,fill=as.factor(LowIncomeTracts)))+
  geom_histogram(position="dodge")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# side by side boxplots comparing SNAP benefit usage for urban and non-urban tracts
ggplot(OR_Data, aes(y = TractSNAP, fill = as.factor(Urban)))+
  geom_boxplot()

# side by side boxplots comparing SNAP benefit usage for tracts with and without low vehicle access
ggplot(OR_Data, aes(y = TractSNAP, fill = as.factor(HUNVFlag)))+
  geom_boxplot()

simple linear regression - 1 numerical explanatory variable

mod1 <-lm(TractSNAP~TractLOWI, data = OR_Data)
summary(mod1)

## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI, data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -725.51  -58.50  -10.91   49.46  764.93 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.074361   7.734538  -0.656    0.512    
## TractLOWI    0.208255   0.004197  49.624   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115.1 on 824 degrees of freedom
## Multiple R-squared:  0.7493, Adjusted R-squared:  0.749 
## F-statistic:  2463 on 1 and 824 DF,  p-value: < 2.2e-16

ggplot(OR_Data, aes(TractLOWI, TractSNAP))+
  geom_point()+
  geom_abline(intercept = mod1$coefficients[1], slope = mod1$coefficients[2], col = "blue")

anova(mod1)

## Analysis of Variance Table
## 
## Response: TractSNAP
##            Df   Sum Sq  Mean Sq F value    Pr(>F)    
## TractLOWI   1 32649335 32649335  2462.6 < 2.2e-16 ***
## Residuals 824 10924709    13258                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot(mod1)

linear regression - 1 categorical explanatory variable

1. Using Urban as the categorical explanatory variable

mod2a <- lm(TractSNAP~as.factor(Urban), data = OR_Data)
anova(mod2a)

## Analysis of Variance Table
## 
## Response: TractSNAP
##                   Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Urban)   1  1338603 1338603  26.116 3.999e-07 ***
## Residuals        824 42235441   51257                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ggplot(OR_Data, aes(as.factor(Urban), TractSNAP, fill = as.factor(Urban)))+
  geom_boxplot()

2. Using HUNVFlag as the categorical explanatory variable

mod2b <- lm(TractSNAP~as.factor(HUNVFlag), data = OR_Data)
anova(mod2b)

## Analysis of Variance Table
## 
## Response: TractSNAP
##                      Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(HUNVFlag)   1  5542228 5542228  120.08 < 2.2e-16 ***
## Residuals           824 38031816   46155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ggplot(OR_Data, aes(as.factor(HUNVFlag), TractSNAP, fill = as.factor(HUNVFlag)))+
  geom_boxplot()

multiple linear regression - 1 numerical and 1 categorical explanatory variable, parallel lines

1. Using Urban as the categorical explanatory variable

mod3a <- lm(TractSNAP~TractLOWI + as.factor(Urban), data = OR_Data)
summary(mod3a)

## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(Urban), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -723.23  -57.35  -10.11   50.14  762.04 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -15.465079   9.449001  -1.637   0.1021    
## TractLOWI           0.206921   0.004248  48.712   <2e-16 ***
## as.factor(Urban)1  17.286935   9.059351   1.908   0.0567 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115 on 823 degrees of freedom
## Multiple R-squared:  0.7504, Adjusted R-squared:  0.7498 
## F-statistic:  1237 on 2 and 823 DF,  p-value: < 2.2e-16

ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(Urban)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod3a$coefficients[1], slope = mod3a$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod3a$coefficients[1] + mod3a$coefficients[3], slope = mod3a$coefficients[2], color = "blue", lwd = 1)

2. Using HUNVFlag as the categorical explanatory variable

mod3b <- lm(TractSNAP~TractLOWI + as.factor(HUNVFlag), data = OR_Data)
summary(mod3b)

## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -682.86  -60.14   -8.70   51.09  782.79 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.776483   7.654497  -0.493    0.622    
## TractLOWI             0.201866   0.004394  45.937  < 2e-16 ***
## as.factor(HUNVFlag)1 48.005295  10.854084   4.423 1.11e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.9 on 823 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.7545 
## F-statistic:  1269 on 2 and 823 DF,  p-value: < 2.2e-16

ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod3b$coefficients[1], slope = mod3b$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod3b$coefficients[1] + mod3b$coefficients[3], slope = mod3b$coefficients[2], color = "blue", lwd = 1)

multiple linear regression - 1 numerical and 1 categorical explanatory variable, unrelated lines

1. Using Urban as the categorical explanatory variable

mod4a <- lm(TractSNAP~TractLOWI + as.factor(Urban) +
             TractLOWI*as.factor(Urban), data = OR_Data)
summary(mod4a)

## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(Urban) + TractLOWI * 
##     as.factor(Urban), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -722.39  -57.46  -10.29   50.19  762.25 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.645e+01  1.532e+01  -1.074    0.283    
## TractLOWI                    2.077e-01  1.006e-02  20.648   <2e-16 ***
## as.factor(Urban)1            1.854e+01  1.785e+01   1.039    0.299    
## TractLOWI:as.factor(Urban)1 -9.056e-04  1.110e-02  -0.082    0.935    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115 on 822 degrees of freedom
## Multiple R-squared:  0.7504, Adjusted R-squared:  0.7495 
## F-statistic: 823.7 on 3 and 822 DF,  p-value: < 2.2e-16

ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(Urban)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod4a$coefficients[1], slope = mod4a$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod4a$coefficients[1] + mod4a$coefficients[3], slope = mod4a$coefficients[2] + mod4a$coefficients[4], color = "blue", lwd = 1)

2. Using HUNVFlag as the categorical explanatory variable

mod4b <- lm(TractSNAP~TractLOWI + as.factor(HUNVFlag) +
             TractLOWI*as.factor(HUNVFlag), data = OR_Data)
summary(mod4b)

## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag) + TractLOWI * 
##     as.factor(HUNVFlag), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -693.58  -60.00   -7.82   50.65  779.70 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -6.586162   8.503727  -0.775  0.43886    
## TractLOWI                       0.203833   0.005102  39.948  < 2e-16 ***
## as.factor(HUNVFlag)1           63.498511  23.112703   2.747  0.00614 ** 
## TractLOWI:as.factor(HUNVFlag)1 -0.007629   0.010048  -0.759  0.44788    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.9 on 822 degrees of freedom
## Multiple R-squared:  0.7553, Adjusted R-squared:  0.7544 
## F-statistic: 845.6 on 3 and 822 DF,  p-value: < 2.2e-16

ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag)))+
  geom_point(alpha = 0.5)+
  geom_abline(intercept = mod4b$coefficients[1], slope = mod4b$coefficients[2], color = "red", lwd = 1)+
  geom_abline(intercept = mod4b$coefficients[1] + mod4b$coefficients[3], slope = mod4b$coefficients[2] + mod4b$coefficients[4], color = "blue", lwd = 1)

Calculating MSE’s for the simple model, the parallel lines model with HUNVFlag, and tthe unrelated lines model with HUNVFlag:

anova(mod1)

## Analysis of Variance Table
## 
## Response: TractSNAP
##            Df   Sum Sq  Mean Sq F value    Pr(>F)    
## TractLOWI   1 32649335 32649335  2462.6 < 2.2e-16 ***
## Residuals 824 10924709    13258                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(mod3b)

## Analysis of Variance Table
## 
## Response: TractSNAP
##                      Df   Sum Sq  Mean Sq  F value    Pr(>F)    
## TractLOWI             1 32649335 32649335 2518.059 < 2.2e-16 ***
## as.factor(HUNVFlag)   1   253630   253630   19.561 1.105e-05 ***
## Residuals           823 10671080    12966                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(mod4b)

## Analysis of Variance Table
## 
## Response: TractSNAP
##                                Df   Sum Sq  Mean Sq   F value    Pr(>F)    
## TractLOWI                       1 32649335 32649335 2516.7630 < 2.2e-16 ***
## as.factor(HUNVFlag)             1   253630   253630   19.5510 1.111e-05 ***
## TractLOWI:as.factor(HUNVFlag)   1     7480     7480    0.5766    0.4479    
## Residuals                     822 10663600    12973                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Tree for low income tract

set.seed(1)
OR_Data<-OR_Data[,-c(2:3)] # removing county and state from the dataset, so that the tree will work
Y_N_LowInc <- ifelse(OR_Data$LowIncomeTracts == 0, "No", "Yes")
OR_Data <- data.frame(OR_Data, Y_N_LowInc)
train_data <- sample(1:nrow(OR_Data), nrow(OR_Data)/2)

tree.lowInc <- rpart(Y_N_LowInc~.-LowIncomeTracts, 
                     data = OR_Data, 
                     subset = train_data, 
                     method = "class")

prp(tree.lowInc, faclen = 0, cex = 0.7, extra = 1, space = .5)

OR.test <- OR_Data[-train_data, "Y_N_LowInc"]
tree.pred <- predict(tree.lowInc, newdata = OR_Data, type = "class")
test.pred <- tree.pred[-train_data]
cm<- table(test.pred, OR.test)
cm

##          OR.test
## test.pred  No Yes
##       No  254   5
##       Yes   3 151

err_rate <- 1-sum(diag(cm))/sum(cm)
err_rate

## [1] 0.01937046

summary(tree.lowInc)

## Call:
## rpart(formula = Y_N_LowInc ~ . - LowIncomeTracts, data = OR_Data, 
##     subset = train_data, method = "class")
##   n= 413 
## 
##           CP nsplit  rel error    xerror       xstd
## 1 0.71069182      0 1.00000000 1.0000000 0.06219325
## 2 0.14465409      1 0.28930818 0.3081761 0.04133106
## 3 0.04402516      2 0.14465409 0.1635220 0.03104345
## 4 0.01000000      4 0.05660377 0.0754717 0.02146795
## 
## Variable importance
## MedianFamilyIncome        PovertyRate          TractSNAP          TractLOWI 
##                 32                 20                 10                  9 
##  LILATracts_1And10         TractBlack          TractAIAN         TractAsian 
##                  8                  4                  4                  3 
##          TractHUNV       TractOMultir         TractNHOPI      TractHispanic 
##                  2                  2                  2                  1 
##           PCTGQTRS           NUMGQTRS          TractKids 
##                  1                  1                  1 
## 
## Node number 1: 413 observations,    complexity param=0.7106918
##   predicted class=No   expected loss=0.3849879  P(node) =1
##     class counts:   254   159
##    probabilities: 0.615 0.385 
##   left son=2 (300 obs) right son=3 (113 obs)
##   Primary splits:
##       MedianFamilyIncome < 58234.5 to the right, improve=116.16020, (2 missing)
##       PovertyRate        < 17.65   to the left,  improve= 92.43622, (0 missing)
##       TractSNAP          < 402.5   to the left,  improve= 60.68571, (0 missing)
##       TractLOWI          < 1643.5  to the left,  improve= 58.94830, (0 missing)
##       LILATracts_1And10  < 0.5     to the left,  improve= 43.03390, (0 missing)
##   Surrogate splits:
##       PovertyRate       < 17.55   to the left,  agree=0.832, adj=0.378, (2 split)
##       TractSNAP         < 532.5   to the left,  agree=0.820, adj=0.333, (0 split)
##       TractLOWI         < 2117.5  to the left,  agree=0.810, adj=0.297, (0 split)
##       LILATracts_1And10 < 0.5     to the left,  agree=0.805, adj=0.279, (0 split)
##       TractAIAN         < 106.5   to the left,  agree=0.764, adj=0.126, (0 split)
## 
## Node number 2: 300 observations,    complexity param=0.1446541
##   predicted class=No   expected loss=0.1533333  P(node) =0.7263923
##     class counts:   254    46
##    probabilities: 0.847 0.153 
##   left son=4 (277 obs) right son=5 (23 obs)
##   Primary splits:
##       PovertyRate        < 19.9    to the left,  improve=35.71283, (0 missing)
##       MedianFamilyIncome < 68175.5 to the right, improve=18.04391, (0 missing)
##       LILATracts_1And10  < 0.5     to the left,  improve=13.30227, (0 missing)
##       TractSNAP          < 352.5   to the left,  improve=12.42014, (0 missing)
##       TractLOWI          < 2606.5  to the left,  improve=11.43370, (0 missing)
##   Surrogate splits:
##       PCTGQTRS  < 32.315  to the left,  agree=0.933, adj=0.130, (0 split)
##       TractHUNV < 584.5   to the left,  agree=0.933, adj=0.130, (0 split)
##       NUMGQTRS  < 871.5   to the left,  agree=0.930, adj=0.087, (0 split)
##       TractKids < 78.5    to the right, agree=0.930, adj=0.087, (0 split)
##       OHU2010   < 604.5   to the right, agree=0.927, adj=0.043, (0 split)
## 
## Node number 3: 113 observations
##   predicted class=Yes  expected loss=0  P(node) =0.2736077
##     class counts:     0   113
##    probabilities: 0.000 1.000 
## 
## Node number 4: 277 observations,    complexity param=0.04402516
##   predicted class=No   expected loss=0.08303249  P(node) =0.6707022
##     class counts:   254    23
##    probabilities: 0.917 0.083 
##   left son=8 (207 obs) right son=9 (70 obs)
##   Primary splits:
##       MedianFamilyIncome < 68175.5 to the right, improve=11.294790, (0 missing)
##       TractLOWI          < 2729    to the left,  improve= 6.279248, (0 missing)
##       TractSNAP          < 352.5   to the left,  improve= 5.242500, (0 missing)
##       TractOMultir       < 779     to the left,  improve= 5.081397, (0 missing)
##       TractBlack         < 264     to the left,  improve= 4.479913, (0 missing)
##   Surrogate splits:
##       TractLOWI     < 2113.5  to the left,  agree=0.801, adj=0.214, (0 split)
##       TractOMultir  < 779     to the left,  agree=0.783, adj=0.143, (0 split)
##       TractSNAP     < 351.5   to the left,  agree=0.783, adj=0.143, (0 split)
##       TractHispanic < 904.5   to the left,  agree=0.776, adj=0.114, (0 split)
##       TractAsian    < 14.5    to the right, agree=0.773, adj=0.100, (0 split)
## 
## Node number 5: 23 observations
##   predicted class=Yes  expected loss=0  P(node) =0.05569007
##     class counts:     0    23
##    probabilities: 0.000 1.000 
## 
## Node number 8: 207 observations
##   predicted class=No   expected loss=0  P(node) =0.5012107
##     class counts:   207     0
##    probabilities: 1.000 0.000 
## 
## Node number 9: 70 observations,    complexity param=0.04402516
##   predicted class=No   expected loss=0.3285714  P(node) =0.1694915
##     class counts:    47    23
##    probabilities: 0.671 0.329 
##   left son=18 (50 obs) right son=19 (20 obs)
##   Primary splits:
##       TractBlack   < 58      to the left,  improve=15.225710, (0 missing)
##       TractAsian   < 85.5    to the left,  improve=12.370640, (0 missing)
##       TractNHOPI   < 15      to the left,  improve= 8.752847, (0 missing)
##       TractOMultir < 174     to the left,  improve= 8.396825, (0 missing)
##       laaian1share < 0.085   to the right, improve= 8.062035, (8 missing)
##   Surrogate splits:
##       TractAsian    < 154.5   to the left,  agree=0.929, adj=0.75, (0 split)
##       TractNHOPI    < 23.5    to the left,  agree=0.843, adj=0.45, (0 split)
##       TractOMultir  < 496     to the left,  agree=0.814, adj=0.35, (0 split)
##       TractHispanic < 590.5   to the left,  agree=0.786, adj=0.25, (0 split)
##       TractHUNV     < 255     to the left,  agree=0.786, adj=0.25, (0 split)
## 
## Node number 18: 50 observations
##   predicted class=No   expected loss=0.12  P(node) =0.1210654
##     class counts:    44     6
##    probabilities: 0.880 0.120 
## 
## Node number 19: 20 observations
##   predicted class=Yes  expected loss=0.15  P(node) =0.04842615
##     class counts:     3    17
##    probabilities: 0.150 0.850

Using the above tree on data for the entire U.S. excluding Oregon, as it was the train case

US_Data <- read.csv("US_Data.csv")
US_Data <- US_Data[,-c(2:3)] # removing county and state
Y_N_LowInc <- ifelse(US_Data$LowIncomeTracts == 0, "No", "Yes")
US_Data <- data.frame(US_Data, Y_N_LowInc)
train <- c(54109:54934)
US.test <- US_Data[-train, "Y_N_LowInc"]
tree.pred2 <- predict(tree.lowInc, newdata = US_Data, type = "class")
test.pred2 <- tree.pred2[-train]
cm2 <- table(test.pred2, US.test)
cm2

##           US.test
## test.pred2    No   Yes
##        No  36365  2409
##        Yes  5368 27563

err_rate2 <- 1-sum(diag(cm2))/sum(cm2)
err_rate2

## [1] 0.1084583

training/testing a tree for the entire U.S.

train_data <- sample(1:nrow(US_Data), nrow(US_Data)/2)

tree.lowInc2 <- rpart(Y_N_LowInc~.-LowIncomeTracts, 
                     data = US_Data, 
                     subset = train_data, 
                     method = "class")

prp(tree.lowInc2, faclen = 0, cex = 0.7, extra = 1, space = .5)

US.test2 <- US_Data[-train_data, "Y_N_LowInc"]
tree.pred3 <- predict(tree.lowInc2, newdata = US_Data, type = "class")
test.pred3 <- tree.pred3[-train_data]

cm3<- table(test.pred3, US.test2)
cm3

##           US.test2
## test.pred3    No   Yes
##        No  20522  1480
##        Yes   623 13641

err_rate3 <- 1-sum(diag(cm))/sum(cm)
err_rate3

## [1] 0.01937046

Final Project Code

Olivia Schutz, Marion Powell, Malie Heine

12/11/2021

Install and load required packages as well as load the data

Exploratory data analysis

simple linear regression - 1 numerical explanatory variable

linear regression - 1 categorical explanatory variable

1. Using Urban as the categorical explanatory variable

2. Using HUNVFlag as the categorical explanatory variable

multiple linear regression - 1 numerical and 1 categorical explanatory variable, parallel lines

1. Using Urban as the categorical explanatory variable

2. Using HUNVFlag as the categorical explanatory variable

multiple linear regression - 1 numerical and 1 categorical explanatory variable, unrelated lines

1. Using Urban as the categorical explanatory variable

2. Using HUNVFlag as the categorical explanatory variable

Calculating MSE’s for the simple model, the parallel lines model with HUNVFlag, and tthe unrelated lines model with HUNVFlag:

Tree for low income tract

Using the above tree on data for the entire U.S. excluding Oregon, as it was the train case

training/testing a tree for the entire U.S.