# Packages. readr provides read_csv() and tidyverse provides ggplot2; leaps
# is attached as well, although nothing visible in this section calls it.
library(readr)
library(tidyverse)
library(leaps)

# Tract-level data set passed as `data =` to every model in this report.
OR_Data <- read_csv(file = "OR_Data.csv")

A)

response: TractSNAP - number of housing units receiving SNAP benefits
explanatory: TractLOWI - total low income population
categorical: as.factor(HUNVFlag) - whether or not the census tract has low vehicle access

B)

# Simple linear regression of TractSNAP on TractLOWI alone.
mod1 <- lm(formula = TractSNAP ~ TractLOWI, data = OR_Data)
summary(mod1)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI, data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -725.51  -58.50  -10.91   49.46  764.93 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.074361   7.734538  -0.656    0.512    
## TractLOWI    0.208255   0.004197  49.624   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115.1 on 824 degrees of freedom
## Multiple R-squared:  0.7493, Adjusted R-squared:  0.749 
## F-statistic:  2463 on 1 and 824 DF,  p-value: < 2.2e-16
# Scatter of TractSNAP against TractLOWI, coloured by the vehicle-access
# flag, with the pooled regression line from mod1 drawn on top.
# The intercept and slope are read from the fitted model by name instead of
# being retyped from the printed summary, so the line cannot drift from the
# fit if the data change.
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag))) +
  geom_point() +
  geom_abline(
    intercept = coef(mod1)[["(Intercept)"]],
    slope = coef(mod1)[["TractLOWI"]],
    color = "blue"
  )

The relationship between TractSNAP and TractLOWI is statistically significant: the p-value for the TractLOWI slope is less than 2e-16.

C)

# Show the dummy coding R applies when HUNVFlag is treated as a factor.
contrasts(as.factor(OR_Data[["HUNVFlag"]]))
##   1
## 0 0
## 1 1

x2 = {0 if the tract is not a low vehicle access tract (HUNVFlag = 0), 1 if it is (HUNVFlag = 1)}

D)

# One-way ANOVA: TractSNAP modelled on the HUNVFlag factor only.
mod2 <- lm(formula = TractSNAP ~ as.factor(HUNVFlag), data = OR_Data)
anova(mod2)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                      Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(HUNVFlag)   1  5542228 5542228  120.08 < 2.2e-16 ***
## Residuals           824 38031816   46155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Side-by-side boxplots of TractSNAP for each level of HUNVFlag.
ggplot(
  data = OR_Data,
  mapping = aes(
    x = as.factor(HUNVFlag),
    y = TractSNAP,
    fill = as.factor(HUNVFlag)
  )
) +
  geom_boxplot()

There are differences in mean TractSNAP between the levels of HUNVFlag (F = 120.08 on 1 and 824 degrees of freedom, p < 2.2e-16).

E)

# Parallel-lines model: a common TractLOWI slope plus an intercept shift for
# HUNVFlag = 1.
mod3 <- lm(
  formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag),
  data = OR_Data
)
summary(mod3)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -682.86  -60.14   -8.70   51.09  782.79 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -3.776483   7.654497  -0.493    0.622    
## TractLOWI             0.201866   0.004394  45.937  < 2e-16 ***
## as.factor(HUNVFlag)1 48.005295  10.854084   4.423 1.11e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.9 on 823 degrees of freedom
## Multiple R-squared:  0.7551, Adjusted R-squared:  0.7545 
## F-statistic:  1269 on 2 and 823 DF,  p-value: < 2.2e-16
# Parallel-lines fit from mod3: both groups share the TractLOWI slope, and the
# HUNVFlag = 1 line is shifted up by the as.factor(HUNVFlag)1 coefficient.
#
# The points are coloured to match the fitted lines (blue for HUNVFlag = 0,
# red for HUNVFlag = 1). Under ggplot2's default discrete palette the red
# group-1 line resembled the colour of the group-0 points, which made the
# figure easy to misread. Points are made semi-transparent so the lines stay
# visible on top of same-coloured points.
# Coefficients are looked up by name rather than position so a change in
# term order cannot silently swap them.
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag))) +
  geom_point(alpha = 0.4) +
  scale_color_manual(
    name = "HUNVFlag",
    values = c("0" = "blue", "1" = "red")
  ) +
  geom_abline(
    intercept = coef(mod3)[["(Intercept)"]],
    slope = coef(mod3)[["TractLOWI"]],
    color = "blue",
    lwd = 1
  ) +
  geom_abline(
    intercept = coef(mod3)[["(Intercept)"]] +
      coef(mod3)[["as.factor(HUNVFlag)1"]],
    slope = coef(mod3)[["TractLOWI"]],
    color = "red",
    lwd = 1
  )

models:
HUNVFlag = 0: y = -3.776483 + 0.201866x
HUNVFlag = 1: y = 44.22881 + 0.201866x

F)

# Separate-lines model: intercept and TractLOWI slope may both differ by
# HUNVFlag. In an R formula `a * b` already expands to `a + b + a:b`, so the
# two main effects listed explicitly are redundant but harmless; the terms
# are kept as written so the printed Call stays the same.
mod4 <- lm(
  formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag) +
    TractLOWI * as.factor(HUNVFlag),
  data = OR_Data
)
summary(mod4)
## 
## Call:
## lm(formula = TractSNAP ~ TractLOWI + as.factor(HUNVFlag) + TractLOWI * 
##     as.factor(HUNVFlag), data = OR_Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -693.58  -60.00   -7.82   50.65  779.70 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -6.586162   8.503727  -0.775  0.43886    
## TractLOWI                       0.203833   0.005102  39.948  < 2e-16 ***
## as.factor(HUNVFlag)1           63.498511  23.112703   2.747  0.00614 ** 
## TractLOWI:as.factor(HUNVFlag)1 -0.007629   0.010048  -0.759  0.44788    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.9 on 822 degrees of freedom
## Multiple R-squared:  0.7553, Adjusted R-squared:  0.7544 
## F-statistic: 845.6 on 3 and 822 DF,  p-value: < 2.2e-16
# Separate-lines fit from mod4: the HUNVFlag = 1 line gets its own intercept
# (base intercept + as.factor(HUNVFlag)1) and its own slope (base slope +
# the TractLOWI:as.factor(HUNVFlag)1 interaction).
#
# The points are coloured to match the fitted lines (blue for HUNVFlag = 0,
# red for HUNVFlag = 1). Under ggplot2's default discrete palette the red
# group-1 line resembled the colour of the group-0 points, which made the
# figure easy to misread. Points are made semi-transparent so the lines stay
# visible on top of same-coloured points.
# Coefficients are looked up by name rather than position so a change in
# term order cannot silently swap them.
ggplot(OR_Data, aes(TractLOWI, TractSNAP, color = as.factor(HUNVFlag))) +
  geom_point(alpha = 0.4) +
  scale_color_manual(
    name = "HUNVFlag",
    values = c("0" = "blue", "1" = "red")
  ) +
  geom_abline(
    intercept = coef(mod4)[["(Intercept)"]],
    slope = coef(mod4)[["TractLOWI"]],
    color = "blue",
    lwd = 1
  ) +
  geom_abline(
    intercept = coef(mod4)[["(Intercept)"]] +
      coef(mod4)[["as.factor(HUNVFlag)1"]],
    slope = coef(mod4)[["TractLOWI"]] +
      coef(mod4)[["TractLOWI:as.factor(HUNVFlag)1"]],
    color = "red",
    lwd = 1
  )

models:
HUNVFlag = 0: y = -6.586162 + 0.203833x
HUNVFlag = 1: y = 56.91235 + 0.196204x

G)

Calculate the MSE of each model (the "Mean Sq" entry on the Residuals row of each ANOVA table below)

# ANOVA table for mod1; the Residuals row's "Mean Sq" is this model's MSE.
anova(mod1)
## Analysis of Variance Table
## 
## Response: TractSNAP
##            Df   Sum Sq  Mean Sq F value    Pr(>F)    
## TractLOWI   1 32649335 32649335  2462.6 < 2.2e-16 ***
## Residuals 824 10924709    13258                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for mod2; the Residuals row's "Mean Sq" is this model's MSE.
anova(mod2)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                      Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(HUNVFlag)   1  5542228 5542228  120.08 < 2.2e-16 ***
## Residuals           824 38031816   46155                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for mod3; the Residuals row's "Mean Sq" is this model's MSE.
anova(mod3)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                      Df   Sum Sq  Mean Sq  F value    Pr(>F)    
## TractLOWI             1 32649335 32649335 2518.059 < 2.2e-16 ***
## as.factor(HUNVFlag)   1   253630   253630   19.561 1.105e-05 ***
## Residuals           823 10671080    12966                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for mod4; the Residuals row's "Mean Sq" is this model's MSE.
anova(mod4)
## Analysis of Variance Table
## 
## Response: TractSNAP
##                                Df   Sum Sq  Mean Sq   F value    Pr(>F)    
## TractLOWI                       1 32649335 32649335 2516.7630 < 2.2e-16 ***
## as.factor(HUNVFlag)             1   253630   253630   19.5510 1.111e-05 ***
## TractLOWI:as.factor(HUNVFlag)   1     7480     7480    0.5766    0.4479    
## Residuals                     822 10663600    12973                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
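  1. mod1 (part B, simple regression): 13258
  2. mod2 (part D, HUNVFlag only): 46155
  3. mod3 (part E, parallel lines): 12966
  4. mod4 (part F, different slopes): 12973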

The MSE for the model in part E (parallel lines) was the lowest.

Models B, E, and F increased in complexity, going from a simple linear regression to parallel lines to unrelated lines.

H)

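Vehicle access does have an effect on the relationship between low income population and the number of housing units receiving SNAP benefits. In the parallel-lines model (part E) the intercept shift for HUNVFlag was significant (p = 1.11e-05), but in the different-slopes model (part F) the interaction term was not (p = 0.448), so vehicle access shifts the regression line without detectably changing its slope. We also attempted to use whether or not a tract was urban as a categorical predictor, but that had an insignificant effect on the intercept of the regression line.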