ASA Fall Data Challenge Code

Part 0: Data Gathering

Read in dataset

The original data set used the string NULL instead of NA, which caused those columns to be imported as character rather than numeric. To fix this, we changed all NULLs to NAs in the original Excel file and uploaded the result to GitHub so that it could be imported into RStudio.

fdc <- read_csv("https://raw.githubusercontent.com/kitadasmalley/fallChallenge2021/main/Data/fallFood.csv")
## Rows: 72531 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): CensusTract, State, County
## dbl (80): Urban, Pop2010, OHU2010, GroupQuartersFlag, NUMGQTRS, PCTGQTRS, LI...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
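Alternatively, readr can map the literal string NULL to NA at import time, which avoids editing the source file by hand; a sketch of that approach:

# Sketch: have read_csv parse the string "NULL" as missing,
# so the affected columns import as numeric without touching the Excel file
fdc <- read_csv(
  "https://raw.githubusercontent.com/kitadasmalley/fallChallenge2021/main/Data/fallFood.csv",
  na = c("", "NA", "NULL")
)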

Part 1: Data Cleaning

Filtering for only the rows in the dataset containing data about Oregon census tracts

Subset fdc to get only Oregon data

Since our team is composed of Willamette University students located in Salem, OR, we decided to concentrate our analysis on Oregon. To do this we filtered the original dataset on the State column; equivalently, Oregon census tracts are those whose IDs begin with 41, Oregon's FIPS state code.

oregon <- fdc %>% 
  filter(State=="Oregon")%>%
  mutate(GEOID=CensusTract)
#str(oregon)
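An equivalent subset could key off the tract ID prefix instead of the State column, since Oregon tract IDs begin with the FIPS code 41; a sketch:

# Equivalent filter using the FIPS prefix: Oregon tract IDs start with "41"
oregon_alt <- fdc %>% 
  filter(substr(CensusTract, 1, 2) == "41") %>%
  mutate(GEOID = CensusTract)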

Subset oregon to get only rows with complete data

To simplify calculations, we decided to further filter the Oregon dataset to only include the rows whose data is complete.

na_oregon <- na.omit(oregon)
#na_oregon %>% 
 # summarize(sum_na = sum(is.na(na_oregon)))
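A quick check, not in the original analysis, of how many tracts the complete-case filter removes:

# Number of Oregon tracts dropped by na.omit()
nrow(oregon) - nrow(na_oregon)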

Part 2: Exploratory Data Analysis

Choropleth of Food Insecurity in Oregon

Counts of Low Access by Census Tracts

library(tidycensus)
library(tigris)
library(leaflet)

census_api_key("fbf65ba295ce81cab0a2eccd52d62ca564bf896b")


# Set a year of interest
# It looks like 2019 is the most recent year of data
this.year = 2019

# MEDIAN HOME VALUE with Geometry
orMedvG <- get_acs(geography = "tract", year=this.year,
                   state = "OR", 
                   variables = "B25077_001E", 
                   geometry = TRUE)


## USE GEO_JOIN TO COMBINE SPATIAL DATA AND OTHER DATA FRAMES
joinOR<-geo_join(spatial_data=orMedvG , data_frame=oregon, 
                 by_sp='GEOID', by_df='GEOID')
popup<-paste("Tract: ", as.character(substring(joinOR$GEOID, 6, 11)), "<br>",
             "Population Low Access: ", as.character(joinOR$LAPOP1_10))

### QUANTILE COLORS
qpal<-colorQuantile("viridis", domain=joinOR$LAPOP1_10,
                    n=5,na.color="#FFFFFF")
leaflet()%>%
  addProviderTiles("CartoDB.Positron")%>%
  addPolygons(data=joinOR,
    fillColor= ~qpal(joinOR$LAPOP1_10),
    fillOpacity = 0.7,
    color="grey",
    opacity=.5,
    weight = 0.4,
    smoothFactor = 0.2,
    popup = popup)%>%
  addLegend("bottomright", pal=qpal, values=joinOR$LAPOP1_10,
            opacity = .7,
            title="Percentiles")
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs ).
## Need '+proj=longlat +datum=WGS84'
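The datum warning suggests reprojecting the polygons to WGS84 before mapping; a sketch assuming the sf package is available:

# Reproject the joined layer to WGS84 (EPSG:4326), the CRS leaflet expects
library(sf)
joinOR <- st_transform(joinOR, crs = 4326)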

Flags for Low Access by Census Tracts

qpal <- colorFactor(palette = c("white", "blue"), 
              levels = c(0, 1))


leaflet()%>%
  addProviderTiles("CartoDB.Positron")%>%
  addPolygons(data=joinOR,
              fillColor= ~qpal(joinOR$LA1and10),
              fillOpacity = 0.7,
              color="grey",
              opacity=.5,
              weight = 0.4,
              smoothFactor = 0.2,
              popup = popup)%>%
  addLegend("bottomright", pal=qpal, values=joinOR$LA1and10,
            opacity = .7,
            title="Rural")
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs ).
## Need '+proj=longlat +datum=WGS84'

Correlation Matrix of Urban

To start off our EDA, we built a correlation matrix of all the variables describing populations living more than one mile from a grocery store. We call this group the urban subgroup.

hm_data_urb <- na_oregon %>% 
  select(c(Urban,Pop2010,OHU2010,NUMGQTRS,PCTGQTRS,LowIncomeTracts,
           PovertyRate,MedianFamilyIncome,LA1and10,LAPOP1_10,lapop1,lapop1share,laseniors1,
           laseniors1share,lawhite1,lawhite1share,lablack1,lablack1share,laasian1,
           laasian1share,lanhopi1,lanhopi1share,laaian1,laaian1share,laomultir1,
           laomultir1share,lahisp1,lahisp1share,lahunv1,lahunv1share,lasnap1,lasnap1share,
           TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,
           TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP))
cormat_urb<-signif(cor(hm_data_urb),2)
#cormat_urb

Heatmap Urban

To better visualize the relationships between the variables, we made a heatmap of the urban subgroup's correlation matrix.

#heatmap(cormat_urb, col=col, symm=TRUE,)
library(viridis)
library(heatmaply)

p <- heatmaply(cormat_urb, 
               #dendrogram = "none",
               xlab = "", ylab = "", 
               main = "Correlation Matrix for Urban (1 mile)",
               #scale = "column",
               margins = c(60,100,40,20),
               grid_color = "white",
               grid_width = 0.00000,
               titleX = FALSE,
               hide_colorbar = FALSE,
               branches_lwd = 0.1,
               label_names = c("Row:", "Column:", "Correlation"),
               fontsize_row = 5, fontsize_col = 5,
               labCol = colnames(cormat_urb),
               labRow = rownames(cormat_urb),
               heatmap_layers = theme(axis.line=element_blank())
)
p

Correlation Matrix Rural

We then repeated the process for the variables describing the rural subgroup.

hm_data_rur <- na_oregon %>% 
  select(c(Urban,Pop2010,OHU2010,NUMGQTRS,PCTGQTRS,LowIncomeTracts,
           PovertyRate,MedianFamilyIncome,LA1and10,LAPOP1_10,lapop10,lapop10share,lalowi10,
           lalowi10share,lakids10,lakids10share,laseniors10,laseniors10share,lawhite10,
           lawhite10share,lablack10,lablack10share,laasian10,laasian10share,lanhopi10,
           lanhopi10share,laaian10,laaian10share,laomultir10,laomultir10share,lahisp10,
           lahisp10share,lahunv10,lahunv10share,lasnap10,lasnap10share,TractSeniors,
           TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,
           TractHUNV,TractSNAP))
cormat_rur<-signif(cor(hm_data_rur),2)
#cormat_rur

Heatmap Rural

#heatmap(cormat_rur, col=col, symm=TRUE,)

library(viridis)
library(heatmaply)

p <- heatmaply(cormat_rur, 
               #dendrogram = "none",
               xlab = "", ylab = "", 
               main = "Correlation Matrix for Rural (10 miles)",
               #scale = "column",
               margins = c(60,100,40,20),
               grid_color = "white",
               grid_width = 0.00000,
               titleX = FALSE,
               hide_colorbar = FALSE,
               branches_lwd = 0.1,
               label_names = c("Row:", "Column:", "Correlation"),
               fontsize_row = 5, fontsize_col = 5,
               labCol = colnames(cormat_rur),
               labRow = rownames(cormat_rur),
               heatmap_layers = theme(axis.line=element_blank())
)
p

Part 3: Linear Regression

Further Data Cleaning

Initial analysis revealed that, because many of the variables are breakdowns of the overall tract populations, there was considerable collinearity among them. To address this, we narrowed our candidate variables for model building to those that were not collinear.

final_data <- na_oregon %>%
  select(c(LILATracts_1And10,LAPOP1_10,Urban,PCTGQTRS,
           PovertyRate,MedianFamilyIncome,lakids1,
           laseniors1,lablack1,lahisp1, lasnap1,
           lakids10,laseniors10,lablack10,lahisp10, lasnap10,
           lahunv1,lahunv10, lapop1share, lapop10share))

Interactive Heatmap for Final Dataset

library(viridis)
library(heatmaply)

cor_final<-signif(cor(final_data),2)

p <- heatmaply(cor_final, 
               #dendrogram = "none",
               xlab = "", ylab = "", 
               main = "Correlation Matrix for Final Dataset",
               #scale = "column",
               margins = c(60,100,40,20),
               grid_color = "white",
               grid_width = 0.00000,
               titleX = FALSE,
               hide_colorbar = FALSE,
               branches_lwd = 0.1,
               label_names = c("Row:", "Column:", "Correlation"),
               fontsize_row = 5, fontsize_col = 5,
               labCol = colnames(cor_final),
               labRow = rownames(cor_final),
               heatmap_layers = theme(axis.line=element_blank())
)
p

Model for Urban subgroup

We will build this model with the lapop1share variable, the share of the tract's population living more than one mile from a grocery store, as the response variable.

Split the data into training and testing sets

Before we train a regression model, we need to split the final_data dataset into two sets: one to train the model on and one to test how well the model predicts.

sample_1 <- sample(1:nrow(final_data), nrow(final_data)/2) 
data_train <- final_data[sample_1,]
data_test <- final_data[-sample_1,]
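Because sample() draws randomly, this split changes on every run; a seeded version (seed value assumed for illustration) makes it reproducible:

# Reproducible version of the same 50/50 split (assumed seed)
set.seed(2021)
sample_1 <- sample(1:nrow(final_data), nrow(final_data)/2) 
data_train <- final_data[sample_1,]
data_test <- final_data[-sample_1,]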

Install packages for building the model

Before we start any model training, we need to install and load the three R packages that we'll use to build our models.

##install.packages("randomForest")##
##install.packages("caret")##
##install.packages("e1071")##
library(randomForest)
library(caret)
library(e1071)

Define the control

An important step in creating and training a model is setting the control parameters used to judge how well the model performs. In this case, we decided that 10-fold cross-validation would be the best way to evaluate model performance.

trControl <- trainControl(method = "cv",
    number = 10,
    search = "grid")

First run the regression with default settings to see the baseline accuracy

Now that all the setup is complete, it is time to build a model. We decided to use the random forest method to build this model, and we used default parameters for the rest.

set.seed(1234)
# Run the model
rf_default_urb <- train(lapop1share~.,
    data = data_train,
    method = "rf",
    trControl = trControl)
# Print the results
print(rf_default_urb)
## Random Forest 
## 
## 71 samples
## 19 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 64, 65, 63, 64, 64, 65, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    18.81658  0.2364892  15.54470
##   10    18.47956  0.2624962  15.24074
##   19    18.43269  0.2711153  15.19470
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.

Expand the grid to improve r^2

After seeing the results from the model built with default parameters, we decided to expand the tuning grid to the 21 mtry values from 30 to 50 in order to pinpoint the best mtry value.

set.seed(1234)
# Run the model over a grid of mtry values from 30 to 50
tuneGrid <- expand.grid(.mtry = c(30:50))
rf_mtry_urb <- train(lapop1share~.,
    data = data_train,
    method = "rf",
    tuneGrid = tuneGrid,
    trControl = trControl)
# Print the results
print(rf_mtry_urb)
## Random Forest 
## 
## 71 samples
## 19 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 64, 65, 63, 64, 64, 65, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   30    18.73570  0.2525681  15.50596
##   31    18.70750  0.2451678  15.45984
##   32    18.57652  0.2725282  15.32741
##   33    18.65256  0.2607360  15.38888
##   34    18.51732  0.2810444  15.31113
##   35    18.66050  0.2582400  15.36201
##   36    18.58040  0.2564149  15.36454
##   37    18.72205  0.2514925  15.45332
##   38    18.54462  0.2531552  15.31387
##   39    18.48582  0.2632008  15.35211
##   40    18.69846  0.2557871  15.39698
##   41    18.66879  0.2623290  15.47636
##   42    18.64211  0.2692837  15.31389
##   43    18.60960  0.2527320  15.39425
##   44    18.55368  0.2594919  15.33702
##   45    18.74672  0.2569804  15.54139
##   46    18.61060  0.2517964  15.33594
##   47    18.64190  0.2600063  15.38536
##   48    18.72513  0.2569499  15.48287
##   49    18.58093  0.2566612  15.35687
##   50    18.59269  0.2564299  15.37035
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 39.

Pick best mtry

We then saved the best mtry value to use when fitting the final model.

best_mtry_urb <- rf_mtry_urb$bestTune$mtry
best_mtry_urb
## [1] 39

Fit the final model with new parameters

Although there are more parameters we could have tuned, we felt that tuning mtry alone produced a sufficient model for prediction.

set.seed(1234)
tuneGrid <- expand.grid(.mtry = best_mtry_urb)
rf_fit_urb <- train(lapop1share~.,
    data = data_train,
    method = "rf",
    tuneGrid = tuneGrid,
    trControl = trControl)
rf_fit_urb
## Random Forest 
## 
## 71 samples
## 19 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 64, 65, 63, 64, 64, 65, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   18.62039  0.2710515  15.37189
## 
## Tuning parameter 'mtry' was held constant at a value of 39
rf_fit_urb$results
##   mtry     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1   39 18.62039 0.2710515 15.37189 3.096944  0.2340185 2.794225

Analyze the Final Model

After fitting our model, we want to see how well it can be used for prediction. Before we can check that, we need to install and load another R package. We decided to use mean squared error (MSE) on the test set to judge how well we built the model.

#install.packages("ModelMetrics")
library(ModelMetrics)
prediction_urb <-predict(rf_fit_urb, data_test)
mse(prediction_urb,data_test$lapop1share)
## [1] 382.0019
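Taking the square root of the MSE gives the test-set RMSE on the same scale as the cross-validated RMSE reported above, making the two directly comparable:

# Test-set RMSE, comparable to the CV RMSE from rf_fit_urb
sqrt(mse(prediction_urb, data_test$lapop1share))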

Look at which variables are most important

We can also see which variables our random forest deemed important. This will help us understand which factors are associated with food insecurity.

var_imp_urb <- varImp(rf_fit_urb)
var_imp_urb
## rf variable importance
## 
##                     Overall
## lakids1            100.0000
## lapop10share        67.9059
## lahisp1             56.8252
## laseniors1          43.2497
## PCTGQTRS            39.3404
## lablack1            31.3000
## lakids10            27.5151
## PovertyRate         16.8594
## MedianFamilyIncome  15.6567
## LAPOP1_10           13.9206
## lahunv1             13.2690
## lasnap1             12.0784
## laseniors10         10.6076
## lahunv10             8.8257
## lahisp10             8.2208
## lablack10            7.8333
## lasnap10             6.3670
## Urban                0.9867
## LILATracts_1And10    0.0000
matImp<-as.matrix(var_imp_urb$importance)

impDF<-data.frame(vars=rownames(matImp),
                  importance=matImp[,1])
#str(matImp)

impDF$vars<-factor(impDF$vars, levels = impDF$vars[order(impDF$importance)])

ggplot(impDF, aes(x=importance, y=vars))+
  geom_col()+
  ggtitle("Variable Importance")

Model for Rural Subgroup

We will build this model with the lapop10share variable, the share of the tract's population living more than ten miles from a grocery store, as the response variable.

First run the regression with default settings to see the baseline accuracy

Now we will repeat the process of model building.

set.seed(1234)
# Run the model
rf_default_rur <- train(lapop10share~.,
    data = data_train,
    method = "rf",
    trControl = trControl)
# Print the results
print(rf_default_rur)
## Random Forest 
## 
## 71 samples
## 19 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 63, 65, 64, 64, 65, 64, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    10.68973  0.9385433  7.353801
##   10    10.12786  0.9226399  6.351505
##   19    10.49905  0.9051555  6.409830
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 10.

Expand the grid to improve r^2

set.seed(1234)
# Run the model over a grid of mtry values from 30 to 50
tuneGrid <- expand.grid(.mtry = c(30:50))
rf_mtry_rur <- train(lapop10share~.,
    data = data_train,
    method = "rf",
    tuneGrid = tuneGrid,
    trControl = trControl)
# Print the results
print(rf_mtry_rur)
## Random Forest 
## 
## 71 samples
## 19 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 63, 65, 64, 64, 65, 64, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   30    10.35124  0.9066682  6.333070
##   31    10.43962  0.9049379  6.395796
##   32    10.36696  0.9025078  6.307175
##   33    10.53206  0.9008418  6.361129
##   34    10.40615  0.9017172  6.310621
##   35    10.48025  0.8996820  6.384833
##   36    10.52830  0.8986936  6.466168
##   37    10.66780  0.8998885  6.542992
##   38    10.57192  0.8989280  6.381574
##   39    10.35507  0.9072573  6.257975
##   40    10.41817  0.9034612  6.303329
##   41    10.54909  0.9017272  6.407191
##   42    10.51899  0.8985665  6.354703
##   43    10.59874  0.8997011  6.465372
##   44    10.66926  0.9026052  6.513557
##   45    10.50207  0.9027343  6.366676
##   46    10.38660  0.9037762  6.427466
##   47    10.61160  0.9033198  6.432779
##   48    10.45948  0.9013762  6.309391
##   49    10.49366  0.9016831  6.411619
##   50    10.49278  0.9005667  6.377281
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 30.

Pick best mtry

best_mtry_rur <- rf_mtry_rur$bestTune$mtry
best_mtry_rur
## [1] 30

Fit the final model with new parameters

set.seed(1234)
tuneGrid <- expand.grid(.mtry = best_mtry_rur)
rf_fit_rur <- train(lapop10share~.,
    data = data_train,
    method = "rf",
    tuneGrid = tuneGrid,
    trControl = trControl)
rf_fit_rur
## Random Forest 
## 
## 71 samples
## 19 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 63, 65, 64, 64, 65, 64, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   10.39212  0.9037354  6.321981
## 
## Tuning parameter 'mtry' was held constant at a value of 30
rf_fit_rur$results
##   mtry     RMSE  Rsquared      MAE  RMSESD RsquaredSD    MAESD
## 1   30 10.39212 0.9037354 6.321981 5.50931  0.1086655 3.151746

Analyze the Final Model

prediction_rur <-predict(rf_fit_rur, data_test)
mse(prediction_rur,data_test$lapop10share)
## [1] 4418.325

Look at which variables are most important

var_imp_rur <- varImp(rf_fit_rur)
var_imp_rur
## rf variable importance
## 
##                     Overall
## lakids10           100.0000
## laseniors10         75.7220
## lasnap10             9.2479
## lahisp10             7.2082
## lahunv10             6.4860
## LAPOP1_10            3.5686
## lablack10            2.8957
## MedianFamilyIncome   2.6656
## lablack1             1.3422
## lahunv1              1.1465
## lakids1              1.1341
## laseniors1           0.9493
## lahisp1              0.9452
## lasnap1              0.8778
## lapop1share          0.6148
## PCTGQTRS             0.4586
## PovertyRate          0.3801
## LILATracts_1And10    0.2278
## Urban                0.0000
matImp<-as.matrix(var_imp_rur$importance)

impDF<-data.frame(vars=rownames(matImp),
                  importance=matImp[,1])
#str(matImp)

impDF$vars<-factor(impDF$vars, levels = impDF$vars[order(impDF$importance)])

ggplot(impDF, aes(x=importance, y=vars))+
  geom_col()+
  ggtitle("Variable Importance")

Part 4: Tree Regression

Because linear regression models built from many explanatory variables tend to be hard to interpret, we also turned to tree regression, which produces a much more intuitive model. Here we have to install another R package as well.

#install.packages("rpart")
library(rpart)

Create classification tree

Here we built a classification tree with LILATracts_1And10, the flag for low-income, low-access tracts, as the response variable.

tree_lali_flag<-rpart(LILATracts_1And10~., data=final_data,
                     control=rpart.control(minsplit=1),
                     method="class")
summary(tree_lali_flag)
## Call:
## rpart(formula = LILATracts_1And10 ~ ., data = final_data, method = "class", 
##     control = rpart.control(minsplit = 1))
##   n= 142 
## 
##           CP nsplit  rel error    xerror       xstd
## 1 0.46428571      0 1.00000000 1.0000000 0.16932818
## 2 0.03571429      2 0.07142857 0.1785714 0.07844099
## 3 0.01000000      4 0.00000000 0.2500000 0.09213268
## 
## Variable importance
## MedianFamilyIncome          LAPOP1_10           lasnap10        laseniors10 
##                 19                 16                 12                 12 
##       lapop10share           lahisp10           lakids10        PovertyRate 
##                 10                 10                  9                  7 
##           lahunv10          lablack10           PCTGQTRS 
##                  4                  2                  1 
## 
## Node number 1: 142 observations,    complexity param=0.4642857
##   predicted class=0  expected loss=0.1971831  P(node) =1
##     class counts:   114    28
##    probabilities: 0.803 0.197 
##   left son=2 (100 obs) right son=3 (42 obs)
##   Primary splits:
##       MedianFamilyIncome < 56174   to the right, improve=21.22822, (0 missing)
##       lasnap10           < 63.5    to the left,  improve=18.29224, (0 missing)
##       LAPOP1_10          < 504.5   to the left,  improve=16.95775, (0 missing)
##       lahunv10           < 21.5    to the left,  improve=13.65418, (0 missing)
##       laseniors10        < 122     to the left,  improve=12.19665, (0 missing)
##   Surrogate splits:
##       PovertyRate < 16.1    to the left,  agree=0.824, adj=0.405, (0 split)
##       lahunv10    < 21.5    to the left,  agree=0.775, adj=0.238, (0 split)
##       lasnap10    < 63.5    to the left,  agree=0.754, adj=0.167, (0 split)
##       laseniors10 < 227     to the left,  agree=0.732, adj=0.095, (0 split)
##       lablack10   < 4.5     to the left,  agree=0.732, adj=0.095, (0 split)
## 
## Node number 2: 100 observations,    complexity param=0.03571429
##   predicted class=0  expected loss=0.02  P(node) =0.7042254
##     class counts:    98     2
##    probabilities: 0.980 0.020 
##   left son=4 (93 obs) right son=5 (7 obs)
##   Primary splits:
##       lasnap10           < 80.5    to the left,  improve=1.0628570, (0 missing)
##       laseniors10        < 481     to the left,  improve=0.9404082, (0 missing)
##       lapop10share       < 89.35   to the left,  improve=0.9404082, (0 missing)
##       MedianFamilyIncome < 57865.5 to the right, improve=0.8088889, (0 missing)
##       lahisp10           < 42.5    to the left,  improve=0.5866667, (0 missing)
##   Surrogate splits:
##       laseniors10  < 332.5   to the left,  agree=0.98, adj=0.714, (0 split)
##       lakids10     < 310.5   to the left,  agree=0.97, adj=0.571, (0 split)
##       lapop10share < 49.575  to the left,  agree=0.97, adj=0.571, (0 split)
##       LAPOP1_10    < 1512    to the left,  agree=0.96, adj=0.429, (0 split)
##       lahunv10     < 24.5    to the left,  agree=0.96, adj=0.429, (0 split)
## 
## Node number 3: 42 observations,    complexity param=0.4642857
##   predicted class=1  expected loss=0.3809524  P(node) =0.2957746
##     class counts:    16    26
##    probabilities: 0.381 0.619 
##   left son=6 (16 obs) right son=7 (26 obs)
##   Primary splits:
##       LAPOP1_10    < 483     to the left,  improve=19.809520, (0 missing)
##       lapop10share < 18.16   to the left,  improve=11.082250, (0 missing)
##       laseniors10  < 130     to the left,  improve=10.070390, (0 missing)
##       lasnap10     < 61      to the left,  improve=10.070390, (0 missing)
##       lakids10     < 100.5   to the left,  improve= 9.333333, (0 missing)
##   Surrogate splits:
##       laseniors10  < 85      to the left,  agree=0.857, adj=0.625, (0 split)
##       lapop10share < 13.495  to the left,  agree=0.857, adj=0.625, (0 split)
##       lakids10     < 64.5    to the left,  agree=0.833, adj=0.563, (0 split)
##       lahisp10     < 3       to the left,  agree=0.833, adj=0.563, (0 split)
##       lasnap10     < 17.5    to the left,  agree=0.833, adj=0.563, (0 split)
## 
## Node number 4: 93 observations
##   predicted class=0  expected loss=0  P(node) =0.6549296
##     class counts:    93     0
##    probabilities: 1.000 0.000 
## 
## Node number 5: 7 observations,    complexity param=0.03571429
##   predicted class=0  expected loss=0.2857143  P(node) =0.04929577
##     class counts:     5     2
##    probabilities: 0.714 0.286 
##   left son=10 (5 obs) right son=11 (2 obs)
##   Primary splits:
##       MedianFamilyIncome < 58968.5 to the right, improve=2.857143, (0 missing)
##       PCTGQTRS           < 1.075   to the left,  improve=1.523810, (0 missing)
##       lahisp10           < 66.5    to the right, improve=1.523810, (0 missing)
##       LAPOP1_10          < 1243    to the right, improve=1.190476, (0 missing)
##       lakids1            < 456     to the right, improve=1.190476, (0 missing)
##   Surrogate splits:
##       PCTGQTRS < 1.075   to the left,  agree=0.857, adj=0.5, (0 split)
##       lahisp10 < 66.5    to the right, agree=0.857, adj=0.5, (0 split)
## 
## Node number 6: 16 observations
##   predicted class=0  expected loss=0  P(node) =0.1126761
##     class counts:    16     0
##    probabilities: 1.000 0.000 
## 
## Node number 7: 26 observations
##   predicted class=1  expected loss=0  P(node) =0.1830986
##     class counts:     0    26
##    probabilities: 0.000 1.000 
## 
## Node number 10: 5 observations
##   predicted class=0  expected loss=0  P(node) =0.03521127
##     class counts:     5     0
##    probabilities: 1.000 0.000 
## 
## Node number 11: 2 observations
##   predicted class=1  expected loss=0  P(node) =0.01408451
##     class counts:     0     2
##    probabilities: 0.000 1.000
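The CP table above shows the cross-validated error (xerror) is smallest at two splits, hinting the full tree may overfit slightly; a pruning sketch using rpart's complexity table:

# Prune at the CP value with the lowest cross-validated error
best_cp <- tree_lali_flag$cptable[which.min(tree_lali_flag$cptable[, "xerror"]), "CP"]
tree_pruned <- prune(tree_lali_flag, cp = best_cp)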

Visualizing the tree

After building the tree, we visualize it.

par(mfrow=c(1,1))
plot(tree_lali_flag , uniform=TRUE,margin=0.2,
     main="Classification Tree for Low Access Low Income Flag")
text(tree_lali_flag , use.n=TRUE, all=TRUE, cex=.8)

Alternate Visualization

Here is the same tree, but visualized with a different style. It uses a different R package, so that package needed to be installed and loaded as well.

#install.packages("rpart.plot")
library(rpart.plot)
prp(tree_lali_flag, faclen = 0, cex = 0.7, extra=1, space=.5)

Part 5: Logistic Regression

We chose logistic regression because we wanted a binary response: to predict whether a tract will be flagged as a low-income, low-access tract. We again built a model with LILATracts_1And10 as our response variable. (Note that the glm() calls below use the default gaussian family; adding family = binomial would give a true logistic fit.)

But first we need to load another R package into our environment.

library(leaps)

Find the best subset of explanatory variables for up to eight model sizes

To see which of the variables from the final_data dataset should be used at each model size, we used exhaustive best-subset selection to pick the best combination of explanatory variables for models with one through eight predictors.

regfit_LILATracts_1And10 <- regsubsets(LILATracts_1And10~.,data=final_data)
summary(regfit_LILATracts_1And10)
## Subset selection object
## Call: regsubsets.formula(LILATracts_1And10 ~ ., data = final_data)
## 19 Variables  (and intercept)
##                    Forced in Forced out
## LAPOP1_10              FALSE      FALSE
## Urban                  FALSE      FALSE
## PCTGQTRS               FALSE      FALSE
## PovertyRate            FALSE      FALSE
## MedianFamilyIncome     FALSE      FALSE
## lakids1                FALSE      FALSE
## laseniors1             FALSE      FALSE
## lablack1               FALSE      FALSE
## lahisp1                FALSE      FALSE
## lasnap1                FALSE      FALSE
## lakids10               FALSE      FALSE
## laseniors10            FALSE      FALSE
## lablack10              FALSE      FALSE
## lahisp10               FALSE      FALSE
## lasnap10               FALSE      FALSE
## lahunv1                FALSE      FALSE
## lahunv10               FALSE      FALSE
## lapop1share            FALSE      FALSE
## lapop10share           FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          LAPOP1_10 Urban PCTGQTRS PovertyRate MedianFamilyIncome lakids1
## 1  ( 1 ) " "       " "   " "      " "         " "                " "    
## 2  ( 1 ) " "       " "   " "      " "         "*"                " "    
## 3  ( 1 ) " "       "*"   " "      " "         "*"                " "    
## 4  ( 1 ) " "       " "   " "      " "         "*"                " "    
## 5  ( 1 ) " "       "*"   " "      " "         "*"                " "    
## 6  ( 1 ) " "       "*"   " "      " "         "*"                " "    
## 7  ( 1 ) " "       "*"   " "      " "         "*"                " "    
## 8  ( 1 ) " "       "*"   " "      " "         "*"                " "    
##          laseniors1 lablack1 lahisp1 lasnap1 lakids10 laseniors10 lablack10
## 1  ( 1 ) " "        " "      " "     " "     " "      " "         " "      
## 2  ( 1 ) " "        " "      " "     " "     " "      " "         " "      
## 3  ( 1 ) " "        " "      " "     " "     " "      " "         " "      
## 4  ( 1 ) " "        " "      " "     " "     "*"      " "         " "      
## 5  ( 1 ) " "        " "      " "     " "     "*"      " "         " "      
## 6  ( 1 ) " "        " "      " "     " "     "*"      "*"         " "      
## 7  ( 1 ) " "        " "      " "     " "     "*"      "*"         " "      
## 8  ( 1 ) "*"        " "      " "     " "     "*"      "*"         " "      
##          lahisp10 lasnap10 lahunv1 lahunv10 lapop1share lapop10share
## 1  ( 1 ) " "      "*"      " "     " "      " "         " "         
## 2  ( 1 ) " "      " "      " "     " "      " "         "*"         
## 3  ( 1 ) " "      " "      " "     " "      " "         "*"         
## 4  ( 1 ) " "      "*"      " "     " "      " "         "*"         
## 5  ( 1 ) " "      "*"      " "     " "      " "         "*"         
## 6  ( 1 ) " "      "*"      " "     " "      " "         "*"         
## 7  ( 1 ) " "      "*"      " "     " "      "*"         "*"         
## 8  ( 1 ) " "      "*"      " "     " "      "*"         "*"

One explanatory variable logistic model

For this model and all the rest, we take the variables indicated above as the best explanatory variables for each model size and use them as the predictors.

var_2_mod <- glm(LILATracts_1And10~lasnap10,data=data_train)
summary(var_2_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10, data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.63942  -0.13925  -0.07699  -0.05921   0.94079  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0592054  0.0444331   1.332    0.187    
## lasnap10    0.0029647  0.0005279   5.616  3.8e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1056245)
## 
##     Null deviance: 10.6197  on 70  degrees of freedom
## Residual deviance:  7.2881  on 69  degrees of freedom
## AIC: 45.862
## 
## Number of Fisher Scoring iterations: 2

Model Performance

For every model, we will also create a confusion matrix and use it to find the model's error rate, which will be our standard measure of performance.
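Since the same prediction, confusion-matrix, and error-rate steps repeat for every model below, they could be wrapped in a small helper; this function is a hypothetical convenience, not part of the original analysis:

# Hypothetical helper: misclassification rate of a fitted glm on a test set
error_rate <- function(model, test_df, threshold = 0.5) {
  pred <- predict(model, newdata = test_df, type = "response") > threshold
  mean(pred != (test_df$LILATracts_1And10 == 1))
}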

pred1 <- predict(var_2_mod, newdata = data_test, type="response")

conf_mat1<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred1>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat1
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    111
## 2                 0 TRUE                       3
## 3                 1 FALSE                     27
## 4                 1 TRUE                       1

Error rate

correct <- conf_mat1$n[c(1,4)]
er_1 <- 1-sum(correct)/sum(conf_mat1$n)
er_1
## [1] 0.2112676

Two explanatory variable logistic model

var_3_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome,data=data_train)
summary(var_3_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome, 
##     data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.44765  -0.18252  -0.06993   0.14692   0.78251  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.023e+00  1.952e-01   5.241 1.70e-06 ***
## lasnap10            2.021e-03  4.910e-04   4.117 0.000106 ***
## MedianFamilyIncome -1.501e-05  2.981e-06  -5.035 3.74e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07807051)
## 
##     Null deviance: 10.6197  on 70  degrees of freedom
## Residual deviance:  5.3088  on 68  degrees of freedom
## AIC: 25.364
## 
## Number of Fisher Scoring iterations: 2

Model Performance

pred2 <- predict(var_3_mod, newdata = data_test, type="response")

conf_mat2<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred2>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat2
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    103
## 2                 0 TRUE                      11
## 3                 1 FALSE                     25
## 4                 1 TRUE                       3

Error rate

correct2 <- conf_mat2$n[c(1,4)]
er_2 <- 1-sum(correct2)/sum(conf_mat2$n)
er_2
## [1] 0.2535211

Three explanatory variable logistic model

var_4_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban,data=data_train)
summary(var_4_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome + 
##     Urban, data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.46390  -0.18143  -0.04975   0.13826   0.63090  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.062e+00  1.933e-01   5.495 6.53e-07 ***
## lasnap10            2.054e-03  4.835e-04   4.248 6.80e-05 ***
## MedianFamilyIncome -1.594e-05  2.978e-06  -5.351 1.14e-06 ***
## Urban               2.341e-01  1.303e-01   1.797   0.0768 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07559208)
## 
##     Null deviance: 10.6197  on 70  degrees of freedom
## Residual deviance:  5.0647  on 67  degrees of freedom
## AIC: 24.022
## 
## Number of Fisher Scoring iterations: 2

Model Performance

pred3 <- predict(var_4_mod, newdata = data_test, type="response")

conf_mat3<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred3>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat3
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    103
## 2                 0 TRUE                      11
## 3                 1 FALSE                     25
## 4                 1 TRUE                       3

Error rate

correct3 <- conf_mat3$n[c(1,4)]
er_3 <- 1-sum(correct3)/sum(conf_mat3$n)
er_3
## [1] 0.2535211

Four explanatory variable logistic model

var_5_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids1,data=data_train)
summary(var_5_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome + 
##     Urban + lakids1, data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4611  -0.1734  -0.0538   0.1275   0.6169  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.071e+00  1.953e-01   5.481 7.13e-07 ***
## lasnap10            2.074e-03  4.884e-04   4.247 6.94e-05 ***
## MedianFamilyIncome -1.565e-05  3.063e-06  -5.108 2.98e-06 ***
## Urban               2.302e-01  1.313e-01   1.753   0.0843 .  
## lakids1            -4.294e-05  9.456e-05  -0.454   0.6512    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07649838)
## 
##     Null deviance: 10.6197  on 70  degrees of freedom
## Residual deviance:  5.0489  on 66  degrees of freedom
## AIC: 25.8
## 
## Number of Fisher Scoring iterations: 2

Model Performance

pred4 <- predict(var_5_mod, newdata = data_test, type="response")

conf_mat4<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred4>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat4
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    103
## 2                 0 TRUE                      11
## 3                 1 FALSE                     25
## 4                 1 TRUE                       3

Error rate

correct4 <- conf_mat4$n[c(1,4)]
er_4 <- 1-sum(correct4)/sum(conf_mat4$n)
er_4
## [1] 0.2535211

Five explanatory variable logistic model

var_6_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids1+lahunv10,
                 data=data_train)
summary(var_6_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome + 
##     Urban + lakids1 + lahunv10, data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.39008  -0.18087  -0.05161   0.09066   0.68533  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.039e+00  1.921e-01   5.407 9.82e-07 ***
## lasnap10            8.415e-04  7.978e-04   1.055   0.2954    
## MedianFamilyIncome -1.545e-05  3.003e-06  -5.145 2.67e-06 ***
## Urban               2.288e-01  1.287e-01   1.777   0.0802 .  
## lakids1            -1.699e-05  9.364e-05  -0.181   0.8566    
## lahunv10            7.686e-03  3.981e-03   1.931   0.0579 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07346162)
## 
##     Null deviance: 10.620  on 70  degrees of freedom
## Residual deviance:  4.775  on 65  degrees of freedom
## AIC: 23.84
## 
## Number of Fisher Scoring iterations: 2

Model Performance

pred5 <- predict(var_6_mod, newdata = data_test, type="response")

conf_mat5<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred5>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat5
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    102
## 2                 0 TRUE                      12
## 3                 1 FALSE                     24
## 4                 1 TRUE                       4

Error rate

correct5 <- conf_mat5$n[c(1,4)]
er_5 <- 1-sum(correct5)/sum(conf_mat5$n)
er_5
## [1] 0.2535211

Six explanatory variable logistic model

var_7_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids10+lahunv10+
                 lahunv1,data=data_train)
summary(var_7_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome + 
##     Urban + lakids10 + lahunv10 + lahunv1, data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.38997  -0.17647  -0.05468   0.09575   0.68514  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.047e+00  1.922e-01   5.446 8.73e-07 ***
## lasnap10            1.025e-03  1.046e-03   0.980   0.3307    
## MedianFamilyIncome -1.520e-05  2.983e-06  -5.096 3.30e-06 ***
## Urban               2.179e-01  1.299e-01   1.677   0.0985 .  
## lakids10           -1.968e-04  5.139e-04  -0.383   0.7030    
## lahunv10            9.096e-03  4.309e-03   2.111   0.0387 *  
## lahunv1            -8.238e-04  9.411e-04  -0.875   0.3847    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0736832)
## 
##     Null deviance: 10.6197  on 70  degrees of freedom
## Residual deviance:  4.7157  on 64  degrees of freedom
## AIC: 24.953
## 
## Number of Fisher Scoring iterations: 2

Model Performance

pred6 <- predict(var_7_mod, newdata = data_test, type="response")

conf_mat6<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred6>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat6
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    102
## 2                 0 TRUE                      12
## 3                 1 FALSE                     24
## 4                 1 TRUE                       4

Error rate

correct6 <- conf_mat6$n[c(1,4)]
er_6 <- 1-sum(correct6)/sum(conf_mat6$n)
er_6
## [1] 0.2535211

Seven explanatory variable logistic model

var_8_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids10+lahunv10+
                 lahunv1+lablack10,data=data_train)
summary(var_8_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome + 
##     Urban + lakids10 + lahunv10 + lahunv1 + lablack10, data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.36020  -0.17269  -0.06586   0.11719   0.71956  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.027e+00  1.924e-01   5.339 1.36e-06 ***
## lasnap10            8.506e-04  1.054e-03   0.807   0.4226    
## MedianFamilyIncome -1.486e-05  2.989e-06  -4.972 5.39e-06 ***
## Urban               2.028e-01  1.302e-01   1.558   0.1243    
## lakids10           -3.381e-04  5.265e-04  -0.642   0.5231    
## lahunv10            9.043e-03  4.297e-03   2.104   0.0393 *  
## lahunv1            -1.054e-03  9.589e-04  -1.099   0.2760    
## lablack10           1.692e-02  1.449e-02   1.168   0.2474    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0732674)
## 
##     Null deviance: 10.6197  on 70  degrees of freedom
## Residual deviance:  4.6158  on 63  degrees of freedom
## AIC: 25.433
## 
## Number of Fisher Scoring iterations: 2

Model Performance

pred7 <- predict(var_8_mod, newdata = data_test, type="response")

conf_mat7<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred7>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat7
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    103
## 2                 0 TRUE                      11
## 3                 1 FALSE                     25
## 4                 1 TRUE                       3

Error Rate

correct7 <- conf_mat7$n[c(1,4)]
er_7 <- 1-sum(correct7)/sum(conf_mat7$n)
er_7
## [1] 0.2535211

Eight explanatory variable logistic model

var_9_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids10+lahunv10+
                 lahunv1+lablack10+lahisp1,data=data_train)
summary(var_9_mod)
## 
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome + 
##     Urban + lakids10 + lahunv10 + lahunv1 + lablack10 + lahisp1, 
##     data = data_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.37802  -0.16790  -0.05736   0.11733   0.69978  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.038e+00  1.934e-01   5.366 1.27e-06 ***
## lasnap10            8.895e-04  1.058e-03   0.841   0.4037    
## MedianFamilyIncome -1.467e-05  3.008e-06  -4.876 7.87e-06 ***
## Urban               1.926e-01  1.312e-01   1.468   0.1471    
## lakids10           -3.246e-04  5.283e-04  -0.614   0.5412    
## lahunv10            8.168e-03  4.448e-03   1.837   0.0711 .  
## lahunv1            -8.850e-04  9.849e-04  -0.899   0.3723    
## lablack10           1.953e-02  1.490e-02   1.311   0.1948    
## lahisp1            -1.784e-04  2.244e-04  -0.795   0.4295    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07369738)
## 
##     Null deviance: 10.6197  on 70  degrees of freedom
## Residual deviance:  4.5692  on 62  degrees of freedom
## AIC: 26.713
## 
## Number of Fisher Scoring iterations: 2

Model Performance

pred8 <- predict(var_9_mod, newdata = data_test, type="response")

conf_mat8<-data.frame(LILATracts_1And10=data_test$LILATracts_1And10,
                      pred_LILATracts_1And10=pred8>.5)%>%
  group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat8
## # A tibble: 4 × 3
## # Groups:   LILATracts_1And10 [2]
##   LILATracts_1And10 pred_LILATracts_1And10     n
##               <dbl> <lgl>                  <int>
## 1                 0 FALSE                    101
## 2                 0 TRUE                      13
## 3                 1 FALSE                     25
## 4                 1 TRUE                       3

Error Rate

correct8 <- conf_mat8$n[c(1,4)]
er_8 <- 1-sum(correct8)/sum(conf_mat8$n)
er_8
## [1] 0.2676056

Picking the best logistic model

For our final logistic model, we chose the model with the lowest error rate. To see this easily, we first made a data frame of all the error rates and their corresponding model sizes. Then we created a scatter plot overlaid with a line to show which model performed best.

Creating the data frame

error <- data.frame(number_of_exp_vars=c(1,2,3,4,5,6,7,8),error_rate=c(er_1,er_2,er_3,er_4,er_5,
                                                                   er_6,er_7,er_8))
str(error)
## 'data.frame':    8 obs. of  2 variables:
##  $ number_of_exp_vars: num  1 2 3 4 5 6 7 8
##  $ error_rate        : num  0.211 0.254 0.254 0.254 0.254 ...

Graph of the error rates

ggplot(error,aes(number_of_exp_vars,error_rate))+
  geom_point()+
  geom_line()
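The best model size can also be pulled out programmatically rather than read off the plot:

# Model size with the lowest test error rate
error$number_of_exp_vars[which.min(error$error_rate)]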

From the graph we can clearly see that the simplest model, with only one explanatory variable, was the most accurate at predicting low-income, low-access tracts in Oregon.