The original data set used the string NULL instead of NA, which causes the columns of the data set to be imported as strings rather than as numeric. To fix this we changed all NULLs to NAs in the original Excel file and uploaded the corrected file to GitHub so that it could be imported into RStudio.
fdc <- read_csv("https://raw.githubusercontent.com/kitadasmalley/fallChallenge2021/main/Data/fallFood.csv")
## Rows: 72531 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): CensusTract, State, County
## dbl (80): Urban, Pop2010, OHU2010, GroupQuartersFlag, NUMGQTRS, PCTGQTRS, LI...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
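Alternatively, the NULL strings could have been handled at import time: readr's read_csv() takes an na argument listing the strings to treat as missing. A minimal sketch, assuming the raw file still contained the NULL codes:
library(readr)
# Treat the literal string "NULL" (as well as "" and "NA") as missing at import
fdc_alt <- read_csv(
  "https://raw.githubusercontent.com/kitadasmalley/fallChallenge2021/main/Data/fallFood.csv",
  na = c("", "NA", "NULL")
)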
Since our team is made up of Willamette University students located in Salem, OR, we decided to concentrate our analysis on Oregon. To do this we filtered the original dataset to the rows where State is Oregon, i.e. the census tracts whose IDs begin with 41, Oregon’s state code.
oregon <- fdc %>%
filter(State=="Oregon")%>%
mutate(GEOID=CensusTract)
#str(oregon)
To simplify calculations, we further filtered the Oregon dataset to include only the rows with complete data (no missing values).
na_oregon <- na.omit(oregon)
#na_oregon %>%
# summarize(sum_na = sum(is.na(na_oregon)))
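A quick way to check how many rows were dropped and where the missing values were concentrated (a small sketch using base R):
nrow(oregon) - nrow(na_oregon)   # number of rows removed by na.omit()
colSums(is.na(oregon))           # count of missing values in each column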
library(tidycensus)
library(tigris)
library(leaflet)
census_api_key("fbf65ba295ce81cab0a2eccd52d62ca564bf896b")
# Set a year of interest
# It looks like 2019 is the most recent year of data
this.year = 2019
# MEDIAN HOME VALUE with Geometry
orMedvG <- get_acs(geography = "tract", year=this.year,
state = "OR",
variables = "B25077_001E",
geometry = TRUE)
## USE GEO_JOIN TO COMBINE SPATIAL DATA AND OTHER DATA FRAMES
joinOR<-geo_join(spatial_data=orMedvG , data_frame=oregon,
by_sp='GEOID', by_df='GEOID')
popup<-paste("Tract: ", as.character(substring(joinOR$GEOID, 6, 11)), "<br>",
"Population Low Access: ", as.character(joinOR$LAPOP1_10))
### QUANTILE COLORS
qpal<-colorQuantile("viridis", domain=joinOR$LAPOP1_10,
n=5,na.color="#FFFFFF")
leaflet()%>%
addProviderTiles("CartoDB.Positron")%>%
addPolygons(data=joinOR,
fillColor= ~qpal(joinOR$LAPOP1_10),
fillOpacity = 0.7,
color="grey",
opacity=.5,
weight = 0.4,
smoothFactor = 0.2,
popup = popup)%>%
addLegend("bottomright", pal=qpal, values=joinOR$LAPOP1_10,
opacity = .7,
title="Percentiles")
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs ).
## Need '+proj=longlat +datum=WGS84'
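The warning above (which also appears for the map below) indicates that the tract geometries come back in the NAD83/GRS80 datum while leaflet expects WGS84. One way to address it, sketched here assuming the sf package, is to transform the joined layer before mapping:
library(sf)
# Reproject the joined spatial layer to WGS84 (EPSG:4326) for leaflet
joinOR <- st_transform(joinOR, crs = 4326)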
qpal <- colorFactor(palette = c("white", "blue"),
levels = c(0, 1))
leaflet()%>%
addProviderTiles("CartoDB.Positron")%>%
addPolygons(data=joinOR,
fillColor= ~qpal(joinOR$LA1and10),
fillOpacity = 0.7,
color="grey",
opacity=.5,
weight = 0.4,
smoothFactor = 0.2,
popup = popup)%>%
addLegend("bottomright", pal=qpal, values=joinOR$LA1and10,
opacity = .7,
title="Rural")
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +no_defs ).
## Need '+proj=longlat +datum=WGS84'
To start off our EDA we decided to build a correlation matrix of all the variables pertaining to populations living more than one mile from a grocery store. We call this group the urban population/subgroup.
hm_data_urb <- na_oregon %>%
select(c(Urban,Pop2010,OHU2010,NUMGQTRS,PCTGQTRS,LowIncomeTracts,
PovertyRate,MedianFamilyIncome,LA1and10,LAPOP1_10,lapop1,lapop1share,laseniors1,
laseniors1share,lawhite1,lawhite1share,lablack1,lablack1share,laasian1,
laasian1share,lanhopi1,lanhopi1share,laaian1,laaian1share,laomultir1,
laomultir1share,lahisp1,lahisp1share,lahunv1,lahunv1share,lasnap1,lasnap1share,
TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,
TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP))
cormat_urb<-signif(cor(hm_data_urb),2)
#cormat_urb
To better visualize the relationships between the variables, we made a heatmap of the urban subgroup’s correlation matrix.
#heatmap(cormat_urb, col=col, symm=TRUE,)
library(viridis)
library(heatmaply)
p <- heatmaply(cormat_urb,
#dendrogram = "none",
xlab = "", ylab = "",
main = "Correlation Matrix for Urban (1 mile)",
#scale = "column",
margins = c(60,100,40,20),
grid_color = "white",
grid_width = 0.00000,
titleX = FALSE,
hide_colorbar = FALSE,
branches_lwd = 0.1,
label_names = c("Row:", "Column:", "Correlation"),
fontsize_row = 5, fontsize_col = 5,
labCol = colnames(cormat_urb),
labRow = rownames(cormat_urb),
heatmap_layers = theme(axis.line=element_blank())
)
p
Next we repeated the process for the variables that describe the rural population/subgroup, those living more than ten miles from a grocery store.
hm_data_rur <- na_oregon %>%
select(c(Urban,Pop2010,OHU2010,NUMGQTRS,PCTGQTRS,LowIncomeTracts,
PovertyRate,MedianFamilyIncome,LA1and10,LAPOP1_10,lapop10,lapop10share,lalowi10,
lalowi10share,lakids10,lakids10share,laseniors10,laseniors10share,lawhite10,
lawhite10share,lablack10,lablack10share,laasian10,laasian10share,lanhopi10,
lanhopi10share,laaian10,laaian10share,laomultir10,laomultir10share,lahisp10,
lahisp10share,lahunv10,lahunv10share,lasnap10,lasnap10share,TractSeniors,
TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,
TractHUNV,TractSNAP))
cormat_rur<-signif(cor(hm_data_rur),2)
#cormat_rur
#heatmap(cormat_rur, col=col, symm=TRUE,)
library(viridis)
library(heatmaply)
p <- heatmaply(cormat_rur,
#dendrogram = "none",
xlab = "", ylab = "",
main = "Correlation Matrix for Rural (10 miles)",
#scale = "column",
margins = c(60,100,40,20),
grid_color = "white",
grid_width = 0.00000,
titleX = FALSE,
hide_colorbar = FALSE,
branches_lwd = 0.1,
label_names = c("Row:", "Column:", "Correlation"),
fontsize_row = 5, fontsize_col = 5,
labCol = colnames(cormat_rur),
labRow = rownames(cormat_rur),
heatmap_layers = theme(axis.line=element_blank())
)
p
After our initial analysis of the data, we discovered that because many of the variables are breakdowns of the overall tract populations, there was a great deal of collinearity between them. To address this, we narrowed our candidate variables for model building down to those that were not collinear.
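caret’s findCorrelation() offers one way to automate this kind of screening; a sketch, assuming a cutoff of 0.9 applied to the urban correlation matrix (in the end we made the selection by hand, shown next):
library(caret)
# Flag columns whose pairwise correlations exceed the cutoff
high_cor <- findCorrelation(cormat_urb, cutoff = 0.9)
colnames(cormat_urb)[high_cor]   # candidate variables to drop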
final_data <- na_oregon %>%
select(c(LILATracts_1And10,LAPOP1_10,Urban,PCTGQTRS,
PovertyRate,MedianFamilyIncome,lakids1,
laseniors1,lablack1,lahisp1, lasnap1,
lakids10,laseniors10,lablack10,lahisp10, lasnap10,
lahunv1,lahunv10, lapop1share, lapop10share))
library(viridis)
library(heatmaply)
cor_final<-signif(cor(final_data),2)
p <- heatmaply(cor_final,
#dendrogram = "none",
xlab = "", ylab = "",
main = "Correlation Matrix for Final Dataset",
#scale = "column",
margins = c(60,100,40,20),
grid_color = "white",
grid_width = 0.00000,
titleX = FALSE,
hide_colorbar = FALSE,
branches_lwd = 0.1,
label_names = c("Row:", "Column:", "Correlation"),
fontsize_row = 5, fontsize_col = 5,
labCol = colnames(cor_final),
labRow = rownames(cor_final),
heatmap_layers = theme(axis.line=element_blank())
)
p
We will build this model with lapop1share, the share of the tract’s population living more than one mile from a grocery store, as the response variable.
Before we train a regression model, we need to split the final_data dataset into two sets: one to train the model on and one to test how well the model predicts.
sample_1 <- sample(1:nrow(final_data), nrow(final_data)/2)
data_train <- final_data[sample_1,]
data_test <- final_data[-sample_1,]
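One caveat: because sample() draws at random and no seed is set before the split, the train/test partition (and all downstream results) will vary from run to run. A reproducible variant of the split, as a sketch:
set.seed(1234)   # fix the random split so results can be reproduced
sample_1   <- sample(1:nrow(final_data), nrow(final_data)/2)
data_train <- final_data[sample_1,]
data_test  <- final_data[-sample_1,]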
Before we start any model training, we need to install and load the three R packages we will use to build our models.
##install.packages("randomForest")##
##install.packages("caret")##
##install.packages("e1071")##
library(randomForest)
library(caret)
library(e1071)
An important step in creating and training a model is setting up the control parameters used to judge how well the model performs. In this case, we decided that 10-fold cross-validation would be the best way to evaluate model performance.
trControl <- trainControl(method = "cv",
number = 10,
search = "grid")
Now that all the setup is complete, it is time to build a model. We decided to use the random forest method to build this model, and we used default parameters for the rest.
set.seed(1234)
# Run the model
rf_default_urb <- train(lapop1share~.,
data = data_train,
method = "rf",
trControl = trControl)
# Print the results
print(rf_default_urb)
## Random Forest
##
## 71 samples
## 19 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 64, 65, 63, 64, 64, 65, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 18.81658 0.2364892 15.54470
## 10 18.47956 0.2624962 15.24074
## 19 18.43269 0.2711153 15.19470
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 19.
After seeing the results from the model built with default parameters, we expanded the tuning grid to 21 specific mtry values (30 through 50) in order to pinpoint the best mtry value.
set.seed(1234)
# Run the model
tuneGrid <- expand.grid(.mtry = c(30: 50))
rf_mtry_urb <- train(lapop1share~.,
data = data_train,
method = "rf",
tuneGrid = tuneGrid,
trControl = trControl)
# Print the results
print(rf_mtry_urb)
## Random Forest
##
## 71 samples
## 19 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 64, 65, 63, 64, 64, 65, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 30 18.73570 0.2525681 15.50596
## 31 18.70750 0.2451678 15.45984
## 32 18.57652 0.2725282 15.32741
## 33 18.65256 0.2607360 15.38888
## 34 18.51732 0.2810444 15.31113
## 35 18.66050 0.2582400 15.36201
## 36 18.58040 0.2564149 15.36454
## 37 18.72205 0.2514925 15.45332
## 38 18.54462 0.2531552 15.31387
## 39 18.48582 0.2632008 15.35211
## 40 18.69846 0.2557871 15.39698
## 41 18.66879 0.2623290 15.47636
## 42 18.64211 0.2692837 15.31389
## 43 18.60960 0.2527320 15.39425
## 44 18.55368 0.2594919 15.33702
## 45 18.74672 0.2569804 15.54139
## 46 18.61060 0.2517964 15.33594
## 47 18.64190 0.2600063 15.38536
## 48 18.72513 0.2569499 15.48287
## 49 18.58093 0.2566612 15.35687
## 50 18.59269 0.2564299 15.37035
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 39.
We then saved the best mtry value to use as a tune for the next model build.
best_mtry_urb <- rf_mtry_urb$bestTune$mtry
best_mtry_urb
## [1] 39
Although there are more parameters we could have tuned, we felt that tuning only the mtry produced a sufficient model for prediction.
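For reference, other randomForest arguments such as ntree or maxnodes can also be passed through caret’s train(); a sketch of what that tuning could look like (not part of our final fit):
set.seed(1234)
rf_more_trees <- train(lapop1share~.,
                       data = data_train,
                       method = "rf",
                       tuneGrid = expand.grid(.mtry = best_mtry_urb),
                       trControl = trControl,
                       ntree = 1000,    # grow more trees than the default of 500
                       maxnodes = 30)   # cap the size of each individual tree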
set.seed(1234)
tuneGrid <- expand.grid(.mtry = best_mtry_urb)
rf_fit_urb <- train(lapop1share~.,
data = data_train,
method = "rf",
tuneGrid = tuneGrid,
trControl = trControl)
rf_fit_urb
## Random Forest
##
## 71 samples
## 19 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 64, 65, 63, 64, 64, 65, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 18.62039 0.2710515 15.37189
##
## Tuning parameter 'mtry' was held constant at a value of 39
rf_fit_urb$results
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 39 18.62039 0.2710515 15.37189 3.096944 0.2340185 2.794225
After fitting our model, we want to see how well it can be used for prediction. Before we do that, we need to install and load another R package. We decided to use the mean squared error (MSE) on the test set to see how well we built the model.
#install.packages("ModelMetrics")
library(ModelMetrics)
prediction_urb <-predict(rf_fit_urb, data_test)
mse(prediction_urb,data_test$lapop1share)
## [1] 382.0019
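The same quantity can also be computed directly, which makes the definition of MSE explicit (a one-line check):
mean((prediction_urb - data_test$lapop1share)^2)   # mean squared prediction error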
Another thing we can do is see which variables our random forest deemed important. This will help us understand what combination of conditions is associated with food insecurity.
var_imp_urb <- varImp(rf_fit_urb)
var_imp_urb
## rf variable importance
##
## Overall
## lakids1 100.0000
## lapop10share 67.9059
## lahisp1 56.8252
## laseniors1 43.2497
## PCTGQTRS 39.3404
## lablack1 31.3000
## lakids10 27.5151
## PovertyRate 16.8594
## MedianFamilyIncome 15.6567
## LAPOP1_10 13.9206
## lahunv1 13.2690
## lasnap1 12.0784
## laseniors10 10.6076
## lahunv10 8.8257
## lahisp10 8.2208
## lablack10 7.8333
## lasnap10 6.3670
## Urban 0.9867
## LILATracts_1And10 0.0000
matImp<-as.matrix(var_imp_urb$importance)
impDF<-data.frame(vars=rownames(matImp),
importance=matImp[,1])
#str(matImp)
impDF$vars<-factor(impDF$vars, levels = impDF$vars[order(impDF$importance)])
ggplot(impDF, aes(x=importance, y=vars))+
geom_col()+
ggtitle("Variable Importance")
We will build this model with lapop10share, the share of the tract’s population living more than ten miles from a grocery store, as the response variable.
Now we will repeat the process of model building.
set.seed(1234)
# Run the model
rf_default_rur <- train(lapop10share~.,
data = data_train,
method = "rf",
trControl = trControl)
# Print the results
print(rf_default_rur)
## Random Forest
##
## 71 samples
## 19 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 63, 65, 64, 64, 65, 64, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 10.68973 0.9385433 7.353801
## 10 10.12786 0.9226399 6.351505
## 19 10.49905 0.9051555 6.409830
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 10.
set.seed(1234)
# Run the model
tuneGrid <- expand.grid(.mtry = c(30: 50))
rf_mtry_rur <- train(lapop10share~.,
data = data_train,
method = "rf",
tuneGrid = tuneGrid,
trControl = trControl)
# Print the results
print(rf_mtry_rur)
## Random Forest
##
## 71 samples
## 19 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 63, 65, 64, 64, 65, 64, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 30 10.35124 0.9066682 6.333070
## 31 10.43962 0.9049379 6.395796
## 32 10.36696 0.9025078 6.307175
## 33 10.53206 0.9008418 6.361129
## 34 10.40615 0.9017172 6.310621
## 35 10.48025 0.8996820 6.384833
## 36 10.52830 0.8986936 6.466168
## 37 10.66780 0.8998885 6.542992
## 38 10.57192 0.8989280 6.381574
## 39 10.35507 0.9072573 6.257975
## 40 10.41817 0.9034612 6.303329
## 41 10.54909 0.9017272 6.407191
## 42 10.51899 0.8985665 6.354703
## 43 10.59874 0.8997011 6.465372
## 44 10.66926 0.9026052 6.513557
## 45 10.50207 0.9027343 6.366676
## 46 10.38660 0.9037762 6.427466
## 47 10.61160 0.9033198 6.432779
## 48 10.45948 0.9013762 6.309391
## 49 10.49366 0.9016831 6.411619
## 50 10.49278 0.9005667 6.377281
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 30.
best_mtry_rur <- rf_mtry_rur$bestTune$mtry
best_mtry_rur
## [1] 30
set.seed(1234)
tuneGrid <- expand.grid(.mtry = best_mtry_rur)
rf_fit_rur <- train(lapop10share~.,
data = data_train,
method = "rf",
tuneGrid = tuneGrid,
trControl = trControl)
rf_fit_rur
## Random Forest
##
## 71 samples
## 19 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 63, 65, 64, 64, 65, 64, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 10.39212 0.9037354 6.321981
##
## Tuning parameter 'mtry' was held constant at a value of 30
rf_fit_rur$results
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 30 10.39212 0.9037354 6.321981 5.50931 0.1086655 3.151746
prediction_rur <-predict(rf_fit_rur, data_test)
mse(prediction_rur,data_test$lapop10share)
## [1] 4418.325
var_imp_rur <- varImp(rf_fit_rur)
var_imp_rur
## rf variable importance
##
## Overall
## lakids10 100.0000
## laseniors10 75.7220
## lasnap10 9.2479
## lahisp10 7.2082
## lahunv10 6.4860
## LAPOP1_10 3.5686
## lablack10 2.8957
## MedianFamilyIncome 2.6656
## lablack1 1.3422
## lahunv1 1.1465
## lakids1 1.1341
## laseniors1 0.9493
## lahisp1 0.9452
## lasnap1 0.8778
## lapop1share 0.6148
## PCTGQTRS 0.4586
## PovertyRate 0.3801
## LILATracts_1And10 0.2278
## Urban 0.0000
matImp<-as.matrix(var_imp_rur$importance)
impDF<-data.frame(vars=rownames(matImp),
importance=matImp[,1])
#str(matImp)
impDF$vars<-factor(impDF$vars, levels = impDF$vars[order(impDF$importance)])
ggplot(impDF, aes(x=importance, y=vars))+
geom_col()+
ggtitle("Variable Importance")
Since linear regression models built from many explanatory variables tend to be hard to interpret, we also turned to decision trees, which produce a much more intuitive model. Here we have to install another R package as well.
#install.packages("rpart")
library(rpart)
Here we built a classification tree with LILATracts_1And10, the flag for low-income, low-access tracts, as the response variable.
tree_lali_flag<-rpart(LILATracts_1And10~., data=final_data,
control=rpart.control(minsplit=1),
method="class")
summary(tree_lali_flag)
## Call:
## rpart(formula = LILATracts_1And10 ~ ., data = final_data, method = "class",
## control = rpart.control(minsplit = 1))
## n= 142
##
## CP nsplit rel error xerror xstd
## 1 0.46428571 0 1.00000000 1.0000000 0.16932818
## 2 0.03571429 2 0.07142857 0.1785714 0.07844099
## 3 0.01000000 4 0.00000000 0.2500000 0.09213268
##
## Variable importance
## MedianFamilyIncome LAPOP1_10 lasnap10 laseniors10
## 19 16 12 12
## lapop10share lahisp10 lakids10 PovertyRate
## 10 10 9 7
## lahunv10 lablack10 PCTGQTRS
## 4 2 1
##
## Node number 1: 142 observations, complexity param=0.4642857
## predicted class=0 expected loss=0.1971831 P(node) =1
## class counts: 114 28
## probabilities: 0.803 0.197
## left son=2 (100 obs) right son=3 (42 obs)
## Primary splits:
## MedianFamilyIncome < 56174 to the right, improve=21.22822, (0 missing)
## lasnap10 < 63.5 to the left, improve=18.29224, (0 missing)
## LAPOP1_10 < 504.5 to the left, improve=16.95775, (0 missing)
## lahunv10 < 21.5 to the left, improve=13.65418, (0 missing)
## laseniors10 < 122 to the left, improve=12.19665, (0 missing)
## Surrogate splits:
## PovertyRate < 16.1 to the left, agree=0.824, adj=0.405, (0 split)
## lahunv10 < 21.5 to the left, agree=0.775, adj=0.238, (0 split)
## lasnap10 < 63.5 to the left, agree=0.754, adj=0.167, (0 split)
## laseniors10 < 227 to the left, agree=0.732, adj=0.095, (0 split)
## lablack10 < 4.5 to the left, agree=0.732, adj=0.095, (0 split)
##
## Node number 2: 100 observations, complexity param=0.03571429
## predicted class=0 expected loss=0.02 P(node) =0.7042254
## class counts: 98 2
## probabilities: 0.980 0.020
## left son=4 (93 obs) right son=5 (7 obs)
## Primary splits:
## lasnap10 < 80.5 to the left, improve=1.0628570, (0 missing)
## laseniors10 < 481 to the left, improve=0.9404082, (0 missing)
## lapop10share < 89.35 to the left, improve=0.9404082, (0 missing)
## MedianFamilyIncome < 57865.5 to the right, improve=0.8088889, (0 missing)
## lahisp10 < 42.5 to the left, improve=0.5866667, (0 missing)
## Surrogate splits:
## laseniors10 < 332.5 to the left, agree=0.98, adj=0.714, (0 split)
## lakids10 < 310.5 to the left, agree=0.97, adj=0.571, (0 split)
## lapop10share < 49.575 to the left, agree=0.97, adj=0.571, (0 split)
## LAPOP1_10 < 1512 to the left, agree=0.96, adj=0.429, (0 split)
## lahunv10 < 24.5 to the left, agree=0.96, adj=0.429, (0 split)
##
## Node number 3: 42 observations, complexity param=0.4642857
## predicted class=1 expected loss=0.3809524 P(node) =0.2957746
## class counts: 16 26
## probabilities: 0.381 0.619
## left son=6 (16 obs) right son=7 (26 obs)
## Primary splits:
## LAPOP1_10 < 483 to the left, improve=19.809520, (0 missing)
## lapop10share < 18.16 to the left, improve=11.082250, (0 missing)
## laseniors10 < 130 to the left, improve=10.070390, (0 missing)
## lasnap10 < 61 to the left, improve=10.070390, (0 missing)
## lakids10 < 100.5 to the left, improve= 9.333333, (0 missing)
## Surrogate splits:
## laseniors10 < 85 to the left, agree=0.857, adj=0.625, (0 split)
## lapop10share < 13.495 to the left, agree=0.857, adj=0.625, (0 split)
## lakids10 < 64.5 to the left, agree=0.833, adj=0.563, (0 split)
## lahisp10 < 3 to the left, agree=0.833, adj=0.563, (0 split)
## lasnap10 < 17.5 to the left, agree=0.833, adj=0.563, (0 split)
##
## Node number 4: 93 observations
## predicted class=0 expected loss=0 P(node) =0.6549296
## class counts: 93 0
## probabilities: 1.000 0.000
##
## Node number 5: 7 observations, complexity param=0.03571429
## predicted class=0 expected loss=0.2857143 P(node) =0.04929577
## class counts: 5 2
## probabilities: 0.714 0.286
## left son=10 (5 obs) right son=11 (2 obs)
## Primary splits:
## MedianFamilyIncome < 58968.5 to the right, improve=2.857143, (0 missing)
## PCTGQTRS < 1.075 to the left, improve=1.523810, (0 missing)
## lahisp10 < 66.5 to the right, improve=1.523810, (0 missing)
## LAPOP1_10 < 1243 to the right, improve=1.190476, (0 missing)
## lakids1 < 456 to the right, improve=1.190476, (0 missing)
## Surrogate splits:
## PCTGQTRS < 1.075 to the left, agree=0.857, adj=0.5, (0 split)
## lahisp10 < 66.5 to the right, agree=0.857, adj=0.5, (0 split)
##
## Node number 6: 16 observations
## predicted class=0 expected loss=0 P(node) =0.1126761
## class counts: 16 0
## probabilities: 1.000 0.000
##
## Node number 7: 26 observations
## predicted class=1 expected loss=0 P(node) =0.1830986
## class counts: 0 26
## probabilities: 0.000 1.000
##
## Node number 10: 5 observations
## predicted class=0 expected loss=0 P(node) =0.03521127
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 11: 2 observations
## predicted class=1 expected loss=0 P(node) =0.01408451
## class counts: 0 2
## probabilities: 0.000 1.000
After building the tree, we visualize it.
par(mfrow=c(1,1))
plot(tree_lali_flag , uniform=TRUE,margin=0.2,
main="Classification Tree for Low Access Low Income Flag")
text(tree_lali_flag , use.n=TRUE, all=TRUE, cex=.8)
Here is the same tree, but visualized with a different style. It uses a different R package, so that package needed to be installed and loaded as well.
#install.packages("rpart.plot")
library(rpart.plot)
prp(tree_lali_flag, faclen = 0, cex = 0.7, extra=1, space=.5)
Next we wanted to predict a binary response: whether a tract will be flagged as low income and low access. We built a series of regression models with glm(), again using LILATracts_1And10 as our response variable.
But first we need to load another R package into our environment.
library(leaps)
To see which of our variables from the final_data dataset should be used for each model size, we used best subset selection (an exhaustive search via regsubsets) to pick the best combination of explanatory variables for models with one up to eight explanatory variables.
regfit_LILATracts_1And10 <- regsubsets(LILATracts_1And10~.,data=final_data)
summary(regfit_LILATracts_1And10)
## Subset selection object
## Call: regsubsets.formula(LILATracts_1And10 ~ ., data = final_data)
## 19 Variables (and intercept)
## Forced in Forced out
## LAPOP1_10 FALSE FALSE
## Urban FALSE FALSE
## PCTGQTRS FALSE FALSE
## PovertyRate FALSE FALSE
## MedianFamilyIncome FALSE FALSE
## lakids1 FALSE FALSE
## laseniors1 FALSE FALSE
## lablack1 FALSE FALSE
## lahisp1 FALSE FALSE
## lasnap1 FALSE FALSE
## lakids10 FALSE FALSE
## laseniors10 FALSE FALSE
## lablack10 FALSE FALSE
## lahisp10 FALSE FALSE
## lasnap10 FALSE FALSE
## lahunv1 FALSE FALSE
## lahunv10 FALSE FALSE
## lapop1share FALSE FALSE
## lapop10share FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## LAPOP1_10 Urban PCTGQTRS PovertyRate MedianFamilyIncome lakids1
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " "
## 3 ( 1 ) " " "*" " " " " "*" " "
## 4 ( 1 ) " " " " " " " " "*" " "
## 5 ( 1 ) " " "*" " " " " "*" " "
## 6 ( 1 ) " " "*" " " " " "*" " "
## 7 ( 1 ) " " "*" " " " " "*" " "
## 8 ( 1 ) " " "*" " " " " "*" " "
## laseniors1 lablack1 lahisp1 lasnap1 lakids10 laseniors10 lablack10
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " "*" " " " "
## 5 ( 1 ) " " " " " " " " "*" " " " "
## 6 ( 1 ) " " " " " " " " "*" "*" " "
## 7 ( 1 ) " " " " " " " " "*" "*" " "
## 8 ( 1 ) "*" " " " " " " "*" "*" " "
## lahisp10 lasnap10 lahunv1 lahunv10 lapop1share lapop10share
## 1 ( 1 ) " " "*" " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " "*"
## 5 ( 1 ) " " "*" " " " " " " "*"
## 6 ( 1 ) " " "*" " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " "*" "*"
## 8 ( 1 ) " " "*" " " " " "*" "*"
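The selected variables can also be read off programmatically rather than from the printed table; a small sketch using the summary object:
reg_sum <- summary(regfit_LILATracts_1And10)
# Variables included in, for example, the three-variable model
names(which(reg_sum$which[3, ]))
# Model size with the lowest BIC among the candidates
which.min(reg_sum$bic)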
For this model and all the rest, we take the variables indicated above as the best subset for each model size and use them as the explanatory variables in the corresponding model.
var_2_mod <- glm(LILATracts_1And10~lasnap10,data=data_train)
summary(var_2_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10, data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.63942 -0.13925 -0.07699 -0.05921 0.94079
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0592054 0.0444331 1.332 0.187
## lasnap10 0.0029647 0.0005279 5.616 3.8e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1056245)
##
## Null deviance: 10.6197 on 70 degrees of freedom
## Residual deviance: 7.2881 on 69 degrees of freedom
## AIC: 45.862
##
## Number of Fisher Scoring iterations: 2
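Note that glm() with its default gaussian family, as in the call above, fits a linear probability model. A true logistic regression for this binary flag would specify family = binomial; a minimal sketch:
logit_mod <- glm(LILATracts_1And10 ~ lasnap10,
                 family = binomial, data = data_train)
summary(logit_mod)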
For every model, we also create a confusion matrix and use it to compute the model’s error rate, which will be our standard of performance for comparing the models.
pred1 <- predict(var_2_mod,data = data_test,type="response")
conf_mat1<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred1>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat1
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 111
## 2 0 TRUE 3
## 3 1 FALSE 27
## 4 1 TRUE 1
correct <- conf_mat1$n[c(1,4)]
er_1 <- 1-sum(correct)/sum(conf_mat1$n)
er_1
## [1] 0.2112676
var_3_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome,data=data_train)
summary(var_3_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome,
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.44765 -0.18252 -0.06993 0.14692 0.78251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.023e+00 1.952e-01 5.241 1.70e-06 ***
## lasnap10 2.021e-03 4.910e-04 4.117 0.000106 ***
## MedianFamilyIncome -1.501e-05 2.981e-06 -5.035 3.74e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07807051)
##
## Null deviance: 10.6197 on 70 degrees of freedom
## Residual deviance: 5.3088 on 68 degrees of freedom
## AIC: 25.364
##
## Number of Fisher Scoring iterations: 2
pred2 <- predict(var_3_mod,data = data_test,type="response")
conf_mat2<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred2>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat2
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 103
## 2 0 TRUE 11
## 3 1 FALSE 25
## 4 1 TRUE 3
correct2 <- conf_mat2$n[c(1,4)]
er_2 <- 1-sum(correct2)/sum(conf_mat2$n)
er_2
## [1] 0.2535211
var_4_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban,data=data_train)
summary(var_4_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome +
## Urban, data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.46390 -0.18143 -0.04975 0.13826 0.63090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.062e+00 1.933e-01 5.495 6.53e-07 ***
## lasnap10 2.054e-03 4.835e-04 4.248 6.80e-05 ***
## MedianFamilyIncome -1.594e-05 2.978e-06 -5.351 1.14e-06 ***
## Urban 2.341e-01 1.303e-01 1.797 0.0768 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07559208)
##
## Null deviance: 10.6197 on 70 degrees of freedom
## Residual deviance: 5.0647 on 67 degrees of freedom
## AIC: 24.022
##
## Number of Fisher Scoring iterations: 2
pred3 <- predict(var_4_mod,data = data_test,type="response")
conf_mat3<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred3>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat3
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 103
## 2 0 TRUE 11
## 3 1 FALSE 25
## 4 1 TRUE 3
correct3 <- conf_mat3$n[c(1,4)]
er_3 <- 1-sum(correct3)/sum(conf_mat3$n)
er_3
## [1] 0.2535211
var_5_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids1,data=data_train)
summary(var_5_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome +
## Urban + lakids1, data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4611 -0.1734 -0.0538 0.1275 0.6169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.071e+00 1.953e-01 5.481 7.13e-07 ***
## lasnap10 2.074e-03 4.884e-04 4.247 6.94e-05 ***
## MedianFamilyIncome -1.565e-05 3.063e-06 -5.108 2.98e-06 ***
## Urban 2.302e-01 1.313e-01 1.753 0.0843 .
## lakids1 -4.294e-05 9.456e-05 -0.454 0.6512
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07649838)
##
## Null deviance: 10.6197 on 70 degrees of freedom
## Residual deviance: 5.0489 on 66 degrees of freedom
## AIC: 25.8
##
## Number of Fisher Scoring iterations: 2
pred4 <- predict(var_5_mod,data = data_test,type="response")
conf_mat4<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred4>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat4
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 103
## 2 0 TRUE 11
## 3 1 FALSE 25
## 4 1 TRUE 3
correct4 <- conf_mat4$n[c(1,4)]
er_4 <- 1-sum(correct4)/sum(conf_mat4$n)
er_4
## [1] 0.2535211
var_6_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids1+lahunv10,
data=data_train)
summary(var_6_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome +
## Urban + lakids1 + lahunv10, data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.39008 -0.18087 -0.05161 0.09066 0.68533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.039e+00 1.921e-01 5.407 9.82e-07 ***
## lasnap10 8.415e-04 7.978e-04 1.055 0.2954
## MedianFamilyIncome -1.545e-05 3.003e-06 -5.145 2.67e-06 ***
## Urban 2.288e-01 1.287e-01 1.777 0.0802 .
## lakids1 -1.699e-05 9.364e-05 -0.181 0.8566
## lahunv10 7.686e-03 3.981e-03 1.931 0.0579 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07346162)
##
## Null deviance: 10.620 on 70 degrees of freedom
## Residual deviance: 4.775 on 65 degrees of freedom
## AIC: 23.84
##
## Number of Fisher Scoring iterations: 2
pred5 <- predict(var_6_mod,data = data_test,type="response")
conf_mat5<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred5>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat5
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 102
## 2 0 TRUE 12
## 3 1 FALSE 24
## 4 1 TRUE 4
correct5 <- conf_mat5$n[c(1,4)]
er_5 <- 1-sum(correct5)/sum(conf_mat5$n)
er_5
## [1] 0.2535211
var_7_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids10+lahunv10+
lahunv1,data=data_train)
summary(var_7_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome +
## Urban + lakids10 + lahunv10 + lahunv1, data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.38997 -0.17647 -0.05468 0.09575 0.68514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.047e+00 1.922e-01 5.446 8.73e-07 ***
## lasnap10 1.025e-03 1.046e-03 0.980 0.3307
## MedianFamilyIncome -1.520e-05 2.983e-06 -5.096 3.30e-06 ***
## Urban 2.179e-01 1.299e-01 1.677 0.0985 .
## lakids10 -1.968e-04 5.139e-04 -0.383 0.7030
## lahunv10 9.096e-03 4.309e-03 2.111 0.0387 *
## lahunv1 -8.238e-04 9.411e-04 -0.875 0.3847
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.0736832)
##
## Null deviance: 10.6197 on 70 degrees of freedom
## Residual deviance: 4.7157 on 64 degrees of freedom
## AIC: 24.953
##
## Number of Fisher Scoring iterations: 2
pred6 <- predict(var_7_mod,data = data_test,type="response")
conf_mat6<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred6>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat6
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 102
## 2 0 TRUE 12
## 3 1 FALSE 24
## 4 1 TRUE 4
correct6 <- conf_mat6$n[c(1,4)]
er_6 <- 1-sum(correct6)/sum(conf_mat6$n)
er_6
## [1] 0.2535211
var_8_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids10+lahunv10+
lahunv1+lablack10,data=data_train)
summary(var_8_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome +
## Urban + lakids10 + lahunv10 + lahunv1 + lablack10, data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.36020 -0.17269 -0.06586 0.11719 0.71956
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.027e+00 1.924e-01 5.339 1.36e-06 ***
## lasnap10 8.506e-04 1.054e-03 0.807 0.4226
## MedianFamilyIncome -1.486e-05 2.989e-06 -4.972 5.39e-06 ***
## Urban 2.028e-01 1.302e-01 1.558 0.1243
## lakids10 -3.381e-04 5.265e-04 -0.642 0.5231
## lahunv10 9.043e-03 4.297e-03 2.104 0.0393 *
## lahunv1 -1.054e-03 9.589e-04 -1.099 0.2760
## lablack10 1.692e-02 1.449e-02 1.168 0.2474
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.0732674)
##
## Null deviance: 10.6197 on 70 degrees of freedom
## Residual deviance: 4.6158 on 63 degrees of freedom
## AIC: 25.433
##
## Number of Fisher Scoring iterations: 2
pred7 <- predict(var_8_mod,data = data_test,type="response")
conf_mat7<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred7>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat7
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 103
## 2 0 TRUE 11
## 3 1 FALSE 25
## 4 1 TRUE 3
correct7 <- conf_mat7$n[c(1,4)]
er_7 <- 1-sum(correct7)/sum(conf_mat7$n)
er_7
## [1] 0.2535211
var_9_mod <- glm(LILATracts_1And10~lasnap10+MedianFamilyIncome+Urban+lakids10+lahunv10+
lahunv1+lablack10+lahisp1,data=data_train)
summary(var_9_mod)
##
## Call:
## glm(formula = LILATracts_1And10 ~ lasnap10 + MedianFamilyIncome +
## Urban + lakids10 + lahunv10 + lahunv1 + lablack10 + lahisp1,
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.37802 -0.16790 -0.05736 0.11733 0.69978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.038e+00 1.934e-01 5.366 1.27e-06 ***
## lasnap10 8.895e-04 1.058e-03 0.841 0.4037
## MedianFamilyIncome -1.467e-05 3.008e-06 -4.876 7.87e-06 ***
## Urban 1.926e-01 1.312e-01 1.468 0.1471
## lakids10 -3.246e-04 5.283e-04 -0.614 0.5412
## lahunv10 8.168e-03 4.448e-03 1.837 0.0711 .
## lahunv1 -8.850e-04 9.849e-04 -0.899 0.3723
## lablack10 1.953e-02 1.490e-02 1.311 0.1948
## lahisp1 -1.784e-04 2.244e-04 -0.795 0.4295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07369738)
##
## Null deviance: 10.6197 on 70 degrees of freedom
## Residual deviance: 4.5692 on 62 degrees of freedom
## AIC: 26.713
##
## Number of Fisher Scoring iterations: 2
pred8 <- predict(var_9_mod,data = data_test,type="response")
conf_mat8<-data.frame(LILATracts_1And10=final_data$LILATracts_1And10,
pred_LILATracts_1And10=pred8>.5)%>%
group_by(LILATracts_1And10, pred_LILATracts_1And10)%>%
summarise(n=n())
## Warning in data.frame(LILATracts_1And10 = final_data$LILATracts_1And10, : row
## names were found from a short variable and have been discarded
## `summarise()` has grouped output by 'LILATracts_1And10'. You can override using the `.groups` argument.
conf_mat8
## # A tibble: 4 × 3
## # Groups: LILATracts_1And10 [2]
## LILATracts_1And10 pred_LILATracts_1And10 n
## <dbl> <lgl> <int>
## 1 0 FALSE 101
## 2 0 TRUE 13
## 3 1 FALSE 25
## 4 1 TRUE 3
correct8 <- conf_mat8$n[c(1,4)]
er_8 <- 1-sum(correct8)/sum(conf_mat8$n)
er_8
## [1] 0.2676056
For our final model, we chose the one with the lowest error rate. To see this easily, we first made a data frame of all the error rates and their corresponding number of explanatory variables. Then we created a scatter plot overlaid with a line chart to show which model performed best.
error <- data.frame(number_of_exp_vars=c(1,2,3,4,5,6,7,8),error_rate=c(er_1,er_2,er_3,er_4,er_5,
er_6,er_7,er_8))
str(error)
## 'data.frame': 8 obs. of 2 variables:
## $ number_of_exp_vars: num 1 2 3 4 5 6 7 8
## $ error_rate : num 0.211 0.254 0.254 0.254 0.254 ...
ggplot(error,aes(number_of_exp_vars,error_rate))+
geom_point()+
geom_line()
From the graph we can clearly see that the simplest model, with only one explanatory variable, was the most accurate at predicting low-income, low-access tracts in Oregon.