library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.2
# Load the census-tract data set from the CSV in the working directory.
OR_Data <- read_csv(file = "OR_Data.csv")
## Rows: 826 Columns: 83
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): State, County
## dbl (81): CensusTract, Urban, Pop2010, OHU2010, GroupQuartersFlag, NUMGQTRS,...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop the third column (selected by position) before modelling.
# NOTE(review): presumably this is `County` -- confirm against the CSV header.
OR_Data <- OR_Data[-3]
We chose Urban as our categorical variable. It is a binary flag (1 = urban, 0 = not urban) indicating whether a given census tract is classified as Urban.
# Exploratory plot of TractSNAP against TractLOWI, coloured by the Urban flag.
# Both axes are continuous, so points are drawn instead of geom_boxplot():
# a boxplot needs a discrete (or explicitly grouped) x, otherwise each colour
# group collapses into a single box placed at an arbitrary x position.
ggplot(OR_Data, aes(x = TractLOWI, y = TractSNAP, color = as.factor(Urban))) +
  geom_point(alpha = 0.6) +
  labs(color = "Urban")
# Recode the 0/1 Urban flag as a "No"/"Yes" label so it can be used as the
# response of a classification tree.
Y_N_Urban <- ifelse(OR_Data$Urban == 0, "No", "Yes")
# data.frame() appends the label as a column named Y_N_Urban and turns the
# tibble returned by read_csv() into a plain data frame.
OR_Data <- data.frame(OR_Data, Y_N_Urban)

# rpart runs an internal cross-validation (the xerror / xstd columns of the CP
# table), which draws random numbers. Seed the generator so the summary below
# is reproducible from run to run.
set.seed(1)
# Use every other column as a predictor except the raw Urban flag, which the
# response was derived from.
tree.Urban1 <- rpart(Y_N_Urban ~ . - Urban, data = OR_Data, method = "class")
prp(tree.Urban1, faclen = 0, cex = 0.7, extra = 1, space = 0.5)
summary(tree.Urban1)
## Call:
## rpart(formula = Y_N_Urban ~ . - Urban, data = OR_Data, method = "class")
## n= 826
##
## CP nsplit rel error xerror xstd
## 1 0.55895197 0 1.00000000 1.00000000 0.05617970
## 2 0.36244541 1 0.44104803 0.49781659 0.04328794
## 3 0.01746725 2 0.07860262 0.12663755 0.02309950
## 4 0.01000000 4 0.04366812 0.07860262 0.01832384
##
## Variable importance
## laseniors1share LATracts1 lalowi1share laseniors1 lawhite1share
## 14 13 13 11 11
## lapop1share laaian1share LA1and10 TractBlack TractAsian
## 11 10 6 2 2
## TractHUNV lablack1 lablack1share
## 2 1 1
##
## Node number 1: 826 observations, complexity param=0.558952
## predicted class=Yes expected loss=0.2772397 P(node) =1
## class counts: 229 597
## probabilities: 0.277 0.723
## left son=2 (296 obs) right son=3 (530 obs)
## Primary splits:
## laseniors1share < 6.665 to the right, improve=135.7587, (208 missing)
## lalowi1share < 11.075 to the right, improve=125.7511, (208 missing)
## lawhite1share < 39.7 to the right, improve=116.5217, (208 missing)
## TractBlack < 19.5 to the left, improve=114.4118, (0 missing)
## lapop1share < 41.575 to the right, improve=114.0017, (208 missing)
## Surrogate splits:
## laseniors1 < 267 to the right, agree=0.911, adj=0.814, (0 split)
## lawhite1share < 38.565 to the right, agree=0.911, adj=0.814, (0 split)
## lapop1share < 43.205 to the right, agree=0.895, adj=0.780, (0 split)
## lalowi1share < 11.31 to the right, agree=0.885, adj=0.760, (0 split)
## laaian1share < 0.365 to the right, agree=0.864, adj=0.716, (0 split)
##
## Node number 2: 296 observations, complexity param=0.3624454
## predicted class=No expected loss=0.2837838 P(node) =0.3583535
## class counts: 212 84
## probabilities: 0.716 0.284
## left son=4 (213 obs) right son=5 (83 obs)
## Primary splits:
## LATracts1 < 0.5 to the left, improve=118.33370, (0 missing)
## LAPOP1_10 < 1480 to the left, improve= 67.54396, (83 missing)
## LA1and10 < 0.5 to the left, improve= 58.32099, (0 missing)
## LALOWI1_10 < 223.5 to the left, improve= 43.48152, (83 missing)
## laseniors1share < 12.51 to the right, improve= 21.29766, (0 missing)
## Surrogate splits:
## LA1and10 < 0.5 to the left, agree=0.841, adj=0.434, (0 split)
## lalowi1share < 9.19 to the right, agree=0.770, adj=0.181, (0 split)
## TractAsian < 82 to the left, agree=0.770, adj=0.181, (0 split)
## TractBlack < 26.5 to the left, agree=0.760, adj=0.145, (0 split)
## TractHUNV < 175.5 to the left, agree=0.760, adj=0.145, (0 split)
##
## Node number 3: 530 observations, complexity param=0.01746725
## predicted class=Yes expected loss=0.03207547 P(node) =0.6416465
## class counts: 17 513
## probabilities: 0.032 0.968
## left son=6 (25 obs) right son=7 (505 obs)
## Primary splits:
## lahisp1share < 6.47 to the right, improve=3.870423, (208 missing)
## laaian1share < 0.415 to the right, improve=3.842675, (208 missing)
## lalowi1share < 8.37 to the right, improve=3.230002, (208 missing)
## lasnap1share < 4.51 to the right, improve=3.160524, (208 missing)
## laomultir1share < 4.06 to the right, improve=2.763590, (208 missing)
## Surrogate splits:
## laomultir1share < 4.48 to the right, agree=0.978, adj=0.72, (0 split)
## laomultir1 < 314 to the right, agree=0.957, adj=0.44, (0 split)
## lahisp1 < 401.5 to the right, agree=0.957, adj=0.44, (0 split)
## lalowi1share < 16.445 to the right, agree=0.953, adj=0.40, (0 split)
## lalowi1 < 810 to the right, agree=0.947, adj=0.32, (0 split)
##
## Node number 4: 213 observations
## predicted class=No expected loss=0.004694836 P(node) =0.2578692
## class counts: 212 1
## probabilities: 0.995 0.005
##
## Node number 5: 83 observations
## predicted class=Yes expected loss=0 P(node) =0.1004843
## class counts: 0 83
## probabilities: 0.000 1.000
##
## Node number 6: 25 observations, complexity param=0.01746725
## predicted class=Yes expected loss=0.32 P(node) =0.03026634
## class counts: 8 17
## probabilities: 0.320 0.680
## left son=12 (8 obs) right son=13 (17 obs)
## Primary splits:
## LATracts1 < 0.5 to the left, improve=10.880000, (0 missing)
## LA1and10 < 0.5 to the left, improve= 8.991111, (0 missing)
## lablack1 < 12.5 to the left, improve= 4.350588, (0 missing)
## lablack1share < 0.265 to the left, improve= 4.350588, (0 missing)
## lawhite1 < 1683 to the left, improve= 3.931948, (0 missing)
## Surrogate splits:
## LA1and10 < 0.5 to the left, agree=0.96, adj=0.875, (0 split)
## lablack1 < 12.5 to the left, agree=0.84, adj=0.500, (0 split)
## lablack1share < 0.265 to the left, agree=0.84, adj=0.500, (0 split)
## TractBlack < 26 to the left, agree=0.84, adj=0.500, (0 split)
## lawhite1 < 1232 to the left, agree=0.80, adj=0.375, (0 split)
##
## Node number 7: 505 observations
## predicted class=Yes expected loss=0.01782178 P(node) =0.6113801
## class counts: 9 496
## probabilities: 0.018 0.982
##
## Node number 12: 8 observations
## predicted class=No expected loss=0 P(node) =0.00968523
## class counts: 8 0
## probabilities: 1.000 0.000
##
## Node number 13: 17 observations
## predicted class=Yes expected loss=0 P(node) =0.02058111
## class counts: 0 17
## probabilities: 0.000 1.000
Reading the tree: if the share of low access seniors at 1 mile is less than 6.7 and the Hispanic low access share at 1 mile is less than 6.5, the tract is classified as Urban. If the share of low access seniors at 1 mile is less than 6.7 but the Hispanic low access share at 1 mile is greater than or equal to 6.5, the tract is classified as Urban if it is considered low access, and as not Urban otherwise. If the share of low access seniors at 1 mile is greater than or equal to 6.7, the tract is likewise classified as Urban if it is considered low access, and as not Urban otherwise.
# Reproducible 50/50 train/test split on row indices.
set.seed(1)
# seq_len() is safe for an empty data frame (1:0 would yield c(1, 0)), and
# integer division keeps the sample size a whole number when the row count
# is odd.
train_data <- sample(seq_len(nrow(OR_Data)), nrow(OR_Data) %/% 2)

# Same model as the full-data tree, fitted on the training rows only.
tree.Urban2 <- rpart(Y_N_Urban ~ . - Urban,
                     data = OR_Data,
                     subset = train_data,
                     method = "class")
prp(tree.Urban2, faclen = 0, cex = 0.7, extra = 1, space = 0.5)

# True labels of the held-out rows (negative indexing drops the training rows).
OR.test <- OR_Data[-train_data, "Y_N_Urban"]
# Predict a class for every row, then keep only the held-out predictions.
tree.pred <- predict(tree.Urban2, newdata = OR_Data, type = "class")
test.pred <- tree.pred[-train_data]
# Confusion matrix: rows are predicted classes, columns are true classes.
cm <- table(test.pred, OR.test)
cm
## OR.test
## test.pred No Yes
## No 108 1
## Yes 9 295
# Build a "No"/"Yes" label from the 0/1 LowIncomeTracts flag and append it to
# the data frame as the response for a second classification tree.
# NOTE(review): the label and model are named "LowAcc" but are derived from
# LowIncomeTracts -- confirm which variable was intended.
Y_N_LowAcc <- ifelse(OR_Data[["LowIncomeTracts"]] == 0, "No", "Yes")
OR_Data <- data.frame(OR_Data, Y_N_LowAcc)

# Fit on the training rows chosen earlier (train_data), excluding the raw flag
# the response was derived from.
tree.lowAcc <- rpart(
  formula = Y_N_LowAcc ~ . - LowIncomeTracts,
  data = OR_Data,
  subset = train_data,
  method = "class"
)
prp(tree.lowAcc, faclen = 0, cex = 0.7, extra = 1, space = 0.5)

# Held-out truth versus held-out predictions.
OR.test <- OR_Data[["Y_N_LowAcc"]][-train_data]
tree.pred <- predict(tree.lowAcc, newdata = OR_Data, type = "class")
test.pred <- tree.pred[-train_data]
# Confusion matrix: rows are predicted classes, columns are true classes.
cm <- table(test.pred, OR.test)
print(cm)
## OR.test
## test.pred No Yes
## No 254 5
## Yes 3 151
# Print the fitted tree's CP table, variable importance and per-node split
# details for the model trained on the training subset.
summary(tree.lowAcc)
## Call:
## rpart(formula = Y_N_LowAcc ~ . - LowIncomeTracts, data = OR_Data,
## subset = train_data, method = "class")
## n= 413
##
## CP nsplit rel error xerror xstd
## 1 0.71069182 0 1.00000000 1.00000000 0.06219325
## 2 0.14465409 1 0.28930818 0.29559748 0.04058984
## 3 0.04402516 2 0.14465409 0.15094340 0.02990255
## 4 0.01000000 4 0.05660377 0.06918239 0.02057961
##
## Variable importance
## MedianFamilyIncome PovertyRate TractSNAP TractLOWI
## 32 20 10 9
## LILATracts_1And10 TractBlack TractAIAN TractAsian
## 8 4 4 3
## TractHUNV TractOMultir TractNHOPI TractHispanic
## 2 2 2 1
## PCTGQTRS NUMGQTRS TractKids
## 1 1 1
##
## Node number 1: 413 observations, complexity param=0.7106918
## predicted class=No expected loss=0.3849879 P(node) =1
## class counts: 254 159
## probabilities: 0.615 0.385
## left son=2 (300 obs) right son=3 (113 obs)
## Primary splits:
## MedianFamilyIncome < 58234.5 to the right, improve=116.16020, (2 missing)
## PovertyRate < 17.65 to the left, improve= 92.43622, (0 missing)
## TractSNAP < 402.5 to the left, improve= 60.68571, (0 missing)
## TractLOWI < 1643.5 to the left, improve= 58.94830, (0 missing)
## LILATracts_1And10 < 0.5 to the left, improve= 43.03390, (0 missing)
## Surrogate splits:
## PovertyRate < 17.55 to the left, agree=0.832, adj=0.378, (2 split)
## TractSNAP < 532.5 to the left, agree=0.820, adj=0.333, (0 split)
## TractLOWI < 2117.5 to the left, agree=0.810, adj=0.297, (0 split)
## LILATracts_1And10 < 0.5 to the left, agree=0.805, adj=0.279, (0 split)
## TractAIAN < 106.5 to the left, agree=0.764, adj=0.126, (0 split)
##
## Node number 2: 300 observations, complexity param=0.1446541
## predicted class=No expected loss=0.1533333 P(node) =0.7263923
## class counts: 254 46
## probabilities: 0.847 0.153
## left son=4 (277 obs) right son=5 (23 obs)
## Primary splits:
## PovertyRate < 19.9 to the left, improve=35.71283, (0 missing)
## MedianFamilyIncome < 68175.5 to the right, improve=18.04391, (0 missing)
## LILATracts_1And10 < 0.5 to the left, improve=13.30227, (0 missing)
## TractSNAP < 352.5 to the left, improve=12.42014, (0 missing)
## TractLOWI < 2606.5 to the left, improve=11.43370, (0 missing)
## Surrogate splits:
## PCTGQTRS < 32.315 to the left, agree=0.933, adj=0.130, (0 split)
## TractHUNV < 584.5 to the left, agree=0.933, adj=0.130, (0 split)
## NUMGQTRS < 871.5 to the left, agree=0.930, adj=0.087, (0 split)
## TractKids < 78.5 to the right, agree=0.930, adj=0.087, (0 split)
## OHU2010 < 604.5 to the right, agree=0.927, adj=0.043, (0 split)
##
## Node number 3: 113 observations
## predicted class=Yes expected loss=0 P(node) =0.2736077
## class counts: 0 113
## probabilities: 0.000 1.000
##
## Node number 4: 277 observations, complexity param=0.04402516
## predicted class=No expected loss=0.08303249 P(node) =0.6707022
## class counts: 254 23
## probabilities: 0.917 0.083
## left son=8 (207 obs) right son=9 (70 obs)
## Primary splits:
## MedianFamilyIncome < 68175.5 to the right, improve=11.294790, (0 missing)
## TractLOWI < 2729 to the left, improve= 6.279248, (0 missing)
## TractSNAP < 352.5 to the left, improve= 5.242500, (0 missing)
## TractOMultir < 779 to the left, improve= 5.081397, (0 missing)
## TractBlack < 264 to the left, improve= 4.479913, (0 missing)
## Surrogate splits:
## TractLOWI < 2113.5 to the left, agree=0.801, adj=0.214, (0 split)
## TractOMultir < 779 to the left, agree=0.783, adj=0.143, (0 split)
## TractSNAP < 351.5 to the left, agree=0.783, adj=0.143, (0 split)
## TractHispanic < 904.5 to the left, agree=0.776, adj=0.114, (0 split)
## TractAsian < 14.5 to the right, agree=0.773, adj=0.100, (0 split)
##
## Node number 5: 23 observations
## predicted class=Yes expected loss=0 P(node) =0.05569007
## class counts: 0 23
## probabilities: 0.000 1.000
##
## Node number 8: 207 observations
## predicted class=No expected loss=0 P(node) =0.5012107
## class counts: 207 0
## probabilities: 1.000 0.000
##
## Node number 9: 70 observations, complexity param=0.04402516
## predicted class=No expected loss=0.3285714 P(node) =0.1694915
## class counts: 47 23
## probabilities: 0.671 0.329
## left son=18 (50 obs) right son=19 (20 obs)
## Primary splits:
## TractBlack < 58 to the left, improve=15.225710, (0 missing)
## TractAsian < 85.5 to the left, improve=12.370640, (0 missing)
## TractNHOPI < 15 to the left, improve= 8.752847, (0 missing)
## TractOMultir < 174 to the left, improve= 8.396825, (0 missing)
## laaian1share < 0.085 to the right, improve= 8.062035, (8 missing)
## Surrogate splits:
## TractAsian < 154.5 to the left, agree=0.929, adj=0.75, (0 split)
## TractNHOPI < 23.5 to the left, agree=0.843, adj=0.45, (0 split)
## TractOMultir < 496 to the left, agree=0.814, adj=0.35, (0 split)
## TractHispanic < 590.5 to the left, agree=0.786, adj=0.25, (0 split)
## TractHUNV < 255 to the left, agree=0.786, adj=0.25, (0 split)
##
## Node number 18: 50 observations
## predicted class=No expected loss=0.12 P(node) =0.1210654
## class counts: 44 6
## probabilities: 0.880 0.120
##
## Node number 19: 20 observations
## predicted class=Yes expected loss=0.15 P(node) =0.04842615
## class counts: 3 17
## probabilities: 0.150 0.850
```