library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.1.2

# Read the CSV from the working directory into OR_Data.
OR_Data <- readr::read_csv(file = "OR_Data.csv")

## Rows: 826 Columns: 83

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): State, County
## dbl (81): CensusTract, Urban, Pop2010, OHU2010, GroupQuartersFlag, NUMGQTRS,...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Drop the third column by position.
# NOTE(review): a positional index silently breaks if the CSV column order
# changes -- confirm which column this is meant to remove.
OR_Data <- OR_Data[, -3]

Step 1:

We chose Urban as our categorical variable. It is a boolean that tells whether a given census tract is classified as Urban or not.

Step 2:

# Box plots of TractSNAP, one per Urban class (colour), with TractLOWI on x.
# NOTE(review): TractLOWI is numeric, so a box plot over it is unusual --
# confirm whether geom_point() or x = as.factor(Urban) was intended.
ggplot(data = OR_Data) +
  geom_boxplot(
    mapping = aes(x = TractLOWI, y = TractSNAP, color = as.factor(Urban))
  )

# Recode Urban as a text label ("No" when Urban is 0, "Yes" otherwise) and
# append it to the data as a new column.
Y_N_Urban <- ifelse(OR_Data$Urban == 0, "No", "Yes")
OR_Data <- data.frame(OR_Data, Y_N_Urban)

# Classification tree for the Urban label using every other column as a
# candidate predictor; the original Urban column is removed in the formula.
tree.Urban1 <- rpart(
  Y_N_Urban ~ . - Urban,
  data = OR_Data,
  method = "class"
)

# Draw the fitted tree.
prp(tree.Urban1, faclen = 0, cex = 0.7, extra = 1, space = 0.5)

# Print the detailed node-by-node summary of the fit.
summary(tree.Urban1)

## Call:
## rpart(formula = Y_N_Urban ~ . - Urban, data = OR_Data, method = "class")
##   n= 826 
## 
##           CP nsplit  rel error     xerror       xstd
## 1 0.55895197      0 1.00000000 1.00000000 0.05617970
## 2 0.36244541      1 0.44104803 0.49781659 0.04328794
## 3 0.01746725      2 0.07860262 0.12663755 0.02309950
## 4 0.01000000      4 0.04366812 0.07860262 0.01832384
## 
## Variable importance
## laseniors1share       LATracts1    lalowi1share      laseniors1   lawhite1share 
##              14              13              13              11              11 
##     lapop1share    laaian1share        LA1and10      TractBlack      TractAsian 
##              11              10               6               2               2 
##       TractHUNV        lablack1   lablack1share 
##               2               1               1 
## 
## Node number 1: 826 observations,    complexity param=0.558952
##   predicted class=Yes  expected loss=0.2772397  P(node) =1
##     class counts:   229   597
##    probabilities: 0.277 0.723 
##   left son=2 (296 obs) right son=3 (530 obs)
##   Primary splits:
##       laseniors1share < 6.665  to the right, improve=135.7587, (208 missing)
##       lalowi1share    < 11.075 to the right, improve=125.7511, (208 missing)
##       lawhite1share   < 39.7   to the right, improve=116.5217, (208 missing)
##       TractBlack      < 19.5   to the left,  improve=114.4118, (0 missing)
##       lapop1share     < 41.575 to the right, improve=114.0017, (208 missing)
##   Surrogate splits:
##       laseniors1    < 267    to the right, agree=0.911, adj=0.814, (0 split)
##       lawhite1share < 38.565 to the right, agree=0.911, adj=0.814, (0 split)
##       lapop1share   < 43.205 to the right, agree=0.895, adj=0.780, (0 split)
##       lalowi1share  < 11.31  to the right, agree=0.885, adj=0.760, (0 split)
##       laaian1share  < 0.365  to the right, agree=0.864, adj=0.716, (0 split)
## 
## Node number 2: 296 observations,    complexity param=0.3624454
##   predicted class=No   expected loss=0.2837838  P(node) =0.3583535
##     class counts:   212    84
##    probabilities: 0.716 0.284 
##   left son=4 (213 obs) right son=5 (83 obs)
##   Primary splits:
##       LATracts1       < 0.5    to the left,  improve=118.33370, (0 missing)
##       LAPOP1_10       < 1480   to the left,  improve= 67.54396, (83 missing)
##       LA1and10        < 0.5    to the left,  improve= 58.32099, (0 missing)
##       LALOWI1_10      < 223.5  to the left,  improve= 43.48152, (83 missing)
##       laseniors1share < 12.51  to the right, improve= 21.29766, (0 missing)
##   Surrogate splits:
##       LA1and10     < 0.5    to the left,  agree=0.841, adj=0.434, (0 split)
##       lalowi1share < 9.19   to the right, agree=0.770, adj=0.181, (0 split)
##       TractAsian   < 82     to the left,  agree=0.770, adj=0.181, (0 split)
##       TractBlack   < 26.5   to the left,  agree=0.760, adj=0.145, (0 split)
##       TractHUNV    < 175.5  to the left,  agree=0.760, adj=0.145, (0 split)
## 
## Node number 3: 530 observations,    complexity param=0.01746725
##   predicted class=Yes  expected loss=0.03207547  P(node) =0.6416465
##     class counts:    17   513
##    probabilities: 0.032 0.968 
##   left son=6 (25 obs) right son=7 (505 obs)
##   Primary splits:
##       lahisp1share    < 6.47   to the right, improve=3.870423, (208 missing)
##       laaian1share    < 0.415  to the right, improve=3.842675, (208 missing)
##       lalowi1share    < 8.37   to the right, improve=3.230002, (208 missing)
##       lasnap1share    < 4.51   to the right, improve=3.160524, (208 missing)
##       laomultir1share < 4.06   to the right, improve=2.763590, (208 missing)
##   Surrogate splits:
##       laomultir1share < 4.48   to the right, agree=0.978, adj=0.72, (0 split)
##       laomultir1      < 314    to the right, agree=0.957, adj=0.44, (0 split)
##       lahisp1         < 401.5  to the right, agree=0.957, adj=0.44, (0 split)
##       lalowi1share    < 16.445 to the right, agree=0.953, adj=0.40, (0 split)
##       lalowi1         < 810    to the right, agree=0.947, adj=0.32, (0 split)
## 
## Node number 4: 213 observations
##   predicted class=No   expected loss=0.004694836  P(node) =0.2578692
##     class counts:   212     1
##    probabilities: 0.995 0.005 
## 
## Node number 5: 83 observations
##   predicted class=Yes  expected loss=0  P(node) =0.1004843
##     class counts:     0    83
##    probabilities: 0.000 1.000 
## 
## Node number 6: 25 observations,    complexity param=0.01746725
##   predicted class=Yes  expected loss=0.32  P(node) =0.03026634
##     class counts:     8    17
##    probabilities: 0.320 0.680 
##   left son=12 (8 obs) right son=13 (17 obs)
##   Primary splits:
##       LATracts1     < 0.5    to the left,  improve=10.880000, (0 missing)
##       LA1and10      < 0.5    to the left,  improve= 8.991111, (0 missing)
##       lablack1      < 12.5   to the left,  improve= 4.350588, (0 missing)
##       lablack1share < 0.265  to the left,  improve= 4.350588, (0 missing)
##       lawhite1      < 1683   to the left,  improve= 3.931948, (0 missing)
##   Surrogate splits:
##       LA1and10      < 0.5    to the left,  agree=0.96, adj=0.875, (0 split)
##       lablack1      < 12.5   to the left,  agree=0.84, adj=0.500, (0 split)
##       lablack1share < 0.265  to the left,  agree=0.84, adj=0.500, (0 split)
##       TractBlack    < 26     to the left,  agree=0.84, adj=0.500, (0 split)
##       lawhite1      < 1232   to the left,  agree=0.80, adj=0.375, (0 split)
## 
## Node number 7: 505 observations
##   predicted class=Yes  expected loss=0.01782178  P(node) =0.6113801
##     class counts:     9   496
##    probabilities: 0.018 0.982 
## 
## Node number 12: 8 observations
##   predicted class=No   expected loss=0  P(node) =0.00968523
##     class counts:     8     0
##    probabilities: 1.000 0.000 
## 
## Node number 13: 17 observations
##   predicted class=Yes  expected loss=0  P(node) =0.02058111
##     class counts:     0    17
##    probabilities: 0.000 1.000

Reading the tree from the root: if the share of low access seniors at 1 mile is less than 6.7 and the Hispanic low access share at 1 mile is less than 6.5, the tract is predicted to be Urban. If the share of low access seniors at 1 mile is less than 6.7 but the Hispanic low access share at 1 mile is greater than or equal to 6.5, the tract is predicted to be Urban when it is considered low access, and not Urban otherwise. If the share of low access seniors at 1 mile is greater than or equal to 6.7, the tract is likewise predicted to be Urban when it is considered low access, and not Urban otherwise.

Step 3:

# Reproducible 50/50 split: draw half of the row indices as the training set.
# seq_len() is used instead of 1:nrow() so an empty data set cannot produce
# the index vector c(1, 0); floor() makes the rounding of an odd row count
# explicit.
set.seed(1)
train_data <- sample(seq_len(nrow(OR_Data)), floor(nrow(OR_Data) / 2))

# Refit the Urban classification tree on the training rows only.
tree.Urban2 <- rpart(
  Y_N_Urban ~ . - Urban,
  data = OR_Data,
  subset = train_data,
  method = "class"
)

prp(tree.Urban2, faclen = 0, cex = 0.7, extra = 1, space = 0.5)

# Confusion matrix on the held-out rows: predicted class (rows) against the
# true label (columns). The object names are kept as-is because table() uses
# them as the printed dimension headers.
OR.test <- OR_Data[-train_data, "Y_N_Urban"]
tree.pred <- predict(tree.Urban2, newdata = OR_Data, type = "class")
test.pred <- tree.pred[-train_data]
cm <- table(test.pred, OR.test)
cm

##          OR.test
## test.pred  No Yes
##       No  108   1
##       Yes   9 295

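Repeating the train/test split for a second tract-level response. (Note: the code below builds this response from `LowIncomeTracts`, although the objects are labelled as low access.)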

# Second response: "No" where LowIncomeTracts is 0, "Yes" otherwise.
# NOTE(review): the objects are named "LowAcc" but the label is derived from
# LowIncomeTracts -- confirm which tract flag was intended.
Y_N_LowAcc <- ifelse(OR_Data$LowIncomeTracts == 0, "No", "Yes")
OR_Data <- data.frame(OR_Data, Y_N_LowAcc)

# Fit a classification tree for the new label on the training rows selected
# by train_data, leaving the column it was derived from out of the formula.
tree.lowAcc <- rpart(
  Y_N_LowAcc ~ . - LowIncomeTracts,
  data = OR_Data,
  subset = train_data,
  method = "class"
)

prp(tree.lowAcc, faclen = 0, cex = 0.7, extra = 1, space = 0.5)

# Confusion matrix on the held-out rows: predicted class (rows) against the
# true label (columns). The object names double as the printed table headers.
OR.test <- OR_Data[-train_data, "Y_N_LowAcc"]
tree.pred <- predict(tree.lowAcc, newdata = OR_Data, type = "class")
test.pred <- tree.pred[-train_data]
cm <- table(test.pred, OR.test)
cm

##          OR.test
## test.pred  No Yes
##       No  254   5
##       Yes   3 151

# Print the detailed node-by-node summary of the tree fitted on the
# training rows (CP table, variable importance, primary/surrogate splits).
summary(tree.lowAcc)

## Call:
## rpart(formula = Y_N_LowAcc ~ . - LowIncomeTracts, data = OR_Data, 
##     subset = train_data, method = "class")
##   n= 413 
## 
##           CP nsplit  rel error     xerror       xstd
## 1 0.71069182      0 1.00000000 1.00000000 0.06219325
## 2 0.14465409      1 0.28930818 0.29559748 0.04058984
## 3 0.04402516      2 0.14465409 0.15094340 0.02990255
## 4 0.01000000      4 0.05660377 0.06918239 0.02057961
## 
## Variable importance
## MedianFamilyIncome        PovertyRate          TractSNAP          TractLOWI 
##                 32                 20                 10                  9 
##  LILATracts_1And10         TractBlack          TractAIAN         TractAsian 
##                  8                  4                  4                  3 
##          TractHUNV       TractOMultir         TractNHOPI      TractHispanic 
##                  2                  2                  2                  1 
##           PCTGQTRS           NUMGQTRS          TractKids 
##                  1                  1                  1 
## 
## Node number 1: 413 observations,    complexity param=0.7106918
##   predicted class=No   expected loss=0.3849879  P(node) =1
##     class counts:   254   159
##    probabilities: 0.615 0.385 
##   left son=2 (300 obs) right son=3 (113 obs)
##   Primary splits:
##       MedianFamilyIncome < 58234.5 to the right, improve=116.16020, (2 missing)
##       PovertyRate        < 17.65   to the left,  improve= 92.43622, (0 missing)
##       TractSNAP          < 402.5   to the left,  improve= 60.68571, (0 missing)
##       TractLOWI          < 1643.5  to the left,  improve= 58.94830, (0 missing)
##       LILATracts_1And10  < 0.5     to the left,  improve= 43.03390, (0 missing)
##   Surrogate splits:
##       PovertyRate       < 17.55   to the left,  agree=0.832, adj=0.378, (2 split)
##       TractSNAP         < 532.5   to the left,  agree=0.820, adj=0.333, (0 split)
##       TractLOWI         < 2117.5  to the left,  agree=0.810, adj=0.297, (0 split)
##       LILATracts_1And10 < 0.5     to the left,  agree=0.805, adj=0.279, (0 split)
##       TractAIAN         < 106.5   to the left,  agree=0.764, adj=0.126, (0 split)
## 
## Node number 2: 300 observations,    complexity param=0.1446541
##   predicted class=No   expected loss=0.1533333  P(node) =0.7263923
##     class counts:   254    46
##    probabilities: 0.847 0.153 
##   left son=4 (277 obs) right son=5 (23 obs)
##   Primary splits:
##       PovertyRate        < 19.9    to the left,  improve=35.71283, (0 missing)
##       MedianFamilyIncome < 68175.5 to the right, improve=18.04391, (0 missing)
##       LILATracts_1And10  < 0.5     to the left,  improve=13.30227, (0 missing)
##       TractSNAP          < 352.5   to the left,  improve=12.42014, (0 missing)
##       TractLOWI          < 2606.5  to the left,  improve=11.43370, (0 missing)
##   Surrogate splits:
##       PCTGQTRS  < 32.315  to the left,  agree=0.933, adj=0.130, (0 split)
##       TractHUNV < 584.5   to the left,  agree=0.933, adj=0.130, (0 split)
##       NUMGQTRS  < 871.5   to the left,  agree=0.930, adj=0.087, (0 split)
##       TractKids < 78.5    to the right, agree=0.930, adj=0.087, (0 split)
##       OHU2010   < 604.5   to the right, agree=0.927, adj=0.043, (0 split)
## 
## Node number 3: 113 observations
##   predicted class=Yes  expected loss=0  P(node) =0.2736077
##     class counts:     0   113
##    probabilities: 0.000 1.000 
## 
## Node number 4: 277 observations,    complexity param=0.04402516
##   predicted class=No   expected loss=0.08303249  P(node) =0.6707022
##     class counts:   254    23
##    probabilities: 0.917 0.083 
##   left son=8 (207 obs) right son=9 (70 obs)
##   Primary splits:
##       MedianFamilyIncome < 68175.5 to the right, improve=11.294790, (0 missing)
##       TractLOWI          < 2729    to the left,  improve= 6.279248, (0 missing)
##       TractSNAP          < 352.5   to the left,  improve= 5.242500, (0 missing)
##       TractOMultir       < 779     to the left,  improve= 5.081397, (0 missing)
##       TractBlack         < 264     to the left,  improve= 4.479913, (0 missing)
##   Surrogate splits:
##       TractLOWI     < 2113.5  to the left,  agree=0.801, adj=0.214, (0 split)
##       TractOMultir  < 779     to the left,  agree=0.783, adj=0.143, (0 split)
##       TractSNAP     < 351.5   to the left,  agree=0.783, adj=0.143, (0 split)
##       TractHispanic < 904.5   to the left,  agree=0.776, adj=0.114, (0 split)
##       TractAsian    < 14.5    to the right, agree=0.773, adj=0.100, (0 split)
## 
## Node number 5: 23 observations
##   predicted class=Yes  expected loss=0  P(node) =0.05569007
##     class counts:     0    23
##    probabilities: 0.000 1.000 
## 
## Node number 8: 207 observations
##   predicted class=No   expected loss=0  P(node) =0.5012107
##     class counts:   207     0
##    probabilities: 1.000 0.000 
## 
## Node number 9: 70 observations,    complexity param=0.04402516
##   predicted class=No   expected loss=0.3285714  P(node) =0.1694915
##     class counts:    47    23
##    probabilities: 0.671 0.329 
##   left son=18 (50 obs) right son=19 (20 obs)
##   Primary splits:
##       TractBlack   < 58      to the left,  improve=15.225710, (0 missing)
##       TractAsian   < 85.5    to the left,  improve=12.370640, (0 missing)
##       TractNHOPI   < 15      to the left,  improve= 8.752847, (0 missing)
##       TractOMultir < 174     to the left,  improve= 8.396825, (0 missing)
##       laaian1share < 0.085   to the right, improve= 8.062035, (8 missing)
##   Surrogate splits:
##       TractAsian    < 154.5   to the left,  agree=0.929, adj=0.75, (0 split)
##       TractNHOPI    < 23.5    to the left,  agree=0.843, adj=0.45, (0 split)
##       TractOMultir  < 496     to the left,  agree=0.814, adj=0.35, (0 split)
##       TractHispanic < 590.5   to the left,  agree=0.786, adj=0.25, (0 split)
##       TractHUNV     < 255     to the left,  agree=0.786, adj=0.25, (0 split)
## 
## Node number 18: 50 observations
##   predicted class=No   expected loss=0.12  P(node) =0.1210654
##     class counts:    44     6
##    probabilities: 0.880 0.120 
## 
## Node number 19: 20 observations
##   predicted class=Yes  expected loss=0.15  P(node) =0.04842615
##     class counts:     3    17
##    probabilities: 0.150 0.850

```

Project Milestone 5

Olivia Schutz, Marion Powell, Malie Heine

11/30/2021

Step 1:

Step 2:

Step 3:

doing test/train for low access tract