Classification with a Categorical Response (Week 12-14)

Prepping dataframe

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# additional state data! https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/state

r_state_df <- data.frame(as.factor(state.abb), state.region)
r_state_df <- r_state_df %>% rename(State = as.factor.state.abb.)

# basic data set
stations_income_df3 <- read_csv("/cloud/project/MATH-239_StatsR/stations_ACS_medIncome_joined_df3.csv")
## New names:
## * `` -> ...1
## Rows: 200125 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (16): Station Name, Street Address, City, State, ZIP, Groups With Acces...
## dbl  (11): ...1, Latitude, Longitude, ID, Number.Units, household, families,...
## date  (2): Date Last Confirmed, Open Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# modifying data set for analysis
stations_income_df4 <- distinct(stations_income_df3[, c(17,3, 5, 15, 19, 20, 21, 22, 23)])
num_units_df <- stations_income_df4[!is.na(stations_income_df4$Number.Units), ]
sum_units_df <- num_units_df %>% group_by(`Street Address`) %>% mutate(units_per = sum(Number.Units))
sum_units_df2 <- distinct(subset(sum_units_df, select = -c(Number.Units, Station.Type)))

#convert all character columns to factor
sum_units_df2 <- as.data.frame(unclass(sum_units_df2), stringsAsFactors = TRUE)

# join with state region
sum_units_df2 <- left_join(sum_units_df2, r_state_df)
## Joining, by = "State"
# remove na using renewable source column
renewable_nona_df <- sum_units_df2[!is.na(sum_units_df2$EV.On.Site.Renewable.Source), ]

# recode renewable into two level factor
renewable_nona_df <- renewable_nona_df %>% mutate(renewable.source = ifelse(EV.On.Site.Renewable.Source == "HYDRO"|EV.On.Site.Renewable.Source == "LANDFILL"|EV.On.Site.Renewable.Source == "SOLAR"|EV.On.Site.Renewable.Source == "WASTEWATER"|EV.On.Site.Renewable.Source == "WIND", "renewable source", "none"))

(below) I re-coded facility type into two levels, low and high. High facility types are those that had more than ten individual locations of that facility type with a renewable source on site. This helped me split the facility type into college, municipal government, office building, school, or state government facility type (“high”) and other facility types (“low”).

# recode facility type into two levels

renewable_byFacility_df <- renewable_nona_df[, c(5, 10)] %>% group_by(Facility.Type, renewable.source) %>% summarize(count = n())
## `summarise()` has grouped output by 'Facility.Type'. You can override using the
## `.groups` argument.
renewable_byFacility_df2 <- renewable_byFacility_df %>% mutate(count_renewable = ifelse(renewable.source == "renewable source" & count > 10, "High", "Low"))

# plot facility type as high and low

ggplot(renewable_byFacility_df2, aes(Facility.Type, fill=factor(count_renewable)))+
  geom_bar(position="fill")+
  ggtitle("Facility type and whether there are 10+ locations with renewable source on-site")+ #high is more than 10 individual locations of that type with renewable source on-site
  xlab("Facility Type")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90))+
  scale_fill_discrete(name="10+ locations with renewable?",
                      labels = c("Yes", "No"))+ 
  theme(legend.position="bottom")

# 1 = renewable source, 0 = none
renewable_nona_df$renewable.source<-ifelse(renewable_nona_df$renewable.source=="renewable source",1,0)

# recode region to two levels
renewable_nona_df <- renewable_nona_df %>% mutate(region = ifelse(state.region == "Northeast", "Northeast", "South, North Central, West"))
renewable_nona_df$region = as.factor(renewable_nona_df$region)

Response Variable

The response variable is labeled renewable.source. This variable was created by mutating the original EV renewable source variable (with NA’s removed) into a two level factor, where the options are either “renewable source” (including solar, wind, etc.) or “none”. Using this variable as the response will allow me to explore what factors (explanatory variables) may be related to the probability of an EV station having an on-site renewable energy source.

EDA

The bar plot of public vs private access codes appears to have a clear split, along with the region (Northeast or not Northeast).

Working with facility type was tricky. I decided to re-code the variable into two levels using “count_renewable = ifelse(renewable.source ==”renewable source” & count > 10, “High”, “Low”))“, then plotting this as a bar plot. Looking at the bar plot I can see that college, municipal government, office building, school, or state government facility type have a clear split.

I also used the log of units per station, but this was a little less clear. Looking at the density plot and the boxplot there appears to maybe be a split.

######## Exploratory Graphing ##########

# access code
ggplot(renewable_nona_df, aes(Access.Code, fill=factor(renewable.source)))+
  geom_bar(position="fill")+
  ggtitle("Percentage on-site Renewable by Access Code")+
  xlab("Access Code")+
  theme_bw()+
  scale_fill_discrete(name="Renewable Source on-site?",
                      breaks=c("0", "1"),
                      labels = c("No", "Yes"))+ 
  theme(legend.position="bottom")

# log units per
ggplot(renewable_nona_df, aes(log(units_per), color=factor(renewable.source)))+
  geom_density()+
  scale_fill_discrete(name="Renewable Source on-site?",
                      breaks=c("0", "1"),
                      labels = c("No", "Yes"))+ 
  theme(legend.position="bottom")

ggplot(renewable_nona_df, aes(log(units_per), fill=factor(renewable.source)))+
  geom_boxplot()+
  scale_fill_discrete(name="Renewable Source on-site?",
                      breaks=c("0", "1"),
                      labels = c("No", "Yes"))+ 
  theme(legend.position="bottom")

# state region
ggplot(renewable_nona_df, aes(state.region, fill=factor(renewable.source)))+
  geom_bar(position="fill")+
  ggtitle("Percentage on-site Renewable by Region")+
  xlab("Region")+
  theme_bw()+
  scale_fill_discrete(name="Renewable Source on-site?",
                      breaks=c("0", "1"),
                      labels = c("No", "Yes"))+ 
  theme(legend.position="bottom")

ggplot(renewable_nona_df, aes(region, fill=factor(renewable.source)))+
  geom_bar(position="fill")+
  ggtitle("Percentage on-site Renewable by Region, Dichotomized")+
  xlab("Region")+
  theme_bw()+
  scale_fill_discrete(name="Renewable Source on-site?",
                      breaks=c("0", "1"),
                      labels = c("No", "Yes"))+ 
  theme(legend.position="bottom")

##### variable facility_10+ includes the facility types identified above to have a count >10 of individual locations of that facility type where a renewable source is present ####

renewable_nona_df <- renewable_nona_df %>% mutate(`facility_10+` = ifelse(Facility.Type == "COLLEGE_CAMPUS"|Facility.Type == "MUNI_GOV"|Facility.Type == "OFFICE_BLDG"|Facility.Type == "SCHOOL"|Facility.Type == "STATE_GOV", "college, muni gov, office bldg, school, state gov", "other"))

# facility type dichotomized bar plot
renewable_nona_df %>%
  filter(!is.na(Facility.Type)) %>%
  ggplot(aes(`facility_10+`, fill=factor(renewable.source)))+
  geom_bar(position="fill")+
  ggtitle("Percentage on-site Renewable by Facility Type, Dichotomized")+
  xlab("Facility Type")+
  theme_bw()+
  scale_x_discrete(labels = function(`facility_10+`) str_wrap(`facility_10+`, width = 20))+
  scale_fill_discrete(name="Renewable Source on-site?",
                      breaks=c("0", "1"),
                      labels = c("No", "Yes"))+ 
  theme(legend.position="bottom")

## Fitting the logistic regression model

# renewable source as a function of access code only
m1 <- glm(renewable.source ~ Access.Code, data = renewable_nona_df, family = "binomial")
summary(m1)
## 
## Call:
## glm(formula = renewable.source ~ Access.Code, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7749   0.2074   0.6819   0.6819   0.6819  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         3.8286     0.7147   5.357 8.47e-08 ***
## Access.Codepublic  -2.4884     0.7322  -3.399 0.000677 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.22  on 334  degrees of freedom
## Residual deviance: 265.46  on 333  degrees of freedom
## AIC: 269.46
## 
## Number of Fisher Scoring iterations: 6
exp(-2.49)
## [1] 0.08290997
# make a log_units_per variable
renewable_nona_df <- mutate(renewable_nona_df, log_units_per = log(units_per))

# final model based on variable selection
m3 <- glm(renewable.source ~ log_units_per + Access.Code + region + `facility_10+`, data = renewable_nona_df, family = "binomial")
summary(m3)
## 
## Call:
## glm(formula = renewable.source ~ log_units_per + Access.Code + 
##     region + `facility_10+`, family = "binomial", data = renewable_nona_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9679   0.1029   0.2033   0.3593   1.8500  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        3.1293     0.8933   3.503 0.000460 ***
## log_units_per                     -0.7537     0.2245  -3.357 0.000789 ***
## Access.Codepublic                 -1.6954     0.8015  -2.115 0.034409 *  
## regionSouth, North Central, West   2.9580     0.4308   6.866 6.62e-12 ***
## `facility_10+`other               -0.8560     0.4262  -2.008 0.044604 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 288.20  on 331  degrees of freedom
## Residual deviance: 179.28  on 327  degrees of freedom
##   (3 observations deleted due to missingness)
## AIC: 189.28
## 
## Number of Fisher Scoring iterations: 6

variable selection manually

#variable selection manual
a <-  glm(renewable.source ~ log_units_per, data = renewable_nona_df, family = "binomial")
summary(a) # 0.0104 * 
## 
## Call:
## glm(formula = renewable.source ~ log_units_per, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1292   0.4678   0.5288   0.5966   0.9189  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.1573     0.2501   8.626   <2e-16 ***
## log_units_per  -0.3760     0.1468  -2.562   0.0104 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.22  on 334  degrees of freedom
## Residual deviance: 282.84  on 333  degrees of freedom
## AIC: 286.84
## 
## Number of Fisher Scoring iterations: 4
b <-  glm(renewable.source ~ Access.Code, data = renewable_nona_df, family = "binomial")
summary(b)# 0.000677 ***
## 
## Call:
## glm(formula = renewable.source ~ Access.Code, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7749   0.2074   0.6819   0.6819   0.6819  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         3.8286     0.7147   5.357 8.47e-08 ***
## Access.Codepublic  -2.4884     0.7322  -3.399 0.000677 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.22  on 334  degrees of freedom
## Residual deviance: 265.46  on 333  degrees of freedom
## AIC: 269.46
## 
## Number of Fisher Scoring iterations: 6
c <-  glm(renewable.source ~ region, data = renewable_nona_df, family = "binomial")
summary(c)# 1.24e-15 ***
## 
## Call:
## glm(formula = renewable.source ~ region, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.572   0.273   0.273   0.273   1.149  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       0.06744    0.21212   0.318    0.751    
## regionSouth, North Central, West  3.20339    0.40041   8.000 1.24e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.22  on 334  degrees of freedom
## Residual deviance: 200.49  on 333  degrees of freedom
## AIC: 204.49
## 
## Number of Fisher Scoring iterations: 6
d <-  glm(renewable.source ~ `facility_10+`, data = renewable_nona_df, family = "binomial")
summary(d)# 0.000404 ***
## 
## Call:
## glm(formula = renewable.source ~ `facility_10+`, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2266   0.4185   0.4185   0.7235   0.7235  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           2.3914     0.2792   8.564  < 2e-16 ***
## `facility_10+`other  -1.1848     0.3349  -3.538 0.000404 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 288.20  on 331  degrees of freedom
## Residual deviance: 274.28  on 330  degrees of freedom
##   (3 observations deleted due to missingness)
## AIC: 278.28
## 
## Number of Fisher Scoring iterations: 5
e <-  glm(renewable.source ~ as.Date(Open.Date), data = renewable_nona_df, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(e)#5.06e-08 ***
## 
## Call:
## glm(formula = renewable.source ~ as.Date(Open.Date), family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.56183   0.00017   0.00546   0.06112   2.25761  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        108.575458  19.914650   5.452 4.98e-08 ***
## as.Date(Open.Date)  -0.005843   0.001072  -5.449 5.06e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 288.88  on 333  degrees of freedom
## Residual deviance: 121.30  on 332  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 125.3
## 
## Number of Fisher Scoring iterations: 9
f <-  glm(renewable.source ~ state.region, data = renewable_nona_df, family = "binomial")
summary(f)#state.regionSouth          2.89439    0.62894   4.602 4.18e-06 ***
## 
## Call:
## glm(formula = renewable.source ~ state.region, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6972   0.2309   0.2309   0.3176   1.1489  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                0.06744    0.21212   0.318 0.750531    
## state.regionSouth          2.89439    0.62894   4.602 4.18e-06 ***
## state.regionNorth Central  2.67340    0.75977   3.519 0.000434 ***
## state.regionWest           3.54348    0.54932   6.451 1.11e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.22  on 334  degrees of freedom
## Residual deviance: 199.29  on 331  degrees of freedom
## AIC: 207.29
## 
## Number of Fisher Scoring iterations: 6
          #state.regionNorth Central  2.67340    0.75977   3.519 0.000434 ***
          #state.regionWest           3.54348    0.54932   6.451 1.11e-10 ***

f <-  glm(renewable.source ~ units_per, data = renewable_nona_df, family = "binomial")
summary(f)# not significant
## 
## Call:
## glm(formula = renewable.source ~ units_per, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9595   0.5743   0.5834   0.5845   0.5855  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.672644   0.178078   9.393   <2e-16 ***
## units_per   0.004025   0.018005   0.224    0.823    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 289.22  on 334  degrees of freedom
## Residual deviance: 289.16  on 333  degrees of freedom
## AIC: 293.16
## 
## Number of Fisher Scoring iterations: 4
g <-  glm(renewable.source ~ Facility.Type, data = renewable_nona_df, family = "binomial")
summary(g)# not significant
## 
## Call:
## glm(formula = renewable.source ~ Facility.Type, family = "binomial", 
##     data = renewable_nona_df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.32725   0.00005   0.00005   0.55919   1.55176  
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             2.057e+01  1.254e+04   0.002    0.999
## Facility.TypeBREWERY_DISTILLERY_WINERY -2.266e-07  1.483e+04   0.000    1.000
## Facility.TypeCAMPGROUND                -2.267e-07  2.172e+04   0.000    1.000
## Facility.TypeCAR_DEALER                -1.862e+01  1.254e+04  -0.001    0.999
## Facility.TypeCOLLEGE_CAMPUS            -2.112e-07  1.312e+04   0.000    1.000
## Facility.TypeCONVENIENCE_STORE         -2.269e-07  2.172e+04   0.000    1.000
## Facility.TypeFACTORY                   -2.114e-07  2.172e+04   0.000    1.000
## Facility.TypeFED_GOV                   -2.227e-07  1.422e+04   0.000    1.000
## Facility.TypeFIRE_STATION              -2.265e-07  2.172e+04   0.000    1.000
## Facility.TypeFLEET_GARAGE              -2.081e-07  1.773e+04   0.000    1.000
## Facility.TypeGAS_STATION               -2.057e+01  1.254e+04  -0.002    0.999
## Facility.TypeGROCERY                   -2.130e-07  1.535e+04   0.000    1.000
## Facility.TypeHOSPITAL                  -2.269e-07  1.483e+04   0.000    1.000
## Facility.TypeHOTEL                     -2.097e+01  1.254e+04  -0.002    0.999
## Facility.TypeINN                       -1.987e+01  1.254e+04  -0.002    0.999
## Facility.TypeLIBRARY                   -2.243e-07  1.402e+04   0.000    1.000
## Facility.TypeMIL_BASE                  -2.232e-07  1.619e+04   0.000    1.000
## Facility.TypeMULTI_UNIT_DWELLING       -4.113e+01  1.354e+04  -0.003    0.998
## Facility.TypeMUNI_GOV                  -1.793e+01  1.254e+04  -0.001    0.999
## Facility.TypeMUSEUM                    -2.265e-07  2.172e+04   0.000    1.000
## Facility.TypeNATL_PARK                 -1.965e+01  1.254e+04  -0.002    0.999
## Facility.TypeOFFICE_BLDG               -1.879e+01  1.254e+04  -0.001    0.999
## Facility.TypeOTHER                     -2.126e+01  1.254e+04  -0.002    0.999
## Facility.TypeOTHER_ENTERTAINMENT       -1.947e+01  1.254e+04  -0.002    0.999
## Facility.TypePARK                      -2.234e-07  1.386e+04   0.000    1.000
## Facility.TypePARKING_GARAGE            -2.057e+01  1.254e+04  -0.002    0.999
## Facility.TypePARKING_LOT               -2.081e-07  1.386e+04   0.000    1.000
## Facility.TypePLACE_OF_WORSHIP          -2.267e-07  1.448e+04   0.000    1.000
## Facility.TypeREC_SPORTS_FACILITY       -1.987e+01  1.254e+04  -0.002    0.999
## Facility.TypeRESEARCH_FACILITY         -2.236e-07  1.619e+04   0.000    1.000
## Facility.TypeRESTAURANT                -2.083e-07  1.619e+04   0.000    1.000
## Facility.TypeSCHOOL                    -2.267e-07  1.340e+04   0.000    1.000
## Facility.TypeSHOPPING_CENTER           -2.141e+01  1.254e+04  -0.002    0.999
## Facility.TypeSHOPPING_MALL             -2.057e+01  1.254e+04  -0.002    0.999
## Facility.TypeSTANDALONE_STATION        -2.112e-07  1.483e+04   0.000    1.000
## Facility.TypeSTATE_GOV                 -2.082e-07  1.363e+04   0.000    1.000
## Facility.TypeSTREET_PARKING            -2.236e-07  1.483e+04   0.000    1.000
## Facility.TypeTRAVEL_CENTER             -2.259e-07  2.172e+04   0.000    1.000
## Facility.TypeUTILITY                   -2.084e-07  1.373e+04   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 288.20  on 331  degrees of freedom
## Residual deviance: 157.56  on 293  degrees of freedom
##   (3 observations deleted due to missingness)
## AIC: 235.56
## 
## Number of Fisher Scoring iterations: 19

variable selection using leaps

#install.packages("leaps", repo = 'https://mac.R-project.org')
library(leaps)
# Best Subset 
regfit.full<-regsubsets(renewable.source~log_units_per+Access.Code+region+`facility_10+`+state.region, data=renewable_nona_df)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 6
reg.summary<-summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(renewable.source ~ log_units_per + Access.Code + 
##     region + `facility_10+` + state.region, data = renewable_nona_df)
## 7 Variables  (and intercept)
##                                  Forced in Forced out
## log_units_per                        FALSE      FALSE
## Access.Codepublic                    FALSE      FALSE
## regionSouth, North Central, West     FALSE      FALSE
## `facility_10+`other                  FALSE      FALSE
## state.regionSouth                    FALSE      FALSE
## state.regionNorth Central            FALSE      FALSE
## state.regionWest                     FALSE      FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
##          log_units_per Access.Codepublic regionSouth, North Central, West
## 1  ( 1 ) " "           " "               "*"                             
## 2  ( 1 ) "*"           " "               "*"                             
## 3  ( 1 ) "*"           " "               "*"                             
## 4  ( 1 ) "*"           "*"               "*"                             
## 5  ( 1 ) "*"           "*"               "*"                             
## 6  ( 1 ) "*"           "*"               "*"                             
##          `facility_10+`other state.regionSouth state.regionNorth Central
## 1  ( 1 ) " "                 " "               " "                      
## 2  ( 1 ) " "                 " "               " "                      
## 3  ( 1 ) "*"                 " "               " "                      
## 4  ( 1 ) "*"                 " "               " "                      
## 5  ( 1 ) "*"                 " "               " "                      
## 6  ( 1 ) "*"                 "*"               "*"                      
##          state.regionWest
## 1  ( 1 ) " "             
## 2  ( 1 ) " "             
## 3  ( 1 ) " "             
## 4  ( 1 ) " "             
## 5  ( 1 ) "*"             
## 6  ( 1 ) " "
# look at rsq
adjrsq_df<-data.frame(rsq=c(reg.summary$rsq,reg.summary$adjr2),
                      type=c(rep("rsq",6),rep("rsq_adj",6)),
                   size=rep(1:6,2))

ggplot(adjrsq_df, aes(x=size, y=rsq, color=type))+
  geom_point()+
  geom_line()+
  ggtitle("R-squared vs Adj R-squared")

# other metrics
metric_df<-data.frame(metric=c(reg.summary$rsq,
                               reg.summary$adjr2,
                               reg.summary$cp, 
                               reg.summary$bic),
                      type=c(rep("rsq",6),
                             rep("rsq_adj",6),
                             rep("cp",6),
                             rep("bic",6)),
                      size=rep(1:6,4))

which.min(reg.summary$bic)#3
## [1] 3
which.min(reg.summary$cp)#4
## [1] 4
which.max(reg.summary$rsq)#6
## [1] 6
which.max(reg.summary$adjr2)#4
## [1] 4
vline.dat <- data.frame(type=unique(metric_df$type), vl=c(6, 4, 4, 3))

ggplot(metric_df, aes(x=size, y=metric, color=type))+
  geom_point()+
  geom_line()+
  ggtitle("Model Comparison Metrics")+
  geom_vline(aes(xintercept=vl), data=vline.dat, lty=2) +
  facet_grid(type~., scales = "free_y")

Final model in the context of these data.

In m3 there are three negative slopes and one positive slope, indicating that three of the explanatory variables are negatively associated with the probability that there will be a renewable source on-site. It appears that higher log number of units per address; a public access code; and being a facility type other than a college, muni gov, office bldg, school, or state gov, are all negatively associated with the probability that there will be a renewable source on-site. If the region is South, North Central, or West (aka not Northeast) this is positively associated with the probability that there will be a renewable source on-site.

More simply put, there’s a higher probability of having a renewable energy source on-site at an EV charging station if the access code is private, it’s not in the Northeast, and it is a college, municipal government, office building, school, or state government facility type.

Predict

split data into a test and training set

#Dividing the Data
set.seed(501)
train_indices <- sample(1:nrow(renewable_nona_df), size = floor(nrow(renewable_nona_df)/2))
train_data <- renewable_nona_df %>%
  slice(train_indices)
test_data <- renewable_nona_df %>%
  slice(-train_indices)

Fit final models on the training set

#Training
m1B <- glm(renewable.source ~ Access.Code, data = train_data, family = "binomial")
summary(m1B)
## 
## Call:
## glm(formula = renewable.source ~ Access.Code, family = "binomial", 
##     data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7825   0.2052   0.6394   0.6394   0.6394  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          3.850      1.011    3.81 0.000139 ***
## Access.Codepublic   -2.366      1.038   -2.28 0.022586 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 133.87  on 166  degrees of freedom
## Residual deviance: 123.65  on 165  degrees of freedom
## AIC: 127.65
## 
## Number of Fisher Scoring iterations: 6
m3B <- glm(renewable.source ~ log_units_per + Access.Code + region +`facility_10+`, data = train_data, family = "binomial")
summary(m3B)
## 
## Call:
## glm(formula = renewable.source ~ log_units_per + Access.Code + 
##     region + `facility_10+`, family = "binomial", data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7210   0.1353   0.2588   0.4819   1.5340  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        3.0980     1.2601   2.458   0.0140 *  
## log_units_per                     -0.4285     0.3038  -1.411   0.1584    
## Access.Codepublic                 -1.6064     1.1069  -1.451   0.1467    
## regionSouth, North Central, West   2.1853     0.5349   4.086  4.4e-05 ***
## `facility_10+`other               -1.1115     0.6093  -1.824   0.0681 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 133.274  on 164  degrees of freedom
## Residual deviance:  95.525  on 160  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 105.53
## 
## Number of Fisher Scoring iterations: 6

Predict how the testing data would perform under the training model

For the first model, which only considers the influence of access code on the response, the error rate is very high at 0.827 or 82.7%.

In the more complex model, which includes log_units_per, access code, region, and facility type, the error rate is much lower at 0.095 or 9.5%.

#Testing
## access code only
test1 <- predict(m1B, newdata = test_data, type = "response")

test_mat1<-data.frame(renewable=test_data$renewable.source, predRenewable=test1>.5)

table(test_mat1$renewable, test_mat1$predRenewable)
##    
##     TRUE
##   0   29
##   1  139
# error
(139)/length(test1)
## [1] 0.827381
## all four variables (final model)
test3 <- predict(m3B, newdata = test_data, type = "response")

test_mat3<-data.frame(renewable=test_data$renewable.source, predRenewable=test3>.5)

table(test_mat3$renewable, test_mat3$predRenewable)
##    
##     FALSE TRUE
##   0    15   14
##   1     2  136
#error
(2 + 14)/length(test3)
## [1] 0.0952381

Classification tree

#install.packages
library(ISLR)


#install.packages("rpart")
library(rpart)

#install.packages("MASS")
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(tidyverse)

# new data frame: access code, log units per, region dichotomized, renewable source, facility 10+
tree_df2 <- distinct(renewable_nona_df[, c(1, 12, 11, 10, 13)]) #maybe trade col 9 for col 11 if needed
tree_df2$`facility_10+` <- as.factor(tree_df2$`facility_10+`)
# test and train 
set.seed(2)
dim(tree_df2)
## [1] 90  5
# split in half 
train2<-sample(1:nrow(tree_df2), 45)

# use method="class" for classification
tree.df2<-rpart(renewable.source~., data=tree_df2,
                     subset=train2,
                     control=rpart.control(minsplit=1),
                     method="class")
#summary(tree.df2)
#install.packages("rpart.plot")
library(rpart.plot)

# plot
prp(tree.df2, faclen = 0, cex = 0.7, extra=1, space=.5, ycompress=TRUE, type=2)

#Using cross-validation to select complexity
printcp(tree.df2) # display the results
## 
## Classification tree:
## rpart(formula = renewable.source ~ ., data = tree_df2, subset = train2, 
##     method = "class", control = rpart.control(minsplit = 1))
## 
## Variables actually used in tree construction:
## [1] Access.Code   facility_10+  log_units_per region       
## 
## Root node error: 13/45 = 0.28889
## 
## n= 45 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.307692      0  1.000000 1.00000 0.23388
## 2 0.153846      1  0.692308 0.84615 0.22176
## 3 0.076923      3  0.384615 0.76923 0.21453
## 4 0.025641      5  0.230769 0.69231 0.20641
## 5 0.019231      8  0.153846 0.84615 0.22176
## 6 0.010000     12  0.076923 0.84615 0.22176
plotcp(tree.df2) # visualize cross-validation results

# create additional plots
par(mfrow=c(1,2)) # two plots on one page
rsq.rpart(tree.df2) # visualize cross-validation results
## 
## Classification tree:
## rpart(formula = renewable.source ~ ., data = tree_df2, subset = train2, 
##     method = "class", control = rpart.control(minsplit = 1))
## 
## Variables actually used in tree construction:
## [1] Access.Code   facility_10+  log_units_per region       
## 
## Root node error: 13/45 = 0.28889
## 
## n= 45 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.307692      0  1.000000 1.00000 0.23388
## 2 0.153846      1  0.692308 0.84615 0.22176
## 3 0.076923      3  0.384615 0.76923 0.21453
## 4 0.025641      5  0.230769 0.69231 0.20641
## 5 0.019231      8  0.153846 0.84615 0.22176
## 6 0.010000     12  0.076923 0.84615 0.22176
## Warning in rsq.rpart(tree.df2): may not be applicable for this method

# minimize the error
tree.df2$cptable
##           CP nsplit  rel error    xerror      xstd
## 1 0.30769231      0 1.00000000 1.0000000 0.2338821
## 2 0.15384615      1 0.69230769 0.8461538 0.2217615
## 3 0.07692308      3 0.38461538 0.7692308 0.2145282
## 4 0.02564103      5 0.23076923 0.6923077 0.2064063
## 5 0.01923077      8 0.15384615 0.8461538 0.2217615
## 6 0.01000000     12 0.07692308 0.8461538 0.2217615
which.min(tree.df2$cptable[,"xerror"])
## 4 
## 4
tree.df2$cptable[which.min(tree.df2$cptable[,"xerror"]),"CP"]
## [1] 0.02564103
# prune the tree
pfit<- prune(tree.df2, cp=0.026) # from cptable   

# plot the pruned tree
prp(pfit, faclen = 0, cex = 0.7, extra=1, space=.5, ycompress=TRUE, type=2)