library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# additional state data! https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/state
r_state_df <- data.frame(as.factor(state.abb), state.region)
r_state_df <- r_state_df %>% rename(State = as.factor.state.abb.)
# basic data set
stations_income_df3 <- read_csv("/cloud/project/MATH-239_StatsR/stations_ACS_medIncome_joined_df3.csv")
## New names:
## * `` -> ...1
## Rows: 200125 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Station Name, Street Address, City, State, ZIP, Groups With Acces...
## dbl (11): ...1, Latitude, Longitude, ID, Number.Units, household, families,...
## date (2): Date Last Confirmed, Open Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# modifying data set for analysis
stations_income_df4 <- distinct(stations_income_df3[, c(17,3, 5, 15, 19, 20, 21, 22, 23)])
num_units_df <- stations_income_df4[!is.na(stations_income_df4$Number.Units), ]
sum_units_df <- num_units_df %>% group_by(`Street Address`) %>% mutate(units_per = sum(Number.Units))
sum_units_df2 <- distinct(subset(sum_units_df, select = -c(Number.Units, Station.Type)))
#convert all character columns to factor
sum_units_df2 <- as.data.frame(unclass(sum_units_df2), stringsAsFactors = TRUE)
# join with state region
sum_units_df2 <- left_join(sum_units_df2, r_state_df)
## Joining, by = "State"
# remove na using renewable source column
renewable_nona_df <- sum_units_df2[!is.na(sum_units_df2$EV.On.Site.Renewable.Source), ]
# recode renewable into two level factor
renewable_nona_df <- renewable_nona_df %>% mutate(renewable.source = ifelse(EV.On.Site.Renewable.Source == "HYDRO"|EV.On.Site.Renewable.Source == "LANDFILL"|EV.On.Site.Renewable.Source == "SOLAR"|EV.On.Site.Renewable.Source == "WASTEWATER"|EV.On.Site.Renewable.Source == "WIND", "renewable source", "none"))
(below) I re-coded facility type into two levels, low and high. High facility types are those that had more than ten individual locations of that facility type with a renewable source on site. This helped me split the facility type into college, municipal government, office building, school, or state government facility type (“high”) and other facility types (“low”).
# recode facility type into two levels
renewable_byFacility_df <- renewable_nona_df[, c(5, 10)] %>% group_by(Facility.Type, renewable.source) %>% summarize(count = n())
## `summarise()` has grouped output by 'Facility.Type'. You can override using the
## `.groups` argument.
renewable_byFacility_df2 <- renewable_byFacility_df %>% mutate(count_renewable = ifelse(renewable.source == "renewable source" & count > 10, "High", "Low"))
# plot facility type as high and low
ggplot(renewable_byFacility_df2, aes(Facility.Type, fill=factor(count_renewable)))+
geom_bar(position="fill")+
ggtitle("Facility type and whether there are 10+ locations with renewable source on-site")+ #high is more than 10 individual locations of that type with renewable source on-site
xlab("Facility Type")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90))+
scale_fill_discrete(name="10+ locations with renewable?",
labels = c("Yes", "No"))+
theme(legend.position="bottom")
# 1 = renewable source, 0 = none
renewable_nona_df$renewable.source<-ifelse(renewable_nona_df$renewable.source=="renewable source",1,0)
# recode region to two levels
renewable_nona_df <- renewable_nona_df %>% mutate(region = ifelse(state.region == "Northeast", "Northeast", "South, North Central, West"))
renewable_nona_df$region = as.factor(renewable_nona_df$region)
The response variable is labeled renewable.source. This variable was created by mutating the original EV renewable source variable (with NA’s removed) into a two level factor, where the options are either “renewable source” (including solar, wind, etc.) or “none”. Using this variable as the response will allow me to explore what factors (explanatory variables) may be related to the probability of an EV station having an on-site renewable energy source.
The bar plot of public vs private access codes appears to have a clear split, along with the region (Northeast or not Northeast).
Working with facility type was tricky. I decided to re-code the variable into two levels using “count_renewable = ifelse(renewable.source ==”renewable source” & count > 10, “High”, “Low”))“, then plotting this as a bar plot. Looking at the bar plot I can see that college, municipal government, office building, school, or state government facility type have a clear split.
I also used the log of units per station, but this was a little less clear. Looking at the density plot and the boxplot there appears to maybe be a split.
######## Exploratory Graphing ##########
# access code
ggplot(renewable_nona_df, aes(Access.Code, fill=factor(renewable.source)))+
geom_bar(position="fill")+
ggtitle("Percentage on-site Renewable by Access Code")+
xlab("Access Code")+
theme_bw()+
scale_fill_discrete(name="Renewable Source on-site?",
breaks=c("0", "1"),
labels = c("No", "Yes"))+
theme(legend.position="bottom")
# log units per
ggplot(renewable_nona_df, aes(log(units_per), color=factor(renewable.source)))+
geom_density()+
scale_fill_discrete(name="Renewable Source on-site?",
breaks=c("0", "1"),
labels = c("No", "Yes"))+
theme(legend.position="bottom")
ggplot(renewable_nona_df, aes(log(units_per), fill=factor(renewable.source)))+
geom_boxplot()+
scale_fill_discrete(name="Renewable Source on-site?",
breaks=c("0", "1"),
labels = c("No", "Yes"))+
theme(legend.position="bottom")
# state region
ggplot(renewable_nona_df, aes(state.region, fill=factor(renewable.source)))+
geom_bar(position="fill")+
ggtitle("Percentage on-site Renewable by Region")+
xlab("Region")+
theme_bw()+
scale_fill_discrete(name="Renewable Source on-site?",
breaks=c("0", "1"),
labels = c("No", "Yes"))+
theme(legend.position="bottom")
ggplot(renewable_nona_df, aes(region, fill=factor(renewable.source)))+
geom_bar(position="fill")+
ggtitle("Percentage on-site Renewable by Region, Dichotomized")+
xlab("Region")+
theme_bw()+
scale_fill_discrete(name="Renewable Source on-site?",
breaks=c("0", "1"),
labels = c("No", "Yes"))+
theme(legend.position="bottom")
##### variable facility_10+ includes the facility types identified above to have a count >10 of individual locations of that facility type where a renewable source is present ####
renewable_nona_df <- renewable_nona_df %>% mutate(`facility_10+` = ifelse(Facility.Type == "COLLEGE_CAMPUS"|Facility.Type == "MUNI_GOV"|Facility.Type == "OFFICE_BLDG"|Facility.Type == "SCHOOL"|Facility.Type == "STATE_GOV", "college, muni gov, office bldg, school, state gov", "other"))
# facility type dichotomized bar plot
renewable_nona_df %>%
filter(!is.na(Facility.Type)) %>%
ggplot(aes(`facility_10+`, fill=factor(renewable.source)))+
geom_bar(position="fill")+
ggtitle("Percentage on-site Renewable by Facility Type, Dichotomized")+
xlab("Facility Type")+
theme_bw()+
scale_x_discrete(labels = function(`facility_10+`) str_wrap(`facility_10+`, width = 20))+
scale_fill_discrete(name="Renewable Source on-site?",
breaks=c("0", "1"),
labels = c("No", "Yes"))+
theme(legend.position="bottom")
## Fitting the logistic regression model
# renewable source as a function of access code only
m1 <- glm(renewable.source ~ Access.Code, data = renewable_nona_df, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = renewable.source ~ Access.Code, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7749 0.2074 0.6819 0.6819 0.6819
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.8286 0.7147 5.357 8.47e-08 ***
## Access.Codepublic -2.4884 0.7322 -3.399 0.000677 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.22 on 334 degrees of freedom
## Residual deviance: 265.46 on 333 degrees of freedom
## AIC: 269.46
##
## Number of Fisher Scoring iterations: 6
exp(-2.49)
## [1] 0.08290997
# make a log_units_per variable
renewable_nona_df <- mutate(renewable_nona_df, log_units_per = log(units_per))
# final model based on variable selection
m3 <- glm(renewable.source ~ log_units_per + Access.Code + region + `facility_10+`, data = renewable_nona_df, family = "binomial")
summary(m3)
##
## Call:
## glm(formula = renewable.source ~ log_units_per + Access.Code +
## region + `facility_10+`, family = "binomial", data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9679 0.1029 0.2033 0.3593 1.8500
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.1293 0.8933 3.503 0.000460 ***
## log_units_per -0.7537 0.2245 -3.357 0.000789 ***
## Access.Codepublic -1.6954 0.8015 -2.115 0.034409 *
## regionSouth, North Central, West 2.9580 0.4308 6.866 6.62e-12 ***
## `facility_10+`other -0.8560 0.4262 -2.008 0.044604 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 288.20 on 331 degrees of freedom
## Residual deviance: 179.28 on 327 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 189.28
##
## Number of Fisher Scoring iterations: 6
#variable selection manual
a <- glm(renewable.source ~ log_units_per, data = renewable_nona_df, family = "binomial")
summary(a) # 0.0104 *
##
## Call:
## glm(formula = renewable.source ~ log_units_per, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1292 0.4678 0.5288 0.5966 0.9189
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.1573 0.2501 8.626 <2e-16 ***
## log_units_per -0.3760 0.1468 -2.562 0.0104 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.22 on 334 degrees of freedom
## Residual deviance: 282.84 on 333 degrees of freedom
## AIC: 286.84
##
## Number of Fisher Scoring iterations: 4
b <- glm(renewable.source ~ Access.Code, data = renewable_nona_df, family = "binomial")
summary(b)# 0.000677 ***
##
## Call:
## glm(formula = renewable.source ~ Access.Code, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7749 0.2074 0.6819 0.6819 0.6819
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.8286 0.7147 5.357 8.47e-08 ***
## Access.Codepublic -2.4884 0.7322 -3.399 0.000677 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.22 on 334 degrees of freedom
## Residual deviance: 265.46 on 333 degrees of freedom
## AIC: 269.46
##
## Number of Fisher Scoring iterations: 6
c <- glm(renewable.source ~ region, data = renewable_nona_df, family = "binomial")
summary(c)# 1.24e-15 ***
##
## Call:
## glm(formula = renewable.source ~ region, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.572 0.273 0.273 0.273 1.149
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.06744 0.21212 0.318 0.751
## regionSouth, North Central, West 3.20339 0.40041 8.000 1.24e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.22 on 334 degrees of freedom
## Residual deviance: 200.49 on 333 degrees of freedom
## AIC: 204.49
##
## Number of Fisher Scoring iterations: 6
d <- glm(renewable.source ~ `facility_10+`, data = renewable_nona_df, family = "binomial")
summary(d)# 0.000404 ***
##
## Call:
## glm(formula = renewable.source ~ `facility_10+`, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2266 0.4185 0.4185 0.7235 0.7235
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.3914 0.2792 8.564 < 2e-16 ***
## `facility_10+`other -1.1848 0.3349 -3.538 0.000404 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 288.20 on 331 degrees of freedom
## Residual deviance: 274.28 on 330 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 278.28
##
## Number of Fisher Scoring iterations: 5
e <- glm(renewable.source ~ as.Date(Open.Date), data = renewable_nona_df, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(e)#5.06e-08 ***
##
## Call:
## glm(formula = renewable.source ~ as.Date(Open.Date), family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.56183 0.00017 0.00546 0.06112 2.25761
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 108.575458 19.914650 5.452 4.98e-08 ***
## as.Date(Open.Date) -0.005843 0.001072 -5.449 5.06e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 288.88 on 333 degrees of freedom
## Residual deviance: 121.30 on 332 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 125.3
##
## Number of Fisher Scoring iterations: 9
f <- glm(renewable.source ~ state.region, data = renewable_nona_df, family = "binomial")
summary(f)#state.regionSouth 2.89439 0.62894 4.602 4.18e-06 ***
##
## Call:
## glm(formula = renewable.source ~ state.region, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6972 0.2309 0.2309 0.3176 1.1489
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.06744 0.21212 0.318 0.750531
## state.regionSouth 2.89439 0.62894 4.602 4.18e-06 ***
## state.regionNorth Central 2.67340 0.75977 3.519 0.000434 ***
## state.regionWest 3.54348 0.54932 6.451 1.11e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.22 on 334 degrees of freedom
## Residual deviance: 199.29 on 331 degrees of freedom
## AIC: 207.29
##
## Number of Fisher Scoring iterations: 6
#state.regionNorth Central 2.67340 0.75977 3.519 0.000434 ***
#state.regionWest 3.54348 0.54932 6.451 1.11e-10 ***
f <- glm(renewable.source ~ units_per, data = renewable_nona_df, family = "binomial")
summary(f)# not significant
##
## Call:
## glm(formula = renewable.source ~ units_per, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9595 0.5743 0.5834 0.5845 0.5855
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.672644 0.178078 9.393 <2e-16 ***
## units_per 0.004025 0.018005 0.224 0.823
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.22 on 334 degrees of freedom
## Residual deviance: 289.16 on 333 degrees of freedom
## AIC: 293.16
##
## Number of Fisher Scoring iterations: 4
g <- glm(renewable.source ~ Facility.Type, data = renewable_nona_df, family = "binomial")
summary(g)# not significant
##
## Call:
## glm(formula = renewable.source ~ Facility.Type, family = "binomial",
## data = renewable_nona_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.32725 0.00005 0.00005 0.55919 1.55176
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.057e+01 1.254e+04 0.002 0.999
## Facility.TypeBREWERY_DISTILLERY_WINERY -2.266e-07 1.483e+04 0.000 1.000
## Facility.TypeCAMPGROUND -2.267e-07 2.172e+04 0.000 1.000
## Facility.TypeCAR_DEALER -1.862e+01 1.254e+04 -0.001 0.999
## Facility.TypeCOLLEGE_CAMPUS -2.112e-07 1.312e+04 0.000 1.000
## Facility.TypeCONVENIENCE_STORE -2.269e-07 2.172e+04 0.000 1.000
## Facility.TypeFACTORY -2.114e-07 2.172e+04 0.000 1.000
## Facility.TypeFED_GOV -2.227e-07 1.422e+04 0.000 1.000
## Facility.TypeFIRE_STATION -2.265e-07 2.172e+04 0.000 1.000
## Facility.TypeFLEET_GARAGE -2.081e-07 1.773e+04 0.000 1.000
## Facility.TypeGAS_STATION -2.057e+01 1.254e+04 -0.002 0.999
## Facility.TypeGROCERY -2.130e-07 1.535e+04 0.000 1.000
## Facility.TypeHOSPITAL -2.269e-07 1.483e+04 0.000 1.000
## Facility.TypeHOTEL -2.097e+01 1.254e+04 -0.002 0.999
## Facility.TypeINN -1.987e+01 1.254e+04 -0.002 0.999
## Facility.TypeLIBRARY -2.243e-07 1.402e+04 0.000 1.000
## Facility.TypeMIL_BASE -2.232e-07 1.619e+04 0.000 1.000
## Facility.TypeMULTI_UNIT_DWELLING -4.113e+01 1.354e+04 -0.003 0.998
## Facility.TypeMUNI_GOV -1.793e+01 1.254e+04 -0.001 0.999
## Facility.TypeMUSEUM -2.265e-07 2.172e+04 0.000 1.000
## Facility.TypeNATL_PARK -1.965e+01 1.254e+04 -0.002 0.999
## Facility.TypeOFFICE_BLDG -1.879e+01 1.254e+04 -0.001 0.999
## Facility.TypeOTHER -2.126e+01 1.254e+04 -0.002 0.999
## Facility.TypeOTHER_ENTERTAINMENT -1.947e+01 1.254e+04 -0.002 0.999
## Facility.TypePARK -2.234e-07 1.386e+04 0.000 1.000
## Facility.TypePARKING_GARAGE -2.057e+01 1.254e+04 -0.002 0.999
## Facility.TypePARKING_LOT -2.081e-07 1.386e+04 0.000 1.000
## Facility.TypePLACE_OF_WORSHIP -2.267e-07 1.448e+04 0.000 1.000
## Facility.TypeREC_SPORTS_FACILITY -1.987e+01 1.254e+04 -0.002 0.999
## Facility.TypeRESEARCH_FACILITY -2.236e-07 1.619e+04 0.000 1.000
## Facility.TypeRESTAURANT -2.083e-07 1.619e+04 0.000 1.000
## Facility.TypeSCHOOL -2.267e-07 1.340e+04 0.000 1.000
## Facility.TypeSHOPPING_CENTER -2.141e+01 1.254e+04 -0.002 0.999
## Facility.TypeSHOPPING_MALL -2.057e+01 1.254e+04 -0.002 0.999
## Facility.TypeSTANDALONE_STATION -2.112e-07 1.483e+04 0.000 1.000
## Facility.TypeSTATE_GOV -2.082e-07 1.363e+04 0.000 1.000
## Facility.TypeSTREET_PARKING -2.236e-07 1.483e+04 0.000 1.000
## Facility.TypeTRAVEL_CENTER -2.259e-07 2.172e+04 0.000 1.000
## Facility.TypeUTILITY -2.084e-07 1.373e+04 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 288.20 on 331 degrees of freedom
## Residual deviance: 157.56 on 293 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 235.56
##
## Number of Fisher Scoring iterations: 19
#install.packages("leaps", repo = 'https://mac.R-project.org')
library(leaps)
# Best Subset
regfit.full<-regsubsets(renewable.source~log_units_per+Access.Code+region+`facility_10+`+state.region, data=renewable_nona_df)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 6
reg.summary<-summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(renewable.source ~ log_units_per + Access.Code +
## region + `facility_10+` + state.region, data = renewable_nona_df)
## 7 Variables (and intercept)
## Forced in Forced out
## log_units_per FALSE FALSE
## Access.Codepublic FALSE FALSE
## regionSouth, North Central, West FALSE FALSE
## `facility_10+`other FALSE FALSE
## state.regionSouth FALSE FALSE
## state.regionNorth Central FALSE FALSE
## state.regionWest FALSE FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
## log_units_per Access.Codepublic regionSouth, North Central, West
## 1 ( 1 ) " " " " "*"
## 2 ( 1 ) "*" " " "*"
## 3 ( 1 ) "*" " " "*"
## 4 ( 1 ) "*" "*" "*"
## 5 ( 1 ) "*" "*" "*"
## 6 ( 1 ) "*" "*" "*"
## `facility_10+`other state.regionSouth state.regionNorth Central
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) "*" " " " "
## 4 ( 1 ) "*" " " " "
## 5 ( 1 ) "*" " " " "
## 6 ( 1 ) "*" "*" "*"
## state.regionWest
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) "*"
## 6 ( 1 ) " "
# look at rsq
adjrsq_df<-data.frame(rsq=c(reg.summary$rsq,reg.summary$adjr2),
type=c(rep("rsq",6),rep("rsq_adj",6)),
size=rep(1:6,2))
ggplot(adjrsq_df, aes(x=size, y=rsq, color=type))+
geom_point()+
geom_line()+
ggtitle("R-squared vs Adj R-squared")
# other metrics
metric_df<-data.frame(metric=c(reg.summary$rsq,
reg.summary$adjr2,
reg.summary$cp,
reg.summary$bic),
type=c(rep("rsq",6),
rep("rsq_adj",6),
rep("cp",6),
rep("bic",6)),
size=rep(1:6,4))
which.min(reg.summary$bic)#3
## [1] 3
which.min(reg.summary$cp)#4
## [1] 4
which.max(reg.summary$rsq)#6
## [1] 6
which.max(reg.summary$adjr2)#4
## [1] 4
vline.dat <- data.frame(type=unique(metric_df$type), vl=c(6, 4, 4, 3))
ggplot(metric_df, aes(x=size, y=metric, color=type))+
geom_point()+
geom_line()+
ggtitle("Model Comparison Metrics")+
geom_vline(aes(xintercept=vl), data=vline.dat, lty=2) +
facet_grid(type~., scales = "free_y")
In m3 there are three negative slopes and one positive slope, indicating that three of the explanatory variables are negatively associated with the probability that there will be a renewable source on-site. It appears that higher log number of units per address; a public access code; and being a facility type other than a college, muni gov, office bldg, school, or state gov, are all negatively associated with the probability that there will be a renewable source on-site. If the region is South, North Central, or West (aka not Northeast) this is positively associated with the probability that there will be a renewable source on-site.
More simply put, there’s a higher probability of having a renewable energy source on-site at an EV charging station if the access code is private, it’s not in the Northeast, and it is a college, municipal government, office building, school, or state government facility type.
#Dividing the Data
set.seed(501)
train_indices <- sample(1:nrow(renewable_nona_df), size = floor(nrow(renewable_nona_df)/2))
train_data <- renewable_nona_df %>%
slice(train_indices)
test_data <- renewable_nona_df %>%
slice(-train_indices)
#Training
m1B <- glm(renewable.source ~ Access.Code, data = train_data, family = "binomial")
summary(m1B)
##
## Call:
## glm(formula = renewable.source ~ Access.Code, family = "binomial",
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7825 0.2052 0.6394 0.6394 0.6394
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.850 1.011 3.81 0.000139 ***
## Access.Codepublic -2.366 1.038 -2.28 0.022586 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 133.87 on 166 degrees of freedom
## Residual deviance: 123.65 on 165 degrees of freedom
## AIC: 127.65
##
## Number of Fisher Scoring iterations: 6
m3B <- glm(renewable.source ~ log_units_per + Access.Code + region +`facility_10+`, data = train_data, family = "binomial")
summary(m3B)
##
## Call:
## glm(formula = renewable.source ~ log_units_per + Access.Code +
## region + `facility_10+`, family = "binomial", data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7210 0.1353 0.2588 0.4819 1.5340
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.0980 1.2601 2.458 0.0140 *
## log_units_per -0.4285 0.3038 -1.411 0.1584
## Access.Codepublic -1.6064 1.1069 -1.451 0.1467
## regionSouth, North Central, West 2.1853 0.5349 4.086 4.4e-05 ***
## `facility_10+`other -1.1115 0.6093 -1.824 0.0681 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 133.274 on 164 degrees of freedom
## Residual deviance: 95.525 on 160 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 105.53
##
## Number of Fisher Scoring iterations: 6
For the first model, which only considers the influence of access code on the response, the error rate is very high at 0.827 or 82.7%.
In the more complex model, which includes log_units_per, access code, region, and facility type, the error rate is much lower at 0.095 or 9.5%.
#Testing
## access code only
test1 <- predict(m1B, newdata = test_data, type = "response")
test_mat1<-data.frame(renewable=test_data$renewable.source, predRenewable=test1>.5)
table(test_mat1$renewable, test_mat1$predRenewable)
##
## TRUE
## 0 29
## 1 139
# error
(139)/length(test1)
## [1] 0.827381
## all four variables (final model)
test3 <- predict(m3B, newdata = test_data, type = "response")
test_mat3<-data.frame(renewable=test_data$renewable.source, predRenewable=test3>.5)
table(test_mat3$renewable, test_mat3$predRenewable)
##
## FALSE TRUE
## 0 15 14
## 1 2 136
#error
(2 + 14)/length(test3)
## [1] 0.0952381
#install.packages
library(ISLR)
#install.packages("rpart")
library(rpart)
#install.packages("MASS")
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(tidyverse)
# new data frame: access code, log units per, region dichotomized, renewable source, facility 10+
tree_df2 <- distinct(renewable_nona_df[, c(1, 12, 11, 10, 13)]) #maybe trade col 9 for col 11 if needed
tree_df2$`facility_10+` <- as.factor(tree_df2$`facility_10+`)
# test and train
set.seed(2)
dim(tree_df2)
## [1] 90 5
# split in half
train2<-sample(1:nrow(tree_df2), 45)
# use method="class" for classification
tree.df2<-rpart(renewable.source~., data=tree_df2,
subset=train2,
control=rpart.control(minsplit=1),
method="class")
#summary(tree.df2)
#install.packages("rpart.plot")
library(rpart.plot)
# plot
prp(tree.df2, faclen = 0, cex = 0.7, extra=1, space=.5, ycompress=TRUE, type=2)
#Using cross-validation to select complexity
printcp(tree.df2) # display the results
##
## Classification tree:
## rpart(formula = renewable.source ~ ., data = tree_df2, subset = train2,
## method = "class", control = rpart.control(minsplit = 1))
##
## Variables actually used in tree construction:
## [1] Access.Code facility_10+ log_units_per region
##
## Root node error: 13/45 = 0.28889
##
## n= 45
##
## CP nsplit rel error xerror xstd
## 1 0.307692 0 1.000000 1.00000 0.23388
## 2 0.153846 1 0.692308 0.84615 0.22176
## 3 0.076923 3 0.384615 0.76923 0.21453
## 4 0.025641 5 0.230769 0.69231 0.20641
## 5 0.019231 8 0.153846 0.84615 0.22176
## 6 0.010000 12 0.076923 0.84615 0.22176
plotcp(tree.df2) # visualize cross-validation results
# create additional plots
par(mfrow=c(1,2)) # two plots on one page
rsq.rpart(tree.df2) # visualize cross-validation results
##
## Classification tree:
## rpart(formula = renewable.source ~ ., data = tree_df2, subset = train2,
## method = "class", control = rpart.control(minsplit = 1))
##
## Variables actually used in tree construction:
## [1] Access.Code facility_10+ log_units_per region
##
## Root node error: 13/45 = 0.28889
##
## n= 45
##
## CP nsplit rel error xerror xstd
## 1 0.307692 0 1.000000 1.00000 0.23388
## 2 0.153846 1 0.692308 0.84615 0.22176
## 3 0.076923 3 0.384615 0.76923 0.21453
## 4 0.025641 5 0.230769 0.69231 0.20641
## 5 0.019231 8 0.153846 0.84615 0.22176
## 6 0.010000 12 0.076923 0.84615 0.22176
## Warning in rsq.rpart(tree.df2): may not be applicable for this method
# minimize the error
tree.df2$cptable
## CP nsplit rel error xerror xstd
## 1 0.30769231 0 1.00000000 1.0000000 0.2338821
## 2 0.15384615 1 0.69230769 0.8461538 0.2217615
## 3 0.07692308 3 0.38461538 0.7692308 0.2145282
## 4 0.02564103 5 0.23076923 0.6923077 0.2064063
## 5 0.01923077 8 0.15384615 0.8461538 0.2217615
## 6 0.01000000 12 0.07692308 0.8461538 0.2217615
which.min(tree.df2$cptable[,"xerror"])
## 4
## 4
tree.df2$cptable[which.min(tree.df2$cptable[,"xerror"]),"CP"]
## [1] 0.02564103
# prune the tree
pfit<- prune(tree.df2, cp=0.026) # from cptable
# plot the pruned tree
prp(pfit, faclen = 0, cex = 0.7, extra=1, space=.5, ycompress=TRUE, type=2)