#knitr::opts_chunk$set(echo=TRUE, warning = FALSE, message = FALSE, results = 'hide', include=TRUE, fig.keep = 'all')
## SET UP ##
#Setup for our analysis, including necessary packages.
rm(list = ls())
setwd("~/Documents/UPenn/Penn 2022 Spring/Planning by Numbers/Assignment 3")
library(tidyverse)
library(tidycensus)
library(ggplot2)
library(dplyr)
library(data.table)
require("ggrepel")
library(hrbrthemes)
library(viridis)
library(ggridges)
library(lubridate)
library(tigris)
library(gganimate)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(mapview)
library(ggcorrplot)
library(RColorBrewer)
library(stargazer)
library(correlationfunnel)
library(tidyquant)
library(plotly)
library(shiny)
library(bslib)
library(shinydashboard)
library(DT)
library(car)
library(AICcmodavg)
library(corrplot)
library(ggthemes)
library(mlogit)
require(mlogit)
Part A: Change in Land Use for Chester County, PA between 1992-2001
Utilizing 500-meter raster cell locations matched to corresponding land cover & use, transportation, and demographic data for 1992 and 2001, we evaluated factors that impacted changes in land usage in Chester County, PA. To study this data and identify possible determinants of land use change, in particular conversion of non-urban land to urban uses, we evaluated each potential determinant’s relationship or correlation with land use.
Taking the potential determinants (variables) that displayed a relationship to the land use changes data, we developed and tested a series of models, eventually determining that the percentage of white population, proximity within 100 meters of a SEPTA rail station, distance from regional rail stations, and distance from parks were the best predictors of the likelihood an area would be converted to urban uses or remain unconverted.
We determined that for every unit increase in our identified determinants, there would be the following impact on the probability that land would be converted to urban uses.
Being located within 100 meters of a SEPTA rail station corresponds to 300% increase in the odds of land conversion.
A unit increase in the percentage of white population corresponds with a 4% decrease in the odds of land conversion.
Unit increases in both distance to the closest park and regional rail station correspond with a less than 1% decrease in the odds of land conversion.
From our model, we developed a series of scenarios to predict how changes within each variable impact land conversion probability. Largely confirming the values seen above, our scenario plot displayed trends indicating that as white population increases, the likelihood of conversion to urban land uses decreases. Our scenarios also indicated that the presence of a rail station, being within 100 meters of a rail station has a very high (over 50%) predicted probability of urban land use conversion when distance to parks and other regional rail stations were held at the middle (median) value of their respective data sets.
Given proximity to park space and regional rail stations have very low predicted odds ratios for land use change, these variables are likely not the most useful additions to the model, although their additions did increase the overall fit of the model to the data. Given the substantial impact of SEPTA rail infrastructure within the model it is likely that other variables relating to transit infrastructure, such as REGRAIL300 and FOURLNE300, would improve model performance.
Part B: Factors Impacting Commuting Transportation Choices
Part B.1
A commuter’s choice to drive to work was found to be the most correlated with the total number of vehicles in the household, trip duration, age, and income. Among the five binomial logit models we constructed, however, we observed a significant influence of trip duration, age, and gender on the probability of driving to work.
For a year’s increase in age, the chances of driving to work increase by 2.6%
For a minute’s increase in trip duration, the chances of driving to work decreases by 1.3%
Measuring the influence of gender against itself revealed that women are 39% less likely to drive to work than men.
We crafted three scenarios to predict how the probability of driving to work would change with trip duration for different ages (25, 45, and 65 years respectively) within a fixed income value ($62,000). Older age groups had higher probabilities of driving to work - closer to 0.92 than the youngest age group which started closer to 0.8. The prediction framework visualized an overall decrease in the probability of driving to work, with a sharper decline around 50 to 100 minutes. This is consistent with the phenomenon of time and cost savings being paramount to a traveler - as the trip duration increases, the utility of travel decreases, and people across ages would be less likely to drive to work.
Part B.2
In addition to income, age, and trip duration, we also measured the effect of a person having a disability on the chances of walking or biking to work. In contrast to the probability of driving to work, our model revealed that:
A unit increase in age results in a 2.1% decrease in the odds of walking or biking to work
A unit increase in trip duration decreases the odds of walking or biking by 2.2%
A unit increase in income results in a 0.0001% decrease in the odds of walking or biking to work
A person with disabilities is 95% less likely to walk or bike to work. Obviously.
Similar to our previous approach, we constructed three scenarios to measure the predicted probability of walking or biking to work against trip duration for 3 different age groups (again, 25, 45, and 65 years old), with an income of $62,000.
Consistent with our model results, our prediction frameworks found that younger people were more likely to walk or bike to work. However, when compared to driving, the overall predicted probability of walking or biking to work starts much lower (at about 0.28), falls sharply around trip duration of about 30 minutes, and tends to 0 around trip duration of 2 hours. This really does check out with general reality.
Part B.3
Plotting the frequency distribution of trip modes highlighted an overwhelming number of trips made by car (almost 92% more than walking, biking, and transit combined). The multinomial model allowed us to analyze mode choice relative to driving. We found time and trip duration to be the most statistically significant variables, demonstrating that one is less likely to opt for walking, biking, or transit over a car with increases in time and trip duration.
Consistent with our observations using binomial logit, our “leanest and meanest” model allowed us to infer that:
A unit increase in age results in a 0.33% decrease in the odds of walking or biking to work, but a 0.16% increase in the odds of taking transit.
While controlling for driving, a minute’s increase in trip duration increases the odds of taking transit by 39%
Overall, both binomial and multinomial models could have been further strengthened by variables such as transit fares, housing cost burden, distance from a transit line, and quantity of walking and biking infrastructure respective to a particular sample group. The effect of income could also have been measured better by constructing categories relative to each other, as the effect of a dollar increase in household income did not (and would not) have any significant effect on mode choice or travel behavior.
Remove percent college graduate as numbers appear to be incorrect.
dat_chester <- read.csv("Chester_Urban_Growth.csv", header = TRUE)
#head(dat_chester)
#summary(dat_chester)
dat_chester1 <- subset(dat_chester, select = -c(PCT_COLGRD)) #values appear to be incorrect
#head(dat_chester1)
summary(dat_chester1)
## CHESCO X Y SLOPE
## Min. :1 Min. :1586501 Min. :152678 Min. : 0.050
## 1st Qu.:1 1st Qu.:1602001 1st Qu.:173178 1st Qu.: 1.344
## Median :1 Median :1610001 Median :186178 Median : 2.048
## Mean :1 Mean :1610704 Mean :185065 Mean : 2.407
## 3rd Qu.:1 3rd Qu.:1618502 3rd Qu.:197178 3rd Qu.: 3.036
## Max. :1 Max. :1638502 Max. :214179 Max. :12.042
## FOURLNE300 INTERST800 REGRAIL300 RAILSTN100
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4971 Mean :0.3426 Mean :0.1193 Mean :0.01253
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## PARKS500M WATER100 CITBORO_10 DIST_WATER
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. : 0
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 1414
## Median :0.00000 Median :0.00000 Median :0.00000 Median : 3162
## Mean :0.02276 Mean :0.05733 Mean :0.07015 Mean : 3753
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 5701
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :13730
## DIST_RAILS DIST_REGRA DIST_PASSR DIST_4LNE_
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 6671 1st Qu.: 6265 1st Qu.: 1000 1st Qu.: 1414
## Median :12500 Median :12176 Median : 2236 Median : 3000
## Mean :13233 Mean :12918 Mean : 2763 Mean : 3416
## 3rd Qu.:18201 3rd Qu.:18000 3rd Qu.: 4123 3rd Qu.: 5315
## Max. :40771 Max. :40771 Max. :11181 Max. :11511
## DIST_INTER DIST_PARKS PAL_WETLND FARM92
## Min. : 0 Min. : 0 Min. :0.0000 Min. :0.000000
## 1st Qu.: 5590 1st Qu.: 2500 1st Qu.:0.0000 1st Qu.:0.000000
## Median :12530 Median : 4472 Median :0.0000 Median :0.000000
## Mean :15640 Mean : 5161 Mean :0.1052 Mean :0.009219
## 3rd Qu.:24076 3rd Qu.: 7106 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :49359 Max. :16808 Max. :1.0000 Max. :1.000000
## PASTURE92 FOREST92 URBAN01 URBAN92
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4994 Mean :0.3518 Mean :0.0677 Mean :0.04883
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## POPDEN90 MEDINC90 MEDHSEVAL_ PCT_WHITE_
## Min. : 0 Min. : 0 Min. : 0 Min. : 34.00
## 1st Qu.: 21 1st Qu.: 37986 1st Qu.:128000 1st Qu.: 94.00
## Median : 61 Median : 45074 Median :150200 Median : 97.00
## Mean : 189 Mean : 48876 Mean :167526 Mean : 94.74
## 3rd Qu.: 154 3rd Qu.: 56394 3rd Qu.:214200 3rd Qu.: 98.00
## Max. :13000 Max. :103043 Max. :384000 Max. :100.00
## PCT_SFHOME PCT_POV_90 PCT_HSB_19
## Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 77.00 1st Qu.: 2.000 1st Qu.: 14.00
## Median : 86.00 Median : 4.000 Median : 19.00
## Mean : 82.67 Mean : 4.424 Mean : 23.11
## 3rd Qu.: 93.00 3rd Qu.: 6.000 3rd Qu.: 31.00
## Max. :100.00 Max. :29.000 Max. :100.00
#dim(dat_chester1)
#[1] 6942 31
Population Density in 1990
Values skewed, create binary variable.
summary(dat_chester1$POPDEN90)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 21 61 189 154 13000
plotTheme <- theme(
axis.title = element_text(
size = 11, lineheight = .9, family = "Comic Sans MS"),
plot.title = element_text(size=16, face = "bold", hjust = 0.5, family = "Comic Sans MS"),
plot.background = element_blank())
dat_chester1$hi_POPDEN90 <- ifelse(dat_chester1$POPDEN90 > mean(dat_chester1$POPDEN90), 1, 0)
qplot(dat_chester1$POPDEN90,
geom="histogram",
binwidth = 157,
main = "Population Density",
xlab = "Average Pop Density",
fill=I("lawngreen"),
col=I("lawngreen"),
alpha=I(.3)) + plotTheme
Median Household Income in 1990
summary(dat_chester1$MEDINC90)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37986 45074 48876 56394 103043
#number of bins: sqrt(6942)
qplot(dat_chester1$MEDINC90,
geom="histogram",
binwidth = 1241,
main = "Median Household Income",
xlab = "$Moolah$",
fill=I("lawngreen"),
col=I("lawngreen"),
alpha=I(.3)) + plotTheme
Median Home Value in 1990
summary(dat_chester1$MEDHSEVAL_)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 128000 150200 167526 214200 384000
#number of bins: sqrt(6942) = 83
#bin width: (max-min) / #of bins
qplot(dat_chester1$MEDHSEVAL_,
geom="histogram",
binwidth = 4627,
main = "Median Home Value",
xlab = "$Moolah$",
fill=I("lawngreen"),
col=I("lawngreen"),
alpha=I(.3)) + plotTheme
Percentage of White Pop in 1990
Values skewed, create binary variable.
summary(dat_chester1$PCT_WHITE_)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34.00 94.00 97.00 94.74 98.00 100.00
dat_chester1$hi_PCT_WHITE_ <- ifelse(dat_chester1$PCT_WHITE_ > mean(dat_chester1$PCT_WHITE_), 1, 0)
qplot(dat_chester1$PCT_WHITE_,
geom="histogram",
binwidth = 10,
main = "White Population",
xlab = "%Percent%",
fill=I("lawngreen"),
col=I("lawngreen"),
alpha=I(.3)) + plotTheme
Percentage of Single-Family Homes in 1990
Values skewed, create binary variable.
summary(dat_chester1$PCT_SFHOME)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 77.00 86.00 82.67 93.00 100.00
dat_chester1$hi_PCT_SFHOME <- ifelse(dat_chester1$PCT_SFHOME > mean(dat_chester1$PCT_SFHOME), 1, 0)
qplot(dat_chester1$PCT_SFHOME,
geom="histogram",
binwidth = 10,
main = "Single-Family Homes",
xlab = "%Percent%",
fill=I("lawngreen"),
col=I("lawngreen"),
alpha=I(.3)) + plotTheme
Percentage of Households Below Poverty Line in 1990
summary(dat_chester1$PCT_POV_90)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 4.000 4.424 6.000 29.000
qplot(dat_chester1$PCT_POV_90,
geom="histogram",
binwidth = 10,
main = "Households Below Poverty Line",
xlab = "%Percent%",
fill=I("lawngreen"),
col=I("lawngreen"),
alpha=I(.3)) + plotTheme
Percentage of Homes Built Before 1950 in 1990
summary(dat_chester1$PCT_HSB_19)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 14.00 19.00 23.11 31.00 100.00
qplot(dat_chester1$PCT_HSB_19,
geom="histogram",
binwidth = 10,
main = "Homes Built Before 1950",
xlab = "%Percent%",
fill=I("lawngreen"),
col=I("lawngreen"),
alpha=I(.3)) + plotTheme
Created a new data frame including a conditional binary variable column of raster cells that were either farmland, pasture or forest in 1992 and were converted to urban uses by 2001.
#create binary variable column, assigning all "0"
dat_chester1$CHNG_URB <- 0
#create conditional value "1" for new column based on whether a cell was farm, pasture or forest in 1992
#and whether it changed to urban in 2001
dat_chester1 <- dat_chester1%>%
mutate(CHNG_URB = if_else ((FARM92 == "1" |
PASTURE92 == "1" | FOREST92 == "1") & URBAN01 == "1", 1, CHNG_URB))
summary(dat_chester1)
## CHESCO X Y SLOPE
## Min. :1 Min. :1586501 Min. :152678 Min. : 0.050
## 1st Qu.:1 1st Qu.:1602001 1st Qu.:173178 1st Qu.: 1.344
## Median :1 Median :1610001 Median :186178 Median : 2.048
## Mean :1 Mean :1610704 Mean :185065 Mean : 2.407
## 3rd Qu.:1 3rd Qu.:1618502 3rd Qu.:197178 3rd Qu.: 3.036
## Max. :1 Max. :1638502 Max. :214179 Max. :12.042
## FOURLNE300 INTERST800 REGRAIL300 RAILSTN100
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4971 Mean :0.3426 Mean :0.1193 Mean :0.01253
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## PARKS500M WATER100 CITBORO_10 DIST_WATER
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. : 0
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.: 1414
## Median :0.00000 Median :0.00000 Median :0.00000 Median : 3162
## Mean :0.02276 Mean :0.05733 Mean :0.07015 Mean : 3753
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.: 5701
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :13730
## DIST_RAILS DIST_REGRA DIST_PASSR DIST_4LNE_
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 6671 1st Qu.: 6265 1st Qu.: 1000 1st Qu.: 1414
## Median :12500 Median :12176 Median : 2236 Median : 3000
## Mean :13233 Mean :12918 Mean : 2763 Mean : 3416
## 3rd Qu.:18201 3rd Qu.:18000 3rd Qu.: 4123 3rd Qu.: 5315
## Max. :40771 Max. :40771 Max. :11181 Max. :11511
## DIST_INTER DIST_PARKS PAL_WETLND FARM92
## Min. : 0 Min. : 0 Min. :0.0000 Min. :0.000000
## 1st Qu.: 5590 1st Qu.: 2500 1st Qu.:0.0000 1st Qu.:0.000000
## Median :12530 Median : 4472 Median :0.0000 Median :0.000000
## Mean :15640 Mean : 5161 Mean :0.1052 Mean :0.009219
## 3rd Qu.:24076 3rd Qu.: 7106 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :49359 Max. :16808 Max. :1.0000 Max. :1.000000
## PASTURE92 FOREST92 URBAN01 URBAN92
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4994 Mean :0.3518 Mean :0.0677 Mean :0.04883
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## POPDEN90 MEDINC90 MEDHSEVAL_ PCT_WHITE_
## Min. : 0 Min. : 0 Min. : 0 Min. : 34.00
## 1st Qu.: 21 1st Qu.: 37986 1st Qu.:128000 1st Qu.: 94.00
## Median : 61 Median : 45074 Median :150200 Median : 97.00
## Mean : 189 Mean : 48876 Mean :167526 Mean : 94.74
## 3rd Qu.: 154 3rd Qu.: 56394 3rd Qu.:214200 3rd Qu.: 98.00
## Max. :13000 Max. :103043 Max. :384000 Max. :100.00
## PCT_SFHOME PCT_POV_90 PCT_HSB_19 hi_POPDEN90
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 77.00 1st Qu.: 2.000 1st Qu.: 14.00 1st Qu.:0.0000
## Median : 86.00 Median : 4.000 Median : 19.00 Median :0.0000
## Mean : 82.67 Mean : 4.424 Mean : 23.11 Mean :0.2025
## 3rd Qu.: 93.00 3rd Qu.: 6.000 3rd Qu.: 31.00 3rd Qu.:0.0000
## Max. :100.00 Max. :29.000 Max. :100.00 Max. :1.0000
## hi_PCT_WHITE_ hi_PCT_SFHOME CHNG_URB
## Min. :0.0000 Min. :0.000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.00000
## Median :1.0000 Median :1.000 Median :0.00000
## Mean :0.7151 Mean :0.574 Mean :0.03673
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.000 Max. :1.00000
#NEXT build binomial logit model of determinants for land use
Create Correlation Matrix of Potential Variables for Study
#identify determinants of agr/pasture/forest to urban use change ('92-'01)
colnamCleaning <- c("CHNG_URB", "FOURLNE300", "PARKS500M", "RAILSTN100", "REGRAIL300", "WATER100", "CITBORO_10",
"DIST_WATER", "DIST_RAILS","DIST_REGRA", "DIST_PASSR", "DIST_4LNE","DIST_INTER", "DIST_PARKS", "PAL_WETLND", "POPDEN90","MEDINC90", "MEDHSEVAL_", "PCT_WHITE_", "hi_PCT_WHITE_", "PCT_POV_90", "PCT_HSB_19")
clean_datchester1 <- dat_chester1[ , which(names(dat_chester1) %in% colnamCleaning)]
cortable <- cor(clean_datchester1)
round(cortable, 2)
## FOURLNE300 REGRAIL300 RAILSTN100 PARKS500M WATER100 CITBORO_10
## FOURLNE300 1.00 0.30 0.11 0.02 -0.08 0.10
## REGRAIL300 0.30 1.00 0.31 -0.04 -0.05 0.06
## RAILSTN100 0.11 0.31 1.00 -0.02 -0.02 0.06
## PARKS500M 0.02 -0.04 -0.02 1.00 0.03 -0.01
## WATER100 -0.08 -0.05 -0.02 0.03 1.00 0.04
## CITBORO_10 0.10 0.06 0.06 -0.01 0.04 1.00
## DIST_WATER 0.18 0.20 0.06 -0.04 -0.32 -0.06
## DIST_RAILS -0.23 -0.50 -0.18 0.02 0.09 -0.08
## DIST_REGRA -0.24 -0.51 -0.17 0.03 0.09 -0.08
## DIST_PASSR -0.35 -0.27 -0.08 0.02 0.00 -0.25
## DIST_INTER -0.17 -0.32 -0.10 -0.05 0.08 -0.03
## DIST_PARKS -0.15 -0.11 -0.05 -0.22 0.00 -0.06
## PAL_WETLND 0.01 0.00 -0.03 0.00 0.02 -0.02
## POPDEN90 0.14 0.17 0.13 -0.04 -0.03 0.27
## MEDINC90 0.23 0.21 0.01 0.03 -0.07 -0.17
## MEDHSEVAL_ 0.18 0.16 0.02 0.04 -0.07 -0.18
## PCT_WHITE_ -0.20 -0.03 -0.03 0.05 -0.02 -0.23
## PCT_POV_90 -0.14 -0.16 -0.04 -0.06 0.05 0.18
## PCT_HSB_19 -0.15 -0.33 -0.02 -0.03 0.00 0.25
## hi_PCT_WHITE_ -0.14 -0.13 -0.11 0.07 0.03 -0.22
## CHNG_URB 0.16 0.21 0.16 -0.01 -0.02 0.10
## DIST_WATER DIST_RAILS DIST_REGRA DIST_PASSR DIST_INTER DIST_PARKS
## FOURLNE300 0.18 -0.23 -0.24 -0.35 -0.17 -0.15
## REGRAIL300 0.20 -0.50 -0.51 -0.27 -0.32 -0.11
## RAILSTN100 0.06 -0.18 -0.17 -0.08 -0.10 -0.05
## PARKS500M -0.04 0.02 0.03 0.02 -0.05 -0.22
## WATER100 -0.32 0.09 0.09 0.00 0.08 0.00
## CITBORO_10 -0.06 -0.08 -0.08 -0.25 -0.03 -0.06
## DIST_WATER 1.00 -0.28 -0.29 0.03 -0.25 0.06
## DIST_RAILS -0.28 1.00 1.00 0.31 0.80 0.17
## DIST_REGRA -0.29 1.00 1.00 0.31 0.81 0.17
## DIST_PASSR 0.03 0.31 0.31 1.00 0.20 0.14
## DIST_INTER -0.25 0.80 0.81 0.20 1.00 0.42
## DIST_PARKS 0.06 0.17 0.17 0.14 0.42 1.00
## PAL_WETLND 0.01 -0.07 -0.07 0.00 -0.08 -0.05
## POPDEN90 0.04 -0.18 -0.18 -0.15 -0.12 -0.10
## MEDINC90 0.24 -0.39 -0.39 0.05 -0.31 -0.23
## MEDHSEVAL_ 0.36 -0.37 -0.37 0.10 -0.35 -0.24
## PCT_WHITE_ 0.08 -0.06 -0.07 0.22 -0.24 -0.14
## PCT_POV_90 -0.03 0.21 0.22 0.05 0.21 0.18
## PCT_HSB_19 0.04 0.28 0.28 0.10 0.19 0.21
## hi_PCT_WHITE_ -0.08 0.03 0.03 0.27 -0.16 -0.22
## CHNG_URB 0.04 -0.15 -0.15 -0.13 -0.09 -0.08
## PAL_WETLND POPDEN90 MEDINC90 MEDHSEVAL_ PCT_WHITE_ PCT_POV_90
## FOURLNE300 0.01 0.14 0.23 0.18 -0.20 -0.14
## REGRAIL300 0.00 0.17 0.21 0.16 -0.03 -0.16
## RAILSTN100 -0.03 0.13 0.01 0.02 -0.03 -0.04
## PARKS500M 0.00 -0.04 0.03 0.04 0.05 -0.06
## WATER100 0.02 -0.03 -0.07 -0.07 -0.02 0.05
## CITBORO_10 -0.02 0.27 -0.17 -0.18 -0.23 0.18
## DIST_WATER 0.01 0.04 0.24 0.36 0.08 -0.03
## DIST_RAILS -0.07 -0.18 -0.39 -0.37 -0.06 0.21
## DIST_REGRA -0.07 -0.18 -0.39 -0.37 -0.07 0.22
## DIST_PASSR 0.00 -0.15 0.05 0.10 0.22 0.05
## DIST_INTER -0.08 -0.12 -0.31 -0.35 -0.24 0.21
## DIST_PARKS -0.05 -0.10 -0.23 -0.24 -0.14 0.18
## PAL_WETLND 1.00 0.00 0.03 0.03 0.01 -0.02
## POPDEN90 0.00 1.00 -0.02 0.00 -0.09 0.04
## MEDINC90 0.03 -0.02 1.00 0.80 0.22 -0.48
## MEDHSEVAL_ 0.03 0.00 0.80 1.00 0.28 -0.35
## PCT_WHITE_ 0.01 -0.09 0.22 0.28 1.00 -0.33
## PCT_POV_90 -0.02 0.04 -0.48 -0.35 -0.33 1.00
## PCT_HSB_19 -0.03 0.06 -0.48 -0.31 -0.14 0.37
## hi_PCT_WHITE_ 0.03 -0.11 0.22 0.18 0.62 -0.27
## CHNG_URB 0.00 0.12 0.04 0.03 -0.06 -0.03
## PCT_HSB_19 hi_PCT_WHITE_ CHNG_URB
## FOURLNE300 -0.15 -0.14 0.16
## REGRAIL300 -0.33 -0.13 0.21
## RAILSTN100 -0.02 -0.11 0.16
## PARKS500M -0.03 0.07 -0.01
## WATER100 0.00 0.03 -0.02
## CITBORO_10 0.25 -0.22 0.10
## DIST_WATER 0.04 -0.08 0.04
## DIST_RAILS 0.28 0.03 -0.15
## DIST_REGRA 0.28 0.03 -0.15
## DIST_PASSR 0.10 0.27 -0.13
## DIST_INTER 0.19 -0.16 -0.09
## DIST_PARKS 0.21 -0.22 -0.08
## PAL_WETLND -0.03 0.03 0.00
## POPDEN90 0.06 -0.11 0.12
## MEDINC90 -0.48 0.22 0.04
## MEDHSEVAL_ -0.31 0.18 0.03
## PCT_WHITE_ -0.14 0.62 -0.06
## PCT_POV_90 0.37 -0.27 -0.03
## PCT_HSB_19 1.00 -0.13 -0.05
## hi_PCT_WHITE_ -0.13 1.00 -0.09
## CHNG_URB -0.05 -0.09 1.00
N = cor(clean_datchester1)
corrplot(N, method="square", order = 'FPC')
Variables with highest degree of correlation with CHNG_URB
| VARIABLE | DEGREE |
|---|---|
| REGRAIL300 | 0.21 |
| FOURLNE300 | 0.16 |
| RAILSTN100 | 0.16 |
| DIST_REGRA | 0.15 |
| DIST_RAILS | 0.15 |
| DIST_PASSR | 0.13 |
| DIST_INTER | 0.09 |
| DIST_PARKS | 0.08 |
| PCT_WHITE_ | 0.06 |
| PCT_HSB_19 | 0.05 |
Correlation Matrix of Potential Independent Variables
REGRAIL300 & DIST_REGRA & DIST_PASSR are highly correlated (>0.30) with almost all other independent variables and should not be used in the same model as any of the others.
This leaves independent variables with correlations < 30: PCT_WHITE_, RAILSTN100, DIST_PARKS & DIST_REGRA
colnamCleaning2 <- c("RAILSTN100", "REGRAIL300", "DIST_RAILS", "DIST_PASSR", "hi_POPDEN90", "FOURLNE300", "DIST_REGRA", "DIST_INTER", "DIST_PARKS", "PCT_WHITE_", "PCT_HSB_19")
clean_datchester2 <- dat_chester1[ , which(names(dat_chester1) %in% colnamCleaning2)]
cortable <- cor(clean_datchester2)
round(cortable, 2)
## FOURLNE300 REGRAIL300 RAILSTN100 DIST_RAILS DIST_REGRA DIST_PASSR
## FOURLNE300 1.00 0.30 0.11 -0.23 -0.24 -0.35
## REGRAIL300 0.30 1.00 0.31 -0.50 -0.51 -0.27
## RAILSTN100 0.11 0.31 1.00 -0.18 -0.17 -0.08
## DIST_RAILS -0.23 -0.50 -0.18 1.00 1.00 0.31
## DIST_REGRA -0.24 -0.51 -0.17 1.00 1.00 0.31
## DIST_PASSR -0.35 -0.27 -0.08 0.31 0.31 1.00
## DIST_INTER -0.17 -0.32 -0.10 0.80 0.81 0.20
## DIST_PARKS -0.15 -0.11 -0.05 0.17 0.17 0.14
## PCT_WHITE_ -0.20 -0.03 -0.03 -0.06 -0.07 0.22
## PCT_HSB_19 -0.15 -0.33 -0.02 0.28 0.28 0.10
## hi_POPDEN90 0.23 0.31 0.14 -0.29 -0.30 -0.18
## DIST_INTER DIST_PARKS PCT_WHITE_ PCT_HSB_19 hi_POPDEN90
## FOURLNE300 -0.17 -0.15 -0.20 -0.15 0.23
## REGRAIL300 -0.32 -0.11 -0.03 -0.33 0.31
## RAILSTN100 -0.10 -0.05 -0.03 -0.02 0.14
## DIST_RAILS 0.80 0.17 -0.06 0.28 -0.29
## DIST_REGRA 0.81 0.17 -0.07 0.28 -0.30
## DIST_PASSR 0.20 0.14 0.22 0.10 -0.18
## DIST_INTER 1.00 0.42 -0.24 0.19 -0.19
## DIST_PARKS 0.42 1.00 -0.14 0.21 -0.16
## PCT_WHITE_ -0.24 -0.14 1.00 -0.14 -0.05
## PCT_HSB_19 0.19 0.21 -0.14 1.00 -0.16
## hi_POPDEN90 -0.19 -0.16 -0.05 -0.16 1.00
MODEL A: MOST HIGHLY CORRELATED VARIABLES WITH LAND USE CHANGE
Model A1: Change in Land Use vs Percent of White Pop
AIC: 2168.6
mod_chester1 <- glm ( CHNG_URB ~ PCT_WHITE_, data=dat_chester1, family = binomial)
summary(mod_chester1)
##
## Call:
## glm(formula = CHNG_URB ~ PCT_WHITE_, family = binomial, data = dat_chester1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6680 -0.2727 -0.2603 -0.2524 2.6431
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.317304 0.572871 -0.554 0.58
## PCT_WHITE_ -0.031448 0.006146 -5.117 3.11e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2185.6 on 6941 degrees of freedom
## Residual deviance: 2164.6 on 6940 degrees of freedom
## AIC: 2168.6
##
## Number of Fisher Scoring iterations: 6
Model A2: Change in Land Use vs Percent of White Pop vs 100 Meters of SEPTA Regional Rail
AIC: 2095.1
mod_chester2 <- glm ( CHNG_URB ~ PCT_WHITE_ + RAILSTN100, data=dat_chester1, family = binomial)
summary(mod_chester2)
##
## Call:
## glm(formula = CHNG_URB ~ PCT_WHITE_ + RAILSTN100, family = binomial,
## data = dat_chester1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0338 -0.2593 -0.2477 -0.2402 2.6796
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.452314 0.596422 -0.758 0.448
## PCT_WHITE_ -0.031099 0.006398 -4.861 1.17e-06 ***
## RAILSTN100 2.530466 0.241771 10.466 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2185.6 on 6941 degrees of freedom
## Residual deviance: 2089.1 on 6939 degrees of freedom
## AIC: 2095.1
##
## Number of Fisher Scoring iterations: 6
Model A3: Change in Land Use vs Percent of White Pop vs 100 Meters of SEPTA Regional Rail vs Distance from Nearest Park
AIC: 2047.4
mod_chester3 <- glm ( CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS,
data=dat_chester1, family = binomial)
summary(mod_chester3)
##
## Call:
## glm(formula = CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS,
## family = binomial, data = dat_chester1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1205 -0.2929 -0.2502 -0.1991 2.8468
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.031e+00 6.247e-01 1.651 0.0987 .
## PCT_WHITE_ -3.944e-02 6.411e-03 -6.152 7.66e-10 ***
## RAILSTN100 2.383e+00 2.437e-01 9.777 < 2e-16 ***
## DIST_PARKS -1.570e-04 2.424e-05 -6.477 9.36e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2185.6 on 6941 degrees of freedom
## Residual deviance: 2039.4 on 6938 degrees of freedom
## AIC: 2047.4
##
## Number of Fisher Scoring iterations: 6
Model A4: Change in Land Use vs Percent of White Pop vs 100 Meters of SEPTA Regional Rail vs Distance from Nearest Park vs Distance to a Regional Rail Line
AIC: 1937.8
mod_chester4 <- glm (CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS + DIST_REGRA, data=dat_chester1, family = binomial)
summary(mod_chester4)
##
## Call:
## glm(formula = CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS +
## DIST_REGRA, family = binomial, data = dat_chester1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0861 -0.2998 -0.2132 -0.1476 3.4005
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.888e+00 6.573e-01 2.873 0.00407 **
## PCT_WHITE_ -3.900e-02 6.875e-03 -5.672 1.41e-08 ***
## RAILSTN100 1.385e+00 2.569e-01 5.393 6.94e-08 ***
## DIST_PARKS -1.205e-04 2.628e-05 -4.585 4.55e-06 ***
## DIST_REGRA -1.051e-04 1.137e-05 -9.249 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2185.6 on 6941 degrees of freedom
## Residual deviance: 1927.8 on 6937 degrees of freedom
## AIC: 1937.8
##
## Number of Fisher Scoring iterations: 7
Analysis by Variance Tables
All models are statistically significant improvements to model 1.
anova(mod_chester1, mod_chester2, mod_chester3, mod_chester4, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: CHNG_URB ~ PCT_WHITE_
## Model 2: CHNG_URB ~ PCT_WHITE_ + RAILSTN100
## Model 3: CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS
## Model 4: CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS + DIST_REGRA
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 6940 2164.6
## 2 6939 2089.1 1 75.496 < 2.2e-16 ***
## 3 6938 2039.4 1 49.734 1.76e-12 ***
## 4 6937 1927.8 1 111.595 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Order of Significance by Step Through
Backward Selection
Run backward selection for model A4.Nothing drops.
AIC = 1937.78 Null Deviance: 2186 Residual Deviance: 1928
step(mod_chester4, direction = "backward")
## Start: AIC=1937.78
## CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS + DIST_REGRA
##
## Df Deviance AIC
## <none> 1927.8 1937.8
## - DIST_PARKS 1 1950.6 1958.6
## - RAILSTN100 1 1952.9 1960.9
## - PCT_WHITE_ 1 1953.7 1961.7
## - DIST_REGRA 1 2039.4 2047.4
##
## Call: glm(formula = CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS +
## DIST_REGRA, family = binomial, data = dat_chester1)
##
## Coefficients:
## (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
## 1.8882136 -0.0389976 1.3852463 -0.0001205 -0.0001051
##
## Degrees of Freedom: 6941 Total (i.e. Null); 6937 Residual
## Null Deviance: 2186
## Residual Deviance: 1928 AIC: 1938
#nothing is dropped
Forward Selection
Setting up the forward selection
modUpper <- glm(formula = CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS +
DIST_REGRA, family = binomial, data = dat_chester1)
intonly <- glm(formula = CHNG_URB ~1, family = binomial, data = dat_chester1)
step(intonly, scope=list(lower=intonly, upper=modUpper), direction="forward")
## Start: AIC=2187.6
## CHNG_URB ~ 1
##
## Df Deviance AIC
## + DIST_REGRA 1 2001.9 2005.9
## + RAILSTN100 1 2108.1 2112.1
## + DIST_PARKS 1 2140.7 2144.7
## + PCT_WHITE_ 1 2164.6 2168.6
## <none> 2185.6 2187.6
##
## Step: AIC=2005.88
## CHNG_URB ~ DIST_REGRA
##
## Df Deviance AIC
## + RAILSTN100 1 1974.8 1980.8
## + PCT_WHITE_ 1 1976.0 1982.0
## + DIST_PARKS 1 1980.2 1986.2
## <none> 2001.9 2005.9
##
## Step: AIC=1980.84
## CHNG_URB ~ DIST_REGRA + RAILSTN100
##
## Df Deviance AIC
## + PCT_WHITE_ 1 1950.6 1958.6
## + DIST_PARKS 1 1953.7 1961.7
## <none> 1974.8 1980.8
##
## Step: AIC=1958.64
## CHNG_URB ~ DIST_REGRA + RAILSTN100 + PCT_WHITE_
##
## Df Deviance AIC
## + DIST_PARKS 1 1927.8 1937.8
## <none> 1950.6 1958.6
##
## Step: AIC=1937.78
## CHNG_URB ~ DIST_REGRA + RAILSTN100 + PCT_WHITE_ + DIST_PARKS
##
## Call: glm(formula = CHNG_URB ~ DIST_REGRA + RAILSTN100 + PCT_WHITE_ +
## DIST_PARKS, family = binomial, data = dat_chester1)
##
## Coefficients:
## (Intercept) DIST_REGRA RAILSTN100 PCT_WHITE_ DIST_PARKS
## 1.8882136 -0.0001051 1.3852463 -0.0389976 -0.0001205
##
## Degrees of Freedom: 6941 Total (i.e. Null); 6937 Residual
## Null Deviance: 2186
## Residual Deviance: 1928 AIC: 1938
#Start: CHNG_URB ~ 1
## AIC = 2187.6
#Step: ~ DIST_REGRA
##AIC = 2005.88
#Step: ~ DIST_REGRA + RAILSTN100
##AIC = 1980.84
#Step: ~ DIST_REGRA + RAILSTN100 + PCT_WHITE_
##AIC = 1958.64
#Step: ~ DIST_REGRA + RAILSTN100 + PCT_WHITE_ + DIST_PARKS
##AIC = 1937.78
#Degrees of Freedom: 6941 Total (i.e. Null); 6937 Residual
#Null Deviance: 2186
#Residual Deviance: 1928
The AIC is at its lowest with all four possible independent variables present. This, model4, has been identified by ANOVA, backward and forward selection as the best fit.
PCT_White_ Coefficient
Unit increase in the percentage of white pop results in a 4% decrease in the odds land conversion
mod_chester4$coefficients
## (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
## 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
# (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
# 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
(exp(-0.0389976059) - 1) * 100
## [1] -3.824699
#unit increase in Y results in a 4% decrease in the odds land conversion
DIST_PARKS Coefficient
Unit increase in distance to the closest park 0.012% decrease in the odds land conversion. Virtually having no impact on land conversion likelihood within this model.
mod_chester4$coefficients
## (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
## 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
# (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
# 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
(exp(-0.0001204970) - 1) * 100
## [1] -0.01204897
#unit increase in Y results in a 4% decrease in the odds land conversion
DIST_REGRA Coefficient
Unit increase in distance to the closest park 0.011% decrease in the odds land conversion. Virtually having no impact on land conversion likelihood within this model.
mod_chester4$coefficients
## (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
## 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
# (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
# 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
(exp(-0.0001051128) - 1) * 100
## [1] -0.01051073
#unit increase in Y results in a 4% decrease in the odds land conversion
RAILSTN100 Coefficient
If a cell is within 100 meters of a SEPTA rail station then there is a 300% increase in the likelihood that the land will be converted to urban use. w
mod_chester4$coefficients
## (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
## 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
# (Intercept) PCT_WHITE_ RAILSTN100 DIST_PARKS DIST_REGRA
# 1.8882135917 -0.0389976059 1.3852463122 -0.0001204970 -0.0001051128
(exp(1.3852463122) - 1) * 100
## [1] 299.581
#unit increase in Y results in a 4% decrease in the odds land conversion
Shows distribution between land converted to urban use (1) and land that remained unconverted (0).
High percentages of white populations correspond with predominately unconverted land.
jit <- ggplot(dat_chester1, aes(PCT_WHITE_, CHNG_URB))
jit + geom_point(shape = 21, size = 4,
fill=I("lawngreen"),
color=("lawngreen"), alpha=I(.2), position = "jitter") + theme(panel.background = element_blank()) + plotTheme
#CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS + DIST_REGRA
Unsurprisingly, a lack of proximity to rail infrastructure corresponds with unconverted land.
jit2 <- ggplot(dat_chester1, aes(RAILSTN100, CHNG_URB))
jit2 + geom_point(shape = 21, size = 4,
fill=I("lawngreen"),
color=("lawngreen"), alpha=I(.2), position = "jitter") + theme(panel.background = element_blank()) + plotTheme
Interestingly, distance from parks seems to have more variation than other land use indicators. While the variable observations are concentrated within a close proximity to park space and unconverted land, there is more spread within this distribution.
jit3 <- ggplot(dat_chester1, aes(DIST_PARKS, CHNG_URB))
jit3 + geom_point(shape = 21, size = 4,
fill=I("lawngreen"),
color=("lawngreen"), alpha=I(.2), position = "jitter") + theme(panel.background = element_blank()) + plotTheme
Using model 4, our best model, we are evaluated how distance from regional rail, parks, and percentage of white population impacts changes in land use. Most notably is the downward curve of each scenario, indicating that as percentage of white population increases the likelihood of conversion to urban uses decreases. Conversely, it is notable that the lower percentages of white populations correspond with a higher likelihood of conversion to urban land. The plot also indicates the importance of rail infrastructure as seen in the red scenario “Pred_NoRAIL_PARKMED_REGA3Q”. In the absence of being within 100 meters from a SEPTA regional rail station and being farther away from a rail station than 75% (3rd quartile) of the data, the likelihood of experiencing conversion to urban land is under 10%. And as white population increases, this likelihood decreases even further.
Interestingly, holding distance from SEPTA rail and regional rail constant, being closer to a park, increases the probability of conversion to urban land uses, as seen between the purple “Pred_YesRAIL_PARKMED_REGAMED” and blue “Pred_YesRAIL_PARK3Q_REGAMED” line.
##CHNG_URB ~ PCT_WHITE_ + RAILSTN100 + DIST_PARKS + DIST_REGRA
newdat_chester <- data.frame(matrix(ncol = 4, nrow = nrow(dat_chester1)))
colnames(newdat_chester) <- c("PCT_WHITE_", "RAILSTN100", "DIST_PARKS", "DIST_REGRA")
newdat_chester$PCT_WHITE_ <- dat_chester1$PCT_WHITE_
newdat_chester$RAILSTN100 <- 0
newdat_chester$DIST_PARKS <- 4472 #MEDIAN
newdat_chester$DIST_REGRA <-18000 #third quar
newdat_chester1<-data.frame(matrix(ncol = 4, nrow = nrow(dat_chester1)))
colnames(newdat_chester1)<- c("PCT_WHITE_", "RAILSTN100", "DIST_PARKS", "DIST_REGRA")
newdat_chester1$PCT_WHITE_ <- dat_chester1$PCT_WHITE_
newdat_chester1$RAILSTN100 <- 1
newdat_chester1$DIST_PARKS <- 4472 #MEDIAN
newdat_chester1$DIST_REGRA <-12176 #MEDIAN
newdat_chester2<-data.frame(matrix(ncol = 4, nrow = nrow(dat_chester1)))
colnames(newdat_chester2)<- c("PCT_WHITE_", "RAILSTN100", "DIST_PARKS", "DIST_REGRA")
newdat_chester2$PCT_WHITE_ <- dat_chester1$PCT_WHITE_
newdat_chester2$RAILSTN100 <- 1
newdat_chester2$DIST_PARKS <- 7106 #THIRD QUAR
newdat_chester2$DIST_REGRA <-12176 #MEDIAN
newdat_chester3<-data.frame(matrix(ncol = 4, nrow = nrow(dat_chester1)))
colnames(newdat_chester3)<- c("PCT_WHITE_", "RAILSTN100", "DIST_PARKS", "DIST_REGRA")
newdat_chester3$PCT_WHITE_ <- dat_chester1$PCT_WHITE_
newdat_chester3$RAILSTN100 <- 1
newdat_chester3$DIST_PARKS <- 7106 #THIRD QUAR
newdat_chester3$DIST_REGRA <-18000 #THIRD QUAR
pred_dat<- data.frame(matrix(ncol = 3, nrow = nrow(dat_chester1)))
colnames(pred_dat)<- c("PCT_WHITE_", "Pred_NoRAIL_PARKMED_REGA3Q", "Pred_YesRAIL_PARKMED_REGAMED")
pred_dat$PCT_WHITE_<-dat_chester1$PCT_WHITE_
pred_dat$Pred_NoRAIL_PARKMED_REGA3Q<- predict(mod_chester4, newdat_chester, type="response")
pred_dat$Pred_YesRAIL_PARKMED_REGAMED<- predict(mod_chester4, newdat_chester1, type="response")
pred_dat<- data.frame(matrix(ncol = 4, nrow = nrow(dat_chester1)))
colnames(pred_dat)<- c("PCT_WHITE_", "Pred_NoRAIL_PARKMED_REGA3Q", "Pred_YesRAIL_PARKMED_REGAMED", "Pred_YesRAIL_PARK3Q_REGAMED")
pred_dat$PCT_WHITE_ <-dat_chester1$PCT_WHITE_
pred_dat$Pred_NoRAIL_PARKMED_REGA3Q<- predict(mod_chester4, newdat_chester, type="response")
pred_dat$Pred_YesRAIL_PARKMED_REGAMED<- predict(mod_chester4, newdat_chester1, type="response")
pred_dat$Pred_YesRAIL_PARK3Q_REGAMED<- predict(mod_chester4, newdat_chester2, type="response")
pred_dat<- data.frame(matrix(ncol = 5, nrow = nrow(dat_chester1)))
colnames(pred_dat)<- c("PCT_WHITE_", "Pred_NoRAIL_PARKMED_REGA3Q", "Pred_YesRAIL_PARKMED_REGAMED", "Pred_YesRAIL_PARK3Q_REGAMED", "Pred_YesRAIL_PARK3Q_REGA3Q")
pred_dat$PCT_WHITE_ <-dat_chester1$PCT_WHITE_
pred_dat$Pred_NoRAIL_PARKMED_REGA3Q<- predict(mod_chester4, newdat_chester, type="response")
pred_dat$Pred_YesRAIL_PARKMED_REGAMED<- predict(mod_chester4, newdat_chester1, type="response")
pred_dat$Pred_YesRAIL_PARK3Q_REGAMED<- predict(mod_chester4, newdat_chester2, type="response")
pred_dat$Pred_YesRAIL_PARK3Q_REGA3Q<- predict(mod_chester4, newdat_chester3, type="response")
dat_gg <- gather(pred_dat, -PCT_WHITE_, key = "Scenario", value = "value")
ggplot(dat_gg, aes(x = PCT_WHITE_, y = value, colour = Scenario)) +
geom_line() + ylim(0,1) +
xlab("Percentage of White Population") + ylab("Predicted Probability of Conversion to Urban Land") + plotTheme
Read Data
setwd("~/Documents/UPenn/Penn 2022 Spring/Planning by Numbers/Assignment 3")
dat_chester <- read.csv("Chester_Urban_Growth.csv", header = TRUE)
per_dat <- read.csv("per_pub.csv", header = TRUE)
hh_dat <- read.csv("hh_pub_CSV.csv", header = TRUE)
trip_dat <- read.csv("trip_pub.csv", header = TRUE)
veh_dat <- read.csv("veh_pub.csv", header = TRUE)
loc_dat <- read.csv("LOC_PUB.csv", header = TRUE)
Construct a binomial logit model explaining whether a commuter drove to work or used another mode
#PART1::DROVE TO WORK####
#Construct a binomial logit model explaining whether a commuter drove to work or used another mode
#TRIP_DAT
#First Transport Mode: TRAN1 = 21 :: Auto/van/SUV driver
#AND Main Activity: ACT1 = 11, 12 :: Work at regular jobsite, Work activity at other place
#Which factors could be determinants of trip mode?
##Trip Duration: TRPDUR
##Parking Cost: PARKC
##Toll: TOLL (y/n)
##Toll Amount: TOLLA
trip_dat1 <- trip_dat%>%
select(1:3, 7, 13, 16, 25, 38, 40, 43)%>%
drop_na(TRPDUR)%>%
filter(ACT1 == "11" | ACT1 == "12")%>%
add_column(Drive = 0)%>%
mutate(Drive = if_else (TRAN1 == "21", 1, Drive))
#HH_DAT
##Number of HH vehicles: TOTVEH
##HH size: HHSIZE
##Total number of workers: NOWRK
##HH Ethnicity: ETHNC (1=Black/African American (non-Hispanic), 2=White (non-Hispanic), 3=Asian/Pacific Islander, 4=American Indian, 5=Hispanic, 97=Other, 98=DK, 99=RF)
##HH Income: INCOM (2=<$15k, 3=$15k to $25k, 4=$25k to $35k, 5=$35k to $50k, 6 =$50k to $75k, 7=$75k to $100k, 8=$100k to $125k, 9=$125k to $150k, 10=$150k or more)
#OR, HH Income: INCLV (1=Under $50k, 2=$50k+)
hh_dat1 <- select(hh_dat, 1:2, TOTVEH, HHSIZE, INCOM, INCOME, INCLV, NOWRK, ETHNC)%>%
filter(ETHNC < 6)%>%
drop_na(TOTVEH, HHSIZE, INCLV)%>%
add_column(TOTVEH_lt2 = 0, TOTVEH_gt2 = 0, HHSIZE_lt3 = 0, HHSIZE_gt3 = 0, INCOME_lt50k = 0, INCOME_gt50K = 0)%>%
mutate(TOTVEH_lt2 = if_else(TOTVEH <= "2", 1, TOTVEH_lt2),
TOTVEH_gt2 = if_else(TOTVEH > "2", 1, TOTVEH_gt2),
HHSIZE_lt3 = if_else(HHSIZE <= "3", 1, HHSIZE_lt3),
HHSIZE_gt3 = if_else(HHSIZE > "3", 1, HHSIZE_gt3),
INCOME_lt50k = if_else(INCLV == "1", 1, INCOME_lt50k),
INCOME_gt50k = if_else(INCLV == "1", 1, INCOME_gt50K))
#categorical variable subsets created around position of mean values
#PER_DAT
##Gender: GEND (1=Male, 2=Female)
##Age: AGE
##License: LIC (1=yes, 2=no, 8=DK, 9=RF)
##Disability: DISAB (1=yes, 2=no, 8=DK, 9=RF)
##Employer subsidized parking: W1EPK (yes = 1)
per_dat1 <- select(per_dat, 1:2, GEND, AGE, LIC, DISAB, W1EPK)%>%
filter(GEND < 3,
AGE < 998,
LIC < 3,
DISAB < 3,
W1EPK < 3)%>%
add_column(GEND_M = 0, GEND_F = 0, DISAB_Y = 0, DISAB_N = 0)%>%
mutate(GEND_M = if_else(GEND == "1", 1, GEND_M),
GEND_F = if_else(GEND == "2", 1, GEND_F),
DISAB_Y = if_else(DISAB == "1", 1, DISAB_Y),
DISAB_N = if_else(DISAB == "2", 1, DISAB_N))
#VEH_DAT
## Vehicle Age: YEAR
##Vehicle Ownership: VHOWN (1=Household owned/leased, 2=Employer provided, 3=Rental car, 4=Borrowed from friend or relative, 5=Other, 8=Don't know, 9=Refused)
veh_dat1 <- select(veh_dat, SAMPN, YEAR, VHOWN)%>%
filter(VHOWN < 5)%>%
add_column(VHOWN_Y = 0)%>%
mutate(VHOWN_Y = if_else(VHOWN == "1", 1, VHOWN_Y))
#MERGE TABLES
##Merge SAMPN and PERNO
j1 <- left_join(per_dat1, hh_dat1, by = "SAMPN", copy = FALSE, keep = FALSE)
j1$SAMPN_PER <- paste0(j1$SAMPN, j1$PERNO)
j1 <- subset(j1, !duplicated(SAMPN_PER))
j1$SAMPN_PER <- paste0(j1$SAMPN, j1$PERNO)
trip_dat1$SAMPN_PER <- paste0(trip_dat1$SAMPN, trip_dat1$PERNO)
trip_dat1 <- subset(trip_dat1, !duplicated(SAMPN_PER))
#Trip duration is a categorical variable created by splitting continuous variable into quartiles??
summary(trip_dat1$TRPDUR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 15.00 25.00 29.62 39.00 1020.00
dat_drive <- left_join(trip_dat1, j1, by ="SAMPN_PER", copy = FALSE, keep = FALSE)%>%
select(-SAMPN.y, -PERNO.y, -X)
COR TESTS
Picking : INCOME, AGE, TOTVEH, TRPDUR, GEND
cor = 0.11, very low
cor.test(dat_drive$Drive, dat_drive$INCOME)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$INCOME
## t = 6.2884, df = 2988, p-value = 3.674e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.07876374 0.14952111
## sample estimates:
## cor
## 0.1142874
#cor = 0.11, very low
cor = 0.11, still low
cor.test(dat_drive$Drive, dat_drive$AGE)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$AGE
## t = 6.7572, df = 3439, p-value = 1.646e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.08136482 0.14731890
## sample estimates:
## cor
## 0.114468
#cor = 0.11, still low
cor = 0.29, very low
cor.test(dat_drive$Drive, dat_drive$TOTVEH)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$TOTVEH
## t = 16.899, df = 2988, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2622957 0.3277411
## sample estimates:
## cor
## 0.2953649
#cor = 0.29, very low
cor = 0.13, very low
cor.test(dat_drive$Drive, dat_drive$TOTVEH_gt2)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$TOTVEH_gt2
## t = 7.5863, df = 2988, p-value = 4.367e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1021242 0.1724637
## sample estimates:
## cor
## 0.1374673
#cor = 0.13, very low
cor = 0.069, super low
cor.test(dat_drive$Drive, dat_drive$HHSIZE)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$HHSIZE
## t = 0.37804, df = 2988, p-value = 0.7054
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02893776 0.04275142
## sample estimates:
## cor
## 0.006915716
#cor = 0.069, super low
cor = -0.09, super low
cor.test(dat_drive$Drive, dat_drive$TRPDUR)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$TRPDUR
## t = -5.8535, df = 4003, p-value = 5.2e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12274533 -0.06132634
## sample estimates:
## cor
## -0.09212345
#cor = -0.09, super low
cor = -0.07, super low
cor.test(dat_drive$Drive, dat_drive$GEND_F)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$GEND_F
## t = -4.1415, df = 3439, p-value = 3.534e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.10361701 -0.03711953
## sample estimates:
## cor
## -0.07044653
#cor = -0.07, super low
cor = 0.07, super low but positive as opposed to GEND_F
cor.test(dat_drive$Drive, dat_drive$GEND_M)
##
## Pearson's product-moment correlation
##
## data: dat_drive$Drive and dat_drive$GEND_M
## t = 4.1415, df = 3439, p-value = 3.534e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03711953 0.10361701
## sample estimates:
## cor
## 0.07044653
#cor = 0.07, super low but positive as opposed to GEND_F
Mod4.1 and Mod5 have the lowest AIC with the most number of statistically significant variables
Mod4 has more statistically significant variables
Model 1
Statistically significant
Null deviance: 2455.6 on 2989 degrees of freedom
AIC: 2417.5
mod1 <- glm (Drive ~ INCOME, data = dat_drive, family = binomial)
summary(mod1)
##
## Call:
## glm(formula = Drive ~ INCOME, family = binomial, data = dat_drive)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3834 0.4634 0.5187 0.5796 0.7253
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.105e+00 1.162e-01 9.516 < 2e-16 ***
## INCOME 9.570e-06 1.546e-06 6.192 5.95e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2455.6 on 2989 degrees of freedom
## Residual deviance: 2413.5 on 2988 degrees of freedom
## (1015 observations deleted due to missingness)
## AIC: 2417.5
##
## Number of Fisher Scoring iterations: 5
Model 2
Both are statistically significant
Null deviance: 2455.6 on 2989 degrees of freedom
AIC: 2383
mod2 <- glm (Drive ~ INCOME + AGE, data = dat_drive, family = binomial)
summary(mod2)
##
## Call:
## glm(formula = Drive ~ INCOME + AGE, family = binomial, data = dat_drive)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5939 0.4079 0.5111 0.5928 0.9209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.376e-02 2.118e-01 0.065 0.948
## INCOME 9.204e-06 1.560e-06 5.901 3.60e-09 ***
## AGE 2.663e-02 4.448e-03 5.988 2.12e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2455.6 on 2989 degrees of freedom
## Residual deviance: 2377.0 on 2987 degrees of freedom
## (1015 observations deleted due to missingness)
## AIC: 2383
##
## Number of Fisher Scoring iterations: 5
#Both are statistically significant
#Null deviance: 2455.6 on 2989 degrees of freedom
#AIC: 2383
Model 3
Age and Total Vehicle are statistically significant
Null deviance: Null deviance: 2455.6 on 2989 degrees of freedom
AIC: 2351.8
mod3 <- glm (Drive ~ INCOME + AGE + TRPDUR, data = dat_drive, family = binomial)
summary(mod3)
##
## Call:
## glm(formula = Drive ~ INCOME + AGE + TRPDUR, family = binomial,
## data = dat_drive)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6824 0.3909 0.4938 0.5897 2.8188
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.643e-01 2.218e-01 1.642 0.101
## INCOME 9.663e-06 1.562e-06 6.187 6.12e-10 ***
## AGE 2.655e-02 4.533e-03 5.858 4.69e-09 ***
## TRPDUR -1.235e-02 2.164e-03 -5.705 1.17e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2455.6 on 2989 degrees of freedom
## Residual deviance: 2343.8 on 2986 degrees of freedom
## (1015 observations deleted due to missingness)
## AIC: 2351.8
##
## Number of Fisher Scoring iterations: 5
#Age and Total Vehicle are statistically significant
#Null deviance: Null deviance: 2455.6 on 2989 degrees of freedom
#AIC: 2351.8
Model 4
All are statistically significant
Null deviance: 2455.6 on 2989 degrees of freedom
AIC: 2332.5
mod4 <- glm (Drive ~ INCOME + AGE + TRPDUR + GEND_F, data = dat_drive, family = binomial)
summary(mod4)
##
## Call:
## glm(formula = Drive ~ INCOME + AGE + TRPDUR + GEND_F, family = binomial,
## data = dat_drive)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7771 0.3749 0.4849 0.5889 2.9520
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.962e-01 2.351e-01 2.961 0.00307 **
## INCOME 9.431e-06 1.556e-06 6.060 1.36e-09 ***
## AGE 2.650e-02 4.542e-03 5.835 5.37e-09 ***
## TRPDUR -1.386e-02 2.209e-03 -6.276 3.47e-10 ***
## GEND_F -5.025e-01 1.096e-01 -4.585 4.54e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2455.6 on 2989 degrees of freedom
## Residual deviance: 2322.5 on 2985 degrees of freedom
## (1015 observations deleted due to missingness)
## AIC: 2332.5
##
## Number of Fisher Scoring iterations: 5
#All are statistically significant
#Null deviance: 2455.6 on 2989 degrees of freedom
#AIC: 2332.5
Model 4.1
Age, Trip Duration, Tot Vehicles, and Gender are statistically significant
Null deviance: 2455.6 on 2989 degrees of freedom
AIC: 2066.5
#Making mod4.1 to make predictions easier
mod4.1 <- glm (Drive ~ INCOME + AGE + TRPDUR + TOTVEH, data = dat_drive, family = binomial)
summary(mod4.1)
##
## Call:
## glm(formula = Drive ~ INCOME + AGE + TRPDUR + TOTVEH, family = binomial,
## data = dat_drive)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0021 0.2327 0.4148 0.5626 1.7757
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.447e+00 2.651e-01 -5.459 4.79e-08 ***
## INCOME -1.787e-06 1.642e-06 -1.089 0.276
## AGE 3.504e-02 5.112e-03 6.855 7.15e-12 ***
## TRPDUR -9.202e-03 2.171e-03 -4.238 2.25e-05 ***
## TOTVEH 1.135e+00 7.852e-02 14.458 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2455.6 on 2989 degrees of freedom
## Residual deviance: 2066.6 on 2985 degrees of freedom
## (1015 observations deleted due to missingness)
## AIC: 2076.6
##
## Number of Fisher Scoring iterations: 5
#Age, Trip Duration, Tot Vehicles, and Gender are statistically significant
#Null deviance: 2455.6 on 2989 degrees of freedom
#AIC: 2066.5
Model 5 Age, Trip Duration, Total Vehicles, and Gender are significant
Null deviance: 1910.2 on 3518 degrees of freedom
AIC: 2066.5
mod5 <- glm (Drive ~ INCOME + AGE + TRPDUR + TOTVEH + GEND, data = dat_drive, family = binomial)
summary(mod5)
##
## Call:
## glm(formula = Drive ~ INCOME + AGE + TRPDUR + TOTVEH + GEND,
## family = binomial, data = dat_drive)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9331 0.2210 0.4028 0.5595 1.8932
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.781e-01 3.281e-01 -2.371 0.017725 *
## INCOME -1.867e-06 1.641e-06 -1.138 0.255136
## AGE 3.537e-02 5.114e-03 6.917 4.62e-12 ***
## TRPDUR -1.024e-02 2.230e-03 -4.592 4.40e-06 ***
## TOTVEH 1.116e+00 7.850e-02 14.220 < 2e-16 ***
## GEND -4.000e-01 1.157e-01 -3.457 0.000547 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2455.6 on 2989 degrees of freedom
## Residual deviance: 2054.5 on 2984 degrees of freedom
## (1015 observations deleted due to missingness)
## AIC: 2066.5
##
## Number of Fisher Scoring iterations: 5
#Age, Trip Duration, Total Vehicles, and Gender are significant
#Null deviance: 1910.2 on 3518 degrees of freedom
#AIC: 2066.5
Null deviance shows how well the response is predicted by a model
Critical value = 3117.303
The null hypothesis (i.e., the model) is not rejected.
The fitted values with just an intercept are not significantly different from the observed values.
qchisq(0.95, df=2989)
## [1] 3117.303
#critical value = 3117.303
#The null hypothesis (i.e., the model) is not rejected.
#The fitted values with just an intercept are not significantly different from the observed values.
ANOVA
Mods 2, 3, 4, and 5 are statistically significant.
anova(mod1, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Drive
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 2989 2455.6
## INCOME 1 42.061 2988 2413.5 8.845e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod1, mod2, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Drive ~ INCOME
## Model 2: Drive ~ INCOME + AGE
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2988 2413.5
## 2 2987 2377.0 1 36.513 1.516e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod1, mod2, mod3, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Drive ~ INCOME
## Model 2: Drive ~ INCOME + AGE
## Model 3: Drive ~ INCOME + AGE + TRPDUR
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2988 2413.5
## 2 2987 2377.0 1 36.513 1.516e-09 ***
## 3 2986 2343.8 1 33.141 8.571e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod1, mod2, mod3, mod4, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Drive ~ INCOME
## Model 2: Drive ~ INCOME + AGE
## Model 3: Drive ~ INCOME + AGE + TRPDUR
## Model 4: Drive ~ INCOME + AGE + TRPDUR + GEND_F
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2988 2413.5
## 2 2987 2377.0 1 36.513 1.516e-09 ***
## 3 2986 2343.8 1 33.141 8.571e-09 ***
## 4 2985 2322.5 1 21.382 3.764e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod1, mod2, mod3, mod4, mod5, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Drive ~ INCOME
## Model 2: Drive ~ INCOME + AGE
## Model 3: Drive ~ INCOME + AGE + TRPDUR
## Model 4: Drive ~ INCOME + AGE + TRPDUR + GEND_F
## Model 5: Drive ~ INCOME + AGE + TRPDUR + TOTVEH + GEND
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2988 2413.5
## 2 2987 2377.0 1 36.513 1.516e-09 ***
## 3 2986 2343.8 1 33.141 8.571e-09 ***
## 4 2985 2322.5 1 21.382 3.764e-06 ***
## 5 2984 2054.5 1 267.964 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Mods 2, 3, 4, and 5 are statistically significant
Stepwise Regression for Generalized Linear Models
Income gets dropped
#Stepwise Regression for Generalized Linear Models
step(mod4, direction = "backward")
## Start: AIC=2332.47
## Drive ~ INCOME + AGE + TRPDUR + GEND_F
##
## Df Deviance AIC
## <none> 2322.5 2332.5
## - GEND_F 1 2343.8 2351.8
## - AGE 1 2357.1 2365.1
## - INCOME 1 2362.5 2370.5
## - TRPDUR 1 2362.6 2370.6
##
## Call: glm(formula = Drive ~ INCOME + AGE + TRPDUR + GEND_F, family = binomial,
## data = dat_drive)
##
## Coefficients:
## (Intercept) INCOME AGE TRPDUR GEND_F
## 6.962e-01 9.431e-06 2.650e-02 -1.386e-02 -5.025e-01
##
## Degrees of Freedom: 2989 Total (i.e. Null); 2985 Residual
## (1015 observations deleted due to missingness)
## Null Deviance: 2456
## Residual Deviance: 2322 AIC: 2332
step(mod5, direction = "backward")
## Start: AIC=2066.5
## Drive ~ INCOME + AGE + TRPDUR + TOTVEH + GEND
##
## Df Deviance AIC
## - INCOME 1 2055.8 2065.8
## <none> 2054.5 2066.5
## - GEND 1 2066.6 2076.6
## - TRPDUR 1 2076.2 2086.2
## - AGE 1 2104.3 2114.3
## - TOTVEH 1 2322.5 2332.5
##
## Step: AIC=2065.78
## Drive ~ AGE + TRPDUR + TOTVEH + GEND
##
## Df Deviance AIC
## <none> 2055.8 2065.8
## - GEND 1 2067.7 2075.7
## - TRPDUR 1 2078.5 2086.5
## - AGE 1 2104.5 2112.5
## - TOTVEH 1 2362.5 2370.5
##
## Call: glm(formula = Drive ~ AGE + TRPDUR + TOTVEH + GEND, family = binomial,
## data = dat_drive)
##
## Coefficients:
## (Intercept) AGE TRPDUR TOTVEH GEND
## -0.81121 0.03471 -0.01049 1.07869 -0.39833
##
## Degrees of Freedom: 2989 Total (i.e. Null); 2985 Residual
## (1015 observations deleted due to missingness)
## Null Deviance: 2456
## Residual Deviance: 2056 AIC: 2066
#Income gets dropped
INCOME
A unit increase in income results in the chances of driving to work increasing by 0.00094%.
## odds = p(success)/p(failure) = (p)/(1-p)
#For a value of X, probability that Y will occur:
##log(p/1-p) = [(Intercept) - (Estimate of X)] * (Value of X)
#OR
## % change in Y for unit change in X = [(exponent (coefficient of X) - 1)] * 100
## - or + sign indicates associated increase or decrease
mod4$coefficients
## (Intercept) INCOME AGE TRPDUR GEND_F
## 6.961592e-01 9.430555e-06 2.650271e-02 -1.386406e-02 -5.025419e-01
#INCOME
(exp(0.000009430555) - 1) * 100
## [1] 0.0009430599
##A unit increase in income results in the chances of driving to work increasing by 0.00094%.
AGE
For a unit increase in age, the chances of driving to work increase by 2.6%.
#AGE
(exp(0.026502705698) - 1) * 100
## [1] 2.685703
##For a unit increase in age, the chances of driving to work increase by 2.6%
TRPDUR
For a unit increase in trip duration, the chances of driving to work decreases by 1.3%.
#TRPDUR
(exp(-0.013864062730) - 1) * 100
## [1] -1.37684
##For a unit increase in trip duration, the chances of driving to work decreases by 1.3%%
GENDER == FEMALE
The odds ratio between women driving and men driving is 0.60.
Women are 39% less likely to drive to work than men.
#GENDER = FEMALE
exp(-0.502541920285)
## [1] 0.6049909
## = 0.6332637
##The odds ratio between women driving and men driving is 0.60.
(exp(-0.502541920285) - 1) * 100
## [1] -39.50091
##Women are 39% less likely to drive to work than men.
Stargazer for FUN!
stargazer(mod3, mod4, mod5, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))
##
## ==================================================
## Dependent variable:
## --------------------------------
## Drive
## (1) (2) (3)
## --------------------------------------------------
## INCOME 0.00001*** 0.00001*** -0.00000
## (0.00000) (0.00000) (0.00000)
##
## AGE 0.027*** 0.027*** 0.035***
## (0.005) (0.005) (0.005)
##
## TRPDUR -0.012*** -0.014*** -0.010***
## (0.002) (0.002) (0.002)
##
## GEND_F -0.503***
## (0.110)
##
## TOTVEH 1.116***
## (0.078)
##
## GEND -0.400***
## (0.116)
##
## Constant 0.364 0.696** -0.778*
## (0.222) (0.235) (0.328)
##
## --------------------------------------------------
## Observations 2,990 2,990 2,990
## Log Likelihood -1,171.925 -1,161.234 -1,027.252
## Akaike Inf. Crit. 2,351.849 2,332.468 2,066.503
## ==================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
mod4 vars : INCOME + AGE + TOTVEH + GEND_F
TRPDUR is pretty interesting
##mod4 vars : INCOME + AGE + TOTVEH + GEND_F
plot(dat_drive$AGE, dat_drive$Drive)
plot(dat_drive$TOTVEH, dat_drive$Drive)
plot(dat_drive$TRPDUR, dat_drive$Drive)
#Set Plot Theme
plotTheme <- theme(
axis.title = element_text(
size = 11, lineheight = .9, family = "Comic Sans MS"),
plot.title = element_text(size=16, face = "bold", hjust = 0.5, family = "Comic Sans MS"),
plot.background = element_blank())
table(is.na(dat_drive$Drive))
##
## FALSE
## 4005
table(is.na(dat_drive$TRPDUR))
##
## FALSE
## 4005
Trip Duration x Drive
dat_drive_pred <- select(dat_drive, 1:3, 10:13, 25)%>%
drop_na(INCOME)
plot(dat_drive_pred$Drive, dat_drive_pred$TRPDUR)
plot(jitter(dat_drive_pred$Drive, factor=0.5) ~ dat_drive_pred$TRPDUR,
pch= 4,
ylab="Drive To Work",
xlab="Trip Duration")
Income Fixed as Median Value
dat_gg <-data.frame(matrix(ncol = 3, nrow = nrow(dat_drive_pred)))
colnames(dat_gg) <- c("INCOME", "AGE", "TRPDUR")
dat_gg$INCOME <- 62000
dat_gg$AGE <- 25
dat_gg$TRPDUR <- dat_drive_pred$TRPDUR
dat_gg1 <-data.frame(matrix(ncol = 3, nrow = nrow(dat_drive_pred)))
colnames(dat_gg) <- c("INCOME", "AGE", "TRPDUR")
dat_gg1$INCOME <- 62000
dat_gg1$AGE <- 45
dat_gg1$TRPDUR <- dat_drive_pred$TRPDUR
dat_gg2 <-data.frame(matrix(ncol = 3, nrow = nrow(dat_drive_pred)))
colnames(dat_gg2) <- c("INCOME", "AGE", "TRPDUR")
dat_gg2$INCOME <- 62000
dat_gg2$AGE <- 65
dat_gg2$TRPDUR <- dat_drive_pred$TRPDUR
pred_dat <- data.frame(matrix(ncol = 4, nrow = nrow(dat_drive_pred)))
colnames(pred_dat)<- c("TRPDUR", "Pred_Inc62K_Age25", "Pred_Inc62K_Age45", "Pred_Inc62K_Age65")
pred_dat$TRPDUR <- dat_drive_pred$TRPDUR
pred_dat$Pred_Inc62K_Age25 <- predict(mod3, dat_gg, type="response")
pred_dat$Pred_Inc62K_Age45 <- predict(mod3, dat_gg1, type="response")
pred_dat$Pred_Inc62K_Age65 <- predict(mod3, dat_gg2, type="response")
dat_ggplot <- gather(pred_dat, -TRPDUR, key = "Scenario", value = "value")
ggplot(dat_ggplot, aes(x = TRPDUR, y = value, colour = Scenario)) +
geom_line() + ylim(0,1) +
xlab("Trip Duration") + ylab("Predicted Probability of Driving to Work") + plotTheme
Construct a binomial logit model explaining whether a commuter walked/biked or used another model
Load in Data
#TRIP_DAT
#First Transport Mode: TRAN1 = 11 : Walk ; 14 : Bicycle
#AND Main Activity: ACT1 = 11, 12 :: Work at regular jobsite, Work activity at other place
trip_dat2 <- trip_dat%>%
select(1:3, 7, 13, 16, 25, 38, 40, 43:43,)%>%
drop_na(TRPDUR)%>%
filter(ACT1 == "11" | ACT1 == "12")%>%
add_column(WalkBike = 0)%>%
mutate(WalkBike = if_else (TRAN1 == "11" | TRAN1 == "14", 1, WalkBike))
#HH_DAT
##HH size: HHSIZE
##Total number of workers: NOWRK
##HH Income: INCOM (2=<$15k, 3=$15k to $25k, 4=$25k to $35k, 5=$35k to $50k, 6 =$50k to $75k, 7=$75k to $100k, 8=$100k to $125k, 9=$125k to $150k, 10=$150k or more)
#OR, HH Income: INCLV (1=Under $50k, 2=$50k+)
hh_dat2 <- select(hh_dat, 1:2, HHSIZE, INCOME, NOWRK)%>%
drop_na(INCOME)
##categorical variable subsets created around position of mean values
#PER_DAT
##Age: AGE
##Disability: DISAB (1=yes, 2=no, 8=DK, 9=RF)
per_dat2 <- select(per_dat, 1:2, AGE, DISAB)%>%
filter(AGE < 998,
DISAB < 3)%>%
add_column(DISAB_Y = 0, DISAB_N = 0)%>%
mutate(DISAB_Y = if_else(DISAB == "1", 1, DISAB_Y),
DISAB_N = if_else(DISAB == "2", 1, DISAB_N))
#MERGE TABLES
##Merge SAMPN and PERNO
j3 <- left_join(per_dat2, hh_dat2, by = "SAMPN", copy = FALSE, keep = FALSE)
j3$SAMPN_PER <- paste0(j3$SAMPN, j3$PERNO)
j3 <- subset(j3, !duplicated(SAMPN_PER))
trip_dat2$SAMPN_PER <- paste0(trip_dat2$SAMPN, trip_dat2$PERNO)
trip_dat2 <- subset(trip_dat2, !duplicated(SAMPN_PER))
dat_wb <- left_join(trip_dat2, j3, by ="SAMPN_PER", copy = FALSE, keep = FALSE)%>%
select(-SAMPN.y, -PERNO.y, -X)%>%
add_column(TOLL_Y = 0)%>%
mutate(TOLL_Y = if_else(TOLL == "1", 1, TOLL_Y))
COR TEST
Decent correlations: INCOME, AGE, TRIPDUR, DISAB
cor = -0.05
cor.test(dat_wb$WalkBike, dat_wb$HHSIZE)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$HHSIZE
## t = -2.9417, df = 3452, p-value = 0.003286
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08321888 -0.01668263
## sample estimates:
## cor
## -0.05000624
#cor = -0.05
cor = -0.018
cor.test(dat_wb$WalkBike, dat_wb$INCOME)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$INCOME
## t = -1.1006, df = 3452, p-value = 0.2712
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05204741 0.01463209
## sample estimates:
## cor
## -0.01872849
#cor = -0.018
cor = -0.054
cor.test(dat_wb$WalkBike, dat_wb$AGE)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$AGE
## t = -3.4051, df = 3948, p-value = 0.0006679
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08515718 -0.02296564
## sample estimates:
## cor
## -0.05411389
#cor = -0.054
cor = -0.014
cor.test(dat_wb$WalkBike, dat_wb$DISAB_Y)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$DISAB_Y
## t = 0.89708, df = 3948, p-value = 0.3697
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01691877 0.04544254
## sample estimates:
## cor
## 0.01427577
#cor = -0.014
cor = 0.006
cor.test(dat_wb$WalkBike, dat_wb$TOLL)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$TOLL
## t = 0.35372, df = 3446, p-value = 0.7236
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02736038 0.03939809
## sample estimates:
## cor
## 0.006025569
#cor = 0.006
cor = -0.006
cor.test(dat_wb$WalkBike, dat_wb$TOLL_Y)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$TOLL_Y
## t = -0.35372, df = 3446, p-value = 0.7236
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03939809 0.02736038
## sample estimates:
## cor
## -0.006025569
#cor = -0.006
cor = -0.018
cor.test(dat_wb$WalkBike, dat_wb$NOWRK)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$NOWRK
## t = -1.1036, df = 3452, p-value = 0.2699
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05209862 0.01458075
## sample estimates:
## cor
## -0.01877981
#cor = -0.018
cor = -0.033
cor.test(dat_wb$WalkBike, dat_wb$TRPDUR)
##
## Pearson's product-moment correlation
##
## data: dat_wb$WalkBike and dat_wb$TRPDUR
## t = -2.1337, df = 4003, p-value = 0.03293
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.064609731 -0.002735839
## sample estimates:
## cor
## -0.03370508
#cor = -0.033
Only continuous variables
Mod 4 has the lowest AIC, passes the null deviance test, and has the most statistically significant variables
MODEL 1
Statistically significant
AIC: 1498.3
Null deviance: 1505.8 on 3949 degrees of freedom
mod1.wb <- glm (WalkBike ~ AGE, data = dat_wb, family = binomial)
summary(mod1.wb)
##
## Call:
## glm(formula = WalkBike ~ AGE, family = binomial, data = dat_wb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4257 -0.3293 -0.3040 -0.2777 2.7170
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.130112 0.259500 -8.209 2.24e-16 ***
## AGE -0.020477 0.006037 -3.392 0.000694 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1505.8 on 3949 degrees of freedom
## Residual deviance: 1494.3 on 3948 degrees of freedom
## (55 observations deleted due to missingness)
## AIC: 1498.3
##
## Number of Fisher Scoring iterations: 6
#Statistically sig
#AIC: 1498.3
#Null deviance: 1505.8 on 3949 degrees of freedom
MODEL 2
Only age is statistically sig
AIC: 1289.4
Null deviance: 1295.5 on 3453 degrees of freedom
mod2.wb <- glm (WalkBike ~ AGE + INCOME, data = dat_wb, family = binomial)
summary(mod2.wb)
##
## Call:
## glm(formula = WalkBike ~ AGE + INCOME, family = binomial, data = dat_wb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4274 -0.3292 -0.2981 -0.2689 2.7776
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.957e+00 3.141e-01 -6.231 4.64e-10 ***
## AGE -2.176e-02 6.604e-03 -3.295 0.000983 ***
## INCOME -2.033e-06 2.165e-06 -0.939 0.347792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1295.5 on 3453 degrees of freedom
## Residual deviance: 1283.4 on 3451 degrees of freedom
## (551 observations deleted due to missingness)
## AIC: 1289.4
##
## Number of Fisher Scoring iterations: 6
#Only age is statistically sig
#AIC: 1289.4
#Null deviance: 1295.5 on 3453 degrees of freedom
MODEL 3
Only AGE is statistically significant
AIC: 1290.6
Null deviance: 1295.5 on 3453 degrees of freedom
mod3.wb <- glm (WalkBike ~ AGE + INCOME + DISAB, data = dat_wb, family = binomial)
summary(mod3.wb)
##
## Call:
## glm(formula = WalkBike ~ AGE + INCOME + DISAB, family = binomial,
## data = dat_wb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5473 -0.3281 -0.2973 -0.2693 2.7777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.548e-01 1.517e+00 -0.366 0.714671
## AGE -2.177e-02 6.603e-03 -3.297 0.000976 ***
## INCOME -1.971e-06 2.162e-06 -0.912 0.361842
## DISAB -7.063e-01 7.494e-01 -0.942 0.345965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1295.5 on 3453 degrees of freedom
## Residual deviance: 1282.6 on 3450 degrees of freedom
## (551 observations deleted due to missingness)
## AIC: 1290.6
##
## Number of Fisher Scoring iterations: 6
#Only AGE is statistically sig
#AIC: 1290.6
#Null deviance: 1295.5 on 3453 degrees of freedom
MODEL 4
AGE and TRPDUR are statistically significant
AIC: 1269.2
Null deviance: 1295.5 on 3453 degrees of freedom
mod4.wb <- glm (WalkBike ~ AGE + INCOME + DISAB + TRPDUR, data = dat_wb, family = binomial)
summary(mod4.wb)
##
## Call:
## glm(formula = WalkBike ~ AGE + INCOME + DISAB + TRPDUR, family = binomial,
## data = dat_wb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6209 -0.3433 -0.2943 -0.2418 3.2818
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.200e-01 1.528e+00 -0.078 0.93745
## AGE -2.154e-02 6.475e-03 -3.326 0.00088 ***
## INCOME -1.355e-06 2.186e-06 -0.620 0.53552
## DISAB -6.725e-01 7.516e-01 -0.895 0.37091
## TRPDUR -2.254e-02 5.231e-03 -4.309 1.64e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1295.5 on 3453 degrees of freedom
## Residual deviance: 1259.2 on 3449 degrees of freedom
## (551 observations deleted due to missingness)
## AIC: 1269.2
##
## Number of Fisher Scoring iterations: 6
#AGE and TRPDUR are statistically sig
#AIC: AIC: 1269.2
#Null deviance: 1295.5 on 3453 degrees of freedom
#Mod 4 has the lowest AIC, passes the null deviance test, and has the most statistically significant variables
Critical value = 4096.307
Mod3.wb and mod4.wb are the only ones where the null hypothesis can be rejected.
qchisq(0.95, df=3949)
## [1] 4096.307
#critical value = 4096.307
##Mod3.wb and mod4.wb are the only ones where the null hypothesis can be rejected.
ANOVA TESTING
Mod4 is the most significant
anova(mod2.wb, mod3.wb, mod4.wb, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: WalkBike ~ AGE + INCOME
## Model 2: WalkBike ~ AGE + INCOME + DISAB
## Model 3: WalkBike ~ AGE + INCOME + DISAB + TRPDUR
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3451 1283.4
## 2 3450 1282.6 1 0.7402 0.3896
## 3 3449 1259.2 1 23.4033 1.314e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Mod4 is the most significant
STEPWISE REGRESSION
mod4.wb$coefficients
## (Intercept) AGE INCOME DISAB TRPDUR
## -1.199510e-01 -2.153680e-02 -1.354647e-06 -6.725342e-01 -2.253681e-02
AGE
Unit increase in age results in a 2.1% decrease in the odds of walking or biking to work
#AGE
(exp(-0.021536799405) - 1) * 100
#unit increase in age results in a 2.1% decrease in the odds of walking or biking to work
Income
Unit increase in income results in a 0.0001% decrease in the odds of walking or biking to work
#INCOME
(exp(-0.000001354647) - 1) * 100
## [1] -0.0001354646
#unit increase in income results in a 0.0001% decrease in the odds of walking or biking to work
TRPDUR
A unit increase in trip duration decreases the odds of walking or biking by 2.2%
#TRPDUR
(exp(-0.022536811517) - 1) * 100
## [1] -2.228475
#a unit increase in trip duration decreases the odds of walking or biking by 2.2%
mod5.wb <- glm (WalkBike ~ AGE + INCOME + TRPDUR + DISAB_Y, data = dat_wb, family = binomial)
summary(mod5.wb)
mod5.wb$coefficients
The odds of a person with disabilities walking or biking to work are 1.9 times less than the odds of a person without disabilities walking or biking to work.
A person with disabilities is 95% less likely to walk or bike to work.
#DISAB_Y
exp(0.672534208663)
## [1] 1.959196
#The odds of a person with disabilities walking or biking to work are 1.9 times less than the odds of a person without disabilities walking or biking to work.
(exp(0.672534208663) - 1) * 100
## [1] 95.9196
#A person with disabilities is 95% less likely to walk or bike to work
##That checks out....
stargazer(mod3.wb, mod4.wb, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))
##
## ================================================
## Dependent variable:
## ------------------------------
## WalkBike
## (1) (2)
## ------------------------------------------------
## AGE -0.022*** -0.022***
## (0.007) (0.006)
##
## INCOME -0.00000 -0.00000
## (0.00000) (0.00000)
##
## DISAB -0.706 -0.673
## (0.749) (0.752)
##
## TRPDUR -0.023***
## (0.005)
##
## Constant -0.555 -0.120
## (1.517) (1.528)
##
## ------------------------------------------------
## Observations 3,454 3,454
## Log Likelihood -641.321 -629.619
## Akaike Inf. Crit. 1,290.641 1,269.238
## ================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
Mod4 vars : AGE, INCOME, TRPDUR, DISAB
plot(dat_wb$AGE, dat_wb$WalkBike)
plot(dat_wb$INCOME, dat_wb$WalkBike)
plot(dat_wb$TRPDUR, dat_wb$WalkBike)
plot(jitter(dat_wb$WalkBike, factor=0.5) ~ dat_wb$AGE,
pch= 4,
ylab="Walk or Bike To Work",
xlab="Age")
dat_wb_pred <- select(dat_wb, 1:3, 10:13, 18)%>%
drop_na(INCOME)
table(is.na(dat_wb_pred$INCOME))
##
## FALSE
## 3454
table(is.na(dat_wb_pred$TRPDUR))
##
## FALSE
## 3454
table(is.na(dat_wb_pred$AGE))
##
## FALSE
## 3454
table(is.na(dat_wb_pred$DISAB))
##
## FALSE
## 3454
Income Fixed As Median Value
dat_ggwb <-data.frame(matrix(ncol = 4, nrow = nrow(dat_wb_pred)))
colnames(dat_ggwb) <- c("INCOME", "AGE", "DISAB", "TRPDUR")
dat_ggwb$INCOME <- 62000
dat_ggwb$AGE <- 25
dat_ggwb$DISAB <- 0
dat_ggwb$TRPDUR <- dat_wb_pred$TRPDUR
dat_ggwb1 <-data.frame(matrix(ncol = 4, nrow = nrow(dat_wb_pred)))
colnames(dat_ggwb1) <- c("INCOME", "AGE", "DISAB", "TRPDUR")
dat_ggwb1$INCOME <- 62000
dat_ggwb1$AGE <- 45
dat_ggwb1$DISAB <- 0
dat_ggwb1$TRPDUR <- dat_wb_pred$TRPDUR
dat_ggwb2 <- data.frame(matrix(ncol = 4, nrow = nrow(dat_wb_pred)))
colnames(dat_ggwb2) <- c("INCOME", "AGE", "DISAB", "TRPDUR")
dat_ggwb2$INCOME <- 62000
dat_ggwb2$AGE <- 65
dat_ggwb2$DISAB <- 0
dat_ggwb2$TRPDUR <- dat_wb_pred$TRPDUR
pred_dat_wb <- data.frame(matrix(ncol = 5, nrow = nrow(dat_wb_pred)))
colnames(pred_dat_wb)<- c("TRPDUR", "DISAB", "Pred_Inc62K_Age25", "Pred_Inc62K_Age45", "Pred_Inc62K_Age65")
pred_dat_wb$TRPDUR <- dat_wb_pred$TRPDUR
pred_dat_wb$DISAB <- 0
pred_dat_wb$Pred_Inc62K_Age25 <- predict(mod4.wb, dat_ggwb, type="response")
pred_dat_wb$Pred_Inc62K_Age45 <- predict(mod4.wb, dat_ggwb1, type="response")
pred_dat_wb$Pred_Inc62K_Age65 <- predict(mod4.wb, dat_ggwb2, type="response")
dat_ggplot_wb <- gather(pred_dat_wb, -TRPDUR, key = "Scenario", value = "value")
ggplot(dat_ggplot_wb, aes(x = TRPDUR, y = value, colour = Scenario)) +
geom_line() + ylim(0,1) +
xlab("Trip Duration") + ylab("Predicted Probability of Walking or Biking to Work") + plotTheme
Read Data
#PART3::DROVE, CARPOOLED, TOOK TRANSIT, OR WALKED/BIKED TO WORK####
#Construct a multinomial logit model
#CAR : TRAN1 = 21 :: Auto/van/SUV driver
#CARPOOL : TRAN1 = 31 :: Shared ride (carpool, vanpool)
#TRANSIT: TRAN1 = 47, 48, 51, 52, 53, 41, 45 : Trolley, Jitney, Subway/El, Commuter Rail, Bus
#WALK/BIKE : TRAN1 = 11, 14 : Walk, Bike
trip_dat$WB[(trip_dat$TRAN1==11 | trip_dat$TRAN1==14 )]<-1
trip_dat$WB[which(is.na(trip_dat$WB))]<-0
trip_dat$Drive[(trip_dat$TRAN1==21 )]<-1
trip_dat$Drive[which(is.na(trip_dat$Drive))]<-0
trip_dat$Transit[(trip_dat$TRAN1==47 | trip_dat$TRAN1==48 | trip_dat$TRAN1==51 )]<-1
trip_dat$Transit[which(is.na(trip_dat$Transit))]<-0
##clean out any rows where they are all zero
trip_dat$SUMZERO<- trip_dat$WB+ trip_dat$Drive+trip_dat$Transit
summary(trip_dat$SUMZERO)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5702 1.0000 1.0000
head(trip_dat)
## SAMPN PERNO X PLANO ATIME DTIME TRPDUR ACTDUR Origin_PTYPE Dest_PTYE
## 1 1000018 1 10000181 1 300 630 NA 210 NA 1
## 2 1000018 1 10000181 2 800 830 90 30 1 2
## 3 1000018 1 10000181 3 900 1000 30 60 2 5
## 4 1000018 1 10000181 4 1040 1135 40 55 5 2
## 5 1000018 1 10000181 5 1200 1300 25 60 2 5
## 6 1000018 1 10000181 6 1415 1525 75 70 5 5
## PLOC X.1 ACT1 Work.NotWork Origin Mode TT ACT2 ACT3 ACT4 ACT5 ACT6 ACT1O
## 1 9000018 NA 15 0 NA NA NA 3 NA NA NA NA
## 2 26 NA 11 1 9000018 21 0 NA NA NA NA NA
## 3 436 NA 12 1 0 21 90 NA NA NA NA NA
## 4 26 NA 11 1 436 21 30 NA NA NA NA NA
## 5 437 NA 12 1 0 21 40 NA NA NA NA NA
## 6 438 NA 12 1 0 21 25 NA NA NA NA NA
## ACTO TRAN1 TRAN2 TRAN3 TRAN4 OTRAN HHVU HHVNO ADP TOTTRAV NTRAV NHTRV PHHTR
## 1 NA NA NA NA NA NA NA NA NA NA NA NA
## 2 NA 21 NA NA NA 1 2 1 1 0 NA NA
## 3 NA 21 NA NA NA 1 2 1 1 0 NA NA
## 4 NA 21 NA NA NA 1 2 1 1 0 NA NA
## 5 NA 21 NA NA NA 1 2 1 1 0 NA NA
## 6 NA 21 NA NA NA 1 2 1 1 0 NA NA
## NNHTR PARK PARKO PARKC PARKU PRKUO TOLL TOLLA ADDP ADDPH ADDPN APDP APDPH
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 0 3 0 NA 2 NA 2 NA NA NA NA
## 3 0 3 0 NA 2 NA 2 NA NA NA NA
## 4 0 3 0 NA 2 NA 2 NA NA NA NA
## 5 0 3 0 NA 2 NA 2 NA NA NA NA
## 6 0 3 0 NA 2 NA 2 NA NA NA NA
## APDPN SUBTR ACCESS OACESS LINE PAY1 PAY1O FARE1 TRFR TRFS1 TRFL1 PAY2 PAY2O
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 NA NA NA NA NA NA
## 4 NA NA NA NA NA NA
## 5 NA NA NA NA NA NA
## 6 NA NA NA NA NA NA
## FARE2 TRFS2 TRFL2 PAY3 PAY3O FARE3 OTRFS EXIT EGRESS OEGRES SPDFLAG CPA
## 1 NA NA NA NA NA 0 58
## 2 NA NA NA NA NA 0 58
## 3 NA NA NA NA NA 0 58
## 4 NA NA NA NA NA 0 58
## 5 NA NA NA NA NA 0 58
## 6 NA NA NA NA NA 0 58
## REGION DVRPC WB Drive Transit SUMZERO
## 1 2 1 0 0 0 0
## 2 2 1 0 1 0 1
## 3 2 1 0 1 0 1
## 4 2 1 0 1 0 1
## 5 2 1 0 1 0 1
## 6 2 1 0 1 0 1
#Pick first work trip
TripsClean <- trip_dat[which(trip_dat$Dest_PTYE == 2), ]
head(TripsClean)
## SAMPN PERNO X PLANO ATIME DTIME TRPDUR ACTDUR Origin_PTYPE
## 2 1000018 1 10000181 2 800 830 90 30 1
## 4 1000018 1 10000181 4 1040 1135 40 55 5
## 15 1000024 2 10000242 2 645 1630 25 585 1
## 18 1000054 1 10000541 2 805 1455 5 410 1
## 41 1000212 2 10002122 2 900 1732 60 512 1
## 47 1000239 1 10002391 2 815 1206 37 231 1
## Dest_PTYE PLOC X.1 ACT1 Work.NotWork Origin Mode TT ACT2 ACT3 ACT4 ACT5
## 2 2 26 NA 11 1 9000018 21 0 NA NA NA NA
## 4 2 26 NA 11 1 436 21 30 NA NA NA NA
## 15 2 173 NA 11 1 9000024 21 0 NA NA NA NA
## 18 2 60 NA 11 1 9000054 21 0 NA NA NA NA
## 41 2 167 NA 11 1 9000212 21 0 NA NA NA NA
## 47 2 186 NA 11 1 9000239 21 0 3 NA NA NA
## ACT6 ACT1O ACTO TRAN1 TRAN2 TRAN3 TRAN4 OTRAN HHVU HHVNO ADP TOTTRAV NTRAV
## 2 NA NA 21 NA NA NA 1 2 1 1 0
## 4 NA NA 21 NA NA NA 1 2 1 1 0
## 15 NA NA 21 NA NA NA 1 2 1 1 0
## 18 NA NA 21 NA NA NA 1 2 1 1 0
## 41 NA NA 21 NA NA NA 1 2 1 1 0
## 47 NA NA 21 NA NA NA 1 1 1 1 0
## NHTRV PHHTR NNHTR PARK PARKO PARKC PARKU PRKUO TOLL TOLLA ADDP ADDPH ADDPN
## 2 NA NA 0 3 0 NA 2 NA 2 NA NA
## 4 NA NA 0 3 0 NA 2 NA 2 NA NA
## 15 NA NA 0 3 0 NA 2 NA 2 NA NA
## 18 0 NA 0 1 0 NA 2 NA 2 NA NA
## 41 NA NA 0 3 0 NA 2 NA 2 NA NA
## 47 NA NA 0 3 0 NA 2 NA 2 NA NA
## APDP APDPH APDPN SUBTR ACCESS OACESS LINE PAY1 PAY1O FARE1 TRFR TRFS1 TRFL1
## 2 NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA
## 15 NA NA NA NA NA NA NA
## 18 NA NA NA NA NA NA NA
## 41 NA NA NA NA NA NA NA
## 47 NA NA NA NA NA NA NA
## PAY2 PAY2O FARE2 TRFS2 TRFL2 PAY3 PAY3O FARE3 OTRFS EXIT EGRESS OEGRES
## 2 NA NA NA NA NA NA
## 4 NA NA NA NA NA NA
## 15 NA NA NA NA NA NA
## 18 NA NA NA NA NA NA
## 41 NA NA NA NA NA NA
## 47 NA NA NA NA NA NA
## SPDFLAG CPA REGION DVRPC WB Drive Transit SUMZERO
## 2 0 58 2 1 0 1 0 1
## 4 0 58 2 1 0 1 0 1
## 15 0 58 2 1 0 1 0 1
## 18 0 58 2 1 0 1 0 1
## 41 0 58 2 1 0 1 0 1
## 47 0 58 2 1 0 1 0 1
TripsClean$Commute[TripsClean$Drive == 1] <- 1
TripsClean$Commute[TripsClean$WB == 1] <- 2
TripsClean$Commute[TripsClean$Transit == 1] <- 3
TripsClean <- drop_na(TripsClean, Commute)
Trips are overwhelmingly made by car
#Plot histogram
ggplot(TripsClean, aes(x=Commute)) +
geom_histogram(binwidth=1, fill="#69b3a2", color="#e9ecef", alpha=0.9) +
ggtitle("Trip Mode Frequency Distribution") +
xlab("Commute Mode") + ylab("Frequency")
##trips are overwhelmingly made by car
table(TripsClean$Commute)
##
## 1 2 3
## 3566 262 35
((3038-(182+33))/(3038))*100
## [1] 92.92298
Merge Data
TripsClean$SAMPN_PER <- do.call(paste, c(TripsClean[c("SAMPN", "PERNO")], sep = ""))
TripsClean <- subset(TripsClean, !duplicated(SAMPN_PER))
per_hh_dat <- merge(per_dat, hh_dat,
by.x = "SAMPN",
by.y = "SAMPN",
all.x = TRUE,
all.y=FALSE,
sort = FALSE)
colnamCleaning<-c("VETMO", "W2TY", "W2TYO", "W2TYP", "W2LOC", "W2IND", "W2INO", "W2OCC", "W2OCO", "W2DAY", "W2HOM", "W2HOO", "W2ST", "W2ET", "W1WKD3", "W1WKD4", "W1WKE", "W1WKD1", "W1WKD2")
per_datClean<-per_hh_dat[ , -which(names(per_dat) %in% colnamCleaning)]
head(per_datClean)
## SAMPN PERNO GEND AGE LIC RESP RELAT DISAB DISTY1 DISTY2 DABLO EDUC LEVEL
## 1 1000024 1 1 50 1 1 1 2 NA NA 2 NA
## 2 1000024 2 2 45 1 2 2 2 NA NA 2 NA
## 3 1000054 1 2 60 1 1 1 2 NA NA 2 NA
## 4 1000054 2 1 64 1 2 2 2 NA NA 2 NA
## 5 1000175 1 2 65 1 1 1 2 NA NA 2 NA
## 6 1000175 2 1 37 1 2 3 2 NA NA 2 NA
## S_LOC SHOME SDAY SMODE1 SMODE2 SMODE3 SMOO SCOST SUNIT EMPL NOJOB NMJOB ETYPE
## 1 NA NA NA NA NA NA NA NA NA 1 NA 2 3
## 2 NA NA NA NA NA NA NA NA NA 1 NA 2 1
## 3 NA NA NA NA NA NA NA NA NA 1 NA 2 2
## 4 NA NA NA NA NA NA NA NA NA 3 1 NA NA
## 5 NA NA NA NA NA NA NA NA NA 2 NA 2 1
## 6 NA NA NA NA NA NA NA NA NA 3 3 NA NA
## ETYPO W1IND W1INO W1OCC W1OCO W1TIM W1DAY W1HOM W1HMO W1CPR W1CPT W1CPO
## 1 NA 23 8 1 4 0 1 7 VARIES
## 2 NA 31 1 3 5 0 2 NA
## 3 NA 61 8 3 5 0 2 NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA 81 2 3 3 0 NA NA
## 6 NA NA NA NA NA NA NA NA
## O_W1WKD W1TYP W1LOC W1MOD1 W1MOD2 W1MOD3 W1MOO W1NDV W1CPK W1CPU W1EPK W1PKL
## 1 2 172 21 NA NA 1 0 NA 2 3
## 2 2 173 21 NA NA 2 0 NA 1 1
## 3 2 60 21 NA NA 2 0 NA 2 3
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 2 165 21 NA NA 2 0 NA 2 1
## 6 NA NA NA NA NA NA NA NA NA NA
## W1PKLO W1WLK TRNST VTRAN TRNCT TRNCU SCHED WSTIM WETIM SETIM VSTIM VSTMO
## 1 1 2 NA NA NA 1 900 1600 1 NA
## 2 1 2 NA NA NA 2 700 1600 1 NA
## 3 1 2 NA NA NA 2 830 1515 1 NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 1 2 NA NA NA 2 900 1700 1 NA
## 6 NA NA NA NA NA NA NA NA NA NA
## VETIM HMALL PNMPL NTRIPS.x DIARY.x PROXY HBW.x HBO.x NHB.x MOTTRIP.x NOMOTTRP
## 1 NA 2 3 2 1 98 0 2 0 2 0
## 2 NA 2 3 2 1 98 2 0 0 2 0
## 3 NA 2 4 3 1 0 1 1 1 3 0
## 4 NA 2 7 6 1 1 0 2 4 6 0
## 5 NA 2 4 3 1 0 0 2 1 3 0
## 6 NA 2 4 3 2 1 0 2 1 3 0
## CPA.x FIPS REGION DVRPC LISTD HADDR ADVLT TOTVEH DWELL YRMOV RENT DIARY.y
## 1 58 34005 2 1 1 9000024 1 2 1 3 2 1
## 2 58 34005 2 1 1 9000024 1 2 1 3 2 1
## 3 58 34005 2 1 1 9000054 1 2 1 3 2 1
## 4 58 34005 2 1 1 9000054 1 2 1 3 2 1
## 5 58 34005 2 1 1 9000175 1 2 1 3 2 1
## 6 58 34005 2 1 1 9000175 1 2 1 3 2 1
## ENGL HHSIZE NPHON NFAX NOPHO PHSER SHPHN SHNUM ETHNC INCLV INCOM INCOME ASSN
## 1 2 2 1 NA 2 NA 2 NA 2 2 10 175000 141
## 2 2 2 1 NA 2 NA 2 NA 2 2 10 175000 141
## 3 2 2 2 1 2 NA 2 NA 2 2 6 62000 140
## 4 2 2 2 1 2 NA 2 NA 2 2 6 62000 140
## 5 2 2 2 1 2 NA 2 NA 2 2 6 62000 141
## 6 2 2 2 1 2 NA 2 NA 2 2 6 62000 141
## DPHON NVIST NPLAC NTRIPS.y NOWRK NOSTU OWNSH EACCT CPA.y DAYOFWK HBW.y HBO.y
## 1 2 0 6 4 2 0 2 2 58 4 4 0
## 2 2 0 6 4 2 0 2 2 58 4 4 0
## 3 2 0 11 9 1 0 2 1 58 3 3 1
## 4 2 0 11 9 1 0 2 1 58 3 3 1
## 5 2 0 8 6 1 0 2 2 58 4 0 4
## 6 2 0 8 6 1 0 2 2 58 4 0 4
## NHB.y MOTTRIP.y
## 1 0 4
## 2 0 4
## 3 5 9
## 4 5 9
## 5 2 6
## 6 2 6
per_datClean$SAMPN_PER <- do.call(paste, c(per_datClean[c("SAMPN", "PERNO")], sep = ""))
head(per_datClean)
## SAMPN PERNO GEND AGE LIC RESP RELAT DISAB DISTY1 DISTY2 DABLO EDUC LEVEL
## 1 1000024 1 1 50 1 1 1 2 NA NA 2 NA
## 2 1000024 2 2 45 1 2 2 2 NA NA 2 NA
## 3 1000054 1 2 60 1 1 1 2 NA NA 2 NA
## 4 1000054 2 1 64 1 2 2 2 NA NA 2 NA
## 5 1000175 1 2 65 1 1 1 2 NA NA 2 NA
## 6 1000175 2 1 37 1 2 3 2 NA NA 2 NA
## S_LOC SHOME SDAY SMODE1 SMODE2 SMODE3 SMOO SCOST SUNIT EMPL NOJOB NMJOB ETYPE
## 1 NA NA NA NA NA NA NA NA NA 1 NA 2 3
## 2 NA NA NA NA NA NA NA NA NA 1 NA 2 1
## 3 NA NA NA NA NA NA NA NA NA 1 NA 2 2
## 4 NA NA NA NA NA NA NA NA NA 3 1 NA NA
## 5 NA NA NA NA NA NA NA NA NA 2 NA 2 1
## 6 NA NA NA NA NA NA NA NA NA 3 3 NA NA
## ETYPO W1IND W1INO W1OCC W1OCO W1TIM W1DAY W1HOM W1HMO W1CPR W1CPT W1CPO
## 1 NA 23 8 1 4 0 1 7 VARIES
## 2 NA 31 1 3 5 0 2 NA
## 3 NA 61 8 3 5 0 2 NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA 81 2 3 3 0 NA NA
## 6 NA NA NA NA NA NA NA NA
## O_W1WKD W1TYP W1LOC W1MOD1 W1MOD2 W1MOD3 W1MOO W1NDV W1CPK W1CPU W1EPK W1PKL
## 1 2 172 21 NA NA 1 0 NA 2 3
## 2 2 173 21 NA NA 2 0 NA 1 1
## 3 2 60 21 NA NA 2 0 NA 2 3
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 2 165 21 NA NA 2 0 NA 2 1
## 6 NA NA NA NA NA NA NA NA NA NA
## W1PKLO W1WLK TRNST VTRAN TRNCT TRNCU SCHED WSTIM WETIM SETIM VSTIM VSTMO
## 1 1 2 NA NA NA 1 900 1600 1 NA
## 2 1 2 NA NA NA 2 700 1600 1 NA
## 3 1 2 NA NA NA 2 830 1515 1 NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 1 2 NA NA NA 2 900 1700 1 NA
## 6 NA NA NA NA NA NA NA NA NA NA
## VETIM HMALL PNMPL NTRIPS.x DIARY.x PROXY HBW.x HBO.x NHB.x MOTTRIP.x NOMOTTRP
## 1 NA 2 3 2 1 98 0 2 0 2 0
## 2 NA 2 3 2 1 98 2 0 0 2 0
## 3 NA 2 4 3 1 0 1 1 1 3 0
## 4 NA 2 7 6 1 1 0 2 4 6 0
## 5 NA 2 4 3 1 0 0 2 1 3 0
## 6 NA 2 4 3 2 1 0 2 1 3 0
## CPA.x FIPS REGION DVRPC LISTD HADDR ADVLT TOTVEH DWELL YRMOV RENT DIARY.y
## 1 58 34005 2 1 1 9000024 1 2 1 3 2 1
## 2 58 34005 2 1 1 9000024 1 2 1 3 2 1
## 3 58 34005 2 1 1 9000054 1 2 1 3 2 1
## 4 58 34005 2 1 1 9000054 1 2 1 3 2 1
## 5 58 34005 2 1 1 9000175 1 2 1 3 2 1
## 6 58 34005 2 1 1 9000175 1 2 1 3 2 1
## ENGL HHSIZE NPHON NFAX NOPHO PHSER SHPHN SHNUM ETHNC INCLV INCOM INCOME ASSN
## 1 2 2 1 NA 2 NA 2 NA 2 2 10 175000 141
## 2 2 2 1 NA 2 NA 2 NA 2 2 10 175000 141
## 3 2 2 2 1 2 NA 2 NA 2 2 6 62000 140
## 4 2 2 2 1 2 NA 2 NA 2 2 6 62000 140
## 5 2 2 2 1 2 NA 2 NA 2 2 6 62000 141
## 6 2 2 2 1 2 NA 2 NA 2 2 6 62000 141
## DPHON NVIST NPLAC NTRIPS.y NOWRK NOSTU OWNSH EACCT CPA.y DAYOFWK HBW.y HBO.y
## 1 2 0 6 4 2 0 2 2 58 4 4 0
## 2 2 0 6 4 2 0 2 2 58 4 4 0
## 3 2 0 11 9 1 0 2 1 58 3 3 1
## 4 2 0 11 9 1 0 2 1 58 3 3 1
## 5 2 0 8 6 1 0 2 2 58 4 0 4
## 6 2 0 8 6 1 0 2 2 58 4 0 4
## NHB.y MOTTRIP.y SAMPN_PER
## 1 0 4 10000241
## 2 0 4 10000242
## 3 5 9 10000541
## 4 5 9 10000542
## 5 2 6 10001751
## 6 2 6 10001752
Commute <- merge(TripsClean, per_datClean,
by.x = "SAMPN_PER",
by.y = "SAMPN_PER",
all.x = TRUE,
all.y=FALSE,
sort = FALSE)
head(Commute)
## SAMPN_PER SAMPN.x PERNO.x X PLANO ATIME DTIME TRPDUR ACTDUR
## 1 10000181 1000018 1 10000181 2 800 830 90 30
## 2 10000242 1000024 2 10000242 2 645 1630 25 585
## 3 10000541 1000054 1 10000541 2 805 1455 5 410
## 4 10002122 1000212 2 10002122 2 900 1732 60 512
## 5 10002391 1000239 1 10002391 2 815 1206 37 231
## 6 10002441 1000244 1 10002441 2 818 1615 3 477
## Origin_PTYPE Dest_PTYE PLOC X.1 ACT1 Work.NotWork Origin Mode TT ACT2 ACT3
## 1 1 2 26 NA 11 1 9000018 21 0 NA NA
## 2 1 2 173 NA 11 1 9000024 21 0 NA NA
## 3 1 2 60 NA 11 1 9000054 21 0 NA NA
## 4 1 2 167 NA 11 1 9000212 21 0 NA NA
## 5 1 2 186 NA 11 1 9000239 21 0 3 NA
## 6 1 2 187 NA 11 1 9000244 21 0 NA NA
## ACT4 ACT5 ACT6 ACT1O ACTO TRAN1 TRAN2 TRAN3 TRAN4 OTRAN HHVU HHVNO ADP
## 1 NA NA NA NA 21 NA NA NA 1 2 1
## 2 NA NA NA NA 21 NA NA NA 1 2 1
## 3 NA NA NA NA 21 NA NA NA 1 2 1
## 4 NA NA NA NA 21 NA NA NA 1 2 1
## 5 NA NA NA NA 21 NA NA NA 1 1 1
## 6 NA NA NA NA 21 NA NA NA 1 1 1
## TOTTRAV NTRAV NHTRV PHHTR NNHTR PARK PARKO PARKC PARKU PRKUO TOLL TOLLA ADDP
## 1 1 0 NA NA 0 3 0 NA 2 NA 2
## 2 1 0 NA NA 0 3 0 NA 2 NA 2
## 3 1 0 0 NA 0 1 0 NA 2 NA 2
## 4 1 0 NA NA 0 3 0 NA 2 NA 2
## 5 1 0 NA NA 0 3 0 NA 2 NA 2
## 6 1 0 NA NA 0 3 0 NA 2 NA 2
## ADDPH ADDPN APDP APDPH APDPN SUBTR ACCESS OACESS LINE PAY1 PAY1O FARE1 TRFR
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA
## TRFS1 TRFL1 PAY2 PAY2O FARE2 TRFS2 TRFL2 PAY3 PAY3O FARE3 OTRFS EXIT EGRESS
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 NA NA NA NA NA NA
## 4 NA NA NA NA NA NA
## 5 NA NA NA NA NA NA
## 6 NA NA NA NA NA NA
## OEGRES SPDFLAG CPA REGION.x DVRPC.x WB Drive Transit SUMZERO Commute SAMPN.y
## 1 0 58 2 1 0 1 0 1 1 1000018
## 2 0 58 2 1 0 1 0 1 1 1000024
## 3 0 58 2 1 0 1 0 1 1 1000054
## 4 0 58 2 1 0 1 0 1 1 1000212
## 5 0 58 2 1 0 1 0 1 1 1000239
## 6 0 58 2 1 0 1 0 1 1 1000244
## PERNO.y GEND AGE LIC RESP RELAT DISAB DISTY1 DISTY2 DABLO EDUC LEVEL S_LOC
## 1 1 1 60 1 1 1 2 NA NA 2 NA NA
## 2 2 2 45 1 2 2 2 NA NA 2 NA NA
## 3 1 2 60 1 1 1 2 NA NA 2 NA NA
## 4 2 2 45 1 2 3 2 NA NA 2 NA NA
## 5 1 2 34 1 1 1 2 NA NA 2 NA NA
## 6 1 1 38 1 1 1 2 NA NA 2 NA NA
## SHOME SDAY SMODE1 SMODE2 SMODE3 SMOO SCOST SUNIT EMPL NOJOB NMJOB ETYPE ETYPO
## 1 NA NA NA NA NA NA NA NA 1 NA 2 1 NA
## 2 NA NA NA NA NA NA NA NA 1 NA 2 1 NA
## 3 NA NA NA NA NA NA NA NA 1 NA 2 2 NA
## 4 NA NA NA NA NA NA NA NA 1 NA 2 1 NA
## 5 NA NA NA NA NA NA NA NA 1 NA 2 1 NA
## 6 NA NA NA NA NA NA NA NA 1 NA 2 1 NA
## W1IND W1INO W1OCC W1OCO W1TIM W1DAY W1HOM W1HMO W1CPR W1CPT W1CPO O_W1WKD
## 1 54 3 2 5 0 2 NA
## 2 31 1 3 5 0 2 NA
## 3 61 8 3 5 0 2 NA
## 4 97 BUSINESS 2 2 5 0 2 NA
## 5 51 2 2 5 0 2 NA
## 6 52 4 3 5 0 2 NA
## W1TYP W1LOC W1MOD1 W1MOD2 W1MOD3 W1MOO W1NDV W1CPK W1CPU W1EPK W1PKL W1PKLO
## 1 2 26 21 NA NA 1 0.25 1 2 3
## 2 2 173 21 NA NA 2 0.00 NA 1 1
## 3 2 60 21 NA NA 2 0.00 NA 2 3
## 4 2 167 21 NA NA 1 90.00 4 2 1
## 5 2 186 21 NA NA 2 0.00 NA 1 1
## 6 2 187 21 NA NA 1 0.00 NA 2 1
## W1WLK TRNST VTRAN TRNCT TRNCU SCHED WSTIM WETIM SETIM VSTIM VSTMO VETIM HMALL
## 1 98 8 NA 4.5 1 2 730 1900 1 NA NA 2
## 2 1 2 NA NA NA 2 700 1600 1 NA NA 2
## 3 1 2 NA NA NA 2 830 1515 1 NA NA 2
## 4 98 2 NA 265.0 3 2 800 1800 1 NA NA 2
## 5 1 2 NA NA NA 1 800 1600 1 NA NA 2
## 6 1 2 NA NA NA 2 230 1700 1 NA NA 2
## PNMPL NTRIPS.x DIARY.x PROXY HBW.x HBO.x NHB.x MOTTRIP.x NOMOTTRP CPA.x FIPS
## 1 9 8 1 0 1 1 6 8 0 58 34005
## 2 3 2 1 98 2 0 0 2 0 58 34005
## 3 4 3 1 0 1 1 1 3 0 58 34005
## 4 6 5 1 0 2 2 1 5 0 58 34005
## 5 6 5 1 0 1 1 3 5 0 58 34005
## 6 5 4 2 0 2 2 0 4 0 58 34005
## REGION.y DVRPC.y LISTD HADDR ADVLT TOTVEH DWELL YRMOV RENT DIARY.y ENGL
## 1 2 1 NA NA NA NA NA NA NA NA NA
## 2 2 1 1 9000024 1 2 1 3 2 1 2
## 3 2 1 1 9000054 1 2 1 3 2 1 2
## 4 2 1 1 9000212 1 2 2 3 2 1 2
## 5 2 1 1 9000239 1 1 1 2 2 1 2
## 6 2 1 1 9000244 8 2 1 3 2 1 2
## HHSIZE NPHON NFAX NOPHO PHSER SHPHN SHNUM ETHNC INCLV INCOM INCOME ASSN DPHON
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2 2 1 NA 2 NA 2 NA 2 2 10 175000 141 2
## 3 2 2 1 2 NA 2 NA 2 2 6 62000 140 2
## 4 2 1 NA 2 NA 2 NA 2 2 9 137000 141 2
## 5 1 1 NA 2 NA 2 NA 2 1 4 30000 141 2
## 6 5 2 1 2 NA 2 NA 2 2 9 137000 141 2
## NVIST NPLAC NTRIPS.y NOWRK NOSTU OWNSH EACCT CPA.y DAYOFWK HBW.y HBO.y NHB.y
## 1 NA NA NA NA NA NA NA NA NA NA NA NA
## 2 0 6 4 2 0 2 2 58 4 4 0 0
## 3 0 11 9 1 0 2 1 58 3 3 1 5
## 4 0 10 8 1 0 1 2 58 4 2 4 2
## 5 0 6 5 1 0 2 2 58 4 1 1 3
## 6 0 21 16 2 4 2 2 58 4 4 12 0
## MOTTRIP.y
## 1 NA
## 2 4
## 3 9
## 4 8
## 5 5
## 6 16
varsInterest <- c("SAMPN_PER", "TRPDUR", "WB", "Drive", "Transit", "Commute", "AGE", "INCOME")
Commute <- Commute[ , which(names(Commute) %in% varsInterest)]
head(Commute)
## SAMPN_PER TRPDUR WB Drive Transit Commute AGE INCOME
## 1 10000181 90 0 1 0 1 60 NA
## 2 10000242 25 0 1 0 1 45 175000
## 3 10000541 5 0 1 0 1 60 62000
## 4 10002122 60 0 1 0 1 45 137000
## 5 10002391 37 0 1 0 1 34 30000
## 6 10002441 3 0 1 0 1 38 137000
Drive <- Commute[which(Commute$Drive==1),]
WB <- Commute[which(Commute$WB==1),]
Transit <- Commute[which(Commute$Transit ==1),]
table(Drive$Commute)
##
## 1
## 3038
table(WB$Commute)
##
## 2
## 182
table(Transit$Commute)
##
## 3
## 33
summary(Drive$TRPDUR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 15.00 22.00 26.51 35.00 465.00
summary(WB$TRPDUR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.25 15.00 22.21 30.00 120.00
summary(Transit$TRPDUR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 24.00 35.00 42.15 60.00 95.00
Drive$distance<-(30/60)*Drive$TRPDUR
head(Drive)
## SAMPN_PER TRPDUR WB Drive Transit Commute AGE INCOME distance
## 1 10000181 90 0 1 0 1 60 NA 45.0
## 2 10000242 25 0 1 0 1 45 175000 12.5
## 3 10000541 5 0 1 0 1 60 62000 2.5
## 4 10002122 60 0 1 0 1 45 137000 30.0
## 5 10002391 37 0 1 0 1 34 30000 18.5
## 6 10002441 3 0 1 0 1 38 137000 1.5
WB$distance<-(12/60)*WB$TRPDUR
head(WB)
## SAMPN_PER TRPDUR WB Drive Transit Commute AGE INCOME distance
## 34 10014112 29 1 0 0 2 34 62000 5.8
## 37 10015901 21 1 0 0 2 38 62000 4.2
## 38 10015902 55 1 0 0 2 37 62000 11.0
## 42 10022591 80 1 0 0 2 36 30000 16.0
## 154 10068022 2 1 0 0 2 53 112000 0.4
## 209 10076163 15 1 0 0 2 19 62000 3.0
Transit$distance<-(25/60)*Transit$TRPDUR
head(Transit)
## SAMPN_PER TRPDUR WB Drive Transit Commute AGE INCOME distance
## 41 10020423 30 0 0 1 3 20 87000 12.50000
## 202 10075232 58 0 0 1 3 57 112000 24.16667
## 317 10115922 15 0 0 1 3 64 175000 6.25000
## 333 10120971 40 0 0 1 3 42 10000 16.66667
## 343 10125841 95 0 0 1 3 64 42000 39.58333
## 362 10130462 60 0 0 1 3 26 42000 25.00000
dat_commute <- rbind(WB, Drive, Transit)
head(dat_commute)
## SAMPN_PER TRPDUR WB Drive Transit Commute AGE INCOME distance
## 34 10014112 29 1 0 0 2 34 62000 5.8
## 37 10015901 21 1 0 0 2 38 62000 4.2
## 38 10015902 55 1 0 0 2 37 62000 11.0
## 42 10022591 80 1 0 0 2 36 30000 16.0
## 154 10068022 2 1 0 0 2 53 112000 0.4
## 209 10076163 15 1 0 0 2 19 62000 3.0
dat_commute$time.auto <- dat_commute$distance/30
dat_commute$time.bike <- dat_commute$distance/12
dat_commute$time.transit <- dat_commute$distance/25
dat_commute$mode[dat_commute$Drive == 1] <- "auto"
dat_commute$mode[dat_commute$WB == 1] <- "bike"
dat_commute$mode[dat_commute$Transit == 1] <- "transit"
rownames (dat_commute) <- NULL
head(dat_commute)
## SAMPN_PER TRPDUR WB Drive Transit Commute AGE INCOME distance time.auto
## 1 10014112 29 1 0 0 2 34 62000 5.8 0.19333333
## 2 10015901 21 1 0 0 2 38 62000 4.2 0.14000000
## 3 10015902 55 1 0 0 2 37 62000 11.0 0.36666667
## 4 10022591 80 1 0 0 2 36 30000 16.0 0.53333333
## 5 10068022 2 1 0 0 2 53 112000 0.4 0.01333333
## 6 10076163 15 1 0 0 2 19 62000 3.0 0.10000000
## time.bike time.transit mode
## 1 0.48333333 0.232 bike
## 2 0.35000000 0.168 bike
## 3 0.91666667 0.440 bike
## 4 1.33333333 0.640 bike
## 5 0.03333333 0.016 bike
## 6 0.25000000 0.120 bike
varsInterest <- c("TRPDUR", "WB" ,"Drive", "Transit", "Commute", "AGE", "INCOME", "distance", "time.auto", "time.bike", "time.transit", "mode")
dat_commute <- dat_commute[ , which(names(dat_commute) %in% varsInterest)]
head(dat_commute)
## TRPDUR WB Drive Transit Commute AGE INCOME distance time.auto time.bike
## 1 29 1 0 0 2 34 62000 5.8 0.19333333 0.48333333
## 2 21 1 0 0 2 38 62000 4.2 0.14000000 0.35000000
## 3 55 1 0 0 2 37 62000 11.0 0.36666667 0.91666667
## 4 80 1 0 0 2 36 30000 16.0 0.53333333 1.33333333
## 5 2 1 0 0 2 53 112000 0.4 0.01333333 0.03333333
## 6 15 1 0 0 2 19 62000 3.0 0.10000000 0.25000000
## time.transit mode
## 1 0.232 bike
## 2 0.168 bike
## 3 0.440 bike
## 4 0.640 bike
## 5 0.016 bike
## 6 0.120 bike
commuteMNL <- mlogit.data(dat_commute, shape="wide", choice="mode", varying = c(9:11))
head(commuteMNL)
## ~~~~~~~
## first 10 observations out of 9759
## ~~~~~~~
## TRPDUR WB Drive Transit Commute AGE INCOME distance mode alt time
## 1 29 1 0 0 2 34 62000 5.8 FALSE auto 0.1933333
## 2 29 1 0 0 2 34 62000 5.8 TRUE bike 0.4833333
## 3 29 1 0 0 2 34 62000 5.8 FALSE transit 0.2320000
## 4 21 1 0 0 2 38 62000 4.2 FALSE auto 0.1400000
## 5 21 1 0 0 2 38 62000 4.2 TRUE bike 0.3500000
## 6 21 1 0 0 2 38 62000 4.2 FALSE transit 0.1680000
## 7 55 1 0 0 2 37 62000 11.0 FALSE auto 0.3666667
## 8 55 1 0 0 2 37 62000 11.0 TRUE bike 0.9166667
## 9 55 1 0 0 2 37 62000 11.0 FALSE transit 0.4400000
## 10 80 1 0 0 2 36 30000 16.0 FALSE auto 0.5333333
## chid idx
## 1 1 1:auto
## 2 1 1:bike
## 3 1 1:nsit
## 4 2 2:auto
## 5 2 2:bike
## 6 2 2:nsit
## 7 3 3:auto
## 8 3 3:bike
## 9 3 3:nsit
## 10 4 4:auto
##
## ~~~ indexes ~~~~
## chid alt
## 1 1 auto
## 2 1 bike
## 3 1 transit
## 4 2 auto
## 5 2 bike
## 6 2 transit
## 7 3 auto
## 8 3 bike
## 9 3 transit
## 10 4 auto
## indexes: 1, 2
Intercept Specific
Probability of walking, biking, or taking transit to work is lower than that of driving.
mod.int <- mlogit (mode ~ 1, data = commuteMNL)
summary(mod.int)
##
## Call:
## mlogit(formula = mode ~ 1, data = commuteMNL, method = "nr")
##
## Frequencies of alternatives:choice
## auto bike transit
## 0.933907 0.055948 0.010144
##
## nr method
## 7 iterations, 0h:0m:0s
## g'(-H)^-1g = 1.27E-05
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):bike -2.814948 0.076313 -36.887 < 2.2e-16 ***
## (Intercept):transit -4.522447 0.175021 -25.840 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -884
## McFadden R^2: -2.2204e-16
## Likelihood ratio test : chisq = -4.5475e-13 (p.value = 1)
##probability of walking, biking, or taking transit to work is lower than that of driving
Alternative Specific
As time goes up, controlling for age and income, the utility of travel goes down - one is less likely to opt for bike or transit than a car.
#Alternative specific
mod.alt <- mlogit (mode ~ time | 1, data = commuteMNL)
summary(mod.alt)
##
## Call:
## mlogit(formula = mode ~ time | 1, data = commuteMNL, method = "nr")
##
## Frequencies of alternatives:choice
## auto bike transit
## 0.933907 0.055948 0.010144
##
## nr method
## 7 iterations, 0h:0m:0s
## g'(-H)^-1g = 2.52E-07
## gradient close to zero
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):bike -0.64362 0.14188 -4.5363 5.724e-06 ***
## (Intercept):transit -4.06513 0.17845 -22.7801 < 2.2e-16 ***
## time -5.78595 0.47522 -12.1752 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -741.1
## McFadden R^2: 0.16164
## Likelihood ratio test : chisq = 285.79 (p.value = < 2.22e-16)
##As time goes up, controlling for age and income, the utility of travel goes down - one is less likely to opt for bike or transit than a car
Decision Maker (individual) Specific
#Decision maker (individual) specific
mod.des <- mlogit (mode ~ 1 | AGE + INCOME, data = commuteMNL)
summary(mod.des)
##
## Call:
## mlogit(formula = mode ~ 1 | AGE + INCOME, data = commuteMNL,
## method = "nr")
##
## Frequencies of alternatives:choice
## auto bike transit
## 0.932767 0.055909 0.011323
##
## nr method
## 7 iterations, 0h:0m:0s
## g'(-H)^-1g = 4.13E-05
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):bike -2.5307e+00 1.8751e-01 -13.4963 < 2e-16 ***
## (Intercept):transit -3.9724e+00 3.9818e-01 -9.9765 < 2e-16 ***
## AGE:bike 3.5197e-04 8.7845e-04 0.4007 0.68866
## AGE:transit 1.5528e-03 1.1362e-03 1.3667 0.17172
## INCOME:bike -4.0497e-06 2.2731e-06 -1.7815 0.07482 .
## INCOME:transit -7.3098e-06 5.1558e-06 -1.4178 0.15625
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -779.27
## McFadden R^2: 0.0041702
## Likelihood ratio test : chisq = 6.5266 (p.value = 0.16312)
mod2.mnl <- mlogit (mode ~ 1 | AGE + INCOME + TRPDUR, data = commuteMNL)
summary(mod2.mnl)
##
## Call:
## mlogit(formula = mode ~ 1 | AGE + INCOME + TRPDUR, data = commuteMNL,
## method = "nr")
##
## Frequencies of alternatives:choice
## auto bike transit
## 0.932767 0.055909 0.011323
##
## nr method
## 7 iterations, 0h:0m:0s
## g'(-H)^-1g = 7.18E-05
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):bike -2.1391e+00 2.1668e-01 -9.8723 < 2.2e-16 ***
## (Intercept):transit -4.3615e+00 4.2910e-01 -10.1644 < 2.2e-16 ***
## AGE:bike 3.2423e-04 8.8327e-04 0.3671 0.7135602
## AGE:transit 1.6449e-03 1.1314e-03 1.4539 0.1459639
## INCOME:bike -3.4483e-06 2.2777e-06 -1.5140 0.1300319
## INCOME:transit -7.8456e-06 5.2122e-06 -1.5052 0.1322685
## TRPDUR:bike -1.8635e-02 5.5927e-03 -3.3320 0.0008622 ***
## TRPDUR:transit 1.3830e-02 4.8937e-03 2.8260 0.0047126 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -768.11
## McFadden R^2: 0.018422
## Likelihood ratio test : chisq = 28.832 (p.value = 6.5467e-05)
Both Alternative and Individual Specific
Time and TRPDUR are the most statistically significant.
#both alternative and individual specific
mod3.mnl <- mlogit (mode ~ time | AGE + INCOME, data = commuteMNL)
mod4.mnl <- mlogit (mode ~ time | AGE + INCOME + TRPDUR, data = commuteMNL)
summary(mod4.mnl)
##
## Call:
## mlogit(formula = mode ~ time | AGE + INCOME + TRPDUR, data = commuteMNL,
## method = "nr")
##
## Frequencies of alternatives:choice
## auto bike transit
## 0.932767 0.055909 0.011323
##
## nr method
## 19 iterations, 0h:0m:0s
## g'(-H)^-1g = 77
## last step couldn't find higher value
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):bike -9.6942e-01 7.2104e-01 -1.3445 0.1788
## (Intercept):transit -4.5380e+00 4.3222e-01 -10.4992 <2e-16 ***
## time -9.6146e+01 9.5815e+00 -10.0346 <2e-16 ***
## AGE:bike -3.3389e-03 1.1987e-02 -0.2786 0.7806
## AGE:transit 1.6398e-03 1.1407e-03 1.4375 0.1506
## INCOME:bike -4.6413e-06 5.9436e-06 -0.7809 0.4349
## INCOME:transit -6.0199e-06 5.2032e-06 -1.1569 0.2473
## TRPDUR:bike 1.9757e+00 2.0320e-01 9.7230 <2e-16 ***
## TRPDUR:transit 3.3123e-01 3.1780e-02 10.4226 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -127.13
## McFadden R^2: 0.83754
## Likelihood ratio test : chisq = 1310.8 (p.value = < 2.22e-16)
#Time and TRPDUR are the most statistically significant
Change in degrees of freedom
#Change in degrees of freedom
mod4.mnl$logLik
## 'log Lik.' -127.1268 (df=9)
df.mod4.mnl <- 9
mod.int$logLik
## 'log Lik.' -883.9957 (df=2)
df.mod.int <- 2
chidf <- df.mod4.mnl - df.mod.int
Log likelihood
Mod4.mnl has the lowest AIC by far
ModelChi <- (-2*(-127.1268)) - (-2*(-883.9957))
chisq.prob <- 1 - pchisq(ModelChi, chidf)
chisq.prob #this is the chisq p-value in the output
## [1] 1
AIC (mod.int, mod.alt, mod.des, mod2.mnl, mod3.mnl, mod4.mnl)
## df AIC
## mod.int 2 1771.9914
## mod.alt 3 1488.2046
## mod.des 6 1570.5304
## mod2.mnl 8 1552.2254
## mod3.mnl 7 1312.1475
## mod4.mnl 9 272.2536
#mod4.mnl has the lowest AIC by far
summary(mod4.mnl)
##
## Call:
## mlogit(formula = mode ~ time | AGE + INCOME + TRPDUR, data = commuteMNL,
## method = "nr")
##
## Frequencies of alternatives:choice
## auto bike transit
## 0.932767 0.055909 0.011323
##
## nr method
## 19 iterations, 0h:0m:0s
## g'(-H)^-1g = 77
## last step couldn't find higher value
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):bike -9.6942e-01 7.2104e-01 -1.3445 0.1788
## (Intercept):transit -4.5380e+00 4.3222e-01 -10.4992 <2e-16 ***
## time -9.6146e+01 9.5815e+00 -10.0346 <2e-16 ***
## AGE:bike -3.3389e-03 1.1987e-02 -0.2786 0.7806
## AGE:transit 1.6398e-03 1.1407e-03 1.4375 0.1506
## INCOME:bike -4.6413e-06 5.9436e-06 -0.7809 0.4349
## INCOME:transit -6.0199e-06 5.2032e-06 -1.1569 0.2473
## TRPDUR:bike 1.9757e+00 2.0320e-01 9.7230 <2e-16 ***
## TRPDUR:transit 3.3123e-01 3.1780e-02 10.4226 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -127.13
## McFadden R^2: 0.83754
## Likelihood ratio test : chisq = 1310.8 (p.value = < 2.22e-16)
MOD4.MNL COEFFICIENTS
AGE
A unit increase in age is associated with a 0.33% decrease in the odds of walking or biking to work.
A unit increase in age is associated with a 0.16% increase in the odds of taking transit.
mod4.mnl$coefficients
## (Intercept):bike (Intercept):transit time AGE:bike
## -9.694165e-01 -4.537986e+00 -9.614622e+01 -3.338910e-03
## AGE:transit INCOME:bike INCOME:transit TRPDUR:bike
## 1.639775e-03 -4.641342e-06 -6.019892e-06 1.975677e+00
## TRPDUR:transit
## 3.312314e-01
## attr(,"names.sup.coef")
## character(0)
## attr(,"fixed")
## (Intercept):bike (Intercept):transit time AGE:bike
## FALSE FALSE FALSE FALSE
## AGE:transit INCOME:bike INCOME:transit TRPDUR:bike
## FALSE FALSE FALSE FALSE
## TRPDUR:transit
## FALSE
## attr(,"sup")
## character(0)
#AGE
(exp(-0.0033389097) - 1) * 100
## [1] -0.3333342
#a unit increase in age is associated with a 0.33% decrease in the odds of walking or biking to work
(exp(0.0016397751) - 1) * 100
## [1] 0.164112
#unit increase in age is associated with a 0.16% increase in the odds of taking transit
INCOME
0.00046% decrease in the odds of walking or biking, 0.0006% decrease in the odds of taking transit.
#INCOME
(exp(-0.000004641342) - 1) * 100
## [1] -0.0004641331
(exp(-0.000006019892) - 1) * 100
## [1] -0.0006019874
#0.00046% decrease in the odds of walking or biking, 0.0006% decrease in the odds of taking transit
TRPDUR
While controlling for driving, a minute’s increase in trip duration increases the odds of taking transit by 39%, and for walking and biking by 620%.
#TRPDUR
(exp(1.9756767808) - 1) * 100
## [1] 621.1499
(exp(0.3312314049) - 1) * 100
## [1] 39.2682
#while controlling for driving, a minute's increase in trip duration increases the odds of taking transit by 39%, and for walking and biking by 620%
stargazer(mod.int, mod.alt, mod.des, mod2.mnl, mod3.mnl, mod4.mnl, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))
##
## ===================================================================================================================================
## Dependent variable:
## ---------------------------------------------------------------------------------------------------------------
## mode
## (1) (2) (3) (4) (5) (6)
## -----------------------------------------------------------------------------------------------------------------------------------
## (Intercept):bike -2.815*** -0.644*** -2.531*** -2.139*** -0.422 -0.969
## (0.076) (0.142) (0.188) (0.217) (0.231) (0.721)
##
## (Intercept):transit -4.522*** -4.065*** -3.972*** -4.361*** -3.541*** -4.538***
## (0.175) (0.178) (0.398) (0.429) (0.399) (0.432)
##
## time -5.786*** -6.219*** -96.146***
## (0.475) (0.539) (9.582)
##
## AGE:bike 0.0004 0.0003 0.0004 -0.003
## (0.001) (0.001) (0.001) (0.012)
##
## AGE:transit 0.002 0.002 0.001 0.002
## (0.001) (0.001) (0.001) (0.001)
##
## INCOME:bike -0.00000 -0.00000 -0.00000 -0.00000
## (0.00000) (0.00000) (0.00000) (0.00001)
##
## INCOME:transit -0.00001 -0.00001 -0.00001 -0.00001
## (0.00001) (0.00001) (0.00001) (0.00001)
##
## TRPDUR:bike -0.019*** 1.976***
## (0.006) (0.203)
##
## TRPDUR:transit 0.014** 0.331***
## (0.005) (0.032)
##
## -----------------------------------------------------------------------------------------------------------------------------------
## Observations 3,253 3,253 2,826 2,826 2,826 2,826
## R2 -0.000 0.162 0.004 0.018 0.171 0.838
## Log Likelihood -883.996 -741.102 -779.265 -768.113 -649.074 -127.127
## LR Test -0.000 (df = 2) 285.787*** (df = 3) 6.527 (df = 6) 28.832*** (df = 8) 266.909*** (df = 7) 1,310.803*** (df = 9)
## ===================================================================================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001