Prepared by Gregory Mallon
October 23rd, 2021
Transportation Demand Modeling

Question 1

A) Summarry Statistics

The Households data set has a 4 interesting variables. Type of Residence (resty), Income (income), Number of Vehicles (hhveh), and Number of Trips (htrips) were examined and descriptive statistcs are published below.

Summary Statistics for Type of Residence (resty)

#install.packages("tidyverse")
library(tidyverse)
households <- read_csv("TDM Assignment 1_Mallon/data/household.csv")
households
## # A tibble: 6,449 × 60
##      sampn  area rtcqc   pnr advltr rmode rtmode  lang incen   day  assn ribus
##      <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2000019    21     1    NA      2    NA     NA    NA     2     2   415     2
##  2 2000051    21     4    NA      2    NA     NA    NA     2     5   397     2
##  3 2000056    21     1    NA      2    NA     NA    NA     2     3   381     1
##  4 2000057    21     4    NA      2    NA     NA    NA     2     5   355     2
##  5 2000095    21     1    NA      2    NA     NA    NA     2     2   373     2
##  6 2000116    21     2    NA      2    NA     NA    NA     2     4   375     2
##  7 2000122    21     2    NA      2    NA     NA    NA     2     1   379     2
##  8 2000138    21     2    NA      2    NA     NA    NA     2     1   365     2
##  9 2000151    21     1    NA      2    NA     NA    NA     2     5   355     2
## 10 2000165    21     1    NA      2    NA     NA    NA     2     3   353     1
## # … with 6,439 more rows, and 48 more variables: s1a <dbl>, wabik <dbl>,
## #   hhsiz <dbl>, hhveh <dbl>, vflag <dbl>, vhold <dbl>, bikes <dbl>,
## #   flexc <dbl>, resty <dbl>, o_resty <chr>, own <dbl>, o_own <lgl>,
## #   hlive <dbl>, bfcity <chr>, bfstate <chr>, bfzip <dbl>, bfres <dbl>,
## #   o_bfres <chr>, bfown <dbl>, o_bfown <lgl>, cplns <dbl>, phlns <dbl>,
## #   income <dbl>, race <dbl>, o_race <chr>, htrips <dbl>, hhwrk <dbl>,
## #   hhstu <dbl>, hhlic <dbl>, nonrelat <dbl>, future <dbl>, hcity <chr>, …
options(digits=2)
# count number of households by type of residence NA answers excluded and percentage column added
households_sub <- households %>% 
  filter(!is.na(resty), resty != 9) 
households_sub %>% 
  count(resty) %>%
  mutate(freq = n / sum(n) * 100)
## # A tibble: 5 × 3
##   resty     n   freq
##   <dbl> <int>  <dbl>
## 1     1  5142 79.8  
## 2     2   188  2.92 
## 3     3  1000 15.5  
## 4     4   107  1.66 
## 5     7     7  0.109
# summarize household size by type of residence
households_sub %>% 
  group_by(resty) %>%
  summarize(avg_hhsiz=mean(hhsiz))
## # A tibble: 5 × 2
##   resty avg_hhsiz
##   <dbl>     <dbl>
## 1     1      2.55
## 2     2      1.89
## 3     3      1.63
## 4     4      1.90
## 5     7      1.86
# Graph showing the number of residences categorized by type
library(ggplot2)
ggplot(households_sub) +
  geom_bar(aes(x=factor(resty, labels= c("Single Family", "Duplex", 
                                         "Apt w/ 3+ Units", "Mobile Home", "Other")))) +
  xlab("Type of Residence")

Summary Statistics for Income (income)

options(digits=2)
# count number of incomes within income brackets, NA answers excluded and percentage column added
households_sub <- households %>% 
  filter(!is.na(income), income != 99)
households_sub %>% 
  count(income) %>%
  mutate(freq = n / sum(n) * 100)
## # A tibble: 8 × 3
##   income     n  freq
##    <dbl> <int> <dbl>
## 1      1   314  5.22
## 2      2   485  8.06
## 3      3   479  7.96
## 4      4   743 12.3 
## 5      5  1321 21.9 
## 6      6  1134 18.8 
## 7      7  1013 16.8 
## 8      8   530  8.81
# Graph showing the number of income within income categories
library(ggplot2)
ggplot(households_sub) +
geom_bar(aes(x=factor(income, labels= c("$0 - $14.9", "$15 - $24.9", "25 - $34.9", "$35 - $49.9", "$50 - $74.9", "$75 - $99.9", "$100 - $149.9", "$150 or more")))) +
xlab("Income Categories (in Thousands)") +
ggtitle("Number of Houshold Incomes within Income Categories")

Summary Statistics for Number of Vehicles (hhveh)

households_sub %>%
  count(hhveh) %>%
  mutate(freq = n / sum(n) *100)
## # A tibble: 9 × 3
##   hhveh     n   freq
##   <dbl> <int>  <dbl>
## 1     0   335  5.57 
## 2     1  1833 30.5  
## 3     2  2376 39.5  
## 4     3   981 16.3  
## 5     4   331  5.50 
## 6     5    99  1.64 
## 7     6    42  0.698
## 8     7    11  0.183
## 9     8    11  0.183
library(ggplot2)
ggplot(households_sub) +
geom_bar(aes(x=factor(hhveh, labels= c("0", "1", "2", "3", "4", "5", "6", "7", "8 or More")))) +
xlab("Number of Vehicles Within Household") +
ggtitle("Number of Vehicles Per Household")

Summary Statistics for Number of Trips (htrips)

options(digits=2)
# count number of trips
households_sub %>%
  count(htrips) %>%
  mutate(freq = n / sum(n) * 100)
## # A tibble: 55 × 3
##    htrips     n  freq
##     <dbl> <int> <dbl>
##  1      0   312 5.18 
##  2      1    18 0.299
##  3      2   586 9.74 
##  4      3   229 3.80 
##  5      4   524 8.71 
##  6      5   314 5.22 
##  7      6   512 8.51 
##  8      7   291 4.83 
##  9      8   429 7.13 
## 10      9   263 4.37 
## # … with 45 more rows
houshold_sub <- households %>%
  mean(htrips, na.rm = TRUE)

B) Calculating the Avgerage Number of Trips by Number of Vehicles and Income

Average Number of Trips by Number of Vehicles

The table below shows household number of vehicle in the center column ranging from 0-8 and average number of trips on the column to the right. For households with 0-1 vehicles, the average number of trips are around 7-8. For households with a number of vehicles ranging from 2-6, the average number of trips is in the range of 11-12. This suggests that the number of vehicles does not make an impact to the average number of trips taken by a household within the range. However, for households with 7 vehicles the average number of trips is 13. This is a slight uptick in average trips. Households with 8 or more vehicles show an average number of trips close to 8. This average number of trips is much closer to households with vehicles ranging from 0-1.

# Average number of trips by number of vehicles
households_sub %>%
  select(hhveh, htrips) %>%
  group_by(hhveh) %>%
  summarise(avg_htrips =mean(htrips))
## # A tibble: 9 × 2
##   hhveh avg_htrips
##   <dbl>      <dbl>
## 1     0       7.97
## 2     1       7.00
## 3     2      11.1 
## 4     3      11.5 
## 5     4      12.2 
## 6     5      12.2 
## 7     6      11.1 
## 8     7      13.6 
## 9     8       8.36

Average Number of Trips by Income

The table below shows income categories on the center column ranging from 1-8 and average number of trips on the column to the right. Income categories 1-3 ($0 to 34,999 per year) showed similar averages regarding number of trips, this is right around 6-7 average trips. Income category 4 ($35,000-$49,999 per year) showed a slight increase to average number of trips, this is about 9.5 trips. Income categories 6-8 ($50,000-$150,000 or More per year) showed average number trips increasing to about 11-13. This suggests households with higher income generally take more trips.

# Average number of trips by income
households_sub %>%
  select(income, htrips) %>%
  group_by(income) %>%
  summarise(avg_htrips =mean(htrips))
## # A tibble: 8 × 2
##   income avg_htrips
##    <dbl>      <dbl>
## 1      1       6.36
## 2      2       6.84
## 3      3       6.83
## 4      4       8.08
## 5      5       9.68
## 6      6      11.2 
## 7      7      12.2 
## 8      8      12.6

C) Creating New Variables Income Categories (inc_cat) and Income Midpoints (incval)

Create new variable inc_cat

# Recoding Income to Income Categories (inc_) (0-24000, 25000-49999, 50000+)
households_sub <- households %>%
  filter(!is.na(income), income != 99) %>%
  mutate(inc_cat=case_when(income == 1 ~ "0-24999",
                           income <= 2 ~ "25000-49999",
                           income >= 3 ~ "50000+"
  ))

Count number of housholds within each income category

# Count of the number of households within the each of the new income categories. 
households_sub %>%
  count(inc_cat)
## # A tibble: 3 × 2
##   inc_cat         n
##   <chr>       <int>
## 1 0-24999       314
## 2 25000-49999   485
## 3 50000+       5220

Average number of trips by income category

# Average number of trips by the income categories
households_sub %>%
  select(inc_cat, htrips) %>%
  group_by(inc_cat) %>%
  summarise(avg_htrips =mean(htrips))
## # A tibble: 3 × 2
##   inc_cat     avg_htrips
##   <chr>            <dbl>
## 1 0-24999           6.36
## 2 25000-49999       6.84
## 3 50000+           10.3

New variable incval mean and standard deviation

Explain why the mean and standard deviation from the original income category (INCOME) variable would not be useful.

The mean and standard deviation of the income variable is not very useful because there is a large gaps between income which first would inflate the average income to a number that is a lot higher than what most people make. 2nd, the standard deviation would be so big that it wouldn’t be helpful to report on as ait would catch a huge difference in income levels.

Calculate the correlation between INCVAL and number of trips (HTRIPS). The correlation between incval and htrips is a weak positive relationship at .23

Create new variable incval

# creating incval
households_sub <- households_sub %>%
  mutate(incval=case_when(income == 1 ~ (0+15000)/2,
                          income == 2 ~ (15000+25000)/2,
                          income == 3 ~ (25000+35000)/2,
                          income == 4 ~ (35000+50000)/2,
                          income == 5 ~ (50000+75000)/2,
                          income == 6 ~ (75000+100000)/2,
                          income == 7 ~ (100000+150000)/2,
                          income == 8 ~ 250000,
                          TRUE ~ NA_real_
         ))

Compute the mean and standard deviation of incval

# Average/mean incval
households_sub %>%
  summarize(avg_incval=mean(incval))
## # A tibble: 1 × 1
##   avg_incval
##        <dbl>
## 1     82890.
# Standard Deviation of incval
households_sub %>%
  summarise(sd(incval))
## # A tibble: 1 × 1
##   `sd(incval)`
##          <dbl>
## 1       62390.

Calculate the correlation between incval and number of trips (htrips)

#Calculate the correlation between incval and number of trips (HTRIPS)
households_sub %>%
  select(incval, htrips) %>%
  cor()
##        incval htrips
## incval   1.00   0.23
## htrips   0.23   1.00

D) Recode number household vehicles (hhveh) to number of houshold vehicle categories (hhveh_cat)

The advantage of the calculation below over the calculation completed in B) is that it reduces the number of categories that are being compare which in this case makes it a bit less noisy. This calculation easily demonstrates that there is this linear relationship between number of vehicles and the number of trips until a certain point. At that point (right around 5 or more vehicles) where we can that the number of vehicles does not affect the number of trips in the same way that it did with lower vehicled numbers.

Recode hhveh into a new variable hhveh_cat

#hhveh into categories hhveh_cat
households_sub <- households_sub %>%
  mutate(hhveh_cat=case_when(hhveh == 0 ~ "0",
                              hhveh <= 2 ~ "1-2",
                              hhveh <= 5 ~ "3-5",
                              hhveh > 5 ~ "5+"
     ))

Calculate average number of trips by number of household vehicle categories (hhveh_cat)

# Average number of trips by number of household vehicles categories (hhveh_cat)
households_sub %>%
  select(hhveh_cat, htrips) %>%
  group_by(hhveh_cat) %>%
  summarise(avg_htrips =mean(htrips))
## # A tibble: 4 × 2
##   hhveh_cat avg_htrips
##   <chr>          <dbl>
## 1 0               7.97
## 2 1-2             9.33
## 3 3-5            11.7 
## 4 5+             11.0

E) Among INCOME and HHVEH and their transformed variables (INCVAL, INC_CAT, HHVEH_CAT), which is the best predictor for HTRIPS? Why? Theorize what other variables in the household dataset affect number of trips a household makes?

Income is a better predictor for household number of trips. This is likely because income is related to another factors that predict number of trips such as living in a a single-family residence. Those that have higher incomes are more likely to live in those single-family households. On top of that - I think there some of this single-family/suburban type living may not have the transit resources available to people living within this context.

Other variables I think are probably good predictors of number of household trips would include things like household size. generally if more people are living in one household that that household would have more trips. Owned homes vs rented would probably a pretty good indicator of income therefore would probably be a good predictor of trips.

households_sub %>%
  select(hhveh, htrips) %>%
  cor()
##        hhveh htrips
## hhveh    1.0    0.2
## htrips   0.2    1.0
households_sub %>%
  select(income, htrips) %>%
  cor()
##        income htrips
## income   1.00   0.27
## htrips   0.27   1.00

F) Use techniques of statistical analysis (correlation, frequency cross tabulation) and visualization (scatter plot, bar chart, etc) to determine whether INCOME (or INCVAL, INC_CAT) and HHVEH (or HHVEH_CAT) affects RESTY (or a recoded version of RESTY), and which one is better at predicting RESTY.

Below is an analysis of the number of household vehicles categories (hhveh_Cat) and how that affects household description (resty).

As you can see in the table below for the category of 3-5 household vehicles, 96% of record indicated that those respondents lived in single-family homes. For the category of 1-2 vehicles 78.5 percent of respondents lived in single-family residence. It is interesting to note that for residents having no vehicles that meant there was a 63% chance of living in a multifamily apartment building. This shows that number of vehicles can be a good predictor of residence type.

##compare hhevh_cat to Resty
households_sub <- households_sub %>%
  filter(!is.na(resty), resty != 9)
households_sub %>%
  count(resty, hhveh_cat) %>%
  group_by(hhveh_cat) %>%
  mutate(percent=n/sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = "hhveh_cat", values_from = "percent") %>%
  knitr::kable()
resty 0 1-2 3-5 5+
1 0.26 0.79 0.96 0.98
2 0.07 0.03 0.01 NA
3 0.63 0.16 0.02 0.02
4 0.03 0.02 0.01 NA
7 NA 0.00 0.00 NA

Comparing income categories to housing type (resty) It is interesting to note that for household incomes greater than $50k per means that there is a 85% chance that household will be located in a single-family residence. This is in contrast to lower end of the income spectrum 0-$52k per year meant there was a 54% chance that a person in this income categoy was living in a multifamily apartment type building.

##compare inc_cat to Resty
households_sub <- households_sub %>%
  filter(!is.na(resty), resty != 9)
households_sub %>%
  count(resty, inc_cat) %>%
  group_by(inc_cat) %>%
  mutate(percent=n/sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = "inc_cat", values_from = "percent") %>%
knitr::kable()
resty 0-24999 25000-49999 50000+
1 0.34 0.53 0.85
2 0.08 0.06 0.02
3 0.54 0.35 0.11
4 0.05 0.05 0.01
7 NA NA 0.00

Question 2

Explain the weight column (hhwgt and exphhwgt) is or does?

weighting columns such as hhwgt and exphhwgt is a score of how representative a specific type of household (or record) is of the survey responses. Due to many factors survey responses are not representative of the community. In order to make a sample representative of the community a weighted score is assigned to a specific type of household this weighting factor can be more or less than one which helps to make a sample more representative. These scores are often developed from working with census data as that is more representational of a data set by it’s nature.

Compute the number of vehicles per household in the survey and then use weights to calculate the weighted average vehicles per household.

Average number of vehicles per household

##Calculate AVG number of vehicles per household
households %>%
  summarize(avg_hhveh=mean(hhveh))
## # A tibble: 1 × 1
##   avg_hhveh
##       <dbl>
## 1      1.95

Weighted average number of vehicles per household

households_sub %>%
  mutate(weighted_hhveh=hhveh*exphhwgt) %>% 
  filter(!is.na(exphhwgt)) %>% 
  #na.omit(exphhwgt)
  summarise(sum(weighted_hhveh)/sum(exphhwgt))
## # A tibble: 1 × 1
##   `sum(weighted_hhveh)/sum(exphhwgt)`
##                                 <dbl>
## 1                                1.85

Is there a difference between the average number of vehicles and the weighted average number of vehicles? If so, what does the difference tell you?

There is a difference between the unweighted and weighted average number of vehicles. The unweighted shows about 1.95 cars per household, while the weighted average is about 1.85 cars per household. This difference tells me that households with more cars are over represented in this survey.(meaning they respond to these surveys at a rate higher than other counterparts)

Compute the percent of households that use transit at least once per week (ribus) with the both unweighted and weighting factor.

households_sub %>%
count(ribus) %>%
mutate(freq = n / sum(n) *100)
## # A tibble: 3 × 3
##   ribus     n    freq
##   <dbl> <int>   <dbl>
## 1     1  1671 27.8   
## 2     2  4337 72.1   
## 3     9     6  0.0998
households %>%
  summarize(avg_ribus=mean(ribus))
## # A tibble: 1 × 1
##   avg_ribus
##       <dbl>
## 1      1.73
households_sub <- households_sub %>% 
  mutate(weighted_ribus=ribus*exphhwgt) %>% 
  filter(!is.na(exphhwgt)) 
  #na.omit(exphhwgt)
  
  households_sub %>%
  count(weighted_ribus) %>%
  mutate(freq = n / sum(n) *100)
## # A tibble: 1,620 × 3
##    weighted_ribus     n   freq
##             <dbl> <int>  <dbl>
##  1           7.98     1 0.0167
##  2           8.33     1 0.0167
##  3           9.51     1 0.0167
##  4           9.79     1 0.0167
##  5          10.6      1 0.0167
##  6          10.7      1 0.0167
##  7          11.0      1 0.0167
##  8          13.4      1 0.0167
##  9          13.5      1 0.0167
## 10          13.6      1 0.0167
## # … with 1,610 more rows
  households_sub %>%
  mutate(weighted_ribus=ribus*exphhwgt) %>% 
  filter(!is.na(exphhwgt)) %>% 
  #na.omit(exphhwgt)
  summarise(sum(weighted_ribus)/sum(exphhwgt))
## # A tibble: 1 × 1
##   `sum(weighted_ribus)/sum(exphhwgt)`
##                                 <dbl>
## 1                                1.72