Introduction

This analysis examines the relationship between household vehicle ownership and per-trip travel distance, using mobility tracking data joined to participant demographics.

library(readr)
library(dplyr)
library(ggplot2)
# Load the demographics table; the relative path is resolved against the
# current working directory.
demographics <- read_csv(
  file = "demographics_clean.csv"
)
## Rows: 180 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (28): _id, user_token, user_id, At_your_primary_job_do_you_ha, Which_be...
## dbl  (14): data_ts, How_many_days_do_you_usually_w_001, Including_yourself_h...
## dttm  (1): data_fmt_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the trips table the same way.
trips <- read_csv(
  file = "trips_filtered.csv"
)
## Rows: 37907 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): user_token, user_id, data_start_loc_coordinates, data_end_loc_coor...
## dbl  (8): data_duration_minutes, data_distance, data_distance_meters, data_d...
## dttm (2): data_start_fmt_time, data_end_fmt_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reduce the demographics table to one row per user so the join below cannot
# duplicate trips; distinct() keeps the first row it finds for each user_id.
demographics_unique <- demographics %>%
  distinct(user_id, .keep_all = TRUE)

# Attach demographics to every trip. This is a left join, so trips whose
# user_id has no demographics row are kept with NA in the demographic columns.
# Both inputs also carry a non-key column with the same name (user_token is
# listed in both column specifications), which the default suffixes would
# rename to user_token.x / user_token.y. An empty left suffix keeps the trip
# columns under their original names and tags only the demographic duplicates.
data <- trips %>%
  left_join(
    demographics_unique,
    by = "user_id",
    suffix = c("", "_demographics")
  )
# Pull the two variables of interest out of the joined table.
distance <- data[["data_distance_miles"]]
vehicles <- data[["How_many_motor_vehicles_are_ow"]]

# Summary statistics for each variable; summary() also reports the NA count
# when there are any.
summary(distance)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.06236  0.69737  2.36948  4.58279  6.27473 68.86137
summary(vehicles)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  1.0000  0.6417  1.0000  3.0000    2049

# Explicit missing-value counts for each variable.
sum(is.na(distance))
## [1] 0
sum(is.na(vehicles))
## [1] 2049
# Keep only the trips where both analysis variables are present, i.e. drop a
# row as soon as either the distance or the vehicle count is NA.
data_clean <- filter(
  data,
  !(is.na(data_distance_miles) | is.na(How_many_motor_vehicles_are_ow))
)

# Number of trips left after removing incomplete rows.
nrow(data_clean)
## [1] 35858
# Histogram of trip distance over the complete-case rows.
with(
  data_clean,
  hist(
    data_distance_miles,
    main = "Distribution of Trip Distance",
    xlab = "Distance (Miles)"
  )
)

# Base-graphics scatter plot: vehicle count on x, trip distance on y.
with(
  data_clean,
  plot(
    x = How_many_motor_vehicles_are_ow,
    y = data_distance_miles,
    xlab = "Household Vehicles",
    ylab = "Trip Distance (Miles)",
    main = "Vehicle Ownership vs Trip Distance"
  )
)

# Scatter of trip distance against household vehicle count.
# The vehicle count has a very small range (0 to 3 in the summary output), so
# plain geom_point() stacks tens of thousands of trips onto a few vertical
# lines and hides where the data are concentrated. Horizontal jitter plus
# transparency makes the density visible; height = 0 leaves the distances
# untouched and the fixed seed keeps the figure reproducible between runs.
ggplot(data_clean,
       aes(x = How_many_motor_vehicles_are_ow,
           y = data_distance_miles)) +
  geom_point(alpha = 0.1,
             position = position_jitter(width = 0.2, height = 0, seed = 1)) +
  labs(title = "Relationship Between Vehicle Ownership and Trip Distance",
       x = "Number of Vehicles in Household",
       y = "Trip Distance (Miles)")

# Correlation between vehicle count and trip distance, computed over the
# complete-case trip rows with cor()'s default method (Pearson).
with(
  data_clean,
  cor(x = How_many_motor_vehicles_are_ow, y = data_distance_miles)
)
## [1] 0.1513782