library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the filtered trip records; show_col_types = FALSE keeps readr from
# printing the guessed column specification.
trips <- read_csv("trips_filtered.csv", show_col_types = FALSE)

# Collapse the predicted travel mode into four groups: CAR, WALKING and
# BICYCLING keep their own label, and every other value (including NA) becomes
# "OTHER". The result is stored as a factor with CAR as the reference level,
# so the mode coefficients below are contrasts against car trips.
trips_model <- trips %>%
  mutate(
    mode_group = case_when(
      data_primary_predicted_mode %in% c("CAR", "WALKING", "BICYCLING") ~
        data_primary_predicted_mode,
      TRUE ~ "OTHER"
    ),
    mode_group = relevel(factor(mode_group), ref = "CAR")
  )

# Standalone copies of the response (y) and the two predictors (x1, x2).
y <- trips_model$data_duration_minutes
x1 <- trips_model$data_distance_miles
x2 <- trips_model$mode_group

# Ordinary least squares: trip duration (minutes) regressed on trip distance
# (miles) and the grouped travel mode.
model1 <- lm(
  data_duration_minutes ~ data_distance_miles + mode_group,
  data = trips_model
)
summary(model1)
##
## Call:
## lm(formula = data_duration_minutes ~ data_distance_miles + mode_group,
## data = trips_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.978 -6.093 -2.644 3.514 47.440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.460680 0.078767 132.805 < 2e-16 ***
## data_distance_miles 1.454433 0.009012 161.394 < 2e-16 ***
## mode_groupBICYCLING 3.390970 0.258918 13.097 < 2e-16 ***
## mode_groupOTHER 0.248032 0.272210 0.911 0.362
## mode_groupWALKING 0.973592 0.125582 7.753 9.23e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.26 on 37902 degrees of freedom
## Multiple R-squared: 0.4422, Adjusted R-squared: 0.4421
## F-statistic: 7511 on 4 and 37902 DF, p-value: < 2.2e-16
# Diagnostic plot for model1: which = 1 requests the residuals-versus-fitted
# panel, which is used to check the linearity assumption.
plot(model1, which = 1)

A linear regression model was estimated to examine whether trip
distance and predicted travel mode are associated with trip duration. The dependent
variable was `data_duration_minutes`, and the independent
variables were `data_distance_miles` and
`mode_group`, a simplified version of the predicted travel
mode variable. The model produced an R-squared of approximately
0.442, meaning that about 44.2% of the
variation in trip duration is explained by the variables
included in the model. The statistically significant variables were
trip distance, walking, and
bicycling, all with p-values below 0.001. The
other travel mode category was not statistically
significant (p = 0.362). The coefficient for trip distance suggests that each
additional mile is associated with an increase of about 1.45
minutes in trip duration, holding travel mode constant.
Compared with car trips, walking trips take about 0.97 more
minutes and bicycling trips take about 3.39 more
minutes, on average, for the same distance. The
residuals-versus-fitted plot is used to check the linearity assumption, and the
relationship appears to be mostly linear overall.