library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the filtered trip records (column-type messages are suppressed).
trips <- read_csv("trips_filtered.csv", show_col_types = FALSE)

# Build the modelling table: collapse the predicted travel mode into four
# groups. CAR, WALKING and BICYCLING keep their own label; every other value
# (including missing ones) is pooled into "OTHER". The result is stored as a
# factor whose reference (first) level is "CAR".
trips_model <- trips %>%
  mutate(
    mode_group = if_else(
      data_primary_predicted_mode %in% c("CAR", "WALKING", "BICYCLING"),
      data_primary_predicted_mode,
      "OTHER"
    ),
    mode_group = relevel(factor(mode_group), ref = "CAR")
  )

# Aliases for the response (y) and the two predictors (x1, x2).
# NOTE(review): the lm() call below refers to the columns by name through
# `data = trips_model`, so these aliases are not used by it -- confirm
# whether anything later in the file needs them before removing.
y <- trips_model$data_duration_minutes
x1 <- trips_model$data_distance_miles
x2 <- trips_model$mode_group
# Fit a linear model of data_duration_minutes on data_distance_miles and the
# mode_group factor; mode_group enters as dummy variables relative to its
# first (reference) level.
model1 <- lm(data_duration_minutes ~ data_distance_miles + mode_group, data = trips_model)
# Print the coefficient table, residual summary, R-squared and F-statistic.
summary(model1)
## 
## Call:
## lm(formula = data_duration_minutes ~ data_distance_miles + mode_group, 
##     data = trips_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.978  -6.093  -2.644   3.514  47.440 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         10.460680   0.078767 132.805  < 2e-16 ***
## data_distance_miles  1.454433   0.009012 161.394  < 2e-16 ***
## mode_groupBICYCLING  3.390970   0.258918  13.097  < 2e-16 ***
## mode_groupOTHER      0.248032   0.272210   0.911    0.362    
## mode_groupWALKING    0.973592   0.125582   7.753 9.23e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.26 on 37902 degrees of freedom
## Multiple R-squared:  0.4422, Adjusted R-squared:  0.4421 
## F-statistic:  7511 on 4 and 37902 DF,  p-value: < 2.2e-16
# Diagnostic plot for model1; `which = 1` selects the residuals-versus-fitted
# panel, used to check the linearity assumption.
plot(model1, which = 1)

A linear regression model was estimated to examine whether trip distance and predicted travel mode are associated with trip duration. The dependent variable was data_duration_minutes, and the independent variables were data_distance_miles and mode_group, a simplified version of the predicted travel mode variable with four categories (car, walking, bicycling, and other), using car as the reference category. The model produced an R-squared of approximately 0.442, meaning that about 44.2% of the variation in trip duration is explained by the variables included in the model. The statistically significant predictors were trip distance, walking, and bicycling, all with p-values below 0.001. The "other" travel mode category was not significantly different from car (p = 0.362). The coefficient for trip distance suggests that each additional mile is associated with an increase of about 1.45 minutes in trip duration, holding travel mode constant. Compared with car trips, walking trips take about 0.97 more minutes and bicycling trips take about 3.39 more minutes, on average, for the same distance. The residuals-versus-fitted plot is used to evaluate the linearity assumption, and the relationship appears to be mostly linear overall.