library(readr)
library(dplyr)
library(ggplot2)
library(lmtest)
library(car)
library(MASS)
trips <- read_csv("trips_filtered.csv", show_col_types = FALSE)
table(trips$data_primary_predicted_mode)
## 
## AIR_OR_HSR  BICYCLING        BUS        CAR      TRAIN    UNKNOWN    WALKING 
##          1       1366        121      26739          3       1095       8582

The dependent variable is trip duration in minutes

The independent variables are trip distance in miles and predicted travel mode

trips_model <- trips %>%
  mutate(
    mode_group = case_when(
      data_primary_predicted_mode == "CAR" ~ "CAR",
      data_primary_predicted_mode == "WALKING" ~ "WALKING",
      data_primary_predicted_mode == "BICYCLING" ~ "BICYCLING",
      TRUE ~ "OTHER"
    ),
    mode_group = factor(mode_group)
  )
# Make CAR the reference category
trips_model$mode_group <- relevel(trips_model$mode_group, ref = "CAR")

Linear model with a continuous dependent variable

model1 <- lm(data_duration_minutes ~ data_distance_miles + mode_group, data = trips_model)
summary(model1)
## 
## Call:
## lm(formula = data_duration_minutes ~ data_distance_miles + mode_group, 
##     data = trips_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.978  -6.093  -2.644   3.514  47.440 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         10.460680   0.078767 132.805  < 2e-16 ***
## data_distance_miles  1.454433   0.009012 161.394  < 2e-16 ***
## mode_groupBICYCLING  3.390970   0.258918  13.097  < 2e-16 ***
## mode_groupOTHER      0.248032   0.272210   0.911    0.362    
## mode_groupWALKING    0.973592   0.125582   7.753 9.23e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.26 on 37902 degrees of freedom
## Multiple R-squared:  0.4422, Adjusted R-squared:  0.4421 
## F-statistic:  7511 on 4 and 37902 DF,  p-value: < 2.2e-16

Check linearity with residual plot

plot(model1, which = 1)

# Check linearity with Rainbow test

raintest(model1)
## 
##  Rainbow test
## 
## data:  model1
## Rain = 0.98134, df1 = 18954, df2 = 18948, p-value = 0.9026

Check independence of errors with Durbin-Watson

durbinWatsonTest(model1)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.00442653      1.991089   0.392
##  Alternative hypothesis: rho != 0

Check homoscedasticity with scale-location plot

plot(model1, which = 3)

# Check homoscedasticity with Breusch-Pagan test

bptest(model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 1259, df = 4, p-value < 2.2e-16

Check normality of residuals with QQ plot

plot(model1, which = 2)

# Check normality of residuals with Shapiro-Wilk test. The dataset is too large, so I took a sample of residuals for the test

set.seed(123)
shapiro.test(sample(residuals(model1), 5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(residuals(model1), 5000)
## W = 0.87523, p-value < 2.2e-16

Check multicollinearity with VIF

vif(model1)
##                        GVIF Df GVIF^(1/(2*Df))
## data_distance_miles 1.20308  1        1.096850
## mode_group          1.20308  3        1.031294

Check correlation among numeric predictors. Only one continuous predictor, so correlation is limited

cor(dplyr::select(trips_model, data_distance_miles), use = "complete.obs")
##                     data_distance_miles
## data_distance_miles                   1
trips_model <- trips_model %>%
  mutate(
    log_duration = log(data_duration_minutes + 1),
    log_distance = log(data_distance_miles + 1)
  )
model2 <- lm(log_duration ~ log_distance + mode_group, data = trips_model)
summary(model2)
## 
## Call:
## lm(formula = log_duration ~ log_distance + mode_group, data = trips_model)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.46635 -0.30895 -0.04219  0.27522  2.17266 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.719365   0.006239 275.595   <2e-16 ***
## log_distance        0.670566   0.003397 197.394   <2e-16 ***
## mode_groupBICYCLING 0.263124   0.012844  20.486   <2e-16 ***
## mode_groupOTHER     0.020399   0.013501   1.511    0.131    
## mode_groupWALKING   0.382722   0.007248  52.805   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4585 on 37902 degrees of freedom
## Multiple R-squared:  0.5532, Adjusted R-squared:  0.5532 
## F-statistic: 1.173e+04 on 4 and 37902 DF,  p-value: < 2.2e-16
plot(model2, which = 1)

raintest(model2)
## 
##  Rainbow test
## 
## data:  model2
## Rain = 0.97877, df1 = 18954, df2 = 18948, p-value = 0.9302
durbinWatsonTest(model2)
##  lag Autocorrelation D-W Statistic p-value
##    1     0.006101648      1.987654   0.218
##  Alternative hypothesis: rho != 0
plot(model2, which = 3)

bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 2764.8, df = 4, p-value < 2.2e-16
plot(model2, which = 2)

set.seed(123)
shapiro.test(sample(residuals(model2), 5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(residuals(model2), 5000)
## W = 0.98487, p-value < 2.2e-16
vif(model2)
##                  GVIF Df GVIF^(1/(2*Df))
## log_distance 1.626925  1        1.275510
## mode_group   1.626925  3        1.084496

#A linear regression model was created using data_duration_minutes as the dependent variable and data_distance_miles along with mode_group as the independent variables. Tests were used to evaluate whether the model met the assumptions of linear regression. Based on the residual plot and the Rainbow test, the relationship between the variables appears to be reasonably linear. The Durbin–Watson test suggests that the errors are likely independent. The VIF values also indicate that multicollinearity is not a major concern in this model.

#However, the results suggest that the model may violate the assumption of homoscedasticity. The scale-location plot shows some variation in the spread of the residuals, and the Breusch–Pagan test suggests that the variance of the errors may not be constant. In addition, the QQ plot and Shapiro–Wilk test indicate that the residuals may not be perfectly normally distributed. Because travel behavior data often contain skewed values and outliers, this result is not unexpected.

#I used a log transformation to both trip duration and trip distance and estimated a second model using these transformed variables. This transformation helps reduce skewness and stabilize the residuals’ variance. After the transformation, the model better satisfies the assumptions of linear regression, although the assumptions may not be perfectly met.