library(readr)
library(dplyr)
library(ggplot2)
library(lmtest)
library(car)
library(MASS)
trips <- read_csv("trips_filtered.csv", show_col_types = FALSE)
table(trips$data_primary_predicted_mode)
##
## AIR_OR_HSR BICYCLING BUS CAR TRAIN UNKNOWN WALKING
## 1 1366 121 26739 3 1095 8582
trips_model <- trips %>%
mutate(
mode_group = case_when(
data_primary_predicted_mode == "CAR" ~ "CAR",
data_primary_predicted_mode == "WALKING" ~ "WALKING",
data_primary_predicted_mode == "BICYCLING" ~ "BICYCLING",
TRUE ~ "OTHER"
),
mode_group = factor(mode_group)
)
# Make CAR the reference category
trips_model$mode_group <- relevel(trips_model$mode_group, ref = "CAR")
model1 <- lm(data_duration_minutes ~ data_distance_miles + mode_group, data = trips_model)
summary(model1)
##
## Call:
## lm(formula = data_duration_minutes ~ data_distance_miles + mode_group,
## data = trips_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.978 -6.093 -2.644 3.514 47.440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.460680 0.078767 132.805 < 2e-16 ***
## data_distance_miles 1.454433 0.009012 161.394 < 2e-16 ***
## mode_groupBICYCLING 3.390970 0.258918 13.097 < 2e-16 ***
## mode_groupOTHER 0.248032 0.272210 0.911 0.362
## mode_groupWALKING 0.973592 0.125582 7.753 9.23e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.26 on 37902 degrees of freedom
## Multiple R-squared: 0.4422, Adjusted R-squared: 0.4421
## F-statistic: 7511 on 4 and 37902 DF, p-value: < 2.2e-16
plot(model1, which = 1)
# Check linearity with Rainbow test
raintest(model1)
##
## Rainbow test
##
## data: model1
## Rain = 0.98134, df1 = 18954, df2 = 18948, p-value = 0.9026
durbinWatsonTest(model1)
## lag Autocorrelation D-W Statistic p-value
## 1 0.00442653 1.991089 0.392
## Alternative hypothesis: rho != 0
plot(model1, which = 3)
# Check homoscedasticity with Breusch-Pagan test
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 1259, df = 4, p-value < 2.2e-16
plot(model1, which = 2)
# Check normality of residuals with Shapiro-Wilk test. The dataset is
too large, so I took a sample of residuals for the test
set.seed(123)
shapiro.test(sample(residuals(model1), 5000))
##
## Shapiro-Wilk normality test
##
## data: sample(residuals(model1), 5000)
## W = 0.87523, p-value < 2.2e-16
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## data_distance_miles 1.20308 1 1.096850
## mode_group 1.20308 3 1.031294
cor(dplyr::select(trips_model, data_distance_miles), use = "complete.obs")
## data_distance_miles
## data_distance_miles 1
trips_model <- trips_model %>%
mutate(
log_duration = log(data_duration_minutes + 1),
log_distance = log(data_distance_miles + 1)
)
model2 <- lm(log_duration ~ log_distance + mode_group, data = trips_model)
summary(model2)
##
## Call:
## lm(formula = log_duration ~ log_distance + mode_group, data = trips_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.46635 -0.30895 -0.04219 0.27522 2.17266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.719365 0.006239 275.595 <2e-16 ***
## log_distance 0.670566 0.003397 197.394 <2e-16 ***
## mode_groupBICYCLING 0.263124 0.012844 20.486 <2e-16 ***
## mode_groupOTHER 0.020399 0.013501 1.511 0.131
## mode_groupWALKING 0.382722 0.007248 52.805 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4585 on 37902 degrees of freedom
## Multiple R-squared: 0.5532, Adjusted R-squared: 0.5532
## F-statistic: 1.173e+04 on 4 and 37902 DF, p-value: < 2.2e-16
plot(model2, which = 1)
raintest(model2)
##
## Rainbow test
##
## data: model2
## Rain = 0.97877, df1 = 18954, df2 = 18948, p-value = 0.9302
durbinWatsonTest(model2)
## lag Autocorrelation D-W Statistic p-value
## 1 0.006101648 1.987654 0.218
## Alternative hypothesis: rho != 0
plot(model2, which = 3)
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 2764.8, df = 4, p-value < 2.2e-16
plot(model2, which = 2)
set.seed(123)
shapiro.test(sample(residuals(model2), 5000))
##
## Shapiro-Wilk normality test
##
## data: sample(residuals(model2), 5000)
## W = 0.98487, p-value < 2.2e-16
vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## log_distance 1.626925 1 1.275510
## mode_group 1.626925 3 1.084496
#A linear regression model was created using data_duration_minutes as the dependent variable and data_distance_miles along with mode_group as the independent variables. Tests were used to evaluate whether the model met the assumptions of linear regression. Based on the residual plot and the Rainbow test, the relationship between the variables appears to be reasonably linear. The Durbin–Watson test suggests that the errors are likely independent. The VIF values also indicate that multicollinearity is not a major concern in this model.
#However, the results suggest that the model may violate the assumption of homoscedasticity. The scale-location plot shows some variation in the spread of the residuals, and the Breusch–Pagan test suggests that the variance of the errors may not be constant. In addition, the QQ plot and Shapiro–Wilk test indicate that the residuals may not be perfectly normally distributed. Because travel behavior data often contain skewed values and outliers, this result is not unexpected.
#I used a log transformation to both trip duration and trip distance and estimated a second model using these transformed variables. This transformation helps reduce skewness and stabilize the residuals’ variance. After the transformation, the model better satisfies the assumptions of linear regression, although the assumptions may not be perfectly met.