Loading packages
Reading data
data <- read.csv("C:/Users/49765/Desktop/Advanced Planning Methods/Group/Group_project_deliverable_2/June_cleaned.csv")
str(data)
## 'data.frame': 482476 obs. of 11 variables:
## $ ori : int 75 255 256 976 1519 74 75 255 256 434 ...
## $ dst : int 74 74 74 74 74 75 75 75 75 75 ...
## $ dist_km : num 0.38 0.83 1.05 4.57 4.21 0.38 0 0.45 0.66 1.34 ...
## $ fsize_m6_wd: num 0 0.136 0.591 15.409 62.955 ...
## $ trip_m6_wd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mrt_km_o : num 3.27 3.14 2.91 1.97 0.52 3.64 3.27 3.14 2.91 3.57 ...
## $ cycle_km_o : num 0 0 0 0 0.52 0 0 0 0 0 ...
## $ far_hdb_o : num 0 0 0 0.000442 0.742848 ...
## $ far_priv_o : num 0 0 0 0 0 0 0 0 0 0 ...
## $ far_comm_o : num 0 0 0 0 0.536 ...
## $ time : int 0 0 0 0 0 0 0 0 0 0 ...
Correlation
data$fsize_s <- data$fsize_m6_wd^2
Using_data <- data[c("dist_km", "fsize_m6_wd", "trip_m6_wd", "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o", "fsize_s")]
correlation_matrix <- cor(Using_data)
color_theme <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(correlation_matrix,
method = "color",
col = color_theme(200),
order = "hclust",
addCoef.col = "darkgrey"
)
# check order terms
# whether include a higher order term (Ramsey’s RESET Test) --> answer is yes
resettest(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o + far_hdb_o + far_priv_o + far_comm_o + time, power=2 , type="regressor", data = data)
##
## RESET test
##
## data: trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o + far_hdb_o + far_priv_o + far_comm_o + time
## RESET = 2416.7, df1 = 8, df2 = 482459, p-value < 2.2e-16
# check different higher order terms
# ***
data$fsize_m6_wd2 <- data$fsize_m6_wd^2
data$dist_km2 <- data$dist_km^2
data$mrt_km_o2 <- data$mrt_km_o^2
# **
data$far_hdb_o2 <- data$far_hdb_o^2
# *
data$cycle_km_o2 <- data$cycle_km_o^2
data$cycle_km_o2 <- data$cycle_km_o^2
# none
data$far_priv_o2 <- data$far_priv_o^2
data$far_comm_o2 <- data$far_comm_o^2
# list of models
ori_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time, data = data)
RdNDdist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + dist_km2, data = data)
FleetSize2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2 , data = data)
MRTDist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + mrt_km_o2, data = data)
tab_model(ori_model, FleetSize2_model, RdNDdist2_model, MRTDist2_model,
show.ci = 0.95,
pred.labels = c("(Intercept)", "Rd Network distance (O-D)","Bike fleet size","Distance to nearest MRT STA", "Bike lanes length",
"Floor Area Ratio of Public Housing Bldg", "Floor Area Ratio of Private Residential Bldg",
"Floor Area Ratio of Commercial Bldg", "Morning/Evening",
"Rd Network distance (O-D)_2","Bike fleet size_2", "Dist to nearest MRT STA_2"),
dv.labels = c("model1", "model2",
"model3", "model4"),
file = "higher_orders_comparison.html")
|
Â
|
model1
|
model2
|
model3
|
model4
|
|
Predictors
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
|
(Intercept)
|
1.19
|
1.16 – 1.21
|
<0.001
|
1.12
|
1.09 – 1.15
|
<0.001
|
2.44
|
2.41 – 2.47
|
<0.001
|
1.24
|
1.21 – 1.27
|
<0.001
|
|
Rd Network distance (O-D)
|
-0.38
|
-0.39 – -0.38
|
<0.001
|
-0.39
|
-0.39 – -0.38
|
<0.001
|
-1.29
|
-1.30 – -1.28
|
<0.001
|
-0.38
|
-0.39 – -0.38
|
<0.001
|
|
Bike fleet size
|
0.04
|
0.03 – 0.04
|
<0.001
|
0.05
|
0.04 – 0.05
|
<0.001
|
0.04
|
0.04 – 0.04
|
<0.001
|
0.04
|
0.03 – 0.04
|
<0.001
|
|
Distance to nearest MRT STA
|
-0.04
|
-0.05 – -0.03
|
<0.001
|
-0.03
|
-0.04 – -0.02
|
<0.001
|
-0.08
|
-0.09 – -0.07
|
<0.001
|
-0.09
|
-0.10 – -0.07
|
<0.001
|
|
Bike lanes length
|
0.25
|
0.14 – 0.36
|
<0.001
|
0.12
|
0.01 – 0.23
|
0.037
|
0.25
|
0.14 – 0.36
|
<0.001
|
0.26
|
0.15 – 0.37
|
<0.001
|
|
Floor Area Ratio of Public Housing Bldg
|
0.07
|
0.05 – 0.08
|
<0.001
|
0.03
|
0.02 – 0.05
|
<0.001
|
0.09
|
0.08 – 0.11
|
<0.001
|
0.07
|
0.05 – 0.08
|
<0.001
|
|
Floor Area Ratio of Private Residential Bldg
|
-0.03
|
-0.05 – -0.00
|
0.028
|
-0.04
|
-0.06 – -0.01
|
0.003
|
-0.03
|
-0.05 – -0.00
|
0.022
|
-0.03
|
-0.05 – -0.00
|
0.024
|
|
Floor Area Ratio of Commercial Bldg
|
-0.01
|
-0.03 – 0.02
|
0.525
|
-0.01
|
-0.04 – 0.01
|
0.262
|
0.01
|
-0.01 – 0.04
|
0.269
|
-0.02
|
-0.04 – 0.00
|
0.119
|
|
Morning/Evening
|
0.30
|
0.28 – 0.32
|
<0.001
|
0.27
|
0.25 – 0.29
|
<0.001
|
0.29
|
0.27 – 0.31
|
<0.001
|
0.30
|
0.28 – 0.32
|
<0.001
|
|
Rd Network distance (O-D)_2
|
|
|
|
-0.00
|
-0.00 – -0.00
|
<0.001
|
|
|
|
|
|
|
|
Bike fleet size_2
|
|
|
|
|
|
|
0.11
|
0.11 – 0.11
|
<0.001
|
|
|
|
|
Dist to nearest MRT STA_2
|
|
|
|
|
|
|
|
|
|
0.01
|
0.00 – 0.01
|
<0.001
|
|
Observations
|
482476
|
482476
|
482476
|
482476
|
|
R2 / R2 adjusted
|
0.081 / 0.081
|
0.082 / 0.082
|
0.115 / 0.115
|
0.081 / 0.081
|
Origin Model Combination
variables <- c("dist_km", "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o")
# Stores all combinations into a list
all_combinations <- c()
# Combination of 1 to 7 variables
for (num_variables in 1:6) {
# Get all possible combinations
variable_combinations <- combn(variables, num_variables, simplify = TRUE)
# Add the name of the combination to the list
for (i in 1:ncol(variable_combinations)) {
combination_name <- paste(variable_combinations[, i], collapse = " + ")
all_combinations <- c(all_combinations, combination_name)
}
}
# Create an empty list for storing all models
all_models <- list()
# Iterate over all combinations and fit a linear regression model
for (combination in all_combinations) {
formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd +", combination))
model <- lm(formula, data = data)
model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
all_models[[model_name]] <- model
}
# Create an empty list to store the R-squared values for all models
r_square_values <- list()
# Iterate over all models and extract R-squared values
for (model_name in names(all_models)) {
model <- all_models[[model_name]]
r_square <- summary(model)$r.square
r_square_values[[model_name]] <- r_square
}
# Converting lists to df
r_square_df <- data.frame(
O_r2 = unlist(r_square_values)
)
print(r_square_df)
## O_r2
## lm_model_1 0.07848705
## lm_model_2 0.03697736
## lm_model_3 0.03696703
## lm_model_4 0.03712659
## lm_model_5 0.03686562
## lm_model_6 0.03703114
## lm_model_7 0.07864858
## lm_model_8 0.07854319
## lm_model_9 0.07870190
## lm_model_10 0.07851311
## lm_model_11 0.07848934
## lm_model_12 0.03708484
## lm_model_13 0.03721141
## lm_model_14 0.03699222
## lm_model_15 0.03720817
## lm_model_16 0.03718242
## lm_model_17 0.03697187
## lm_model_18 0.03712972
## lm_model_19 0.03712798
## lm_model_20 0.03723957
## lm_model_21 0.03704102
## lm_model_22 0.07870315
## lm_model_23 0.07882688
## lm_model_24 0.07868852
## lm_model_25 0.07866204
## lm_model_26 0.07872486
## lm_model_27 0.07856498
## lm_model_28 0.07854467
## lm_model_29 0.07870451
## lm_model_30 0.07870270
## lm_model_31 0.07851579
## lm_model_32 0.03726903
## lm_model_33 0.03709532
## lm_model_34 0.03730282
## lm_model_35 0.03721141
## lm_model_36 0.03736952
## lm_model_37 0.03722825
## lm_model_38 0.03718405
## lm_model_39 0.03729385
## lm_model_40 0.03713622
## lm_model_41 0.03723978
## lm_model_42 0.07885123
## lm_model_43 0.07873778
## lm_model_44 0.07871446
## lm_model_45 0.07883603
## lm_model_46 0.07882836
## lm_model_47 0.07870394
## lm_model_48 0.07872725
## lm_model_49 0.07872574
## lm_model_50 0.07856678
## lm_model_51 0.07870513
## lm_model_52 0.03726904
## lm_model_53 0.03742571
## lm_model_54 0.03731791
## lm_model_55 0.03737081
## lm_model_56 0.03729417
## lm_model_57 0.07886001
## lm_model_58 0.07885263
## lm_model_59 0.07875087
## lm_model_60 0.07883829
## lm_model_61 0.07872795
## lm_model_62 0.03742679
## lm_model_63 0.07886215
Squared Model Combination
all_models <- list()
for (combination in all_combinations) {
formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd + fsize_s +", combination))
model <- lm(formula, data = data)
model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
all_models[[model_name]] <- model
}
r_square_values <- list()
for (model_name in names(all_models)) {
model <- all_models[[model_name]]
r_square <- summary(model)$r.square
r_square_values[[model_name]] <- r_square
}
r_square_df2 <- data.frame(
S_r2 = unlist(r_square_values)
)
print(r_square_df2)
## S_r2
## lm_model_1 0.08027222
## lm_model_2 0.03809388
## lm_model_3 0.03810056
## lm_model_4 0.03815376
## lm_model_5 0.03806906
## lm_model_6 0.03824234
## lm_model_7 0.08031107
## lm_model_8 0.08027717
## lm_model_9 0.08031072
## lm_model_10 0.08029580
## lm_model_11 0.08027520
## lm_model_12 0.03813292
## lm_model_13 0.03817747
## lm_model_14 0.03810379
## lm_model_15 0.03830579
## lm_model_16 0.03817516
## lm_model_17 0.03810556
## lm_model_18 0.03827401
## lm_model_19 0.03815383
## lm_model_20 0.03829719
## lm_model_21 0.03825093
## lm_model_22 0.08031634
## lm_model_23 0.08034384
## lm_model_24 0.08034106
## lm_model_25 0.08031894
## lm_model_26 0.08031214
## lm_model_27 0.08029956
## lm_model_28 0.08027985
## lm_model_29 0.08032255
## lm_model_30 0.08031121
## lm_model_31 0.08029920
## lm_model_32 0.03820005
## lm_model_33 0.03814071
## lm_model_34 0.03833786
## lm_model_35 0.03817819
## lm_model_36 0.03834884
## lm_model_37 0.03831988
## lm_model_38 0.03817520
## lm_model_39 0.03831712
## lm_model_40 0.03828075
## lm_model_41 0.03829839
## lm_model_42 0.08034563
## lm_model_43 0.08034498
## lm_model_44 0.08032372
## lm_model_45 0.08036110
## lm_model_46 0.08034701
## lm_model_47 0.08035025
## lm_model_48 0.08032381
## lm_model_49 0.08031260
## lm_model_50 0.08030268
## lm_model_51 0.08032341
## lm_model_52 0.03820065
## lm_model_53 0.03837030
## lm_model_54 0.03834956
## lm_model_55 0.03835315
## lm_model_56 0.03831813
## lm_model_57 0.08036271
## lm_model_58 0.08034876
## lm_model_59 0.08035369
## lm_model_60 0.08036582
## lm_model_61 0.08032465
## lm_model_62 0.03837431
## lm_model_63 0.08036738
Comparasion of R-square
r_square_df$S_r2 <- r_square_df2$S_r2
write.csv(r_square_df, file = "C:/Users/49765/Desktop/Advanced Planning Methods/Group/r_square_data.csv", row.names = FALSE)
t_test_result <- t.test(r_square_df$O_r2, r_square_df$S_r2, paired = TRUE)
print(t_test_result)
##
## Paired t-test
##
## data: r_square_df$O_r2 and r_square_df$S_r2
## t = -35.295, df = 62, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.001419630 -0.001267446
## sample estimates:
## mean difference
## -0.001343538
Model Specification
best subset
# best subset
bestsubset <- regsubsets(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2, data = data, nbest=1)
#subsets(bestsubset, statistic = "adjr2", which = 1)
best.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time,
data = data)
summary(best.model)
##
## Call:
## lm(formula = trip_m6_wd ~ dist_km + fsize_m6_wd + time, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.881 -0.967 -0.505 0.174 277.266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1444209 0.0096345 118.78 <2e-16 ***
## dist_km -0.3834739 0.0026051 -147.20 <2e-16 ***
## fsize_m6_wd 0.0367418 0.0002551 144.01 <2e-16 ***
## time 0.2894217 0.0095862 30.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.27 on 482472 degrees of freedom
## Multiple R-squared: 0.08022, Adjusted R-squared: 0.08022
## F-statistic: 1.403e+04 on 3 and 482472 DF, p-value: < 2.2e-16
# (Rule of Parsimonious) --> include variables in best.model and pick the higher order term that boost R-squared the most
best.model_1 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + dist_km2,
data = data)
best.model_2 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + fsize_m6_wd2,
data = data)
best.model_3 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + mrt_km_o2,
data = data)
tab_model(best.model_1, best.model_2, best.model_3,
show.ci = 0.95,
pred.labels = c("Intercept", "Rd Network distance (O-D)", "Bike fleet size",
"Distance to nearest MRT STA", "Rd Network distance (O-D)_2",
"Bike fleet size_2","Distance to nearest MRT STA_2"),
dv.labels = c("RdNDdist2_model", "FleetSize2_model", "MRTDist2_model"),
file = "best_models_comparison.html")
|
Â
|
RdNDdist2_model
|
FleetSize2_model
|
MRTDist2_model
|
|
Predictors
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
|
Intercept
|
2.33
|
2.30 – 2.35
|
<0.001
|
1.08
|
1.06 – 1.10
|
<0.001
|
1.16
|
1.14 – 1.18
|
<0.001
|
|
Rd Network distance (O-D)
|
-1.28
|
-1.29 – -1.26
|
<0.001
|
-0.39
|
-0.39 – -0.38
|
<0.001
|
-0.38
|
-0.39 – -0.38
|
<0.001
|
|
Bike fleet size
|
0.04
|
0.04 – 0.04
|
<0.001
|
0.05
|
0.05 – 0.05
|
<0.001
|
0.04
|
0.04 – 0.04
|
<0.001
|
|
Distance to nearest MRT STA
|
0.28
|
0.26 – 0.29
|
<0.001
|
0.26
|
0.24 – 0.28
|
<0.001
|
0.29
|
0.27 – 0.31
|
<0.001
|
|
Rd Network distance (O-D)_2
|
0.11
|
0.11 – 0.11
|
<0.001
|
|
|
|
|
|
|
|
Bike fleet size_2
|
|
|
|
-0.00
|
-0.00 – -0.00
|
<0.001
|
|
|
|
|
Distance to nearest MRT STA_2
|
|
|
|
|
|
|
-0.00
|
-0.00 – -0.00
|
<0.001
|
|
Observations
|
482476
|
482476
|
482476
|
|
R2 / R2 adjusted
|
0.113 / 0.113
|
0.082 / 0.082
|
0.080 / 0.080
|
Stepwise(final model)
null.model <- lm(trip_m6_wd ~ 1, data=data)
full.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2, data = data)
step.model.for <- step(null.model,
scope = formula(full.model),
direction = "forward",
trace = 0)
step.model.back <- step(full.model,
direction = "backward",
trace = 0)
step.model.both <- step(null.model,
scope=formula(full.model),
direction = "both",
trace = 0)
stargazer(step.model.for, step.model.back, step.model.both,
type = "text",
file = "stepwise.html",
add.lines=list(c("AIC", round(AIC(step.model.for),1), round(AIC(step.model.back),1),round(AIC(step.model.both),1))),
column.labels = c("Forward", "Backward", "Both"))
##
## ========================================================================
## Dependent variable:
## --------------------------------------
## trip_m6_wd
## Forward Backward Both
## (1) (2) (3)
## ------------------------------------------------------------------------
## fsize_m6_wd 0.045*** 0.045*** 0.045***
## (0.0005) (0.0005) (0.0005)
##
## dist_km -0.386*** -0.386*** -0.386***
## (0.003) (0.003) (0.003)
##
## fsize_m6_wd2 -0.0001*** -0.0001*** -0.0001***
## (0.00000) (0.00000) (0.00000)
##
## time 0.272*** 0.272*** 0.272***
## (0.010) (0.010) (0.010)
##
## far_hdb_o 0.035*** 0.035*** 0.035***
## (0.008) (0.008) (0.008)
##
## mrt_km_o -0.026*** -0.026*** -0.026***
## (0.004) (0.004) (0.004)
##
## far_priv_o -0.037*** -0.037*** -0.037***
## (0.013) (0.013) (0.013)
##
## cycle_km_o 0.119** 0.119** 0.119**
## (0.057) (0.057) (0.057)
##
## Constant 1.117*** 1.117*** 1.117***
## (0.013) (0.013) (0.013)
##
## ------------------------------------------------------------------------
## AIC 2511639.3 2511639.3 2511639.3
## Observations 482,476 482,476 482,476
## R2 0.082 0.082 0.082
## Adjusted R2 0.082 0.082 0.082
## Residual Std. Error (df = 482467) 3.267 3.267 3.267
## F Statistic (df = 8; 482467) 5,377.216*** 5,377.216*** 5,377.216***
## ========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## =============
## stepwise.html
## -------------
summary(step.model.both)
##
## Call:
## lm(formula = trip_m6_wd ~ fsize_m6_wd + dist_km + fsize_m6_wd2 +
## time + far_hdb_o + mrt_km_o + far_priv_o + cycle_km_o, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.560 -0.965 -0.488 0.195 277.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.117e+00 1.326e-02 84.231 < 2e-16 ***
## fsize_m6_wd 4.546e-02 4.990e-04 91.094 < 2e-16 ***
## dist_km -3.861e-01 2.606e-03 -148.144 < 2e-16 ***
## fsize_m6_wd2 -8.954e-05 3.654e-06 -24.506 < 2e-16 ***
## time 2.716e-01 9.673e-03 28.078 < 2e-16 ***
## far_hdb_o 3.541e-02 7.592e-03 4.664 3.10e-06 ***
## mrt_km_o -2.636e-02 4.306e-03 -6.122 9.24e-10 ***
## far_priv_o -3.662e-02 1.266e-02 -2.892 0.00383 **
## cycle_km_o 1.189e-01 5.666e-02 2.098 0.03591 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.267 on 482467 degrees of freedom
## Multiple R-squared: 0.08186, Adjusted R-squared: 0.08185
## F-statistic: 5377 on 8 and 482467 DF, p-value: < 2.2e-16
final.model <- step.model.both
Predictions with Bike fleet size
vector <- seq(1, 300)
vector_df <- data.frame(vector)
new_data_time_1 <- data.frame(dist_km = mean(data$dist_km),
fsize_m6_wd = vector_df$vector,
mrt_km_o = mean(data$mrt_km_o),
cycle_km_o = mean(data$cycle_km_o),
far_hdb_o = mean(data$far_hdb_o),
far_priv_o = mean(data$far_priv_o),
fsize_m6_wd2 = vector_df$vector^2,
time = 1)
predicted_values_time_1 <- predict(final.model, newdata = new_data_time_1); #print(predicted_values_time_1)
plot(predicted_values_time_1,
xlab = 'Bike fleet size',
ylab = 'Number of bike trips')
#scatter.smooth(predicted_values_time_1, lpars = list(col = "green", lwd = 3),
# xlab = 'Bike fleet size',ylab = 'Number of bike trips')
# find the max point index, use the index to get the max value
max_point_index <- which.max(predicted_values_time_1)
max_point <- vector[max_point_index]
points(max_point, predicted_values_time_1[max_point_index], col = "red", pch = 16)

#max_point_index = 254(6.133189 )
Diagnose
### FINAL MODEL
final.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + time + fsize_m6_wd2, data = data)
final.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
I(far_hdb_o*100) + I(far_priv_o*100) + time + fsize_m6_wd2, data = data)
summary(final.model)
##
## Call:
## lm(formula = trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o +
## cycle_km_o + I(far_hdb_o * 100) + I(far_priv_o * 100) + time +
## fsize_m6_wd2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.560 -0.965 -0.488 0.195 277.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.117e+00 1.326e-02 84.231 < 2e-16 ***
## dist_km -3.861e-01 2.606e-03 -148.144 < 2e-16 ***
## fsize_m6_wd 4.546e-02 4.990e-04 91.094 < 2e-16 ***
## mrt_km_o -2.636e-02 4.306e-03 -6.122 9.24e-10 ***
## cycle_km_o 1.189e-01 5.666e-02 2.098 0.03591 *
## I(far_hdb_o * 100) 3.541e-04 7.592e-05 4.664 3.10e-06 ***
## I(far_priv_o * 100) -3.662e-04 1.266e-04 -2.892 0.00383 **
## time 2.716e-01 9.673e-03 28.078 < 2e-16 ***
## fsize_m6_wd2 -8.954e-05 3.654e-06 -24.506 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.267 on 482467 degrees of freedom
## Multiple R-squared: 0.08186, Adjusted R-squared: 0.08185
## F-statistic: 5377 on 8 and 482467 DF, p-value: < 2.2e-16
#outlier detection
cook.dist <-cooks.distance(final.model)
data$cook.dist <-cooks.distance(final.model)
plot(cook.dist, pch="*", cex=2, main="Influential Obs by Cooks distance")
abline(h = 4*mean(cook.dist, na.rm=T), col="red")
text(x = 1:length(cook.dist) + 5,
y = cook.dist,
col="red",
labels = ifelse(cook.dist > 0.05, # Conditional statement
names(cook.dist),
""))

#log-transformation
final.model_log <-lm((log(2+trip_m6_wd)) ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o+ +
I(far_hdb_o*100) + I(far_priv_o*100)+ time + fsize_m6_wd2, data = data)
summary(final.model_log)
##
## Call:
## lm(formula = (log(2 + trip_m6_wd)) ~ dist_km + fsize_m6_wd +
## mrt_km_o + cycle_km_o + +I(far_hdb_o * 100) + I(far_priv_o *
## 100) + time + fsize_m6_wd2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7259 -0.2137 -0.1183 0.1022 4.1915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.469e-01 1.571e-03 602.843 < 2e-16 ***
## dist_km -6.953e-02 3.086e-04 -225.284 < 2e-16 ***
## fsize_m6_wd 6.341e-03 5.910e-05 107.293 < 2e-16 ***
## mrt_km_o -9.782e-03 5.100e-04 -19.180 < 2e-16 ***
## cycle_km_o 3.319e-02 6.710e-03 4.945 7.6e-07 ***
## I(far_hdb_o * 100) 1.280e-04 8.992e-06 14.231 < 2e-16 ***
## I(far_priv_o * 100) -4.029e-05 1.500e-05 -2.686 0.00723 **
## time 7.674e-02 1.146e-03 66.987 < 2e-16 ***
## fsize_m6_wd2 -2.150e-05 4.328e-07 -49.672 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.387 on 482467 degrees of freedom
## Multiple R-squared: 0.141, Adjusted R-squared: 0.1409
## F-statistic: 9895 on 8 and 482467 DF, p-value: < 2.2e-16
summary(lm.beta(final.model_log))
##
## Call:
## lm(formula = (log(2 + trip_m6_wd)) ~ dist_km + fsize_m6_wd +
## mrt_km_o + cycle_km_o + +I(far_hdb_o * 100) + I(far_priv_o *
## 100) + time + fsize_m6_wd2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7259 -0.2137 -0.1183 0.1022 4.1915
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 9.469e-01 NA 1.571e-03 602.843 < 2e-16 ***
## dist_km -6.953e-02 -3.024e-01 3.086e-04 -225.284 < 2e-16 ***
## fsize_m6_wd 6.341e-03 2.866e-01 5.910e-05 107.293 < 2e-16 ***
## mrt_km_o -9.782e-03 -2.699e-02 5.100e-04 -19.180 < 2e-16 ***
## cycle_km_o 3.319e-02 6.896e-03 6.710e-03 4.945 7.6e-07 ***
## I(far_hdb_o * 100) 1.280e-04 2.110e-02 8.992e-06 14.231 < 2e-16 ***
## I(far_priv_o * 100) -4.029e-05 -3.728e-03 1.500e-05 -2.686 0.00723 **
## time 7.674e-02 9.191e-02 1.146e-03 66.987 < 2e-16 ***
## fsize_m6_wd2 -2.150e-05 -1.237e-01 4.328e-07 -49.672 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.387 on 482467 degrees of freedom
## Multiple R-squared: 0.141, Adjusted R-squared: 0.1409
## F-statistic: 9895 on 8 and 482467 DF, p-value: < 2.2e-16
#cook distance of logged model
cook.dist.log <- cooks.distance(final.model_log)
plot(cook.dist.log, pch="*", cex=2, main="Influential Obs by Cooks distance")
abline(h = 4*mean(cook.dist.log, na.rm=T), col="red")
text(x = 1:length(cook.dist.log) + 5,
y = cook.dist.log,
col="red",
labels = ifelse(cook.dist.log > .005,
names(cook.dist.log),
""))

#comparision of two models
tab_model(final.model, final.model_log,
show.ci = 0.95,
show.stat = TRUE,
show.aic = TRUE,
pred.labels = c("Intercept", "Rd Network distance (O-D)", "Bike trips",
"Distance to nearest MRT STA", "Lenth of Bike Lanes","FAR of public residential",
"FAR of private residential","time","Bike trips_2"),
dv.labels = c("Normal model","Logged model"),
file = "log_comparison.html")
|
Â
|
Normal model
|
Logged model
|
|
Predictors
|
Estimates
|
CI
|
Statistic
|
p
|
Estimates
|
CI
|
Statistic
|
p
|
|
Intercept
|
1.12
|
1.09 – 1.14
|
84.23
|
<0.001
|
0.95
|
0.94 – 0.95
|
602.84
|
<0.001
|
|
Rd Network distance (O-D)
|
-0.39
|
-0.39 – -0.38
|
-148.14
|
<0.001
|
-0.07
|
-0.07 – -0.07
|
-225.28
|
<0.001
|
|
Bike trips
|
0.05
|
0.04 – 0.05
|
91.09
|
<0.001
|
0.01
|
0.01 – 0.01
|
107.29
|
<0.001
|
|
Distance to nearest MRT STA
|
-0.03
|
-0.03 – -0.02
|
-6.12
|
<0.001
|
-0.01
|
-0.01 – -0.01
|
-19.18
|
<0.001
|
|
Lenth of Bike Lanes
|
0.12
|
0.01 – 0.23
|
2.10
|
0.036
|
0.03
|
0.02 – 0.05
|
4.95
|
<0.001
|
|
FAR of public residential
|
0.00
|
0.00 – 0.00
|
4.66
|
<0.001
|
0.00
|
0.00 – 0.00
|
14.23
|
<0.001
|
|
FAR of private residential
|
-0.00
|
-0.00 – -0.00
|
-2.89
|
0.004
|
-0.00
|
-0.00 – -0.00
|
-2.69
|
0.007
|
|
time
|
0.27
|
0.25 – 0.29
|
28.08
|
<0.001
|
0.08
|
0.07 – 0.08
|
66.99
|
<0.001
|
|
Bike trips_2
|
-0.00
|
-0.00 – -0.00
|
-24.51
|
<0.001
|
-0.00
|
-0.00 – -0.00
|
-49.67
|
<0.001
|
|
Observations
|
482476
|
482476
|
|
R2 / R2 adjusted
|
0.082 / 0.082
|
0.141 / 0.141
|
|
AIC
|
2511639.335
|
-Inf
|
#Comparisions of the residuals distribution
plot(fitted(final.model),resid(final.model),abline(h=0))

plot(fitted(final.model_log),resid(final.model_log),abline(h=0))

hist(final.model$residuals,
main = "Histogram of Final model_Resid",
xlab = "Final model_Resid",
ylab = "Frequency",
col = "skyblue")

hist(final.model_log$residuals,
main = "Histogram of Final model_Residln",
xlab = "Final model_Residln",
ylab = "Frequency",
col = "skyblue")

#Heteroskedasticity detection
plot(predict(final.model),
final.model$residuals,
xlab = "Predicted values",
ylab = "Residuals",
main = "Residuals vs. Predicted")
abline(h = 0, col = 'red', lty = 2)

ncvTest(final.model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1279035, Df = 1, p = < 2.22e-16
plot(predict(final.model_log),
final.model_log$residuals,
xlab = "Predicted values",
ylab = "Residuals",
main = "Residuals vs. Predicted")
abline(h = 0, col = 'red', lty = 2)

ncvTest(final.model_log)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 230291.1, Df = 1, p = < 2.22e-16
coeftest(final.model_log, vcov = vcovHC(final.model, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.4692e-01 1.3997e-02 67.6520 < 2.2e-16 ***
## dist_km -6.9533e-02 3.5009e-03 -19.8617 < 2.2e-16 ***
## fsize_m6_wd 6.3414e-03 1.6525e-03 3.8375 0.0001243 ***
## mrt_km_o -9.7821e-03 3.1531e-03 -3.1024 0.0019198 **
## cycle_km_o 3.3186e-02 8.6630e-02 0.3831 0.7016611
## I(far_hdb_o * 100) 1.2797e-04 9.4699e-05 1.3514 0.1765806
## I(far_priv_o * 100) -4.0291e-05 1.2385e-04 -0.3253 0.7449344
## time 7.6744e-02 9.1221e-03 8.4130 < 2.2e-16 ***
## fsize_m6_wd2 -2.1496e-05 1.8560e-05 -1.1582 0.2467955
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Multicollinearity detection
car::vif(final.model) #which indicates there's no multicollinearity among variables
## dist_km fsize_m6_wd mrt_km_o cycle_km_o
## 1.011795 4.008320 1.112352 1.092022
## I(far_hdb_o * 100) I(far_priv_o * 100) time fsize_m6_wd2
## 1.234481 1.081515 1.057264 3.481572