Loading packages
Reading data
data <- read.csv("C:/Users/49765/Desktop/Advanced Planning Methods/Group/Group_project_deliverable_2/June_cleaned.csv")
str(data)
## 'data.frame': 482476 obs. of 11 variables:
## $ ori : int 75 255 256 976 1519 74 75 255 256 434 ...
## $ dst : int 74 74 74 74 74 75 75 75 75 75 ...
## $ dist_km : num 0.38 0.83 1.05 4.57 4.21 0.38 0 0.45 0.66 1.34 ...
## $ fsize_m6_wd: num 0 0.136 0.591 15.409 62.955 ...
## $ trip_m6_wd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mrt_km_o : num 3.27 3.14 2.91 1.97 0.52 3.64 3.27 3.14 2.91 3.57 ...
## $ cycle_km_o : num 0 0 0 0 0.52 0 0 0 0 0 ...
## $ far_hdb_o : num 0 0 0 0.000442 0.742848 ...
## $ far_priv_o : num 0 0 0 0 0 0 0 0 0 0 ...
## $ far_comm_o : num 0 0 0 0 0.536 ...
## $ time : int 0 0 0 0 0 0 0 0 0 0 ...
Correlation
data$fsize_s <- data$fsize_m6_wd^2
Using_data <- data[c("dist_km", "fsize_m6_wd", "trip_m6_wd", "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o", "fsize_s")]
correlation_matrix <- cor(Using_data)
color_theme <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(correlation_matrix,
method = "color",
col = color_theme(200),
order = "hclust",
addCoef.col = "darkgrey"
)
# check order terms
# whether include a higher order term (Ramsey’s RESET Test) --> answer is yes
resettest(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o + far_hdb_o + far_priv_o + far_comm_o + time, power=2 , type="regressor", data = data)
##
## RESET test
##
## data: trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o + far_hdb_o + far_priv_o + far_comm_o + time
## RESET = 2416.7, df1 = 8, df2 = 482459, p-value < 2.2e-16
# check different higher order terms
# ***
data$fsize_m6_wd2 <- data$fsize_m6_wd^2
data$dist_km2 <- data$dist_km^2
data$mrt_km_o2 <- data$mrt_km_o^2
# **
data$far_hdb_o2 <- data$far_hdb_o^2
# *
data$cycle_km_o2 <- data$cycle_km_o^2
data$cycle_km_o2 <- data$cycle_km_o^2
# none
data$far_priv_o2 <- data$far_priv_o^2
data$far_comm_o2 <- data$far_comm_o^2
# list of models
ori_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time, data = data)
RdNDdist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + dist_km2, data = data)
FleetSize2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2 , data = data)
MRTDist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + mrt_km_o2, data = data)
tab_model(ori_model, FleetSize2_model, RdNDdist2_model, MRTDist2_model,
show.ci = 0.95,
pred.labels = c("(Intercept)", "Rd Network distance (O-D)","Bike fleet size","Distance to nearest MRT STA", "Bike lanes length",
"Floor Area Ratio of Public Housing Bldg", "Floor Area Ratio of Private Residential Bldg",
"Floor Area Ratio of Commercial Bldg", "Morning/Evening",
"Rd Network distance (O-D)_2","Bike fleet size_2", "Dist to nearest MRT STA_2"),
dv.labels = c("Full_model", "RdNDdist2_model",
"FleetSize2_model", "MRTDist2_model"),
file = "higher_orders_comparison.html")
|
|
Full_model
|
RdNDdist2_model
|
FleetSize2_model
|
MRTDist2_model
|
|
Predictors
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
|
(Intercept)
|
1.19
|
1.16 – 1.21
|
<0.001
|
1.12
|
1.09 – 1.15
|
<0.001
|
2.44
|
2.41 – 2.47
|
<0.001
|
1.24
|
1.21 – 1.27
|
<0.001
|
|
Rd Network distance (O-D)
|
-0.38
|
-0.39 – -0.38
|
<0.001
|
-0.39
|
-0.39 – -0.38
|
<0.001
|
-1.29
|
-1.30 – -1.28
|
<0.001
|
-0.38
|
-0.39 – -0.38
|
<0.001
|
|
Bike fleet size
|
0.04
|
0.03 – 0.04
|
<0.001
|
0.05
|
0.04 – 0.05
|
<0.001
|
0.04
|
0.04 – 0.04
|
<0.001
|
0.04
|
0.03 – 0.04
|
<0.001
|
|
Distance to nearest MRT STA
|
-0.04
|
-0.05 – -0.03
|
<0.001
|
-0.03
|
-0.04 – -0.02
|
<0.001
|
-0.08
|
-0.09 – -0.07
|
<0.001
|
-0.09
|
-0.10 – -0.07
|
<0.001
|
|
Bike lanes length
|
0.25
|
0.14 – 0.36
|
<0.001
|
0.12
|
0.01 – 0.23
|
0.037
|
0.25
|
0.14 – 0.36
|
<0.001
|
0.26
|
0.15 – 0.37
|
<0.001
|
|
Floor Area Ratio of Public Housing Bldg
|
0.07
|
0.05 – 0.08
|
<0.001
|
0.03
|
0.02 – 0.05
|
<0.001
|
0.09
|
0.08 – 0.11
|
<0.001
|
0.07
|
0.05 – 0.08
|
<0.001
|
|
Floor Area Ratio of Private Residential Bldg
|
-0.03
|
-0.05 – -0.00
|
0.028
|
-0.04
|
-0.06 – -0.01
|
0.003
|
-0.03
|
-0.05 – -0.00
|
0.022
|
-0.03
|
-0.05 – -0.00
|
0.024
|
|
Floor Area Ratio of Commercial Bldg
|
-0.01
|
-0.03 – 0.02
|
0.525
|
-0.01
|
-0.04 – 0.01
|
0.262
|
0.01
|
-0.01 – 0.04
|
0.269
|
-0.02
|
-0.04 – 0.00
|
0.119
|
|
Morning/Evening
|
0.30
|
0.28 – 0.32
|
<0.001
|
0.27
|
0.25 – 0.29
|
<0.001
|
0.29
|
0.27 – 0.31
|
<0.001
|
0.30
|
0.28 – 0.32
|
<0.001
|
|
Rd Network distance (O-D)_2
|
|
|
|
-0.00
|
-0.00 – -0.00
|
<0.001
|
|
|
|
|
|
|
|
Bike fleet size_2
|
|
|
|
|
|
|
0.11
|
0.11 – 0.11
|
<0.001
|
|
|
|
|
Dist to nearest MRT STA_2
|
|
|
|
|
|
|
|
|
|
0.01
|
0.00 – 0.01
|
<0.001
|
|
Observations
|
482476
|
482476
|
482476
|
482476
|
|
R2 / R2 adjusted
|
0.081 / 0.081
|
0.082 / 0.082
|
0.115 / 0.115
|
0.081 / 0.081
|
Origin Model Combination
variables <- c("dist_km", "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o")
# Stores all combinations into a list
all_combinations <- c()
# Combination of 1 to 7 variables
for (num_variables in 1:6) {
# Get all possible combinations
variable_combinations <- combn(variables, num_variables, simplify = TRUE)
# Add the name of the combination to the list
for (i in 1:ncol(variable_combinations)) {
combination_name <- paste(variable_combinations[, i], collapse = " + ")
all_combinations <- c(all_combinations, combination_name)
}
}
# Create an empty list for storing all models
all_models <- list()
# Iterate over all combinations and fit a linear regression model
for (combination in all_combinations) {
formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd +", combination))
model <- lm(formula, data = data)
model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
all_models[[model_name]] <- model
}
# Create an empty list to store the R-squared values for all models
r_square_values <- list()
# Iterate over all models and extract R-squared values
for (model_name in names(all_models)) {
model <- all_models[[model_name]]
r_square <- summary(model)$r.square
r_square_values[[model_name]] <- r_square
}
# Converting lists to df
r_square_df <- data.frame(
O_r2 = unlist(r_square_values)
)
print(r_square_df)
## O_r2
## lm_model_1 0.07848705
## lm_model_2 0.03697736
## lm_model_3 0.03696703
## lm_model_4 0.03712659
## lm_model_5 0.03686562
## lm_model_6 0.03703114
## lm_model_7 0.07864858
## lm_model_8 0.07854319
## lm_model_9 0.07870190
## lm_model_10 0.07851311
## lm_model_11 0.07848934
## lm_model_12 0.03708484
## lm_model_13 0.03721141
## lm_model_14 0.03699222
## lm_model_15 0.03720817
## lm_model_16 0.03718242
## lm_model_17 0.03697187
## lm_model_18 0.03712972
## lm_model_19 0.03712798
## lm_model_20 0.03723957
## lm_model_21 0.03704102
## lm_model_22 0.07870315
## lm_model_23 0.07882688
## lm_model_24 0.07868852
## lm_model_25 0.07866204
## lm_model_26 0.07872486
## lm_model_27 0.07856498
## lm_model_28 0.07854467
## lm_model_29 0.07870451
## lm_model_30 0.07870270
## lm_model_31 0.07851579
## lm_model_32 0.03726903
## lm_model_33 0.03709532
## lm_model_34 0.03730282
## lm_model_35 0.03721141
## lm_model_36 0.03736952
## lm_model_37 0.03722825
## lm_model_38 0.03718405
## lm_model_39 0.03729385
## lm_model_40 0.03713622
## lm_model_41 0.03723978
## lm_model_42 0.07885123
## lm_model_43 0.07873778
## lm_model_44 0.07871446
## lm_model_45 0.07883603
## lm_model_46 0.07882836
## lm_model_47 0.07870394
## lm_model_48 0.07872725
## lm_model_49 0.07872574
## lm_model_50 0.07856678
## lm_model_51 0.07870513
## lm_model_52 0.03726904
## lm_model_53 0.03742571
## lm_model_54 0.03731791
## lm_model_55 0.03737081
## lm_model_56 0.03729417
## lm_model_57 0.07886001
## lm_model_58 0.07885263
## lm_model_59 0.07875087
## lm_model_60 0.07883829
## lm_model_61 0.07872795
## lm_model_62 0.03742679
## lm_model_63 0.07886215
Squared Model Combination
all_models <- list()
for (combination in all_combinations) {
formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd + fsize_s +", combination))
model <- lm(formula, data = data)
model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
all_models[[model_name]] <- model
}
r_square_values <- list()
for (model_name in names(all_models)) {
model <- all_models[[model_name]]
r_square <- summary(model)$r.square
r_square_values[[model_name]] <- r_square
}
r_square_df2 <- data.frame(
S_r2 = unlist(r_square_values)
)
print(r_square_df2)
## S_r2
## lm_model_1 0.08027222
## lm_model_2 0.03809388
## lm_model_3 0.03810056
## lm_model_4 0.03815376
## lm_model_5 0.03806906
## lm_model_6 0.03824234
## lm_model_7 0.08031107
## lm_model_8 0.08027717
## lm_model_9 0.08031072
## lm_model_10 0.08029580
## lm_model_11 0.08027520
## lm_model_12 0.03813292
## lm_model_13 0.03817747
## lm_model_14 0.03810379
## lm_model_15 0.03830579
## lm_model_16 0.03817516
## lm_model_17 0.03810556
## lm_model_18 0.03827401
## lm_model_19 0.03815383
## lm_model_20 0.03829719
## lm_model_21 0.03825093
## lm_model_22 0.08031634
## lm_model_23 0.08034384
## lm_model_24 0.08034106
## lm_model_25 0.08031894
## lm_model_26 0.08031214
## lm_model_27 0.08029956
## lm_model_28 0.08027985
## lm_model_29 0.08032255
## lm_model_30 0.08031121
## lm_model_31 0.08029920
## lm_model_32 0.03820005
## lm_model_33 0.03814071
## lm_model_34 0.03833786
## lm_model_35 0.03817819
## lm_model_36 0.03834884
## lm_model_37 0.03831988
## lm_model_38 0.03817520
## lm_model_39 0.03831712
## lm_model_40 0.03828075
## lm_model_41 0.03829839
## lm_model_42 0.08034563
## lm_model_43 0.08034498
## lm_model_44 0.08032372
## lm_model_45 0.08036110
## lm_model_46 0.08034701
## lm_model_47 0.08035025
## lm_model_48 0.08032381
## lm_model_49 0.08031260
## lm_model_50 0.08030268
## lm_model_51 0.08032341
## lm_model_52 0.03820065
## lm_model_53 0.03837030
## lm_model_54 0.03834956
## lm_model_55 0.03835315
## lm_model_56 0.03831813
## lm_model_57 0.08036271
## lm_model_58 0.08034876
## lm_model_59 0.08035369
## lm_model_60 0.08036582
## lm_model_61 0.08032465
## lm_model_62 0.03837431
## lm_model_63 0.08036738
Comparasion of R-square
r_square_df$S_r2 <- r_square_df2$S_r2
write.csv(r_square_df, file = "C:/Users/49765/Desktop/Advanced Planning Methods/Group/r_square_data.csv", row.names = FALSE)
t_test_result <- t.test(r_square_df$O_r2, r_square_df$S_r2, paired = TRUE)
print(t_test_result)
##
## Paired t-test
##
## data: r_square_df$O_r2 and r_square_df$S_r2
## t = -35.295, df = 62, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.001419630 -0.001267446
## sample estimates:
## mean difference
## -0.001343538
Model Specification
best subset
# best subset
bestsubset <- regsubsets(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2, data = data, nbest=1)
#subsets(bestsubset, statistic = "adjr2", which = 1)
best.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time,
data = data)
summary(best.model)
##
## Call:
## lm(formula = trip_m6_wd ~ dist_km + fsize_m6_wd + time, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.881 -0.967 -0.505 0.174 277.266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1444209 0.0096345 118.78 <2e-16 ***
## dist_km -0.3834739 0.0026051 -147.20 <2e-16 ***
## fsize_m6_wd 0.0367418 0.0002551 144.01 <2e-16 ***
## time 0.2894217 0.0095862 30.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.27 on 482472 degrees of freedom
## Multiple R-squared: 0.08022, Adjusted R-squared: 0.08022
## F-statistic: 1.403e+04 on 3 and 482472 DF, p-value: < 2.2e-16
# (Rule of Parsimonious) --> include variables in best.model and pick the higher order term that boost R-squared the most
best.model_1 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + dist_km2,
data = data)
best.model_2 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + fsize_m6_wd2,
data = data)
best.model_3 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + mrt_km_o2,
data = data)
tab_model(best.model_1, best.model_2, best.model_3,
show.ci = 0.95,
pred.labels = c("Intercept", "Rd Network distance (O-D)", "Bike fleet size",
"Distance to nearest MRT STA", "Rd Network distance (O-D)_2",
"Bike fleet size_2","Distance to nearest MRT STA_2"),
dv.labels = c("RdNDdist2_model", "FleetSize2_model", "MRTDist2_model"),
file = "best_models_comparison.html")
|
|
RdNDdist2_model
|
FleetSize2_model
|
MRTDist2_model
|
|
Predictors
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
Estimates
|
CI
|
p
|
|
Intercept
|
2.33
|
2.30 – 2.35
|
<0.001
|
1.08
|
1.06 – 1.10
|
<0.001
|
1.16
|
1.14 – 1.18
|
<0.001
|
|
Rd Network distance (O-D)
|
-1.28
|
-1.29 – -1.26
|
<0.001
|
-0.39
|
-0.39 – -0.38
|
<0.001
|
-0.38
|
-0.39 – -0.38
|
<0.001
|
|
Bike fleet size
|
0.04
|
0.04 – 0.04
|
<0.001
|
0.05
|
0.05 – 0.05
|
<0.001
|
0.04
|
0.04 – 0.04
|
<0.001
|
|
Distance to nearest MRT STA
|
0.28
|
0.26 – 0.29
|
<0.001
|
0.26
|
0.24 – 0.28
|
<0.001
|
0.29
|
0.27 – 0.31
|
<0.001
|
|
Rd Network distance (O-D)_2
|
0.11
|
0.11 – 0.11
|
<0.001
|
|
|
|
|
|
|
|
Bike fleet size_2
|
|
|
|
-0.00
|
-0.00 – -0.00
|
<0.001
|
|
|
|
|
Distance to nearest MRT STA_2
|
|
|
|
|
|
|
-0.00
|
-0.00 – -0.00
|
<0.001
|
|
Observations
|
482476
|
482476
|
482476
|
|
R2 / R2 adjusted
|
0.113 / 0.113
|
0.082 / 0.082
|
0.080 / 0.080
|
Stepwise(final model)
null.model <- lm(trip_m6_wd ~ 1, data=data)
full.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2, data = data)
step.model.for <- step(null.model,
scope = formula(full.model),
direction = "forward",
trace = 0)
step.model.back <- step(full.model,
direction = "backward",
trace = 0)
step.model.both <- step(null.model,
scope=formula(full.model),
direction = "both",
trace = 0)
stargazer(step.model.for, step.model.back, step.model.both,
type = "text",
add.lines=list(c("AIC", round(AIC(step.model.for),1), round(AIC(step.model.back),1),round(AIC(step.model.both),1))),
column.labels = c("Forward", "Backward", "Both"))
##
## ========================================================================
## Dependent variable:
## --------------------------------------
## trip_m6_wd
## Forward Backward Both
## (1) (2) (3)
## ------------------------------------------------------------------------
## fsize_m6_wd 0.045*** 0.045*** 0.045***
## (0.0005) (0.0005) (0.0005)
##
## dist_km -0.386*** -0.386*** -0.386***
## (0.003) (0.003) (0.003)
##
## fsize_m6_wd2 -0.0001*** -0.0001*** -0.0001***
## (0.00000) (0.00000) (0.00000)
##
## time 0.272*** 0.272*** 0.272***
## (0.010) (0.010) (0.010)
##
## far_hdb_o 0.035*** 0.035*** 0.035***
## (0.008) (0.008) (0.008)
##
## mrt_km_o -0.026*** -0.026*** -0.026***
## (0.004) (0.004) (0.004)
##
## far_priv_o -0.037*** -0.037*** -0.037***
## (0.013) (0.013) (0.013)
##
## cycle_km_o 0.119** 0.119** 0.119**
## (0.057) (0.057) (0.057)
##
## Constant 1.117*** 1.117*** 1.117***
## (0.013) (0.013) (0.013)
##
## ------------------------------------------------------------------------
## AIC 2511639.3 2511639.3 2511639.3
## Observations 482,476 482,476 482,476
## R2 0.082 0.082 0.082
## Adjusted R2 0.082 0.082 0.082
## Residual Std. Error (df = 482467) 3.267 3.267 3.267
## F Statistic (df = 8; 482467) 5,377.216*** 5,377.216*** 5,377.216***
## ========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
summary(step.model.both)
##
## Call:
## lm(formula = trip_m6_wd ~ fsize_m6_wd + dist_km + fsize_m6_wd2 +
## time + far_hdb_o + mrt_km_o + far_priv_o + cycle_km_o, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.560 -0.965 -0.488 0.195 277.555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.117e+00 1.326e-02 84.231 < 2e-16 ***
## fsize_m6_wd 4.546e-02 4.990e-04 91.094 < 2e-16 ***
## dist_km -3.861e-01 2.606e-03 -148.144 < 2e-16 ***
## fsize_m6_wd2 -8.954e-05 3.654e-06 -24.506 < 2e-16 ***
## time 2.716e-01 9.673e-03 28.078 < 2e-16 ***
## far_hdb_o 3.541e-02 7.592e-03 4.664 3.10e-06 ***
## mrt_km_o -2.636e-02 4.306e-03 -6.122 9.24e-10 ***
## far_priv_o -3.662e-02 1.266e-02 -2.892 0.00383 **
## cycle_km_o 1.189e-01 5.666e-02 2.098 0.03591 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.267 on 482467 degrees of freedom
## Multiple R-squared: 0.08186, Adjusted R-squared: 0.08185
## F-statistic: 5377 on 8 and 482467 DF, p-value: < 2.2e-16