Loading packages

Reading data

data <- read.csv("C:/Users/49765/Desktop/Advanced Planning Methods/Group/Group_project_deliverable_2/June_cleaned.csv")
str(data)
## 'data.frame':    482476 obs. of  11 variables:
##  $ ori        : int  75 255 256 976 1519 74 75 255 256 434 ...
##  $ dst        : int  74 74 74 74 74 75 75 75 75 75 ...
##  $ dist_km    : num  0.38 0.83 1.05 4.57 4.21 0.38 0 0.45 0.66 1.34 ...
##  $ fsize_m6_wd: num  0 0.136 0.591 15.409 62.955 ...
##  $ trip_m6_wd : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mrt_km_o   : num  3.27 3.14 2.91 1.97 0.52 3.64 3.27 3.14 2.91 3.57 ...
##  $ cycle_km_o : num  0 0 0 0 0.52 0 0 0 0 0 ...
##  $ far_hdb_o  : num  0 0 0 0.000442 0.742848 ...
##  $ far_priv_o : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ far_comm_o : num  0 0 0 0 0.536 ...
##  $ time       : int  0 0 0 0 0 0 0 0 0 0 ...

Correlation

data$fsize_s <- data$fsize_m6_wd^2
Using_data <- data[c("dist_km", "fsize_m6_wd", "trip_m6_wd", "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o", "fsize_s")]

correlation_matrix <- cor(Using_data)

color_theme <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

corrplot(correlation_matrix, 
         method = "color", 
         col = color_theme(200), 
         order = "hclust", 
         addCoef.col = "darkgrey"
)

# check order terms

# whether include a higher order term (Ramsey’s RESET Test) --> answer is yes
resettest(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o + far_hdb_o + far_priv_o + far_comm_o + time, power=2 , type="regressor", data = data)
## 
##  RESET test
## 
## data:  trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +     far_hdb_o + far_priv_o + far_comm_o + time
## RESET = 2416.7, df1 = 8, df2 = 482459, p-value < 2.2e-16
# check different higher order terms

# ***
data$fsize_m6_wd2 <- data$fsize_m6_wd^2
data$dist_km2 <- data$dist_km^2
data$mrt_km_o2 <- data$mrt_km_o^2
# **
data$far_hdb_o2 <- data$far_hdb_o^2
# *
data$cycle_km_o2 <- data$cycle_km_o^2
data$cycle_km_o2 <- data$cycle_km_o^2
# none
data$far_priv_o2 <- data$far_priv_o^2
data$far_comm_o2 <- data$far_comm_o^2


# list of models
ori_model <- lm(trip_m6_wd ~  dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
              far_hdb_o + far_priv_o + far_comm_o + time, data = data)

RdNDdist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                        far_hdb_o + far_priv_o + far_comm_o + time + dist_km2, data = data)

FleetSize2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                  far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2 , data = data)

MRTDist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                        far_hdb_o + far_priv_o + far_comm_o + time + mrt_km_o2, data = data)

tab_model(ori_model, FleetSize2_model, RdNDdist2_model, MRTDist2_model,
          show.ci = 0.95,
          pred.labels = c("(Intercept)", "Rd Network distance (O-D)","Bike fleet size","Distance to nearest MRT STA", "Bike lanes length",
                          "Floor Area Ratio of Public Housing Bldg", "Floor Area Ratio of Private Residential Bldg",
                          "Floor Area Ratio of Commercial Bldg", "Morning/Evening",
                          "Rd Network distance (O-D)_2","Bike fleet size_2", "Dist to nearest MRT STA_2"),
          dv.labels = c("Full_model", "RdNDdist2_model",
                        "FleetSize2_model", "MRTDist2_model"),
          file = "higher_orders_comparison.html")
  Full_model RdNDdist2_model FleetSize2_model MRTDist2_model
Predictors Estimates CI p Estimates CI p Estimates CI p Estimates CI p
(Intercept) 1.19 1.16 – 1.21 <0.001 1.12 1.09 – 1.15 <0.001 2.44 2.41 – 2.47 <0.001 1.24 1.21 – 1.27 <0.001
Rd Network distance (O-D) -0.38 -0.39 – -0.38 <0.001 -0.39 -0.39 – -0.38 <0.001 -1.29 -1.30 – -1.28 <0.001 -0.38 -0.39 – -0.38 <0.001
Bike fleet size 0.04 0.03 – 0.04 <0.001 0.05 0.04 – 0.05 <0.001 0.04 0.04 – 0.04 <0.001 0.04 0.03 – 0.04 <0.001
Distance to nearest MRT STA -0.04 -0.05 – -0.03 <0.001 -0.03 -0.04 – -0.02 <0.001 -0.08 -0.09 – -0.07 <0.001 -0.09 -0.10 – -0.07 <0.001
Bike lanes length 0.25 0.14 – 0.36 <0.001 0.12 0.01 – 0.23 0.037 0.25 0.14 – 0.36 <0.001 0.26 0.15 – 0.37 <0.001
Floor Area Ratio of Public Housing Bldg 0.07 0.05 – 0.08 <0.001 0.03 0.02 – 0.05 <0.001 0.09 0.08 – 0.11 <0.001 0.07 0.05 – 0.08 <0.001
Floor Area Ratio of Private Residential Bldg -0.03 -0.05 – -0.00 0.028 -0.04 -0.06 – -0.01 0.003 -0.03 -0.05 – -0.00 0.022 -0.03 -0.05 – -0.00 0.024
Floor Area Ratio of Commercial Bldg -0.01 -0.03 – 0.02 0.525 -0.01 -0.04 – 0.01 0.262 0.01 -0.01 – 0.04 0.269 -0.02 -0.04 – 0.00 0.119
Morning/Evening 0.30 0.28 – 0.32 <0.001 0.27 0.25 – 0.29 <0.001 0.29 0.27 – 0.31 <0.001 0.30 0.28 – 0.32 <0.001
Rd Network distance (O-D)_2 -0.00 -0.00 – -0.00 <0.001
Bike fleet size_2 0.11 0.11 – 0.11 <0.001
Dist to nearest MRT STA_2 0.01 0.00 – 0.01 <0.001
Observations 482476 482476 482476 482476
R2 / R2 adjusted 0.081 / 0.081 0.082 / 0.082 0.115 / 0.115 0.081 / 0.081

Origin Model Combination

variables <- c("dist_km",  "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o")

# Stores all combinations into a list
all_combinations <- c()

# Combination of 1 to 7 variables
for (num_variables in 1:6) {
  # Get all possible combinations
  variable_combinations <- combn(variables, num_variables, simplify = TRUE)
  
  # Add the name of the combination to the list
  for (i in 1:ncol(variable_combinations)) {
    combination_name <- paste(variable_combinations[, i], collapse = " + ")
    all_combinations <- c(all_combinations, combination_name)
  }
}

# Create an empty list for storing all models
all_models <- list()

# Iterate over all combinations and fit a linear regression model
for (combination in all_combinations) {
  formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd +", combination))
  model <- lm(formula, data = data)
  model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
  all_models[[model_name]] <- model
}
# Create an empty list to store the R-squared values for all models
r_square_values <- list()

# Iterate over all models and extract R-squared values
for (model_name in names(all_models)) {
  model <- all_models[[model_name]]
  r_square <- summary(model)$r.square
  r_square_values[[model_name]] <- r_square
}

# Converting lists to df
r_square_df <- data.frame(
  O_r2 = unlist(r_square_values)
)

print(r_square_df)
##                   O_r2
## lm_model_1  0.07848705
## lm_model_2  0.03697736
## lm_model_3  0.03696703
## lm_model_4  0.03712659
## lm_model_5  0.03686562
## lm_model_6  0.03703114
## lm_model_7  0.07864858
## lm_model_8  0.07854319
## lm_model_9  0.07870190
## lm_model_10 0.07851311
## lm_model_11 0.07848934
## lm_model_12 0.03708484
## lm_model_13 0.03721141
## lm_model_14 0.03699222
## lm_model_15 0.03720817
## lm_model_16 0.03718242
## lm_model_17 0.03697187
## lm_model_18 0.03712972
## lm_model_19 0.03712798
## lm_model_20 0.03723957
## lm_model_21 0.03704102
## lm_model_22 0.07870315
## lm_model_23 0.07882688
## lm_model_24 0.07868852
## lm_model_25 0.07866204
## lm_model_26 0.07872486
## lm_model_27 0.07856498
## lm_model_28 0.07854467
## lm_model_29 0.07870451
## lm_model_30 0.07870270
## lm_model_31 0.07851579
## lm_model_32 0.03726903
## lm_model_33 0.03709532
## lm_model_34 0.03730282
## lm_model_35 0.03721141
## lm_model_36 0.03736952
## lm_model_37 0.03722825
## lm_model_38 0.03718405
## lm_model_39 0.03729385
## lm_model_40 0.03713622
## lm_model_41 0.03723978
## lm_model_42 0.07885123
## lm_model_43 0.07873778
## lm_model_44 0.07871446
## lm_model_45 0.07883603
## lm_model_46 0.07882836
## lm_model_47 0.07870394
## lm_model_48 0.07872725
## lm_model_49 0.07872574
## lm_model_50 0.07856678
## lm_model_51 0.07870513
## lm_model_52 0.03726904
## lm_model_53 0.03742571
## lm_model_54 0.03731791
## lm_model_55 0.03737081
## lm_model_56 0.03729417
## lm_model_57 0.07886001
## lm_model_58 0.07885263
## lm_model_59 0.07875087
## lm_model_60 0.07883829
## lm_model_61 0.07872795
## lm_model_62 0.03742679
## lm_model_63 0.07886215

Squared Model Combination

all_models <- list()

for (combination in all_combinations) {
  formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd + fsize_s +", combination))
  model <- lm(formula, data = data)
  model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
  all_models[[model_name]] <- model
}
r_square_values <- list()

for (model_name in names(all_models)) {
  model <- all_models[[model_name]]
  r_square <- summary(model)$r.square
  r_square_values[[model_name]] <- r_square
}

r_square_df2 <- data.frame(
  S_r2 = unlist(r_square_values)
)

print(r_square_df2)
##                   S_r2
## lm_model_1  0.08027222
## lm_model_2  0.03809388
## lm_model_3  0.03810056
## lm_model_4  0.03815376
## lm_model_5  0.03806906
## lm_model_6  0.03824234
## lm_model_7  0.08031107
## lm_model_8  0.08027717
## lm_model_9  0.08031072
## lm_model_10 0.08029580
## lm_model_11 0.08027520
## lm_model_12 0.03813292
## lm_model_13 0.03817747
## lm_model_14 0.03810379
## lm_model_15 0.03830579
## lm_model_16 0.03817516
## lm_model_17 0.03810556
## lm_model_18 0.03827401
## lm_model_19 0.03815383
## lm_model_20 0.03829719
## lm_model_21 0.03825093
## lm_model_22 0.08031634
## lm_model_23 0.08034384
## lm_model_24 0.08034106
## lm_model_25 0.08031894
## lm_model_26 0.08031214
## lm_model_27 0.08029956
## lm_model_28 0.08027985
## lm_model_29 0.08032255
## lm_model_30 0.08031121
## lm_model_31 0.08029920
## lm_model_32 0.03820005
## lm_model_33 0.03814071
## lm_model_34 0.03833786
## lm_model_35 0.03817819
## lm_model_36 0.03834884
## lm_model_37 0.03831988
## lm_model_38 0.03817520
## lm_model_39 0.03831712
## lm_model_40 0.03828075
## lm_model_41 0.03829839
## lm_model_42 0.08034563
## lm_model_43 0.08034498
## lm_model_44 0.08032372
## lm_model_45 0.08036110
## lm_model_46 0.08034701
## lm_model_47 0.08035025
## lm_model_48 0.08032381
## lm_model_49 0.08031260
## lm_model_50 0.08030268
## lm_model_51 0.08032341
## lm_model_52 0.03820065
## lm_model_53 0.03837030
## lm_model_54 0.03834956
## lm_model_55 0.03835315
## lm_model_56 0.03831813
## lm_model_57 0.08036271
## lm_model_58 0.08034876
## lm_model_59 0.08035369
## lm_model_60 0.08036582
## lm_model_61 0.08032465
## lm_model_62 0.03837431
## lm_model_63 0.08036738

Comparasion of R-square

r_square_df$S_r2 <- r_square_df2$S_r2
write.csv(r_square_df, file = "C:/Users/49765/Desktop/Advanced Planning Methods/Group/r_square_data.csv", row.names = FALSE)
t_test_result <- t.test(r_square_df$O_r2, r_square_df$S_r2, paired = TRUE)

print(t_test_result)
## 
##  Paired t-test
## 
## data:  r_square_df$O_r2 and r_square_df$S_r2
## t = -35.295, df = 62, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.001419630 -0.001267446
## sample estimates:
## mean difference 
##    -0.001343538

Model Specification

best subset

# best subset
bestsubset <- regsubsets(trip_m6_wd ~  dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
              far_hdb_o + far_priv_o + far_comm_o + time +  fsize_m6_wd2, data = data, nbest=1)
#subsets(bestsubset, statistic = "adjr2", which = 1)
best.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time,
                 data = data)
summary(best.model)
## 
## Call:
## lm(formula = trip_m6_wd ~ dist_km + fsize_m6_wd + time, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.881  -0.967  -0.505   0.174 277.266 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.1444209  0.0096345  118.78   <2e-16 ***
## dist_km     -0.3834739  0.0026051 -147.20   <2e-16 ***
## fsize_m6_wd  0.0367418  0.0002551  144.01   <2e-16 ***
## time         0.2894217  0.0095862   30.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.27 on 482472 degrees of freedom
## Multiple R-squared:  0.08022,    Adjusted R-squared:  0.08022 
## F-statistic: 1.403e+04 on 3 and 482472 DF,  p-value: < 2.2e-16
# (Rule of Parsimonious) --> include variables in best.model and pick the higher order term that boost R-squared the most
best.model_1 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + dist_km2,
                 data = data)
best.model_2 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + fsize_m6_wd2,
                   data = data)
best.model_3 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + mrt_km_o2,
                   data = data)
tab_model(best.model_1, best.model_2, best.model_3,
          show.ci = 0.95,
          pred.labels = c("Intercept", "Rd Network distance (O-D)", "Bike fleet size",
                          "Distance to nearest MRT STA", "Rd Network distance (O-D)_2",
                          "Bike fleet size_2","Distance to nearest MRT STA_2"),
          dv.labels = c("RdNDdist2_model", "FleetSize2_model", "MRTDist2_model"),
          file = "best_models_comparison.html")
  RdNDdist2_model FleetSize2_model MRTDist2_model
Predictors Estimates CI p Estimates CI p Estimates CI p
Intercept 2.33 2.30 – 2.35 <0.001 1.08 1.06 – 1.10 <0.001 1.16 1.14 – 1.18 <0.001
Rd Network distance (O-D) -1.28 -1.29 – -1.26 <0.001 -0.39 -0.39 – -0.38 <0.001 -0.38 -0.39 – -0.38 <0.001
Bike fleet size 0.04 0.04 – 0.04 <0.001 0.05 0.05 – 0.05 <0.001 0.04 0.04 – 0.04 <0.001
Distance to nearest MRT STA 0.28 0.26 – 0.29 <0.001 0.26 0.24 – 0.28 <0.001 0.29 0.27 – 0.31 <0.001
Rd Network distance (O-D)_2 0.11 0.11 – 0.11 <0.001
Bike fleet size_2 -0.00 -0.00 – -0.00 <0.001
Distance to nearest MRT STA_2 -0.00 -0.00 – -0.00 <0.001
Observations 482476 482476 482476
R2 / R2 adjusted 0.113 / 0.113 0.082 / 0.082 0.080 / 0.080

Stepwise(final model)

null.model <- lm(trip_m6_wd ~ 1, data=data)
full.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
              far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2, data = data) 

step.model.for <- step(null.model,
                       scope = formula(full.model),
                       direction = "forward",
                       trace = 0)
step.model.back <- step(full.model,
                        direction = "backward",
                        trace = 0)
step.model.both <- step(null.model,
                        scope=formula(full.model),
                        direction = "both",
                        trace = 0)

stargazer(step.model.for, step.model.back, step.model.both, 
          type = "text",
          add.lines=list(c("AIC", round(AIC(step.model.for),1),  round(AIC(step.model.back),1),round(AIC(step.model.both),1))), 
          column.labels = c("Forward", "Backward", "Both"))
## 
## ========================================================================
##                                            Dependent variable:          
##                                   --------------------------------------
##                                                 trip_m6_wd              
##                                     Forward      Backward       Both    
##                                       (1)          (2)          (3)     
## ------------------------------------------------------------------------
## fsize_m6_wd                         0.045***     0.045***     0.045***  
##                                     (0.0005)     (0.0005)     (0.0005)  
##                                                                         
## dist_km                            -0.386***    -0.386***    -0.386***  
##                                     (0.003)      (0.003)      (0.003)   
##                                                                         
## fsize_m6_wd2                       -0.0001***   -0.0001***   -0.0001*** 
##                                    (0.00000)    (0.00000)    (0.00000)  
##                                                                         
## time                                0.272***     0.272***     0.272***  
##                                     (0.010)      (0.010)      (0.010)   
##                                                                         
## far_hdb_o                           0.035***     0.035***     0.035***  
##                                     (0.008)      (0.008)      (0.008)   
##                                                                         
## mrt_km_o                           -0.026***    -0.026***    -0.026***  
##                                     (0.004)      (0.004)      (0.004)   
##                                                                         
## far_priv_o                         -0.037***    -0.037***    -0.037***  
##                                     (0.013)      (0.013)      (0.013)   
##                                                                         
## cycle_km_o                          0.119**      0.119**      0.119**   
##                                     (0.057)      (0.057)      (0.057)   
##                                                                         
## Constant                            1.117***     1.117***     1.117***  
##                                     (0.013)      (0.013)      (0.013)   
##                                                                         
## ------------------------------------------------------------------------
## AIC                                2511639.3    2511639.3    2511639.3  
## Observations                        482,476      482,476      482,476   
## R2                                   0.082        0.082        0.082    
## Adjusted R2                          0.082        0.082        0.082    
## Residual Std. Error (df = 482467)    3.267        3.267        3.267    
## F Statistic (df = 8; 482467)      5,377.216*** 5,377.216*** 5,377.216***
## ========================================================================
## Note:                                        *p<0.1; **p<0.05; ***p<0.01
summary(step.model.both)
## 
## Call:
## lm(formula = trip_m6_wd ~ fsize_m6_wd + dist_km + fsize_m6_wd2 + 
##     time + far_hdb_o + mrt_km_o + far_priv_o + cycle_km_o, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -6.560  -0.965  -0.488   0.195 277.555 
## 
## Coefficients:
##                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   1.117e+00  1.326e-02   84.231  < 2e-16 ***
## fsize_m6_wd   4.546e-02  4.990e-04   91.094  < 2e-16 ***
## dist_km      -3.861e-01  2.606e-03 -148.144  < 2e-16 ***
## fsize_m6_wd2 -8.954e-05  3.654e-06  -24.506  < 2e-16 ***
## time          2.716e-01  9.673e-03   28.078  < 2e-16 ***
## far_hdb_o     3.541e-02  7.592e-03    4.664 3.10e-06 ***
## mrt_km_o     -2.636e-02  4.306e-03   -6.122 9.24e-10 ***
## far_priv_o   -3.662e-02  1.266e-02   -2.892  0.00383 ** 
## cycle_km_o    1.189e-01  5.666e-02    2.098  0.03591 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.267 on 482467 degrees of freedom
## Multiple R-squared:  0.08186,    Adjusted R-squared:  0.08185 
## F-statistic:  5377 on 8 and 482467 DF,  p-value: < 2.2e-16