Loading packages

Reading data

data <- read.csv("C:/Users/49765/Desktop/Advanced Planning Methods/Group/Group_project_deliverable_2/June_cleaned.csv")
str(data)
## 'data.frame':    482476 obs. of  11 variables:
##  $ ori        : int  75 255 256 976 1519 74 75 255 256 434 ...
##  $ dst        : int  74 74 74 74 74 75 75 75 75 75 ...
##  $ dist_km    : num  0.38 0.83 1.05 4.57 4.21 0.38 0 0.45 0.66 1.34 ...
##  $ fsize_m6_wd: num  0 0.136 0.591 15.409 62.955 ...
##  $ trip_m6_wd : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mrt_km_o   : num  3.27 3.14 2.91 1.97 0.52 3.64 3.27 3.14 2.91 3.57 ...
##  $ cycle_km_o : num  0 0 0 0 0.52 0 0 0 0 0 ...
##  $ far_hdb_o  : num  0 0 0 0.000442 0.742848 ...
##  $ far_priv_o : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ far_comm_o : num  0 0 0 0 0.536 ...
##  $ time       : int  0 0 0 0 0 0 0 0 0 0 ...

Correlation

data$fsize_s <- data$fsize_m6_wd^2
Using_data <- data[c("dist_km", "fsize_m6_wd", "trip_m6_wd", "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o", "fsize_s")]

correlation_matrix <- cor(Using_data)

color_theme <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

corrplot(correlation_matrix, 
         method = "color", 
         col = color_theme(200), 
         order = "hclust", 
         addCoef.col = "darkgrey"
)

# check order terms

# whether include a higher order term (Ramsey’s RESET Test) --> answer is yes
resettest(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o + far_hdb_o + far_priv_o + far_comm_o + time, power=2 , type="regressor", data = data)
## 
##  RESET test
## 
## data:  trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +     far_hdb_o + far_priv_o + far_comm_o + time
## RESET = 2416.7, df1 = 8, df2 = 482459, p-value < 2.2e-16
# check different higher order terms

# ***
data$fsize_m6_wd2 <- data$fsize_m6_wd^2
data$dist_km2 <- data$dist_km^2
data$mrt_km_o2 <- data$mrt_km_o^2
# **
data$far_hdb_o2 <- data$far_hdb_o^2
# *
data$cycle_km_o2 <- data$cycle_km_o^2
data$cycle_km_o2 <- data$cycle_km_o^2
# none
data$far_priv_o2 <- data$far_priv_o^2
data$far_comm_o2 <- data$far_comm_o^2


# list of models
ori_model <- lm(trip_m6_wd ~  dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
              far_hdb_o + far_priv_o + far_comm_o + time, data = data)

RdNDdist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                        far_hdb_o + far_priv_o + far_comm_o + time + dist_km2, data = data)

FleetSize2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                  far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2 , data = data)

MRTDist2_model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                        far_hdb_o + far_priv_o + far_comm_o + time + mrt_km_o2, data = data)

tab_model(ori_model, FleetSize2_model, RdNDdist2_model, MRTDist2_model,
          show.ci = 0.95,
          pred.labels = c("(Intercept)", "Rd Network distance (O-D)","Bike fleet size","Distance to nearest MRT STA", "Bike lanes length",
                          "Floor Area Ratio of Public Housing Bldg", "Floor Area Ratio of Private Residential Bldg",
                          "Floor Area Ratio of Commercial Bldg", "Morning/Evening",
                          "Rd Network distance (O-D)_2","Bike fleet size_2", "Dist to nearest MRT STA_2"),
          dv.labels = c("model1", "model2",
                        "model3", "model4"),
          file = "higher_orders_comparison.html")
  model1 model2 model3 model4
Predictors Estimates CI p Estimates CI p Estimates CI p Estimates CI p
(Intercept) 1.19 1.16 – 1.21 <0.001 1.12 1.09 – 1.15 <0.001 2.44 2.41 – 2.47 <0.001 1.24 1.21 – 1.27 <0.001
Rd Network distance (O-D) -0.38 -0.39 – -0.38 <0.001 -0.39 -0.39 – -0.38 <0.001 -1.29 -1.30 – -1.28 <0.001 -0.38 -0.39 – -0.38 <0.001
Bike fleet size 0.04 0.03 – 0.04 <0.001 0.05 0.04 – 0.05 <0.001 0.04 0.04 – 0.04 <0.001 0.04 0.03 – 0.04 <0.001
Distance to nearest MRT STA -0.04 -0.05 – -0.03 <0.001 -0.03 -0.04 – -0.02 <0.001 -0.08 -0.09 – -0.07 <0.001 -0.09 -0.10 – -0.07 <0.001
Bike lanes length 0.25 0.14 – 0.36 <0.001 0.12 0.01 – 0.23 0.037 0.25 0.14 – 0.36 <0.001 0.26 0.15 – 0.37 <0.001
Floor Area Ratio of Public Housing Bldg 0.07 0.05 – 0.08 <0.001 0.03 0.02 – 0.05 <0.001 0.09 0.08 – 0.11 <0.001 0.07 0.05 – 0.08 <0.001
Floor Area Ratio of Private Residential Bldg -0.03 -0.05 – -0.00 0.028 -0.04 -0.06 – -0.01 0.003 -0.03 -0.05 – -0.00 0.022 -0.03 -0.05 – -0.00 0.024
Floor Area Ratio of Commercial Bldg -0.01 -0.03 – 0.02 0.525 -0.01 -0.04 – 0.01 0.262 0.01 -0.01 – 0.04 0.269 -0.02 -0.04 – 0.00 0.119
Morning/Evening 0.30 0.28 – 0.32 <0.001 0.27 0.25 – 0.29 <0.001 0.29 0.27 – 0.31 <0.001 0.30 0.28 – 0.32 <0.001
Rd Network distance (O-D)_2 -0.00 -0.00 – -0.00 <0.001
Bike fleet size_2 0.11 0.11 – 0.11 <0.001
Dist to nearest MRT STA_2 0.01 0.00 – 0.01 <0.001
Observations 482476 482476 482476 482476
R2 / R2 adjusted 0.081 / 0.081 0.082 / 0.082 0.115 / 0.115 0.081 / 0.081

Origin Model Combination

variables <- c("dist_km",  "mrt_km_o", "cycle_km_o", "far_hdb_o", "far_priv_o", "far_comm_o")

# Stores all combinations into a list
all_combinations <- c()

# Combination of 1 to 7 variables
for (num_variables in 1:6) {
  # Get all possible combinations
  variable_combinations <- combn(variables, num_variables, simplify = TRUE)
  
  # Add the name of the combination to the list
  for (i in 1:ncol(variable_combinations)) {
    combination_name <- paste(variable_combinations[, i], collapse = " + ")
    all_combinations <- c(all_combinations, combination_name)
  }
}

# Create an empty list for storing all models
all_models <- list()

# Iterate over all combinations and fit a linear regression model
for (combination in all_combinations) {
  formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd +", combination))
  model <- lm(formula, data = data)
  model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
  all_models[[model_name]] <- model
}
# Create an empty list to store the R-squared values for all models
r_square_values <- list()

# Iterate over all models and extract R-squared values
for (model_name in names(all_models)) {
  model <- all_models[[model_name]]
  r_square <- summary(model)$r.square
  r_square_values[[model_name]] <- r_square
}

# Converting lists to df
r_square_df <- data.frame(
  O_r2 = unlist(r_square_values)
)

print(r_square_df)
##                   O_r2
## lm_model_1  0.07848705
## lm_model_2  0.03697736
## lm_model_3  0.03696703
## lm_model_4  0.03712659
## lm_model_5  0.03686562
## lm_model_6  0.03703114
## lm_model_7  0.07864858
## lm_model_8  0.07854319
## lm_model_9  0.07870190
## lm_model_10 0.07851311
## lm_model_11 0.07848934
## lm_model_12 0.03708484
## lm_model_13 0.03721141
## lm_model_14 0.03699222
## lm_model_15 0.03720817
## lm_model_16 0.03718242
## lm_model_17 0.03697187
## lm_model_18 0.03712972
## lm_model_19 0.03712798
## lm_model_20 0.03723957
## lm_model_21 0.03704102
## lm_model_22 0.07870315
## lm_model_23 0.07882688
## lm_model_24 0.07868852
## lm_model_25 0.07866204
## lm_model_26 0.07872486
## lm_model_27 0.07856498
## lm_model_28 0.07854467
## lm_model_29 0.07870451
## lm_model_30 0.07870270
## lm_model_31 0.07851579
## lm_model_32 0.03726903
## lm_model_33 0.03709532
## lm_model_34 0.03730282
## lm_model_35 0.03721141
## lm_model_36 0.03736952
## lm_model_37 0.03722825
## lm_model_38 0.03718405
## lm_model_39 0.03729385
## lm_model_40 0.03713622
## lm_model_41 0.03723978
## lm_model_42 0.07885123
## lm_model_43 0.07873778
## lm_model_44 0.07871446
## lm_model_45 0.07883603
## lm_model_46 0.07882836
## lm_model_47 0.07870394
## lm_model_48 0.07872725
## lm_model_49 0.07872574
## lm_model_50 0.07856678
## lm_model_51 0.07870513
## lm_model_52 0.03726904
## lm_model_53 0.03742571
## lm_model_54 0.03731791
## lm_model_55 0.03737081
## lm_model_56 0.03729417
## lm_model_57 0.07886001
## lm_model_58 0.07885263
## lm_model_59 0.07875087
## lm_model_60 0.07883829
## lm_model_61 0.07872795
## lm_model_62 0.03742679
## lm_model_63 0.07886215

Squared Model Combination

all_models <- list()

for (combination in all_combinations) {
  formula <- as.formula(paste("trip_m6_wd ~ fsize_m6_wd + fsize_s +", combination))
  model <- lm(formula, data = data)
  model_name <- paste("lm_model_", length(all_models) + 1, sep = "")
  all_models[[model_name]] <- model
}
r_square_values <- list()

for (model_name in names(all_models)) {
  model <- all_models[[model_name]]
  r_square <- summary(model)$r.square
  r_square_values[[model_name]] <- r_square
}

r_square_df2 <- data.frame(
  S_r2 = unlist(r_square_values)
)

print(r_square_df2)
##                   S_r2
## lm_model_1  0.08027222
## lm_model_2  0.03809388
## lm_model_3  0.03810056
## lm_model_4  0.03815376
## lm_model_5  0.03806906
## lm_model_6  0.03824234
## lm_model_7  0.08031107
## lm_model_8  0.08027717
## lm_model_9  0.08031072
## lm_model_10 0.08029580
## lm_model_11 0.08027520
## lm_model_12 0.03813292
## lm_model_13 0.03817747
## lm_model_14 0.03810379
## lm_model_15 0.03830579
## lm_model_16 0.03817516
## lm_model_17 0.03810556
## lm_model_18 0.03827401
## lm_model_19 0.03815383
## lm_model_20 0.03829719
## lm_model_21 0.03825093
## lm_model_22 0.08031634
## lm_model_23 0.08034384
## lm_model_24 0.08034106
## lm_model_25 0.08031894
## lm_model_26 0.08031214
## lm_model_27 0.08029956
## lm_model_28 0.08027985
## lm_model_29 0.08032255
## lm_model_30 0.08031121
## lm_model_31 0.08029920
## lm_model_32 0.03820005
## lm_model_33 0.03814071
## lm_model_34 0.03833786
## lm_model_35 0.03817819
## lm_model_36 0.03834884
## lm_model_37 0.03831988
## lm_model_38 0.03817520
## lm_model_39 0.03831712
## lm_model_40 0.03828075
## lm_model_41 0.03829839
## lm_model_42 0.08034563
## lm_model_43 0.08034498
## lm_model_44 0.08032372
## lm_model_45 0.08036110
## lm_model_46 0.08034701
## lm_model_47 0.08035025
## lm_model_48 0.08032381
## lm_model_49 0.08031260
## lm_model_50 0.08030268
## lm_model_51 0.08032341
## lm_model_52 0.03820065
## lm_model_53 0.03837030
## lm_model_54 0.03834956
## lm_model_55 0.03835315
## lm_model_56 0.03831813
## lm_model_57 0.08036271
## lm_model_58 0.08034876
## lm_model_59 0.08035369
## lm_model_60 0.08036582
## lm_model_61 0.08032465
## lm_model_62 0.03837431
## lm_model_63 0.08036738

Comparasion of R-square

r_square_df$S_r2 <- r_square_df2$S_r2
write.csv(r_square_df, file = "C:/Users/49765/Desktop/Advanced Planning Methods/Group/r_square_data.csv", row.names = FALSE)
t_test_result <- t.test(r_square_df$O_r2, r_square_df$S_r2, paired = TRUE)

print(t_test_result)
## 
##  Paired t-test
## 
## data:  r_square_df$O_r2 and r_square_df$S_r2
## t = -35.295, df = 62, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.001419630 -0.001267446
## sample estimates:
## mean difference 
##    -0.001343538

Model Specification

best subset

# best subset
bestsubset <- regsubsets(trip_m6_wd ~  dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
              far_hdb_o + far_priv_o + far_comm_o + time +  fsize_m6_wd2, data = data, nbest=1)
#subsets(bestsubset, statistic = "adjr2", which = 1)
best.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time,
                 data = data)
summary(best.model)
## 
## Call:
## lm(formula = trip_m6_wd ~ dist_km + fsize_m6_wd + time, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.881  -0.967  -0.505   0.174 277.266 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.1444209  0.0096345  118.78   <2e-16 ***
## dist_km     -0.3834739  0.0026051 -147.20   <2e-16 ***
## fsize_m6_wd  0.0367418  0.0002551  144.01   <2e-16 ***
## time         0.2894217  0.0095862   30.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.27 on 482472 degrees of freedom
## Multiple R-squared:  0.08022,    Adjusted R-squared:  0.08022 
## F-statistic: 1.403e+04 on 3 and 482472 DF,  p-value: < 2.2e-16
# (Rule of Parsimonious) --> include variables in best.model and pick the higher order term that boost R-squared the most
best.model_1 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + dist_km2,
                 data = data)
best.model_2 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + fsize_m6_wd2,
                   data = data)
best.model_3 <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + time + mrt_km_o2,
                   data = data)
tab_model(best.model_1, best.model_2, best.model_3,
          show.ci = 0.95,
          pred.labels = c("Intercept", "Rd Network distance (O-D)", "Bike fleet size",
                          "Distance to nearest MRT STA", "Rd Network distance (O-D)_2",
                          "Bike fleet size_2","Distance to nearest MRT STA_2"),
          dv.labels = c("RdNDdist2_model", "FleetSize2_model", "MRTDist2_model"),
          file = "best_models_comparison.html")
  RdNDdist2_model FleetSize2_model MRTDist2_model
Predictors Estimates CI p Estimates CI p Estimates CI p
Intercept 2.33 2.30 – 2.35 <0.001 1.08 1.06 – 1.10 <0.001 1.16 1.14 – 1.18 <0.001
Rd Network distance (O-D) -1.28 -1.29 – -1.26 <0.001 -0.39 -0.39 – -0.38 <0.001 -0.38 -0.39 – -0.38 <0.001
Bike fleet size 0.04 0.04 – 0.04 <0.001 0.05 0.05 – 0.05 <0.001 0.04 0.04 – 0.04 <0.001
Distance to nearest MRT STA 0.28 0.26 – 0.29 <0.001 0.26 0.24 – 0.28 <0.001 0.29 0.27 – 0.31 <0.001
Rd Network distance (O-D)_2 0.11 0.11 – 0.11 <0.001
Bike fleet size_2 -0.00 -0.00 – -0.00 <0.001
Distance to nearest MRT STA_2 -0.00 -0.00 – -0.00 <0.001
Observations 482476 482476 482476
R2 / R2 adjusted 0.113 / 0.113 0.082 / 0.082 0.080 / 0.080

Stepwise(final model)

null.model <- lm(trip_m6_wd ~ 1, data=data)
full.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
              far_hdb_o + far_priv_o + far_comm_o + time + fsize_m6_wd2, data = data) 

step.model.for <- step(null.model,
                       scope = formula(full.model),
                       direction = "forward",
                       trace = 0)
step.model.back <- step(full.model,
                        direction = "backward",
                        trace = 0)
step.model.both <- step(null.model,
                        scope=formula(full.model),
                        direction = "both",
                        trace = 0)

stargazer(step.model.for, step.model.back, step.model.both, 
          type = "text",
          file = "stepwise.html",
          add.lines=list(c("AIC", round(AIC(step.model.for),1),  round(AIC(step.model.back),1),round(AIC(step.model.both),1))), 
          column.labels = c("Forward", "Backward", "Both"))
## 
## ========================================================================
##                                            Dependent variable:          
##                                   --------------------------------------
##                                                 trip_m6_wd              
##                                     Forward      Backward       Both    
##                                       (1)          (2)          (3)     
## ------------------------------------------------------------------------
## fsize_m6_wd                         0.045***     0.045***     0.045***  
##                                     (0.0005)     (0.0005)     (0.0005)  
##                                                                         
## dist_km                            -0.386***    -0.386***    -0.386***  
##                                     (0.003)      (0.003)      (0.003)   
##                                                                         
## fsize_m6_wd2                       -0.0001***   -0.0001***   -0.0001*** 
##                                    (0.00000)    (0.00000)    (0.00000)  
##                                                                         
## time                                0.272***     0.272***     0.272***  
##                                     (0.010)      (0.010)      (0.010)   
##                                                                         
## far_hdb_o                           0.035***     0.035***     0.035***  
##                                     (0.008)      (0.008)      (0.008)   
##                                                                         
## mrt_km_o                           -0.026***    -0.026***    -0.026***  
##                                     (0.004)      (0.004)      (0.004)   
##                                                                         
## far_priv_o                         -0.037***    -0.037***    -0.037***  
##                                     (0.013)      (0.013)      (0.013)   
##                                                                         
## cycle_km_o                          0.119**      0.119**      0.119**   
##                                     (0.057)      (0.057)      (0.057)   
##                                                                         
## Constant                            1.117***     1.117***     1.117***  
##                                     (0.013)      (0.013)      (0.013)   
##                                                                         
## ------------------------------------------------------------------------
## AIC                                2511639.3    2511639.3    2511639.3  
## Observations                        482,476      482,476      482,476   
## R2                                   0.082        0.082        0.082    
## Adjusted R2                          0.082        0.082        0.082    
## Residual Std. Error (df = 482467)    3.267        3.267        3.267    
## F Statistic (df = 8; 482467)      5,377.216*** 5,377.216*** 5,377.216***
## ========================================================================
## Note:                                        *p<0.1; **p<0.05; ***p<0.01
## 
## =============
## stepwise.html
## -------------
summary(step.model.both)
## 
## Call:
## lm(formula = trip_m6_wd ~ fsize_m6_wd + dist_km + fsize_m6_wd2 + 
##     time + far_hdb_o + mrt_km_o + far_priv_o + cycle_km_o, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -6.560  -0.965  -0.488   0.195 277.555 
## 
## Coefficients:
##                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   1.117e+00  1.326e-02   84.231  < 2e-16 ***
## fsize_m6_wd   4.546e-02  4.990e-04   91.094  < 2e-16 ***
## dist_km      -3.861e-01  2.606e-03 -148.144  < 2e-16 ***
## fsize_m6_wd2 -8.954e-05  3.654e-06  -24.506  < 2e-16 ***
## time          2.716e-01  9.673e-03   28.078  < 2e-16 ***
## far_hdb_o     3.541e-02  7.592e-03    4.664 3.10e-06 ***
## mrt_km_o     -2.636e-02  4.306e-03   -6.122 9.24e-10 ***
## far_priv_o   -3.662e-02  1.266e-02   -2.892  0.00383 ** 
## cycle_km_o    1.189e-01  5.666e-02    2.098  0.03591 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.267 on 482467 degrees of freedom
## Multiple R-squared:  0.08186,    Adjusted R-squared:  0.08185 
## F-statistic:  5377 on 8 and 482467 DF,  p-value: < 2.2e-16
final.model <- step.model.both

Predictions with Bike fleet size

vector <- seq(1, 300)
vector_df <- data.frame(vector)
new_data_time_1 <- data.frame(dist_km = mean(data$dist_km),
                              fsize_m6_wd = vector_df$vector,
                              mrt_km_o = mean(data$mrt_km_o),
                              cycle_km_o = mean(data$cycle_km_o),
                              far_hdb_o = mean(data$far_hdb_o),
                              far_priv_o = mean(data$far_priv_o),
                              fsize_m6_wd2 = vector_df$vector^2,
                              time = 1)
predicted_values_time_1 <- predict(final.model, newdata = new_data_time_1); #print(predicted_values_time_1)

plot(predicted_values_time_1,
     xlab = 'Bike fleet size',
     ylab = 'Number of bike trips')

#scatter.smooth(predicted_values_time_1, lpars = list(col = "green", lwd = 3),
#               xlab = 'Bike fleet size',ylab = 'Number of bike trips')

# find the max point index, use the index to get the max value
max_point_index <- which.max(predicted_values_time_1)
max_point <- vector[max_point_index]
points(max_point, predicted_values_time_1[max_point_index], col = "red", pch = 16)

#max_point_index = 254(6.133189 )

Diagnose

### FINAL MODEL
final.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                    far_hdb_o + far_priv_o + time + fsize_m6_wd2, data = data)
final.model <- lm(trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o +
                    I(far_hdb_o*100) + I(far_priv_o*100) + time + fsize_m6_wd2, data = data)
summary(final.model)
## 
## Call:
## lm(formula = trip_m6_wd ~ dist_km + fsize_m6_wd + mrt_km_o + 
##     cycle_km_o + I(far_hdb_o * 100) + I(far_priv_o * 100) + time + 
##     fsize_m6_wd2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -6.560  -0.965  -0.488   0.195 277.555 
## 
## Coefficients:
##                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          1.117e+00  1.326e-02   84.231  < 2e-16 ***
## dist_km             -3.861e-01  2.606e-03 -148.144  < 2e-16 ***
## fsize_m6_wd          4.546e-02  4.990e-04   91.094  < 2e-16 ***
## mrt_km_o            -2.636e-02  4.306e-03   -6.122 9.24e-10 ***
## cycle_km_o           1.189e-01  5.666e-02    2.098  0.03591 *  
## I(far_hdb_o * 100)   3.541e-04  7.592e-05    4.664 3.10e-06 ***
## I(far_priv_o * 100) -3.662e-04  1.266e-04   -2.892  0.00383 ** 
## time                 2.716e-01  9.673e-03   28.078  < 2e-16 ***
## fsize_m6_wd2        -8.954e-05  3.654e-06  -24.506  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.267 on 482467 degrees of freedom
## Multiple R-squared:  0.08186,    Adjusted R-squared:  0.08185 
## F-statistic:  5377 on 8 and 482467 DF,  p-value: < 2.2e-16
#outlier detection
cook.dist <-cooks.distance(final.model)
data$cook.dist <-cooks.distance(final.model)
plot(cook.dist, pch="*", cex=2, main="Influential Obs by Cooks distance")
abline(h = 4*mean(cook.dist, na.rm=T), col="red")
text(x = 1:length(cook.dist) + 5, 
     y = cook.dist, 
     col="red",
     labels = ifelse(cook.dist > 0.05, # Conditional statement
                     names(cook.dist),
                     ""))

#log-transformation
final.model_log <-lm((log(2+trip_m6_wd))  ~ dist_km + fsize_m6_wd + mrt_km_o + cycle_km_o+ +
                       I(far_hdb_o*100) + I(far_priv_o*100)+ time + fsize_m6_wd2, data = data)
summary(final.model_log)
## 
## Call:
## lm(formula = (log(2 + trip_m6_wd)) ~ dist_km + fsize_m6_wd + 
##     mrt_km_o + cycle_km_o + +I(far_hdb_o * 100) + I(far_priv_o * 
##     100) + time + fsize_m6_wd2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7259 -0.2137 -0.1183  0.1022  4.1915 
## 
## Coefficients:
##                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          9.469e-01  1.571e-03  602.843  < 2e-16 ***
## dist_km             -6.953e-02  3.086e-04 -225.284  < 2e-16 ***
## fsize_m6_wd          6.341e-03  5.910e-05  107.293  < 2e-16 ***
## mrt_km_o            -9.782e-03  5.100e-04  -19.180  < 2e-16 ***
## cycle_km_o           3.319e-02  6.710e-03    4.945  7.6e-07 ***
## I(far_hdb_o * 100)   1.280e-04  8.992e-06   14.231  < 2e-16 ***
## I(far_priv_o * 100) -4.029e-05  1.500e-05   -2.686  0.00723 ** 
## time                 7.674e-02  1.146e-03   66.987  < 2e-16 ***
## fsize_m6_wd2        -2.150e-05  4.328e-07  -49.672  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.387 on 482467 degrees of freedom
## Multiple R-squared:  0.141,  Adjusted R-squared:  0.1409 
## F-statistic:  9895 on 8 and 482467 DF,  p-value: < 2.2e-16
summary(lm.beta(final.model_log))
## 
## Call:
## lm(formula = (log(2 + trip_m6_wd)) ~ dist_km + fsize_m6_wd + 
##     mrt_km_o + cycle_km_o + +I(far_hdb_o * 100) + I(far_priv_o * 
##     100) + time + fsize_m6_wd2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7259 -0.2137 -0.1183  0.1022  4.1915 
## 
## Coefficients:
##                       Estimate Standardized Std. Error  t value Pr(>|t|)    
## (Intercept)          9.469e-01           NA  1.571e-03  602.843  < 2e-16 ***
## dist_km             -6.953e-02   -3.024e-01  3.086e-04 -225.284  < 2e-16 ***
## fsize_m6_wd          6.341e-03    2.866e-01  5.910e-05  107.293  < 2e-16 ***
## mrt_km_o            -9.782e-03   -2.699e-02  5.100e-04  -19.180  < 2e-16 ***
## cycle_km_o           3.319e-02    6.896e-03  6.710e-03    4.945  7.6e-07 ***
## I(far_hdb_o * 100)   1.280e-04    2.110e-02  8.992e-06   14.231  < 2e-16 ***
## I(far_priv_o * 100) -4.029e-05   -3.728e-03  1.500e-05   -2.686  0.00723 ** 
## time                 7.674e-02    9.191e-02  1.146e-03   66.987  < 2e-16 ***
## fsize_m6_wd2        -2.150e-05   -1.237e-01  4.328e-07  -49.672  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.387 on 482467 degrees of freedom
## Multiple R-squared:  0.141,  Adjusted R-squared:  0.1409 
## F-statistic:  9895 on 8 and 482467 DF,  p-value: < 2.2e-16
#cook distance of logged model
cook.dist.log <- cooks.distance(final.model_log)
plot(cook.dist.log, pch="*", cex=2, main="Influential Obs by Cooks distance")
abline(h = 4*mean(cook.dist.log, na.rm=T), col="red")
text(x = 1:length(cook.dist.log) + 5, 
     y = cook.dist.log, 
     col="red",
     labels = ifelse(cook.dist.log > .005, 
                     names(cook.dist.log), 
                     ""))

#comparision of two models
tab_model(final.model, final.model_log,
          show.ci = 0.95,
          show.stat = TRUE,
          show.aic = TRUE,
          pred.labels = c("Intercept", "Rd Network distance (O-D)", "Bike trips",
                          "Distance to nearest MRT STA", "Lenth of Bike Lanes","FAR of public residential",
                          "FAR of private residential","time","Bike trips_2"),
          dv.labels = c("Normal model","Logged model"),
          file = "log_comparison.html")
  Normal model Logged model
Predictors Estimates CI Statistic p Estimates CI Statistic p
Intercept 1.12 1.09 – 1.14 84.23 <0.001 0.95 0.94 – 0.95 602.84 <0.001
Rd Network distance (O-D) -0.39 -0.39 – -0.38 -148.14 <0.001 -0.07 -0.07 – -0.07 -225.28 <0.001
Bike trips 0.05 0.04 – 0.05 91.09 <0.001 0.01 0.01 – 0.01 107.29 <0.001
Distance to nearest MRT STA -0.03 -0.03 – -0.02 -6.12 <0.001 -0.01 -0.01 – -0.01 -19.18 <0.001
Lenth of Bike Lanes 0.12 0.01 – 0.23 2.10 0.036 0.03 0.02 – 0.05 4.95 <0.001
FAR of public residential 0.00 0.00 – 0.00 4.66 <0.001 0.00 0.00 – 0.00 14.23 <0.001
FAR of private residential -0.00 -0.00 – -0.00 -2.89 0.004 -0.00 -0.00 – -0.00 -2.69 0.007
time 0.27 0.25 – 0.29 28.08 <0.001 0.08 0.07 – 0.08 66.99 <0.001
Bike trips_2 -0.00 -0.00 – -0.00 -24.51 <0.001 -0.00 -0.00 – -0.00 -49.67 <0.001
Observations 482476 482476
R2 / R2 adjusted 0.082 / 0.082 0.141 / 0.141
AIC 2511639.335 -Inf
#Comparisions of the residuals distribution
plot(fitted(final.model),resid(final.model),abline(h=0))

plot(fitted(final.model_log),resid(final.model_log),abline(h=0))

hist(final.model$residuals,
     main = "Histogram of Final model_Resid",
     xlab = "Final model_Resid",
     ylab = "Frequency",
     col = "skyblue")

hist(final.model_log$residuals,
     main = "Histogram of Final model_Residln",
     xlab = "Final model_Residln",
     ylab = "Frequency",
     col = "skyblue")

#Heteroskedasticity detection
plot(predict(final.model), 
     final.model$residuals, 
     xlab = "Predicted values",
     ylab = "Residuals",
     main = "Residuals vs. Predicted")
abline(h = 0, col = 'red', lty = 2)

ncvTest(final.model)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1279035, Df = 1, p = < 2.22e-16
plot(predict(final.model_log), 
     final.model_log$residuals, 
     xlab = "Predicted values",
     ylab = "Residuals",
     main = "Residuals vs. Predicted")
abline(h = 0, col = 'red', lty = 2)

ncvTest(final.model_log)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 230291.1, Df = 1, p = < 2.22e-16
coeftest(final.model_log, vcov = vcovHC(final.model, type = "HC1"))
## 
## t test of coefficients:
## 
##                        Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept)          9.4692e-01  1.3997e-02  67.6520 < 2.2e-16 ***
## dist_km             -6.9533e-02  3.5009e-03 -19.8617 < 2.2e-16 ***
## fsize_m6_wd          6.3414e-03  1.6525e-03   3.8375 0.0001243 ***
## mrt_km_o            -9.7821e-03  3.1531e-03  -3.1024 0.0019198 ** 
## cycle_km_o           3.3186e-02  8.6630e-02   0.3831 0.7016611    
## I(far_hdb_o * 100)   1.2797e-04  9.4699e-05   1.3514 0.1765806    
## I(far_priv_o * 100) -4.0291e-05  1.2385e-04  -0.3253 0.7449344    
## time                 7.6744e-02  9.1221e-03   8.4130 < 2.2e-16 ***
## fsize_m6_wd2        -2.1496e-05  1.8560e-05  -1.1582 0.2467955    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Multicollinearity detection
car::vif(final.model) #which indicates there's no multicollinearity among variables
##             dist_km         fsize_m6_wd            mrt_km_o          cycle_km_o 
##            1.011795            4.008320            1.112352            1.092022 
##  I(far_hdb_o * 100) I(far_priv_o * 100)                time        fsize_m6_wd2 
##            1.234481            1.081515            1.057264            3.481572