rm(list = ls())

1 Data

library(foreign)
clogit <- read.csv("clogit.csv")

summary(clogit)

##       MODE           TTME            INVC             INVT       
##  Min.   :0.00   Min.   : 0.00   Min.   :  2.00   Min.   :  63.0  
##  1st Qu.:0.00   1st Qu.: 0.75   1st Qu.: 23.00   1st Qu.: 234.0  
##  Median :0.00   Median :35.00   Median : 39.00   Median : 397.0  
##  Mean   :0.25   Mean   :34.59   Mean   : 47.76   Mean   : 486.2  
##  3rd Qu.:0.25   3rd Qu.:53.00   3rd Qu.: 66.25   3rd Qu.: 795.5  
##  Max.   :1.00   Max.   :99.00   Max.   :180.00   Max.   :1440.0  
##        GC            CHAIR             HINC           PSIZE      
##  Min.   : 30.0   Min.   :0.0000   Min.   : 2.00   Min.   :1.000  
##  1st Qu.: 71.0   1st Qu.:0.0000   1st Qu.:20.00   1st Qu.:1.000  
##  Median :101.5   Median :0.0000   Median :34.50   Median :1.000  
##  Mean   :110.9   Mean   :0.2762   Mean   :34.55   Mean   :1.743  
##  3rd Qu.:144.0   3rd Qu.:1.0000   3rd Qu.:50.00   3rd Qu.:2.000  
##  Max.   :269.0   Max.   :1.0000   Max.   :72.00   Max.   :6.000  
##       AASC           TASC           BASC           CASC          HINCA       
##  Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   : 0.000  
##  1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.: 0.000  
##  Median :0.00   Median :0.00   Median :0.00   Median :0.00   Median : 0.000  
##  Mean   :0.25   Mean   :0.25   Mean   :0.25   Mean   :0.25   Mean   : 8.637  
##  3rd Qu.:0.25   3rd Qu.:0.25   3rd Qu.:0.25   3rd Qu.:0.25   3rd Qu.: 0.500  
##  Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :72.000  
##      PSIZEA      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4357  
##  3rd Qu.:0.2500  
##  Max.   :6.0000

summary(clogit)

##       MODE           TTME            INVC             INVT       
##  Min.   :0.00   Min.   : 0.00   Min.   :  2.00   Min.   :  63.0  
##  1st Qu.:0.00   1st Qu.: 0.75   1st Qu.: 23.00   1st Qu.: 234.0  
##  Median :0.00   Median :35.00   Median : 39.00   Median : 397.0  
##  Mean   :0.25   Mean   :34.59   Mean   : 47.76   Mean   : 486.2  
##  3rd Qu.:0.25   3rd Qu.:53.00   3rd Qu.: 66.25   3rd Qu.: 795.5  
##  Max.   :1.00   Max.   :99.00   Max.   :180.00   Max.   :1440.0  
##        GC            CHAIR             HINC           PSIZE      
##  Min.   : 30.0   Min.   :0.0000   Min.   : 2.00   Min.   :1.000  
##  1st Qu.: 71.0   1st Qu.:0.0000   1st Qu.:20.00   1st Qu.:1.000  
##  Median :101.5   Median :0.0000   Median :34.50   Median :1.000  
##  Mean   :110.9   Mean   :0.2762   Mean   :34.55   Mean   :1.743  
##  3rd Qu.:144.0   3rd Qu.:1.0000   3rd Qu.:50.00   3rd Qu.:2.000  
##  Max.   :269.0   Max.   :1.0000   Max.   :72.00   Max.   :6.000  
##       AASC           TASC           BASC           CASC          HINCA       
##  Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   : 0.000  
##  1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00   1st Qu.: 0.000  
##  Median :0.00   Median :0.00   Median :0.00   Median :0.00   Median : 0.000  
##  Mean   :0.25   Mean   :0.25   Mean   :0.25   Mean   :0.25   Mean   : 8.637  
##  3rd Qu.:0.25   3rd Qu.:0.25   3rd Qu.:0.25   3rd Qu.:0.25   3rd Qu.: 0.500  
##  Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :72.000  
##      PSIZEA      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4357  
##  3rd Qu.:0.2500  
##  Max.   :6.0000

str(clogit)

## 'data.frame':    840 obs. of  14 variables:
##  $ MODE  : int  0 0 0 1 0 0 0 1 0 0 ...
##  $ TTME  : int  69 34 35 0 64 44 53 0 69 34 ...
##  $ INVC  : int  59 31 25 10 58 31 25 11 115 98 ...
##  $ INVT  : int  100 372 417 180 68 354 399 255 125 892 ...
##  $ GC    : int  70 71 70 30 68 84 85 50 129 195 ...
##  $ CHAIR : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HINC  : int  35 35 35 35 30 30 30 30 40 40 ...
##  $ PSIZE : int  1 1 1 1 2 2 2 2 1 1 ...
##  $ AASC  : int  1 0 0 0 1 0 0 0 1 0 ...
##  $ TASC  : int  0 1 0 0 0 1 0 0 0 1 ...
##  $ BASC  : int  0 0 1 0 0 0 1 0 0 0 ...
##  $ CASC  : int  0 0 0 1 0 0 0 1 0 0 ...
##  $ HINCA : int  35 0 0 0 30 0 0 0 40 0 ...
##  $ PSIZEA: num  1 0 0 0 2 0 0 0 1 0 ...

head(clogit)

##   MODE TTME INVC INVT GC CHAIR HINC PSIZE AASC TASC BASC CASC HINCA PSIZEA
## 1    0   69   59  100 70     0   35     1    1    0    0    0    35      1
## 2    0   34   31  372 71     0   35     1    0    1    0    0     0      0
## 3    0   35   25  417 70     0   35     1    0    0    1    0     0      0
## 4    1    0   10  180 30     0   35     1    0    0    0    1     0      0
## 5    0   64   58   68 68     0   30     2    1    0    0    0    30      2
## 6    0   44   31  354 84     0   30     2    0    1    0    0     0      0

save(clogit, file = '/Users/admin/Desktop/Linh Data Studio/Transport modelling/clogit.rdata')
load('/Users/admin/Desktop/Linh Data Studio/Transport modelling/clogit.rdata')

2 The choice set for each individual

clogit$mode.ids<-factor(rep(1:4,210),labels=c("air","train","bus","car"))

3 Simplified mlogit data specification

library(mlogit)

## Loading required package: dfidx

## 
## Attaching package: 'dfidx'

## The following object is masked from 'package:stats':
## 
##     filter

CLOGIT <- mlogit.data(clogit, shape = "long",
                      choice = "MODE", alt.var = "mode.ids")

head(index(CLOGIT), 5)

## ~~~ indexes ~~~~
##   chid   alt
## 1    1   air
## 2    1 train
## 3    1   bus
## 4    1   car
## 5    2   air
## indexes:  1, 2

4 Estimating logit models

4.1 Standard Logit

# Part_1 is the “basic” specification, as discussed above. Typically, variables in this part will vary with the alternative under consideration (for example, in a mode choice model, they might describe a mode’s cost (fare) or travel times
res1 <- mlogit(MODE ~ TTME + GC, data = clogit, shape = "long",
               alt.var = "mode.ids")

summary(res1)

## 
## Call:
## mlogit(formula = MODE ~ TTME + GC, data = clogit, shape = "long", 
##     alt.var = "mode.ids", method = "nr")
## 
## Frequencies of alternatives:choice
##     air   train     bus     car 
## 0.27619 0.30000 0.14286 0.28095 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.000221 
## successive function values within tolerance limits 
## 
## Coefficients :
##                     Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):train -1.8533538  0.3700925 -5.0078 5.505e-07 ***
## (Intercept):bus   -2.5656173  0.3843251 -6.6756 2.461e-11 ***
## (Intercept):car   -5.7763487  0.6559187 -8.8065 < 2.2e-16 ***
## TTME              -0.0970904  0.0104351 -9.3042 < 2.2e-16 ***
## GC                -0.0157837  0.0043828 -3.6013 0.0003166 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -199.98
## McFadden R^2:  0.29526 
## Likelihood ratio test : chisq = 167.56 (p.value = < 2.22e-16)

res2 <- mlogit(MODE ~ TTME + GC, data = CLOGIT)

summary(res2)

## 
## Call:
## mlogit(formula = MODE ~ TTME + GC, data = CLOGIT, method = "nr")
## 
## Frequencies of alternatives:choice
##     air   train     bus     car 
## 0.27619 0.30000 0.14286 0.28095 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.000221 
## successive function values within tolerance limits 
## 
## Coefficients :
##                     Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):train -1.8533538  0.3700925 -5.0078 5.505e-07 ***
## (Intercept):bus   -2.5656173  0.3843251 -6.6756 2.461e-11 ***
## (Intercept):car   -5.7763487  0.6559187 -8.8065 < 2.2e-16 ***
## TTME              -0.0970904  0.0104351 -9.3042 < 2.2e-16 ***
## GC                -0.0157837  0.0043828 -3.6013 0.0003166 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -199.98
## McFadden R^2:  0.29526 
## Likelihood ratio test : chisq = 167.56 (p.value = < 2.22e-16)

# Part 2 variables: gia tri ve ca nhan (e.g. gioi tinh, thu nhap ho gia dinh), ko anh huong den ban chat cua Phuong tien Mode
# part_2 consists of the names of variables whose values do not vary by alternative (individual-specific variables, for example, gender or perhaps age). 

res3 <- mlogit(MODE ~ TTME + GC | HINC, data = CLOGIT)

summary(res3)

## 
## Call:
## mlogit(formula = MODE ~ TTME + GC | HINC, data = CLOGIT, method = "nr")
## 
## Frequencies of alternatives:choice
##     air   train     bus     car 
## 0.27619 0.30000 0.14286 0.28095 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.000614 
## successive function values within tolerance limits 
## 
## Coefficients :
##                     Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):train -0.3249576  0.5763335 -0.5638  0.572866    
## (Intercept):bus   -1.7445354  0.6775004 -2.5750  0.010025 *  
## (Intercept):car   -5.8747921  0.8020903 -7.3244   2.4e-13 ***
## TTME              -0.0954602  0.0104732 -9.1147 < 2.2e-16 ***
## GC                -0.0109273  0.0045878 -2.3818  0.017226 *  
## HINC:train        -0.0511880  0.0147352 -3.4739  0.000513 ***
## HINC:bus          -0.0232100  0.0162306 -1.4300  0.152712    
## HINC:car           0.0053735  0.0115294  0.4661  0.641163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -189.53
## McFadden R^2:  0.33209 
## Likelihood ratio test : chisq = 188.47 (p.value = < 2.22e-16)

# Finally, part_3 allows you to specify that variables describing characteris- tics of the alternatives have coefficients that differ across alternatives.
res4 <- mlogit(MODE ~ TTME | HINC | GC, data = CLOGIT)

summary(res4)

## 
## Call:
## mlogit(formula = MODE ~ TTME | HINC | GC, data = CLOGIT, method = "nr")
## 
## Frequencies of alternatives:choice
##     air   train     bus     car 
## 0.27619 0.30000 0.14286 0.28095 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.000795 
## successive function values within tolerance limits 
## 
## Coefficients :
##                     Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):train  1.8492221  1.0578547  1.7481 0.0804489 .  
## (Intercept):bus    0.2332096  1.2150069  0.1919 0.8477885    
## (Intercept):car   -3.5030068  1.1092306 -3.1581 0.0015883 ** 
## TTME              -0.0962846  0.0104908 -9.1780 < 2.2e-16 ***
## HINC:train        -0.0547168  0.0150964 -3.6245 0.0002895 ***
## HINC:bus          -0.0249718  0.0163743 -1.5251 0.1272444    
## HINC:car           0.0034443  0.0119876  0.2873 0.7738634    
## GC:air             0.0104705  0.0091005  1.1505 0.2499210    
## GC:train          -0.0088560  0.0050541 -1.7522 0.0797317 .  
## GC:bus            -0.0070853  0.0074225 -0.9546 0.3397949    
## GC:car            -0.0110825  0.0056119 -1.9748 0.0482870 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -184.95
## McFadden R^2:  0.34822 
## Likelihood ratio test : chisq = 197.62 (p.value = < 2.22e-16)

Note that the GC variable is now interacted with dummys for all the modes: this is possible because gc varies with alternatives, not with individuals. The bottom line is that gc’s coefficient now differs by alternative. It’s also worth noting that the significance levels have changed quite a bit, suggesting that this specification may not be appropriate (at least for this dataset).

4.2 Mixed-logit models

# res5<-mlogit(mode~ttme+gc,data=CLOGIT,reflevel="car",
#       rpar=c("air:(intercept)"="n","bus:(intercept)"="n",
#       "train:(intercept)"="n"),R=500,halton=NA,print.level=1)
# summary(res5)
# 
# # https://stackoverflow.com/questions/60713637/mlogit-in-r-coefficients-and-unknown-random-parameter

res5<-mlogit(MODE~TTME+GC,data=CLOGIT,reflevel="car",
      rpar=c("(Intercept):air"="n","(Intercept):bus"="n",
      "(Intercept):train"="n"),R=500,halton=NA,print.level=1)

## Initial value of the function : 200.021471160969 
## iteration 1, step = 0.00390625, lnL = 200.00700181, chi2 = 42.89405615
## iteration 2, step = 0.015625, lnL = 200.00530375, chi2 = 0.23847745
## iteration 3, step = 0.125, lnL = 199.73904009, chi2 = 0.77798228
## iteration 4, step = 0.25, lnL = 199.71467864, chi2 = 0.25353518
## iteration 5, step = 0.5, lnL = 199.62259894, chi2 = 2.54825399
## iteration 6, step = 1, lnL = 198.92335441, chi2 = 0.9368018
## iteration 7, step = 1, lnL = 198.21614612, chi2 = 1.17814832
## iteration 8, step = 1, lnL = 197.78031543, chi2 = 0.73589678
## iteration 9, step = 1, lnL = 197.54249151, chi2 = 0.36853162
## iteration 10, step = 1, lnL = 197.48342838, chi2 = 0.0944876
## iteration 11, step = 1, lnL = 197.47108299, chi2 = 0.0189415
## iteration 12, step = 1, lnL = 197.46807699, chi2 = 0.00507095
## iteration 13, step = 1, lnL = 197.46781515, chi2 = 0.00046435
## iteration 14, step = 1, lnL = 197.46780495, chi2 = 1.855e-05
## iteration 15, step = 1, lnL = 197.46780438, chi2 = 8.2e-07

summary(res5)

## 
## Call:
## mlogit(formula = MODE ~ TTME + GC, data = CLOGIT, reflevel = "car", 
##     rpar = c(`(Intercept):air` = "n", `(Intercept):bus` = "n", 
##         `(Intercept):train` = "n"), R = 500, halton = NA, print.level = 1)
## 
## Frequencies of alternatives:choice
##     car     air   train     bus 
## 0.28095 0.27619 0.30000 0.14286 
## 
## bfgs method
## 15 iterations, 0h:0m:8s 
## g'(-H)^-1g = 8.22E-07 
## gradient close to zero 
## 
## Coefficients :
##                        Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):air       6.1592355  1.1386852  5.4091 6.335e-08 ***
## (Intercept):train     5.1095985  0.8174536  6.2506 4.088e-10 ***
## (Intercept):bus       4.1487347  0.9042209  4.5882 4.471e-06 ***
## TTME                 -0.1144355  0.0199023 -5.7499 8.932e-09 ***
## GC                   -0.0308410  0.0077524 -3.9782 6.943e-05 ***
## sd.(Intercept):air    2.9380453  0.9163970  3.2061  0.001346 ** 
## sd.(Intercept):train  0.0382613 21.4589826  0.0018  0.998577    
## sd.(Intercept):bus    0.0016575 76.6920349  0.0000  0.999983    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -197.47
## McFadden R^2:  0.3041 
## Likelihood ratio test : chisq = 172.58 (p.value = < 2.22e-16)
## 
## random coefficients
##                   Min.  1st Qu.   Median     Mean  3rd Qu. Max.
## (Intercept):air   -Inf 4.177554 6.159236 6.159236 8.140917  Inf
## (Intercept):bus   -Inf 4.147617 4.148735 4.148735 4.149853  Inf
## (Intercept):train -Inf 5.083792 5.109599 5.109599 5.135405  Inf

# print info on what was estimated for a random parameter
rpar(res5,"(Intercept):air")

## normal distribution with parameters 6.159 (mean) and 2.938 (sd)

Note: The estimates denoted sd.air:(intercept) (etc) represent (in this case) the estimated standard deviations of the random coefficients: in this example we estimate that the air dummy is normal with mean 6:1436 and variance 2:97682 D 8: 861 3; ie N .6:1436; 8:8613/:21 Note that the results here strongly suggest that the train and bus dummys are not in fact random (because we cannot reject the null hypothesis of zero standard deviations)

Note: As we have noted, the clogit dataset is not a random sample, but choice-based. The name arises because instead of picking out people at random (say from the phone book) we sample them by surveying them at their chosen modes (say, on the bus, or at the parking lot). Thus our sample depends on the choices they actually made. Choice-based sampling may be appropriate when we wish to over-sample some lightly used alternatives in order to ensure that their characteristics are represented in the dataset, or because they are simpler to collect.

Choice-based sampling + mlogit <- this makes the likelihood function wrong (since it is based on the assumption of random sampling).

4.3 Choice-based sampling

# Let Pi_i be the sample proportion using mode i. 
# let Pi_i_1 be the population proportion using this mode. Pi_i_1 require external information. For clogit the Pi_i_1 are: air = 0.14 ; train = 0.13 ; bus = 0.09 ; and car = 0.64

Pi_i <- c(0.27619, 0.30000, 0.14286, 0.28095 ) #value from res1 (Frequencies of alternatives:choice <- air train bus car )
Pi_i_1 <- c(0.14, 0.13, 0.09, 0.64) #external values

wts_ratio <- Pi_i_1/Pi_i

wts <- rep(c(0.507, 0.433, 0.63, 2.278), 210)

res1 <- mlogit(MODE ~ TTME + GC, data = clogit, shape = "long",
               alt.var = "mode.ids", weights = wts)

summary(res1)

## 
## Call:
## mlogit(formula = MODE ~ TTME + GC, data = clogit, weights = wts, 
##     shape = "long", alt.var = "mode.ids", method = "nr")
## 
## Frequencies of alternatives:choice
##     air   train     bus     car 
## 0.27619 0.30000 0.14286 0.28095 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.000221 
## successive function values within tolerance limits 
## 
## Coefficients :
##                     Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):train -1.8533538  0.3700925 -5.0078 5.505e-07 ***
## (Intercept):bus   -2.5656173  0.3843251 -6.6756 2.461e-11 ***
## (Intercept):car   -5.7763487  0.6559187 -8.8065 < 2.2e-16 ***
## TTME              -0.0970904  0.0104351 -9.3042 < 2.2e-16 ***
## GC                -0.0157837  0.0043828 -3.6013 0.0003166 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -199.98
## McFadden R^2:  0.29526 
## Likelihood ratio test : chisq = 167.56 (p.value = < 2.22e-16)

Discrete Choice Models - Part 1

Apr 17, 2022