Package Loading

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(AER)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(mlogit)
## Loading required package: dfidx
## 
## Attaching package: 'dfidx'
## The following object is masked from 'package:stats':
## 
##     filter

Data processing

data("TravelMode", package = "AER")
head(TravelMode) # frist 6 row
##   individual  mode choice wait vcost travel gcost income size
## 1          1   air     no   69    59    100    70     35    1
## 2          1 train     no   34    31    372    71     35    1
## 3          1   bus     no   35    25    417    70     35    1
## 4          1   car    yes    0    10    180    30     35    1
## 5          2   air     no   64    58     68    68     30    2
## 6          2 train     no   44    31    354    84     30    2
nrow(TravelMode) # number of row 
## [1] 840
length(unique(TravelMode$individual)) # how many individuals?
## [1] 210
table(is.na(TravelMode)) 
## 
## FALSE 
##  7560
table(is.na(TravelMode)) #is there any missing data
## 
## FALSE 
##  7560
table(filter(TravelMode, choice == "yes")$mode) #count number of choice per vehicle
## 
##   air train   bus   car 
##    58    63    30    59
table(filter(TravelMode, choice=="yes")$mode)/(nrow(TravelMode)/4) #average frequency of choices
## 
##       air     train       bus       car 
## 0.2761905 0.3000000 0.1428571 0.2809524

Discrete Choice Model

Set Reference Group: car, constant normalize to 0 Parameters: , length of 6 constants: 3 weight on independent variables: 3

Data: TravelMode(mode,choice,gcost,wait,travel) Utility for each alternative given X,Θ Choice probability for one single individual given utility of four alternatives Choice probabilites for all the individuals Sum of LogLikelihood

Utility for each alternative given

  1. Utility for each alternative given
theta = c(1,2,3,-0.001, -0.003, -0.005) #generate one set of parameters 
sample <- filter(TravelMode, individual==1) #filter one individual 

sample$constant <- 0
sample$constant[sample$mode=="air"] = theta[1]
sample$constant[sample$mode=="train"] = theta[2]
sample$constant[sample$mode=="bus"] = theta[3]

sample$utility = theta[4]*sample$gcost + theta[5]*sample$wait + theta[6]*sample$travel + sample$constant 
sample
##   individual  mode choice wait vcost travel gcost income size constant utility
## 1          1   air     no   69    59    100    70     35    1        1   0.223
## 2          1 train     no   34    31    372    71     35    1        2  -0.033
## 3          1   bus     no   35    25    417    70     35    1        3   0.740
## 4          1   car    yes    0    10    180    30     35    1        0  -0.930

Likelihood function

  1. Choice probability for each individual given utility of four alternatives
real_choice <- filter(sample, choice == "yes")
real_choice
##   individual mode choice wait vcost travel gcost income size constant utility
## 4          1  car    yes    0    10    180    30     35    1        0   -0.93
real_choice$utility 
## [1] -0.93
exp(sample$utility)
## [1] 1.2498206 0.9675386 2.0959355 0.3945537
sum(exp(sample$utility))
## [1] 4.707848
choice_prob = exp(real_choice$utility)/sum(exp(sample$utility))
choice_prob
## [1] 0.08380765

Likelihood Function

  1. Write functions to repeat step 1& step 2 for all individuals Choice Probability Function Input: sample, a data.frame for one individual,contains utility for each alternative and real choice Output: choice probability for the individual
choice.prob<-function(sample){
  x = filter(sample,choice=="yes") #filter the choice 
  prob = exp(x$utility)/sum(exp(sample$utility)) #caculate probability
  return(prob)
}

Likelihood Function

Grouping, Operations and Aggregations(Within-across) Group the dataset by individual Compute choice probability within group Summarize choice probabilities across group group_by(),summarise(),do() function in package dplyr

group = group_by(TravelMode,individual) #group the data by individual
summarise(group,avg_travel = mean(travel)) # calculate the avg travel time for each individual
## # A tibble: 210 × 2
##    individual avg_travel
##    <fct>           <dbl>
##  1 1                267.
##  2 2                269 
##  3 3                655.
##  4 4                250.
##  5 5                399.
##  6 6                286.
##  7 7                704 
##  8 8                675 
##  9 9                275.
## 10 10               259.
## # … with 200 more rows
TravelMode$utility<-runif(nrow(TravelMode),0,1) #generate some random utilities
Probability <- TravelMode  %>% group_by(individual)  %>% do(data.frame( prob= choice.prob(.))) #apply choice.prob() function on each individuals
Probability # a data.frame, each row contains the choice probability for each individual
## # A tibble: 210 × 2
## # Groups:   individual [210]
##    individual  prob
##    <fct>      <dbl>
##  1 1          0.162
##  2 2          0.187
##  3 3          0.233
##  4 4          0.178
##  5 5          0.176
##  6 6          0.187
##  7 7          0.277
##  8 8          0.264
##  9 9          0.305
## 10 10         0.339
## # … with 200 more rows
sum(log(Probability$prob)) #sum up the log likelihoods
## [1] -298.0739

Likelihood Function

Likelihood function Input: parameter Intermediate: utility,choice probability Output: Loglikelihood

# Likelihood function for the whole data
Likelihood<-function(theta){
  # caclulating utility for each alternative for all the individuals
  TravelMode$constant=0
  TravelMode$constant[TravelMode$mode=="air"] = theta[1]
  TravelMode$constant[TravelMode$mode=="train"] = theta[2]
  TravelMode$constant[TravelMode$mode=="bus"] = theta[3]
  TravelMode$utility = theta[4]*TravelMode$gcost+theta[5]*TravelMode$wait
  +theta[6]*TravelMode$travel+TravelMode$constant 
  
  # caclulating choice probability for each individual
   Probability=TravelMode %>% 
                group_by(individual) %>% # group the data by individual
                do(data.frame(prob=choice.prob(.)))  # use Choice() function on each individual
   return(-sum(log(Probability$prob))) # return the sum log likelihood 
}

Likelihood Function: Summary Break down Likelihood functions to several different processes Grouping & Aggregating Avoid for loops

Maximum Likelihood Estimation

optim() Starting value Function to optimize optimizing method

est<-optim(c(4.0,4.0,4.0,-0.01,-0.01,-0.01),Likelihood,method = "BFGS") 
est$par
## [1]  4.00000000  4.00000000  4.00000000 -0.01063591 -0.01298183 -0.01000000

Using mlogit Package

A wrapper of all the above

# formatting data
TM <- mlogit.data(TravelMode, choice = "choice", shape = "long", 
                  chid.var = "individual", alt.var = "mode", drop.index = TRUE)
head(TM)
## ~~~~~~~
##  first 10 observations out of 840 
## ~~~~~~~
##    choice wait vcost travel gcost income size   utility    idx
## 1   FALSE   69    59    100    70     35    1 0.9553560  1:air
## 2   FALSE   34    31    372    71     35    1 0.1909535 1:rain
## 3   FALSE   35    25    417    70     35    1 0.7508079  1:bus
## 4    TRUE    0    10    180    30     35    1 0.1342728  1:car
## 5   FALSE   64    58     68    68     30    2 0.9915289  2:air
## 6   FALSE   44    31    354    84     30    2 0.3127537 2:rain
## 7   FALSE   53    25    399    85     30    2 0.3705673  2:bus
## 8    TRUE    0    11    255    50     30    2 0.2342526  2:car
## 9   FALSE   69   115    125   129     40    1 0.1904882  3:air
## 10  FALSE   34    98    892   195     40    1 0.3060156 3:rain
## 
## ~~~ indexes ~~~~
##    chid   alt
## 1     1   air
## 2     1 train
## 3     1   bus
## 4     1   car
## 5     2   air
## 6     2 train
## 7     2   bus
## 8     2   car
## 9     3   air
## 10    3 train
## indexes:  1, 2
# estimate with mlogit
ml.TM <- mlogit(choice ~ gcost +wait +travel, TM, reflevel = "car")
#show results
summary(ml.TM)
## 
## Call:
## mlogit(formula = choice ~ gcost + wait + travel, data = TM, reflevel = "car", 
##     method = "nr")
## 
## Frequencies of alternatives:choice
##     car     air   train     bus 
## 0.28095 0.27619 0.30000 0.14286 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.000159 
## successive function values within tolerance limits 
## 
## Coefficients :
##                     Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):air    4.0540450  0.8366245  4.8457 1.262e-06 ***
## (Intercept):train  3.6445988  0.4427624  8.2315 2.220e-16 ***
## (Intercept):bus    3.1957885  0.4519434  7.0712 1.536e-12 ***
## gcost             -0.0028601  0.0060976 -0.4691  0.639032    
## wait              -0.0974635  0.0103529 -9.4141 < 2.2e-16 ***
## travel            -0.0034895  0.0011489 -3.0371  0.002388 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -195
## McFadden R^2:  0.31281 
## Likelihood ratio test : chisq = 177.52 (p.value = < 2.22e-16)
# how fitted choice probability match with the data
apply(fitted(ml.TM, outcome = FALSE), 2, mean) # fitted mean choice probability 
##       car       air     train       bus 
## 0.2809524 0.2761905 0.3000000 0.1428571
ml.TM$freq/sum(ml.TM$freq)
## choice
##       car       air     train       bus 
## 0.2809524 0.2761905 0.3000000 0.1428571
ml.TM$freq/sum( ml.TM$freq) # mean choice probability in data
## choice
##       car       air     train       bus 
## 0.2809524 0.2761905 0.3000000 0.1428571

References

  1. https://rpubs.com/sallychen/313125