Slides

http://rpubs.com/sallychen/313125

install.packages(“dplyr”)

install.packages(“AER”)

install.packages(“mlogit”)

library(dplyr)
library(AER)
library(mlogit)

Contents

Data

data("TravelMode", package = "AER") #load the data
head(TravelMode) #see the head of the data
##   individual  mode choice wait vcost travel gcost income size
## 1          1   air     no   69    59    100    70     35    1
## 2          1 train     no   34    31    372    71     35    1
## 3          1   bus     no   35    25    417    70     35    1
## 4          1   car    yes    0    10    180    30     35    1
## 5          2   air     no   64    58     68    68     30    2
## 6          2 train     no   44    31    354    84     30    2
nrow(TravelMode) #number of rows
## [1] 840
length(unique(TravelMode$individual)) #how many individuals?
## [1] 210
table(is.na(TravelMode)) #is there any missing data
## 
## FALSE 
##  7560
table(filter(TravelMode, choice=="yes")$mode)/(nrow(TravelMode)/4) #average frequency of choices
## 
##       air     train       bus       car 
## 0.2761905 0.3000000 0.1428571 0.2809524

Discrete Choice Model

Question: How does waiting time/travel time/general cost affect the choice of transportation?

\[Prob_i(j) = \frac{e^{X_{ij}\Theta}}{\Sigma_j e^{X_{ij}\Theta}}\]

\[Max_{\Theta}L({\Theta|Data})\]

Likelihood Function

\[\Theta,X \rightarrow U(X,\Theta) \rightarrow Prob_i(j|\Theta,X) \rightarrow LL(\Theta|X,y)\]

Likelihood Function: Example

Likelihood Function: Example

  1. Utility for each alternative given \(X,\Theta\)
theta = c(1,2,3,-0.001,-0.003,-0.005) #generate one set of parameters
sample<-filter(TravelMode,individual==1)  #filter one individual
sample$constant<-0
sample$constant[sample$mode=="air"] = theta[1]
sample$constant[sample$mode=="train"] = theta[2]
sample$constant[sample$mode=="bus"] = theta[3]
sample$utility = theta[4]*sample$gcost+theta[5]*sample$wait+theta[6]*sample$travel+sample$constant 
sample
##   individual  mode choice wait vcost travel gcost income size constant
## 1          1   air     no   69    59    100    70     35    1        1
## 2          1 train     no   34    31    372    71     35    1        2
## 3          1   bus     no   35    25    417    70     35    1        3
## 4          1   car    yes    0    10    180    30     35    1        0
##   utility
## 1   0.223
## 2  -0.033
## 3   0.740
## 4  -0.930

Likelihood Function: Example

  1. Choice probability for each individual given utility of four alternatives
real_choice<-filter(sample,choice=="yes")
real_choice
##   individual mode choice wait vcost travel gcost income size constant
## 1          1  car    yes    0    10    180    30     35    1        0
##   utility
## 1   -0.93
real_choice$utility
## [1] -0.93
exp(sample$utility)
## [1] 1.2498206 0.9675386 2.0959355 0.3945537
sum(exp(sample$utility))
## [1] 4.707848
choice_prob=exp(real_choice$utility)/sum(exp(sample$utility))
choice_prob
## [1] 0.08380765

Likelihood Function: Example

3.Write functions to repeat step 1& step 2 for all individuals

# Choice Probability Function for each individual
# Suppose we have the utility for each alternative
choice.prob<-function(sample){
  x = filter(sample,choice=="yes") #filter the choice 
  prob = exp(x$utility)/sum(exp(sample$utility)) #caculate probability
  return(prob)
}

Likelihood Function: Example

  1. Grouping, Operations and Aggregations(Within-across)
group = group_by(TravelMode,individual) #group the data by individual
summarise(group,avg_travel = mean(travel)) # calculate the avg travel time for each individual
## # A tibble: 210 × 2
##    individual avg_travel
##        <fctr>      <dbl>
## 1           1     267.25
## 2           2     269.00
## 3           3     654.75
## 4           4     250.25
## 5           5     399.25
## 6           6     286.50
## 7           7     704.00
## 8           8     675.00
## 9           9     274.75
## 10         10     259.25
## # ... with 200 more rows
TravelMode$utility<-runif(nrow(TravelMode),0,1) #generate some random utilities
Probability <- TravelMode  %>% group_by(individual)  %>% do(data.frame( prob= choice.prob(.))) #apply choice.prob() function on each individuals
Probability # a data.frame, each row contains the choice probability for each individual
## Source: local data frame [210 x 2]
## Groups: individual [210]
## 
##    individual      prob
##        <fctr>     <dbl>
## 1           1 0.3283217
## 2           2 0.3111945
## 3           3 0.2916671
## 4           4 0.2173985
## 5           5 0.2300607
## 6           6 0.2476131
## 7           7 0.2592969
## 8           8 0.3179163
## 9           9 0.2390440
## 10         10 0.3821410
## # ... with 200 more rows
sum(log(Probability$prob)) #sum up the log likelihoods
## [1] -292.8526

Likelihood Function: Example

# Likelihood function for the whole data
Likelihood<-function(theta){
  # caclulating utility for each alternative for all the individuals
  TravelMode$constant=0
  TravelMode$constant[TravelMode$mode=="air"] = theta[1]
  TravelMode$constant[TravelMode$mode=="train"] = theta[2]
  TravelMode$constant[TravelMode$mode=="bus"] = theta[3]
  TravelMode$utility = theta[4]*TravelMode$gcost+theta[5]*TravelMode$wait
  +theta[6]*TravelMode$travel+TravelMode$constant 
  
  # caclulating choice probability for each individual
   Probability=TravelMode %>% 
                group_by(individual) %>% # group the data by individual
                do(data.frame(prob=choice.prob(.)))  # use Choice() function on each individual
   return(-sum(log(Probability$prob))) # return the sum log likelihood 
}

Likelihood Function: Summary

Maximum Likelihood Estimation

est<-optim(c(4.0,4.0,4.0,-0.01,-0.01,-0.01),Likelihood,method = "BFGS") 
est$par

Estimated Parameters

Using mlogit Package

# formatting data
TM <- mlogit.data(TravelMode, choice = "choice", shape = "long", 
                  chid.var = "individual", alt.var = "mode", drop.index = TRUE)
head(TM)
##         choice wait vcost travel gcost income size   utility
## 1.air    FALSE   69    59    100    70     35    1 0.7270067
## 1.train  FALSE   34    31    372    71     35    1 0.7502752
## 1.bus    FALSE   35    25    417    70     35    1 0.1733003
## 1.car     TRUE    0    10    180    30     35    1 0.9661005
## 2.air    FALSE   64    58     68    68     30    2 0.5118723
## 2.train  FALSE   44    31    354    84     30    2 0.7046743
# estimate with mlogit
ml.TM <- mlogit(choice ~ gcost +wait +travel, TM, reflevel = "car")
#show results
summary(ml.TM)
## 
## Call:
## mlogit(formula = choice ~ gcost + wait + travel, data = TM, reflevel = "car", 
##     method = "nr", print.level = 0)
## 
## Frequencies of alternatives:
##     car     air   train     bus 
## 0.28095 0.27619 0.30000 0.14286 
## 
## nr method
## 5 iterations, 0h:0m:0s 
## g'(-H)^-1g = 0.000159 
## successive function values within tolerance limits 
## 
## Coefficients :
##                     Estimate Std. Error t-value  Pr(>|t|)    
## air:(intercept)    4.0540450  0.8366245  4.8457 1.262e-06 ***
## train:(intercept)  3.6445988  0.4427624  8.2315 2.220e-16 ***
## bus:(intercept)    3.1957885  0.4519434  7.0712 1.536e-12 ***
## gcost             -0.0028601  0.0060976 -0.4691  0.639032    
## wait              -0.0974635  0.0103529 -9.4141 < 2.2e-16 ***
## travel            -0.0034895  0.0011489 -3.0371  0.002388 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -195
## McFadden R^2:  0.31281 
## Likelihood ratio test : chisq = 177.52 (p.value = < 2.22e-16)
#how fitted choice probability match with data
apply(fitted(ml.TM, outcome=FALSE), 2, mean) # fitted mean choice probability
##       car       air     train       bus 
## 0.2809524 0.2761905 0.3000000 0.1428571
 ml.TM$freq/sum( ml.TM$freq) # mean choice probability in data
## 
##       car       air     train       bus 
## 0.2809524 0.2761905 0.3000000 0.1428571