library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(AER)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(mlogit)
## Loading required package: dfidx
##
## Attaching package: 'dfidx'
## The following object is masked from 'package:stats':
##
## filter
data("TravelMode", package = "AER")
head(TravelMode) # frist 6 row
## individual mode choice wait vcost travel gcost income size
## 1 1 air no 69 59 100 70 35 1
## 2 1 train no 34 31 372 71 35 1
## 3 1 bus no 35 25 417 70 35 1
## 4 1 car yes 0 10 180 30 35 1
## 5 2 air no 64 58 68 68 30 2
## 6 2 train no 44 31 354 84 30 2
nrow(TravelMode) # number of row
## [1] 840
length(unique(TravelMode$individual)) # how many individuals?
## [1] 210
table(is.na(TravelMode))
##
## FALSE
## 7560
table(is.na(TravelMode)) #is there any missing data
##
## FALSE
## 7560
table(filter(TravelMode, choice == "yes")$mode) #count number of choice per vehicle
##
## air train bus car
## 58 63 30 59
table(filter(TravelMode, choice=="yes")$mode)/(nrow(TravelMode)/4) #average frequency of choices
##
## air train bus car
## 0.2761905 0.3000000 0.1428571 0.2809524
Set Reference Group: car, constant normalize to 0 Parameters: , length of 6 constants: 3 weight on independent variables: 3
Data: TravelMode(mode,choice,gcost,wait,travel) Utility for each alternative given X,Θ Choice probability for one single individual given utility of four alternatives Choice probabilites for all the individuals Sum of LogLikelihood
theta = c(1,2,3,-0.001, -0.003, -0.005) #generate one set of parameters
sample <- filter(TravelMode, individual==1) #filter one individual
sample$constant <- 0
sample$constant[sample$mode=="air"] = theta[1]
sample$constant[sample$mode=="train"] = theta[2]
sample$constant[sample$mode=="bus"] = theta[3]
sample$utility = theta[4]*sample$gcost + theta[5]*sample$wait + theta[6]*sample$travel + sample$constant
sample
## individual mode choice wait vcost travel gcost income size constant utility
## 1 1 air no 69 59 100 70 35 1 1 0.223
## 2 1 train no 34 31 372 71 35 1 2 -0.033
## 3 1 bus no 35 25 417 70 35 1 3 0.740
## 4 1 car yes 0 10 180 30 35 1 0 -0.930
real_choice <- filter(sample, choice == "yes")
real_choice
## individual mode choice wait vcost travel gcost income size constant utility
## 4 1 car yes 0 10 180 30 35 1 0 -0.93
real_choice$utility
## [1] -0.93
exp(sample$utility)
## [1] 1.2498206 0.9675386 2.0959355 0.3945537
sum(exp(sample$utility))
## [1] 4.707848
choice_prob = exp(real_choice$utility)/sum(exp(sample$utility))
choice_prob
## [1] 0.08380765
choice.prob<-function(sample){
x = filter(sample,choice=="yes") #filter the choice
prob = exp(x$utility)/sum(exp(sample$utility)) #caculate probability
return(prob)
}
Grouping, Operations and Aggregations(Within-across) Group the dataset by individual Compute choice probability within group Summarize choice probabilities across group group_by(),summarise(),do() function in package dplyr
group = group_by(TravelMode,individual) #group the data by individual
summarise(group,avg_travel = mean(travel)) # calculate the avg travel time for each individual
## # A tibble: 210 × 2
## individual avg_travel
## <fct> <dbl>
## 1 1 267.
## 2 2 269
## 3 3 655.
## 4 4 250.
## 5 5 399.
## 6 6 286.
## 7 7 704
## 8 8 675
## 9 9 275.
## 10 10 259.
## # … with 200 more rows
TravelMode$utility<-runif(nrow(TravelMode),0,1) #generate some random utilities
Probability <- TravelMode %>% group_by(individual) %>% do(data.frame( prob= choice.prob(.))) #apply choice.prob() function on each individuals
Probability # a data.frame, each row contains the choice probability for each individual
## # A tibble: 210 × 2
## # Groups: individual [210]
## individual prob
## <fct> <dbl>
## 1 1 0.162
## 2 2 0.187
## 3 3 0.233
## 4 4 0.178
## 5 5 0.176
## 6 6 0.187
## 7 7 0.277
## 8 8 0.264
## 9 9 0.305
## 10 10 0.339
## # … with 200 more rows
sum(log(Probability$prob)) #sum up the log likelihoods
## [1] -298.0739
Likelihood function Input: parameter Intermediate: utility,choice probability Output: Loglikelihood
# Likelihood function for the whole data
Likelihood<-function(theta){
# caclulating utility for each alternative for all the individuals
TravelMode$constant=0
TravelMode$constant[TravelMode$mode=="air"] = theta[1]
TravelMode$constant[TravelMode$mode=="train"] = theta[2]
TravelMode$constant[TravelMode$mode=="bus"] = theta[3]
TravelMode$utility = theta[4]*TravelMode$gcost+theta[5]*TravelMode$wait
+theta[6]*TravelMode$travel+TravelMode$constant
# caclulating choice probability for each individual
Probability=TravelMode %>%
group_by(individual) %>% # group the data by individual
do(data.frame(prob=choice.prob(.))) # use Choice() function on each individual
return(-sum(log(Probability$prob))) # return the sum log likelihood
}
Likelihood Function: Summary Break down Likelihood functions to several different processes Grouping & Aggregating Avoid for loops
optim() Starting value Function to optimize optimizing method
est<-optim(c(4.0,4.0,4.0,-0.01,-0.01,-0.01),Likelihood,method = "BFGS")
est$par
## [1] 4.00000000 4.00000000 4.00000000 -0.01063591 -0.01298183 -0.01000000
A wrapper of all the above
# formatting data
TM <- mlogit.data(TravelMode, choice = "choice", shape = "long",
chid.var = "individual", alt.var = "mode", drop.index = TRUE)
head(TM)
## ~~~~~~~
## first 10 observations out of 840
## ~~~~~~~
## choice wait vcost travel gcost income size utility idx
## 1 FALSE 69 59 100 70 35 1 0.9553560 1:air
## 2 FALSE 34 31 372 71 35 1 0.1909535 1:rain
## 3 FALSE 35 25 417 70 35 1 0.7508079 1:bus
## 4 TRUE 0 10 180 30 35 1 0.1342728 1:car
## 5 FALSE 64 58 68 68 30 2 0.9915289 2:air
## 6 FALSE 44 31 354 84 30 2 0.3127537 2:rain
## 7 FALSE 53 25 399 85 30 2 0.3705673 2:bus
## 8 TRUE 0 11 255 50 30 2 0.2342526 2:car
## 9 FALSE 69 115 125 129 40 1 0.1904882 3:air
## 10 FALSE 34 98 892 195 40 1 0.3060156 3:rain
##
## ~~~ indexes ~~~~
## chid alt
## 1 1 air
## 2 1 train
## 3 1 bus
## 4 1 car
## 5 2 air
## 6 2 train
## 7 2 bus
## 8 2 car
## 9 3 air
## 10 3 train
## indexes: 1, 2
# estimate with mlogit
ml.TM <- mlogit(choice ~ gcost +wait +travel, TM, reflevel = "car")
#show results
summary(ml.TM)
##
## Call:
## mlogit(formula = choice ~ gcost + wait + travel, data = TM, reflevel = "car",
## method = "nr")
##
## Frequencies of alternatives:choice
## car air train bus
## 0.28095 0.27619 0.30000 0.14286
##
## nr method
## 5 iterations, 0h:0m:0s
## g'(-H)^-1g = 0.000159
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):air 4.0540450 0.8366245 4.8457 1.262e-06 ***
## (Intercept):train 3.6445988 0.4427624 8.2315 2.220e-16 ***
## (Intercept):bus 3.1957885 0.4519434 7.0712 1.536e-12 ***
## gcost -0.0028601 0.0060976 -0.4691 0.639032
## wait -0.0974635 0.0103529 -9.4141 < 2.2e-16 ***
## travel -0.0034895 0.0011489 -3.0371 0.002388 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -195
## McFadden R^2: 0.31281
## Likelihood ratio test : chisq = 177.52 (p.value = < 2.22e-16)
# how fitted choice probability match with the data
apply(fitted(ml.TM, outcome = FALSE), 2, mean) # fitted mean choice probability
## car air train bus
## 0.2809524 0.2761905 0.3000000 0.1428571
ml.TM$freq/sum(ml.TM$freq)
## choice
## car air train bus
## 0.2809524 0.2761905 0.3000000 0.1428571
ml.TM$freq/sum( ml.TM$freq) # mean choice probability in data
## choice
## car air train bus
## 0.2809524 0.2761905 0.3000000 0.1428571