http://rpubs.com/sallychen/313125
install.packages(“dplyr”)
install.packages(“AER”)
install.packages(“mlogit”)
library(dplyr)
library(AER)
library(mlogit)
data("TravelMode", package = "AER") #load the data
head(TravelMode) #see the head of the data
## individual mode choice wait vcost travel gcost income size
## 1 1 air no 69 59 100 70 35 1
## 2 1 train no 34 31 372 71 35 1
## 3 1 bus no 35 25 417 70 35 1
## 4 1 car yes 0 10 180 30 35 1
## 5 2 air no 64 58 68 68 30 2
## 6 2 train no 44 31 354 84 30 2
nrow(TravelMode) #number of rows
## [1] 840
length(unique(TravelMode$individual)) #how many individuals?
## [1] 210
table(is.na(TravelMode)) #is there any missing data
##
## FALSE
## 7560
table(filter(TravelMode, choice=="yes")$mode)/(nrow(TravelMode)/4) #average frequency of choices
##
## air train bus car
## 0.2761905 0.3000000 0.1428571 0.2809524
Question: How does waiting time/travel time/general cost affect the choice of transportation?
For individual i, utility on choice j \[U_{ij} = c_j + \gamma gcost_j +\alpha wait_j+\beta travel_j + \epsilon_{ij}\] \[U_{ij} = X_{ij}\Theta + \epsilon_{ij}\]
Logit Model: close form choice probability
\[Prob_i(j) = \frac{e^{X_{ij}\Theta}}{\Sigma_j e^{X_{ij}\Theta}}\]
\[Max_{\Theta}L({\Theta|Data})\]
\[\Theta,X \rightarrow U(X,\Theta) \rightarrow Prob_i(j|\Theta,X) \rightarrow LL(\Theta|X,y)\]
theta = c(1,2,3,-0.001,-0.003,-0.005) #generate one set of parameters
sample<-filter(TravelMode,individual==1) #filter one individual
sample$constant<-0
sample$constant[sample$mode=="air"] = theta[1]
sample$constant[sample$mode=="train"] = theta[2]
sample$constant[sample$mode=="bus"] = theta[3]
sample$utility = theta[4]*sample$gcost+theta[5]*sample$wait+theta[6]*sample$travel+sample$constant
sample
## individual mode choice wait vcost travel gcost income size constant
## 1 1 air no 69 59 100 70 35 1 1
## 2 1 train no 34 31 372 71 35 1 2
## 3 1 bus no 35 25 417 70 35 1 3
## 4 1 car yes 0 10 180 30 35 1 0
## utility
## 1 0.223
## 2 -0.033
## 3 0.740
## 4 -0.930
real_choice<-filter(sample,choice=="yes")
real_choice
## individual mode choice wait vcost travel gcost income size constant
## 1 1 car yes 0 10 180 30 35 1 0
## utility
## 1 -0.93
real_choice$utility
## [1] -0.93
exp(sample$utility)
## [1] 1.2498206 0.9675386 2.0959355 0.3945537
sum(exp(sample$utility))
## [1] 4.707848
choice_prob=exp(real_choice$utility)/sum(exp(sample$utility))
choice_prob
## [1] 0.08380765
3.Write functions to repeat step 1& step 2 for all individuals
# Choice Probability Function for each individual
# Suppose we have the utility for each alternative
choice.prob<-function(sample){
x = filter(sample,choice=="yes") #filter the choice
prob = exp(x$utility)/sum(exp(sample$utility)) #caculate probability
return(prob)
}
group = group_by(TravelMode,individual) #group the data by individual
summarise(group,avg_travel = mean(travel)) # calculate the avg travel time for each individual
## # A tibble: 210 × 2
## individual avg_travel
## <fctr> <dbl>
## 1 1 267.25
## 2 2 269.00
## 3 3 654.75
## 4 4 250.25
## 5 5 399.25
## 6 6 286.50
## 7 7 704.00
## 8 8 675.00
## 9 9 274.75
## 10 10 259.25
## # ... with 200 more rows
TravelMode$utility<-runif(nrow(TravelMode),0,1) #generate some random utilities
Probability <- TravelMode %>% group_by(individual) %>% do(data.frame( prob= choice.prob(.))) #apply choice.prob() function on each individuals
Probability # a data.frame, each row contains the choice probability for each individual
## Source: local data frame [210 x 2]
## Groups: individual [210]
##
## individual prob
## <fctr> <dbl>
## 1 1 0.3283217
## 2 2 0.3111945
## 3 3 0.2916671
## 4 4 0.2173985
## 5 5 0.2300607
## 6 6 0.2476131
## 7 7 0.2592969
## 8 8 0.3179163
## 9 9 0.2390440
## 10 10 0.3821410
## # ... with 200 more rows
sum(log(Probability$prob)) #sum up the log likelihoods
## [1] -292.8526
# Likelihood function for the whole data
Likelihood<-function(theta){
# caclulating utility for each alternative for all the individuals
TravelMode$constant=0
TravelMode$constant[TravelMode$mode=="air"] = theta[1]
TravelMode$constant[TravelMode$mode=="train"] = theta[2]
TravelMode$constant[TravelMode$mode=="bus"] = theta[3]
TravelMode$utility = theta[4]*TravelMode$gcost+theta[5]*TravelMode$wait
+theta[6]*TravelMode$travel+TravelMode$constant
# caclulating choice probability for each individual
Probability=TravelMode %>%
group_by(individual) %>% # group the data by individual
do(data.frame(prob=choice.prob(.))) # use Choice() function on each individual
return(-sum(log(Probability$prob))) # return the sum log likelihood
}
est<-optim(c(4.0,4.0,4.0,-0.01,-0.01,-0.01),Likelihood,method = "BFGS")
est$par
Estimated Parameters
# formatting data
TM <- mlogit.data(TravelMode, choice = "choice", shape = "long",
chid.var = "individual", alt.var = "mode", drop.index = TRUE)
head(TM)
## choice wait vcost travel gcost income size utility
## 1.air FALSE 69 59 100 70 35 1 0.7270067
## 1.train FALSE 34 31 372 71 35 1 0.7502752
## 1.bus FALSE 35 25 417 70 35 1 0.1733003
## 1.car TRUE 0 10 180 30 35 1 0.9661005
## 2.air FALSE 64 58 68 68 30 2 0.5118723
## 2.train FALSE 44 31 354 84 30 2 0.7046743
# estimate with mlogit
ml.TM <- mlogit(choice ~ gcost +wait +travel, TM, reflevel = "car")
#show results
summary(ml.TM)
##
## Call:
## mlogit(formula = choice ~ gcost + wait + travel, data = TM, reflevel = "car",
## method = "nr", print.level = 0)
##
## Frequencies of alternatives:
## car air train bus
## 0.28095 0.27619 0.30000 0.14286
##
## nr method
## 5 iterations, 0h:0m:0s
## g'(-H)^-1g = 0.000159
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error t-value Pr(>|t|)
## air:(intercept) 4.0540450 0.8366245 4.8457 1.262e-06 ***
## train:(intercept) 3.6445988 0.4427624 8.2315 2.220e-16 ***
## bus:(intercept) 3.1957885 0.4519434 7.0712 1.536e-12 ***
## gcost -0.0028601 0.0060976 -0.4691 0.639032
## wait -0.0974635 0.0103529 -9.4141 < 2.2e-16 ***
## travel -0.0034895 0.0011489 -3.0371 0.002388 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -195
## McFadden R^2: 0.31281
## Likelihood ratio test : chisq = 177.52 (p.value = < 2.22e-16)
#how fitted choice probability match with data
apply(fitted(ml.TM, outcome=FALSE), 2, mean) # fitted mean choice probability
## car air train bus
## 0.2809524 0.2761905 0.3000000 0.1428571
ml.TM$freq/sum( ml.TM$freq) # mean choice probability in data
##
## car air train bus
## 0.2809524 0.2761905 0.3000000 0.1428571