A. ANALYSE WITH DATA OF THE CHOICE OF BUS AND MOTOR 1. Read and Create Data
t = "E:\\NGHIEN CUU SINH\\NCS - PHUONG ANH\\Part 1-Mode choice\\SO LIEU R\\Mode choice in DN.csv"
MC = read.csv(t, header = T)
head(MC)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time
## 1 6 1 0 3 3 1
## 2 3 1 0 1 4 1
## 3 3 1 0 1 4 3
## 4 3 1 0 1 4 1
## 5 3 1 0 1 4 3
## 6 4 1 0 1 4 1
## Distance Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 1 2 10 1 1 0
## 2 8 15 1 1 1
## 3 5 15 1 1 1
## 4 5 10 1 1 1
## 5 8 15 1 1 0
## 6 20 30 1 1 0
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 1 4 1 0 1 12 1
## 2 2 3 0 1 8 2
## 3 4 1 0 2 5 1
## 4 2 1 0 4 5 1
## 5 2 3 0 2 8 2
## 6 5 3 0 2 40 2
## Gender Age Occupation Income Number.of.Children Motor.Certificate
## 1 0 5 4 2 2 0
## 2 0 3 6 3 1 1
## 3 1 3 2 4 0 1
## 4 1 3 2 3 0 1
## 5 0 4 6 3 1 1
## 6 1 4 3 4 2 1
## Car.Certificate Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## 1 0 0 0 0 1
## 2 0 1 1 0 1
## 3 0 0 1 0 0
## 4 0 1 1 0 1
## 5 0 0 1 0 0
## 6 1 1 1 1 1
## Number.of.Motors Number.of.Car
## 1 2 1
## 2 3 0
## 3 3 0
## 4 3 0
## 5 2 0
## 6 2 1
str(MC)
## 'data.frame': 847 obs. of 30 variables:
## $ Travel.Mode : int 6 3 3 3 3 4 4 3 3 3 ...
## $ Bus.Stop.Condition : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Central.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose : int 3 1 1 1 1 1 1 1 1 1 ...
## $ Frequency : int 3 4 4 4 4 4 4 4 4 4 ...
## $ Departure.Time : int 1 1 3 1 3 1 1 1 2 1 ...
## $ Distance : num 2 8 5 5 8 20 15 10 12 10 ...
## $ Travel.Period : num 10 15 15 10 15 30 20 25 30 25 ...
## $ Sidewalk.Clearance : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Lane.Separate : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Temporary.Stop.Number: int 0 1 1 1 0 0 0 0 1 1 ...
## $ Mode.Choice.Reason : int 4 2 4 2 2 5 5 2 2 2 ...
## $ Weather : int 1 3 1 1 3 3 3 3 1 3 ...
## $ Weekend : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Non.Bus.Reason : int 1 1 2 4 2 2 4 2 1 4 ...
## $ Cost : int 12 8 5 5 8 40 30 10 12 10 ...
## $ Bus.Stop.Present : int 1 2 1 1 2 2 2 1 1 1 ...
## $ Gender : int 0 0 1 1 0 1 1 0 0 1 ...
## $ Age : int 5 3 3 3 4 4 5 4 3 4 ...
## $ Occupation : int 4 6 2 2 6 3 3 7 7 3 ...
## $ Income : int 2 3 4 3 3 4 4 2 3 3 ...
## $ Number.of.Children : int 2 1 0 0 1 2 2 0 1 0 ...
## $ Motor.Certificate : int 0 1 1 1 1 1 1 1 1 1 ...
## $ Car.Certificate : int 0 0 0 0 0 1 1 0 0 0 ...
## $ Bicycle.Owning : int 0 1 0 1 0 1 0 0 1 0 ...
## $ Motor.Owning : int 0 1 1 1 1 1 1 1 1 1 ...
## $ Car.Owning : int 0 0 0 0 0 1 1 0 0 0 ...
## $ Number.of.Bicycles : int 1 1 0 1 0 1 0 0 1 0 ...
## $ Number.of.Motors : int 2 3 3 3 2 2 2 2 3 3 ...
## $ Number.of.Car : int 1 0 0 0 0 1 1 0 0 0 ...
str(MC$Travel.Mode)
## int [1:847] 6 3 3 3 3 4 4 3 3 3 ...
# View data (number of rows and colume)
dim(MC)
## [1] 847 30
# Sumerize Data
summary(MC)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose
## Min. :1.000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000
## Median :3.000 Median :1.0000 Median :0.000 Median :1.000
## Mean :3.646 Mean :0.5089 Mean :0.477 Mean :1.568
## 3rd Qu.:4.000 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :7.000 Max. :1.0000 Max. :1.000 Max. :4.000
##
## Frequency Departure.Time Distance Travel.Period
## Min. :1.000 Min. :1.000 Min. : 0.015 Min. : 0.00
## 1st Qu.:4.000 1st Qu.:1.000 1st Qu.: 3.000 1st Qu.: 10.00
## Median :4.000 Median :1.000 Median : 5.000 Median : 15.00
## Mean :3.548 Mean :1.843 Mean : 6.492 Mean : 17.42
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 20.00
## Max. :4.000 Max. :4.000 Max. :35.000 Max. :180.00
##
## Sidewalk.Clearance Lane.Separate Temporary.Stop.Number Mode.Choice.Reason
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:2.000
## Median :1.0000 Median :1.0000 Median :0.0000 Median :2.000
## Mean :0.8666 Mean :0.7898 Mean :0.6824 Mean :2.713
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :1.0000 Max. :1.0000 Max. :3.0000 Max. :6.000
##
## Weather Weekend Non.Bus.Reason Cost
## Min. :1.000 Min. :0.0000 Min. :1.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.: 3.000
## Median :1.000 Median :0.0000 Median :2.000 Median : 5.000
## Mean :1.646 Mean :0.2444 Mean :2.912 Mean : 8.685
## 3rd Qu.:3.000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 10.000
## Max. :3.000 Max. :1.0000 Max. :6.000 Max. :285.000
## NA's :110
## Bus.Stop.Present Gender Age Occupation
## Min. :0.000 Min. :0.0000 Min. :1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:3.00 1st Qu.:2.000
## Median :1.000 Median :1.0000 Median :4.00 Median :3.000
## Mean :1.004 Mean :0.5832 Mean :3.56 Mean :4.152
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:4.00 3rd Qu.:7.000
## Max. :2.000 Max. :1.0000 Max. :6.00 Max. :8.000
##
## Income Number.of.Children Motor.Certificate Car.Certificate
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000
## Median :2.000 Median :1.0000 Median :1.0000 Median :0.000
## Mean :2.205 Mean :0.7674 Mean :0.8453 Mean :0.183
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :4.000 Max. :3.0000 Max. :1.0000 Max. :1.000
##
## Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.4109 Mean :0.7851 Mean :0.1405 Mean :0.7084
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :3.0000
##
## Number.of.Motors Number.of.Car
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :0.0000
## Mean :2.118 Mean :0.2456
## 3rd Qu.:3.000 3rd Qu.:0.0000
## Max. :4.000 Max. :2.0000
##
# Create dat - MotorBus (Bus và xe máy)
datMB <- MC[MC$Travel.Mode %in% c(3,7),]
head(datMB)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time
## 2 3 1 0 1 4 1
## 3 3 1 0 1 4 3
## 4 3 1 0 1 4 1
## 5 3 1 0 1 4 3
## 8 3 1 0 1 4 1
## 9 3 1 0 1 4 2
## Distance Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 2 8 15 1 1 1
## 3 5 15 1 1 1
## 4 5 10 1 1 1
## 5 8 15 1 1 0
## 8 10 25 1 1 0
## 9 12 30 1 1 1
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 2 2 3 0 1 8 2
## 3 4 1 0 2 5 1
## 4 2 1 0 4 5 1
## 5 2 3 0 2 8 2
## 8 2 3 1 2 10 1
## 9 2 1 0 1 12 1
## Gender Age Occupation Income Number.of.Children Motor.Certificate
## 2 0 3 6 3 1 1
## 3 1 3 2 4 0 1
## 4 1 3 2 3 0 1
## 5 0 4 6 3 1 1
## 8 0 4 7 2 0 1
## 9 0 3 7 3 1 1
## Car.Certificate Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## 2 0 1 1 0 1
## 3 0 0 1 0 0
## 4 0 1 1 0 1
## 5 0 0 1 0 0
## 8 0 0 1 0 0
## 9 0 1 1 0 1
## Number.of.Motors Number.of.Car
## 2 3 0
## 3 3 0
## 4 3 0
## 5 2 0
## 8 2 0
## 9 3 0
dim(datMB)
## [1] 628 30
datMB$Bus[datMB$Travel.Mode == 3] <- 0
datMB$Bus[datMB$Travel.Mode == 7] <- 1
datMB$Temporary.Stop.Number[datMB$Temporary.Stop.Number == 0] <- 0
datMB$Temporary.Stop.Number[datMB$Temporary.Stop.Number >= 1] <- 1
datMB$Weather[datMB$Weather == 1] <- 0
datMB$Weather[datMB$Weather == 2] <- 1
datMB$Weather[datMB$Weather == 3] <- 0
datMB$Number.of.Children[datMB$Number.of.Children == 0] <- 0
datMB$Number.of.Children[datMB$Number.of.Children >= 1] <- 1
datMB$Age[datMB$Age == 2] <- 1
datMB$Age[datMB$Age == 3] <- 1
datMB$Age[datMB$Age == 4] <- 2
datMB$Age[datMB$Age == 5] <- 2
datMB$Age[datMB$Age == 6] <- 3
datMB$Occupation[datMB$Occupation == 2] <- 1
datMB$Occupation[datMB$Occupation == 3] <- 2
datMB$Occupation[datMB$Occupation == 6] <- 2
datMB$Occupation[datMB$Occupation == 4] <- 3
datMB$Occupation[datMB$Occupation == 5] <- 3
datMB$Occupation[datMB$Occupation == 7] <- 4
datMB$Occupation[datMB$Occupation == 8] <- 5
head(datMB)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time
## 2 3 1 0 1 4 1
## 3 3 1 0 1 4 3
## 4 3 1 0 1 4 1
## 5 3 1 0 1 4 3
## 8 3 1 0 1 4 1
## 9 3 1 0 1 4 2
## Distance Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 2 8 15 1 1 1
## 3 5 15 1 1 1
## 4 5 10 1 1 1
## 5 8 15 1 1 0
## 8 10 25 1 1 0
## 9 12 30 1 1 1
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 2 2 0 0 1 8 2
## 3 4 0 0 2 5 1
## 4 2 0 0 4 5 1
## 5 2 0 0 2 8 2
## 8 2 0 1 2 10 1
## 9 2 0 0 1 12 1
## Gender Age Occupation Income Number.of.Children Motor.Certificate
## 2 0 1 2 3 1 1
## 3 1 1 1 4 0 1
## 4 1 1 1 3 0 1
## 5 0 2 2 3 1 1
## 8 0 2 4 2 0 1
## 9 0 1 4 3 1 1
## Car.Certificate Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## 2 0 1 1 0 1
## 3 0 0 1 0 0
## 4 0 1 1 0 1
## 5 0 0 1 0 0
## 8 0 0 1 0 0
## 9 0 1 1 0 1
## Number.of.Motors Number.of.Car Bus
## 2 3 0 0
## 3 3 0 0
## 4 3 0 0
## 5 2 0 0
## 8 2 0 0
## 9 3 0 0
str(datMB)
## 'data.frame': 628 obs. of 31 variables:
## $ Travel.Mode : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Bus.Stop.Condition : int 1 1 1 1 1 1 1 1 0 0 ...
## $ Central.Area : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Purpose : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Frequency : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Departure.Time : int 1 3 1 3 1 2 1 1 1 1 ...
## $ Distance : num 8 5 5 8 10 12 10 3 2 4 ...
## $ Travel.Period : num 15 15 10 15 25 30 25 5 5 10 ...
## $ Sidewalk.Clearance : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Lane.Separate : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Temporary.Stop.Number: num 1 1 1 0 0 1 1 0 1 0 ...
## $ Mode.Choice.Reason : int 2 4 2 2 2 2 2 2 4 3 ...
## $ Weather : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Weekend : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Non.Bus.Reason : int 1 2 4 2 2 1 4 4 4 2 ...
## $ Cost : int 8 5 5 8 10 12 10 3 2 4 ...
## $ Bus.Stop.Present : int 2 1 1 2 1 1 1 1 1 1 ...
## $ Gender : int 0 1 1 0 0 0 1 1 1 1 ...
## $ Age : num 1 1 1 2 2 1 2 1 2 1 ...
## $ Occupation : num 2 1 1 2 4 4 2 1 4 4 ...
## $ Income : int 3 4 3 3 2 3 3 2 2 1 ...
## $ Number.of.Children : num 1 0 0 1 0 1 0 0 1 0 ...
## $ Motor.Certificate : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Car.Certificate : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bicycle.Owning : int 1 0 1 0 0 1 0 1 0 0 ...
## $ Motor.Owning : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Car.Owning : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Number.of.Bicycles : int 1 0 1 0 0 1 0 1 2 0 ...
## $ Number.of.Motors : int 3 3 3 2 2 3 3 3 2 1 ...
## $ Number.of.Car : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bus : num 0 0 0 0 0 0 0 0 0 0 ...
# reformat variables in factor variables
attach(datMB)
datMB = within(datMB, {
Travel.Mode = factor(Travel.Mode, labels = c("Motorbike", "Bus"))
Bus.Stop.Condition = factor(Bus.Stop.Condition,labels = c("No", "Yes"))
Central.Area = factor(Central.Area, labels = c("No", "Yes"))
Purpose = factor(Purpose, labels = c("Work/Study", "Picking Children", "Entertainment", "Others"))
Frequency = factor(Frequency, labels = c("Once", "2 times", "3 times", "> 3 times"))
Departure.Time = factor(Departure.Time, labels = c("Morning", "Afternoon", "Evening", "Others"))
Sidewalk.Clearance = factor(Sidewalk.Clearance, labels = c("No", "Yes"))
Lane.Separate = factor(Lane.Separate, labels = c("No", "Yes"))
Temporary.Stop.Number = factor(Temporary.Stop.Number, labels = c("No", "Yes"))
Mode.Choice.Reason = factor(Mode.Choice.Reason, labels = c("Safety", "Comfortable", "Low price", "Accessibility", "Reliability", "others"))
Weather = factor(Weather, labels = c("Sunny/cool", "Rainny"))
Weekend = factor(Weekend, labels = c("No", "Yes"))
Non.Bus.Reason = factor(Non.Bus.Reason, labels = c("No Route", "Uncomfortable", "Unsafety", "Long waiting time", "Unreliability", "others"))
Bus.Stop.Present = factor(Bus.Stop.Present, labels = c("No", "Yes", "Don't know"))
Gender = factor(Gender, labels = c("Female", "Male"))
Age = factor(Age, labels = c("15-24", "25-60", ">60"))
Occupation = factor(Occupation, labels = c("Pupils/Students", "Officers/Workers", "Housewife/Unemployed", "Free labor", "Others"))
Number.of.Children = factor(Number.of.Children, labels = c("No", "Yes"))
Motor.Certificate = factor(Motor.Certificate, labels = c("No", "Yes"))
Car.Certificate = factor(Car.Certificate, labels = c("No", "Yes"))
Bicycle.Owning = factor(Bicycle.Owning, labels = c("No", "Yes"))
Motor.Owning = factor(Motor.Owning, labels = c("No", "Yes"))
Car.Owning = factor(Car.Owning, labels = c("No", "Yes"))
Number.of.Bicycles = factor(Number.of.Bicycles, labels = c("None", "1", "2", ">=3"))
Number.of.Motors = factor(Number.of.Motors, labels = c("None", "1", "2", "3", ">3"))
Number.of.Car = factor(Number.of.Car, labels = c("None", "1", ">=2"))
Income = factor(Income, labels = c("<8 millions", "(8-15) millions", "(15-25) millions", ">25 millions"))
Bus = factor(Bus, labels = c("No", "Yes"))
Distance = as.numeric(Distance)
Travel.Period = as.numeric(Travel.Period)
Cost = as.numeric(Cost)
} )
head(datMB)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose Frequency
## 2 Motorbike Yes No Work/Study > 3 times
## 3 Motorbike Yes No Work/Study > 3 times
## 4 Motorbike Yes No Work/Study > 3 times
## 5 Motorbike Yes No Work/Study > 3 times
## 8 Motorbike Yes No Work/Study > 3 times
## 9 Motorbike Yes No Work/Study > 3 times
## Departure.Time Distance Travel.Period Sidewalk.Clearance Lane.Separate
## 2 Morning 8 15 Yes Yes
## 3 Evening 5 15 Yes Yes
## 4 Morning 5 10 Yes Yes
## 5 Evening 8 15 Yes Yes
## 8 Morning 10 25 Yes Yes
## 9 Afternoon 12 30 Yes Yes
## Temporary.Stop.Number Mode.Choice.Reason Weather Weekend Non.Bus.Reason
## 2 Yes Comfortable Sunny/cool No No Route
## 3 Yes Accessibility Sunny/cool No Uncomfortable
## 4 Yes Comfortable Sunny/cool No Long waiting time
## 5 No Comfortable Sunny/cool No Uncomfortable
## 8 No Comfortable Sunny/cool Yes Uncomfortable
## 9 Yes Comfortable Sunny/cool No No Route
## Cost Bus.Stop.Present Gender Age Occupation Income
## 2 8 Don't know Female 15-24 Officers/Workers (15-25) millions
## 3 5 Yes Male 15-24 Pupils/Students >25 millions
## 4 5 Yes Male 15-24 Pupils/Students (15-25) millions
## 5 8 Don't know Female 25-60 Officers/Workers (15-25) millions
## 8 10 Yes Female 25-60 Free labor (8-15) millions
## 9 12 Yes Female 15-24 Free labor (15-25) millions
## Number.of.Children Motor.Certificate Car.Certificate Bicycle.Owning
## 2 Yes Yes No Yes
## 3 No Yes No No
## 4 No Yes No Yes
## 5 Yes Yes No No
## 8 No Yes No No
## 9 Yes Yes No Yes
## Motor.Owning Car.Owning Number.of.Bicycles Number.of.Motors Number.of.Car Bus
## 2 Yes No 1 3 None No
## 3 Yes No None 3 None No
## 4 Yes No 1 3 None No
## 5 Yes No None 2 None No
## 8 Yes No None 2 None No
## 9 Yes No 1 3 None No
str(datMB)
## 'data.frame': 628 obs. of 31 variables:
## $ Travel.Mode : Factor w/ 2 levels "Motorbike","Bus": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bus.Stop.Condition : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
## $ Central.Area : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 2 ...
## $ Purpose : Factor w/ 4 levels "Work/Study","Picking Children",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Frequency : Factor w/ 4 levels "Once","2 times",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Departure.Time : Factor w/ 4 levels "Morning","Afternoon",..: 1 3 1 3 1 2 1 1 1 1 ...
## $ Distance : num 8 5 5 8 10 12 10 3 2 4 ...
## $ Travel.Period : num 15 15 10 15 25 30 25 5 5 10 ...
## $ Sidewalk.Clearance : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Lane.Separate : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Temporary.Stop.Number: Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 2 1 2 1 ...
## $ Mode.Choice.Reason : Factor w/ 6 levels "Safety","Comfortable",..: 2 4 2 2 2 2 2 2 4 3 ...
## $ Weather : Factor w/ 2 levels "Sunny/cool","Rainny": 1 1 1 1 1 1 1 1 1 1 ...
## $ Weekend : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ Non.Bus.Reason : Factor w/ 6 levels "No Route","Uncomfortable",..: 1 2 4 2 2 1 4 4 4 2 ...
## $ Cost : num 8 5 5 8 10 12 10 3 2 4 ...
## $ Bus.Stop.Present : Factor w/ 3 levels "No","Yes","Don't know": 3 2 2 3 2 2 2 2 2 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 1 1 2 2 2 2 ...
## $ Age : Factor w/ 3 levels "15-24","25-60",..: 1 1 1 2 2 1 2 1 2 1 ...
## $ Occupation : Factor w/ 5 levels "Pupils/Students",..: 2 1 1 2 4 4 2 1 4 4 ...
## $ Income : Factor w/ 4 levels "<8 millions",..: 3 4 3 3 2 3 3 2 2 1 ...
## $ Number.of.Children : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 2 1 1 2 1 ...
## $ Motor.Certificate : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ Car.Certificate : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bicycle.Owning : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 2 1 2 1 1 ...
## $ Motor.Owning : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ Car.Owning : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Number.of.Bicycles : Factor w/ 4 levels "None","1","2",..: 2 1 2 1 1 2 1 2 3 1 ...
## $ Number.of.Motors : Factor w/ 5 levels "None","1","2",..: 4 4 4 3 3 4 4 4 3 2 ...
## $ Number.of.Car : Factor w/ 3 levels "None","1",">=2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
# Remove tiep variables : Travel.Mode (1), Car.Certificate (24), Bicycle.Owning (25), Car.Owning (27), Number.of.Bicycles (28), Number.of.Car (30)
attach(datMB)
## The following objects are masked from datMB (pos = 3):
##
## Age, Bicycle.Owning, Bus, Bus.Stop.Condition, Bus.Stop.Present,
## Car.Certificate, Car.Owning, Central.Area, Cost, Departure.Time,
## Distance, Frequency, Gender, Income, Lane.Separate,
## Mode.Choice.Reason, Motor.Certificate, Motor.Owning,
## Non.Bus.Reason, Number.of.Bicycles, Number.of.Car,
## Number.of.Children, Number.of.Motors, Occupation, Purpose,
## Sidewalk.Clearance, Temporary.Stop.Number, Travel.Mode,
## Travel.Period, Weather, Weekend
datMB <- datMB[,c(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,26,29,31)]
head(datMB)
## Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time Distance
## 2 Yes No Work/Study > 3 times Morning 8
## 3 Yes No Work/Study > 3 times Evening 5
## 4 Yes No Work/Study > 3 times Morning 5
## 5 Yes No Work/Study > 3 times Evening 8
## 8 Yes No Work/Study > 3 times Morning 10
## 9 Yes No Work/Study > 3 times Afternoon 12
## Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 2 15 Yes Yes Yes
## 3 15 Yes Yes Yes
## 4 10 Yes Yes Yes
## 5 15 Yes Yes No
## 8 25 Yes Yes No
## 9 30 Yes Yes Yes
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 2 Comfortable Sunny/cool No No Route 8 Don't know
## 3 Accessibility Sunny/cool No Uncomfortable 5 Yes
## 4 Comfortable Sunny/cool No Long waiting time 5 Yes
## 5 Comfortable Sunny/cool No Uncomfortable 8 Don't know
## 8 Comfortable Sunny/cool Yes Uncomfortable 10 Yes
## 9 Comfortable Sunny/cool No No Route 12 Yes
## Gender Age Occupation Income Number.of.Children
## 2 Female 15-24 Officers/Workers (15-25) millions Yes
## 3 Male 15-24 Pupils/Students >25 millions No
## 4 Male 15-24 Pupils/Students (15-25) millions No
## 5 Female 25-60 Officers/Workers (15-25) millions Yes
## 8 Female 25-60 Free labor (8-15) millions No
## 9 Female 15-24 Free labor (15-25) millions Yes
## Motor.Certificate Motor.Owning Number.of.Motors Bus
## 2 Yes Yes 3 No
## 3 Yes Yes 3 No
## 4 Yes Yes 3 No
## 5 Yes Yes 2 No
## 8 Yes Yes 2 No
## 9 Yes Yes 3 No
dim(datMB)
## [1] 628 25
2. Descriptive analyse
# Descriptive analyse with package "table 1" - sort by Bus variable
## install.packages("table1", dependencies = T)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1 (~ Bus.Stop.Condition + Central.Area + Purpose + Frequency + Departure.Time + Sidewalk.Clearance + Sidewalk.Clearance + Lane.Separate + Temporary.Stop.Number + Mode.Choice.Reason + Weather + Weekend + Non.Bus.Reason + Bus.Stop.Present + Gender + Age + Occupation + Income + Number.of.Children + Motor.Certificate + Motor.Owning + Number.of.Motors +Distance + Travel.Period + Cost| Bus , data = datMB)
| No (N=518) |
Yes (N=110) |
Overall (N=628) |
|
|---|---|---|---|
| Bus.Stop.Condition | |||
| No | 253 (48.8%) | 63 (57.3%) | 316 (50.3%) |
| Yes | 265 (51.2%) | 47 (42.7%) | 312 (49.7%) |
| Central.Area | |||
| No | 284 (54.8%) | 44 (40.0%) | 328 (52.2%) |
| Yes | 234 (45.2%) | 66 (60.0%) | 300 (47.8%) |
| Purpose | |||
| Work/Study | 351 (67.8%) | 80 (72.7%) | 431 (68.6%) |
| Picking Children | 69 (13.3%) | 9 (8.2%) | 78 (12.4%) |
| Entertainment | 89 (17.2%) | 14 (12.7%) | 103 (16.4%) |
| Others | 9 (1.7%) | 7 (6.4%) | 16 (2.5%) |
| Frequency | |||
| Once | 23 (4.4%) | 13 (11.8%) | 36 (5.7%) |
| 2 times | 18 (3.5%) | 19 (17.3%) | 37 (5.9%) |
| 3 times | 33 (6.4%) | 20 (18.2%) | 53 (8.4%) |
| > 3 times | 444 (85.7%) | 58 (52.7%) | 502 (79.9%) |
| Departure.Time | |||
| Morning | 328 (63.3%) | 71 (64.5%) | 399 (63.5%) |
| Afternoon | 34 (6.6%) | 12 (10.9%) | 46 (7.3%) |
| Evening | 85 (16.4%) | 20 (18.2%) | 105 (16.7%) |
| Others | 71 (13.7%) | 7 (6.4%) | 78 (12.4%) |
| Sidewalk.Clearance | |||
| No | 79 (15.3%) | 15 (13.6%) | 94 (15.0%) |
| Yes | 439 (84.7%) | 95 (86.4%) | 534 (85.0%) |
| Lane.Separate | |||
| No | 111 (21.4%) | 20 (18.2%) | 131 (20.9%) |
| Yes | 407 (78.6%) | 90 (81.8%) | 497 (79.1%) |
| Temporary.Stop.Number | |||
| No | 247 (47.7%) | 81 (73.6%) | 328 (52.2%) |
| Yes | 271 (52.3%) | 29 (26.4%) | 300 (47.8%) |
| Mode.Choice.Reason | |||
| Safety | 27 (5.2%) | 32 (29.1%) | 59 (9.4%) |
| Comfortable | 256 (49.4%) | 41 (37.3%) | 297 (47.3%) |
| Low price | 24 (4.6%) | 34 (30.9%) | 58 (9.2%) |
| Accessibility | 161 (31.1%) | 1 (0.9%) | 162 (25.8%) |
| Reliability | 42 (8.1%) | 0 (0%) | 42 (6.7%) |
| others | 8 (1.5%) | 2 (1.8%) | 10 (1.6%) |
| Weather | |||
| Sunny/cool | 516 (99.6%) | 101 (91.8%) | 617 (98.2%) |
| Rainny | 2 (0.4%) | 9 (8.2%) | 11 (1.8%) |
| Weekend | |||
| No | 403 (77.8%) | 87 (79.1%) | 490 (78.0%) |
| Yes | 115 (22.2%) | 23 (20.9%) | 138 (22.0%) |
| Non.Bus.Reason | |||
| No Route | 122 (23.6%) | 0 (0%) | 122 (19.4%) |
| Uncomfortable | 160 (30.9%) | 0 (0%) | 160 (25.5%) |
| Unsafety | 7 (1.4%) | 0 (0%) | 7 (1.1%) |
| Long waiting time | 159 (30.7%) | 0 (0%) | 159 (25.3%) |
| Unreliability | 47 (9.1%) | 0 (0%) | 47 (7.5%) |
| others | 23 (4.4%) | 0 (0%) | 23 (3.7%) |
| Missing | 0 (0%) | 110 (100%) | 110 (17.5%) |
| Bus.Stop.Present | |||
| No | 48 (9.3%) | 19 (17.3%) | 67 (10.7%) |
| Yes | 415 (80.1%) | 86 (78.2%) | 501 (79.8%) |
| Don't know | 55 (10.6%) | 5 (4.5%) | 60 (9.6%) |
| Gender | |||
| Female | 223 (43.1%) | 44 (40.0%) | 267 (42.5%) |
| Male | 295 (56.9%) | 66 (60.0%) | 361 (57.5%) |
| Age | |||
| 15-24 | 215 (41.5%) | 64 (58.2%) | 279 (44.4%) |
| 25-60 | 294 (56.8%) | 26 (23.6%) | 320 (51.0%) |
| >60 | 9 (1.7%) | 20 (18.2%) | 29 (4.6%) |
| Occupation | |||
| Pupils/Students | 165 (31.9%) | 54 (49.1%) | 219 (34.9%) |
| Officers/Workers | 153 (29.5%) | 14 (12.7%) | 167 (26.6%) |
| Housewife/Unemployed | 31 (6.0%) | 12 (10.9%) | 43 (6.8%) |
| Free labor | 139 (26.8%) | 12 (10.9%) | 151 (24.0%) |
| Others | 30 (5.8%) | 18 (16.4%) | 48 (7.6%) |
| Income | |||
| <8 millions | 114 (22.0%) | 52 (47.3%) | 166 (26.4%) |
| (8-15) millions | 236 (45.6%) | 29 (26.4%) | 265 (42.2%) |
| (15-25) millions | 129 (24.9%) | 26 (23.6%) | 155 (24.7%) |
| >25 millions | 39 (7.5%) | 3 (2.7%) | 42 (6.7%) |
| Number.of.Children | |||
| No | 224 (43.2%) | 64 (58.2%) | 288 (45.9%) |
| Yes | 294 (56.8%) | 46 (41.8%) | 340 (54.1%) |
| Motor.Certificate | |||
| No | 25 (4.8%) | 39 (35.5%) | 64 (10.2%) |
| Yes | 493 (95.2%) | 71 (64.5%) | 564 (89.8%) |
| Motor.Owning | |||
| No | 42 (8.1%) | 51 (46.4%) | 93 (14.8%) |
| Yes | 476 (91.9%) | 59 (53.6%) | 535 (85.2%) |
| Number.of.Motors | |||
| None | 5 (1.0%) | 13 (11.8%) | 18 (2.9%) |
| 1 | 68 (13.1%) | 27 (24.5%) | 95 (15.1%) |
| 2 | 262 (50.6%) | 50 (45.5%) | 312 (49.7%) |
| 3 | 147 (28.4%) | 17 (15.5%) | 164 (26.1%) |
| >3 | 36 (6.9%) | 3 (2.7%) | 39 (6.2%) |
| Distance | |||
| Mean (SD) | 6.31 (5.00) | 8.55 (4.73) | 6.70 (5.03) |
| Median [Min, Max] | 5.00 [0.200, 30.0] | 8.00 [0.500, 25.0] | 5.00 [0.200, 30.0] |
| Travel.Period | |||
| Mean (SD) | 15.8 (12.8) | 21.5 (12.4) | 16.8 (12.9) |
| Median [Min, Max] | 15.0 [0, 180] | 20.0 [3.00, 60.0] | 15.0 [0, 180] |
| Cost | |||
| Mean (SD) | 6.33 (4.98) | 5.23 (1.05) | 6.14 (4.57) |
| Median [Min, Max] | 5.00 [0, 30.0] | 5.00 [5.00, 10.0] | 5.00 [0, 30.0] |
# Descriptive analyse with package "compareGroups" - sort by Bus variable (hon table1 la cung cap them tri so p. Doi voi bien phan loai, tri so p cung duoc tinh theo Chi-Square)
## install.packages("compareGroups", dependencies = T)
library(compareGroups)
t = compareGroups(Bus ~ Bus.Stop.Condition + Central.Area + Purpose + Frequency + Departure.Time + Sidewalk.Clearance + Lane.Separate + Temporary.Stop.Number + Mode.Choice.Reason + Weather + Weekend + Non.Bus.Reason + Bus.Stop.Present + Gender + Age + Occupation + Income + Number.of.Children + Motor.Certificate + Motor.Owning + Number.of.Motors +Distance + Travel.Period + Cost, data = datMB) # 24 variables
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
createTable(t)
##
## --------Summary descriptives table by 'Bus'---------
##
## __________________________________________________________
## No Yes p.overall
## N=518 N=110
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Bus.Stop.Condition: 0.133
## No 253 (48.8%) 63 (57.3%)
## Yes 265 (51.2%) 47 (42.7%)
## Central.Area: 0.006
## No 284 (54.8%) 44 (40.0%)
## Yes 234 (45.2%) 66 (60.0%)
## Purpose: 0.019
## Work/Study 351 (67.8%) 80 (72.7%)
## Picking Children 69 (13.3%) 9 (8.18%)
## Entertainment 89 (17.2%) 14 (12.7%)
## Others 9 (1.74%) 7 (6.36%)
## Frequency: <0.001
## Once 23 (4.44%) 13 (11.8%)
## 2 times 18 (3.47%) 19 (17.3%)
## 3 times 33 (6.37%) 20 (18.2%)
## > 3 times 444 (85.7%) 58 (52.7%)
## Departure.Time: 0.091
## Morning 328 (63.3%) 71 (64.5%)
## Afternoon 34 (6.56%) 12 (10.9%)
## Evening 85 (16.4%) 20 (18.2%)
## Others 71 (13.7%) 7 (6.36%)
## Sidewalk.Clearance: 0.776
## No 79 (15.3%) 15 (13.6%)
## Yes 439 (84.7%) 95 (86.4%)
## Lane.Separate: 0.527
## No 111 (21.4%) 20 (18.2%)
## Yes 407 (78.6%) 90 (81.8%)
## Temporary.Stop.Number: <0.001
## No 247 (47.7%) 81 (73.6%)
## Yes 271 (52.3%) 29 (26.4%)
## Mode.Choice.Reason: .
## Safety 27 (5.21%) 32 (29.1%)
## Comfortable 256 (49.4%) 41 (37.3%)
## Low price 24 (4.63%) 34 (30.9%)
## Accessibility 161 (31.1%) 1 (0.91%)
## Reliability 42 (8.11%) 0 (0.00%)
## others 8 (1.54%) 2 (1.82%)
## Weather: <0.001
## Sunny/cool 516 (99.6%) 101 (91.8%)
## Rainny 2 (0.39%) 9 (8.18%)
## Weekend: 0.865
## No 403 (77.8%) 87 (79.1%)
## Yes 115 (22.2%) 23 (20.9%)
## Non.Bus.Reason: .
## No Route 122 (23.6%) 0 (.%)
## Uncomfortable 160 (30.9%) 0 (.%)
## Unsafety 7 (1.35%) 0 (.%)
## Long waiting time 159 (30.7%) 0 (.%)
## Unreliability 47 (9.07%) 0 (.%)
## others 23 (4.44%) 0 (.%)
## Bus.Stop.Present: 0.011
## No 48 (9.27%) 19 (17.3%)
## Yes 415 (80.1%) 86 (78.2%)
## Don't know 55 (10.6%) 5 (4.55%)
## Gender: 0.630
## Female 223 (43.1%) 44 (40.0%)
## Male 295 (56.9%) 66 (60.0%)
## Age: <0.001
## 15-24 215 (41.5%) 64 (58.2%)
## 25-60 294 (56.8%) 26 (23.6%)
## >60 9 (1.74%) 20 (18.2%)
## Occupation: <0.001
## Pupils/Students 165 (31.9%) 54 (49.1%)
## Officers/Workers 153 (29.5%) 14 (12.7%)
## Housewife/Unemployed 31 (5.98%) 12 (10.9%)
## Free labor 139 (26.8%) 12 (10.9%)
## Others 30 (5.79%) 18 (16.4%)
## Income: <0.001
## <8 millions 114 (22.0%) 52 (47.3%)
## (8-15) millions 236 (45.6%) 29 (26.4%)
## (15-25) millions 129 (24.9%) 26 (23.6%)
## >25 millions 39 (7.53%) 3 (2.73%)
## Number.of.Children: 0.006
## No 224 (43.2%) 64 (58.2%)
## Yes 294 (56.8%) 46 (41.8%)
## Motor.Certificate: <0.001
## No 25 (4.83%) 39 (35.5%)
## Yes 493 (95.2%) 71 (64.5%)
## Motor.Owning: <0.001
## No 42 (8.11%) 51 (46.4%)
## Yes 476 (91.9%) 59 (53.6%)
## Number.of.Motors: <0.001
## None 5 (0.97%) 13 (11.8%)
## 1 68 (13.1%) 27 (24.5%)
## 2 262 (50.6%) 50 (45.5%)
## 3 147 (28.4%) 17 (15.5%)
## >3 36 (6.95%) 3 (2.73%)
## Distance 6.31 (5.00) 8.55 (4.73) <0.001
## Travel.Period 15.8 (12.8) 21.5 (12.4) <0.001
## Cost 6.33 (4.98) 5.23 (1.05) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Remove variables with p-value > 0.05: Bus.Stop.Condition (2), Departure.Time (6), Sidewalk.Clearance (9), Lane.Separate (10), Weekend (14), Gender (18)
t1 = compareGroups(Bus ~ Central.Area + Purpose + Frequency + Temporary.Stop.Number + Mode.Choice.Reason + Weather + Non.Bus.Reason + Bus.Stop.Present + Age + Occupation + Income + Number.of.Children + Motor.Certificate + Motor.Owning + Number.of.Motors +Distance + Travel.Period + Cost, data = datMB) # 18 variables
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
createTable(t1)
##
## --------Summary descriptives table by 'Bus'---------
##
## __________________________________________________________
## No Yes p.overall
## N=518 N=110
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Central.Area: 0.006
## No 284 (54.8%) 44 (40.0%)
## Yes 234 (45.2%) 66 (60.0%)
## Purpose: 0.019
## Work/Study 351 (67.8%) 80 (72.7%)
## Picking Children 69 (13.3%) 9 (8.18%)
## Entertainment 89 (17.2%) 14 (12.7%)
## Others 9 (1.74%) 7 (6.36%)
## Frequency: <0.001
## Once 23 (4.44%) 13 (11.8%)
## 2 times 18 (3.47%) 19 (17.3%)
## 3 times 33 (6.37%) 20 (18.2%)
## > 3 times 444 (85.7%) 58 (52.7%)
## Temporary.Stop.Number: <0.001
## No 247 (47.7%) 81 (73.6%)
## Yes 271 (52.3%) 29 (26.4%)
## Mode.Choice.Reason: .
## Safety 27 (5.21%) 32 (29.1%)
## Comfortable 256 (49.4%) 41 (37.3%)
## Low price 24 (4.63%) 34 (30.9%)
## Accessibility 161 (31.1%) 1 (0.91%)
## Reliability 42 (8.11%) 0 (0.00%)
## others 8 (1.54%) 2 (1.82%)
## Weather: <0.001
## Sunny/cool 516 (99.6%) 101 (91.8%)
## Rainny 2 (0.39%) 9 (8.18%)
## Non.Bus.Reason: .
## No Route 122 (23.6%) 0 (.%)
## Uncomfortable 160 (30.9%) 0 (.%)
## Unsafety 7 (1.35%) 0 (.%)
## Long waiting time 159 (30.7%) 0 (.%)
## Unreliability 47 (9.07%) 0 (.%)
## others 23 (4.44%) 0 (.%)
## Bus.Stop.Present: 0.011
## No 48 (9.27%) 19 (17.3%)
## Yes 415 (80.1%) 86 (78.2%)
## Don't know 55 (10.6%) 5 (4.55%)
## Age: <0.001
## 15-24 215 (41.5%) 64 (58.2%)
## 25-60 294 (56.8%) 26 (23.6%)
## >60 9 (1.74%) 20 (18.2%)
## Occupation: <0.001
## Pupils/Students 165 (31.9%) 54 (49.1%)
## Officers/Workers 153 (29.5%) 14 (12.7%)
## Housewife/Unemployed 31 (5.98%) 12 (10.9%)
## Free labor 139 (26.8%) 12 (10.9%)
## Others 30 (5.79%) 18 (16.4%)
## Income: <0.001
## <8 millions 114 (22.0%) 52 (47.3%)
## (8-15) millions 236 (45.6%) 29 (26.4%)
## (15-25) millions 129 (24.9%) 26 (23.6%)
## >25 millions 39 (7.53%) 3 (2.73%)
## Number.of.Children: 0.006
## No 224 (43.2%) 64 (58.2%)
## Yes 294 (56.8%) 46 (41.8%)
## Motor.Certificate: <0.001
## No 25 (4.83%) 39 (35.5%)
## Yes 493 (95.2%) 71 (64.5%)
## Motor.Owning: <0.001
## No 42 (8.11%) 51 (46.4%)
## Yes 476 (91.9%) 59 (53.6%)
## Number.of.Motors: <0.001
## None 5 (0.97%) 13 (11.8%)
## 1 68 (13.1%) 27 (24.5%)
## 2 262 (50.6%) 50 (45.5%)
## 3 147 (28.4%) 17 (15.5%)
## >3 36 (6.95%) 3 (2.73%)
## Distance 6.31 (5.00) 8.55 (4.73) <0.001
## Travel.Period 15.8 (12.8) 21.5 (12.4) <0.001
## Cost 6.33 (4.98) 5.23 (1.05) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
3. Descriptive analyse by graphs
library(magrittr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
datMB %>%
group_by(Bus, Bus.Stop.Condition) %>%
count() %>%
ggplot(aes(Bus.Stop.Condition, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Bus Stop Condition") +
ylab("Bus") +
ggtitle("Proportion of Bus Choice ~ Bus Stop Condition")
datMB %>%
group_by(Bus, Central.Area) %>%
count() %>%
ggplot(aes(Central.Area, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Central Area") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Central Area")
datMB %>%
group_by(Bus, Purpose) %>%
count() %>%
ggplot(aes(Purpose, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Purpose of travelling") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Purpose of Travelling")
datMB %>%
group_by(Bus, Frequency) %>%
count() %>%
ggplot(aes(Frequency, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Frequency of Travelling") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Frequency of Travelling")
datMB %>%
group_by(Bus, Departure.Time) %>%
count() %>%
ggplot(aes(Departure.Time, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Departure time of travel") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Departure Time of Travel")
datMB %>%
group_by(Bus, Sidewalk.Clearance) %>%
count() %>%
ggplot(aes(Sidewalk.Clearance, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Sidewalk Clearance of Roads") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Sidewalk Clearance of Roads")
datMB %>%
group_by(Bus, Lane.Separate) %>%
count() %>%
ggplot(aes(Lane.Separate, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Lane Separate") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Lane Separate of Roads")
datMB %>%
group_by(Bus, Temporary.Stop.Number) %>%
count() %>%
ggplot(aes(Temporary.Stop.Number, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("The number of temporary stops") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ The number of temporary stops")
datMB %>%
group_by(Bus, Mode.Choice.Reason) %>%
count() %>%
ggplot(aes(Mode.Choice.Reason, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("The reason of motor mode choice") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Motor Choice Reason")
datMB %>%
group_by(Bus, Weather) %>%
count() %>%
ggplot(aes(Weather, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Weather condition") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Weather condition")
datMB %>%
group_by(Bus, Weekend) %>%
count() %>%
ggplot(aes(Weekend, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Weekend") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Weekend")
datMB %>%
group_by(Bus, Bus.Stop.Present) %>%
count() %>%
ggplot(aes(Bus.Stop.Present, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("The presence of bus stop") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ The presence of bus stop")
datMB %>%
group_by(Bus, Gender) %>%
count() %>%
ggplot(aes(Gender, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Gender") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Gender")
datMB %>%
group_by(Bus, Age) %>%
count() %>%
ggplot(aes(Age, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Age") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Age")
datMB %>%
group_by(Bus, Occupation) %>%
count() %>%
ggplot(aes(Occupation, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Occupation") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Occupation")
datMB %>%
group_by(Bus, Income) %>%
count() %>%
ggplot(aes(Income, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Income") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Income")
datMB %>%
group_by(Bus, Number.of.Children) %>%
count() %>%
ggplot(aes(Number.of.Children, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Number of Children") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Number of Children in family")
datMB %>%
group_by(Bus, Motor.Certificate) %>%
count() %>%
ggplot(aes(Motor.Certificate, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Motor Certificate") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Motor certificate")
datMB %>%
group_by(Bus, Motor.Owning) %>%
count() %>%
ggplot(aes(Motor.Owning, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Motor Owning") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Motor Owning")
datMB %>%
group_by(Bus, Number.of.Motors) %>%
count() %>%
ggplot(aes(Number.of.Motors, n, fill = Bus)) +
geom_col(position = "fill") +
xlab("Number of Motors") +
ylab("Bus Choice") +
ggtitle("Proportion of Bus Choice ~ Number of Motors")
## Travel Mode Choice ~ Countinuous Variables (Distance, Travel Period and Cost)
ggplot(datMB, aes(x=Distance, fill = Bus, color = Bus)) +
geom_histogram (position = "dodge") +
xlab("Distance") +
ylab("Count") +
ggtitle("Histogram of Distance")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(datMB, aes(x=Travel.Period, fill = Bus, color = Bus)) +
geom_histogram (position = "dodge") +
xlab("Travel Period") +
ylab("Count") +
ggtitle("Histogram of Travel Period")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(datMB, aes(x=Cost, fill = Bus, color = Bus)) +
geom_histogram (position = "dodge") +
xlab("Cost") +
ylab("Count") +
ggtitle("Histogram of Cost")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Correlation in continuous variables
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
CVBus = data.frame(datMB$Bus, datMB$Distance, datMB$Travel.Period, datMB$Cost)
pairs.panels(CVBus)
CVBus1 = data_frame(datMB$Bus.Stop.Condition, datMB$Central.Area, datMB$Purpose, datMB$Frequency, datMB$Departure.Time, datMB$Sidewalk.Clearance, datMB$Lane.Separate, datMB$Temporary.Stop.Number, datMB$Weather, datMB$Weekend)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
CVBus2 = data_frame(datMB$Bus.Stop.Present, datMB$Gender, datMB$Age, datMB$Occupation, datMB$Income, datMB$Number.of.Children, datMB$Motor.Certificate, datMB$Motor.Owning, datMB$Number.of.Motors)
pairs.panels(CVBus1)
pairs.panels(CVBus2)
## Statistic Analysis by boxplot (continuous variales)
datMB %>%
group_by(Distance, Bus) %>%
count() %>%
ggplot(aes(x = Bus, y = Distance, fill = Bus)) +
geom_boxplot() +
xlab("Bus Choice") +
ylab("Distance") +
ggtitle("Boxplot of Bus Choice ~ Distance")
datMB %>%
group_by(Cost, Bus) %>%
count() %>%
ggplot(aes(x = Bus, y = Cost, fill = Bus)) +
geom_boxplot() +
xlab("Bus Choice") +
ylab("Cost") +
ggtitle("Boxplot of Bus Choice ~ Cost")
datMB %>%
group_by(Travel.Period, Bus) %>%
count() %>%
ggplot(aes(x = Bus, y = Travel.Period, fill = Bus)) +
geom_boxplot() +
xlab("Bus Choice") +
ylab("Travel Period") +
ggtitle("Boxplot of Bus Choice ~ Travel Period")
### Boxplot of Distance, Travel Period and Cost ~ Gender
datMB %>%
group_by(Distance, Gender) %>%
count() %>%
ggplot(aes(x = Gender, y = Distance, fill = Gender)) +
geom_boxplot() +
xlab("Gender") +
ylab("Distance") +
ggtitle("Boxplot of Distance ~ Gender")
datMB %>%
group_by(Cost, Gender) %>%
count() %>%
ggplot(aes(x = Gender, y = Cost, fill = Gender)) +
geom_boxplot() +
xlab("Gender") +
ylab("Cost") +
ggtitle("Boxplot of Cost ~ Gender")
datMB %>%
group_by(Travel.Period, Gender) %>%
count() %>%
ggplot(aes(x = Gender, y = Travel.Period, fill = Gender)) +
geom_boxplot() +
xlab("Gender") +
ylab("Travel Period") +
ggtitle("Boxplot of Travel Period ~ Gender")
### Boxplot of Distance, Travel Period and Cost ~ Occupation
datMB %>%
group_by(Distance, Occupation) %>%
count() %>%
ggplot(aes(x = Occupation, y = Distance, fill = Occupation)) +
geom_boxplot() +
xlab("Occupation") +
ylab("Distance") +
ggtitle("Boxplot of Distance ~ Occupation")
datMB %>%
group_by(Cost, Occupation) %>%
count() %>%
ggplot(aes(x = Occupation, y = Cost, fill = Occupation)) +
geom_boxplot() +
xlab("Occupation") +
ylab("Cost") +
ggtitle("Boxplot of Cost ~ Occupation")
datMB %>%
group_by(Travel.Period, Occupation) %>%
count() %>%
ggplot(aes(x = Occupation, y = Travel.Period, fill = Occupation)) +
geom_boxplot() +
xlab("Occupation") +
ylab("Travel Period") +
ggtitle("Boxplot of Travel Period ~ Occupation")
summary(datMB)
## Bus.Stop.Condition Central.Area Purpose Frequency
## No :316 No :328 Work/Study :431 Once : 36
## Yes:312 Yes:300 Picking Children: 78 2 times : 37
## Entertainment :103 3 times : 53
## Others : 16 > 3 times:502
##
##
##
## Departure.Time Distance Travel.Period Sidewalk.Clearance
## Morning :399 Min. : 0.200 Min. : 0.0 No : 94
## Afternoon: 46 1st Qu.: 3.000 1st Qu.: 10.0 Yes:534
## Evening :105 Median : 5.000 Median : 15.0
## Others : 78 Mean : 6.703 Mean : 16.8
## 3rd Qu.:10.000 3rd Qu.: 20.0
## Max. :30.000 Max. :180.0
##
## Lane.Separate Temporary.Stop.Number Mode.Choice.Reason Weather
## No :131 No :328 Safety : 59 Sunny/cool:617
## Yes:497 Yes:300 Comfortable :297 Rainny : 11
## Low price : 58
## Accessibility:162
## Reliability : 42
## others : 10
##
## Weekend Non.Bus.Reason Cost Bus.Stop.Present
## No :490 No Route :122 Min. : 0.000 No : 67
## Yes:138 Uncomfortable :160 1st Qu.: 3.000 Yes :501
## Unsafety : 7 Median : 5.000 Don't know: 60
## Long waiting time:159 Mean : 6.139
## Unreliability : 47 3rd Qu.: 8.000
## others : 23 Max. :30.000
## NA's :110
## Gender Age Occupation Income
## Female:267 15-24:279 Pupils/Students :219 <8 millions :166
## Male :361 25-60:320 Officers/Workers :167 (8-15) millions :265
## >60 : 29 Housewife/Unemployed: 43 (15-25) millions:155
## Free labor :151 >25 millions : 42
## Others : 48
##
##
## Number.of.Children Motor.Certificate Motor.Owning Number.of.Motors Bus
## No :288 No : 64 No : 93 None: 18 No :518
## Yes:340 Yes:564 Yes:535 1 : 95 Yes:110
## 2 :312
## 3 :164
## >3 : 39
##
##
3. Binary Model
# Model with multi-variables (remove variables with p-value>0,05 in t1) --> 19 variables. Bus.Stop.Condition (2), Departure.Time (6), Sidewalk.Clearance (9), Lane.Separate (10), Weekend (14), Gender (18) - Besides remove variables: Mode.Choice.Reason (12), Non.Bus.Reason (15) -> 16 variables.
mg = glm(Bus ~ Central.Area + Purpose + Frequency + Temporary.Stop.Number + Weather + Bus.Stop.Present + Age + Occupation + Income + Number.of.Children + Motor.Certificate + Motor.Owning + Number.of.Motors +Distance + Travel.Period + Cost, family = binomial, data = datMB)
mg
##
## Call: glm(formula = Bus ~ Central.Area + Purpose + Frequency + Temporary.Stop.Number +
## Weather + Bus.Stop.Present + Age + Occupation + Income +
## Number.of.Children + Motor.Certificate + Motor.Owning + Number.of.Motors +
## Distance + Travel.Period + Cost, family = binomial, data = datMB)
##
## Coefficients:
## (Intercept) Central.AreaYes
## 3.070156 0.565545
## PurposePicking Children PurposeEntertainment
## -1.232672 -2.066949
## PurposeOthers Frequency2 times
## 0.172665 2.121711
## Frequency3 times Frequency> 3 times
## 0.633608 -0.951016
## Temporary.Stop.NumberYes WeatherRainny
## -1.239462 2.532484
## Bus.Stop.PresentYes Bus.Stop.PresentDon't know
## -0.746232 -1.769581
## Age25-60 Age>60
## -0.230899 2.825977
## OccupationOfficers/Workers OccupationHousewife/Unemployed
## -0.003997 0.908776
## OccupationFree labor OccupationOthers
## -1.125910 1.320413
## Income(8-15) millions Income(15-25) millions
## -1.015551 -0.641152
## Income>25 millions Number.of.ChildrenYes
## -2.800307 -0.805410
## Motor.CertificateYes Motor.OwningYes
## -1.411194 -2.095355
## Number.of.Motors1 Number.of.Motors2
## 0.876734 0.424273
## Number.of.Motors3 Number.of.Motors>3
## -0.672451 -0.381830
## Distance Travel.Period
## 0.876821 0.010150
## Cost
## -0.917914
##
## Degrees of Freedom: 627 Total (i.e. Null); 597 Residual
## Null Deviance: 582.8
## Residual Deviance: 206.3 AIC: 268.3
summary(mg)
##
## Call:
## glm(formula = Bus ~ Central.Area + Purpose + Frequency + Temporary.Stop.Number +
## Weather + Bus.Stop.Present + Age + Occupation + Income +
## Number.of.Children + Motor.Certificate + Motor.Owning + Number.of.Motors +
## Distance + Travel.Period + Cost, family = binomial, data = datMB)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2620 -0.2679 -0.1383 -0.0540 3.4652
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.070156 1.613146 1.903 0.05701 .
## Central.AreaYes 0.565545 0.419960 1.347 0.17809
## PurposePicking Children -1.232672 0.817235 -1.508 0.13147
## PurposeEntertainment -2.066949 0.669445 -3.088 0.00202 **
## PurposeOthers 0.172665 1.074156 0.161 0.87229
## Frequency2 times 2.121711 0.984158 2.156 0.03109 *
## Frequency3 times 0.633608 0.972582 0.651 0.51474
## Frequency> 3 times -0.951016 0.791240 -1.202 0.22939
## Temporary.Stop.NumberYes -1.239462 0.433266 -2.861 0.00423 **
## WeatherRainny 2.532484 1.322140 1.915 0.05544 .
## Bus.Stop.PresentYes -0.746232 0.606945 -1.229 0.21889
## Bus.Stop.PresentDon't know -1.769581 0.961864 -1.840 0.06581 .
## Age25-60 -0.230899 0.737307 -0.313 0.75415
## Age>60 2.825977 1.113357 2.538 0.01114 *
## OccupationOfficers/Workers -0.003997 0.776038 -0.005 0.99589
## OccupationHousewife/Unemployed 0.908776 1.083856 0.838 0.40177
## OccupationFree labor -1.125910 0.788097 -1.429 0.15311
## OccupationOthers 1.320413 0.937865 1.408 0.15916
## Income(8-15) millions -1.015551 0.490945 -2.069 0.03859 *
## Income(15-25) millions -0.641152 0.547466 -1.171 0.24155
## Income>25 millions -2.800307 1.458481 -1.920 0.05486 .
## Number.of.ChildrenYes -0.805410 0.476603 -1.690 0.09105 .
## Motor.CertificateYes -1.411194 0.605329 -2.331 0.01974 *
## Motor.OwningYes -2.095355 0.541903 -3.867 0.00011 ***
## Number.of.Motors1 0.876734 1.263569 0.694 0.48777
## Number.of.Motors2 0.424273 1.240399 0.342 0.73232
## Number.of.Motors3 -0.672451 1.319583 -0.510 0.61034
## Number.of.Motors>3 -0.381830 1.638538 -0.233 0.81574
## Distance 0.876821 0.128842 6.805 1.01e-11 ***
## Travel.Period 0.010150 0.014937 0.680 0.49680
## Cost -0.917914 0.132180 -6.944 3.80e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 206.29 on 597 degrees of freedom
## AIC: 268.29
##
## Number of Fisher Scoring iterations: 7
# Model with multi-variables (remove variables with p-value>0,05 in mg): Central.Area (3), Bus.Stop.Present (17), Occupation (20), Number.of.Motors (29), Travel.Period (8), Weather (13) --> 10 variables
mg1 = glm(Bus ~ Purpose + Frequency + Temporary.Stop.Number + Age + Income + Number.of.Children + Motor.Certificate + Motor.Owning + Distance + Cost, family = binomial, data = datMB)
mg1
##
## Call: glm(formula = Bus ~ Purpose + Frequency + Temporary.Stop.Number +
## Age + Income + Number.of.Children + Motor.Certificate + Motor.Owning +
## Distance + Cost, family = binomial, data = datMB)
##
## Coefficients:
## (Intercept) PurposePicking Children PurposeEntertainment
## 3.0872 -1.0713 -1.8459
## PurposeOthers Frequency2 times Frequency3 times
## 0.5242 1.1266 0.4545
## Frequency> 3 times Temporary.Stop.NumberYes Age25-60
## -1.4192 -1.0269 -0.2575
## Age>60 Income(8-15) millions Income(15-25) millions
## 3.3903 -1.1220 -0.9255
## Income>25 millions Number.of.ChildrenYes Motor.CertificateYes
## -3.0608 -0.3426 -1.2884
## Motor.OwningYes Distance Cost
## -2.1378 0.9014 -0.9209
##
## Degrees of Freedom: 627 Total (i.e. Null); 610 Residual
## Null Deviance: 582.8
## Residual Deviance: 233.9 AIC: 269.9
summary(mg1)
##
## Call:
## glm(formula = Bus ~ Purpose + Frequency + Temporary.Stop.Number +
## Age + Income + Number.of.Children + Motor.Certificate + Motor.Owning +
## Distance + Cost, family = binomial, data = datMB)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4037 -0.3092 -0.1858 -0.0852 3.3956
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.0872 0.8730 3.536 0.000406 ***
## PurposePicking Children -1.0713 0.7518 -1.425 0.154160
## PurposeEntertainment -1.8459 0.6046 -3.053 0.002265 **
## PurposeOthers 0.5242 0.8808 0.595 0.551783
## Frequency2 times 1.1266 0.8800 1.280 0.200455
## Frequency3 times 0.4545 0.8299 0.548 0.583885
## Frequency> 3 times -1.4192 0.7168 -1.980 0.047698 *
## Temporary.Stop.NumberYes -1.0269 0.3879 -2.647 0.008113 **
## Age25-60 -0.2575 0.4559 -0.565 0.572229
## Age>60 3.3903 0.8730 3.883 0.000103 ***
## Income(8-15) millions -1.1220 0.4251 -2.640 0.008300 **
## Income(15-25) millions -0.9255 0.4920 -1.881 0.059960 .
## Income>25 millions -3.0608 1.3231 -2.313 0.020706 *
## Number.of.ChildrenYes -0.3426 0.3992 -0.858 0.390684
## Motor.CertificateYes -1.2884 0.5358 -2.404 0.016196 *
## Motor.OwningYes -2.1378 0.4664 -4.584 4.57e-06 ***
## Distance 0.9014 0.1146 7.867 3.65e-15 ***
## Cost -0.9209 0.1199 -7.680 1.59e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 233.87 on 610 degrees of freedom
## AIC: 269.87
##
## Number of Fisher Scoring iterations: 7
## Way 2: Use function lmr in package "rms" - library(rms) - ml = lrm(Bus ~ Central.Area + Purpose + Frequency + Temporary.Stop.Number + Mode.Choice.Reason + Weather + Non.Bus.Reason + Bus.Stop.Present + Age + Occupation + Income + Number.of.Children + Motor.Certificate + Motor.Owning + Number.of.Motors +Distance + Travel.Period + Cost) - ml
# Find Parsimonious model - Tìm mô hình tối ưu
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.5-5)
head(datMB)
## Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time Distance
## 2 Yes No Work/Study > 3 times Morning 8
## 3 Yes No Work/Study > 3 times Evening 5
## 4 Yes No Work/Study > 3 times Morning 5
## 5 Yes No Work/Study > 3 times Evening 8
## 8 Yes No Work/Study > 3 times Morning 10
## 9 Yes No Work/Study > 3 times Afternoon 12
## Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 2 15 Yes Yes Yes
## 3 15 Yes Yes Yes
## 4 10 Yes Yes Yes
## 5 15 Yes Yes No
## 8 25 Yes Yes No
## 9 30 Yes Yes Yes
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 2 Comfortable Sunny/cool No No Route 8 Don't know
## 3 Accessibility Sunny/cool No Uncomfortable 5 Yes
## 4 Comfortable Sunny/cool No Long waiting time 5 Yes
## 5 Comfortable Sunny/cool No Uncomfortable 8 Don't know
## 8 Comfortable Sunny/cool Yes Uncomfortable 10 Yes
## 9 Comfortable Sunny/cool No No Route 12 Yes
## Gender Age Occupation Income Number.of.Children
## 2 Female 15-24 Officers/Workers (15-25) millions Yes
## 3 Male 15-24 Pupils/Students >25 millions No
## 4 Male 15-24 Pupils/Students (15-25) millions No
## 5 Female 25-60 Officers/Workers (15-25) millions Yes
## 8 Female 25-60 Free labor (8-15) millions No
## 9 Female 15-24 Free labor (15-25) millions Yes
## Motor.Certificate Motor.Owning Number.of.Motors Bus
## 2 Yes Yes 3 No
## 3 Yes Yes 3 No
## 4 Yes Yes 3 No
## 5 Yes Yes 2 No
## 8 Yes Yes 2 No
## 9 Yes Yes 3 No
dim(datMB)
## [1] 628 25
str(datMB)
## 'data.frame': 628 obs. of 25 variables:
## $ Bus.Stop.Condition : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
## $ Central.Area : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 2 ...
## $ Purpose : Factor w/ 4 levels "Work/Study","Picking Children",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Frequency : Factor w/ 4 levels "Once","2 times",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Departure.Time : Factor w/ 4 levels "Morning","Afternoon",..: 1 3 1 3 1 2 1 1 1 1 ...
## $ Distance : num 8 5 5 8 10 12 10 3 2 4 ...
## $ Travel.Period : num 15 15 10 15 25 30 25 5 5 10 ...
## $ Sidewalk.Clearance : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Lane.Separate : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Temporary.Stop.Number: Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 2 1 2 1 ...
## $ Mode.Choice.Reason : Factor w/ 6 levels "Safety","Comfortable",..: 2 4 2 2 2 2 2 2 4 3 ...
## $ Weather : Factor w/ 2 levels "Sunny/cool","Rainny": 1 1 1 1 1 1 1 1 1 1 ...
## $ Weekend : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ Non.Bus.Reason : Factor w/ 6 levels "No Route","Uncomfortable",..: 1 2 4 2 2 1 4 4 4 2 ...
## $ Cost : num 8 5 5 8 10 12 10 3 2 4 ...
## $ Bus.Stop.Present : Factor w/ 3 levels "No","Yes","Don't know": 3 2 2 3 2 2 2 2 2 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 1 1 2 2 2 2 ...
## $ Age : Factor w/ 3 levels "15-24","25-60",..: 1 1 1 2 2 1 2 1 2 1 ...
## $ Occupation : Factor w/ 5 levels "Pupils/Students",..: 2 1 1 2 4 4 2 1 4 4 ...
## $ Income : Factor w/ 4 levels "<8 millions",..: 3 4 3 3 2 3 3 2 2 1 ...
## $ Number.of.Children : Factor w/ 2 levels "No","Yes": 2 1 1 2 1 2 1 1 2 1 ...
## $ Motor.Certificate : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ Motor.Owning : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ Number.of.Motors : Factor w/ 5 levels "None","1","2",..: 4 4 4 3 3 4 4 4 3 2 ...
## $ Bus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## all variables, remove 2 variables: Mode.Choice.Reason and Non.Bus.Reason --> 24 variables
xvars <- datMB[,c(1,2,3,4,5,8,9,13,10,12,16,17,18,19,20,21,22,23,24,6,7,15)]
y <- Bus
bma.search <- bic.glm(xvars, y, strict = F, OR = 20, glm.family = "binomial")
summary(bma.search)
##
## Call:
## bic.glm.data.frame(x = xvars, y = y, glm.family = "binomial", strict = F, OR = 20)
##
##
## 10 models were selected
## Best 5 models (cumulative posterior probability = 0.8602 ):
##
## p!=0 EV SD model 1
## Intercept 100 1.38731 0.7215 1.6243
## Bus.Stop.Condition 83.2
## .Yes -0.94588 0.5411 -1.1883
## Central.Area 0.0
## .Yes 0.00000 0.0000 .
## Purpose 0.0
## .Picking Children 0.00000 0.0000 .
## .Entertainment 0.00000 0.0000 .
## .Others 0.00000 0.0000 .
## Frequency 64.6
## .2 times 0.95031 0.9546 1.4941
## .3 times 0.14306 0.6259 0.1221
## .> 3 times -0.57715 0.6554 -0.9287
## Departure.Time 0.0
## .Afternoon 0.00000 0.0000 .
## .Evening 0.00000 0.0000 .
## .Others 0.00000 0.0000 .
## Sidewalk.Clearance 0.0
## .Yes 0.00000 0.0000 .
## Lane.Separate 0.0
## .Yes 0.00000 0.0000 .
## Weekend 0.0
## .Yes 0.00000 0.0000 .
## Temporary.Stop.Number 100.0
## .Yes -1.34423 0.3662 -1.4018
## Weather 39.9
## .Rainny 1.21275 1.6779 3.0692
## Bus.Stop.Present 0.0
## .Yes 0.00000 0.0000 .
## .Don't know 0.00000 0.0000 .
## Gender 0.0
## .Male 0.00000 0.0000 .
## Age 100.0
## .25-60 -0.81142 0.3863 -0.8205
## .>60 2.95947 0.7167 3.0598
## Occupation 0.0
## .Officers/Workers 0.00000 0.0000 .
## .Housewife/Unemployed 0.00000 0.0000 .
## .Free labor 0.00000 0.0000 .
## .Others 0.00000 0.0000 .
## Income 11.0
## .(8-15) millions -0.13621 0.4088 .
## .(15-25) millions -0.10695 0.3396 .
## .>25 millions -0.36119 1.1031 .
## Number.of.Children 0.0
## .Yes 0.00000 0.0000 .
## Motor.Certificate 5.3
## .Yes -0.06683 0.3005 .
## Motor.Owning 100.0
## .Yes -2.47338 0.4071 -2.4756
## Number.of.Motors 0.0
## .1 0.00000 0.0000 .
## .2 0.00000 0.0000 .
## .3 0.00000 0.0000 .
## .>3 0.00000 0.0000 .
## Distance 100.0 0.87811 0.1166 0.8547
## Travel.Period 0.0 0.00000 0.0000 .
## Cost 100.0 -0.91926 0.1217 -0.9098
##
## nVar 8
## BIC -3719.1690
## post prob 0.343
## model 2 model 3 model 4 model 5
## Intercept 1.4797 1.0001 1.8399 1.1249
## Bus.Stop.Condition
## .Yes -1.1111 -1.0663 -1.1234 .
## Central.Area
## .Yes . . . .
## Purpose
## .Picking Children . . . .
## .Entertainment . . . .
## .Others . . . .
## Frequency
## .2 times 1.4770 . . .
## .3 times 0.3490 . . .
## .> 3 times -0.8570 . . .
## Departure.Time
## .Afternoon . . . .
## .Evening . . . .
## .Others . . . .
## Sidewalk.Clearance
## .Yes . . . .
## Lane.Separate
## .Yes . . . .
## Weekend
## .Yes . . . .
## Temporary.Stop.Number
## .Yes -1.2821 -1.3164 -1.3633 -1.2977
## Weather
## .Rainny . . . .
## Bus.Stop.Present
## .Yes . . . .
## .Don't know . . . .
## Gender
## .Male . . . .
## Age
## .25-60 -0.8394 -0.8675 -0.7296 -0.7221
## .>60 2.9715 2.9792 3.1317 2.6518
## Occupation
## .Officers/Workers . . . .
## .Housewife/Unemployed . . . .
## .Free labor . . . .
## .Others . . . .
## Income
## .(8-15) millions . . -1.2668 .
## .(15-25) millions . . -0.9489 .
## .>25 millions . . -3.2982 .
## Number.of.Children
## .Yes . . . .
## Motor.Certificate
## .Yes . . . -1.2614
## Motor.Owning
## .Yes -2.4717 -2.5362 -2.6858 -2.0477
## Number.of.Motors
## .1 . . . .
## .2 . . . .
## .3 . . . .
## .>3 . . . .
## Distance 0.8785 0.9027 0.9739 0.8622
## Travel.Period . . . .
## Cost -0.9204 -0.9365 -1.0008 -0.8805
##
## nVar 7 6 7 6
## BIC -3718.2087 -3717.7152 -3716.3825 -3715.4308
## post prob 0.212 0.166 0.085 0.053
imageplot.bma(bma.search)
## Remove insignificant variables: remove variables unsignificant in t1 --> 16 variables
xvars1 <- datMB[,c(2,3,4,10,12,16,18,19,20,21,22,23,24,6,7,15)]
y1 <- Bus
bma.search1 <- bic.glm(xvars1, y1, strict = F, OR = 20, glm.family = "binomial")
summary(bma.search1)
##
## Call:
## bic.glm.data.frame(x = xvars1, y = y1, glm.family = "binomial", strict = F, OR = 20)
##
##
## 17 models were selected
## Best 5 models (cumulative posterior probability = 0.5917 ):
##
## p!=0 EV SD model 1
## Intercept 100 0.97293 0.7048 1.125e+00
## Central.Area 11.5
## .Yes 0.08340 0.2609 .
## Purpose 0.0
## .Picking Children 0.00000 0.0000 .
## .Entertainment 0.00000 0.0000 .
## .Others 0.00000 0.0000 .
## Frequency 39.0
## .2 times 0.52127 0.8175 .
## .3 times 0.08643 0.4942 .
## .> 3 times -0.33598 0.5775 .
## Temporary.Stop.Number 100.0
## .Yes -1.34510 0.3649 -1.298e+00
## Weather 45.1
## .Rainny 1.34016 1.6925 .
## Bus.Stop.Present 0.0
## .Yes 0.00000 0.0000 .
## .Don't know 0.00000 0.0000 .
## Age 100.0
## .25-60 -0.73418 0.3851 -7.221e-01
## .>60 2.62032 0.6432 2.652e+00
## Occupation 0.0
## .Officers/Workers 0.00000 0.0000 .
## .Housewife/Unemployed 0.00000 0.0000 .
## .Free labor 0.00000 0.0000 .
## .Others 0.00000 0.0000 .
## Income 12.8
## .(8-15) millions -0.15571 0.4305 .
## .(15-25) millions -0.09866 0.3023 .
## .>25 millions -0.41298 1.1673 .
## Number.of.Children 0.0
## .Yes 0.00000 0.0000 .
## Motor.Certificate 47.4
## .Yes -0.57227 0.6824 -1.261e+00
## Motor.Owning 100.0
## .Yes -2.24197 0.4188 -2.048e+00
## Number.of.Motors 0.0
## .1 0.00000 0.0000 .
## .2 0.00000 0.0000 .
## .3 0.00000 0.0000 .
## .>3 0.00000 0.0000 .
## Distance 100.0 0.84141 0.1117 8.622e-01
## Travel.Period 0.0 0.00000 0.0000 .
## Cost 100.0 -0.86873 0.1162 -8.805e-01
##
## nVar 6
## BIC -3.715e+03
## post prob 0.163
## model 2 model 3 model 4 model 5
## Intercept 1.193e+00 8.674e-01 9.620e-01 3.862e-01
## Central.Area
## .Yes . . . .
## Purpose
## .Picking Children . . . .
## .Entertainment . . . .
## .Others . . . .
## Frequency
## .2 times . 1.334e+00 1.335e+00 .
## .3 times . 2.773e-01 7.337e-02 .
## .> 3 times . -8.726e-01 -9.415e-01 .
## Temporary.Stop.Number
## .Yes -1.387e+00 -1.282e+00 -1.370e+00 -1.307e+00
## Weather
## .Rainny 3.179e+00 . 2.774e+00 .
## Bus.Stop.Present
## .Yes . . . .
## .Don't know . . . .
## Age
## .25-60 -7.168e-01 -7.735e-01 -7.569e-01 -8.230e-01
## .>60 2.692e+00 2.546e+00 2.595e+00 2.563e+00
## Occupation
## .Officers/Workers . . . .
## .Housewife/Unemployed . . . .
## .Free labor . . . .
## .Others . . . .
## Income
## .(8-15) millions . . . .
## .(15-25) millions . . . .
## .>25 millions . . . .
## Number.of.Children
## .Yes . . . .
## Motor.Certificate
## .Yes -1.272e+00 . . .
## Motor.Owning
## .Yes -2.048e+00 -2.361e+00 -2.345e+00 -2.411e+00
## Number.of.Motors
## .1 . . . .
## .2 . . . .
## .3 . . . .
## .>3 . . . .
## Distance 8.347e-01 8.314e-01 8.063e-01 8.568e-01
## Travel.Period . . . .
## Cost -8.679e-01 -8.603e-01 -8.480e-01 -8.783e-01
##
## nVar 7 6 7 5
## BIC -3.715e+03 -3.715e+03 -3.714e+03 -3.714e+03
## post prob 0.149 0.103 0.097 0.080
imageplot.bma(bma.search1)
## Choose Model 1 from bma.search : 7 variables (remove Frequency variables because of p-value>0.05)
mg2 = glm(Bus ~ Bus.Stop.Condition + Temporary.Stop.Number + Weather + Age + Motor.Owning + Distance + Cost, family = binomial, data = datMB)
mg2
##
## Call: glm(formula = Bus ~ Bus.Stop.Condition + Temporary.Stop.Number +
## Weather + Age + Motor.Owning + Distance + Cost, family = binomial,
## data = datMB)
##
## Coefficients:
## (Intercept) Bus.Stop.ConditionYes Temporary.Stop.NumberYes
## 1.0827 -1.1454 -1.4357
## WeatherRainny Age25-60 Age>60
## 3.3460 -0.8581 3.0726
## Motor.OwningYes Distance Cost
## -2.5521 0.8812 -0.9272
##
## Degrees of Freedom: 627 Total (i.e. Null); 619 Residual
## Null Deviance: 582.8
## Residual Deviance: 269.2 AIC: 287.2
summary(mg2)
##
## Call:
## glm(formula = Bus ~ Bus.Stop.Condition + Temporary.Stop.Number +
## Weather + Age + Motor.Owning + Distance + Cost, family = binomial,
## data = datMB)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5543 -0.3473 -0.2047 -0.1105 3.5424
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0827 0.4366 2.480 0.01313 *
## Bus.Stop.ConditionYes -1.1454 0.3559 -3.218 0.00129 **
## Temporary.Stop.NumberYes -1.4357 0.3665 -3.917 8.97e-05 ***
## WeatherRainny 3.3460 1.2303 2.720 0.00653 **
## Age25-60 -0.8581 0.3723 -2.305 0.02118 *
## Age>60 3.0726 0.6714 4.577 4.73e-06 ***
## Motor.OwningYes -2.5521 0.3773 -6.763 1.35e-11 ***
## Distance 0.8812 0.1097 8.034 9.47e-16 ***
## Cost -0.9272 0.1168 -7.941 2.00e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 269.20 on 619 degrees of freedom
## AIC: 287.2
##
## Number of Fisher Scoring iterations: 6
## Choose Model 1 from bma.search1 : 6 variables : Khong chon do Motor.Certificate va Motor.Owning co kha nang tuong quan
mg3 = glm(Bus ~ Temporary.Stop.Number + Age + Motor.Certificate + Motor.Owning + Distance + Cost, family = binomial, data = datMB)
mg3
##
## Call: glm(formula = Bus ~ Temporary.Stop.Number + Age + Motor.Certificate +
## Motor.Owning + Distance + Cost, family = binomial, data = datMB)
##
## Coefficients:
## (Intercept) Temporary.Stop.NumberYes Age25-60
## 1.1249 -1.2977 -0.7221
## Age>60 Motor.CertificateYes Motor.OwningYes
## 2.6518 -1.2614 -2.0477
## Distance Cost
## 0.8622 -0.8805
##
## Degrees of Freedom: 627 Total (i.e. Null); 620 Residual
## Null Deviance: 582.8
## Residual Deviance: 278.9 AIC: 294.9
summary(mg3)
##
## Call:
## glm(formula = Bus ~ Temporary.Stop.Number + Age + Motor.Certificate +
## Motor.Owning + Distance + Cost, family = binomial, data = datMB)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6521 -0.3234 -0.2352 -0.1597 3.4706
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1249 0.4578 2.457 0.014011 *
## Temporary.Stop.NumberYes -1.2977 0.3518 -3.689 0.000225 ***
## Age25-60 -0.7221 0.3726 -1.938 0.052588 .
## Age>60 2.6518 0.6239 4.250 2.14e-05 ***
## Motor.CertificateYes -1.2614 0.4444 -2.839 0.004532 **
## Motor.OwningYes -2.0477 0.3807 -5.378 7.51e-08 ***
## Distance 0.8622 0.1091 7.900 2.79e-15 ***
## Cost -0.8805 0.1135 -7.759 8.56e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 278.94 on 620 degrees of freedom
## AIC: 294.94
##
## Number of Fisher Scoring iterations: 6
## --> Chọn mg2
## Calculate OR (odd ratio)
exp(mg2$coefficients)
## (Intercept) Bus.Stop.ConditionYes Temporary.Stop.NumberYes
## 2.95276908 0.31809858 0.23795804
## WeatherRainny Age25-60 Age>60
## 28.38966923 0.42398082 21.59839491
## Motor.OwningYes Distance Cost
## 0.07791836 2.41388678 0.39566691
exp(coef(mg2))
## (Intercept) Bus.Stop.ConditionYes Temporary.Stop.NumberYes
## 2.95276908 0.31809858 0.23795804
## WeatherRainny Age25-60 Age>60
## 28.38966923 0.42398082 21.59839491
## Motor.OwningYes Distance Cost
## 0.07791836 2.41388678 0.39566691
library(epiDisplay)
## Loading required package: foreign
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following objects are masked from 'package:psych':
##
## alpha, cs, lookup
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(mg2)
##
## Logistic regression predicting Bus : Yes vs No
##
## crude OR(95%CI) adj. OR(95%CI)
## Bus.Stop.Condition: Yes vs No 0.71 (0.47,1.08) 0.32 (0.16,0.64)
##
## Temporary.Stop.Number: Yes vs No 0.33 (0.21,0.52) 0.24 (0.12,0.49)
##
## Weather: Rainny vs Sunny/cool 22.99 (4.9,107.95) 28.39 (2.55,316.5)
##
## Age: ref.=15-24
## 25-60 0.3 (0.18,0.48) 0.42 (0.2,0.88)
## >60 7.47 (3.24,17.2) 21.6 (5.79,80.52)
##
## Motor.Owning: Yes vs No 0.1 (0.06,0.17) 0.08 (0.04,0.16)
##
## Distance (cont. var.) 1.08 (1.04,1.12) 2.41 (1.95,2.99)
##
## Cost (cont. var.) 0.94 (0.89,0.99) 0.4 (0.31,0.5)
##
## P(Wald's test) P(LR-test)
## Bus.Stop.Condition: Yes vs No 0.001 < 0.001
##
## Temporary.Stop.Number: Yes vs No < 0.001 < 0.001
##
## Weather: Rainny vs Sunny/cool 0.007 0.006
##
## Age: ref.=15-24 < 0.001
## 25-60 0.021
## >60 < 0.001
##
## Motor.Owning: Yes vs No < 0.001 < 0.001
##
## Distance (cont. var.) < 0.001 < 0.001
##
## Cost (cont. var.) < 0.001 < 0.001
##
## Log-likelihood = -134.5978
## No. of observations = 628
## AIC value = 287.1957
## Thay vì sử dụng hàm glm dùng cách 2 với hàm lrm trong gói rms
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Registered S3 method overwritten by 'rms':
## method from
## print.lrtest epiDisplay
##
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
##
## lrtest
## The following objects are masked from 'package:car':
##
## Predict, vif
mg2.c2 = lrm(Bus ~ Bus.Stop.Condition + Temporary.Stop.Number + Weather + Age + Motor.Owning + Distance + Cost, data = datMB)
mg2.c2
## Logistic Regression Model
##
## lrm(formula = Bus ~ Bus.Stop.Condition + Temporary.Stop.Number +
## Weather + Age + Motor.Owning + Distance + Cost, data = datMB)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 628 LR chi2 313.55 R2 0.650 C 0.909
## No 518 d.f. 8 g 2.814 Dxy 0.818
## Yes 110 Pr(> chi2) <0.0001 gr 16.680 gamma 0.819
## max |deriv| 1e-08 gp 0.250 tau-a 0.237
## Brier 0.055
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 1.0827 0.4366 2.48 0.0131
## Bus.Stop.Condition=Yes -1.1454 0.3559 -3.22 0.0013
## Temporary.Stop.Number=Yes -1.4357 0.3666 -3.92 <0.0001
## Weather=Rainny 3.3460 1.2303 2.72 0.0065
## Age=25-60 -0.8581 0.3723 -2.30 0.0212
## Age=>60 3.0726 0.6714 4.58 <0.0001
## Motor.Owning=Yes -2.5521 0.3774 -6.76 <0.0001
## Distance 0.8812 0.1097 8.03 <0.0001
## Cost -0.9272 0.1168 -7.94 <0.0001
##
4. Test models
# Test and compare models mg2 và mg3
# 4.1. Deviance - càng nhỏ càng tốt (sai khác giữa giá trị thực tế và giá trị ước lượng theo mô hình)
deviance(mg2) # 7 variables
## [1] 269.1957
deviance(mg3) # 6 variables
## [1] 278.9441
# 4.2. LRT (Likelihood Ratio Test) - Chênh lệch deviance giữa mô hình đơn giản và mô hình phức tạp --> có ý nghĩa thống kê
library(rms)
lrtest(mg2, mg3)
##
## Model 1: Bus ~ Bus.Stop.Condition + Temporary.Stop.Number + Weather +
## Age + Motor.Owning + Distance + Cost
## Model 2: Bus ~ Temporary.Stop.Number + Age + Motor.Certificate + Motor.Owning +
## Distance + Cost
##
## L.R. Chisq d.f. P
## 9.748373717 1.000000000 0.001794814
# 4.3. AIC (Aikake Information Criterion) - càng nhỏ càng tốt
AIC(mg2)
## [1] 287.1957
AIC(mg3)
## [1] 294.9441
## Calculate MacFadden's R2 voi packages "pscl"
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(mg2)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -134.5978398 -291.3752088 313.5547380 0.5380601 0.3930391 0.6500434
pR2(mg3)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -139.4720267 -291.3752088 303.8063643 0.5213319 0.3835439 0.6343392
# 4.4. Calcule the number of observation that model predict correctly = Accuracy (Độ chính xác của mô hình)
## Model mg2
contrasts(datMB$Bus)
## Yes
## No 0
## Yes 1
glm.probs.mg2 <- predict(mg2, type = "response") # XS dự báo cho tất cả các quan sát theo mô hình mg2 (nếu >0,5 thì xem như bẳng 1 nghĩa là mô hình dự báo đúng). Có thể dùng hàm probmg2 <- fitted(mg2), head(probmg2), tail(probmg2)
glm.probs.mg2[1:10] # Xem XS dự báo cho 10 quan sát đầu tiên
## 2 3 4 5 8 9
## 0.011915126 0.013651950 0.013651950 0.021033808 0.019222677 0.009934699
## 10 13 15 16
## 0.004642192 0.450046311 0.020735248 0.160686683
glm.pred.mg2 <- rep("No", 628)
glm.pred.mg2[glm.probs.mg2 >= 0.5] = "Yes"
table(glm.pred.mg2, datMB$Bus)
##
## glm.pred.mg2 No Yes
## No 513 36
## Yes 5 74
mean(glm.pred.mg2 == datMB$Bus) # Tỷ lệ dự báo chính xác của mô hình 93,47%
## [1] 0.9347134
round(prop.table(table(datMB$Bus))*100,digits = 2) # Tính tỷ lệ các nhóm Y khác nhau - nhóm chiếm ưu thế là sd xe máy 82.48% < 93.47% --> KL: mức độ chính xác dự báo của mô hình 2 > lớp có tỷ lệ cao nhất ở biến phụ thuộc
##
## No Yes
## 82.48 17.52
## Model mg3
contrasts(datMB$Bus)
## Yes
## No 0
## Yes 1
glm.probs.mg3 <- predict(mg3, type = "response") # XS dự báo cho tất cả các quan sát theo mô hình mg (nếu >0,5 thì xem như bẳng 1 nghĩa là mô hình dự báo đúng)
glm.probs.mg3[1:10] # Xem XS dự báo cho 10 quan sát đầu tiên
## 2 3 4 5 8 9 10
## 0.02586326 0.02728683 0.02728683 0.04508261 0.04352897 0.02407716 0.01227855
## 13 15 16
## 0.74456434 0.01419227 0.09469469
glm.pred.mg3 <- rep("No", 628)
glm.pred.mg3[glm.probs.mg3 >= 0.5] = "Yes"
table(glm.pred.mg3, datMB$Bus)
##
## glm.pred.mg3 No Yes
## No 514 36
## Yes 4 74
mean(glm.pred.mg3 == datMB$Bus) # Tỷ lệ dự báo chính xác của mô hình 93,63% --> Tỷ lệ sai sót kiểm định (training error rate) = 100-95,06 = 5,94%
## [1] 0.9363057
round(prop.table(table(datMB$Bus))*100,digits = 2) # Tính tỷ lệ các nhóm Y khác nhau - nhóm chiếm ưu thế là không sd xe buýt 82.48% < 93,63% --> KL: mức độ chính xác dự báo của mô hình > lớp có tỷ lệ cao nhất ở biến phụ thuộc
##
## No Yes
## 82.48 17.52
# 4.5. Calculate Sensitivity - Độ nhạy
## Model mg2
513/(513+5) # Đạt 99,03% --> Rất lý tưởng: Mô hình xác định tới 99,03% người sử dụng xe máy
## [1] 0.9903475
## Model mg3
514/(514+4) # Đạt 99,22% --> Rất lý tưởng: Mô hình xác định tới 99,22% người sử dụng xe máy
## [1] 0.992278
74/(74+36) # Đạt 67.27% --> Mô hình xác định được 67.27% người sử dụng xe buýt
## [1] 0.6727273
# 4.6. Calculate Specificity - Độ đặc hiệu
## Model mg2
74/(74+36) # Đạt 67.27% --> Mô hình xác định được 67.27% người sử dụng xe buýt
## [1] 0.6727273
## Model mg3
74/(74+36) # Đạt 67.27% --> Mô hình xác định được 67.27% người sử dụng xe buýt
## [1] 0.6727273
# 4.7. Hosmer-Lemeshow = Goodness of fit test - Kiểm định sự phù hợp của mô hình với Hàm HLgof.test(fit = fitted(mg2), obs = datBus$Bus) trong package "MKmisc" (Version 3.6.3) : Nếu p-value < 0.05 --> MH không phù hợp --> Ít sử dụng
# 4.8. Kiểm tra tính phổ quát của mô hình
## Chạy lại hai mô hình với hàm train thuộc package "caret"
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10, classProbs = TRUE, summaryFunction = defaultSummary) # sử dụng kiểm tra chéo 10 lớp (number=10), lặp lại 10 lần (repeats=10) với lựa chọn method="repeatedcv, lựa chọn classProbs = TRUE áp dụng cho mô hình phân loại và lựa chọn summaryFunction = defaultSummary để chỉ thị R tính toán chỉ 2 thông tin về phẩm chất của mô hình là Accuracy và Kappa.
mg2train = train(Bus ~ Bus.Stop.Condition + Temporary.Stop.Number + Weather + Age + Motor.Owning + Distance + Cost, data = datMB, method = "glm", family = "binomial", trControl = ctrl)
summary(mg2train)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5543 -0.3473 -0.2047 -0.1105 3.5424
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0827 0.4366 2.480 0.01313 *
## Bus.Stop.ConditionYes -1.1454 0.3559 -3.218 0.00129 **
## Temporary.Stop.NumberYes -1.4357 0.3665 -3.917 8.97e-05 ***
## WeatherRainny 3.3460 1.2303 2.720 0.00653 **
## `Age25-60` -0.8581 0.3723 -2.305 0.02118 *
## `Age>60` 3.0726 0.6714 4.577 4.73e-06 ***
## Motor.OwningYes -2.5521 0.3773 -6.763 1.35e-11 ***
## Distance 0.8812 0.1097 8.034 9.47e-16 ***
## Cost -0.9272 0.1168 -7.941 2.00e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 269.20 on 619 degrees of freedom
## AIC: 287.2
##
## Number of Fisher Scoring iterations: 6
mg3train = train(Bus ~ Temporary.Stop.Number + Age + Motor.Certificate + Motor.Owning + Distance + Cost, data = datMB, method = "glm", family = "binomial", trControl = ctrl)
summary(mg3train)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6521 -0.3234 -0.2352 -0.1597 3.4706
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1249 0.4578 2.457 0.014011 *
## Temporary.Stop.NumberYes -1.2977 0.3518 -3.689 0.000225 ***
## `Age25-60` -0.7221 0.3726 -1.938 0.052588 .
## `Age>60` 2.6518 0.6239 4.250 2.14e-05 ***
## Motor.CertificateYes -1.2614 0.4444 -2.839 0.004532 **
## Motor.OwningYes -2.0477 0.3807 -5.378 7.51e-08 ***
## Distance 0.8622 0.1091 7.900 2.79e-15 ***
## Cost -0.8805 0.1135 -7.759 8.56e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 278.94 on 620 degrees of freedom
## AIC: 294.94
##
## Number of Fisher Scoring iterations: 6
## Các thông tin về khả năng dự báo của mô hình bằng confusionMatrix()
predmg2 <- predict(mg2train, newdata = datMB)
confusionMatrix(data = predmg2, datMB$Bus)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 513 36
## Yes 5 74
##
## Accuracy : 0.9347
## 95% CI : (0.9125, 0.9527)
## No Information Rate : 0.8248
## P-Value [Acc > NIR] : 4.705e-16
##
## Kappa : 0.7459
##
## Mcnemar's Test P-Value : 2.797e-06
##
## Sensitivity : 0.9903
## Specificity : 0.6727
## Pos Pred Value : 0.9344
## Neg Pred Value : 0.9367
## Prevalence : 0.8248
## Detection Rate : 0.8169
## Detection Prevalence : 0.8742
## Balanced Accuracy : 0.8315
##
## 'Positive' Class : No
##
predmg3 <- predict(mg3train, newdata = datMB)
confusionMatrix(data = predmg3, datMB$Bus)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 514 36
## Yes 4 74
##
## Accuracy : 0.9363
## 95% CI : (0.9143, 0.9541)
## No Information Rate : 0.8248
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7511
##
## Mcnemar's Test P-Value : 9.509e-07
##
## Sensitivity : 0.9923
## Specificity : 0.6727
## Pos Pred Value : 0.9345
## Neg Pred Value : 0.9487
## Prevalence : 0.8248
## Detection Rate : 0.8185
## Detection Prevalence : 0.8758
## Balanced Accuracy : 0.8325
##
## 'Positive' Class : No
##
## Lấy dataframe với thông tin Accuracy của mô hình
a = mg2train$resample
a = a[, -3] # Trừ cột resample
a$Mohinh = "Logitmg2" # Thêm biến mô hình
a
## Accuracy Kappa Mohinh
## 1 0.8870968 0.6265060 Logitmg2
## 2 0.9354839 0.7422037 Logitmg2
## 3 0.9047619 0.5790646 Logitmg2
## 4 0.9047619 0.6142857 Logitmg2
## 5 0.9206349 0.6645367 Logitmg2
## 6 0.9206349 0.6914789 Logitmg2
## 7 0.9365079 0.7627119 Logitmg2
## 8 0.9206349 0.6914789 Logitmg2
## 9 0.9523810 0.8148874 Logitmg2
## 10 0.9365079 0.7627119 Logitmg2
## 11 0.9365079 0.7627119 Logitmg2
## 12 0.9193548 0.7134935 Logitmg2
## 13 0.9032258 0.6133056 Logitmg2
## 14 0.9523810 0.8148874 Logitmg2
## 15 0.9206349 0.6645367 Logitmg2
## 16 0.9206349 0.6914789 Logitmg2
## 17 0.9365079 0.7627119 Logitmg2
## 18 0.9206349 0.6645367 Logitmg2
## 19 0.9206349 0.6914789 Logitmg2
## 20 0.9365079 0.7627119 Logitmg2
## 21 0.9523810 0.8286491 Logitmg2
## 22 0.9365079 0.7627119 Logitmg2
## 23 0.9193548 0.6637744 Logitmg2
## 24 0.9365079 0.7627119 Logitmg2
## 25 0.9047619 0.5790646 Logitmg2
## 26 0.8888889 0.5303514 Logitmg2
## 27 0.9516129 0.8143713 Logitmg2
## 28 0.9523810 0.8286491 Logitmg2
## 29 0.9523810 0.8286491 Logitmg2
## 30 0.8730159 0.5254237 Logitmg2
## 31 0.9365079 0.7428571 Logitmg2
## 32 0.9365079 0.7627119 Logitmg2
## 33 0.9365079 0.7627119 Logitmg2
## 34 0.9516129 0.8280961 Logitmg2
## 35 0.9206349 0.6645367 Logitmg2
## 36 0.8888889 0.6001813 Logitmg2
## 37 0.9206349 0.6645367 Logitmg2
## 38 0.9516129 0.8143713 Logitmg2
## 39 0.9365079 0.7627119 Logitmg2
## 40 0.9047619 0.5790646 Logitmg2
## 41 0.8730159 0.4857143 Logitmg2
## 42 0.9523810 0.8148874 Logitmg2
## 43 0.9516129 0.8280961 Logitmg2
## 44 0.9206349 0.6645367 Logitmg2
## 45 0.9047619 0.5790646 Logitmg2
## 46 0.9206349 0.6645367 Logitmg2
## 47 0.8888889 0.6278481 Logitmg2
## 48 0.9682540 0.8813559 Logitmg2
## 49 0.9047619 0.6695804 Logitmg2
## 50 0.9193548 0.6906188 Logitmg2
## 51 0.8888889 0.4854142 Logitmg2
## 52 0.9206349 0.6914789 Logitmg2
## 53 0.9523810 0.8148874 Logitmg2
## 54 0.9365079 0.7627119 Logitmg2
## 55 0.9523810 0.8148874 Logitmg2
## 56 0.9047619 0.6440678 Logitmg2
## 57 0.9032258 0.6133056 Logitmg2
## 58 0.9193548 0.6906188 Logitmg2
## 59 0.9047619 0.6695804 Logitmg2
## 60 0.9523810 0.8148874 Logitmg2
## 61 0.9516129 0.8143713 Logitmg2
## 62 0.9523810 0.8286491 Logitmg2
## 63 0.9523810 0.8286491 Logitmg2
## 64 0.9206349 0.6645367 Logitmg2
## 65 0.9032258 0.6133056 Logitmg2
## 66 0.9523810 0.8148874 Logitmg2
## 67 0.9523810 0.8148874 Logitmg2
## 68 0.9047619 0.6142857 Logitmg2
## 69 0.8730159 0.5254237 Logitmg2
## 70 0.9047619 0.5790646 Logitmg2
## 71 0.8870968 0.5668663 Logitmg2
## 72 0.8888889 0.5303514 Logitmg2
## 73 0.9193548 0.6637744 Logitmg2
## 74 0.9523810 0.8405063 Logitmg2
## 75 0.9206349 0.6914789 Logitmg2
## 76 0.9523810 0.8286491 Logitmg2
## 77 0.9523810 0.8148874 Logitmg2
## 78 0.9523810 0.8286491 Logitmg2
## 79 0.9206349 0.6645367 Logitmg2
## 80 0.9047619 0.6142857 Logitmg2
## 81 0.9365079 0.7797203 Logitmg2
## 82 0.9032258 0.6429942 Logitmg2
## 83 0.9047619 0.6142857 Logitmg2
## 84 0.8888889 0.5680705 Logitmg2
## 85 0.9206349 0.6914789 Logitmg2
## 86 1.0000000 1.0000000 Logitmg2
## 87 0.9365079 0.7428571 Logitmg2
## 88 0.9047619 0.6440678 Logitmg2
## 89 0.9206349 0.6914789 Logitmg2
## 90 0.9032258 0.6133056 Logitmg2
## 91 0.9523810 0.8148874 Logitmg2
## 92 0.9193548 0.6637744 Logitmg2
## 93 0.9682540 0.8813559 Logitmg2
## 94 0.9193548 0.7134935 Logitmg2
## 95 0.9206349 0.6645367 Logitmg2
## 96 0.8888889 0.5680705 Logitmg2
## 97 0.9206349 0.6914789 Logitmg2
## 98 0.9047619 0.6142857 Logitmg2
## 99 0.9206349 0.7144152 Logitmg2
## 100 0.9206349 0.6914789 Logitmg2
b = mg3train$resample
b = b[, -3]
b$Mohinh = "Logitmg3"
b
## Accuracy Kappa Mohinh
## 1 0.9838710 0.9426987 Logitmg3
## 2 0.9047619 0.6142857 Logitmg3
## 3 0.9193548 0.6637744 Logitmg3
## 4 0.9523810 0.8148874 Logitmg3
## 5 0.9682540 0.8898601 Logitmg3
## 6 0.9047619 0.6440678 Logitmg3
## 7 0.9365079 0.7428571 Logitmg3
## 8 0.9047619 0.6142857 Logitmg3
## 9 0.8888889 0.5680705 Logitmg3
## 10 0.9365079 0.7428571 Logitmg3
## 11 0.9516129 0.8280961 Logitmg3
## 12 0.9193548 0.7134935 Logitmg3
## 13 0.9206349 0.6645367 Logitmg3
## 14 0.9206349 0.6914789 Logitmg3
## 15 0.8888889 0.5303514 Logitmg3
## 16 0.9523810 0.8286491 Logitmg3
## 17 0.9365079 0.7428571 Logitmg3
## 18 0.9365079 0.7627119 Logitmg3
## 19 0.9365079 0.7428571 Logitmg3
## 20 0.9206349 0.6645367 Logitmg3
## 21 0.9523810 0.8405063 Logitmg3
## 22 0.9523810 0.8286491 Logitmg3
## 23 0.9354839 0.7422037 Logitmg3
## 24 0.8888889 0.4854142 Logitmg3
## 25 0.8888889 0.5303514 Logitmg3
## 26 0.9682540 0.8813559 Logitmg3
## 27 0.9206349 0.6914789 Logitmg3
## 28 0.9677419 0.8809981 Logitmg3
## 29 0.9206349 0.6914789 Logitmg3
## 30 0.9047619 0.6142857 Logitmg3
## 31 0.9047619 0.5790646 Logitmg3
## 32 0.9206349 0.6645367 Logitmg3
## 33 0.8548387 0.5197935 Logitmg3
## 34 0.9193548 0.6637744 Logitmg3
## 35 0.9682540 0.8813559 Logitmg3
## 36 0.9523810 0.8148874 Logitmg3
## 37 0.9206349 0.7144152 Logitmg3
## 38 0.8730159 0.5889070 Logitmg3
## 39 0.9682540 0.8813559 Logitmg3
## 40 0.9206349 0.6914789 Logitmg3
## 41 0.9206349 0.6645367 Logitmg3
## 42 0.9523810 0.8148874 Logitmg3
## 43 0.9047619 0.6142857 Logitmg3
## 44 0.9206349 0.6914789 Logitmg3
## 45 0.9838710 0.9426987 Logitmg3
## 46 0.9677419 0.8894831 Logitmg3
## 47 0.9365079 0.7627119 Logitmg3
## 48 0.8888889 0.5303514 Logitmg3
## 49 0.9206349 0.6645367 Logitmg3
## 50 0.9047619 0.6142857 Logitmg3
## 51 0.9206349 0.6914789 Logitmg3
## 52 0.8870968 0.5292842 Logitmg3
## 53 0.9206349 0.7341772 Logitmg3
## 54 0.8888889 0.4854142 Logitmg3
## 55 0.9682540 0.8813559 Logitmg3
## 56 0.9838710 0.9426987 Logitmg3
## 57 0.9206349 0.6645367 Logitmg3
## 58 0.9206349 0.6645367 Logitmg3
## 59 0.9365079 0.7627119 Logitmg3
## 60 0.9365079 0.7797203 Logitmg3
## 61 0.9365079 0.7797203 Logitmg3
## 62 0.9523810 0.8148874 Logitmg3
## 63 0.9365079 0.7627119 Logitmg3
## 64 0.9516129 0.8143713 Logitmg3
## 65 0.9206349 0.6914789 Logitmg3
## 66 0.9193548 0.6906188 Logitmg3
## 67 0.9047619 0.5790646 Logitmg3
## 68 0.9047619 0.6440678 Logitmg3
## 69 0.9206349 0.6645367 Logitmg3
## 70 0.9523810 0.8148874 Logitmg3
## 71 0.9365079 0.7944535 Logitmg3
## 72 0.9365079 0.7627119 Logitmg3
## 73 0.9047619 0.6142857 Logitmg3
## 74 0.9365079 0.7627119 Logitmg3
## 75 0.9206349 0.6645367 Logitmg3
## 76 0.9354839 0.7422037 Logitmg3
## 77 0.9516129 0.8143713 Logitmg3
## 78 0.9523810 0.8148874 Logitmg3
## 79 0.9047619 0.6142857 Logitmg3
## 80 0.9365079 0.7428571 Logitmg3
## 81 0.9682540 0.8813559 Logitmg3
## 82 0.9523810 0.8148874 Logitmg3
## 83 0.9047619 0.6916803 Logitmg3
## 84 0.8730159 0.5254237 Logitmg3
## 85 0.9523810 0.8148874 Logitmg3
## 86 0.8888889 0.4854142 Logitmg3
## 87 0.9047619 0.5790646 Logitmg3
## 88 0.9677419 0.8809981 Logitmg3
## 89 0.9841270 0.9428830 Logitmg3
## 90 0.9032258 0.5782313 Logitmg3
## 91 0.9365079 0.7627119 Logitmg3
## 92 0.9523810 0.8148874 Logitmg3
## 93 0.9193548 0.6906188 Logitmg3
## 94 0.9523810 0.8148874 Logitmg3
## 95 0.9206349 0.6645367 Logitmg3
## 96 0.9047619 0.6440678 Logitmg3
## 97 0.8888889 0.5303514 Logitmg3
## 98 0.9193548 0.6906188 Logitmg3
## 99 0.9047619 0.6440678 Logitmg3
## 100 0.9682540 0.8813559 Logitmg3
resamplemod = rbind(a, b) # Hợp nhất 2 data frame a và b
resamplemod
## Accuracy Kappa Mohinh
## 1 0.8870968 0.6265060 Logitmg2
## 2 0.9354839 0.7422037 Logitmg2
## 3 0.9047619 0.5790646 Logitmg2
## 4 0.9047619 0.6142857 Logitmg2
## 5 0.9206349 0.6645367 Logitmg2
## 6 0.9206349 0.6914789 Logitmg2
## 7 0.9365079 0.7627119 Logitmg2
## 8 0.9206349 0.6914789 Logitmg2
## 9 0.9523810 0.8148874 Logitmg2
## 10 0.9365079 0.7627119 Logitmg2
## 11 0.9365079 0.7627119 Logitmg2
## 12 0.9193548 0.7134935 Logitmg2
## 13 0.9032258 0.6133056 Logitmg2
## 14 0.9523810 0.8148874 Logitmg2
## 15 0.9206349 0.6645367 Logitmg2
## 16 0.9206349 0.6914789 Logitmg2
## 17 0.9365079 0.7627119 Logitmg2
## 18 0.9206349 0.6645367 Logitmg2
## 19 0.9206349 0.6914789 Logitmg2
## 20 0.9365079 0.7627119 Logitmg2
## 21 0.9523810 0.8286491 Logitmg2
## 22 0.9365079 0.7627119 Logitmg2
## 23 0.9193548 0.6637744 Logitmg2
## 24 0.9365079 0.7627119 Logitmg2
## 25 0.9047619 0.5790646 Logitmg2
## 26 0.8888889 0.5303514 Logitmg2
## 27 0.9516129 0.8143713 Logitmg2
## 28 0.9523810 0.8286491 Logitmg2
## 29 0.9523810 0.8286491 Logitmg2
## 30 0.8730159 0.5254237 Logitmg2
## 31 0.9365079 0.7428571 Logitmg2
## 32 0.9365079 0.7627119 Logitmg2
## 33 0.9365079 0.7627119 Logitmg2
## 34 0.9516129 0.8280961 Logitmg2
## 35 0.9206349 0.6645367 Logitmg2
## 36 0.8888889 0.6001813 Logitmg2
## 37 0.9206349 0.6645367 Logitmg2
## 38 0.9516129 0.8143713 Logitmg2
## 39 0.9365079 0.7627119 Logitmg2
## 40 0.9047619 0.5790646 Logitmg2
## 41 0.8730159 0.4857143 Logitmg2
## 42 0.9523810 0.8148874 Logitmg2
## 43 0.9516129 0.8280961 Logitmg2
## 44 0.9206349 0.6645367 Logitmg2
## 45 0.9047619 0.5790646 Logitmg2
## 46 0.9206349 0.6645367 Logitmg2
## 47 0.8888889 0.6278481 Logitmg2
## 48 0.9682540 0.8813559 Logitmg2
## 49 0.9047619 0.6695804 Logitmg2
## 50 0.9193548 0.6906188 Logitmg2
## 51 0.8888889 0.4854142 Logitmg2
## 52 0.9206349 0.6914789 Logitmg2
## 53 0.9523810 0.8148874 Logitmg2
## 54 0.9365079 0.7627119 Logitmg2
## 55 0.9523810 0.8148874 Logitmg2
## 56 0.9047619 0.6440678 Logitmg2
## 57 0.9032258 0.6133056 Logitmg2
## 58 0.9193548 0.6906188 Logitmg2
## 59 0.9047619 0.6695804 Logitmg2
## 60 0.9523810 0.8148874 Logitmg2
## 61 0.9516129 0.8143713 Logitmg2
## 62 0.9523810 0.8286491 Logitmg2
## 63 0.9523810 0.8286491 Logitmg2
## 64 0.9206349 0.6645367 Logitmg2
## 65 0.9032258 0.6133056 Logitmg2
## 66 0.9523810 0.8148874 Logitmg2
## 67 0.9523810 0.8148874 Logitmg2
## 68 0.9047619 0.6142857 Logitmg2
## 69 0.8730159 0.5254237 Logitmg2
## 70 0.9047619 0.5790646 Logitmg2
## 71 0.8870968 0.5668663 Logitmg2
## 72 0.8888889 0.5303514 Logitmg2
## 73 0.9193548 0.6637744 Logitmg2
## 74 0.9523810 0.8405063 Logitmg2
## 75 0.9206349 0.6914789 Logitmg2
## 76 0.9523810 0.8286491 Logitmg2
## 77 0.9523810 0.8148874 Logitmg2
## 78 0.9523810 0.8286491 Logitmg2
## 79 0.9206349 0.6645367 Logitmg2
## 80 0.9047619 0.6142857 Logitmg2
## 81 0.9365079 0.7797203 Logitmg2
## 82 0.9032258 0.6429942 Logitmg2
## 83 0.9047619 0.6142857 Logitmg2
## 84 0.8888889 0.5680705 Logitmg2
## 85 0.9206349 0.6914789 Logitmg2
## 86 1.0000000 1.0000000 Logitmg2
## 87 0.9365079 0.7428571 Logitmg2
## 88 0.9047619 0.6440678 Logitmg2
## 89 0.9206349 0.6914789 Logitmg2
## 90 0.9032258 0.6133056 Logitmg2
## 91 0.9523810 0.8148874 Logitmg2
## 92 0.9193548 0.6637744 Logitmg2
## 93 0.9682540 0.8813559 Logitmg2
## 94 0.9193548 0.7134935 Logitmg2
## 95 0.9206349 0.6645367 Logitmg2
## 96 0.8888889 0.5680705 Logitmg2
## 97 0.9206349 0.6914789 Logitmg2
## 98 0.9047619 0.6142857 Logitmg2
## 99 0.9206349 0.7144152 Logitmg2
## 100 0.9206349 0.6914789 Logitmg2
## 101 0.9838710 0.9426987 Logitmg3
## 102 0.9047619 0.6142857 Logitmg3
## 103 0.9193548 0.6637744 Logitmg3
## 104 0.9523810 0.8148874 Logitmg3
## 105 0.9682540 0.8898601 Logitmg3
## 106 0.9047619 0.6440678 Logitmg3
## 107 0.9365079 0.7428571 Logitmg3
## 108 0.9047619 0.6142857 Logitmg3
## 109 0.8888889 0.5680705 Logitmg3
## 110 0.9365079 0.7428571 Logitmg3
## 111 0.9516129 0.8280961 Logitmg3
## 112 0.9193548 0.7134935 Logitmg3
## 113 0.9206349 0.6645367 Logitmg3
## 114 0.9206349 0.6914789 Logitmg3
## 115 0.8888889 0.5303514 Logitmg3
## 116 0.9523810 0.8286491 Logitmg3
## 117 0.9365079 0.7428571 Logitmg3
## 118 0.9365079 0.7627119 Logitmg3
## 119 0.9365079 0.7428571 Logitmg3
## 120 0.9206349 0.6645367 Logitmg3
## 121 0.9523810 0.8405063 Logitmg3
## 122 0.9523810 0.8286491 Logitmg3
## 123 0.9354839 0.7422037 Logitmg3
## 124 0.8888889 0.4854142 Logitmg3
## 125 0.8888889 0.5303514 Logitmg3
## 126 0.9682540 0.8813559 Logitmg3
## 127 0.9206349 0.6914789 Logitmg3
## 128 0.9677419 0.8809981 Logitmg3
## 129 0.9206349 0.6914789 Logitmg3
## 130 0.9047619 0.6142857 Logitmg3
## 131 0.9047619 0.5790646 Logitmg3
## 132 0.9206349 0.6645367 Logitmg3
## 133 0.8548387 0.5197935 Logitmg3
## 134 0.9193548 0.6637744 Logitmg3
## 135 0.9682540 0.8813559 Logitmg3
## 136 0.9523810 0.8148874 Logitmg3
## 137 0.9206349 0.7144152 Logitmg3
## 138 0.8730159 0.5889070 Logitmg3
## 139 0.9682540 0.8813559 Logitmg3
## 140 0.9206349 0.6914789 Logitmg3
## 141 0.9206349 0.6645367 Logitmg3
## 142 0.9523810 0.8148874 Logitmg3
## 143 0.9047619 0.6142857 Logitmg3
## 144 0.9206349 0.6914789 Logitmg3
## 145 0.9838710 0.9426987 Logitmg3
## 146 0.9677419 0.8894831 Logitmg3
## 147 0.9365079 0.7627119 Logitmg3
## 148 0.8888889 0.5303514 Logitmg3
## 149 0.9206349 0.6645367 Logitmg3
## 150 0.9047619 0.6142857 Logitmg3
## 151 0.9206349 0.6914789 Logitmg3
## 152 0.8870968 0.5292842 Logitmg3
## 153 0.9206349 0.7341772 Logitmg3
## 154 0.8888889 0.4854142 Logitmg3
## 155 0.9682540 0.8813559 Logitmg3
## 156 0.9838710 0.9426987 Logitmg3
## 157 0.9206349 0.6645367 Logitmg3
## 158 0.9206349 0.6645367 Logitmg3
## 159 0.9365079 0.7627119 Logitmg3
## 160 0.9365079 0.7797203 Logitmg3
## 161 0.9365079 0.7797203 Logitmg3
## 162 0.9523810 0.8148874 Logitmg3
## 163 0.9365079 0.7627119 Logitmg3
## 164 0.9516129 0.8143713 Logitmg3
## 165 0.9206349 0.6914789 Logitmg3
## 166 0.9193548 0.6906188 Logitmg3
## 167 0.9047619 0.5790646 Logitmg3
## 168 0.9047619 0.6440678 Logitmg3
## 169 0.9206349 0.6645367 Logitmg3
## 170 0.9523810 0.8148874 Logitmg3
## 171 0.9365079 0.7944535 Logitmg3
## 172 0.9365079 0.7627119 Logitmg3
## 173 0.9047619 0.6142857 Logitmg3
## 174 0.9365079 0.7627119 Logitmg3
## 175 0.9206349 0.6645367 Logitmg3
## 176 0.9354839 0.7422037 Logitmg3
## 177 0.9516129 0.8143713 Logitmg3
## 178 0.9523810 0.8148874 Logitmg3
## 179 0.9047619 0.6142857 Logitmg3
## 180 0.9365079 0.7428571 Logitmg3
## 181 0.9682540 0.8813559 Logitmg3
## 182 0.9523810 0.8148874 Logitmg3
## 183 0.9047619 0.6916803 Logitmg3
## 184 0.8730159 0.5254237 Logitmg3
## 185 0.9523810 0.8148874 Logitmg3
## 186 0.8888889 0.4854142 Logitmg3
## 187 0.9047619 0.5790646 Logitmg3
## 188 0.9677419 0.8809981 Logitmg3
## 189 0.9841270 0.9428830 Logitmg3
## 190 0.9032258 0.5782313 Logitmg3
## 191 0.9365079 0.7627119 Logitmg3
## 192 0.9523810 0.8148874 Logitmg3
## 193 0.9193548 0.6906188 Logitmg3
## 194 0.9523810 0.8148874 Logitmg3
## 195 0.9206349 0.6645367 Logitmg3
## 196 0.9047619 0.6440678 Logitmg3
## 197 0.8888889 0.5303514 Logitmg3
## 198 0.9193548 0.6906188 Logitmg3
## 199 0.9047619 0.6440678 Logitmg3
## 200 0.9682540 0.8813559 Logitmg3
## Phân tích hình ảnh
library(ggplot2)
library(ggthemes)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
h1 <- ggplot(resamplemod) + geom_boxplot(aes(x = Mohinh, y = Accuracy, fill = Mohinh)) + coord_flip() + geom_hline(yintercept = 0.8248, color = "blue") + theme_wsj() # Giá trị 0.8248 = 82.48% là tỷ lệ nhóm chiếm ưu thế (không sử dụng xe buýt-sử dụng xe máy)
h1 # Nhận xét: Cả 2 mô hình đều có khả năng phân loại tốt - độ chính xác accuracy của cả 2 mô hình đều lớn hơn 87.01% (đường màu xanh)
h2 <- ggplot(resamplemod, aes(Accuracy, fill = Mohinh)) + geom_density(alpha = 0.3) + theme_wsj() + facet_wrap(~ Mohinh)
h2
## Kiểm định t-test để đánh giá sự khác biệt về Accuracy và Kappa của 2 mô hình: p<0.05 có ý nghĩa thống kê về sự khác biệt Accuracy và Kappa của 2 mô hình
t.test(resamplemod$Accuracy ~ resamplemod$Mohinh)
##
## Welch Two Sample t-test
##
## data: resamplemod$Accuracy by resamplemod$Mohinh
## t = -1.1014, df = 194.58, p-value = 0.2721
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.011216872 0.003177957
## sample estimates:
## mean in group Logitmg2 mean in group Logitmg3
## 0.9245110 0.9285305
summary(a)
## Accuracy Kappa Mohinh
## Min. :0.8730 Min. :0.4854 Length:100
## 1st Qu.:0.9048 1st Qu.:0.6275 Class :character
## Median :0.9206 Median :0.6915 Mode :character
## Mean :0.9245 Mean :0.7045
## 3rd Qu.:0.9516 3rd Qu.:0.8144
## Max. :1.0000 Max. :1.0000
summary(b)
## Accuracy Kappa Mohinh
## Min. :0.8548 Min. :0.4854 Length:100
## 1st Qu.:0.9048 1st Qu.:0.6441 Class :character
## Median :0.9206 Median :0.7026 Mode :character
## Mean :0.9285 Mean :0.7184
## 3rd Qu.:0.9524 3rd Qu.:0.8149
## Max. :0.9841 Max. :0.9429
t.test(resamplemod$Kappa ~ resamplemod$Mohinh)
##
## Welch Two Sample t-test
##
## data: resamplemod$Kappa by resamplemod$Mohinh
## t = -0.89788, df = 193.91, p-value = 0.3704
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04415241 0.01652762
## sample estimates:
## mean in group Logitmg2 mean in group Logitmg3
## 0.7045402 0.7183526
# 4.9. Calculate AUC (Area under Curve) - Diện tích dưới đường cong ROC (Receiver Operating Characteristics), đánh giá mức độ lồi về phía trên của đường ROC (Tiêu chí thường được sử dụng khi so sánh các mô hình phân loại với nhau): sử dụng hàm roc trong gói "pROC": AUC càng gần 1 càng tốt, >0.7 xem như chấp nhận được
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Mô hình mg2
mg2train1 <- train(Bus ~ Bus.Stop.Condition + Temporary.Stop.Number + Weather + Age + Motor.Owning + Distance + Cost, data = datMB, method = "glm", family = "binomial")
summary(mg2train1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5543 -0.3473 -0.2047 -0.1105 3.5424
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.0827 0.4366 2.480 0.01313 *
## Bus.Stop.ConditionYes -1.1454 0.3559 -3.218 0.00129 **
## Temporary.Stop.NumberYes -1.4357 0.3665 -3.917 8.97e-05 ***
## WeatherRainny 3.3460 1.2303 2.720 0.00653 **
## `Age25-60` -0.8581 0.3723 -2.305 0.02118 *
## `Age>60` 3.0726 0.6714 4.577 4.73e-06 ***
## Motor.OwningYes -2.5521 0.3773 -6.763 1.35e-11 ***
## Distance 0.8812 0.1097 8.034 9.47e-16 ***
## Cost -0.9272 0.1168 -7.941 2.00e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 269.20 on 619 degrees of freedom
## AIC: 287.2
##
## Number of Fisher Scoring iterations: 6
pred1mg2 <- predict(mg2train1, newdata = datMB, type = "prob")
rmg2 <- roc(datMB$Bus, pred1mg2$Yes)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
rmg2$auc
## Area under the curve: 0.9092
auc(rmg2)
## Area under the curve: 0.9092
plot.roc(rmg2)
ci(rmg2)
## 95% CI: 0.867-0.9514 (DeLong)
plot(rmg2, print.thres = "best", col = "red", print.auc = TRUE, auc.polygon = TRUE, auc.polygon.col = "yellow", identity = F)
plot(smooth(rmg2), print.thres = "best", col = "red", print.auc = TRUE, auc.polygon = TRUE, auc.polygon.col = "yellow", identity = F)
## Mô hình mg3
mg3train1 <- train(Bus ~ Temporary.Stop.Number + Age + Motor.Certificate + Motor.Owning + Distance + Cost, method = "glm", family = "binomial", data = datMB)
summary(mg3train1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6521 -0.3234 -0.2352 -0.1597 3.4706
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1249 0.4578 2.457 0.014011 *
## Temporary.Stop.NumberYes -1.2977 0.3518 -3.689 0.000225 ***
## `Age25-60` -0.7221 0.3726 -1.938 0.052588 .
## `Age>60` 2.6518 0.6239 4.250 2.14e-05 ***
## Motor.CertificateYes -1.2614 0.4444 -2.839 0.004532 **
## Motor.OwningYes -2.0477 0.3807 -5.378 7.51e-08 ***
## Distance 0.8622 0.1091 7.900 2.79e-15 ***
## Cost -0.8805 0.1135 -7.759 8.56e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 278.94 on 620 degrees of freedom
## AIC: 294.94
##
## Number of Fisher Scoring iterations: 6
pred1mg3 <- predict(mg3train1, newdata = datMB, type = "prob")
rmg3 <- roc(datMB$Bus, pred1mg3$Yes)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
rmg3$auc
## Area under the curve: 0.9006
plot(rmg3, print.thres = "best", col = "red", print.auc = TRUE, auc.polygon = TRUE, auc.polygon.col = "greenyellow", identity = F)
plot(smooth(rmg3), print.thres = "best", col = "red", print.auc = TRUE, auc.polygon = TRUE, auc.polygon.col = "greenyellow", identity = F)
## Có thể dùng các lệnh đơn giản
auc(rmg3)
## Area under the curve: 0.9006
plot.roc(rmg3)
ci(rmg3)
## 95% CI: 0.8539-0.9472 (DeLong)
## Tính hệ số Gini = 2AUC-1
Ginimg3 <- 2*(rmg3$auc)-1
Ginimg3
## [1] 0.8011232
Ginimg2 <- 2*(rmg2$auc)-1
Ginimg2
## [1] 0.8183573
5. Importance of variables in models
library(caret)
varImp(mg2)
## Overall
## Bus.Stop.ConditionYes 3.218460
## Temporary.Stop.NumberYes 3.916837
## WeatherRainny 2.719747
## Age25-60 2.304714
## Age>60 4.576534
## Motor.OwningYes 6.763419
## Distance 8.033518
## Cost 7.941463
plot(varImp(mg2))
varImp(mg3)
## Overall
## Temporary.Stop.NumberYes 3.688996
## Age25-60 1.938286
## Age>60 4.250078
## Motor.CertificateYes 2.838573
## Motor.OwningYes 5.378406
## Distance 7.900110
## Cost 7.759013
plot(varImp(mg3))
6. Probility of bus use - Prohibit model
# Prohibit Models
library(aod)
##
## Attaching package: 'aod'
## The following object is masked from 'package:survival':
##
## rats
## Model mg2
prohibitmg2 <- glm(Bus ~ Bus.Stop.Condition + Temporary.Stop.Number + Weather + Age + Motor.Owning + Distance + Cost, family = binomial(link = "probit"), data = datMB)
summary(prohibitmg2)
##
## Call:
## glm(formula = Bus ~ Bus.Stop.Condition + Temporary.Stop.Number +
## Weather + Age + Motor.Owning + Distance + Cost, family = binomial(link = "probit"),
## data = datMB)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4479 -0.3810 -0.2122 -0.1005 3.7567
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.50335 0.22669 2.220 0.02639 *
## Bus.Stop.ConditionYes -0.53362 0.17272 -3.089 0.00201 **
## Temporary.Stop.NumberYes -0.65542 0.17624 -3.719 0.00020 ***
## WeatherRainny 1.72021 0.66545 2.585 0.00974 **
## Age25-60 -0.45173 0.17979 -2.513 0.01199 *
## Age>60 1.57540 0.35924 4.385 1.16e-05 ***
## Motor.OwningYes -1.33955 0.19510 -6.866 6.60e-12 ***
## Distance 0.43489 0.05126 8.484 < 2e-16 ***
## Cost -0.45880 0.05449 -8.420 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 279.55 on 619 degrees of freedom
## AIC: 297.55
##
## Number of Fisher Scoring iterations: 7
## Model mg3
prohibitmg3 <- glm(Bus ~ Temporary.Stop.Number + Age + Motor.Certificate + Motor.Owning + Distance + Cost, family = binomial(link = "probit"), data = datMB)
summary(prohibitmg3)
##
## Call:
## glm(formula = Bus ~ Temporary.Stop.Number + Age + Motor.Certificate +
## Motor.Owning + Distance + Cost, family = binomial(link = "probit"),
## data = datMB)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5733 -0.3513 -0.2548 -0.1541 3.6886
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.59639 0.24722 2.412 0.015849 *
## Temporary.Stop.NumberYes -0.60283 0.16986 -3.549 0.000387 ***
## Age25-60 -0.36339 0.17770 -2.045 0.040864 *
## Age>60 1.40849 0.34567 4.075 4.61e-05 ***
## Motor.CertificateYes -0.63545 0.24188 -2.627 0.008610 **
## Motor.OwningYes -1.13928 0.20348 -5.599 2.16e-08 ***
## Distance 0.42121 0.05073 8.303 < 2e-16 ***
## Cost -0.43561 0.05332 -8.170 3.09e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.75 on 627 degrees of freedom
## Residual deviance: 289.24 on 620 degrees of freedom
## AIC: 305.24
##
## Number of Fisher Scoring iterations: 7
# Đánh giá tác động tổng thể Age (Total effect of Age) trong mô hinhg mg2 có ảnh hưởng hay không đến khả năng chọn xe buýt (thứ tự hệ số ứng với Age là từ 7:9)
wald.test(b = coef(prohibitmg2), Sigma = vcov(prohibitmg2), Terms = 7:9) # Giá trị 107.6 ứng với p-value < 0.05 --> KL: tác động tổng thể về Purpose (mục đich chuyến đi) của người sử dụng lên khả năng chọn xe buýt là tồn tại và có ý nghĩa thống kê
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 107.6, df = 3, P(> X2) = 0.0
# Đánh giá tác động tổng thể Temporary.Stop.Number (Total effect of Temporary.Stop.Number) trong mô hình mg2 có ảnh hưởng hay không đến khả năng chọn xe buýt (thứ tự hệ số ứng với Temporary.Stop.Number là từ 1:2)
wald.test(b = coef(prohibitmg2), Sigma = vcov(prohibitmg2), Terms = 1:2)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 10.6, df = 2, P(> X2) = 0.0051
7. Reason why choose or doesn’t choose bus
t = "E:\\NGHIEN CUU SINH\\NCS - PHUONG ANH\\Part 1-Mode choice\\SO LIEU R\\Mode choice in DN.csv"
MC = read.csv(t, header = T)
head(MC)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time
## 1 6 1 0 3 3 1
## 2 3 1 0 1 4 1
## 3 3 1 0 1 4 3
## 4 3 1 0 1 4 1
## 5 3 1 0 1 4 3
## 6 4 1 0 1 4 1
## Distance Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 1 2 10 1 1 0
## 2 8 15 1 1 1
## 3 5 15 1 1 1
## 4 5 10 1 1 1
## 5 8 15 1 1 0
## 6 20 30 1 1 0
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 1 4 1 0 1 12 1
## 2 2 3 0 1 8 2
## 3 4 1 0 2 5 1
## 4 2 1 0 4 5 1
## 5 2 3 0 2 8 2
## 6 5 3 0 2 40 2
## Gender Age Occupation Income Number.of.Children Motor.Certificate
## 1 0 5 4 2 2 0
## 2 0 3 6 3 1 1
## 3 1 3 2 4 0 1
## 4 1 3 2 3 0 1
## 5 0 4 6 3 1 1
## 6 1 4 3 4 2 1
## Car.Certificate Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## 1 0 0 0 0 1
## 2 0 1 1 0 1
## 3 0 0 1 0 0
## 4 0 1 1 0 1
## 5 0 0 1 0 0
## 6 1 1 1 1 1
## Number.of.Motors Number.of.Car
## 1 2 1
## 2 3 0
## 3 3 0
## 4 3 0
## 5 2 0
## 6 2 1
str(MC)
## 'data.frame': 847 obs. of 30 variables:
## $ Travel.Mode : int 6 3 3 3 3 4 4 3 3 3 ...
## $ Bus.Stop.Condition : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Central.Area : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose : int 3 1 1 1 1 1 1 1 1 1 ...
## $ Frequency : int 3 4 4 4 4 4 4 4 4 4 ...
## $ Departure.Time : int 1 1 3 1 3 1 1 1 2 1 ...
## $ Distance : num 2 8 5 5 8 20 15 10 12 10 ...
## $ Travel.Period : num 10 15 15 10 15 30 20 25 30 25 ...
## $ Sidewalk.Clearance : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Lane.Separate : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Temporary.Stop.Number: int 0 1 1 1 0 0 0 0 1 1 ...
## $ Mode.Choice.Reason : int 4 2 4 2 2 5 5 2 2 2 ...
## $ Weather : int 1 3 1 1 3 3 3 3 1 3 ...
## $ Weekend : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Non.Bus.Reason : int 1 1 2 4 2 2 4 2 1 4 ...
## $ Cost : int 12 8 5 5 8 40 30 10 12 10 ...
## $ Bus.Stop.Present : int 1 2 1 1 2 2 2 1 1 1 ...
## $ Gender : int 0 0 1 1 0 1 1 0 0 1 ...
## $ Age : int 5 3 3 3 4 4 5 4 3 4 ...
## $ Occupation : int 4 6 2 2 6 3 3 7 7 3 ...
## $ Income : int 2 3 4 3 3 4 4 2 3 3 ...
## $ Number.of.Children : int 2 1 0 0 1 2 2 0 1 0 ...
## $ Motor.Certificate : int 0 1 1 1 1 1 1 1 1 1 ...
## $ Car.Certificate : int 0 0 0 0 0 1 1 0 0 0 ...
## $ Bicycle.Owning : int 0 1 0 1 0 1 0 0 1 0 ...
## $ Motor.Owning : int 0 1 1 1 1 1 1 1 1 1 ...
## $ Car.Owning : int 0 0 0 0 0 1 1 0 0 0 ...
## $ Number.of.Bicycles : int 1 1 0 1 0 1 0 0 1 0 ...
## $ Number.of.Motors : int 2 3 3 3 2 2 2 2 3 3 ...
## $ Number.of.Car : int 1 0 0 0 0 1 1 0 0 0 ...
str(MC$Travel.Mode)
## int [1:847] 6 3 3 3 3 4 4 3 3 3 ...
# View data (number of rows and colume)
dim(MC)
## [1] 847 30
# Sumerize Data
summary(MC)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose
## Min. :1.000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000
## Median :3.000 Median :1.0000 Median :0.000 Median :1.000
## Mean :3.646 Mean :0.5089 Mean :0.477 Mean :1.568
## 3rd Qu.:4.000 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :7.000 Max. :1.0000 Max. :1.000 Max. :4.000
##
## Frequency Departure.Time Distance Travel.Period
## Min. :1.000 Min. :1.000 Min. : 0.015 Min. : 0.00
## 1st Qu.:4.000 1st Qu.:1.000 1st Qu.: 3.000 1st Qu.: 10.00
## Median :4.000 Median :1.000 Median : 5.000 Median : 15.00
## Mean :3.548 Mean :1.843 Mean : 6.492 Mean : 17.42
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 20.00
## Max. :4.000 Max. :4.000 Max. :35.000 Max. :180.00
##
## Sidewalk.Clearance Lane.Separate Temporary.Stop.Number Mode.Choice.Reason
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:2.000
## Median :1.0000 Median :1.0000 Median :0.0000 Median :2.000
## Mean :0.8666 Mean :0.7898 Mean :0.6824 Mean :2.713
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :1.0000 Max. :1.0000 Max. :3.0000 Max. :6.000
##
## Weather Weekend Non.Bus.Reason Cost
## Min. :1.000 Min. :0.0000 Min. :1.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.: 3.000
## Median :1.000 Median :0.0000 Median :2.000 Median : 5.000
## Mean :1.646 Mean :0.2444 Mean :2.912 Mean : 8.685
## 3rd Qu.:3.000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.: 10.000
## Max. :3.000 Max. :1.0000 Max. :6.000 Max. :285.000
## NA's :110
## Bus.Stop.Present Gender Age Occupation
## Min. :0.000 Min. :0.0000 Min. :1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:3.00 1st Qu.:2.000
## Median :1.000 Median :1.0000 Median :4.00 Median :3.000
## Mean :1.004 Mean :0.5832 Mean :3.56 Mean :4.152
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:4.00 3rd Qu.:7.000
## Max. :2.000 Max. :1.0000 Max. :6.00 Max. :8.000
##
## Income Number.of.Children Motor.Certificate Car.Certificate
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000
## Median :2.000 Median :1.0000 Median :1.0000 Median :0.000
## Mean :2.205 Mean :0.7674 Mean :0.8453 Mean :0.183
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :4.000 Max. :3.0000 Max. :1.0000 Max. :1.000
##
## Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.4109 Mean :0.7851 Mean :0.1405 Mean :0.7084
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :3.0000
##
## Number.of.Motors Number.of.Car
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :0.0000
## Mean :2.118 Mean :0.2456
## 3rd Qu.:3.000 3rd Qu.:0.0000
## Max. :4.000 Max. :2.0000
##
# Create dat - MotorBus (Bus và xe máy)
datMB <- MC[MC$Travel.Mode %in% c(3,7),]
head(datMB)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time
## 2 3 1 0 1 4 1
## 3 3 1 0 1 4 3
## 4 3 1 0 1 4 1
## 5 3 1 0 1 4 3
## 8 3 1 0 1 4 1
## 9 3 1 0 1 4 2
## Distance Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 2 8 15 1 1 1
## 3 5 15 1 1 1
## 4 5 10 1 1 1
## 5 8 15 1 1 0
## 8 10 25 1 1 0
## 9 12 30 1 1 1
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 2 2 3 0 1 8 2
## 3 4 1 0 2 5 1
## 4 2 1 0 4 5 1
## 5 2 3 0 2 8 2
## 8 2 3 1 2 10 1
## 9 2 1 0 1 12 1
## Gender Age Occupation Income Number.of.Children Motor.Certificate
## 2 0 3 6 3 1 1
## 3 1 3 2 4 0 1
## 4 1 3 2 3 0 1
## 5 0 4 6 3 1 1
## 8 0 4 7 2 0 1
## 9 0 3 7 3 1 1
## Car.Certificate Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## 2 0 1 1 0 1
## 3 0 0 1 0 0
## 4 0 1 1 0 1
## 5 0 0 1 0 0
## 8 0 0 1 0 0
## 9 0 1 1 0 1
## Number.of.Motors Number.of.Car
## 2 3 0
## 3 3 0
## 4 3 0
## 5 2 0
## 8 2 0
## 9 3 0
dim(datMB)
## [1] 628 30
datMB$Bus[datMB$Travel.Mode == 3] <- 0
datMB$Bus[datMB$Travel.Mode == 7] <- 1
head(datMB)
## Travel.Mode Bus.Stop.Condition Central.Area Purpose Frequency Departure.Time
## 2 3 1 0 1 4 1
## 3 3 1 0 1 4 3
## 4 3 1 0 1 4 1
## 5 3 1 0 1 4 3
## 8 3 1 0 1 4 1
## 9 3 1 0 1 4 2
## Distance Travel.Period Sidewalk.Clearance Lane.Separate Temporary.Stop.Number
## 2 8 15 1 1 1
## 3 5 15 1 1 1
## 4 5 10 1 1 1
## 5 8 15 1 1 0
## 8 10 25 1 1 0
## 9 12 30 1 1 1
## Mode.Choice.Reason Weather Weekend Non.Bus.Reason Cost Bus.Stop.Present
## 2 2 3 0 1 8 2
## 3 4 1 0 2 5 1
## 4 2 1 0 4 5 1
## 5 2 3 0 2 8 2
## 8 2 3 1 2 10 1
## 9 2 1 0 1 12 1
## Gender Age Occupation Income Number.of.Children Motor.Certificate
## 2 0 3 6 3 1 1
## 3 1 3 2 4 0 1
## 4 1 3 2 3 0 1
## 5 0 4 6 3 1 1
## 8 0 4 7 2 0 1
## 9 0 3 7 3 1 1
## Car.Certificate Bicycle.Owning Motor.Owning Car.Owning Number.of.Bicycles
## 2 0 1 1 0 1
## 3 0 0 1 0 0
## 4 0 1 1 0 1
## 5 0 0 1 0 0
## 8 0 0 1 0 0
## 9 0 1 1 0 1
## Number.of.Motors Number.of.Car Bus
## 2 3 0 0
## 3 3 0 0
## 4 3 0 0
## 5 2 0 0
## 8 2 0 0
## 9 3 0 0
dim(datMB)
## [1] 628 31
str(datMB)
## 'data.frame': 628 obs. of 31 variables:
## $ Travel.Mode : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Bus.Stop.Condition : int 1 1 1 1 1 1 1 1 0 0 ...
## $ Central.Area : int 0 0 0 0 0 0 0 0 1 1 ...
## $ Purpose : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Frequency : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Departure.Time : int 1 3 1 3 1 2 1 1 1 1 ...
## $ Distance : num 8 5 5 8 10 12 10 3 2 4 ...
## $ Travel.Period : num 15 15 10 15 25 30 25 5 5 10 ...
## $ Sidewalk.Clearance : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Lane.Separate : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Temporary.Stop.Number: int 1 1 1 0 0 1 1 0 1 0 ...
## $ Mode.Choice.Reason : int 2 4 2 2 2 2 2 2 4 3 ...
## $ Weather : int 3 1 1 3 3 1 3 1 3 1 ...
## $ Weekend : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Non.Bus.Reason : int 1 2 4 2 2 1 4 4 4 2 ...
## $ Cost : int 8 5 5 8 10 12 10 3 2 4 ...
## $ Bus.Stop.Present : int 2 1 1 2 1 1 1 1 1 1 ...
## $ Gender : int 0 1 1 0 0 0 1 1 1 1 ...
## $ Age : int 3 3 3 4 4 3 4 2 4 3 ...
## $ Occupation : int 6 2 2 6 7 7 3 1 7 7 ...
## $ Income : int 3 4 3 3 2 3 3 2 2 1 ...
## $ Number.of.Children : int 1 0 0 1 0 1 0 0 1 0 ...
## $ Motor.Certificate : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Car.Certificate : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bicycle.Owning : int 1 0 1 0 0 1 0 1 0 0 ...
## $ Motor.Owning : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Car.Owning : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Number.of.Bicycles : int 1 0 1 0 0 1 0 1 2 0 ...
## $ Number.of.Motors : int 3 3 3 2 2 3 3 3 2 1 ...
## $ Number.of.Car : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bus : num 0 0 0 0 0 0 0 0 0 0 ...
# Vẽ biểu đồ reason why choose bus or doesn't choose bus
library(magrittr)
library(tidyverse)
library(ggplot2)
library(car)
## Lấy 1 phần dữ liệu gồm Bus, Mode.Choice.Reason, Non.Bus.Reason
dat1 <- datMB[,c(12,15,31)]
head(dat1)
## Mode.Choice.Reason Non.Bus.Reason Bus
## 2 2 1 0
## 3 4 2 0
## 4 2 4 0
## 5 2 2 0
## 8 2 2 0
## 9 2 1 0
dim(dat1)
## [1] 628 3
## Dữ liệu sử dụng xe buýt
datBusonly <- subset(dat1, Bus == 1)
dim(datBusonly)
## [1] 110 3
datBusonly$Bus.Choice.Reason <- datBusonly$Mode.Choice.Reason
head(datBusonly)
## Mode.Choice.Reason Non.Bus.Reason Bus Bus.Choice.Reason
## 60 1 NA 1 1
## 68 3 NA 1 3
## 128 1 NA 1 1
## 247 2 NA 1 2
## 280 2 NA 1 2
## 344 1 NA 1 1
str(datBusonly)
## 'data.frame': 110 obs. of 4 variables:
## $ Mode.Choice.Reason: int 1 3 1 2 2 1 2 1 1 3 ...
## $ Non.Bus.Reason : int NA NA NA NA NA NA NA NA NA NA ...
## $ Bus : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Bus.Choice.Reason : int 1 3 1 2 2 1 2 1 1 3 ...
## reformat variables in factor variables
attach(datBusonly)
## The following objects are masked from datMB (pos = 39):
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
## The following objects are masked from datMB (pos = 40):
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
datBusonly = within(datBusonly, {
Bus.Choice.Reason = factor(Bus.Choice.Reason, labels = c("Safety", "Comfortable", "Low price", "Accessibility", "others"))
} )
head(datBusonly)
## Mode.Choice.Reason Non.Bus.Reason Bus Bus.Choice.Reason
## 60 1 NA 1 Safety
## 68 3 NA 1 Low price
## 128 1 NA 1 Safety
## 247 2 NA 1 Comfortable
## 280 2 NA 1 Comfortable
## 344 1 NA 1 Safety
str(datBusonly)
## 'data.frame': 110 obs. of 4 variables:
## $ Mode.Choice.Reason: int 1 3 1 2 2 1 2 1 1 3 ...
## $ Non.Bus.Reason : int NA NA NA NA NA NA NA NA NA NA ...
## $ Bus : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Bus.Choice.Reason : Factor w/ 5 levels "Safety","Comfortable",..: 1 3 1 2 2 1 2 1 1 3 ...
dim(datBusonly)
## [1] 110 4
## Reasons choose bus
p.bus = ggplot(datBusonly, aes(x = Bus.Choice.Reason, fill = Bus.Choice.Reason))
p.bus = p.bus + geom_bar(width = 0.8, col = "blue")
p.bus = p.bus + theme(legend.position = "none")
p.bus = p.bus + coord_flip()
p.bus
# Dữ liệu không sử dụng xe buýt - sử dụng xe máy
datMotoronly <- subset(dat1, Bus == 0)
head(datMotoronly)
## Mode.Choice.Reason Non.Bus.Reason Bus
## 2 2 1 0
## 3 4 2 0
## 4 2 4 0
## 5 2 2 0
## 8 2 2 0
## 9 2 1 0
str(datMotoronly)
## 'data.frame': 518 obs. of 3 variables:
## $ Mode.Choice.Reason: int 2 4 2 2 2 2 2 2 4 3 ...
## $ Non.Bus.Reason : int 1 2 4 2 2 1 4 4 4 2 ...
## $ Bus : num 0 0 0 0 0 0 0 0 0 0 ...
attach(datMotoronly)
## The following objects are masked from datBusonly:
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
##
## The following objects are masked from datMB (pos = 40):
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
## The following objects are masked from datMB (pos = 41):
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
datMotoronly = within(datMotoronly, {
Non.Bus.Reason = factor(Non.Bus.Reason, labels = c("No Route", "Inconvenient", "Unsafety", "Long waiting time", "Unreliability", "others"))
} )
str(datMotoronly)
## 'data.frame': 518 obs. of 3 variables:
## $ Mode.Choice.Reason: int 2 4 2 2 2 2 2 2 4 3 ...
## $ Non.Bus.Reason : Factor w/ 6 levels "No Route","Inconvenient",..: 1 2 4 2 2 1 4 4 4 2 ...
## $ Bus : num 0 0 0 0 0 0 0 0 0 0 ...
dim(datMotoronly)
## [1] 518 3
## Reasons motor users not choose bus
p.nonbus = ggplot(datMotoronly, aes(x = Non.Bus.Reason, fill = Non.Bus.Reason))
p.nonbus = p.nonbus + geom_bar(width = 0.8, col = "blue")
p.nonbus = p.nonbus + theme(legend.position = "none")
p.nonbus = p.nonbus + coord_flip()
p.nonbus
## Reasons choose motor
datMotoronly$Motor.Choice.Reason <- datMotoronly$Mode.Choice.Reason
head(datMotoronly)
## Mode.Choice.Reason Non.Bus.Reason Bus Motor.Choice.Reason
## 2 2 No Route 0 2
## 3 4 Inconvenient 0 4
## 4 2 Long waiting time 0 2
## 5 2 Inconvenient 0 2
## 8 2 Inconvenient 0 2
## 9 2 No Route 0 2
str(datMotoronly)
## 'data.frame': 518 obs. of 4 variables:
## $ Mode.Choice.Reason : int 2 4 2 2 2 2 2 2 4 3 ...
## $ Non.Bus.Reason : Factor w/ 6 levels "No Route","Inconvenient",..: 1 2 4 2 2 1 4 4 4 2 ...
## $ Bus : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Motor.Choice.Reason: int 2 4 2 2 2 2 2 2 4 3 ...
dim(datMotoronly)
## [1] 518 4
attach(datMotoronly)
## The following objects are masked from datMotoronly (pos = 3):
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
## The following objects are masked from datBusonly:
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
## The following objects are masked from datMB (pos = 41):
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
## The following objects are masked from datMB (pos = 42):
##
## Bus, Mode.Choice.Reason, Non.Bus.Reason
datMotoronly = within(datMotoronly, {
Motor.Choice.Reason = factor(Motor.Choice.Reason, labels = c("Safety", "Comfortable", "Low price", "Accessibility", "Reliability", "others"))
} )
head(datMotoronly)
## Mode.Choice.Reason Non.Bus.Reason Bus Motor.Choice.Reason
## 2 2 No Route 0 Comfortable
## 3 4 Inconvenient 0 Accessibility
## 4 2 Long waiting time 0 Comfortable
## 5 2 Inconvenient 0 Comfortable
## 8 2 Inconvenient 0 Comfortable
## 9 2 No Route 0 Comfortable
str(datMotoronly)
## 'data.frame': 518 obs. of 4 variables:
## $ Mode.Choice.Reason : int 2 4 2 2 2 2 2 2 4 3 ...
## $ Non.Bus.Reason : Factor w/ 6 levels "No Route","Inconvenient",..: 1 2 4 2 2 1 4 4 4 2 ...
## $ Bus : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Motor.Choice.Reason: Factor w/ 6 levels "Safety","Comfortable",..: 2 4 2 2 2 2 2 2 4 3 ...
dim(datMotoronly)
## [1] 518 4
p.motor = ggplot(datMotoronly, aes(x = Motor.Choice.Reason, fill = Motor.Choice.Reason))
p.motor = p.motor + geom_bar(width = 0.8, col = "blue")
p.motor = p.motor + theme(legend.position = "none")
p.motor = p.motor + coord_flip()
p.motor