This data set is collected by a survey about on air passenger
satisfaction.
Link of my resource Kaggle.com
The following classification problem is set:
It is necessary to predict which of the two levels of satisfaction with the airline the passenger belongs to:
1. Satisfaction (54.8%)
2. dissatisfied (45.2%)
In this project, we will apply various models:
-Logistic regression
-Factorial Discriminant Analysis
-Decision Tree
-RandomForest
There is the following information about the passengers of some airline:
#import data
df=read.csv("C:/Users/Safoueen/Desktop/Invistico_Airline.csv")
#view data
head(df)
## satisfaction Gender Customer.Type Age Type.of.Travel Class
## 1 satisfied Female Loyal Customer 65 Personal Travel Eco
## 2 satisfied Male Loyal Customer 47 Personal Travel Business
## 3 satisfied Female Loyal Customer 15 Personal Travel Eco
## 4 satisfied Female Loyal Customer 60 Personal Travel Eco
## 5 satisfied Female Loyal Customer 70 Personal Travel Eco
## 6 satisfied Male Loyal Customer 30 Personal Travel Eco
## Flight.Distance Seat.comfort Departure.Arrival.time.convenient Food.and.drink
## 1 265 0 0 0
## 2 2464 0 0 0
## 3 2138 0 0 0
## 4 623 0 0 0
## 5 354 0 0 0
## 6 1894 0 0 0
## Gate.location Inflight.wifi.service Inflight.entertainment Online.support
## 1 2 2 4 2
## 2 3 0 2 2
## 3 3 2 0 2
## 4 3 3 4 3
## 5 3 4 3 4
## 6 3 2 0 2
## Ease.of.Online.booking On.board.service Leg.room.service Baggage.handling
## 1 3 3 0 3
## 2 3 4 4 4
## 3 2 3 3 4
## 4 1 1 0 1
## 5 2 2 0 2
## 6 2 5 4 5
## Checkin.service Cleanliness Online.boarding Departure.Delay.in.Minutes
## 1 5 3 2 0
## 2 2 3 2 310
## 3 4 4 2 0
## 4 4 1 3 0
## 5 4 2 5 0
## 6 5 4 2 0
## Arrival.Delay.in.Minutes
## 1 0
## 2 305
## 3 0
## 4 0
## 5 0
## 6 0
I created a function to check the NAs values in each column or the data frame
#function to get the type, unique items and the NA count of every column in the dataframe
stats=function(df){
l=c()
for(i in 1:length(df)){
type=class(df[,i])
unique=length(unique(df[,i]))
sum_null=sum(is.na(df[,i]))
l=append(l,c(colnames(df)[i],type,unique,sum_null))
}
df_stats=matrix(l,ncol=4,byrow=T)
colnames(df_stats)=c('column','type','unique','sum_null')
return (df_stats)
}
#applying the stats function
stats(df)
## column type unique sum_null
## [1,] "satisfaction" "character" "2" "0"
## [2,] "Gender" "character" "2" "0"
## [3,] "Customer.Type" "character" "2" "0"
## [4,] "Age" "integer" "75" "0"
## [5,] "Type.of.Travel" "character" "2" "0"
## [6,] "Class" "character" "3" "0"
## [7,] "Flight.Distance" "integer" "5398" "0"
## [8,] "Seat.comfort" "integer" "6" "0"
## [9,] "Departure.Arrival.time.convenient" "integer" "6" "0"
## [10,] "Food.and.drink" "integer" "6" "0"
## [11,] "Gate.location" "integer" "6" "0"
## [12,] "Inflight.wifi.service" "integer" "6" "0"
## [13,] "Inflight.entertainment" "integer" "6" "0"
## [14,] "Online.support" "integer" "6" "0"
## [15,] "Ease.of.Online.booking" "integer" "6" "0"
## [16,] "On.board.service" "integer" "6" "0"
## [17,] "Leg.room.service" "integer" "6" "0"
## [18,] "Baggage.handling" "integer" "5" "0"
## [19,] "Checkin.service" "integer" "6" "0"
## [20,] "Cleanliness" "integer" "6" "0"
## [21,] "Online.boarding" "integer" "6" "0"
## [22,] "Departure.Delay.in.Minutes" "integer" "466" "0"
## [23,] "Arrival.Delay.in.Minutes" "integer" "473" "393"
As we can see, only the last variable Arrival.Delay.in.Minutes which has NAs values, so we will change the NAs with the same value as Departure.Delay.in.Minutes because i noticed that most values of the two variables are equals or close.
#change the NA values with the same value of arrival delay values
df$Arrival.Delay.in.Minutes=ifelse(is.na(df$Arrival.Delay.in.Minutes),df$Departure.Delay.in.Minutes,df$Arrival.Delay.in.Minutes)
now we check if still any NA values.
#re-check the NA values
stats(df)
## column type unique sum_null
## [1,] "satisfaction" "character" "2" "0"
## [2,] "Gender" "character" "2" "0"
## [3,] "Customer.Type" "character" "2" "0"
## [4,] "Age" "integer" "75" "0"
## [5,] "Type.of.Travel" "character" "2" "0"
## [6,] "Class" "character" "3" "0"
## [7,] "Flight.Distance" "integer" "5398" "0"
## [8,] "Seat.comfort" "integer" "6" "0"
## [9,] "Departure.Arrival.time.convenient" "integer" "6" "0"
## [10,] "Food.and.drink" "integer" "6" "0"
## [11,] "Gate.location" "integer" "6" "0"
## [12,] "Inflight.wifi.service" "integer" "6" "0"
## [13,] "Inflight.entertainment" "integer" "6" "0"
## [14,] "Online.support" "integer" "6" "0"
## [15,] "Ease.of.Online.booking" "integer" "6" "0"
## [16,] "On.board.service" "integer" "6" "0"
## [17,] "Leg.room.service" "integer" "6" "0"
## [18,] "Baggage.handling" "integer" "5" "0"
## [19,] "Checkin.service" "integer" "6" "0"
## [20,] "Cleanliness" "integer" "6" "0"
## [21,] "Online.boarding" "integer" "6" "0"
## [22,] "Departure.Delay.in.Minutes" "integer" "466" "0"
## [23,] "Arrival.Delay.in.Minutes" "integer" "475" "0"
Starting with a full summary of the dataframe.
#data summary
summary(df)
## satisfaction Gender Customer.Type Age
## Length:129880 Length:129880 Length:129880 Min. : 7.00
## Class :character Class :character Class :character 1st Qu.:27.00
## Mode :character Mode :character Mode :character Median :40.00
## Mean :39.43
## 3rd Qu.:51.00
## Max. :85.00
## Type.of.Travel Class Flight.Distance Seat.comfort
## Length:129880 Length:129880 Min. : 50 Min. :0.000
## Class :character Class :character 1st Qu.:1359 1st Qu.:2.000
## Mode :character Mode :character Median :1925 Median :3.000
## Mean :1981 Mean :2.839
## 3rd Qu.:2544 3rd Qu.:4.000
## Max. :6951 Max. :5.000
## Departure.Arrival.time.convenient Food.and.drink Gate.location
## Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.00
## Median :3.000 Median :3.000 Median :3.00
## Mean :2.991 Mean :2.852 Mean :2.99
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.00
## Max. :5.000 Max. :5.000 Max. :5.00
## Inflight.wifi.service Inflight.entertainment Online.support
## Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.00
## Median :3.000 Median :4.000 Median :4.00
## Mean :3.249 Mean :3.383 Mean :3.52
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.00
## Max. :5.000 Max. :5.000 Max. :5.00
## Ease.of.Online.booking On.board.service Leg.room.service Baggage.handling
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.472 Mean :3.465 Mean :3.486 Mean :3.696
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Checkin.service Cleanliness Online.boarding Departure.Delay.in.Minutes
## Min. :0.000 Min. :0.000 Min. :0.000 Min. : 0.00
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 0.00
## Median :3.000 Median :4.000 Median :4.000 Median : 0.00
## Mean :3.341 Mean :3.706 Mean :3.353 Mean : 14.71
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.: 12.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :1592.00
## Arrival.Delay.in.Minutes
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 15.16
## 3rd Qu.: 13.00
## Max. :1584.00
Next we have some visualizations of certain variables of the dataframe.
#pie chart plot by proportion of satisfaction
data.pie=data.frame(df%>%
group_by(satisfaction)%>%
summarise(count=n()))
ggplot(data.pie, aes(x="", y=count, fill=satisfaction)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
geom_text(aes(label = count),
position = position_stack(vjust = 0.5)) +
theme_void()
Here we have the distribution of the satisfied and the dissatisfied clients we can see more satisfied than the dissatisfied ones.
#histogram chart of satisfaction by the flight distance
ggplot(df, aes(x=Flight.Distance, fill=satisfaction)) +
geom_histogram(alpha=0.5,bins=300,position='identity')
Here is the distribution of the costumers dependently on the distance of
the flight where we have in the majority of the short-distance flights,
the clients are satisfied but for the mid-range distance between
1100 and 2500, the clients are not
satisfied and then for the long-range we have back the dominance of the
satisfaction.
#histogram chart of customer type by the flight distance
ggplot(df, aes(x=Flight.Distance, fill=Customer.Type)) +
geom_histogram(alpha=0.5,bins=300,position='identity')
Here for the loyalty of the customers, we have the majority of the
customers are loyal for the company and the majority of the disloyal
customers are getting mid-range flights.
#density chart of satisfaction by age
ggplot(df, aes(x=Age, color=satisfaction)) +geom_density()
for ages, we have mostly independence between satisfaction and the age factor, except for the range of ages between 38 and 62 where we have high satisfaction rate and slightly higher dissatisfaction for younger ages.
#bar chart of satisfaction by the class of customers
ggplot(df,aes(x=Class,fill=satisfaction))+geom_bar()
As u can see, most of the costumers are with business
class but most of the dissatisfied costumers are taking the
Eco class and just few of them are in eco
plus class.
# Analysing Data
For this part we will be taking a look on on the PCA but firstly we need to select the categorical variables. for this purpose I created a function to select those variables dependently on the number of unique values.
discrete= function(x){
length(unique(x)) <= 6
}
sapply(df, discrete)
## satisfaction Gender
## TRUE TRUE
## Customer.Type Age
## TRUE FALSE
## Type.of.Travel Class
## TRUE TRUE
## Flight.Distance Seat.comfort
## FALSE TRUE
## Departure.Arrival.time.convenient Food.and.drink
## TRUE TRUE
## Gate.location Inflight.wifi.service
## TRUE TRUE
## Inflight.entertainment Online.support
## TRUE TRUE
## Ease.of.Online.booking On.board.service
## TRUE TRUE
## Leg.room.service Baggage.handling
## TRUE TRUE
## Checkin.service Cleanliness
## TRUE TRUE
## Online.boarding Departure.Delay.in.Minutes
## TRUE FALSE
## Arrival.Delay.in.Minutes
## FALSE
when we look at the stats function we could see that the required variables have maximum 6 levels and with the use of the sapply function we get the result above of the required variables then we select the chosen variables from the dataframe.
pca_df=df[,sapply(df,discrete)]
head(pca_df)
## satisfaction Gender Customer.Type Type.of.Travel Class Seat.comfort
## 1 satisfied Female Loyal Customer Personal Travel Eco 0
## 2 satisfied Male Loyal Customer Personal Travel Business 0
## 3 satisfied Female Loyal Customer Personal Travel Eco 0
## 4 satisfied Female Loyal Customer Personal Travel Eco 0
## 5 satisfied Female Loyal Customer Personal Travel Eco 0
## 6 satisfied Male Loyal Customer Personal Travel Eco 0
## Departure.Arrival.time.convenient Food.and.drink Gate.location
## 1 0 0 2
## 2 0 0 3
## 3 0 0 3
## 4 0 0 3
## 5 0 0 3
## 6 0 0 3
## Inflight.wifi.service Inflight.entertainment Online.support
## 1 2 4 2
## 2 0 2 2
## 3 2 0 2
## 4 3 4 3
## 5 4 3 4
## 6 2 0 2
## Ease.of.Online.booking On.board.service Leg.room.service Baggage.handling
## 1 3 3 0 3
## 2 3 4 4 4
## 3 2 3 3 4
## 4 1 1 0 1
## 5 2 2 0 2
## 6 2 5 4 5
## Checkin.service Cleanliness Online.boarding
## 1 5 3 2
## 2 2 3 2
## 3 4 4 2
## 4 4 1 3
## 5 4 2 5
## 6 5 4 2
as u can see, the first 5 columns are string categorical variables
and the PCA function works on the numerical variables so if we apply the
as.numeric function directly on the dataframe, all the
string variables will become NAs.
So my solution was to apply firstly as.factor then
apply the as.numeric, that way, all the string
categorical variables will be numeric categorical automatically.
pca_df[]=lapply(pca_df,factor)
pca_df[]=lapply(pca_df,as.integer)
head(pca_df)
## satisfaction Gender Customer.Type Type.of.Travel Class Seat.comfort
## 1 2 1 2 2 2 1
## 2 2 2 2 2 1 1
## 3 2 1 2 2 2 1
## 4 2 1 2 2 2 1
## 5 2 1 2 2 2 1
## 6 2 2 2 2 2 1
## Departure.Arrival.time.convenient Food.and.drink Gate.location
## 1 1 1 3
## 2 1 1 4
## 3 1 1 4
## 4 1 1 4
## 5 1 1 4
## 6 1 1 4
## Inflight.wifi.service Inflight.entertainment Online.support
## 1 3 5 3
## 2 1 3 3
## 3 3 1 3
## 4 4 5 4
## 5 5 4 5
## 6 3 1 3
## Ease.of.Online.booking On.board.service Leg.room.service Baggage.handling
## 1 4 4 1 3
## 2 4 5 5 4
## 3 3 4 4 4
## 4 2 2 1 1
## 5 3 3 1 2
## 6 3 6 5 5
## Checkin.service Cleanliness Online.boarding
## 1 6 4 3
## 2 3 4 3
## 3 5 5 3
## 4 5 2 4
## 5 5 3 6
## 6 6 5 3
and the problem is solved but we have another problem which the large number of rows (129880) that may cause a problem for the PCA function so it is required to make changes on the data to make it easier.
pca_df2=summaryBy(.~ satisfaction+Gender+Customer.Type+Type.of.Travel+Class,data = pca_df,FUN=c(mean),keep.names = TRUE)
dim(pca_df2)
## [1] 43 19
Now we have a huge reduction of the large number of rows which is 43 now. and the data is ready for applying the PCA function.
c=cor(pca_df2)
corrplot(c, type="upper", order="hclust",col=brewer.pal(n=8, name="RdBu"))
using the correlation plot, we remark some interesting correlations between some variables.
acp=PCA(pca_df2,quali.sup = c(1:5),graph=F)
acp$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 6.313287177 45.09490840 45.09491
## comp 2 2.787664699 19.91189071 65.00680
## comp 3 2.128877348 15.20626677 80.21307
## comp 4 1.011249588 7.22321135 87.43628
## comp 5 0.656924237 4.69231598 92.12859
## comp 6 0.448076690 3.20054779 95.32914
## comp 7 0.262922187 1.87801562 97.20716
## comp 8 0.195015696 1.39296926 98.60013
## comp 9 0.096628894 0.69020638 99.29033
## comp 10 0.037298670 0.26641907 99.55675
## comp 11 0.030826578 0.22018984 99.77694
## comp 12 0.016403057 0.11716469 99.89411
## comp 13 0.008019785 0.05728418 99.95139
## comp 14 0.006805394 0.04860996 100.00000
fviz_screeplot(acp, ncp=10)
for this PCA, we will be taking 5 axes for better clustering results
later.
fviz_pca_ind(acp)
fviz_pca_var(acp)
For now, we could not interpret anything and we need to perform some
clustering methods for better results.
for this part, we will be using the NbClust method.
NbClust(acp$ind$coord,method='ward.D')$Best.partition
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 2 proposed 3 as the best number of clusters
## * 7 proposed 4 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 3 proposed 14 as the best number of clusters
## * 5 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 4
##
##
## *******************************************************************
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 1 2 2 1 1 2 2 2 3 3 3 1 2 2 1 1 1 2 2 2 1 1 1 3 3 3
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## 3 4 4 4 4 4 4 3 3 3 3 4 4 4 3 3 3
As the function gives, the best number of classes is 4.
d=dist(acp$ind$coord)
clust=hclust(d,method="ward.D")
dend=as.dendrogram(clust)
plot(dend)
by the graph, the number of classes is between 2 or
4. but we will be using the result of the
NbClust function.
clusters=cutree(clust,k=4)
df_final=cbind(pca_df2,Classes=as.factor(clusters))
fviz_pca_ind(acp,col.ind=df_final$Classes)
This is the second method of clustering.
km=kmeans(as.matrix(df_final), centers = 4)
km
## K-means clustering with 4 clusters of sizes 9, 10, 14, 10
##
## Cluster means:
## satisfaction Gender Customer.Type Type.of.Travel Class Seat.comfort
## 1 2.000000 1.333333 2.000000 1.333333 2.0 4.466273
## 2 1.000000 1.500000 1.600000 1.000000 2.2 3.401609
## 3 1.785714 1.500000 1.428571 1.571429 2.0 4.673429
## 4 1.000000 1.700000 1.300000 1.800000 1.7 3.485829
## Departure.Arrival.time.convenient Food.and.drink Gate.location
## 1 3.968077 3.931826 3.988933
## 2 3.680298 3.771885 4.060349
## 3 4.250254 4.609245 3.911851
## 4 4.483059 3.437393 3.899266
## Inflight.wifi.service Inflight.entertainment Online.support
## 1 4.663843 4.928928 4.964475
## 2 3.805119 3.584057 3.839691
## 3 4.071311 4.614756 4.142987
## 4 3.857962 3.504272 3.841796
## Ease.of.Online.booking On.board.service Leg.room.service Baggage.handling
## 1 5.067188 4.625432 4.628452 3.675507
## 2 3.687825 3.604744 3.908823 3.041054
## 3 4.029175 4.233420 4.356788 3.622989
## 4 3.880763 4.348836 4.178836 3.768918
## Checkin.service Cleanliness Online.boarding Classes
## 1 4.400804 4.685574 4.817029 4
## 2 3.597194 4.033010 3.711728 2
## 3 4.436412 4.614943 4.089940 3
## 4 4.257836 4.867092 3.867064 1
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 4 2 2 4 4 2 2 2 3 3 3 4 2 2 4 4 4 2 2 2 4 4 4 3 3 3
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## 3 1 1 1 1 1 1 3 3 3 3 1 1 1 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 21.28239 18.18594 38.17519 27.61126
## (between_SS / total_SS = 60.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
In order to compare the two method, we have to compare the result of clustering, for that reason we will be using the table function
table(km$cluster,clusters)
## clusters
## 1 2 3 4
## 1 0 0 0 9
## 2 0 10 0 0
## 3 0 0 14 0
## 4 10 0 0 0
as we can see, the two method gave the exact same results, just with different names. which make this distribution a perfect one.
for this part, we will use the catdes function.
catdes(df_final,num.var = length(df_final))
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## Inflight.entertainment 0.8401274 1.391398e-15
## Ease.of.Online.booking 0.8336022 3.024395e-15
## Seat.comfort 0.7967721 1.461181e-13
## satisfaction 0.7796584 6.998142e-13
## Food.and.drink 0.7552236 5.359465e-12
## Online.support 0.7209842 6.736772e-11
## Online.boarding 0.6889455 5.492831e-10
## Inflight.wifi.service 0.6469188 6.314475e-09
## Checkin.service 0.6406495 8.859885e-09
## On.board.service 0.5274790 1.687339e-06
## Leg.room.service 0.4286089 6.243420e-05
## Baggage.handling 0.4063092 1.286122e-04
## Cleanliness 0.3942368 1.879043e-04
## Type.of.Travel 0.3372180 1.013663e-03
## Departure.Arrival.time.convenient 0.2807364 4.614870e-03
## Customer.Type 0.2523496 9.384189e-03
## Gate.location 0.1982273 3.320153e-02
##
## Description of each cluster by quantitative variables
## =====================================================
## $`1`
## v.test Mean in category Overall mean
## Type.of.Travel 2.572800 1.800000 1.441860
## Departure.Arrival.time.convenient 2.387779 4.483058 4.112786
## Cleanliness 2.335235 4.867092 4.553032
## Inflight.wifi.service -2.045285 3.857962 4.083808
## Online.support -2.356454 3.841796 4.174348
## Seat.comfort -3.150832 3.485829 4.058113
## satisfaction -3.326739 1.000000 1.465116
## Inflight.entertainment -3.657668 3.504272 4.182563
## Food.and.drink -3.830197 3.437393 4.000201
## sd in category Overall sd p.value
## Type.of.Travel 0.4000000 0.4966083 0.0100879575
## Departure.Arrival.time.convenient 0.5914998 0.5532159 0.0169505159
## Cleanliness 0.3888512 0.4797878 0.0195311736
## Inflight.wifi.service 0.3275167 0.3939365 0.0408268020
## Online.support 0.3712193 0.5034637 0.0184503340
## Seat.comfort 0.2099391 0.6479685 0.0016280596
## satisfaction 0.0000000 0.4987816 0.0008786857
## Inflight.entertainment 0.3033502 0.6615759 0.0002545207
## Food.and.drink 0.2621474 0.5242120 0.0001280406
##
## $`2`
## v.test Mean in category Overall mean
## Gate.location 2.490437 4.060349 3.959592
## Online.support -2.371372 3.839691 4.174348
## Inflight.wifi.service -2.523841 3.805119 4.083808
## Departure.Arrival.time.convenient -2.788994 3.680298 4.112786
## Ease.of.Online.booking -2.914263 3.687825 4.132535
## Online.boarding -2.934899 3.711728 4.102334
## Type.of.Travel -3.174233 1.000000 1.441860
## Inflight.entertainment -3.227430 3.584057 4.182563
## satisfaction -3.326739 1.000000 1.465116
## Leg.room.service -3.380497 3.908823 4.268086
## Seat.comfort -3.614523 3.401609 4.058113
## Cleanliness -3.866698 4.033010 4.553032
## Baggage.handling -4.051534 3.041054 3.532585
## On.board.service -4.320262 3.604744 4.196106
## Checkin.service -5.080507 3.597194 4.192263
## sd in category Overall sd p.value
## Gate.location 0.05869400 0.1443337 1.275861e-02
## Online.support 0.22078488 0.5034637 1.772217e-02
## Inflight.wifi.service 0.22627080 0.3939365 1.160806e-02
## Departure.Arrival.time.convenient 0.35113022 0.5532159 5.287203e-03
## Ease.of.Online.booking 0.29632274 0.5443980 3.565296e-03
## Online.boarding 0.30399543 0.4748023 3.336562e-03
## Type.of.Travel 0.00000000 0.4966083 1.502328e-03
## Inflight.entertainment 0.20753618 0.6615759 1.249078e-03
## satisfaction 0.00000000 0.4987816 8.786857e-04
## Leg.room.service 0.22985542 0.3791394 7.235493e-04
## Seat.comfort 0.01734401 0.6479685 3.009015e-04
## Cleanliness 0.24437060 0.4797878 1.103191e-04
## Baggage.handling 0.24073204 0.4328113 5.088286e-05
## On.board.service 0.15034813 0.4883274 1.558439e-05
## Checkin.service 0.16301408 0.4178572 3.764283e-07
##
## $`3`
## v.test Mean in category Overall mean sd in category
## Food.and.drink 5.231559 4.609245 4.000201 0.2896602
## Seat.comfort 4.275968 4.673429 4.058113 0.2699608
## Inflight.entertainment 2.941624 4.614756 4.182563 0.2805695
## satisfaction 2.894277 1.785714 1.465116 0.4103259
## Checkin.service 2.630976 4.436412 4.192263 0.2410746
## Overall sd p.value
## Food.and.drink 0.5242120 1.680861e-07
## Seat.comfort 0.6479685 1.903089e-05
## Inflight.entertainment 0.6615759 3.264962e-03
## satisfaction 0.4987816 3.800333e-03
## Checkin.service 0.4178572 8.514008e-03
##
## $`4`
## v.test Mean in category Overall mean sd in category
## Ease.of.Online.booking 5.724537 5.067188 4.132535 0.1008173
## Online.support 5.232811 4.964475 4.174348 0.2428969
## Online.boarding 5.018970 4.817029 4.102334 0.2203912
## Inflight.wifi.service 4.909462 4.663842 4.083808 0.2766824
## Inflight.entertainment 3.761645 4.928928 4.182563 0.2485208
## satisfaction 3.575653 2.000000 1.465116 0.0000000
## Leg.room.service 3.169215 4.628452 4.268086 0.4029844
## Customer.Type 2.966727 2.000000 1.558140 0.0000000
## On.board.service 2.931451 4.625432 4.196106 0.3980407
## Seat.comfort 2.100310 4.466273 4.058113 0.4949797
## Overall sd p.value
## Ease.of.Online.booking 0.5443980 1.037164e-08
## Online.support 0.5034637 1.669510e-07
## Online.boarding 0.4748023 5.194925e-07
## Inflight.wifi.service 0.3939365 9.132645e-07
## Inflight.entertainment 0.6615759 1.687992e-04
## satisfaction 0.4987816 3.493548e-04
## Leg.room.service 0.3791394 1.528514e-03
## Customer.Type 0.4966083 3.009878e-03
## On.board.service 0.4883274 3.373823e-03
## Seat.comfort 0.6479685 3.570160e-02
Class 1 : represents the customers who are slightly
dissatisfied but the have a little bit of satisfaction just because it
is for personal travel.
Class 2 : represents the customers who are disappointed
and totally dissatisfied with everything in the flight
Class 3 : represents the customers who are slightly
satisfied with the flight and some of its services.
Class 4 : represents the customers who are fully
satisfied with everything in the flight and all its pre-flight services
and this class is for the loyal customers.
in this part we will split the whole data in two parts, one for
training the model and the other for testing it.
as we almost have 130000 observations in the dataframe
so will have 100000 for training and the rest for
testing.
#vector of indexes used to split the data into train and test sub-dataframes
x=sample(c(1:nrow(df)),100000)
#getting the new dataframes based on the indexes
df_train=df[x,]
df_test=df[-x,]
#verifying the length of the two dataframes
nrow(df_train)
## [1] 100000
nrow(df_test)
## [1] 29880
In this part we will use the step function to estimate the best equation to use on the glm model. but firstly we need to apply small change into the satisfaction variable.
#change the value of satisfaction from string to numeric type
df_train$satisfaction=as.factor(ifelse(df_train$satisfaction=='satisfied',1,0))
df_test$satisfaction=as.factor(ifelse(df_test$satisfaction=='satisfied',1,0))
Now we apply the glm model.
#glm model with the training dataframe
modele_glm=glm(satisfaction ~ .,family='binomial',df_train)
and here is a summary of the of all variable coefficients:
#modele summary
summary(modele_glm)
##
## Call:
## glm(formula = satisfaction ~ ., family = "binomial", data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9751 -0.5773 0.1925 0.5191 3.6282
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.823e+00 7.436e-02 -91.745 < 2e-16 ***
## GenderMale -9.487e-01 1.873e-02 -50.642 < 2e-16 ***
## Customer.TypeLoyal Customer 1.960e+00 2.846e-02 68.868 < 2e-16 ***
## Age -7.997e-03 6.497e-04 -12.308 < 2e-16 ***
## Type.of.TravelPersonal Travel -7.805e-01 2.672e-02 -29.209 < 2e-16 ***
## ClassEco -7.246e-01 2.415e-02 -30.010 < 2e-16 ***
## ClassEco Plus -8.094e-01 3.732e-02 -21.690 < 2e-16 ***
## Flight.Distance -1.141e-04 9.774e-06 -11.672 < 2e-16 ***
## Seat.comfort 2.846e-01 1.047e-02 27.173 < 2e-16 ***
## Departure.Arrival.time.convenient -1.956e-01 7.724e-03 -25.324 < 2e-16 ***
## Food.and.drink -2.183e-01 1.063e-02 -20.544 < 2e-16 ***
## Gate.location 1.188e-01 8.706e-03 13.641 < 2e-16 ***
## Inflight.wifi.service -7.204e-02 1.010e-02 -7.131 9.99e-13 ***
## Inflight.entertainment 6.888e-01 9.446e-03 72.917 < 2e-16 ***
## Online.support 9.550e-02 1.026e-02 9.304 < 2e-16 ***
## Ease.of.Online.booking 2.194e-01 1.325e-02 16.565 < 2e-16 ***
## On.board.service 3.128e-01 9.379e-03 33.351 < 2e-16 ***
## Leg.room.service 2.238e-01 7.999e-03 27.979 < 2e-16 ***
## Baggage.handling 1.178e-01 1.058e-02 11.133 < 2e-16 ***
## Checkin.service 2.959e-01 7.905e-03 37.429 < 2e-16 ***
## Cleanliness 7.731e-02 1.098e-02 7.038 1.95e-12 ***
## Online.boarding 1.673e-01 1.133e-02 14.772 < 2e-16 ***
## Departure.Delay.in.Minutes 3.068e-03 9.187e-04 3.339 0.00084 ***
## Arrival.Delay.in.Minutes -8.214e-03 9.046e-04 -9.080 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 137733 on 99999 degrees of freedom
## Residual deviance: 76983 on 99976 degrees of freedom
## AIC: 77031
##
## Number of Fisher Scoring iterations: 5
#stepwise function
step(modele_glm,direction='both')
## Start: AIC=77030.97
## satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel +
## Class + Flight.Distance + Seat.comfort + Departure.Arrival.time.convenient +
## Food.and.drink + Gate.location + Inflight.wifi.service +
## Inflight.entertainment + Online.support + Ease.of.Online.booking +
## On.board.service + Leg.room.service + Baggage.handling +
## Checkin.service + Cleanliness + Online.boarding + Departure.Delay.in.Minutes +
## Arrival.Delay.in.Minutes
##
## Df Deviance AIC
## <none> 76983 77031
## - Departure.Delay.in.Minutes 1 76994 77040
## - Cleanliness 1 77033 77079
## - Inflight.wifi.service 1 77034 77080
## - Arrival.Delay.in.Minutes 1 77066 77112
## - Online.support 1 77069 77115
## - Baggage.handling 1 77107 77153
## - Flight.Distance 1 77119 77165
## - Age 1 77135 77181
## - Gate.location 1 77170 77216
## - Online.boarding 1 77201 77247
## - Ease.of.Online.booking 1 77258 77304
## - Food.and.drink 1 77410 77456
## - Departure.Arrival.time.convenient 1 77632 77678
## - Seat.comfort 1 77733 77779
## - Leg.room.service 1 77767 77813
## - Type.of.Travel 1 77844 77890
## - Class 2 77998 78042
## - On.board.service 1 78114 78160
## - Checkin.service 1 78419 78465
## - Gender 1 79622 79668
## - Customer.Type 1 82133 82179
## - Inflight.entertainment 1 82835 82881
##
## Call: glm(formula = satisfaction ~ Gender + Customer.Type + Age + Type.of.Travel +
## Class + Flight.Distance + Seat.comfort + Departure.Arrival.time.convenient +
## Food.and.drink + Gate.location + Inflight.wifi.service +
## Inflight.entertainment + Online.support + Ease.of.Online.booking +
## On.board.service + Leg.room.service + Baggage.handling +
## Checkin.service + Cleanliness + Online.boarding + Departure.Delay.in.Minutes +
## Arrival.Delay.in.Minutes, family = "binomial", data = df_train)
##
## Coefficients:
## (Intercept) GenderMale
## -6.8225923 -0.9486538
## Customer.TypeLoyal Customer Age
## 1.9597902 -0.0079970
## Type.of.TravelPersonal Travel ClassEco
## -0.7805398 -0.7246104
## ClassEco Plus Flight.Distance
## -0.8093581 -0.0001141
## Seat.comfort Departure.Arrival.time.convenient
## 0.2845685 -0.1955998
## Food.and.drink Gate.location
## -0.2183493 0.1187589
## Inflight.wifi.service Inflight.entertainment
## -0.0720360 0.6887585
## Online.support Ease.of.Online.booking
## 0.0955004 0.2194085
## On.board.service Leg.room.service
## 0.3127895 0.2237870
## Baggage.handling Checkin.service
## 0.1177943 0.2958699
## Cleanliness Online.boarding
## 0.0773052 0.1673123
## Departure.Delay.in.Minutes Arrival.Delay.in.Minutes
## 0.0030678 -0.0082138
##
## Degrees of Freedom: 99999 Total (i.e. Null); 99976 Residual
## Null Deviance: 137700
## Residual Deviance: 76980 AIC: 77030
As we can see the result the step function, it kept all the given variables as they are all dependent to the target variable. now we will try with Wald Test to make sure about the result and will apply it to every possible set of combination of variables.
for( i in 1:24){
for (j in i:24){
w=wald.test(b = coef(modele_glm), Sigma = vcov(modele_glm),Terms=i:j)
if (w$result$chi2[3]>0.05){
print(w$result$chi2[3])
print(i)
print(j)
}
}
}
This function givens if there is any combination have a non
significant variables (with p-value >0.05) as we have the Null
hypothesis that all the selected variable equals to zero.
And this function didn’t return any value so that means that all the
variables are significant.
So we will keep the model as it is and we will proceed to the prediction function.
#predict the levels of satisfaction using predict function using the testing dataframe
pred=predict(modele_glm,df_test,type='response')
#change the predicted values from 0 to 1 range, to only two possible values with 0 and 1
pred_glm=ifelse(pred>0.5,1,0)
#calculate the accuracy and the precision of the model
tab=table(df_test$satisfaction,pred_glm)
print(paste("The accuracy = ",(tab[1,1]+tab[2,2])/sum(tab)))
## [1] "The accuracy = 0.835809906291834"
print(paste("The precision = ",tab[2,2]/(tab[1,2]+tab[2,2])))
## [1] "The precision = 0.849682384559003"
here is the result of the prediction shows that the accuracy and the
precision of the model are more than 80% which is high
enough to accept the model.
And here is the AUC graph of the glm model.
#perform the roc plot on the glm model
predicted=prediction(pred,df_test$satisfaction)
roc=performance(predicted, "tpr","fpr")
plot(roc)
auc=performance(predicted, "auc")
auc@y.values
## [[1]]
## [1] 0.9091909
#perform precision recall plot on the glm model
proc=performance(predicted, "prec","rec")
plot(proc)
aucpr=performance(predicted, "aucpr")
aucpr@y.values
## [[1]]
## [1] 0.9340118
Here is another method to make a model for prediction of the target variable with the factorial discriminant analysis using the lda function.
#lda model using the train dataframe
modele_lda=lda(satisfaction ~ .,df_train)
#predict the satisfaction levels using lda model
pred_lda=predict(modele_lda,prior=c(0.5,0.5),df_test)
#calculate the accuracy and the precision of the model
tab=table(df_test$satisfaction,pred_lda$class)
print(paste("The accuracy = ",(tab[1,1]+tab[2,2])/sum(tab)))
## [1] "The accuracy = 0.839825970548862"
print(paste("The precision = ",tab[2,2]/(tab[1,2]+tab[2,2])))
## [1] "The precision = 0.868330574302814"
This model gives an accuracy and precision also above 83% which means that this model is accurate as well.
#perform the roc plot on the lda model
predicted=prediction(pred_lda$posterior[,2],df_test$satisfaction)
roc=performance(predicted, "tpr","fpr")
plot(roc)
auc=performance(predicted, "auc")
auc@y.values
## [[1]]
## [1] 0.9095446
#perform precision recall plot on the lda model
proc=performance(predicted, "prec","rec")
plot(proc)
aucpr=performance(predicted, "aucpr")
aucpr@y.values
## [[1]]
## [1] 0.9337687
in this part we will create a decision tree model
#decision tree parameters
rc=rpart.control(minsplit = 20,cp=0.008,minbucket=1)
#performing a decision tree using training dataframe
tree=rpart(satisfaction~.,data=df_train,method="class",control = rc)
#predict satisfactions classes using the decision tree
pred_tree=predict(tree,df_test,type = "class")
#plot the tree
rpart.plot(tree,extra = 106)
##calculate the accuracy and the precision
tab=table(df_test$satisfaction,pred_tree)
print(paste("The accuracy = ",(tab[1,1]+tab[2,2])/sum(tab)))
## [1] "The accuracy = 0.882195448460509"
print(paste("The precision = ",tab[2,2]/(tab[1,2]+tab[2,2])))
## [1] "The precision = 0.880303389428775"
As the decision tree shows, the most important variable is
inflight entertainment then we have the seat
comfort and the ease of online booking
printcp(tree)
##
## Classification tree:
## rpart(formula = satisfaction ~ ., data = df_train, method = "class",
## control = rc)
##
## Variables actually used in tree construction:
## [1] Class Customer.Type Ease.of.Online.booking
## [4] Inflight.entertainment Online.support Seat.comfort
##
## Root node error: 45269/100000 = 0.45269
##
## n= 100000
##
## CP nsplit rel error xerror xstd
## 1 0.5631668 0 1.00000 1.00000 0.0034771
## 2 0.0515806 1 0.43683 0.43683 0.0027824
## 3 0.0443350 2 0.38525 0.38525 0.0026507
## 4 0.0133793 3 0.34092 0.34092 0.0025236
## 5 0.0095503 6 0.30078 0.30416 0.0024070
## 6 0.0094988 9 0.27213 0.28432 0.0023393
## 7 0.0080000 10 0.26263 0.26263 0.0022609
Here as you can see, the best number of splits is 10 as it minimizes the xerror and the xstd.
#perform the roc plot of the decision tree
pred=predict(tree,df_test,type = "prob")[,2]
predicted=prediction(pred,df_test$satisfaction)
roc=performance(predicted, "tpr","fpr")
plot(roc)
auc=performance(predicted, "auc")
auc@y.values
## [[1]]
## [1] 0.9164299
#perform the precision recall plot of the decision tree
proc=performance(predicted, "prec","rec")
plot(proc)
aucpr=performance(predicted, "aucpr")
aucpr@y.values
## [[1]]
## [1] 0.9095313
#calculate the best number of variables for the random forst with tuneRF function
tuneRF(df_train[setdiff(colnames(df_train),"satisfaction")],df_train$satisfaction,stepFactor=1.5,mtryStart=2, improve=0.01,ntree=20,trace=F)
## 0.1525091 0.01
## 0.0243473 0.01
## 0.06841031 0.01
## 0.02231894 0.01
## -0.01395697 0.01
## mtry OOBError
## 2.OOB 2 0.06451516
## 3.OOB 3 0.05467601
## 4.OOB 4 0.05334480
## 6.OOB 6 0.04969547
## 9.OOB 9 0.04858632
## 13.OOB 13 0.04926443
As the tuneRF function gives, its optimal to use
9 mtry variables as it has the least OOB error.
Now we will apply a random forest model:
#apply a random forest model with the train dataframe
modele_rf=randomForest(satisfaction~.,data=df_train,ntree=20,mtry=9)
#predict the satisfaction levels with predict function on the test dataframe
pred=predict(modele_rf,df_test,type="class")
#calculate the accuracy and the precision of the model
tab=table(df_test$satisfaction,pred)
print(paste("The accuracy = ",(tab[1,1]+tab[2,2])/sum(tab)))
## [1] "The accuracy = 0.956492637215529"
print(paste("The precision = ",tab[2,2]/(tab[1,2]+tab[2,2])))
## [1] "The precision = 0.970264867566217"
As you can see, we have a 95% accuracy of the model and 96% precision, which are really high and acceptable.
#apply varImpPlot on the model to have the most important variables
varImpPlot(modele_rf)
this graph shows the most important variables in the classification
model and it shows that Inflight entertainment is the
most crucial variable and that’s what we saw with the decision
tree.
And lastly we represent the AUC plot of this model.
#perform the roc plot of the random forest model
pred=predict(modele_rf,df_test,type="prob")[,2]
predicted=prediction(pred,df_test$satisfaction)
roc=performance(predicted, "tpr","fpr")
plot(roc)
auc=performance(predicted, "auc")
auc@y.values
## [[1]]
## [1] 0.992734
#perform the precision recall plot of the random forest
proc=performance(predicted, "prec","rec")
plot(proc)
aucpr=performance(predicted, "aucpr")
aucpr@y.values
## [[1]]
## [1] 0.9949422