import data and change column names
df = read.csv("satisfaction_survey.csv", stringsAsFactors=F)
new_colnames <- c('satisfaction', 'airline_status', 'age', 'gender', 'price_sensitivity', 'year_first_flight', 'num_flights', 'percent_flight_other_airlines', 'type_of_travel', 'num_loyalty_cards', 'airport_shopping', 'airport_dining', 'class', 'day_of_month', 'flight_date', 'airline_code', 'airline_name', 'origin_city', 'origin_state', 'destination_city', 'destination_state', 'scheduled_departure_hour', 'departure_delay_in_minutes', 'arrival_delay_in_minutes', 'flight_cancelled', 'flight_time_in_minutes', 'flight_distance', 'arrival_delay_greater_5_mins')
colnames(df) <- new_colnames
Beginning row length is: 129889, and number of columns is: 28
Check for leading and trailing whitespace for every vector in df
test_blanks <- c(paste(" ", 1:129889, " ")) # create an array of characters with spaces at the front and end
df <- cbind(df, test_blanks) # add test array to df
sapply(df, function(x) sum(grepl("^\\s*.*\\s+$", x))) # find all vectors in df with leading/trailing spaces
## satisfaction airline_status
## 0 0
## age gender
## 0 0
## price_sensitivity year_first_flight
## 0 0
## num_flights percent_flight_other_airlines
## 0 0
## type_of_travel num_loyalty_cards
## 0 0
## airport_shopping airport_dining
## 0 0
## class day_of_month
## 0 0
## flight_date airline_code
## 0 0
## airline_name origin_city
## 118481 0
## origin_state destination_city
## 0 0
## destination_state scheduled_departure_hour
## 0 0
## departure_delay_in_minutes arrival_delay_in_minutes
## 0 0
## flight_cancelled flight_time_in_minutes
## 0 0
## flight_distance arrival_delay_greater_5_mins
## 0 0
## test_blanks
## 129889
- Looks like our test case and airline_name has leading/trailing spaces; let’s deal with the whitespace in airline_name
df$airline_name = trimws(df$airline_name)
colsKeep <- c('satisfaction', 'airline_status', 'age', 'gender', 'price_sensitivity', 'year_first_flight', 'num_flights', 'percent_flight_other_airlines', 'type_of_travel', 'num_loyalty_cards', 'airport_shopping', 'airport_dining', 'class', 'day_of_month', 'flight_date', 'airline_code', 'airline_name', 'origin_city', 'origin_state', 'destination_city', 'destination_state', 'scheduled_departure_hour', 'departure_delay_in_minutes', 'arrival_delay_in_minutes', 'flight_cancelled', 'flight_time_in_minutes', 'flight_distance', 'arrival_delay_greater_5_mins')
df <- df[,colsKeep]
- whitespace in airline_name is gone, run the sapply() command from above to verify
Look for anomalies by reviewing the unique values of the variables
- you’re looking for anomalies–specifically in the levels
unique(df$satisfaction)
## [1] "4.5" "4" "2.5" "5" "3.5"
## [6] "2" "3" "1" "4.00.5" "4.00.2.00"
Satisfaction column has strange values, “4.00.5” and “4.00.2.00”. We can find how many rows have these values with:
which(df$satisfaction=="4.00.5")
## [1] 38898
which(df$satisfaction=="4.00.2.00")
## [1] 38899 38900
Considering there are only 3 rows with the eroneous data, let’s delete them
library(magrittr)
drop_rows <- c(38898, 38899, 38900)
df <- df[-drop_rows,]
df$satisfaction %<>% as.numeric # covert vector to numeric
- The dataset now has 129886 rows after dropping the 3 with bad data
Find all NA values in df
sapply(df, function(x) sum(is.na(x)))
## satisfaction airline_status
## 0 0
## age gender
## 0 0
## price_sensitivity year_first_flight
## 0 0
## num_flights percent_flight_other_airlines
## 0 0
## type_of_travel num_loyalty_cards
## 0 0
## airport_shopping airport_dining
## 0 0
## class day_of_month
## 0 0
## flight_date airline_code
## 0 0
## airline_name origin_city
## 0 0
## origin_state destination_city
## 0 0
## destination_state scheduled_departure_hour
## 0 0
## departure_delay_in_minutes arrival_delay_in_minutes
## 2345 2738
## flight_cancelled flight_time_in_minutes
## 0 2738
## flight_distance arrival_delay_greater_5_mins
## 0 0
- Flight_time_in_minutes, departure_delay_in_minutes, and arrival_delay_in_minutes have NA values. How should we deal with them? Our options are:
- Drop the entire row
- Set them to the average of the vector
Set NA values to average of column
flight_time_in_minutes_avg <- mean(df$flight_time_in_minutes, na.rm=T)
departure_delay_in_minutes_avg <- mean(df$departure_delay_in_minutes, na.rm=T)
arrival_delay_in_minutes_avg <- mean(df$arrival_delay_in_minutes, na.rm=T)
df[which(is.na(df$flight_time_in_minutes)),"flight_time_in_minutes"] = round(flight_time_in_minutes_avg,1)
df[which(is.na(df$departure_delay_in_minutes)),"departure_delay_in_minutes"] = round(departure_delay_in_minutes_avg,1)
df[which(is.na(df$arrival_delay_in_minutes)),"arrival_delay_in_minutes"] = round(arrival_delay_in_minutes_avg,1)
Change the flight_date column to Date datatype.
df$flight_date <- as.Date(df$flight_date, "%m/%d/%y")
- date if formated: YYYY-mm-dd (i.e. 2014-03-18)
The arrangment of the categories does not follow an ordinal hierachy such as lowest to highest (i.e. Eco, Eco Plus, Business). We will make the following changes:
- Change the values for the type_of_travel variable to the more concise names of: “Personal, Mileage, Business”
- Convert the type_of_travel variable to a factor with factor order of: “Personal, Mileage, Business”
- Change the values for the class variable to the more concise names of: “Economy, Plus, Business”
- Set the order of the factors for the class variable to: “Economy, Plus, Business”
- Change gender to a factor and change the factor order to: “Male, Female”
- Change airline_status to an “unordered” factor: “Blue, Silver, Gold, Platinum”
- Change airline_name to more consise names of: “‘Cheapseats’, ‘Cool&Young’, ‘EnjoyFlying’, ‘FlyFast’, ‘FlyHere’, ‘FlyToSun’, ‘GoingNorth’, ‘Northwest’, ‘OnlyJets’, ‘Oursin’, ‘PaulSmith’, ‘Sigma’, ‘Southeast’, ‘West’”
- Change flight_cancelled to a factor with order of: “Yes, No”
- Change arrival_delay_greater_5_mins to a factor with order: “yes”, “no”
# 1. Change the values for the *class* variable to the more concise names of: "Economy, Plus, Business"
df[which(df$type_of_travel=="Personal Travel"), "type_of_travel"] = "Personal"
df[which(df$type_of_travel=="Mileage tickets"), "type_of_travel"] = "Mileage"
df[which(df$type_of_travel=="Business travel"), "type_of_travel"] = "Business"
# change to factor and set order
df$type_of_travel <- factor(df$type_of_travel, levels=c("Personal","Mileage","Business"), ordered=TRUE)
# 2. Change the values for the *class* variable to the more concise names of: "Economy, Plus, Business"
df[which(df$class=="Eco"), "class"] = "Economy"
df[which(df$class=="Eco Plus"), "class"] = "Plus"
df$class <- factor(df$class, levels=c("Economy", "Plus", "Business"), ordered=TRUE)
# 3. Change *gender* to a factor and change the factor order to: "Male, Female"
df$gender <- factor(df$gender, levels=c("Male", "Female"), ordered=TRUE)
# 4. Change *airline_status* to a factor with order of: "Blue, Silver, Gold, Platinum"
df$airline_status <- factor(df$airline_status, levels=c("Blue", "Silver", "Gold", "Platinum"), ordered=TRUE)
5. Change airline_name to more concise names of: “‘Cheapseats’, ‘Cool&Young’, ‘EnjoyFlying’, ‘FlyFast’, ‘FlyHere’, ‘FlyToSun’, ‘GoingNorth’, ‘Northwest’, ‘OnlyJets’, ‘Oursin’, ‘PaulSmith’, ‘Sigma’, ‘Southeast’, ‘West’”
df[which(df$airline_name=="Cheapseats Airlines Inc."), "airline_name"] = "Cheapseats"
df[which(df$airline_name=="Cool&Young Airlines Inc."), "airline_name"] = "Cool&Young"
df[which(df$airline_name=="EnjoyFlying Air Services"), "airline_name"] = "EnjoyFlying"
df[which(df$airline_name=="FlyFast Airways Inc."), "airline_name"] = "FlyFast"
df[which(df$airline_name=="FlyHere Airways"), "airline_name"] = "FlyHere"
df[which(df$airline_name=="FlyToSun Airlines Inc."), "airline_name"] = "FlyToSun"
df[which(df$airline_name=="GoingNorth Airlines Inc."), "airline_name"] = "GoingNorth"
df[which(df$airline_name=="Northwest Business Airlines Inc."), "airline_name"] = "Northwest"
df[which(df$airline_name=="OnlyJets Airlines Inc."), "airline_name"] = "OnlyJets"
df[which(df$airline_name=="Oursin Airlines Inc."), "airline_name"] = "Oursin"
df[which(df$airline_name=="Paul Smith Airlines Inc."), "airline_name"] = "PaulSmith"
df[which(df$airline_name=="Sigma Airlines Inc."), "airline_name"] = "Sigma"
df[which(df$airline_name=="Southeast Airlines Co."), "airline_name"] = "Southeast"
df[which(df$airline_name=="West Airways Inc."), "airline_name"] = "West"
new_airline_names <- c('Cheapseats', 'Cool&Young', 'EnjoyFlying', 'FlyFast', 'FlyHere', 'FlyToSun', 'GoingNorth', 'Northwest', 'OnlyJets', 'Oursin', 'PaulSmith', 'Sigma', 'Southeast', 'West')
df$airline_name <- factor(df$airline_name, levels=new_airline_names, ordered=TRUE)
6. Change flight_cancelled to a factor with order of: “Yes, No”
df$flight_cancelled <- factor(df$flight_cancelled, levels=c("Yes","No"), ordered=TRUE)
7. Change arrival_delay_greater_5_mins to a factor with order: “yes”, “no”
df$arrival_delay_greater_5_mins <- factor(df$arrival_delay_greater_5_mins, levels=c("yes","no"), ordered=TRUE)
Drop the rows with percent_flight_other_airlines > 50
df <- df[-which(df$percent_flight_other_airlines > 50),]
- dropped 451 of outliers data
Round up the values for satisfaction with decimal values
table(df$satisfaction)
##
## 1 2 2.5 3 3.5 4 4.5 5
## 2989 23499 2 36864 2 53571 2 12506
df[which(df$satisfaction==2.5), "satisfaction"] = 3
df[which(df$satisfaction==3.5), "satisfaction"] = 4
df[which(df$satisfaction==4.5), "satisfaction"] = 5
table(df$satisfaction)
##
## 1 2 3 4 5
## 2989 23499 36866 53573 12508