library(readr)
library(tidyr)
package 㤼㸱tidyr㤼㸲 was built under R version 3.6.3Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Attaching package: 㤼㸱tidyr㤼㸲
The following object is masked from 㤼㸱package:R.utils㤼㸲:
extract
library(lubridate)
package 㤼㸱lubridate㤼㸲 was built under R version 3.6.3
Attaching package: 㤼㸱lubridate㤼㸲
The following objects are masked from 㤼㸱package:base㤼㸲:
date, intersect, setdiff, union
library(dplyr)
package 㤼㸱dplyr㤼㸲 was built under R version 3.6.3
Attaching package: 㤼㸱dplyr㤼㸲
The following objects are masked from 㤼㸱package:lubridate㤼㸲:
intersect, setdiff, union
The following objects are masked from 㤼㸱package:stats㤼㸲:
filter, lag
The following objects are masked from 㤼㸱package:base㤼㸲:
intersect, setdiff, setequal, union
library(outliers)
library(forecast)
package 㤼㸱forecast㤼㸲 was built under R version 3.6.3
RPubs link information: RPubs Link
The data consist the information on number of on-time / delayed / cancelled / diverted flights based on travel consumer report of 2015 marked by BUREAU OF Transportation statistics, Department of Transportation, US. The data is split across three different csv files (airlines, airports and flights). A large data set with the name flightdata is created by merging the other available and related data.
airports <- read_csv("airports.csv")
Parsed with column specification:
cols(
IATA_CODE = [31mcol_character()[39m,
AIRPORT = [31mcol_character()[39m,
CITY = [31mcol_character()[39m,
STATE = [31mcol_character()[39m,
COUNTRY = [31mcol_character()[39m,
LATITUDE = [32mcol_double()[39m,
LONGITUDE = [32mcol_double()[39m
)
airports <- data.frame(airports, stringsAsFactors = TRUE)
head(airports)
flights <- read_csv("flights.csv")
Parsed with column specification:
cols(
.default = col_double(),
AIRLINE = [31mcol_character()[39m,
TAIL_NUMBER = [31mcol_character()[39m,
ORIGIN_AIRPORT = [31mcol_character()[39m,
DESTINATION_AIRPORT = [31mcol_character()[39m,
SCHEDULED_DEPARTURE = [31mcol_character()[39m,
DEPARTURE_TIME = [31mcol_character()[39m,
WHEELS_OFF = [31mcol_character()[39m,
WHEELS_ON = [31mcol_character()[39m,
SCHEDULED_ARRIVAL = [31mcol_character()[39m,
ARRIVAL_TIME = [31mcol_character()[39m,
CANCELLATION_REASON = [31mcol_character()[39m
)
See spec(...) for full column specifications.
flights <- data.frame(flights, stringsAsFactors = TRUE)
head(flights)
# CREATING UNIQUE AIRPORT LOCATIONS FOR STARTING AND ENDING POSITIONS OF THE FLIGHT
ORIGIN_AIRPORT <- airports[,1:4]
colnames(ORIGIN_AIRPORT) <- c("IATA_CODE", "Orgin_Airport_Name", "Orgin_City", "Orgin_State")
DEST_AIRPORT <- airports[,1:4]
colnames(DEST_AIRPORT) <- c("IATA_CODE", "Dest_Airport_Name", "Dest_City", "Dest_State")
flightdata <- flights %>% left_join(ORIGIN_AIRPORT, c("ORIGIN_AIRPORT"="IATA_CODE"))
flightdata <- flightdata %>% left_join(DEST_AIRPORT, c("DESTINATION_AIRPORT"="IATA_CODE"))
Below is the summary of variables of the dataset. They are about 37 different variables in the dataset with few of them being numeric and characters.
flightdata %>% str()
'data.frame': 5819079 obs. of 37 variables:
$ YEAR : num 2015 2015 2015 2015 2015 ...
$ MONTH : num 1 1 1 1 1 1 1 1 1 1 ...
$ DAY : num 1 1 1 1 1 1 1 1 1 1 ...
$ DAY_OF_WEEK : num 4 4 4 4 4 4 4 4 4 4 ...
$ AIRLINE : chr "AS" "AA" "US" "AA" ...
$ FLIGHT_NUMBER : num 98 2336 840 258 135 ...
$ TAIL_NUMBER : chr "N407AS" "N3KUAA" "N171US" "N3HYAA" ...
$ ORIGIN_AIRPORT : chr "ANC" "LAX" "SFO" "LAX" ...
$ DESTINATION_AIRPORT: chr "SEA" "PBI" "CLT" "MIA" ...
$ SCHEDULED_DEPARTURE: chr "0005" "0010" "0020" "0020" ...
$ DEPARTURE_TIME : chr "2354" "0002" "0018" "0015" ...
$ DEPARTURE_DELAY : num -11 -8 -2 -5 -1 -5 -6 14 -11 3 ...
$ TAXI_OUT : num 21 12 16 15 11 18 11 13 17 12 ...
$ WHEELS_OFF : chr "0015" "0014" "0034" "0030" ...
$ SCHEDULED_TIME : num 205 280 286 285 235 217 181 273 195 221 ...
$ ELAPSED_TIME : num 194 279 293 281 215 230 170 249 193 203 ...
$ AIR_TIME : num 169 263 266 258 199 206 154 228 173 186 ...
$ DISTANCE : num 1448 2330 2296 2342 1448 ...
$ WHEELS_ON : chr "0404" "0737" "0800" "0748" ...
$ TAXI_IN : num 4 4 11 8 5 6 5 8 3 5 ...
$ SCHEDULED_ARRIVAL : chr "0430" "0750" "0806" "0805" ...
$ ARRIVAL_TIME : chr "0408" "0741" "0811" "0756" ...
$ ARRIVAL_DELAY : num -22 -9 5 -9 -21 8 -17 -10 -13 -15 ...
$ DIVERTED : num 0 0 0 0 0 0 0 0 0 0 ...
$ CANCELLED : num 0 0 0 0 0 0 0 0 0 0 ...
$ CANCELLATION_REASON: chr NA NA NA NA ...
$ AIR_SYSTEM_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ SECURITY_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ AIRLINE_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ LATE_AIRCRAFT_DELAY: num NA NA NA NA NA NA NA NA NA NA ...
$ WEATHER_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ Orgin_Airport_Name : chr "Ted Stevens Anchorage International Airport" "Los Angeles International Airport" "San Francisco International Airport" "Los Angeles International Airport" ...
$ Orgin_City : chr "Anchorage" "Los Angeles" "San Francisco" "Los Angeles" ...
$ Orgin_State : chr "AK" "CA" "CA" "CA" ...
$ Dest_Airport_Name : chr "Seattle-Tacoma International Airport" "Palm Beach International Airport" "Charlotte Douglas International Airport" "Miami International Airport" ...
$ Dest_City : chr "Seattle" "West Palm Beach" "Charlotte" "Miami" ...
$ Dest_State : chr "WA" "FL" "NC" "FL" ...
# CHAR -> FACT CONVERSIONS
flightdata <- flightdata %>% mutate_if(is.character, as.factor)
# FACT -> DATE CONVERSIONS
flightdata[,c(10,11,14,19,21,22)] <- flightdata[,c(10,11,14,19,21,22)] %>% as.character.Date()
flightdata %>% str()
'data.frame': 5819079 obs. of 37 variables:
$ YEAR : num 2015 2015 2015 2015 2015 ...
$ MONTH : num 1 1 1 1 1 1 1 1 1 1 ...
$ DAY : num 1 1 1 1 1 1 1 1 1 1 ...
$ DAY_OF_WEEK : num 4 4 4 4 4 4 4 4 4 4 ...
$ AIRLINE : Factor w/ 14 levels "AA","AS","B6",..: 2 1 12 1 2 4 9 12 1 4 ...
$ FLIGHT_NUMBER : num 98 2336 840 258 135 ...
$ TAIL_NUMBER : Factor w/ 4897 levels "7819A","7820L",..: 1623 1557 422 1517 2132 1142 2766 2411 1562 3935 ...
$ ORIGIN_AIRPORT : Factor w/ 628 levels "10135","10136",..: 324 483 585 483 584 585 481 483 585 481 ...
$ DESTINATION_AIRPORT: Factor w/ 629 levels "10135","10136",..: 585 543 374 511 325 524 524 374 394 328 ...
$ SCHEDULED_DEPARTURE: 'AsIs' chr "0005" "0010" "0020" "0020" ...
$ DEPARTURE_TIME : 'AsIs' chr "2354" "0002" "0018" "0015" ...
$ DEPARTURE_DELAY : num -11 -8 -2 -5 -1 -5 -6 14 -11 3 ...
$ TAXI_OUT : num 21 12 16 15 11 18 11 13 17 12 ...
$ WHEELS_OFF : 'AsIs' chr "0015" "0014" "0034" "0030" ...
$ SCHEDULED_TIME : num 205 280 286 285 235 217 181 273 195 221 ...
$ ELAPSED_TIME : num 194 279 293 281 215 230 170 249 193 203 ...
$ AIR_TIME : num 169 263 266 258 199 206 154 228 173 186 ...
$ DISTANCE : num 1448 2330 2296 2342 1448 ...
$ WHEELS_ON : 'AsIs' chr "0404" "0737" "0800" "0748" ...
$ TAXI_IN : num 4 4 11 8 5 6 5 8 3 5 ...
$ SCHEDULED_ARRIVAL : 'AsIs' chr "0430" "0750" "0806" "0805" ...
$ ARRIVAL_TIME : 'AsIs' chr "0408" "0741" "0811" "0756" ...
$ ARRIVAL_DELAY : num -22 -9 5 -9 -21 8 -17 -10 -13 -15 ...
$ DIVERTED : num 0 0 0 0 0 0 0 0 0 0 ...
$ CANCELLED : num 0 0 0 0 0 0 0 0 0 0 ...
$ CANCELLATION_REASON: Factor w/ 4 levels "A","B","C","D": NA NA NA NA NA NA NA NA NA NA ...
$ AIR_SYSTEM_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ SECURITY_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ AIRLINE_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ LATE_AIRCRAFT_DELAY: num NA NA NA NA NA NA NA NA NA NA ...
$ WEATHER_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ Orgin_Airport_Name : Factor w/ 322 levels "Aberdeen Regional Airport",..: 287 173 260 173 268 260 185 173 260 185 ...
$ Orgin_City : Factor w/ 308 levels "Aberdeen","Abilene",..: 13 174 256 174 267 256 164 174 256 164 ...
$ Orgin_State : Factor w/ 54 levels "AK","AL","AR",..: 1 6 6 6 51 6 35 6 6 35 ...
$ Dest_Airport_Name : Factor w/ 322 levels "Aberdeen Regional Airport",..: 268 222 50 194 287 196 196 50 68 127 ...
$ Dest_City : Factor w/ 308 levels "Aberdeen","Abilene",..: 267 296 54 188 13 191 191 54 72 19 ...
$ Dest_State : Factor w/ 54 levels "AK","AL","AR",..: 51 10 29 10 1 25 25 29 46 11 ...
Each variable must have its own column. - YES
In the above steps, we have excluded variables that duplicate data such as Country, longitude, Latitude, IOTA_CODE. Each observation must have its own row. - YES
All observations are unique by flight number
Each value must have its own cell. - YES
Create/mutate at least one variable from the existing variables (minimum requirement #6). In addition to the R codes and outputs, explain everything that you do in this step.
# Creating Average Speed
flightdata$Avg_Speed <- (flightdata$DISTANCE / flightdata$AIR_TIME)
# Taxi time
flightdata$Taxi_Time <- (flightdata$TAXI_OUT + flightdata$TAXI_IN)
# Did the flight Arrive on time? SCHEDULED_TIME VS ELAPSED_TIME
flightdata$On_Time <- ifelse(flightdata$SCHEDULED_TIME - flightdata$ELAPSED_TIME > 0 , 1, 0)
head(flightdata)
Using Summary to identify Missing Values
# SUMMARY OF NA VALUES
flightdata %>% summary()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE
Min. :2015 Min. : 1.000 Min. : 1.0 Min. :1.000 WN :1261855 Min. : 1 N480HA : 3768 ATL : 346836 ATL : 346904 Length:5819079
1st Qu.:2015 1st Qu.: 4.000 1st Qu.: 8.0 1st Qu.:2.000 DL : 875881 1st Qu.: 730 N484HA : 3723 ORD : 285884 ORD : 285906 Class :AsIs
Median :2015 Median : 7.000 Median :16.0 Median :4.000 AA : 725984 Median :1690 N488HA : 3723 DFW : 239551 DFW : 239582 Mode :character
Mean :2015 Mean : 6.524 Mean :15.7 Mean :3.927 OO : 588353 Mean :2173 N493HA : 3585 DEN : 196055 DEN : 196010
3rd Qu.:2015 3rd Qu.: 9.000 3rd Qu.:23.0 3rd Qu.:6.000 EV : 571977 3rd Qu.:3230 N478HA : 3577 LAX : 194673 LAX : 194696
Max. :2015 Max. :12.000 Max. :31.0 Max. :7.000 UA : 515723 Max. :9855 (Other):5785982 SFO : 148008 SFO : 147966
(Other):1279306 NA's : 14721 (Other):4408072 (Other):4408015
DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN
Length:5819079 Min. : -82.00 Min. : 1.00 Length:5819079 Min. : 18.0 Min. : 14 Min. : 7.0 Min. : 21.0 Length:5819079 Min. : 1.00
Class :AsIs 1st Qu.: -5.00 1st Qu.: 11.00 Class :AsIs 1st Qu.: 85.0 1st Qu.: 82 1st Qu.: 60.0 1st Qu.: 373.0 Class :AsIs 1st Qu.: 4.00
Mode :character Median : -2.00 Median : 14.00 Mode :character Median :123.0 Median :118 Median : 94.0 Median : 647.0 Mode :character Median : 6.00
Mean : 9.37 Mean : 16.07 Mean :141.7 Mean :137 Mean :113.5 Mean : 822.4 Mean : 7.43
3rd Qu.: 7.00 3rd Qu.: 19.00 3rd Qu.:173.0 3rd Qu.:168 3rd Qu.:144.0 3rd Qu.:1062.0 3rd Qu.: 9.00
Max. :1988.00 Max. :225.00 Max. :718.0 Max. :766 Max. :690.0 Max. :4983.0 Max. :248.00
NA's :86153 NA's :89047 NA's :6 NA's :105071 NA's :105071 NA's :92513
SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY
Length:5819079 Length:5819079 Min. : -87.00 Min. :0.00000 Min. :0.00000 A : 25262 Min. : 0 Min. : 0 Min. : 0
Class :AsIs Class :AsIs 1st Qu.: -13.00 1st Qu.:0.00000 1st Qu.:0.00000 B : 48851 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
Mode :character Mode :character Median : -5.00 Median :0.00000 Median :0.00000 C : 15749 Median : 2 Median : 0 Median : 2
Mean : 4.41 Mean :0.00261 Mean :0.01545 D : 22 Mean : 13 Mean : 0 Mean : 19
3rd Qu.: 8.00 3rd Qu.:0.00000 3rd Qu.:0.00000 NA's:5729195 3rd Qu.: 18 3rd Qu.: 0 3rd Qu.: 19
Max. :1971.00 Max. :1.00000 Max. :1.00000 Max. :1134 Max. :573 Max. :1971
NA's :105071 NA's :4755640 NA's :4755640 NA's :4755640
LATE_AIRCRAFT_DELAY WEATHER_DELAY Orgin_Airport_Name Orgin_City Orgin_State
Min. : 0 Min. : 0 Hartsfield-Jackson Atlanta International Airport: 346836 Chicago : 366770 CA : 647911
1st Qu.: 0 1st Qu.: 0 Chicago O'Hare International Airport : 285884 Atlanta : 346836 TX : 631124
Median : 3 Median : 0 Dallas/Fort Worth International Airport : 239551 Dallas-Fort Worth: 239551 FL : 415586
Mean : 23 Mean : 3 Denver International Airport : 196055 Houston : 198664 IL : 381644
3rd Qu.: 29 3rd Qu.: 0 Los Angeles International Airport : 194673 Denver : 196055 GA : 360496
Max. :1331 Max. :1211 (Other) :4069915 (Other) :3985038 (Other):2896153
NA's :4755640 NA's :4755640 NA's : 486165 NA's : 486165 NA's : 486165
Dest_Airport_Name Dest_City Dest_State Avg_Speed Taxi_Time On_Time
Hartsfield-Jackson Atlanta International Airport: 346904 Chicago : 366790 CA : 647899 Min. : 0.54 Min. : 2.00 Min. :0.0
Chicago O'Hare International Airport : 285906 Atlanta : 346904 TX : 631214 1st Qu.: 6.11 1st Qu.: 17.00 1st Qu.:0.0
Dallas/Fort Worth International Airport : 239582 Dallas-Fort Worth: 239582 FL : 415554 Median : 6.93 Median : 21.00 Median :1.0
Denver International Airport : 196010 Houston : 198724 IL : 381666 Mean : 6.81 Mean : 23.51 Mean :0.7
Los Angeles International Airport : 194696 Denver : 196010 GA : 360563 3rd Qu.: 7.60 3rd Qu.: 27.00 3rd Qu.:1.0
(Other) :4069816 (Other) :3984904 (Other):2896018 Max. :13.13 Max. :268.00 Max. :1.0
NA's : 486165 NA's : 486165 NA's : 486165 NA's :105071 NA's :92513 NA's :105071
# DELETING NA VALUES THAT ARE LESS THAN 5% OF THE TOTAL DATA. (5% = 290,953)
flightdata <- flightdata[!(is.na(flightdata$TAIL_NUMBER)),]
flightdata <- flightdata[!(is.na(flightdata$DEPARTURE_DELAY)),]
flightdata <- flightdata[!(is.na(flightdata$TAXI_OUT)), ]
flightdata <- flightdata[!(is.na(flightdata$SCHEDULED_TIME)), ]
flightdata <- flightdata[!(is.na(flightdata$ELAPSED_TIME)), ]
flightdata <- flightdata[!(is.na(flightdata$TAXI_IN)), ]
flightdata %>% str()
'data.frame': 5714008 obs. of 40 variables:
$ YEAR : num 2015 2015 2015 2015 2015 ...
$ MONTH : num 1 1 1 1 1 1 1 1 1 1 ...
$ DAY : num 1 1 1 1 1 1 1 1 1 1 ...
$ DAY_OF_WEEK : num 4 4 4 4 4 4 4 4 4 4 ...
$ AIRLINE : Factor w/ 14 levels "AA","AS","B6",..: 2 1 12 1 2 4 9 12 1 4 ...
$ FLIGHT_NUMBER : num 98 2336 840 258 135 ...
$ TAIL_NUMBER : Factor w/ 4897 levels "7819A","7820L",..: 1623 1557 422 1517 2132 1142 2766 2411 1562 3935 ...
$ ORIGIN_AIRPORT : Factor w/ 628 levels "10135","10136",..: 324 483 585 483 584 585 481 483 585 481 ...
$ DESTINATION_AIRPORT: Factor w/ 629 levels "10135","10136",..: 585 543 374 511 325 524 524 374 394 328 ...
$ SCHEDULED_DEPARTURE: 'AsIs' chr "0005" "0010" "0020" "0020" ...
$ DEPARTURE_TIME : 'AsIs' chr "2354" "0002" "0018" "0015" ...
$ DEPARTURE_DELAY : num -11 -8 -2 -5 -1 -5 -6 14 -11 3 ...
$ TAXI_OUT : num 21 12 16 15 11 18 11 13 17 12 ...
$ WHEELS_OFF : 'AsIs' chr "0015" "0014" "0034" "0030" ...
$ SCHEDULED_TIME : num 205 280 286 285 235 217 181 273 195 221 ...
$ ELAPSED_TIME : num 194 279 293 281 215 230 170 249 193 203 ...
$ AIR_TIME : num 169 263 266 258 199 206 154 228 173 186 ...
$ DISTANCE : num 1448 2330 2296 2342 1448 ...
$ WHEELS_ON : 'AsIs' chr "0404" "0737" "0800" "0748" ...
$ TAXI_IN : num 4 4 11 8 5 6 5 8 3 5 ...
$ SCHEDULED_ARRIVAL : 'AsIs' chr "0430" "0750" "0806" "0805" ...
$ ARRIVAL_TIME : 'AsIs' chr "0408" "0741" "0811" "0756" ...
$ ARRIVAL_DELAY : num -22 -9 5 -9 -21 8 -17 -10 -13 -15 ...
$ DIVERTED : num 0 0 0 0 0 0 0 0 0 0 ...
$ CANCELLED : num 0 0 0 0 0 0 0 0 0 0 ...
$ CANCELLATION_REASON: Factor w/ 4 levels "A","B","C","D": NA NA NA NA NA NA NA NA NA NA ...
$ AIR_SYSTEM_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ SECURITY_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ AIRLINE_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ LATE_AIRCRAFT_DELAY: num NA NA NA NA NA NA NA NA NA NA ...
$ WEATHER_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
$ Orgin_Airport_Name : Factor w/ 322 levels "Aberdeen Regional Airport",..: 287 173 260 173 268 260 185 173 260 185 ...
$ Orgin_City : Factor w/ 308 levels "Aberdeen","Abilene",..: 13 174 256 174 267 256 164 174 256 164 ...
$ Orgin_State : Factor w/ 54 levels "AK","AL","AR",..: 1 6 6 6 51 6 35 6 6 35 ...
$ Dest_Airport_Name : Factor w/ 322 levels "Aberdeen Regional Airport",..: 268 222 50 194 287 196 196 50 68 127 ...
$ Dest_City : Factor w/ 308 levels "Aberdeen","Abilene",..: 267 296 54 188 13 191 191 54 72 19 ...
$ Dest_State : Factor w/ 54 levels "AK","AL","AR",..: 51 10 29 10 1 25 25 29 46 11 ...
$ Avg_Speed : num 8.57 8.86 8.63 9.08 7.28 ...
$ Taxi_Time : num 25 16 27 23 16 24 16 21 20 17 ...
$ On_Time : num 1 1 0 1 1 0 1 1 1 1 ...
For delay options NA means there is no delay hence no imputation needed
Airport information such as name, city and state cannnot be imputed based on flight data because certain airports use a numeric ID which had no matching data with the airports.csv . No other variables have a realtionship with this to be imputed.
*Process: 1. Find the Outliers by converting the data into z scores. 2. Z scores with a score of greater than 3 are outliers. 3. Filter out the the outliers and check to see that they have been filtered 4. Plot a boxplot of both the variable with and without the outliers to determine any noticeable difference.
## Finding Outliers for only the Numeric Variables
z.scores <- flightdata$DEPARTURE_DELAY %>% scores(type = "z")
length (which(abs(z.scores) >3))
[1] 116262
Depature_DelaynoOutliers <- flightdata$DEPARTURE_DELAY[- (which(abs(z.scores) >3))]
str(Depature_DelaynoOutliers)
num [1:5597746] -11 -8 -2 -5 -1 -5 -6 14 -11 3 ...
z.scores1 <- flightdata$TAXI_OUT %>% scores(type = "z")
length (which(abs(z.scores1) >3))
[1] 100870
Taxi_outnoOutliers <- flightdata$TAXI_OUT[- (which(abs(z.scores1) >3))]
str(Taxi_outnoOutliers)
num [1:5613138] 21 12 16 15 11 18 11 13 17 12 ...
z.scores2 <- flightdata$SCHEDULED_TIME %>% scores(type = "z")
length (which(abs(z.scores2) >3))
[1] 76179
Scheduled_timenoOutliers <- flightdata$SCHEDULED_TIME[- (which(abs(z.scores2) >3))]
str(Scheduled_timenoOutliers)
num [1:5637829] 205 280 286 285 235 217 181 273 195 221 ...
z.scores3 <- flightdata$ELAPSED_TIME %>% scores(type = "z")
length (which(abs(z.scores3) >3))
[1] 73273
Elapsed_timenoOutliers <- flightdata$ELAPSED_TIME[- (which(abs(z.scores3) >3))]
str(Elapsed_timenoOutliers)
num [1:5640735] 194 279 293 281 215 230 170 249 193 203 ...
z.scores4 <- flightdata$AIR_TIME %>% scores(type = "z")
length (which(abs(z.scores4) >3))
[1] 71632
Air_TimenoOutliers <- flightdata$AIR_TIME[- (which(abs(z.scores4) >3))]
str(Air_TimenoOutliers)
num [1:5642376] 169 263 266 258 199 206 154 228 173 186 ...
z.scores5 <- flightdata$DISTANCE %>% scores(type = "z")
length (which(abs(z.scores5) >3))
[1] 36805
Distance_noOutliers <- flightdata$DISTANCE[- (which(abs(z.scores5) >3))]
str(Distance_noOutliers)
num [1:5677203] 1448 2330 2296 2342 1448 ...
z.scores6 <- flightdata$TAXI_IN %>% scores(type = "z")
length (which(abs(z.scores6) >3))
[1] 101405
Taxi_innoOutliers <- flightdata$TAXI_IN[- (which(abs(z.scores6) >3))]
str(Taxi_innoOutliers)
num [1:5612603] 4 4 11 8 5 6 5 8 3 5 ...
z.scores7 <- flightdata$ARRIVAL_DELAY %>% scores(type = "z")
length (which(abs(z.scores7) >3))
[1] 110956
Arrival_DelaynoOutliers <- flightdata$ARRIVAL_DELAY[- (which(abs(z.scores7) >3))]
str(Arrival_DelaynoOutliers)
num [1:5603052] -22 -9 5 -9 -21 8 -17 -10 -13 -15 ...
z.scores8 <- flightdata$Avg_Speed %>% scores(type = "z")
length (which(abs(z.scores8) >3))
[1] 26989
Avg_speednoOutliers <- flightdata$Avg_Speed[- (which(abs(z.scores8) >3))]
str(Avg_speednoOutliers)
num [1:5687019] 8.57 8.86 8.63 9.08 7.28 ...
z.scores9 <- flightdata$Taxi_Time %>% scores(type = "z")
length (which(abs(z.scores9) >3))
[1] 91707
Taxi_TimenoOutliers <- flightdata$Taxi_Time[- (which(abs(z.scores9) >3))]
str(Taxi_TimenoOutliers)
num [1:5622301] 25 16 27 23 16 24 16 21 20 17 ...
#Boxplots
boxplot(flightdata$DEPARTURE_DELAY,Depature_DelaynoOutliers, flightdata$ARRIVAL_DELAY, Arrival_DelaynoOutliers, names = c("Depature Delay","DD Outliers Filtered","Arrival Delay","AD Outliers Filtered"),main = "Delays", las = 1, cex.axis = 0.5)
boxplot(flightdata$TAXI_OUT,Taxi_outnoOutliers, flightdata$TAXI_IN, Taxi_innoOutliers, flightdata$Taxi_Time, Taxi_TimenoOutliers, names = c("Taxi Out","TO Outliers Filtered","Taxi in", "TI Outliers Filtered","Total Taxi Time","TTT Outliers Filtered"),main = "Taxi Time", las = 1, cex.axis = 0.5)
boxplot(flightdata$SCHEDULED_TIME,Scheduled_timenoOutliers, flightdata$AIR_TIME, Air_TimenoOutliers, flightdata$ELAPSED_TIME, Elapsed_timenoOutliers, names = c("Scheduled Time","ST Outliers Filtered","Air Time","AT Outliers Filtered","Elapsed Time","ET Outliers Filtered"),main = "Travel Time", las = 1, cex.axis = 0.5)
boxplot(flightdata$DISTANCE,Distance_noOutliers, names = c("With outliers","Outliers Filtered"),main = "Distance")
boxplot(flightdata$Avg_Speed,Avg_speednoOutliers, names = c("With outliers","Outliers Filtered"),main = "Average Speed")
Here, logarathmic function is used to transform the variables. Only few variables could be transformed.
#Taxi_out
hist(flightdata$TAXI_OUT) #original-right skewed
hist(log10(flightdata$TAXI_OUT)) #transformed - normal
#Scheduled_time
hist(flightdata$SCHEDULED_TIME) #original-right skewed
hist(log10(flightdata$SCHEDULED_TIME)) #transformed -normal
#Arrival_Delay
hist(flightdata$ARRIVAL_DELAY) #original
hist(log((flightdata$ARRIVAL_DELAY)))#reduced skewness
NaNs produced
#All other variables couldnt be transformed