Required packages

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(deductive)
library(validate)
## 
## Attaching package: 'validate'
## The following object is masked from 'package:dplyr':
## 
##     expr
library(outliers)
library(forecast)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
## 
##     expr

Executive Summary

I have two datasets obtained from the data.gov.au website. They look at passenger and freight movements in and out of Australian airports. They’re two International Airline datasets and have a unique identifier for month that can be used for a join. I will look at the data to see the difference between passenger movements and maximum seat capacity.

Data

Using the International Airlines - Airline by country of port dataset … (data.gov.au 2018).

Using International Airlines - Operated Flights and Seats dataset … (data.gov.au 2018).

#read in airline port country dataset
aus_international_airline_port <- read_csv("airline_portcountry.csv")
## Parsed with column specification:
## cols(
##   Month = col_integer(),
##   Airline = col_character(),
##   Port_Country = col_character(),
##   Passengers_In = col_integer(),
##   `Freight_In_(tonnes)` = col_double(),
##   `Mail_In_(tonnes)` = col_double(),
##   Passengers_Out = col_integer(),
##   `Freight_Out_(tonnes)` = col_double(),
##   `Mail_Out_(tonnes)` = col_double(),
##   Year = col_integer(),
##   Month_num = col_integer()
## )
#show head of data
head(aus_international_airline_port)
#read in airline movement activity 
aus_international_flights_seats <- read_csv("international_airline_activity_opfltsseats.csv")
## Parsed with column specification:
## cols(
##   Month = col_integer(),
##   In_Out = col_character(),
##   Australian_City = col_character(),
##   International_City = col_character(),
##   Airline = col_character(),
##   Route = col_character(),
##   Port_Country = col_character(),
##   Port_Region = col_character(),
##   Service_Country = col_character(),
##   Service_Region = col_character(),
##   Stops = col_integer(),
##   All_Flights = col_integer(),
##   Max_Seats = col_integer(),
##   Year = col_integer(),
##   Month_num = col_integer()
## )
#show head of data
head(aus_international_flights_seats)

Understand

#check the variable class types
sapply(aus_international_airline_port, class)
##                Month              Airline         Port_Country 
##            "integer"          "character"          "character" 
##        Passengers_In  Freight_In_(tonnes)     Mail_In_(tonnes) 
##            "integer"            "numeric"            "numeric" 
##       Passengers_Out Freight_Out_(tonnes)    Mail_Out_(tonnes) 
##            "integer"            "numeric"            "numeric" 
##                 Year            Month_num 
##            "integer"            "integer"
sapply(aus_international_flights_seats, class)
##              Month             In_Out    Australian_City 
##          "integer"        "character"        "character" 
## International_City            Airline              Route 
##        "character"        "character"        "character" 
##       Port_Country        Port_Region    Service_Country 
##        "character"        "character"        "character" 
##     Service_Region              Stops        All_Flights 
##        "character"          "integer"          "integer" 
##          Max_Seats               Year          Month_num 
##          "integer"          "integer"          "integer"
#total variable class types
table(sapply(aus_international_airline_port, class))
## 
## character   integer   numeric 
##         2         5         4
table(sapply(aus_international_flights_seats, class))
## 
## character   integer 
##         9         6
#dataset structure
str(aus_international_airline_port)
## Classes 'tbl_df', 'tbl' and 'data.frame':    40156 obs. of  11 variables:
##  $ Month               : int  31048 31048 31048 31048 31048 31048 31048 31048 31048 31048 ...
##  $ Airline             : chr  "Air Caledonie" "Air China" "Air India" "Air India" ...
##  $ Port_Country        : chr  "New Caledonia" "China" "India" "Singapore" ...
##  $ Passengers_In       : int  725 1311 1423 501 161 24588 2954 3350 574 280 ...
##  $ Freight_In_(tonnes) : num  0.368 9.158 44.823 10.583 0.212 ...
##  $ Mail_In_(tonnes)    : num  0 0 0 0 0 0.374 0 0 0 0 ...
##  $ Passengers_Out      : int  474 1232 970 306 96 20692 2906 2470 578 205 ...
##  $ Freight_Out_(tonnes): num  0.167 2.587 6.708 124.966 23.996 ...
##  $ Mail_Out_(tonnes)   : num  0 0 0 0 0 0.122 0 0 0 0 ...
##  $ Year                : int  1985 1985 1985 1985 1985 1985 1985 1985 1985 1985 ...
##  $ Month_num           : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 11
##   .. ..$ Month               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Airline             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Port_Country        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Passengers_In       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Freight_In_(tonnes) : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Mail_In_(tonnes)    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Passengers_Out      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Freight_Out_(tonnes): list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Mail_Out_(tonnes)   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Year                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Month_num           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
str(aus_international_flights_seats)
## Classes 'tbl_df', 'tbl' and 'data.frame':    92386 obs. of  15 variables:
##  $ Month             : int  37865 37865 37865 37865 37865 37865 37865 37865 37865 37865 ...
##  $ In_Out            : chr  "I" "I" "I" "I" ...
##  $ Australian_City   : chr  "Adelaide" "Adelaide" "Adelaide" "Adelaide" ...
##  $ International_City: chr  "Denpasar" "Hong Kong" "Kuala Lumpur" "Singapore" ...
##  $ Airline           : chr  "Garuda Indonesia" "Cathay Pacific Airways" "Malaysia Airlines" "Qantas Airways" ...
##  $ Route             : chr  "DPS-ADL-MEL" "HKG-ADL-MEL" "KUL-ADL" "SIN-DRW-ADL-MEL" ...
##  $ Port_Country      : chr  "Indonesia" "Hong Kong (SAR)" "Malaysia" "Singapore" ...
##  $ Port_Region       : chr  "SE Asia" "NE Asia" "SE Asia" "SE Asia" ...
##  $ Service_Country   : chr  "Indonesia" "Hong Kong (SAR)" "Malaysia" "Singapore" ...
##  $ Service_Region    : chr  "SE Asia" "NE Asia" "SE Asia" "SE Asia" ...
##  $ Stops             : int  0 0 0 1 1 0 0 0 0 0 ...
##  $ All_Flights       : int  13 8 17 4 9 12 36 18 8 14 ...
##  $ Max_Seats         : int  3809 2008 4726 908 2038 3876 12624 2556 2296 5404 ...
##  $ Year              : int  2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
##  $ Month_num         : int  9 9 9 9 9 9 9 9 9 9 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 15
##   .. ..$ Month             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ In_Out            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Australian_City   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ International_City: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Airline           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Route             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Port_Country      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Port_Region       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Service_Country   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Service_Region    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Stops             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ All_Flights       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Max_Seats         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Year              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Month_num         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
#a better look at the attributes
attributes(aus_international_airline_port[1:10, ])
## $names
##  [1] "Month"                "Airline"              "Port_Country"        
##  [4] "Passengers_In"        "Freight_In_(tonnes)"  "Mail_In_(tonnes)"    
##  [7] "Passengers_Out"       "Freight_Out_(tonnes)" "Mail_Out_(tonnes)"   
## [10] "Year"                 "Month_num"           
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $class
## [1] "tbl_df"     "tbl"        "data.frame"
attributes(aus_international_flights_seats[1:10, ])
## $names
##  [1] "Month"              "In_Out"             "Australian_City"   
##  [4] "International_City" "Airline"            "Route"             
##  [7] "Port_Country"       "Port_Region"        "Service_Country"   
## [10] "Service_Region"     "Stops"              "All_Flights"       
## [13] "Max_Seats"          "Year"               "Month_num"         
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $class
## [1] "tbl_df"     "tbl"        "data.frame"

Tidy & Manipulate Data I

#group observations and drop varibles 
aus_international_flights_seats <- aus_international_flights_seats %>% group_by(Month, Airline, Port_Country, Year, Month_num) %>% summarise(Max_Seats=sum(Max_Seats,na.rm = TRUE), All_Flights=sum(All_Flights, na.rm = TRUE))

#join ailine movements and airline ports on Month, Airline, Port_Country, Year and Month_num
international_air_traval <- full_join(aus_international_flights_seats, aus_international_airline_port, by = c("Month" = "Month", "Airline" = "Airline", "Port_Country" = "Port_Country", "Year" = "Year", "Month_num" = "Month_num"))
#show head of data
head(international_air_traval)
#factor variables
international_air_traval$Month <- as.factor(international_air_traval$Month)
international_air_traval$Year <- as.factor(international_air_traval$Year)
international_air_traval$Airline <- as.factor(international_air_traval$Airline)
international_air_traval$Port_Country <- as.factor(international_air_traval$Port_Country)

#show head of joined data
head(international_air_traval)

Tidy & Manipulate Data II

The following rules apply for tidy data: 1. Each variable must have its own column. 2. Each observation must have its own row. 3. Each value must have its own cell. - all rules have been met except if you consider the two month variables where on is an identification number. So there some things that should be changed

#ungrouping
#ungroup(international_air_traval)

#rename the variables
names(international_air_traval)[1]<-"Month_ID"
names(international_air_traval)[5]<-"Month"

#factor and change values to quarters
international_air_traval <- ungroup(international_air_traval)  %>% mutate(Month = case_when (Month == 1 ~ "January", Month == 2 ~ "February", Month  == 3 ~ "March", Month == 4 ~ "April", Month == 5 ~ "May", Month == 6 ~ "June", Month == 7 ~ "July", Month == 8 ~ "August", Month == 9 ~ "September", Month == 10 ~ "October", Month == 11 ~ "November", Month == 12 ~ "December"))

#factor Quarter variable
international_air_traval$Month = as.factor(international_air_traval$Month)

#create new dataframe for totals
total_passenger <- international_air_traval %>% select(Year, Month, Max_Seats, Passengers_In, Passengers_Out)

#creating new variable from passengers in and passengers out
total_passenger <- mutate(total_passenger,
       Total_Movement = Passengers_In + Passengers_Out)

#creating new variable, empty_seats by subtracting the new variable total movement from max seats 
total_passenger <- mutate(total_passenger, Empty_Seats = Max_Seats - Total_Movement)

#group by year and month and create a new dataset
totals_by_month <- total_passenger %>% group_by(Year, Month) %>% summarise(Total_Movement=sum(Total_Movement,na.rm = TRUE), Empty_Seats=sum(Empty_Seats, na.rm = TRUE), Max_Seats=sum(Max_Seats, na.rm = TRUE))

#head data
head(total_passenger)

Scan I

#checking for NAs
colSums(is.na(total_passenger))
##           Year          Month      Max_Seats  Passengers_In Passengers_Out 
##              0              0          26694            874            874 
## Total_Movement    Empty_Seats 
##            874          27568
#we have a lot of NAs due to the differece in dates of observation
#filtering the 15 year period froom 2003 - 2017, purposely dropping 2018 as the data is incomplete 
fifteen_years_movement <- total_passenger %>% filter (Year == 2003 | Year == 2004 | Year == 2005 | Year == 2006 | Year == 2007 
                                           | Year == 2008 | Year == 2009 | Year == 2010 | Year == 2011 | Year == 2012
                                           | Year == 2013 | Year == 2014 | Year == 2015 | Year == 2016 | Year == 2017)

#checking for NAs again
colSums(is.na(fifteen_years_movement))
##           Year          Month      Max_Seats  Passengers_In Passengers_Out 
##              0              0           5885            866            866 
## Total_Movement    Empty_Seats 
##            866           6751
# Define the rules as an validator expression
Rules <- validator( Passengers_In == Max_Seats * 0.25, Passengers_Out == Max_Seats * 0.25, Empty_Seats == Max_Seats - Total_Movement, Passengers_In + Passengers_Out == Total_Movement)

# Use impute_lr function
fifteen_years_movement <- impute_lr(fifteen_years_movement, Rules)

fifteen_years_movement <- fifteen_years_movement %>% mutate_at(vars(Passengers_In, Passengers_Out, Total_Movement, Empty_Seats), funs(round(., 0)))

#checking for NAs again
colSums(is.na(fifteen_years_movement))
##           Year          Month      Max_Seats  Passengers_In Passengers_Out 
##              0              0              0              0              0 
## Total_Movement    Empty_Seats 
##              0              0
#checking to see if there are any infinite values 
sum(is.infinite(fifteen_years_movement$Max_Seats)|
is.infinite(fifteen_years_movement$Passengers_In) |
is.infinite(fifteen_years_movement$Passengers_Out)|
is.infinite(fifteen_years_movement$Total_Movement) |
is.infinite(fifteen_years_movement$Empty_Seats))
## [1] 0
#checking to see if they're all numeric
sum(is.nan(fifteen_years_movement$Max_Seats)|
is.nan(fifteen_years_movement$Passengers_In) |
is.nan(fifteen_years_movement$Passengers_Out)|
is.nan(fifteen_years_movement$Total_Movement) |
is.nan(fifteen_years_movement$Empty_Seats))
## [1] 0
#removing zero data from Empty_seat variable to avoid skewed results
fifteen_years_movement = filter(fifteen_years_movement, Empty_Seats > 0)

Scan II

#Calculating the z-scores for the Total_Movement movement variable and showing
#the total number with an absolute value greater than 3.
z.scores_mv <- fifteen_years_movement$Total_Movement %>% scores(type = "z")
z.scores_mv %>% summary()
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.65441 -0.54116 -0.39204  0.00000  0.06796  6.89021
sum(abs(z.scores_mv) > 3)
## [1] 452
#Calculating the z-scores for the Max_seats variable and showing
#the total number with an absolute value greater than 3.
z.scores_ms <- fifteen_years_movement$Max_Seats %>% scores(type = "z")
z.scores_ms %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.7374 -0.5879 -0.4066  0.0000  0.1238  6.4705
sum(abs(z.scores_ms) > 3)
## [1] 438
#Calculating the z-scores for the Passengers_In variable and showing
#the total number with an absolute value greater than 3.
z.scores_pi <- fifteen_years_movement$Passengers_In %>% scores(type = "z")
z.scores_pi %>% summary()
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.65044 -0.53986 -0.39149  0.00000  0.07094  7.30039
sum(abs(z.scores_pi) > 3)
## [1] 434
#Calculating the z-scores for the Passengers_Out variable and showing
#the total number with an absolute value greater than 3.
z.scores_po <- fifteen_years_movement$Passengers_Out %>% scores(type = "z")
z.scores_po %>% summary()
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.65282 -0.54091 -0.39091  0.00000  0.07655  7.65262
sum(abs(z.scores_po) > 3)
## [1] 452
#Calculating the z-scores for the Empty_Seats variable and showing
#the total number with an absolute value greater than 3.
z.scores_es <- fifteen_years_movement$Empty_Seats %>% scores(type = "z")
z.scores_es %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.7478 -0.6032 -0.3976  0.0000  0.1588  8.8727
sum(abs(z.scores_es) > 3)
## [1] 351
#function to cap the values
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}

# Capping the numeric variables
fifteen_years_movement$Total_Movement <- fifteen_years_movement$Total_Movement %>% cap()
fifteen_years_movement$Max_Seats <- fifteen_years_movement$Max_Seats %>% cap()
fifteen_years_movement$Passengers_In <- fifteen_years_movement$Passengers_In %>% cap()
fifteen_years_movement$Passengers_Out <- fifteen_years_movement$Passengers_Out %>% cap()
fifteen_years_movement$Empty_seats <- fifteen_years_movement$Empty_Seats %>% cap()

#summary of variables before capping
summary(fifteen_years_movement[, c(3:7)])
##    Max_Seats      Passengers_In   Passengers_Out  Total_Movement  
##  Min.   :     8   Min.   :    0   Min.   :    0   Min.   :     0  
##  1st Qu.:  8398   1st Qu.: 2196   1st Qu.: 2187   1st Qu.:  4443  
##  Median : 18576   Median : 5142   Median : 5118   Median : 10294  
##  Mean   : 39815   Mean   :12121   Mean   :11981   Mean   : 24224  
##  3rd Qu.: 48349   3rd Qu.:14324   3rd Qu.:14254   3rd Qu.: 28341  
##  Max.   :160013   Max.   :51973   Max.   :51527   Max.   :103629  
##   Empty_Seats    
##  Min.   :     2  
##  1st Qu.:  3042  
##  Median :  7366  
##  Mean   : 15727  
##  3rd Qu.: 19066  
##  Max.   :202305
#summary of variables after capping
summary(fifteen_years_movement[, c(3:7)])
##    Max_Seats      Passengers_In   Passengers_Out  Total_Movement  
##  Min.   :     8   Min.   :    0   Min.   :    0   Min.   :     0  
##  1st Qu.:  8398   1st Qu.: 2196   1st Qu.: 2187   1st Qu.:  4443  
##  Median : 18576   Median : 5142   Median : 5118   Median : 10294  
##  Mean   : 39815   Mean   :12121   Mean   :11981   Mean   : 24224  
##  3rd Qu.: 48349   3rd Qu.:14324   3rd Qu.:14254   3rd Qu.: 28341  
##  Max.   :160013   Max.   :51973   Max.   :51527   Max.   :103629  
##   Empty_Seats    
##  Min.   :     2  
##  1st Qu.:  3042  
##  Median :  7366  
##  Mean   : 15727  
##  3rd Qu.: 19066  
##  Max.   :202305

Transform

#a histogram showing the Empty_Seats variable before the transformation.
ggplot(fifteen_years_movement, aes(x = Empty_Seats)) +
geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue") +
labs(title = "Histogram of Empty_seats variable from complete_games dataset",
y = "Frequency",
x = "Empty Seats by Month")

#a Box-Cox tranformation on the Empty_Seats variable
fifteen_years_movement$Empty_Seats_BoxCox <- BoxCox(fifteen_years_movement$Empty_Seats, lambda = "auto")
#histogram of the Empty_Seats variable after the transformation
ggplot(fifteen_years_movement, aes(x = Empty_Seats_BoxCox)) +
geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue") +
labs(title = "Histogram of the Empty_seats variable after BoxCox transformation",
y = "Frequency",
x = "Empty Seats by Month")

data.gov.au 2018, International Airlines - Airline by country of port, Airline datasets, data file, Australian Government, Bureau of Infrastructure, Melbourne, viewed 06 November 2018, < https://data.gov.au/dataset/international-airlines-airline-by-country-of-port-data>

data.gov.au 2018, International Airlines - Operated Flights and Seats, Airline datasets, data file, Australian Government, Bureau of Infrastructure, Melbourne, viewed 06 November 2018, https://data.gov.au/dataset/airport-traffic-data