library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(deductive)
library(validate)
##
## Attaching package: 'validate'
## The following object is masked from 'package:dplyr':
##
## expr
library(outliers)
library(forecast)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
##
## expr
I have two datasets obtained from the data.gov.au website. They look at passenger and freight movements in and out of Australian airports. They’re two International Airline datasets and have a unique identifier for month that can be used for a join. I will look at the data to see the difference between passenger movements and maximum seat capacity.
Using the International Airlines - Airline by country of port dataset … (data.gov.au 2018).
Using International Airlines - Operated Flights and Seats dataset … (data.gov.au 2018).
#read in airline port country dataset
aus_international_airline_port <- read_csv("airline_portcountry.csv")
## Parsed with column specification:
## cols(
## Month = col_integer(),
## Airline = col_character(),
## Port_Country = col_character(),
## Passengers_In = col_integer(),
## `Freight_In_(tonnes)` = col_double(),
## `Mail_In_(tonnes)` = col_double(),
## Passengers_Out = col_integer(),
## `Freight_Out_(tonnes)` = col_double(),
## `Mail_Out_(tonnes)` = col_double(),
## Year = col_integer(),
## Month_num = col_integer()
## )
#show head of data
head(aus_international_airline_port)
#read in airline movement activity
aus_international_flights_seats <- read_csv("international_airline_activity_opfltsseats.csv")
## Parsed with column specification:
## cols(
## Month = col_integer(),
## In_Out = col_character(),
## Australian_City = col_character(),
## International_City = col_character(),
## Airline = col_character(),
## Route = col_character(),
## Port_Country = col_character(),
## Port_Region = col_character(),
## Service_Country = col_character(),
## Service_Region = col_character(),
## Stops = col_integer(),
## All_Flights = col_integer(),
## Max_Seats = col_integer(),
## Year = col_integer(),
## Month_num = col_integer()
## )
#show head of data
head(aus_international_flights_seats)
#check the variable class types
sapply(aus_international_airline_port, class)
## Month Airline Port_Country
## "integer" "character" "character"
## Passengers_In Freight_In_(tonnes) Mail_In_(tonnes)
## "integer" "numeric" "numeric"
## Passengers_Out Freight_Out_(tonnes) Mail_Out_(tonnes)
## "integer" "numeric" "numeric"
## Year Month_num
## "integer" "integer"
sapply(aus_international_flights_seats, class)
## Month In_Out Australian_City
## "integer" "character" "character"
## International_City Airline Route
## "character" "character" "character"
## Port_Country Port_Region Service_Country
## "character" "character" "character"
## Service_Region Stops All_Flights
## "character" "integer" "integer"
## Max_Seats Year Month_num
## "integer" "integer" "integer"
#total variable class types
table(sapply(aus_international_airline_port, class))
##
## character integer numeric
## 2 5 4
table(sapply(aus_international_flights_seats, class))
##
## character integer
## 9 6
#dataset structure
str(aus_international_airline_port)
## Classes 'tbl_df', 'tbl' and 'data.frame': 40156 obs. of 11 variables:
## $ Month : int 31048 31048 31048 31048 31048 31048 31048 31048 31048 31048 ...
## $ Airline : chr "Air Caledonie" "Air China" "Air India" "Air India" ...
## $ Port_Country : chr "New Caledonia" "China" "India" "Singapore" ...
## $ Passengers_In : int 725 1311 1423 501 161 24588 2954 3350 574 280 ...
## $ Freight_In_(tonnes) : num 0.368 9.158 44.823 10.583 0.212 ...
## $ Mail_In_(tonnes) : num 0 0 0 0 0 0.374 0 0 0 0 ...
## $ Passengers_Out : int 474 1232 970 306 96 20692 2906 2470 578 205 ...
## $ Freight_Out_(tonnes): num 0.167 2.587 6.708 124.966 23.996 ...
## $ Mail_Out_(tonnes) : num 0 0 0 0 0 0.122 0 0 0 0 ...
## $ Year : int 1985 1985 1985 1985 1985 1985 1985 1985 1985 1985 ...
## $ Month_num : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 11
## .. ..$ Month : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Airline : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Port_Country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Passengers_In : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Freight_In_(tonnes) : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Mail_In_(tonnes) : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Passengers_Out : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Freight_Out_(tonnes): list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Mail_Out_(tonnes) : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Month_num : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
str(aus_international_flights_seats)
## Classes 'tbl_df', 'tbl' and 'data.frame': 92386 obs. of 15 variables:
## $ Month : int 37865 37865 37865 37865 37865 37865 37865 37865 37865 37865 ...
## $ In_Out : chr "I" "I" "I" "I" ...
## $ Australian_City : chr "Adelaide" "Adelaide" "Adelaide" "Adelaide" ...
## $ International_City: chr "Denpasar" "Hong Kong" "Kuala Lumpur" "Singapore" ...
## $ Airline : chr "Garuda Indonesia" "Cathay Pacific Airways" "Malaysia Airlines" "Qantas Airways" ...
## $ Route : chr "DPS-ADL-MEL" "HKG-ADL-MEL" "KUL-ADL" "SIN-DRW-ADL-MEL" ...
## $ Port_Country : chr "Indonesia" "Hong Kong (SAR)" "Malaysia" "Singapore" ...
## $ Port_Region : chr "SE Asia" "NE Asia" "SE Asia" "SE Asia" ...
## $ Service_Country : chr "Indonesia" "Hong Kong (SAR)" "Malaysia" "Singapore" ...
## $ Service_Region : chr "SE Asia" "NE Asia" "SE Asia" "SE Asia" ...
## $ Stops : int 0 0 0 1 1 0 0 0 0 0 ...
## $ All_Flights : int 13 8 17 4 9 12 36 18 8 14 ...
## $ Max_Seats : int 3809 2008 4726 908 2038 3876 12624 2556 2296 5404 ...
## $ Year : int 2003 2003 2003 2003 2003 2003 2003 2003 2003 2003 ...
## $ Month_num : int 9 9 9 9 9 9 9 9 9 9 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 15
## .. ..$ Month : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ In_Out : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Australian_City : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ International_City: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Airline : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Route : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Port_Country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Port_Region : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Service_Country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Service_Region : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Stops : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ All_Flights : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Max_Seats : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Month_num : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
#a better look at the attributes
attributes(aus_international_airline_port[1:10, ])
## $names
## [1] "Month" "Airline" "Port_Country"
## [4] "Passengers_In" "Freight_In_(tonnes)" "Mail_In_(tonnes)"
## [7] "Passengers_Out" "Freight_Out_(tonnes)" "Mail_Out_(tonnes)"
## [10] "Year" "Month_num"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $class
## [1] "tbl_df" "tbl" "data.frame"
attributes(aus_international_flights_seats[1:10, ])
## $names
## [1] "Month" "In_Out" "Australian_City"
## [4] "International_City" "Airline" "Route"
## [7] "Port_Country" "Port_Region" "Service_Country"
## [10] "Service_Region" "Stops" "All_Flights"
## [13] "Max_Seats" "Year" "Month_num"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $class
## [1] "tbl_df" "tbl" "data.frame"
#group observations and drop varibles
aus_international_flights_seats <- aus_international_flights_seats %>% group_by(Month, Airline, Port_Country, Year, Month_num) %>% summarise(Max_Seats=sum(Max_Seats,na.rm = TRUE), All_Flights=sum(All_Flights, na.rm = TRUE))
#join ailine movements and airline ports on Month, Airline, Port_Country, Year and Month_num
international_air_traval <- full_join(aus_international_flights_seats, aus_international_airline_port, by = c("Month" = "Month", "Airline" = "Airline", "Port_Country" = "Port_Country", "Year" = "Year", "Month_num" = "Month_num"))
#show head of data
head(international_air_traval)
#factor variables
international_air_traval$Month <- as.factor(international_air_traval$Month)
international_air_traval$Year <- as.factor(international_air_traval$Year)
international_air_traval$Airline <- as.factor(international_air_traval$Airline)
international_air_traval$Port_Country <- as.factor(international_air_traval$Port_Country)
#show head of joined data
head(international_air_traval)
The following rules apply for tidy data: 1. Each variable must have its own column. 2. Each observation must have its own row. 3. Each value must have its own cell. - all rules have been met except if you consider the two month variables where on is an identification number. So there some things that should be changed
Create a new dataset total_passengers for the numeric variables
Create two new variables from existing variables - Total_Movement from In/Out movements and Empty_seats by subtracting Total_movents from Max_Seates
#ungrouping
#ungroup(international_air_traval)
#rename the variables
names(international_air_traval)[1]<-"Month_ID"
names(international_air_traval)[5]<-"Month"
#factor and change values to quarters
international_air_traval <- ungroup(international_air_traval) %>% mutate(Month = case_when (Month == 1 ~ "January", Month == 2 ~ "February", Month == 3 ~ "March", Month == 4 ~ "April", Month == 5 ~ "May", Month == 6 ~ "June", Month == 7 ~ "July", Month == 8 ~ "August", Month == 9 ~ "September", Month == 10 ~ "October", Month == 11 ~ "November", Month == 12 ~ "December"))
#factor Quarter variable
international_air_traval$Month = as.factor(international_air_traval$Month)
#create new dataframe for totals
total_passenger <- international_air_traval %>% select(Year, Month, Max_Seats, Passengers_In, Passengers_Out)
#creating new variable from passengers in and passengers out
total_passenger <- mutate(total_passenger,
Total_Movement = Passengers_In + Passengers_Out)
#creating new variable, empty_seats by subtracting the new variable total movement from max seats
total_passenger <- mutate(total_passenger, Empty_Seats = Max_Seats - Total_Movement)
#group by year and month and create a new dataset
totals_by_month <- total_passenger %>% group_by(Year, Month) %>% summarise(Total_Movement=sum(Total_Movement,na.rm = TRUE), Empty_Seats=sum(Empty_Seats, na.rm = TRUE), Max_Seats=sum(Max_Seats, na.rm = TRUE))
#head data
head(total_passenger)
#checking for NAs
colSums(is.na(total_passenger))
## Year Month Max_Seats Passengers_In Passengers_Out
## 0 0 26694 874 874
## Total_Movement Empty_Seats
## 874 27568
#we have a lot of NAs due to the differece in dates of observation
#filtering the 15 year period froom 2003 - 2017, purposely dropping 2018 as the data is incomplete
fifteen_years_movement <- total_passenger %>% filter (Year == 2003 | Year == 2004 | Year == 2005 | Year == 2006 | Year == 2007
| Year == 2008 | Year == 2009 | Year == 2010 | Year == 2011 | Year == 2012
| Year == 2013 | Year == 2014 | Year == 2015 | Year == 2016 | Year == 2017)
#checking for NAs again
colSums(is.na(fifteen_years_movement))
## Year Month Max_Seats Passengers_In Passengers_Out
## 0 0 5885 866 866
## Total_Movement Empty_Seats
## 866 6751
# Define the rules as an validator expression
Rules <- validator( Passengers_In == Max_Seats * 0.25, Passengers_Out == Max_Seats * 0.25, Empty_Seats == Max_Seats - Total_Movement, Passengers_In + Passengers_Out == Total_Movement)
# Use impute_lr function
fifteen_years_movement <- impute_lr(fifteen_years_movement, Rules)
fifteen_years_movement <- fifteen_years_movement %>% mutate_at(vars(Passengers_In, Passengers_Out, Total_Movement, Empty_Seats), funs(round(., 0)))
#checking for NAs again
colSums(is.na(fifteen_years_movement))
## Year Month Max_Seats Passengers_In Passengers_Out
## 0 0 0 0 0
## Total_Movement Empty_Seats
## 0 0
#checking to see if there are any infinite values
sum(is.infinite(fifteen_years_movement$Max_Seats)|
is.infinite(fifteen_years_movement$Passengers_In) |
is.infinite(fifteen_years_movement$Passengers_Out)|
is.infinite(fifteen_years_movement$Total_Movement) |
is.infinite(fifteen_years_movement$Empty_Seats))
## [1] 0
#checking to see if they're all numeric
sum(is.nan(fifteen_years_movement$Max_Seats)|
is.nan(fifteen_years_movement$Passengers_In) |
is.nan(fifteen_years_movement$Passengers_Out)|
is.nan(fifteen_years_movement$Total_Movement) |
is.nan(fifteen_years_movement$Empty_Seats))
## [1] 0
#removing zero data from Empty_seat variable to avoid skewed results
fifteen_years_movement = filter(fifteen_years_movement, Empty_Seats > 0)
#Calculating the z-scores for the Total_Movement movement variable and showing
#the total number with an absolute value greater than 3.
z.scores_mv <- fifteen_years_movement$Total_Movement %>% scores(type = "z")
z.scores_mv %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.65441 -0.54116 -0.39204 0.00000 0.06796 6.89021
sum(abs(z.scores_mv) > 3)
## [1] 452
#Calculating the z-scores for the Max_seats variable and showing
#the total number with an absolute value greater than 3.
z.scores_ms <- fifteen_years_movement$Max_Seats %>% scores(type = "z")
z.scores_ms %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7374 -0.5879 -0.4066 0.0000 0.1238 6.4705
sum(abs(z.scores_ms) > 3)
## [1] 438
#Calculating the z-scores for the Passengers_In variable and showing
#the total number with an absolute value greater than 3.
z.scores_pi <- fifteen_years_movement$Passengers_In %>% scores(type = "z")
z.scores_pi %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.65044 -0.53986 -0.39149 0.00000 0.07094 7.30039
sum(abs(z.scores_pi) > 3)
## [1] 434
#Calculating the z-scores for the Passengers_Out variable and showing
#the total number with an absolute value greater than 3.
z.scores_po <- fifteen_years_movement$Passengers_Out %>% scores(type = "z")
z.scores_po %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.65282 -0.54091 -0.39091 0.00000 0.07655 7.65262
sum(abs(z.scores_po) > 3)
## [1] 452
#Calculating the z-scores for the Empty_Seats variable and showing
#the total number with an absolute value greater than 3.
z.scores_es <- fifteen_years_movement$Empty_Seats %>% scores(type = "z")
z.scores_es %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7478 -0.6032 -0.3976 0.0000 0.1588 8.8727
sum(abs(z.scores_es) > 3)
## [1] 351
#function to cap the values
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
# Capping the numeric variables
fifteen_years_movement$Total_Movement <- fifteen_years_movement$Total_Movement %>% cap()
fifteen_years_movement$Max_Seats <- fifteen_years_movement$Max_Seats %>% cap()
fifteen_years_movement$Passengers_In <- fifteen_years_movement$Passengers_In %>% cap()
fifteen_years_movement$Passengers_Out <- fifteen_years_movement$Passengers_Out %>% cap()
fifteen_years_movement$Empty_seats <- fifteen_years_movement$Empty_Seats %>% cap()
#summary of variables before capping
summary(fifteen_years_movement[, c(3:7)])
## Max_Seats Passengers_In Passengers_Out Total_Movement
## Min. : 8 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 8398 1st Qu.: 2196 1st Qu.: 2187 1st Qu.: 4443
## Median : 18576 Median : 5142 Median : 5118 Median : 10294
## Mean : 39815 Mean :12121 Mean :11981 Mean : 24224
## 3rd Qu.: 48349 3rd Qu.:14324 3rd Qu.:14254 3rd Qu.: 28341
## Max. :160013 Max. :51973 Max. :51527 Max. :103629
## Empty_Seats
## Min. : 2
## 1st Qu.: 3042
## Median : 7366
## Mean : 15727
## 3rd Qu.: 19066
## Max. :202305
#summary of variables after capping
summary(fifteen_years_movement[, c(3:7)])
## Max_Seats Passengers_In Passengers_Out Total_Movement
## Min. : 8 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 8398 1st Qu.: 2196 1st Qu.: 2187 1st Qu.: 4443
## Median : 18576 Median : 5142 Median : 5118 Median : 10294
## Mean : 39815 Mean :12121 Mean :11981 Mean : 24224
## 3rd Qu.: 48349 3rd Qu.:14324 3rd Qu.:14254 3rd Qu.: 28341
## Max. :160013 Max. :51973 Max. :51527 Max. :103629
## Empty_Seats
## Min. : 2
## 1st Qu.: 3042
## Median : 7366
## Mean : 15727
## 3rd Qu.: 19066
## Max. :202305
#a histogram showing the Empty_Seats variable before the transformation.
ggplot(fifteen_years_movement, aes(x = Empty_Seats)) +
geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue") +
labs(title = "Histogram of Empty_seats variable from complete_games dataset",
y = "Frequency",
x = "Empty Seats by Month")
#a Box-Cox tranformation on the Empty_Seats variable
fifteen_years_movement$Empty_Seats_BoxCox <- BoxCox(fifteen_years_movement$Empty_Seats, lambda = "auto")
#histogram of the Empty_Seats variable after the transformation
ggplot(fifteen_years_movement, aes(x = Empty_Seats_BoxCox)) +
geom_histogram(bins = 20, colour = "darkblue", fill = "lightblue") +
labs(title = "Histogram of the Empty_seats variable after BoxCox transformation",
y = "Frequency",
x = "Empty Seats by Month")
data.gov.au 2018, International Airlines - Airline by country of port, Airline datasets, data file, Australian Government, Bureau of Infrastructure, Melbourne, viewed 06 November 2018, < https://data.gov.au/dataset/international-airlines-airline-by-country-of-port-data>
data.gov.au 2018, International Airlines - Operated Flights and Seats, Airline datasets, data file, Australian Government, Bureau of Infrastructure, Melbourne, viewed 06 November 2018, https://data.gov.au/dataset/airport-traffic-data