setwd("~/Desktop/2. Data Wrangling/Assignment2-40%")
# This is the R chunk for the required packages
library(readr)
library(magrittr)
library(tidyr)
library(Hmisc)
library(dplyr)
library(outliers)
library(lubridate)
library(forecast)
A Preprocessing Step is always an important step before analysing any data. Based on this data we have done the following:
The data is extracted from the following site:
{https://data.gov.au/dataset/ds-dga-cc5d888f-5850-47f3-815d-08289b22f5a8/details}.
The two datasets that was of interest is Airport Passenger Movements by Month - 20 major airports(CSV) which will be named “passengers” and Airport Aircraft Movements by Month - 20 airports(CSV) which will be named “details”. Both dataset contains 8883 observations with 12 variables.
Under the passengers dataset, the variables are as follows:
Under the aircraft details dataset, the variables are as follows:
passengers <- read_csv("passengers.csv")
details <- read_csv("details.csv")
head(details,3)
## # A tibble: 3 x 12
## AIRPORT Year Month Dom_Acm_In Dom_Acm_Out Dom_Acm_Total Int_Acm_In
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ADELAI… 1985 1 1908 1902 3810 35
## 2 ALICE … 1985 1 264 261 525 0
## 3 All Au… 1985 1 34670 34670 69340 1759
## # … with 5 more variables: Int_Acm_Out <dbl>, Int_Acm_Total <dbl>,
## # Acm_In <dbl>, Acm_Out <dbl>, Acm_Total <dbl>
head(passengers, 3)
## # A tibble: 3 x 12
## AIRPORT Year Month Dom_Pax_In Dom_Pax_Out Dom_Pax_Total Int_Pax_In
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ADELAI… 1985 1 81661 81630 163291 5806
## 2 ALICE … 1985 1 19238 17887 37125 0
## 3 All Au… 1985 1 1192395 1192395 2384790 263795
## # … with 5 more variables: Int_Pax_Out <dbl>, Int_Pax_Total <dbl>,
## # Pax_In <dbl>, Pax_Out <dbl>, Pax_Total <dbl>
#flight details by merging 2 datasets
flight_dts <- passengers %>% left_join(details, by=c("AIRPORT" = "AIRPORT", "Year" = "Year", "Month" = "Month"))
head(flight_dts)
## # A tibble: 6 x 21
## AIRPORT Year Month Dom_Pax_In Dom_Pax_Out Dom_Pax_Total Int_Pax_In
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ADELAI… 1985 1 81661 81630 163291 5806
## 2 ALICE … 1985 1 19238 17887 37125 0
## 3 All Au… 1985 1 1192395 1192395 2384790 263795
## 4 BALLINA 1985 1 0 0 0 0
## 5 BRISBA… 1985 1 120950 120776 241726 25867
## 6 CAIRNS 1985 1 18840 19865 38705 1683
## # … with 14 more variables: Int_Pax_Out <dbl>, Int_Pax_Total <dbl>,
## # Pax_In <dbl>, Pax_Out <dbl>, Pax_Total <dbl>, Dom_Acm_In <dbl>,
## # Dom_Acm_Out <dbl>, Dom_Acm_Total <dbl>, Int_Acm_In <dbl>,
## # Int_Acm_Out <dbl>, Int_Acm_Total <dbl>, Acm_In <dbl>, Acm_Out <dbl>,
## # Acm_Total <dbl>
flight_att <- attributes(flight_dts)
flight_att$names
## [1] "AIRPORT" "Year" "Month" "Dom_Pax_In"
## [5] "Dom_Pax_Out" "Dom_Pax_Total" "Int_Pax_In" "Int_Pax_Out"
## [9] "Int_Pax_Total" "Pax_In" "Pax_Out" "Pax_Total"
## [13] "Dom_Acm_In" "Dom_Acm_Out" "Dom_Acm_Total" "Int_Acm_In"
## [17] "Int_Acm_Out" "Int_Acm_Total" "Acm_In" "Acm_Out"
## [21] "Acm_Total"
flight_att$class
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
str(flight_dts)
## tibble [8,883 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ AIRPORT : chr [1:8883] "ADELAIDE" "ALICE SPRINGS" "All Australian Airports" "BALLINA" ...
## $ Year : num [1:8883] 1985 1985 1985 1985 1985 ...
## $ Month : num [1:8883] 1 1 1 1 1 1 1 1 1 1 ...
## $ Dom_Pax_In : num [1:8883] 81661 19238 1192395 0 120950 ...
## $ Dom_Pax_Out : num [1:8883] 81630 17887 1192395 0 120776 ...
## $ Dom_Pax_Total: num [1:8883] 163291 37125 2384790 0 241726 ...
## $ Int_Pax_In : num [1:8883] 5806 0 263795 0 25867 ...
## $ Int_Pax_Out : num [1:8883] 4733 0 208770 0 19178 ...
## $ Int_Pax_Total: num [1:8883] 10539 0 472565 0 45045 ...
## $ Pax_In : num [1:8883] 87467 19238 1456190 0 146817 ...
## $ Pax_Out : num [1:8883] 86363 17887 1401165 0 139954 ...
## $ Pax_Total : num [1:8883] 173830 37125 2857355 0 286771 ...
## $ Dom_Acm_In : num [1:8883] 1908 264 34670 0 2071 ...
## $ Dom_Acm_Out : num [1:8883] 1902 261 34670 0 2064 ...
## $ Dom_Acm_Total: num [1:8883] 3810 525 69340 0 4135 ...
## $ Int_Acm_In : num [1:8883] 35 0 1759 0 183 ...
## $ Int_Acm_Out : num [1:8883] 34 0 1731 0 183 ...
## $ Int_Acm_Total: num [1:8883] 69 0 3490 0 366 72 0 59 0 0 ...
## $ Acm_In : num [1:8883] 1943 264 36429 0 2254 ...
## $ Acm_Out : num [1:8883] 1936 261 36401 0 2247 ...
## $ Acm_Total : num [1:8883] 3879 525 72830 0 4501 ...
flight_dts$Month[flight_dts$Month == 1] <- "Jan"
flight_dts$Month[flight_dts$Month == 2] <- "Feb"
flight_dts$Month[flight_dts$Month == 3] <- "Mar"
flight_dts$Month[flight_dts$Month == 4] <- "Apr"
flight_dts$Month[flight_dts$Month == 5] <- "May"
flight_dts$Month[flight_dts$Month == 6] <- "Jun"
flight_dts$Month[flight_dts$Month == 7] <- "July"
flight_dts$Month[flight_dts$Month == 8] <- "Aug"
flight_dts$Month[flight_dts$Month == 9] <- "Sep"
flight_dts$Month[flight_dts$Month == 10] <- "Oct"
flight_dts$Month[flight_dts$Month == 11] <- "Nov"
flight_dts$Month[flight_dts$Month == 12] <- "Dec"
flight_dts$Month <- as.factor(flight_dts$Month)
levels(flight_dts$Month)
## [1] "Apr" "Aug" "Dec" "Feb" "Jan" "July" "Jun" "Mar" "May" "Nov"
## [11] "Oct" "Sep"
In this dataset, it is clear that the data looked untidy as it has failed to satisfy the following principles:
A new variable is required in this dataset which is “Type” that records the type of flight passengers were taking (i.e Domestic, International or Both). From here, we can now tidy the data in a way such that each variable has its own column. Renaming the columns is also required to tidy the data and standardise all variables. Once we have all three types of flight, we can combine them by using the function bind_rows().
#sorting the data by Domestic flights
flights.dom <- flight_dts %>% select(AIRPORT:Dom_Pax_Total, Dom_Acm_In:Dom_Acm_Total)
flights.dom$Type <- "Domestic"
colnames(flights.dom)[colnames(flights.dom)=="Dom_Acm_In"] <-"Aircrafts In"
colnames(flights.dom)[colnames(flights.dom)=="Dom_Acm_Out"] <-"Aircrafts Out"
colnames(flights.dom)[colnames(flights.dom)=="Dom_Acm_Total"] <-"Aircrafts Total"
colnames(flights.dom)[colnames(flights.dom)=="Dom_Pax_In"] <-"Passangers In"
colnames(flights.dom)[colnames(flights.dom)=="Dom_Pax_Out"] <-"Passangers Out"
colnames(flights.dom)[colnames(flights.dom)=="Dom_Pax_Total"] <-"Passangers Total"
head(flights.dom, 3)
## # A tibble: 3 x 10
## AIRPORT Year Month `Passangers In` `Passangers Out` `Passangers Tot…
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 ADELAI… 1985 Jan 81661 81630 163291
## 2 ALICE … 1985 Jan 19238 17887 37125
## 3 All Au… 1985 Jan 1192395 1192395 2384790
## # … with 4 more variables: `Aircrafts In` <dbl>, `Aircrafts Out` <dbl>,
## # `Aircrafts Total` <dbl>, Type <chr>
#sorting the data by international flights
flights.int <- flight_dts %>% select(AIRPORT:Month, Int_Pax_In:Int_Pax_Total, Int_Acm_In:Int_Acm_Total)
flights.int$Type <- "International"
colnames(flights.int)[colnames(flights.int)=="Int_Acm_In"] <-"Aircrafts In"
colnames(flights.int)[colnames(flights.int)=="Int_Acm_Out"] <-"Aircrafts Out"
colnames(flights.int)[colnames(flights.int)=="Int_Acm_Total"] <-"Aircrafts Total"
colnames(flights.int)[colnames(flights.int)=="Int_Pax_In"] <-"Passangers In"
colnames(flights.int)[colnames(flights.int)=="Int_Pax_Out"] <-"Passangers Out"
colnames(flights.int)[colnames(flights.int)=="Int_Pax_Total"] <-"Passangers Total"
head(flights.int, 3)
## # A tibble: 3 x 10
## AIRPORT Year Month `Passangers In` `Passangers Out` `Passangers Tot…
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 ADELAI… 1985 Jan 5806 4733 10539
## 2 ALICE … 1985 Jan 0 0 0
## 3 All Au… 1985 Jan 263795 208770 472565
## # … with 4 more variables: `Aircrafts In` <dbl>, `Aircrafts Out` <dbl>,
## # `Aircrafts Total` <dbl>, Type <chr>
#sorting the data by both Domestic and International flights
flight.total <- flight_dts %>% select(AIRPORT:Month, Pax_In:Pax_Total, Acm_In:Acm_Total)
flight.total$Type <- "Domestic & International"
colnames(flight.total)[colnames(flight.total)=="Acm_In"] <-"Aircrafts In"
colnames(flight.total)[colnames(flight.total)=="Acm_Out"] <-"Aircrafts Out"
colnames(flight.total)[colnames(flight.total)=="Acm_Total"] <-"Aircrafts Total"
colnames(flight.total)[colnames(flight.total)=="Pax_In"] <-"Passangers In"
colnames(flight.total)[colnames(flight.total)=="Pax_Out"] <-"Passangers Out"
colnames(flight.total)[colnames(flight.total)=="Pax_Total"] <-"Passangers Total"
head(flight.total, 3)
## # A tibble: 3 x 10
## AIRPORT Year Month `Passangers In` `Passangers Out` `Passangers Tot…
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 ADELAI… 1985 Jan 87467 86363 173830
## 2 ALICE … 1985 Jan 19238 17887 37125
## 3 All Au… 1985 Jan 1456190 1401165 2857355
## # … with 4 more variables: `Aircrafts In` <dbl>, `Aircrafts Out` <dbl>,
## # `Aircrafts Total` <dbl>, Type <chr>
#binding all 3 flights by rows
flights_new <- bind_rows(flights.dom, flights.int, flight.total)
head(flights_new)
## # A tibble: 6 x 10
## AIRPORT Year Month `Passangers In` `Passangers Out` `Passangers Tot…
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 ADELAI… 1985 Jan 81661 81630 163291
## 2 ALICE … 1985 Jan 19238 17887 37125
## 3 All Au… 1985 Jan 1192395 1192395 2384790
## 4 BALLINA 1985 Jan 0 0 0
## 5 BRISBA… 1985 Jan 120950 120776 241726
## 6 CAIRNS 1985 Jan 18840 19865 38705
## # … with 4 more variables: `Aircrafts In` <dbl>, `Aircrafts Out` <dbl>,
## # `Aircrafts Total` <dbl>, Type <chr>
From here, it is clear that the data is still not tidy due to the repititive nature in the dataset. Hance, we require a new variable which is “Status” recording whether the flight is under Arrivals or Departures. The colnames are once again changed order to standardized any subsetted datas and the dataset are combined with the function bind_rows().
#sorting the data by inbound and replcing it by Arrivals
inbound <- flights_new %>% select(AIRPORT:`Passangers In`, `Aircrafts In`, Type)
inbound$Status <- "Arrivals"
colnames(inbound)[colnames(inbound)=="Passangers In"] <-"Passengers"
colnames(inbound)[colnames(inbound)=="Aircrafts In"] <-"Aircrafts"
head(inbound)
## # A tibble: 6 x 7
## AIRPORT Year Month Passengers Aircrafts Type Status
## <chr> <dbl> <fct> <dbl> <dbl> <chr> <chr>
## 1 ADELAIDE 1985 Jan 81661 1908 Domestic Arrivals
## 2 ALICE SPRINGS 1985 Jan 19238 264 Domestic Arrivals
## 3 All Australian Airports 1985 Jan 1192395 34670 Domestic Arrivals
## 4 BALLINA 1985 Jan 0 0 Domestic Arrivals
## 5 BRISBANE 1985 Jan 120950 2071 Domestic Arrivals
## 6 CAIRNS 1985 Jan 18840 486 Domestic Arrivals
#sorting the data by outbound and replcing it by Departures
outbound <- flights_new %>% select(AIRPORT:Month, `Passangers Out`,`Aircrafts Out`,Type)
outbound$Status <- "Departures"
colnames(outbound)[colnames(outbound)=="Passangers Out"] <-"Passengers"
colnames(outbound)[colnames(outbound)=="Aircrafts Out"] <-"Aircrafts"
head(outbound)
## # A tibble: 6 x 7
## AIRPORT Year Month Passengers Aircrafts Type Status
## <chr> <dbl> <fct> <dbl> <dbl> <chr> <chr>
## 1 ADELAIDE 1985 Jan 81630 1902 Domestic Departures
## 2 ALICE SPRINGS 1985 Jan 17887 261 Domestic Departures
## 3 All Australian Airports 1985 Jan 1192395 34670 Domestic Departures
## 4 BALLINA 1985 Jan 0 0 Domestic Departures
## 5 BRISBANE 1985 Jan 120776 2064 Domestic Departures
## 6 CAIRNS 1985 Jan 19865 494 Domestic Departures
# row bind for the departure and arrival flights
flights_tidy1 <- bind_rows(inbound, outbound)
colnames(flights_tidy1)[colnames(flights_tidy1)=="AIRPORT"] <-"Airport"
head(flights_tidy1)
## # A tibble: 6 x 7
## Airport Year Month Passengers Aircrafts Type Status
## <chr> <dbl> <fct> <dbl> <dbl> <chr> <chr>
## 1 ADELAIDE 1985 Jan 81661 1908 Domestic Arrivals
## 2 ALICE SPRINGS 1985 Jan 19238 264 Domestic Arrivals
## 3 All Australian Airports 1985 Jan 1192395 34670 Domestic Arrivals
## 4 BALLINA 1985 Jan 0 0 Domestic Arrivals
## 5 BRISBANE 1985 Jan 120950 2071 Domestic Arrivals
## 6 CAIRNS 1985 Jan 18840 486 Domestic Arrivals
Checking the structure of the new table:
as.factor().#Converting the variables to factors
flights_tidy1$Airport <- as.factor(flights_tidy1$Airport)
flights_tidy1$Type <- as.factor(flights_tidy1$Type)
flights_tidy1 <- arrange(flights_tidy1, Airport, Year, Month)
flights_tidy1 <- flights_tidy1[c("Airport", "Month", "Year", "Type", "Status", "Passengers", "Aircrafts")]
flights_tidy1$Status <- as.factor(flights_tidy1$Status)
head(flights_tidy1)
## # A tibble: 6 x 7
## Airport Month Year Type Status Passengers Aircrafts
## <fct> <fct> <dbl> <fct> <fct> <dbl> <dbl>
## 1 ADELAIDE Apr 1985 Domestic Arrivals 76105 1803
## 2 ADELAIDE Apr 1985 International Arrivals 3346 28
## 3 ADELAIDE Apr 1985 Domestic & International Arrivals 79451 1831
## 4 ADELAIDE Apr 1985 Domestic Departures 77891 1804
## 5 ADELAIDE Apr 1985 International Departures 4721 29
## 6 ADELAIDE Apr 1985 Domestic & International Departures 82612 1833
New variables are added in the date by using the mutate() function. The new vaiables are Passengers per Aircraft(P.P.A) and Scaled.PPA. Passengers per Aircraft(P.P.A) is the ratio of Passengers and Aircraft where we assume the size of the aircrafts for specific “Type” of travels are the same. Since the values of Passengers per Aircraft(P.P.A) is large, we have to utilise the logarithmic function of based 10 for the P.P.A values. These values will be used in the later sections.
# creating new variables for Passengers per Aircraft(P.P.A) rounding to one decimal place and also a logorithmic scaled P.P.A round off to two decimal place
flight_tidy2 <- flights_tidy1 %>% mutate(
P.P.A = round(Passengers/Aircrafts,1),
Scaled.PPA = round(log10(P.P.A),2)
)
head(flight_tidy2, n=3)
## # A tibble: 3 x 9
## Airport Month Year Type Status Passengers Aircrafts P.P.A Scaled.PPA
## <fct> <fct> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ADELAIDE Apr 1985 Domestic Arriv… 76105 1803 42.2 1.63
## 2 ADELAIDE Apr 1985 Internation… Arriv… 3346 28 120. 2.08
## 3 ADELAIDE Apr 1985 Domestic & … Arriv… 79451 1831 43.4 1.64
The variable generate for Passengers per Aircraft may contain missing values or special values. Hence, we are required to check for all present inconsistencies in the dataset. After using colSums(is.na()) function, we found that there will still inconsistencies present in the data. we proceed to write a function what displays all the special values (i.e. Nan and Inf).
#scanning for missing values
colSums(is.na(flight_tidy2))
## Airport Month Year Type Status Passengers Aircrafts
## 0 0 0 0 0 0 0
## P.P.A Scaled.PPA
## 9701 9701
# scanning for total special values
is.special <- function(x){
if (is.numeric(x)) (is.nan(x)|is.infinite(x))
}
# scanning for special values (Nan)
is.specialnan <- function(x){
if (is.numeric(x)) (is.nan(x))
}
# scanning for special values (Inf)
is.specialinf <- function(x){
if (is.numeric(x)) (is.infinite(x))
}
sum(is.special(flight_tidy2$P.P.A))
## [1] 9702
sum(is.specialnan(flight_tidy2$P.P.A))
## [1] 9701
sum(is.specialinf(flight_tidy2$P.P.A))
## [1] 1
sum(is.special(flight_tidy2$Scaled.PPA))
## [1] 9709
sum(is.specialnan(flight_tidy2$Scaled.PPA))
## [1] 9701
sum(is.specialinf(flight_tidy2$Scaled.PPA))
## [1] 8
The reasons there maybe special values in the dataset is because:
Therefore, we have replaced all the special values with 0.
flight_tidy2$P.P.A[is.nan(flight_tidy2$P.P.A)] <- 0
flight_tidy2$P.P.A[is.infinite(flight_tidy2$P.P.A)] <- 0
flight_tidy2$Scaled.PPA[is.nan(flight_tidy2$Scaled.PPA)] <- 0
flight_tidy2$Scaled.PPA[is.infinite(flight_tidy2$Scaled.PPA)] <- 0
head(flight_tidy2,3)
## # A tibble: 3 x 9
## Airport Month Year Type Status Passengers Aircrafts P.P.A Scaled.PPA
## <fct> <fct> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ADELAIDE Apr 1985 Domestic Arriv… 76105 1803 42.2 1.63
## 2 ADELAIDE Apr 1985 Internation… Arriv… 3346 28 120. 2.08
## 3 ADELAIDE Apr 1985 Domestic & … Arriv… 79451 1831 43.4 1.64
Finally, we have shown that the dataset is consistent.
colSums(is.na(flight_tidy2))
## Airport Month Year Type Status Passengers Aircrafts
## 0 0 0 0 0 0 0
## P.P.A Scaled.PPA
## 0 0
We have subsetted the data into three different groups from the fight_tidy2 dataset to do a better visual inspection on the outliers for Arrivals and Departure Flights for all three groups.
domestic <- subset(flight_tidy2, `Type` == "Domestic", na.rm=TRUE)
international <- subset(flight_tidy2, `Type` == "International", na.rm=TRUE)
both <- subset(flight_tidy2, `Type` == "Domestic & International", na.rm=TRUE)
domestic$P.P.A %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.20 68.40 70.26 91.30 169.00
international$P.P.A %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 58.99 123.70 269.70
both$P.P.A %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 48.30 71.20 73.25 94.90 170.90
Using a box-plot, it gives us a visual illustrations whether outliers exists. From the figures below for all three groups, we can see that there a large numbers of outliers. Using the values of the Scaled_PPA, we are able to identify all the outliers present in the data by visual inspection.
boxplot(domestic$Scaled.PPA ~ domestic$Status,
main="Box Plot of Passengers per Aircraft for Domestic Flight",
ylab="Passengers per Aircraft",
xlab="Status",
ylim = c(-1,3),
col=3)
boxplot(international$Scaled.PPA ~ international$Status,
main="Box Plot of Passengers per Aircraft for International Flight",
ylab="Passengers per Aircraft",
xlab="Status",
ylim = c(-1,3),
col="grey")
boxplot(both$Scaled.PPA ~ both$Status,
main="Box Plot of P.P.A for Domestic & International Flights",
ylab="Passengers per Aircraft",
xlab="Status",
ylim = c(-1,3),
col=4)
By capping or winsorising involves replacing the outliers with the nearest neighbours that are not outliers. For outliers that lie outside the outlier fences on a box-plot, we can cap it by replacing those observations outside the lower limit with the value of 5th percentile and those that lie above the upper limit, with the value of 95th percentile.
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ))
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
flight_tidy2$P.P.A <- cap(na.omit(flight_tidy2$P.P.A))
hist(flight_tidy2$P.P.A, main = "Histogram of Passengers Per Aircraft",
xlab="Passenger per Aircraft")
From the above histogram, we can see that the data is rightly skewed. In order to correct for this skewed, a transformation has to be done on the dataset.
A mathemical transformation called the square root transformation where the sqrt() function is used to reduce this right skewness. It has an advantage that it can be applied to zero values. Hence, making this transformation the right fit for the data.
flight_tidy2$P.P.A_Transform <- round(sqrt(flight_tidy2$P.P.A),3)
head(flight_tidy2,3)
## # A tibble: 3 x 10
## Airport Month Year Type Status Passengers Aircrafts P.P.A Scaled.PPA
## <fct> <fct> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 ADELAI… Apr 1985 Dome… Arriv… 76105 1803 42.2 1.63
## 2 ADELAI… Apr 1985 Inte… Arriv… 3346 28 120. 2.08
## 3 ADELAI… Apr 1985 Dome… Arriv… 79451 1831 43.4 1.64
## # … with 1 more variable: P.P.A_Transform <dbl>
par(mfrow=c(1,2))
hist(flight_tidy2$P.P.A, main = "P.P.A",
xlab="Passenger/Aircraft", col="grey")
hist(flight_tidy2$P.P.A_Transform,
main = "Transfromed P.P.A",
xlab="sqrt(Passenger/Aircraft)",
col="red1")
As a result, it is clear that after the transformation, the data is now more symmetrically distributed. The reason for the high distribution in zero’s in the Histrogram is due the large amount of zero’s replaced for the special values. The reason is also due the nature of the dataset.