library(tidyr)
library(tidyverse)
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v readr 1.3.1 v stringr 1.4.0
## v ggplot2 3.2.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
First, let’s import our data and take a quick look at it:
airlineData <- as_tibble(read.csv('airlineData.csv', header = TRUE, stringsAsFactors = FALSE))
airlineData
## # A tibble: 5 x 7
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 "" delayed 62 12 20 102 305
## 3 "" "" NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 "" delayed 117 415 65 129 61
The first 2 columns aren’t labeled so let’s put some column names on them. While we’re at it, let’s eliminate the blank row separating the 2 airlines and bring the Airline name down into rows that have a status of delayed.
airlineData <- airlineData %>%
rename(AIRLINE = X, STATUS = X.1)%>%
filter(STATUS != '')
# replace blank with airline above
airlineData$AIRLINE[airlineData$AIRLINE == ''] <- airlineData$AIRLINE[str_detect(airlineData$AIRLINE,'[[:alpha:]]+')]
airlineData
## # A tibble: 4 x 7
## AIRLINE STATUS Los.Angeles Phoenix San.Diego San.Francisco Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
The data looks better, but it’s still not completely tidy. According to Wickham, each variable in the dataset should have its own column. In the current structure of the dataset, we have 2 problems:
airlineData <- airlineData %>%
gather(CITY,NUM_FLIGHTS, -AIRLINE, -STATUS) %>%
spread(STATUS, NUM_FLIGHTS)
colnames(airlineData) <- c('AIRLINE', 'CITY', 'DELAYED', 'ON_TIME')
airlineData
## # A tibble: 10 x 4
## AIRLINE CITY DELAYED ON_TIME
## <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
Now that we have tidy data, we can compare the number of delays for each of the airlines. We can see from the summary that AM WEST has more delays than ALASKA!
airlineData %>%
group_by(AIRLINE) %>%
summarise(TOTAL_DELAYS = sum(DELAYED),
AVG_NUM_DELAYS = mean(DELAYED),
MEDIAN_DELAYS = median(DELAYED),
MIN_DELAYS = min(DELAYED),
MAX_DELAYS = max(DELAYED))
## # A tibble: 2 x 6
## AIRLINE TOTAL_DELAYS AVG_NUM_DELAYS MEDIAN_DELAYS MIN_DELAYS MAX_DELAYS
## <chr> <int> <dbl> <int> <int> <int>
## 1 ALASKA 501 100. 62 12 305
## 2 AM WEST 787 157. 117 61 415