Took the data provided and typed it into a CSV table that I then uploaded into R.
library(readr)
Airline_delays <- read_csv("~/Documents/CUNY Data 607/data/airlines/Airline_delays.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_integer(),
## Phoenix = col_integer(),
## `San Diego` = col_integer(),
## `San Francisco` = col_integer(),
## Seattle = col_integer()
## )
Airline_delays
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco`
## <chr> <chr> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503
## 2 <NA> delayed 62 12 20 102
## 3 <NA> <NA> NA NA NA NA
## 4 AM WEST on time 694 4840 383 320
## 5 <NA> delayed 117 415 65 129
## # ... with 1 more variables: Seattle <int>
The carrier names in the first column were not populated across all rows, so needed to fill the remaining rows in that column with the Carrier name.
library(tidyr)
Airline <- Airline_delays %>% fill(X1)
Airline
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco`
## <chr> <chr> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503
## 2 ALASKA delayed 62 12 20 102
## 3 ALASKA <NA> NA NA NA NA
## 4 AM WEST on time 694 4840 383 320
## 5 AM WEST delayed 117 415 65 129
## # ... with 1 more variables: Seattle <int>
There was an uncessary empty row in the original CSV file which appeared as NA when uploaded into R, so had to remove it.
Airline <- na.omit(Airline)
i.e. Change the City column data into observations rows.
Airline <- gather(Airline, "City", "n", 3:7)
Airline
## # A tibble: 20 x 4
## X1 X2 City n
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
i.e. Take the “On Time” and “Delayed” row variables and turn them into observations.
Airline <- spread(Airline, "X2", "n")
Airline
## # A tibble: 10 x 4
## X1 City delayed `on time`
## * <chr> <chr> <int> <int>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Airline <- dplyr::rename(Airline, Carrier = X1)
Airline <- dplyr::rename(Airline, Delayed = delayed)
Airline <- dplyr::rename(Airline, On_Time = 'on time')
Airline
## # A tibble: 10 x 4
## Carrier City Delayed On_Time
## * <chr> <chr> <int> <int>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
I still find the following function quicker and easier…
colnames(Airline)<- c(“Carrier”,“City”,“Delayed”, “On Time”)
Taking a look at the data:
dplyr::glimpse(Airline)
## Observations: 10
## Variables: 4
## $ Carrier <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "AM ...
## $ City <chr> "Los Angeles", "Phoenix", "San Diego", "San Francisco"...
## $ Delayed <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ On_Time <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
Look at the number of delays by Carrier:
select(Airline, Carrier, Delayed)
## # A tibble: 10 x 2
## Carrier Delayed
## * <chr> <int>
## 1 ALASKA 62
## 2 ALASKA 12
## 3 ALASKA 20
## 4 ALASKA 102
## 5 ALASKA 305
## 6 AM WEST 117
## 7 AM WEST 415
## 8 AM WEST 65
## 9 AM WEST 129
## 10 AM WEST 61
Mean and median of all Delays:
dplyr::summarise(Airline, Mean = mean(Delayed), Median = median(Delayed))
## # A tibble: 1 x 2
## Mean Median
## <dbl> <dbl>
## 1 128.8 83.5
Look at the number of delays in descending order:
dplyr::arrange(Airline, desc(Delayed))
## # A tibble: 10 x 4
## Carrier City Delayed On_Time
## <chr> <chr> <int> <int>
## 1 AM WEST Phoenix 415 4840
## 2 ALASKA Seattle 305 1841
## 3 AM WEST San Francisco 129 320
## 4 AM WEST Los Angeles 117 694
## 5 ALASKA San Francisco 102 503
## 6 AM WEST San Diego 65 383
## 7 ALASKA Los Angeles 62 497
## 8 AM WEST Seattle 61 201
## 9 ALASKA San Diego 20 212
## 10 ALASKA Phoenix 12 221
Number of delays in ascending order by Carrier:
dplyr::arrange(Airline, Carrier, Delayed)
## # A tibble: 10 x 4
## Carrier City Delayed On_Time
## <chr> <chr> <int> <int>
## 1 ALASKA Phoenix 12 221
## 2 ALASKA San Diego 20 212
## 3 ALASKA Los Angeles 62 497
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Seattle 61 201
## 7 AM WEST San Diego 65 383
## 8 AM WEST Los Angeles 117 694
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Phoenix 415 4840
The mean and sum of delays by Carrier:
Carrier <- Airline %>% group_by(Carrier) %>%
summarise(mean = mean(Delayed), sum = sum(Delayed), n = n())
Carrier
## # A tibble: 2 x 4
## Carrier mean sum n
## <chr> <dbl> <int> <int>
## 1 ALASKA 100.2 501 5
## 2 AM WEST 157.4 787 5
The mean and sum of delays by City:
City <- Airline %>% group_by(City) %>%
summarise(mean = mean(Delayed), sum = sum(Delayed)) %>%
arrange(desc(mean))
City
## # A tibble: 5 x 3
## City mean sum
## <chr> <dbl> <int>
## 1 Phoenix 213.5 427
## 2 Seattle 183.0 366
## 3 San Francisco 115.5 231
## 4 Los Angeles 89.5 179
## 5 San Diego 42.5 85
Create table to use for visualizing data:
CityCarrier <- Airline %>% group_by(City, Carrier) %>%
summarise(mean = mean(Delayed), sum = sum(Delayed))
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(ggplot2)
LineGraph <- ggplot(CityCarrier, aes(x = City, y = mean))
LineGraph <- LineGraph + geom_line(aes(color=factor(Carrier), group = Carrier))
LineGraph <- LineGraph + scale_color_discrete(name = "Carrier")
LineGraph <- LineGraph + labs(title = "Carrier Delays by City", x = "City", y = "Total Number of Delays")
LineGraph