Tidy Data
1. Fill first column with complete row names
library(tidyr)
flight <- flight %>% fill(X)
flight
## X X.1 Los.Angeles Phoenix San.Diego San.Fransico Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 AMWEST delayed 117 415 65 129 61
2. Remove NAs
flight <- na.omit(flight)
3. Make City attribute data observation
flight <- gather(flight, "City", "n", 3:7)
flight
## X X.1 City n
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AMWEST on time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST on time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AMWEST on time San.Diego 383
## 12 AMWEST delayed San.Diego 65
## 13 ALASKA on time San.Fransico 503
## 14 ALASKA delayed San.Fransico 102
## 15 AMWEST on time San.Fransico 320
## 16 AMWEST delayed San.Fransico 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
4. Turn “OnTime” and “Delayed” row variables into observations
flight <- spread(flight, "X.1", "n")
flight
## X City delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Fransico 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los.Angeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST San.Diego 65 383
## 9 AMWEST San.Fransico 129 320
## 10 AMWEST Seattle 61 201
flight <- dplyr::rename(flight, Carrier = X)
flight <- dplyr::rename(flight, Delayed = delayed)
flight <- dplyr::rename(flight, On_Time = 'on time')
flight
## Carrier City Delayed On_Time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Fransico 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los.Angeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST San.Diego 65 383
## 9 AMWEST San.Fransico 129 320
## 10 AMWEST Seattle 61 201
DATA ANALYSIS
dplyr::glimpse(flight)
## Observations: 10
## Variables: 4
## $ Carrier <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "AMW...
## $ City <chr> "Los.Angeles", "Phoenix", "San.Diego", "San.Fransico",...
## $ Delayed <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ On_Time <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
Delay duration by days
select(flight, Carrier, Delayed)
## Carrier Delayed
## 1 ALASKA 62
## 2 ALASKA 12
## 3 ALASKA 20
## 4 ALASKA 102
## 5 ALASKA 305
## 6 AMWEST 117
## 7 AMWEST 415
## 8 AMWEST 65
## 9 AMWEST 129
## 10 AMWEST 61
Number of delays in Descending order
dplyr::arrange(flight, desc(Delayed))
## Carrier City Delayed On_Time
## 1 AMWEST Phoenix 415 4840
## 2 ALASKA Seattle 305 1841
## 3 AMWEST San.Fransico 129 320
## 4 AMWEST Los.Angeles 117 694
## 5 ALASKA San.Fransico 102 503
## 6 AMWEST San.Diego 65 383
## 7 ALASKA Los.Angeles 62 497
## 8 AMWEST Seattle 61 201
## 9 ALASKA San.Diego 20 212
## 10 ALASKA Phoenix 12 221
Number of delays in ascending order by Carrier
dplyr::arrange(flight, Carrier, Delayed)
## Carrier City Delayed On_Time
## 1 ALASKA Phoenix 12 221
## 2 ALASKA San.Diego 20 212
## 3 ALASKA Los.Angeles 62 497
## 4 ALASKA San.Fransico 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Seattle 61 201
## 7 AMWEST San.Diego 65 383
## 8 AMWEST Los.Angeles 117 694
## 9 AMWEST San.Fransico 129 320
## 10 AMWEST Phoenix 415 4840
The mean and sum of delays by Carrier:
Carrier <- flight %>% group_by(Carrier) %>%
summarise(mean = mean(Delayed), sum = sum(Delayed), n = n())
Carrier
## # A tibble: 2 x 4
## Carrier mean sum n
## <chr> <dbl> <int> <int>
## 1 ALASKA 100 501 5
## 2 AMWEST 157 787 5
The mean and sum of delays by City:
City <- flight %>% group_by(City) %>%
summarise(mean = mean(Delayed), sum = sum(Delayed)) %>%
arrange(desc(mean))
City
## # A tibble: 5 x 3
## City mean sum
## <chr> <dbl> <int>
## 1 Phoenix 214 427
## 2 Seattle 183 366
## 3 San.Fransico 116 231
## 4 Los.Angeles 89.5 179
## 5 San.Diego 42.5 85
VISUALIZATION
Table creation for data visualization:
CityCarrier <- flight %>% group_by(City, Carrier) %>%
summarise(mean = mean(Delayed), sum = sum(Delayed))
Carrier Delays By City
library(ggplot2)
LineGraph <- ggplot(CityCarrier, aes(x = City, y = mean))
LineGraph <- LineGraph + geom_line(aes(color=factor(Carrier), group = Carrier))
LineGraph <- LineGraph + scale_color_discrete(name = "Carrier")
LineGraph <- LineGraph + labs(title = "Carrier Delays by City", x = "City", y = "Total Number of Delays")
LineGraph
