install.packages(“tidyr”)
install.packages(“dplyr”) install.packages(“ggplot2”)
require(tidyr)
require(dplyr)
require(ggplot2)
require(tibble)
df <- read.csv("https://raw.githubusercontent.com/simon63/Assignment5/master/flights-wk5.csv",
header = T, stringsAsFactors = F, fileEncoding = "UTF-8-BOM", check.names = F)
# header = T - using this option to use the first line for column names
# check.names = F - using this option to avoid "Los Angeles" being converted to "Los.Angeles"
# fileEncoding = "UTF-8-BOM" - using this option to avoid loading the "\ufeff" encoding character
df
## Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Missing column names must be assigned before tbl_df() can be used
names(df)[1:2] <- c("Airline", "n_type")
# When attempting to use tbl_df(), I noticed a pop-up description, saying:
# "Deprecated: please use tibble::as_tibble() instead."
df <- as_tibble(df)
df
## # A tibble: 5 x 7
## Airline n_type `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on ti~ 497 221 212 503 1841
## 2 "" delay~ 62 12 20 102 305
## 3 "" "" NA NA NA NA NA
## 4 AM WEST on ti~ 694 4840 383 320 201
## 5 "" delay~ 117 415 65 129 61
# Using gather() to do the conversion and
# also filtering out rows with NA value in the new "n" column
t <- gather(df, "city", "n", 3:7) %>% filter(!is.na(n))
t
## # A tibble: 20 x 4
## Airline n_type city n
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 "" delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 "" delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 "" delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 "" delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 "" delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 "" delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 "" delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 "" delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 "" delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 "" delayed Seattle 61
# Assign the Airline values from the "Odd" rows into the "Even" rows
t[(c(1:length(t)) %% 2) == 0,1] <- t[(c(1:length(t)) %% 2) != 0,1]
t
## # A tibble: 20 x 4
## Airline n_type city n
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
# Spread would not work without the prior step of assigning missing Airline values
t <- spread(t, n_type, n)
t
## # A tibble: 10 x 4
## Airline city delayed `on time`
## <chr> <chr> <int> <int>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
t <- mutate(t, pct_delayed = round((delayed / (delayed + t$'on time')) * 100, 2), pct_on_time = 100 - pct_delayed)
t
## # A tibble: 10 x 6
## Airline city delayed `on time` pct_delayed pct_on_time
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497 11.1 88.9
## 2 ALASKA Phoenix 12 221 5.15 94.8
## 3 ALASKA San Diego 20 212 8.62 91.4
## 4 ALASKA San Francisco 102 503 16.9 83.1
## 5 ALASKA Seattle 305 1841 14.2 85.8
## 6 AM WEST Los Angeles 117 694 14.4 85.6
## 7 AM WEST Phoenix 415 4840 7.90 92.1
## 8 AM WEST San Diego 65 383 14.5 85.5
## 9 AM WEST San Francisco 129 320 28.7 71.3
## 10 AM WEST Seattle 61 201 23.3 76.7