The chart above describes arrival delays for two airlines across five destinations. Your task is to: (1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below. (2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. (3) Perform analysis to compare the arrival delays for the two airlines. (4) Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission: ??? The URL to the .Rmd file in your GitHub repository. and ??? The URL for your rpubs.com web page.
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
library(tidyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v readr 1.3.1
## v tibble 2.1.3 v purrr 0.3.2
## v ggplot2 3.2.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## -- Conflicts ---------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tibble)
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(readr)
Follow this link to see uploaded .csv file (https://raw.githubusercontent.com/igukusamuel/DATA-607-Assignments/master/tidy_trans.csv)
airlines <- read_csv("https://raw.githubusercontent.com/igukusamuel/DATA-607-Assignments/master/tidy_trans.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_double(),
## Phoenix = col_double(),
## `San Diego` = col_double(),
## `San Francisco` = col_double(),
## Seattle = col_double()
## )
head(airlines, 5) #Print out first 5 rows to confirm that the data have been loaded correctly.
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
airlines[2,1] <- airlines[1,1]
airlines[5,1] <- airlines[4,1]
airlines[,2] <- sapply(airlines[,2], str_replace, " ", "_")
airlines <- na.omit(airlines) # To omit the null/empty rows
head(airlines)
## # A tibble: 4 x 7
## X1 X2[,"X2"] `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALAS~ on_time 497 221 212 503 1841
## 2 ALAS~ delayed 62 12 20 102 305
## 3 AMWE~ on_time 694 4840 383 320 201
## 4 AMWE~ delayed 117 415 65 129 61
names(airlines) #Column names before renaming the first and second columns.
## [1] "X1" "X2" "Los Angeles" "Phoenix"
## [5] "San Diego" "San Francisco" "Seattle"
airlines <- dplyr::rename(airlines, carrier = X1)
airlines <- dplyr::rename(airlines, arrival_type = X2)
names(airlines) #Column names after renaming the first and second columns.
## [1] "carrier" "arrival_type" "Los Angeles" "Phoenix"
## [5] "San Diego" "San Francisco" "Seattle"
airlines <- gather(airlines, "city", "n", 3:7)
head(airlines)
## # A tibble: 6 x 4
## carrier arrival_type city n
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on_time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AMWEST on_time Los Angeles 694
## 4 AMWEST delayed Los Angeles 117
## 5 ALASKA on_time Phoenix 221
## 6 ALASKA delayed Phoenix 12
airlines <- spread(airlines, "arrival_type", "n")
airlines
## # A tibble: 10 x 4
## carrier city delayed on_time
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los Angeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST San Diego 65 383
## 9 AMWEST San Francisco 129 320
## 10 AMWEST Seattle 61 201
dplyr::glimpse(airlines) #use the dplyr function glimpse() to view all the columns of the resulting data frame.
## Observations: 10
## Variables: 4
## $ carrier <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "AMW...
## $ city <chr> "Los Angeles", "Phoenix", "San Diego", "San Francisco"...
## $ delayed <dbl> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ on_time <dbl> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
airlines <- mutate(airlines, total_arrivals = delayed + on_time)
airlines <- mutate(airlines, rate_on_time = on_time/total_arrivals)
head(airlines)
## # A tibble: 6 x 6
## carrier city delayed on_time total_arrivals rate_on_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497 559 0.889
## 2 ALASKA Phoenix 12 221 233 0.948
## 3 ALASKA San Diego 20 212 232 0.914
## 4 ALASKA San Francisco 102 503 605 0.831
## 5 ALASKA Seattle 305 1841 2146 0.858
## 6 AMWEST Los Angeles 117 694 811 0.856
select(airlines, city, on_time, delayed)
## # A tibble: 10 x 3
## city on_time delayed
## <chr> <dbl> <dbl>
## 1 Los Angeles 497 62
## 2 Phoenix 221 12
## 3 San Diego 212 20
## 4 San Francisco 503 102
## 5 Seattle 1841 305
## 6 Los Angeles 694 117
## 7 Phoenix 4840 415
## 8 San Diego 383 65
## 9 San Francisco 320 129
## 10 Seattle 201 61
airlines %>% summarise(mean = mean(on_time), median = median(on_time), n = n())
## # A tibble: 1 x 3
## mean median n
## <dbl> <dbl> <int>
## 1 971. 440 10
airlines %>% group_by(on_time) %>%
arrange(desc(rate_on_time))
## # A tibble: 10 x 6
## # Groups: on_time [10]
## carrier city delayed on_time total_arrivals rate_on_time
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA Phoenix 12 221 233 0.948
## 2 AMWEST Phoenix 415 4840 5255 0.921
## 3 ALASKA San Diego 20 212 232 0.914
## 4 ALASKA Los Angeles 62 497 559 0.889
## 5 ALASKA Seattle 305 1841 2146 0.858
## 6 AMWEST Los Angeles 117 694 811 0.856
## 7 AMWEST San Diego 65 383 448 0.855
## 8 ALASKA San Francisco 102 503 605 0.831
## 9 AMWEST Seattle 61 201 262 0.767
## 10 AMWEST San Francisco 129 320 449 0.713