The Data

chart above describes arrival delays for two airlines across five destinations. Your task is to:

(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

# load require packages
require(tidyr)
## Loading required package: tidyr
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# Import data from a.csv file
source <-tbl_df(read.csv('D:/CUNY_SPS_DA/607_Data_Aq/week5/a.csv', stringsAsFactors = FALSE)) 
source
## # A tibble: 5 x 7
##       ï..       X Los.Angeles Phoenix San.Diego San.Francisco Seattle
##     <chr>   <chr>       <int>   <int>     <int>         <int>   <int>
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61
# Check data typesraw()
str(source)
## Classes 'tbl_df', 'tbl' and 'data.frame':    5 obs. of  7 variables:
##  $ ï..          : chr  "ALASKA" "" "" "AM WEST" ...
##  $ X            : chr  "on time" "delayed" "" "on time" ...
##  $ Los.Angeles  : int  497 62 NA 694 117
##  $ Phoenix      : int  221 12 NA 4840 415
##  $ San.Diego    : int  212 20 NA 383 65
##  $ San.Francisco: int  503 102 NA 320 129
##  $ Seattle      : int  1841 305 NA 201 61

Data Cleaning

# only select rows with complete data in all columns and show data
# renames first two columns 
a <- rename(na.omit(source),c("ï.." ="airlines", "X"="statues"))

# insert missing values in first columns
a[2,1] <- "ALASKA"
a[4,1] <- "AM WEST"

# to shorten the string values to character in second column
#a$statues <- strtrim(a$statues,1)
a
## # A tibble: 4 x 7
##   airlines statues Los.Angeles Phoenix San.Diego San.Francisco Seattle
##      <chr>   <chr>       <int>   <int>     <int>         <int>   <int>
## 1   ALASKA on time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        20           102     305
## 3  AM WEST on time         694    4840       383           320     201
## 4  AM WEST delayed         117     415        65           129      61

(2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

# long-formatted-data
a2 <- a %>%  
  gather(cities, n, Los.Angeles:Seattle, na.rm = TRUE)
a2
## # A tibble: 20 x 4
##    airlines statues        cities     n
##       <chr>   <chr>         <chr> <int>
##  1   ALASKA on time   Los.Angeles   497
##  2   ALASKA delayed   Los.Angeles    62
##  3  AM WEST on time   Los.Angeles   694
##  4  AM WEST delayed   Los.Angeles   117
##  5   ALASKA on time       Phoenix   221
##  6   ALASKA delayed       Phoenix    12
##  7  AM WEST on time       Phoenix  4840
##  8  AM WEST delayed       Phoenix   415
##  9   ALASKA on time     San.Diego   212
## 10   ALASKA delayed     San.Diego    20
## 11  AM WEST on time     San.Diego   383
## 12  AM WEST delayed     San.Diego    65
## 13   ALASKA on time San.Francisco   503
## 14   ALASKA delayed San.Francisco   102
## 15  AM WEST on time San.Francisco   320
## 16  AM WEST delayed San.Francisco   129
## 17   ALASKA on time       Seattle  1841
## 18   ALASKA delayed       Seattle   305
## 19  AM WEST on time       Seattle   201
## 20  AM WEST delayed       Seattle    61
# wide-formatted-data
a3 <- a2 %>%
 spread(statues,n)

# correct the column name "on time" to "on_time"
colnames(a3)[4] <- "on_time"

a3
## # A tibble: 10 x 4
##    airlines        cities delayed on_time
##  *    <chr>         <chr>   <int>   <int>
##  1   ALASKA   Los.Angeles      62     497
##  2   ALASKA       Phoenix      12     221
##  3   ALASKA     San.Diego      20     212
##  4   ALASKA San.Francisco     102     503
##  5   ALASKA       Seattle     305    1841
##  6  AM WEST   Los.Angeles     117     694
##  7  AM WEST       Phoenix     415    4840
##  8  AM WEST     San.Diego      65     383
##  9  AM WEST San.Francisco     129     320
## 10  AM WEST       Seattle      61     201
# load requried packages for string value
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## arrange():   dplyr, plyr
## compact():   purrr, plyr
## count():     dplyr, plyr
## failwith():  dplyr, plyr
## filter():    dplyr, stats
## id():        dplyr, plyr
## lag():       dplyr, stats
## mutate():    dplyr, plyr
## rename():    dplyr, plyr
## summarise(): dplyr, plyr
## summarize(): dplyr, plyr
library(stringr)

# filter data for airline "AM WEST"
am <- a2 %>%
 filter(str_detect(airlines, 'AM WEST'))%>%
 spread(statues,n)
am
## # A tibble: 5 x 4
##   airlines        cities delayed `on time`
## *    <chr>         <chr>   <int>     <int>
## 1  AM WEST   Los.Angeles     117       694
## 2  AM WEST       Phoenix     415      4840
## 3  AM WEST     San.Diego      65       383
## 4  AM WEST San.Francisco     129       320
## 5  AM WEST       Seattle      61       201
# correct the column name "on time" to "on_time"
colnames(am)[4] <- "on_time"

# unite columns airlines and cities
am2<- am %>% 
  unite(AM.city, airlines, cities,sep="." ) %>%
  select(AM.city,delayed,on_time) 

# tranfrom rows to columns
am2<- as.data.frame(t(am2))

# first row as the header in the table a4
colnames(am2) <- unlist(am2[1,])
am2=am2[-1,]
am2
##         AM WEST.Los.Angeles AM WEST.Phoenix AM WEST.San.Diego
## delayed                 117             415                65
## on_time                 694            4840               383
##         AM WEST.San.Francisco AM WEST.Seattle
## delayed                   129              61
## on_time                   320             201
sub.a3 <- a3%>%  select(airlines,delayed)

sub.a3 %>% summarise(Min = min(delayed, na.rm=TRUE),
                     Median = median(delayed, na.rm=TRUE),
                     Mean = mean(delayed, na.rm=TRUE),
                     Var = var(delayed, na.rm=TRUE),
                     SD = sd(delayed, na.rm=TRUE),
                     Max = max(delayed, na.rm=TRUE))
##   Min Median  Mean      Var       SD Max
## 1  12   83.5 128.8 16935.96 130.1382 415

(3) Perform analysis to compare the arrival delays for the two airlines.

Answer: AM_WEST airline has more delayed flights than ALASKA airline in every city. Both airlines have the most delayed flights in San.Francisco.

a3  
## # A tibble: 10 x 4
##    airlines        cities delayed on_time
##  *    <chr>         <chr>   <int>   <int>
##  1   ALASKA   Los.Angeles      62     497
##  2   ALASKA       Phoenix      12     221
##  3   ALASKA     San.Diego      20     212
##  4   ALASKA San.Francisco     102     503
##  5   ALASKA       Seattle     305    1841
##  6  AM WEST   Los.Angeles     117     694
##  7  AM WEST       Phoenix     415    4840
##  8  AM WEST     San.Diego      65     383
##  9  AM WEST San.Francisco     129     320
## 10  AM WEST       Seattle      61     201
df <- a3 %>% 
  mutate(total = on_time + delayed)%>%
  mutate(delayed = delayed/total)%>%
  select(airlines,cities,delayed)%>%
  arrange(cities,airlines)
df
## # A tibble: 10 x 3
##    airlines        cities    delayed
##       <chr>         <chr>      <dbl>
##  1   ALASKA   Los.Angeles 0.11091234
##  2  AM WEST   Los.Angeles 0.14426634
##  3   ALASKA       Phoenix 0.05150215
##  4  AM WEST       Phoenix 0.07897241
##  5   ALASKA     San.Diego 0.08620690
##  6  AM WEST     San.Diego 0.14508929
##  7   ALASKA San.Francisco 0.16859504
##  8  AM WEST San.Francisco 0.28730512
##  9   ALASKA       Seattle 0.14212488
## 10  AM WEST       Seattle 0.23282443
al_delay <- df%>% 
  filter(str_detect(airlines, 'ALASKA'))%>%
  select(cities,delayed)%>%
  arrange(cities)
#al_delay
colnames(al_delay)[2] <-"ALASKA.delayed"

am_delay <- df%>% 
  filter(str_detect(airlines, 'AM WEST'))%>%
  select(cities,delayed)%>%
  arrange(cities)
#am_delay
colnames(am_delay)[2] <-"AM_WEST.delayed"

delayed<- al_delay %>% 
  join(am_delay, by="cities") #%>% 
#  mutate(diff = AM_WEST.delayed - ALASKA.delayed )
delayed
##          cities ALASKA.delayed AM_WEST.delayed
## 1   Los.Angeles     0.11091234      0.14426634
## 2       Phoenix     0.05150215      0.07897241
## 3     San.Diego     0.08620690      0.14508929
## 4 San.Francisco     0.16859504      0.28730512
## 5       Seattle     0.14212488      0.23282443
library ('ggplot2')
barplot(delayed$ALASKA.delayed, names.arg=delayed$cities, ylim=c(0,0.30), main="ALASKA Airline Delayed (%)", ylab="blah", xlab="cities", col="darkmagenta")

barplot(delayed$AM_WEST.delayed, names.arg=delayed$cities, ylim=c(0,0.30), main="AM WEST Airline Delayed (%)", ylab="blah", xlab="cities", col="orange")

 matplot(delayed, type = c("b"),pch=1,col = 2:3) #plot
## Warning in xy.coords(x, y, xlabel, ylabel, log = log): NAs introduced by
## coercion
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion
 legend("topleft", legend =1:2, col=1:2, pch=1) # optional legend

write.table(delayed, "D:/CUNY_SPS_DA/607_Data_Aq/week5/delayed.csv", sep="\t")

(4) Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:

??? The URL to the .Rmd file in your GitHub repository. and

??? The URL for your rpubs.com web page.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.