# install.packages("stringr")
# install.packages("prettydoc")
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("knitr")


library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
library(prettydoc)
## Warning: package 'prettydoc' was built under R version 3.2.5
library(knitr)
## Warning: package 'knitr' was built under R version 3.2.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1). Uploaded .csv file from Github….

my_data <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA607/W5/flights.csv" ,stringsAsFactors = F)
my_data
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on_time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on_time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

2). Display Data/Headings….

str(my_data) 
## 'data.frame':    5 obs. of  7 variables:
##  $ X            : chr  "ALASKA" "" "" "AM WEST" ...
##  $ X.1          : chr  "on_time" "delayed" "" "on_time" ...
##  $ Los.Angeles  : int  497 62 NA 694 117
##  $ Phoenix      : int  221 12 NA 4840 415
##  $ San.Diego    : int  212 20 NA 383 65
##  $ San.Francisco: int  503 102 NA 320 129
##  $ Seattle      : int  1841 305 NA 201 61

3). Assign two values to the missing cell values ….

my_data[2, 1] <- "ALASKA"
my_data[5, 1] <- "AM WEST"
my_data
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on_time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on_time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61

4). Assign two new column names….

names(my_data)[1:2] <- c('Airlines','Status')
my_data
##   Airlines  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1   ALASKA on_time         497     221       212           503    1841
## 2   ALASKA delayed          62      12        20           102     305
## 3                           NA      NA        NA            NA      NA
## 4  AM WEST on_time         694    4840       383           320     201
## 5  AM WEST delayed         117     415        65           129      61

5). Gather flight details and cities in to two vectors….

Tidydata<-gather(my_data, "cities","flights",3:7)
Tidydata
##    Airlines  Status        cities flights
## 1    ALASKA on_time   Los.Angeles     497
## 2    ALASKA delayed   Los.Angeles      62
## 3                     Los.Angeles      NA
## 4   AM WEST on_time   Los.Angeles     694
## 5   AM WEST delayed   Los.Angeles     117
## 6    ALASKA on_time       Phoenix     221
## 7    ALASKA delayed       Phoenix      12
## 8                         Phoenix      NA
## 9   AM WEST on_time       Phoenix    4840
## 10  AM WEST delayed       Phoenix     415
## 11   ALASKA on_time     San.Diego     212
## 12   ALASKA delayed     San.Diego      20
## 13                      San.Diego      NA
## 14  AM WEST on_time     San.Diego     383
## 15  AM WEST delayed     San.Diego      65
## 16   ALASKA on_time San.Francisco     503
## 17   ALASKA delayed San.Francisco     102
## 18                  San.Francisco      NA
## 19  AM WEST on_time San.Francisco     320
## 20  AM WEST delayed San.Francisco     129
## 21   ALASKA on_time       Seattle    1841
## 22   ALASKA delayed       Seattle     305
## 23                        Seattle      NA
## 24  AM WEST on_time       Seattle     201
## 25  AM WEST delayed       Seattle      61

6). Filter the rows with data into new Data Farme….

Tidydata1 <- filter(Tidydata, Status !='')
Tidydata1
##    Airlines  Status        cities flights
## 1    ALASKA on_time   Los.Angeles     497
## 2    ALASKA delayed   Los.Angeles      62
## 3   AM WEST on_time   Los.Angeles     694
## 4   AM WEST delayed   Los.Angeles     117
## 5    ALASKA on_time       Phoenix     221
## 6    ALASKA delayed       Phoenix      12
## 7   AM WEST on_time       Phoenix    4840
## 8   AM WEST delayed       Phoenix     415
## 9    ALASKA on_time     San.Diego     212
## 10   ALASKA delayed     San.Diego      20
## 11  AM WEST on_time     San.Diego     383
## 12  AM WEST delayed     San.Diego      65
## 13   ALASKA on_time San.Francisco     503
## 14   ALASKA delayed San.Francisco     102
## 15  AM WEST on_time San.Francisco     320
## 16  AM WEST delayed San.Francisco     129
## 17   ALASKA on_time       Seattle    1841
## 18   ALASKA delayed       Seattle     305
## 19  AM WEST on_time       Seattle     201
## 20  AM WEST delayed       Seattle      61

7). With spread function Pivot Status column for the data in flight column and send the data to a new data frame….

Tidydata2<-spread(Tidydata1, Status, flights)
Tidydata2
##    Airlines        cities delayed on_time
## 1    ALASKA   Los.Angeles      62     497
## 2    ALASKA       Phoenix      12     221
## 3    ALASKA     San.Diego      20     212
## 4    ALASKA San.Francisco     102     503
## 5    ALASKA       Seattle     305    1841
## 6   AM WEST   Los.Angeles     117     694
## 7   AM WEST       Phoenix     415    4840
## 8   AM WEST     San.Diego      65     383
## 9   AM WEST San.Francisco     129     320
## 10  AM WEST       Seattle      61     201

8). Display data in two data frames in string format….

glimpse(Tidydata1)
## Observations: 20
## Variables: 4
## $ Airlines <chr> "ALASKA", "ALASKA", "AM WEST", "AM WEST", "ALASKA", "...
## $ Status   <chr> "on_time", "delayed", "on_time", "delayed", "on_time"...
## $ cities   <chr> "Los.Angeles", "Los.Angeles", "Los.Angeles", "Los.Ang...
## $ flights  <int> 497, 62, 694, 117, 221, 12, 4840, 415, 212, 20, 383, ...
glimpse(Tidydata2)
## Observations: 10
## Variables: 4
## $ Airlines <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "AM...
## $ cities   <chr> "Los.Angeles", "Phoenix", "San.Diego", "San.Francisco...
## $ delayed  <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ on_time  <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201

9). Display Tidydata1 - data in two data frames in table format….

kable(Tidydata1)
Airlines Status cities flights
ALASKA on_time Los.Angeles 497
ALASKA delayed Los.Angeles 62
AM WEST on_time Los.Angeles 694
AM WEST delayed Los.Angeles 117
ALASKA on_time Phoenix 221
ALASKA delayed Phoenix 12
AM WEST on_time Phoenix 4840
AM WEST delayed Phoenix 415
ALASKA on_time San.Diego 212
ALASKA delayed San.Diego 20
AM WEST on_time San.Diego 383
AM WEST delayed San.Diego 65
ALASKA on_time San.Francisco 503
ALASKA delayed San.Francisco 102
AM WEST on_time San.Francisco 320
AM WEST delayed San.Francisco 129
ALASKA on_time Seattle 1841
ALASKA delayed Seattle 305
AM WEST on_time Seattle 201
AM WEST delayed Seattle 61

10). Select only given columns….

select(Tidydata1,Airlines, cities)
##    Airlines        cities
## 1    ALASKA   Los.Angeles
## 2    ALASKA   Los.Angeles
## 3   AM WEST   Los.Angeles
## 4   AM WEST   Los.Angeles
## 5    ALASKA       Phoenix
## 6    ALASKA       Phoenix
## 7   AM WEST       Phoenix
## 8   AM WEST       Phoenix
## 9    ALASKA     San.Diego
## 10   ALASKA     San.Diego
## 11  AM WEST     San.Diego
## 12  AM WEST     San.Diego
## 13   ALASKA San.Francisco
## 14   ALASKA San.Francisco
## 15  AM WEST San.Francisco
## 16  AM WEST San.Francisco
## 17   ALASKA       Seattle
## 18   ALASKA       Seattle
## 19  AM WEST       Seattle
## 20  AM WEST       Seattle
filter(Tidydata2, cities=='Phoenix')
##   Airlines  cities delayed on_time
## 1   ALASKA Phoenix      12     221
## 2  AM WEST Phoenix     415    4840

11). Select distinct columns….

distinct(Tidydata1,Airlines, cities)
##    Airlines        cities
## 1    ALASKA   Los.Angeles
## 2   AM WEST   Los.Angeles
## 3    ALASKA       Phoenix
## 4   AM WEST       Phoenix
## 5    ALASKA     San.Diego
## 6   AM WEST     San.Diego
## 7    ALASKA San.Francisco
## 8   AM WEST San.Francisco
## 9    ALASKA       Seattle
## 10  AM WEST       Seattle

12). Calculate the ratio between Delayed and On_Time flights….

Tidydata2 %>% 
mutate(ratio=delayed/(delayed+`on_time`)) %>%
  arrange(cities, Airlines)
##    Airlines        cities delayed on_time      ratio
## 1    ALASKA   Los.Angeles      62     497 0.11091234
## 2   AM WEST   Los.Angeles     117     694 0.14426634
## 3    ALASKA       Phoenix      12     221 0.05150215
## 4   AM WEST       Phoenix     415    4840 0.07897241
## 5    ALASKA     San.Diego      20     212 0.08620690
## 6   AM WEST     San.Diego      65     383 0.14508929
## 7    ALASKA San.Francisco     102     503 0.16859504
## 8   AM WEST San.Francisco     129     320 0.28730512
## 9    ALASKA       Seattle     305    1841 0.14212488
## 10  AM WEST       Seattle      61     201 0.23282443

13). Plotting a graph….

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
Tidydata2 <- mutate(Tidydata2, Total = delayed + `on_time`, PercentDelayed = delayed / Total * 100)
Tidydata2 <- arrange(Tidydata2, cities, PercentDelayed)

ggplot(Tidydata2,aes(x=cities,y=PercentDelayed,fill=factor(Airlines)))+
    geom_bar(stat="identity",position="dodge")

14).Export Data in to a .csv….

# write.csv(my_data, 'E:/0_MSC/Modules/607- Data Acquisition and Management/Asignments/W5/FlightStatusTable.csv', row.names=T)

write.csv(my_data, "../W5/FlightStatusTable.csv")

15). Group by airline and summarize the total on time flights divided by the total flights…..

delays <- Tidydata2 %>% group_by(Airlines) %>% summarise(MeanPercent = round(mean(PercentDelayed), 0))
delays
## # A tibble: 2 × 2
##   Airlines MeanPercent
##      <chr>       <dbl>
## 1   ALASKA          11
## 2  AM WEST          18