# install.packages("stringr")
# install.packages("prettydoc")
# install.packages("tidyr")
# install.packages("dplyr")
# install.packages("knitr")
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
library(prettydoc)
## Warning: package 'prettydoc' was built under R version 3.2.5
library(knitr)
## Warning: package 'knitr' was built under R version 3.2.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
1). Uploaded .csv file from Github….
my_data <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA607/W5/flights.csv" ,stringsAsFactors = F)
my_data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on_time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
2). Display Data/Headings….
str(my_data)
## 'data.frame': 5 obs. of 7 variables:
## $ X : chr "ALASKA" "" "" "AM WEST" ...
## $ X.1 : chr "on_time" "delayed" "" "on_time" ...
## $ Los.Angeles : int 497 62 NA 694 117
## $ Phoenix : int 221 12 NA 4840 415
## $ San.Diego : int 212 20 NA 383 65
## $ San.Francisco: int 503 102 NA 320 129
## $ Seattle : int 1841 305 NA 201 61
3). Assign two values to the missing cell values ….
my_data[2, 1] <- "ALASKA"
my_data[5, 1] <- "AM WEST"
my_data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on_time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
4). Assign two new column names….
names(my_data)[1:2] <- c('Airlines','Status')
my_data
## Airlines Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on_time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
5). Gather flight details and cities in to two vectors….
Tidydata<-gather(my_data, "cities","flights",3:7)
Tidydata
## Airlines Status cities flights
## 1 ALASKA on_time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 Los.Angeles NA
## 4 AM WEST on_time Los.Angeles 694
## 5 AM WEST delayed Los.Angeles 117
## 6 ALASKA on_time Phoenix 221
## 7 ALASKA delayed Phoenix 12
## 8 Phoenix NA
## 9 AM WEST on_time Phoenix 4840
## 10 AM WEST delayed Phoenix 415
## 11 ALASKA on_time San.Diego 212
## 12 ALASKA delayed San.Diego 20
## 13 San.Diego NA
## 14 AM WEST on_time San.Diego 383
## 15 AM WEST delayed San.Diego 65
## 16 ALASKA on_time San.Francisco 503
## 17 ALASKA delayed San.Francisco 102
## 18 San.Francisco NA
## 19 AM WEST on_time San.Francisco 320
## 20 AM WEST delayed San.Francisco 129
## 21 ALASKA on_time Seattle 1841
## 22 ALASKA delayed Seattle 305
## 23 Seattle NA
## 24 AM WEST on_time Seattle 201
## 25 AM WEST delayed Seattle 61
6). Filter the rows with data into new Data Farme….
Tidydata1 <- filter(Tidydata, Status !='')
Tidydata1
## Airlines Status cities flights
## 1 ALASKA on_time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on_time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on_time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on_time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on_time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on_time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on_time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on_time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on_time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on_time Seattle 201
## 20 AM WEST delayed Seattle 61
7). With spread function Pivot Status column for the data in flight column and send the data to a new data frame….
Tidydata2<-spread(Tidydata1, Status, flights)
Tidydata2
## Airlines cities delayed on_time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
8). Display data in two data frames in string format….
glimpse(Tidydata1)
## Observations: 20
## Variables: 4
## $ Airlines <chr> "ALASKA", "ALASKA", "AM WEST", "AM WEST", "ALASKA", "...
## $ Status <chr> "on_time", "delayed", "on_time", "delayed", "on_time"...
## $ cities <chr> "Los.Angeles", "Los.Angeles", "Los.Angeles", "Los.Ang...
## $ flights <int> 497, 62, 694, 117, 221, 12, 4840, 415, 212, 20, 383, ...
glimpse(Tidydata2)
## Observations: 10
## Variables: 4
## $ Airlines <chr> "ALASKA", "ALASKA", "ALASKA", "ALASKA", "ALASKA", "AM...
## $ cities <chr> "Los.Angeles", "Phoenix", "San.Diego", "San.Francisco...
## $ delayed <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ on_time <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
10). Select only given columns….
select(Tidydata1,Airlines, cities)
## Airlines cities
## 1 ALASKA Los.Angeles
## 2 ALASKA Los.Angeles
## 3 AM WEST Los.Angeles
## 4 AM WEST Los.Angeles
## 5 ALASKA Phoenix
## 6 ALASKA Phoenix
## 7 AM WEST Phoenix
## 8 AM WEST Phoenix
## 9 ALASKA San.Diego
## 10 ALASKA San.Diego
## 11 AM WEST San.Diego
## 12 AM WEST San.Diego
## 13 ALASKA San.Francisco
## 14 ALASKA San.Francisco
## 15 AM WEST San.Francisco
## 16 AM WEST San.Francisco
## 17 ALASKA Seattle
## 18 ALASKA Seattle
## 19 AM WEST Seattle
## 20 AM WEST Seattle
filter(Tidydata2, cities=='Phoenix')
## Airlines cities delayed on_time
## 1 ALASKA Phoenix 12 221
## 2 AM WEST Phoenix 415 4840
11). Select distinct columns….
distinct(Tidydata1,Airlines, cities)
## Airlines cities
## 1 ALASKA Los.Angeles
## 2 AM WEST Los.Angeles
## 3 ALASKA Phoenix
## 4 AM WEST Phoenix
## 5 ALASKA San.Diego
## 6 AM WEST San.Diego
## 7 ALASKA San.Francisco
## 8 AM WEST San.Francisco
## 9 ALASKA Seattle
## 10 AM WEST Seattle
12). Calculate the ratio between Delayed and On_Time flights….
Tidydata2 %>%
mutate(ratio=delayed/(delayed+`on_time`)) %>%
arrange(cities, Airlines)
## Airlines cities delayed on_time ratio
## 1 ALASKA Los.Angeles 62 497 0.11091234
## 2 AM WEST Los.Angeles 117 694 0.14426634
## 3 ALASKA Phoenix 12 221 0.05150215
## 4 AM WEST Phoenix 415 4840 0.07897241
## 5 ALASKA San.Diego 20 212 0.08620690
## 6 AM WEST San.Diego 65 383 0.14508929
## 7 ALASKA San.Francisco 102 503 0.16859504
## 8 AM WEST San.Francisco 129 320 0.28730512
## 9 ALASKA Seattle 305 1841 0.14212488
## 10 AM WEST Seattle 61 201 0.23282443
13). Plotting a graph….
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
Tidydata2 <- mutate(Tidydata2, Total = delayed + `on_time`, PercentDelayed = delayed / Total * 100)
Tidydata2 <- arrange(Tidydata2, cities, PercentDelayed)
ggplot(Tidydata2,aes(x=cities,y=PercentDelayed,fill=factor(Airlines)))+
geom_bar(stat="identity",position="dodge")

14).Export Data in to a .csv….
# write.csv(my_data, 'E:/0_MSC/Modules/607- Data Acquisition and Management/Asignments/W5/FlightStatusTable.csv', row.names=T)
write.csv(my_data, "../W5/FlightStatusTable.csv")
15). Group by airline and summarize the total on time flights divided by the total flights…..
delays <- Tidydata2 %>% group_by(Airlines) %>% summarise(MeanPercent = round(mean(PercentDelayed), 0))
delays
## # A tibble: 2 × 2
## Airlines MeanPercent
## <chr> <dbl>
## 1 ALASKA 11
## 2 AM WEST 18