Overview

The purpose of this assignment is to practice using the “tidyr” and “dplyr” packages and the functions they provide. The exercises below clean, reshape, and summarize an airline arrival data set to illustrate how those functions are used.

library(tidyr)
library(dplyr)
library(ggplot2)
library(plotly)
# Load the raw airline arrival counts from the comma-separated file.
numbersdata <- read.csv(file = "numbersense.csv", header = TRUE, sep = ",")
# Preview the first rows of the original data set we are working with.
head(numbersdata, n = 6L)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM West on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61
# Build `selectdata`: drop the blank separator row, give the columns readable
# names, and fill in the airline name on the rows where it was left blank.
selectdata <- numbersdata %>%
  # the separator row has NA in every count column; Phoenix is used as the probe
  filter(!is.na(Phoenix)) %>%
  rename(
    Airline = X,
    Arrival = X.1,
    "Los Angeles" = Los.Angeles,
    "San Diego" = San.Diego,
    "San Francisco" = San.Francisco
  ) %>%
  # The airline name is only present on the first row of each airline's pair
  # of rows. Turn the blanks into NA and carry the previous name downward,
  # instead of assigning a hard-coded vector of names in row order (which
  # breaks as soon as the file's rows or airlines change).
  mutate(Airline = na_if(as.character(Airline), "")) %>%
  fill(Airline)

# renamed dataset
selectdata
##   Airline Arrival Los Angeles Phoenix San Diego San Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2  Alaska delayed          62      12        20           102     305
## 3 AM West on time         694    4840       383           320     201
## 4 AM West delayed         117     415        65           129      61
# Create the tidy (long) data set: one row per airline, arrival status and
# city, with the flight count in `Number`.
# The city columns are selected by excluding the two id columns by name rather
# than by position (3:7), so adding or reordering columns upstream cannot
# silently gather the wrong ones.
tidydata <- selectdata %>%
  gather(key = region, value = Number, -Airline, -Arrival) %>%
  # sort so each airline's delayed and on-time rows are grouped together
  arrange(Airline, Arrival)

# the tidy dataset
head(tidydata)
##   Airline Arrival        region Number
## 1  Alaska delayed   Los Angeles     62
## 2  Alaska delayed       Phoenix     12
## 3  Alaska delayed     San Diego     20
## 4  Alaska delayed San Francisco    102
## 5  Alaska delayed       Seattle    305
## 6  Alaska on time   Los Angeles    497
# Total number of delayed and on-time flights for each airline.
summarize(
  group_by(tidydata, Airline, Arrival),
  total = sum(Number)
)
## Source: local data frame [4 x 3]
## Groups: Airline [?]
## 
##   Airline Arrival total
##     <chr>  <fctr> <int>
## 1  Alaska delayed   501
## 2  Alaska on time  3274
## 3 AM West delayed   787
## 4 AM West on time  6438
# Total number of flights for each airline, across all cities and statuses.
summarize(
  group_by(tidydata, Airline),
  total = sum(Number)
)
## # A tibble: 2 × 2
##   Airline total
##     <chr> <int>
## 1  Alaska  3775
## 2 AM West  7225
# Reshape the data so each arrival status ("delayed", "on time") becomes its
# own column, then use mutate to add the total number of flights and the
# proportion of flights that were delayed for each airline and city.
spreadata <- tidydata %>%
  spread(key = Arrival, value = Number) %>%
  mutate(total = delayed + `on time`) %>%
  mutate(prop = delayed / total)
# create plot
# Interactive scatter plot of the number of delayed flights per city,
# colored by airline.
ggplotly(
  tidydata %>%
    filter(Arrival == "delayed") %>%
    ggplot(aes(x = region, y = Number, color = Airline)) +
    geom_point(size = 3) +
    ggtitle("Delayed Flights") +
    ylab("Number of Delayed Flights")
)
# Interactive bar plot of the proportion of delayed flights per city, with
# the two airlines shown side by side.
ggplotly(
  spreadata %>%
    ggplot(aes(x = region, y = prop)) +
    geom_bar(aes(fill = Airline), stat = "identity", position = "dodge") +
    ylab("Proportion of Delayed Flights") +
    ggtitle("Delayed Flights by Region")
)

The steps above illustrate how functions from the “dplyr” and “tidyr” packages work and how they can simplify data analysis.