#For the purposes of this assignment I uploaded the dataset from MySQL.
rm(list=ls())
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.1 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
myurl <- "https://raw.githubusercontent.com/VioletaStoyanova/Untidy_data/master/flights.csv"
data<- read.csv(myurl, header = TRUE, stringsAsFactors = FALSE)
data
## idflights_table Airline Schedule Los.Angeles Phoenix San.Diego
## 1 1 ALASKA on_time 497 221 212
## 2 2 NULL delayed 62 12 20
## 3 3 AM WEST on_time 694 4840 383
## 4 4 NULL delayed 117 415 65
## San.Francisco Seattle
## 1 503 1841
## 2 102 305
## 3 320 201
## 4 129 61
names(data) <- c("ID","Airline","Schedule", "Los Angeles", "Phoenix", "San Diego",
"San Francisco", "Seattle")
colnames(data)
## [1] "ID" "Airline" "Schedule" "Los Angeles"
## [5] "Phoenix" "San Diego" "San Francisco" "Seattle"
#Then I created a data frame where I added Airline names in column 2 row 2 and column 2 row 4.
data[2,2]<-c("ALASKA")
data[4,2]<-c("AM WEST")
data_df<-data.frame(data,stringsAsFactors=FALSE)
data_df[]<-lapply(data_df,as.array)
head(data_df)
## ID Airline Schedule Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 1 ALASKA on_time 497 221 212 503 1841
## 2 2 ALASKA delayed 62 12 20 102 305
## 3 3 AM WEST on_time 694 4840 383 320 201
## 4 4 AM WEST delayed 117 415 65 129 61
#I attempted to group all the cities together and separated them by frequency.
flights<-data%>%
gather(City,Freq,4:8) %>% select (-ID)
head(flights)
## Airline Schedule City Freq
## 1 ALASKA on_time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on_time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on_time Phoenix 221
## 6 ALASKA delayed Phoenix 12
#At this step the two variables delayed and on time were separated into two columns.
temp <- flights%>%
spread(Schedule,Freq)
str(temp$`delayed `) #this is the point where I realized that my delayed variable had a space and it was giving me trouble when I was trying to use the function mutate.
## int [1:10] 62 12 20 102 305 117 415 65 129 61
mutate(temp, total_flight = `delayed `+on_time)
## Airline City delayed on_time total_flight
## 1 ALASKA Los Angeles 62 497 559
## 2 ALASKA Phoenix 12 221 233
## 3 ALASKA San Diego 20 212 232
## 4 ALASKA San Francisco 102 503 605
## 5 ALASKA Seattle 305 1841 2146
## 6 AM WEST Los Angeles 117 694 811
## 7 AM WEST Phoenix 415 4840 5255
## 8 AM WEST San Diego 65 383 448
## 9 AM WEST San Francisco 129 320 449
## 10 AM WEST Seattle 61 201 262
colnames(temp)
## [1] "Airline" "City" "delayed " "on_time"
#I was suspiciuos and wanted to see the total number of flights comapring to the total percentage of delay between the two airlines.
total_flights<- group_by(temp, Airline) %>%
summarise(total_delayed=sum(`delayed `), total_ontime=sum(on_time), total=sum(`delayed `+on_time))
head(total_flights)
## # A tibble: 2 x 4
## Airline total_delayed total_ontime total
## <chr> <int> <int> <int>
## 1 ALASKA 501 3274 3775
## 2 AM WEST 787 6438 7225
# And my suspicions were right as one can see that the total number of flights of AM WEST is 7225 and the total number of flights of ALASKA is 3775 it would be a mistake to conclude that AM WEST had more delays because they had more flights comapring to ALASKA.
sum_airline<- group_by(temp, Airline) %>%
summarise(avg_delay = mean(`delayed `), avg_on_time = mean(on_time)) %>%
mutate(percent_delay=100*(avg_delay/(avg_delay+avg_on_time)))
sum_airline
## # A tibble: 2 x 4
## Airline avg_delay avg_on_time percent_delay
## <chr> <dbl> <dbl> <dbl>
## 1 ALASKA 100 655 13.3
## 2 AM WEST 157 1288 10.9