R interface
library(knitr)
The chart above describes arrival delays for two airlines across five destinations. Your task is to:
#getting the csv data from github
schedule <- read.csv("https://raw.githubusercontent.com/maharjansudhan/DATA607/master/schedule.csv", header=TRUE, sep=",")
schedule
## ï.. X Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.1
#VIEW THE DATA AND CREATE NAMES
str(schedule)
## 'data.frame': 4 obs. of 7 variables:
## $ ï.. : Factor w/ 2 levels "ALASKA","AM WEST": 1 1 2 2
## $ X : Factor w/ 2 levels "delayed","on time": 2 1 2 1
## $ Los.Angeles : int 497 62 694 117
## $ Phoenix : int 221 12 4840 415
## $ San.Diego : int 212 20 383 65
## $ San.Franciso: int 503 102 320 129
## $ Seattle : int 1841 305 201 61
#for a better view
names(schedule)[1:2] <- c('AIRLINES','STATUS')
schedule
## AIRLINES STATUS Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
#trying to gather data in a meaningful way
Tidydata<-gather(schedule, "CITY","FLIGHT",3:7)
Tidydata
## AIRLINES STATUS CITY FLIGHT
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Franciso 503
## 14 ALASKA delayed San.Franciso 102
## 15 AM WEST on time San.Franciso 320
## 16 AM WEST delayed San.Franciso 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
X<-spread(Tidydata,STATUS,FLIGHT)
X
## AIRLINES CITY delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Franciso 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Franciso 129 320
## 10 AM WEST Seattle 61 201
#have a overview of your data if its in correct format or not
glimpse(X)
## Observations: 10
## Variables: 4
## $ AIRLINES <fct> ALASKA, ALASKA, ALASKA, ALASKA, ALASKA, AM WEST, AM ...
## $ CITY <chr> "Los.Angeles", "Phoenix", "San.Diego", "San.Franciso...
## $ delayed <int> 62, 12, 20, 102, 305, 117, 415, 65, 129, 61
## $ `on time` <int> 497, 221, 212, 503, 1841, 694, 4840, 383, 320, 201
#calculate the ratio of the arrival delays and show it with all the other information
X %>% mutate(ratio=delayed/(delayed+`on time`))
## Warning: package 'bindrcpp' was built under R version 3.5.1
## AIRLINES CITY delayed on time ratio
## 1 ALASKA Los.Angeles 62 497 0.11091234
## 2 ALASKA Phoenix 12 221 0.05150215
## 3 ALASKA San.Diego 20 212 0.08620690
## 4 ALASKA San.Franciso 102 503 0.16859504
## 5 ALASKA Seattle 305 1841 0.14212488
## 6 AM WEST Los.Angeles 117 694 0.14426634
## 7 AM WEST Phoenix 415 4840 0.07897241
## 8 AM WEST San.Diego 65 383 0.14508929
## 9 AM WEST San.Franciso 129 320 0.28730512
## 10 AM WEST Seattle 61 201 0.23282443