require(tidyr)
require(dplyr)
untidy <- read.csv(file="https://raw.githubusercontent.com/RobertSellers/R/master/data/untidy_airline.csv",
header=TRUE, sep=",")
untidy <- tbl_df(untidy)
Delete the empty row
untidy <- untidy[-3,]
Extract city data with a slice and transpose.
city<- t(slice(untidy,0))
city <- city[-1,]
city <- city[-1,]
city
##
## Los.Angeles
## Phoenix
## San.Diego
## San.Francisco
## Seattle
Extract airline data.
airlines<-strsplit(noquote(as.character(untidy$X))," ")
i<-1
for(i in seq(1,length(airlines))){
if (airlines[i]=='character(0)'){
airlines <- airlines[-i]
}
}
airlines
## [[1]]
## [1] "ALASKA"
##
## [[2]]
## [1] "AMWEST"
Creating the tidied final table, chiefly with a gather() function.
endrow<-nrow(city)+2
times<-untidy[3:endrow]
times
## Source: local data frame [4 x 5]
##
## Los.Angeles Phoenix San.Diego San.Francisco Seattle
## (int) (int) (int) (int) (int)
## 1 497 221 212 503 1841
## 2 62 12 20 102 305
## 3 694 4840 383 320 201
## 4 117 415 65 129 61
times<-gather(times)
times
## Source: local data frame [20 x 2]
##
## key value
## (chr) (int)
## 1 Los.Angeles 497
## 2 Los.Angeles 62
## 3 Los.Angeles 694
## 4 Los.Angeles 117
## 5 Phoenix 221
## 6 Phoenix 12
## 7 Phoenix 4840
## 8 Phoenix 415
## 9 San.Diego 212
## 10 San.Diego 20
## 11 San.Diego 383
## 12 San.Diego 65
## 13 San.Francisco 503
## 14 San.Francisco 102
## 15 San.Francisco 320
## 16 San.Francisco 129
## 17 Seattle 1841
## 18 Seattle 305
## 19 Seattle 201
## 20 Seattle 61
times<-data.frame(times,stringsAsFactors=FALSE)
times<-cbind(times, airline="",stringsAsFactors=FALSE)
times<-cbind(times, status="",stringsAsFactors=FALSE)
option<-c("on time","delayed")
times
## key value airline status
## 1 Los.Angeles 497
## 2 Los.Angeles 62
## 3 Los.Angeles 694
## 4 Los.Angeles 117
## 5 Phoenix 221
## 6 Phoenix 12
## 7 Phoenix 4840
## 8 Phoenix 415
## 9 San.Diego 212
## 10 San.Diego 20
## 11 San.Diego 383
## 12 San.Diego 65
## 13 San.Francisco 503
## 14 San.Francisco 102
## 15 San.Francisco 320
## 16 San.Francisco 129
## 17 Seattle 1841
## 18 Seattle 305
## 19 Seattle 201
## 20 Seattle 61
Using loops to populate the remaining data.
i<-1
n <- length(airlines)*2
for (i in seq(1,nrow(times), by=n)){
j<-1
times$airline[i] <- airlines[[j]]
times$status[i] <- option[[j]]
times$airline[i+1]<-airlines[[j]]
times$status[i+1] <- option[[j+1]]
times$airline[i+2] <- airlines[[j+1]]
times$status[i+2] <- option[[j]]
times$airline[i+3]<-airlines[[j+1]]
times$status[i+3] <- option[[j+1]]
}
times
## key value airline status
## 1 Los.Angeles 497 ALASKA on time
## 2 Los.Angeles 62 ALASKA delayed
## 3 Los.Angeles 694 AMWEST on time
## 4 Los.Angeles 117 AMWEST delayed
## 5 Phoenix 221 ALASKA on time
## 6 Phoenix 12 ALASKA delayed
## 7 Phoenix 4840 AMWEST on time
## 8 Phoenix 415 AMWEST delayed
## 9 San.Diego 212 ALASKA on time
## 10 San.Diego 20 ALASKA delayed
## 11 San.Diego 383 AMWEST on time
## 12 San.Diego 65 AMWEST delayed
## 13 San.Francisco 503 ALASKA on time
## 14 San.Francisco 102 ALASKA delayed
## 15 San.Francisco 320 AMWEST on time
## 16 San.Francisco 129 AMWEST delayed
## 17 Seattle 1841 ALASKA on time
## 18 Seattle 305 ALASKA delayed
## 19 Seattle 201 AMWEST on time
## 20 Seattle 61 AMWEST delayed
Geocoding and requiring libraries.
library(ggmap)
library(ggplot2)
library(rworldmap)
library(maptools)
coordinates<-geocode(times$key, output = "latlon" , source = "google")
geotidied<-cbind(times, coordinates)
geotidied
## key value airline status lon lat
## 1 Los.Angeles 497 ALASKA on time -118.2437 34.05223
## 2 Los.Angeles 62 ALASKA delayed -118.2437 34.05223
## 3 Los.Angeles 694 AMWEST on time -118.2437 34.05223
## 4 Los.Angeles 117 AMWEST delayed -118.2437 34.05223
## 5 Phoenix 221 ALASKA on time -112.0740 33.44838
## 6 Phoenix 12 ALASKA delayed -112.0740 33.44838
## 7 Phoenix 4840 AMWEST on time -112.0740 33.44838
## 8 Phoenix 415 AMWEST delayed -112.0740 33.44838
## 9 San.Diego 212 ALASKA on time -117.1611 32.71574
## 10 San.Diego 20 ALASKA delayed -117.1611 32.71574
## 11 San.Diego 383 AMWEST on time -117.1611 32.71574
## 12 San.Diego 65 AMWEST delayed -117.1611 32.71574
## 13 San.Francisco 503 ALASKA on time -122.4194 37.77493
## 14 San.Francisco 102 ALASKA delayed -122.4194 37.77493
## 15 San.Francisco 320 AMWEST on time -122.4194 37.77493
## 16 San.Francisco 129 AMWEST delayed -122.4194 37.77493
## 17 Seattle 1841 ALASKA on time -122.3321 47.60621
## 18 Seattle 305 ALASKA delayed -122.3321 47.60621
## 19 Seattle 201 AMWEST on time -122.3321 47.60621
## 20 Seattle 61 AMWEST delayed -122.3321 47.60621
Writing and applying a ratio function that calculates proportional delays per city and by airline.
ratio<-function (data){
i<-1
for (i in seq(1,nrow(data))){
if (data$status[i]=="on time"){
data$percent[i]<-data$value[i]/(data$value[i]+data$value[i+1])
data$total[i]<-data$value[i]+data$value[i+1]
}else{
data$percent[i]<-data$value[i]/(data$value[i]+data$value[i-1])
data$total[i]<-data$value[i]+data$value[i-1]
}
}
return(data)
}
geotidied<-ratio(geotidied)
alaska<-subset(geotidied,airline=="ALASKA")
amwest<-subset(geotidied,airline=="AMWEST")
amwest_ontime<-subset(amwest,status=="on time")
amwest_delayed<-subset(amwest,status=="delayed")
alaska_ontime<-subset(alaska,status=="on time")
alaska_delayed<-subset(alaska,status=="delayed")
http://stackoverflow.com/questions/7145826/how-to-format-a-number-as-percentage-in-r
percent <- function(x, digits = 2, format = "f", ...) {
paste0(formatC(100 * x, format = format, digits = digits, ...), "%")
}
ALASKA PERCENT DELAYED
newmap <- getMap(resolution = "low")
plot(newmap, xlim = c(-114, -113), ylim = c(29, 51), asp = 1,main = "ALASKA flights percent delayed")
points(alaska$lon, alaska$lat, col = "red", cex = (alaska$total/400), pch = 16)
pointLabel(alaska_ontime$lon, alaska_ontime$lat,labels=as.character(percent(alaska_delayed$percent)),cex=1)
AMWEST PERCENT DELAYED
plot(newmap, xlim = c(-114, -113), ylim = c(29, 51), asp = 1,main = "AMWEST flights percent delayed")
points(amwest$lon, amwest$lat, col = "red", cex = (amwest$total/400), pch = 16)
pointLabel(amwest_ontime$lon, amwest_ontime$lat,labels=as.character(percent(amwest_delayed$percent)),cex=1)