Tidying and Transforming Data

Requiring the tidyr and dplry packages

require(tidyr)
require(dplyr)

Loading the .csv file.

untidy <- read.csv(file="https://raw.githubusercontent.com/RobertSellers/R/master/data/untidy_airline.csv", 
header=TRUE, sep=",")
untidy <- tbl_df(untidy)

Tidying

Delete the empty row

untidy <- untidy[-3,]

Extract city data with a slice and transpose.

city<- t(slice(untidy,0))
city <- city[-1,]
city <- city[-1,]
city
##              
## Los.Angeles  
## Phoenix      
## San.Diego    
## San.Francisco
## Seattle

Extract airline data.

airlines<-strsplit(noquote(as.character(untidy$X))," ")

i<-1
for(i in seq(1,length(airlines))){
  if (airlines[i]=='character(0)'){
    airlines <- airlines[-i]
  }
}
airlines
## [[1]]
## [1] "ALASKA"
## 
## [[2]]
## [1] "AMWEST"

Creating the tidied final table, chiefly with a gather() function.

endrow<-nrow(city)+2
times<-untidy[3:endrow]
times
## Source: local data frame [4 x 5]
## 
##   Los.Angeles Phoenix San.Diego San.Francisco Seattle
##         (int)   (int)     (int)         (int)   (int)
## 1         497     221       212           503    1841
## 2          62      12        20           102     305
## 3         694    4840       383           320     201
## 4         117     415        65           129      61
times<-gather(times)
times
## Source: local data frame [20 x 2]
## 
##              key value
##            (chr) (int)
## 1    Los.Angeles   497
## 2    Los.Angeles    62
## 3    Los.Angeles   694
## 4    Los.Angeles   117
## 5        Phoenix   221
## 6        Phoenix    12
## 7        Phoenix  4840
## 8        Phoenix   415
## 9      San.Diego   212
## 10     San.Diego    20
## 11     San.Diego   383
## 12     San.Diego    65
## 13 San.Francisco   503
## 14 San.Francisco   102
## 15 San.Francisco   320
## 16 San.Francisco   129
## 17       Seattle  1841
## 18       Seattle   305
## 19       Seattle   201
## 20       Seattle    61
times<-data.frame(times,stringsAsFactors=FALSE)
times<-cbind(times, airline="",stringsAsFactors=FALSE)
times<-cbind(times, status="",stringsAsFactors=FALSE)
option<-c("on time","delayed")
times
##              key value airline status
## 1    Los.Angeles   497               
## 2    Los.Angeles    62               
## 3    Los.Angeles   694               
## 4    Los.Angeles   117               
## 5        Phoenix   221               
## 6        Phoenix    12               
## 7        Phoenix  4840               
## 8        Phoenix   415               
## 9      San.Diego   212               
## 10     San.Diego    20               
## 11     San.Diego   383               
## 12     San.Diego    65               
## 13 San.Francisco   503               
## 14 San.Francisco   102               
## 15 San.Francisco   320               
## 16 San.Francisco   129               
## 17       Seattle  1841               
## 18       Seattle   305               
## 19       Seattle   201               
## 20       Seattle    61

Using loops to populate the remaining data.

i<-1
n <- length(airlines)*2
for (i in seq(1,nrow(times), by=n)){
      j<-1
      times$airline[i] <- airlines[[j]]
      times$status[i] <- option[[j]]
      times$airline[i+1]<-airlines[[j]]
      times$status[i+1] <- option[[j+1]]
      times$airline[i+2] <- airlines[[j+1]]
      times$status[i+2] <- option[[j]]
      times$airline[i+3]<-airlines[[j+1]]
      times$status[i+3] <- option[[j+1]]
}
times
##              key value airline  status
## 1    Los.Angeles   497  ALASKA on time
## 2    Los.Angeles    62  ALASKA delayed
## 3    Los.Angeles   694  AMWEST on time
## 4    Los.Angeles   117  AMWEST delayed
## 5        Phoenix   221  ALASKA on time
## 6        Phoenix    12  ALASKA delayed
## 7        Phoenix  4840  AMWEST on time
## 8        Phoenix   415  AMWEST delayed
## 9      San.Diego   212  ALASKA on time
## 10     San.Diego    20  ALASKA delayed
## 11     San.Diego   383  AMWEST on time
## 12     San.Diego    65  AMWEST delayed
## 13 San.Francisco   503  ALASKA on time
## 14 San.Francisco   102  ALASKA delayed
## 15 San.Francisco   320  AMWEST on time
## 16 San.Francisco   129  AMWEST delayed
## 17       Seattle  1841  ALASKA on time
## 18       Seattle   305  ALASKA delayed
## 19       Seattle   201  AMWEST on time
## 20       Seattle    61  AMWEST delayed

Map Preparation and summary statistics

Geocoding and requiring libraries.

library(ggmap)
library(ggplot2)
library(rworldmap)
library(maptools)

coordinates<-geocode(times$key, output = "latlon" , source = "google")
geotidied<-cbind(times, coordinates)
geotidied
##              key value airline  status       lon      lat
## 1    Los.Angeles   497  ALASKA on time -118.2437 34.05223
## 2    Los.Angeles    62  ALASKA delayed -118.2437 34.05223
## 3    Los.Angeles   694  AMWEST on time -118.2437 34.05223
## 4    Los.Angeles   117  AMWEST delayed -118.2437 34.05223
## 5        Phoenix   221  ALASKA on time -112.0740 33.44838
## 6        Phoenix    12  ALASKA delayed -112.0740 33.44838
## 7        Phoenix  4840  AMWEST on time -112.0740 33.44838
## 8        Phoenix   415  AMWEST delayed -112.0740 33.44838
## 9      San.Diego   212  ALASKA on time -117.1611 32.71574
## 10     San.Diego    20  ALASKA delayed -117.1611 32.71574
## 11     San.Diego   383  AMWEST on time -117.1611 32.71574
## 12     San.Diego    65  AMWEST delayed -117.1611 32.71574
## 13 San.Francisco   503  ALASKA on time -122.4194 37.77493
## 14 San.Francisco   102  ALASKA delayed -122.4194 37.77493
## 15 San.Francisco   320  AMWEST on time -122.4194 37.77493
## 16 San.Francisco   129  AMWEST delayed -122.4194 37.77493
## 17       Seattle  1841  ALASKA on time -122.3321 47.60621
## 18       Seattle   305  ALASKA delayed -122.3321 47.60621
## 19       Seattle   201  AMWEST on time -122.3321 47.60621
## 20       Seattle    61  AMWEST delayed -122.3321 47.60621

Writing and applying a ratio function that calculates proportional delays per city and by airline.

ratio<-function (data){
  i<-1
  for (i in seq(1,nrow(data))){
    if (data$status[i]=="on time"){
      data$percent[i]<-data$value[i]/(data$value[i]+data$value[i+1])
      data$total[i]<-data$value[i]+data$value[i+1]
    }else{
      data$percent[i]<-data$value[i]/(data$value[i]+data$value[i-1])
      data$total[i]<-data$value[i]+data$value[i-1]
    }
  }
  return(data)
}

geotidied<-ratio(geotidied)
alaska<-subset(geotidied,airline=="ALASKA")
amwest<-subset(geotidied,airline=="AMWEST")
amwest_ontime<-subset(amwest,status=="on time")
amwest_delayed<-subset(amwest,status=="delayed")
alaska_ontime<-subset(alaska,status=="on time")
alaska_delayed<-subset(alaska,status=="delayed")

Plotting the results. The following percent calculation is cited as follows:

http://stackoverflow.com/questions/7145826/how-to-format-a-number-as-percentage-in-r

percent <- function(x, digits = 2, format = "f", ...) {
  paste0(formatC(100 * x, format = format, digits = digits, ...), "%")
}

ALASKA PERCENT DELAYED

newmap <- getMap(resolution = "low")
plot(newmap, xlim = c(-114, -113), ylim = c(29, 51), asp = 1,main = "ALASKA flights percent delayed")
points(alaska$lon, alaska$lat, col = "red", cex = (alaska$total/400), pch = 16)
pointLabel(alaska_ontime$lon, alaska_ontime$lat,labels=as.character(percent(alaska_delayed$percent)),cex=1)

AMWEST PERCENT DELAYED

plot(newmap, xlim = c(-114, -113), ylim = c(29, 51), asp = 1,main = "AMWEST flights percent delayed")
points(amwest$lon, amwest$lat, col = "red", cex = (amwest$total/400), pch = 16)
pointLabel(amwest_ontime$lon, amwest_ontime$lat,labels=as.character(percent(amwest_delayed$percent)),cex=1)