messy data into a more useful format.data.url <- file(paste(url,"Week%205/tidy%20data%20set.csv", sep = ""), open="r" )
flights <- read.csv(data.url, sep=",", header=TRUE, stringsAsFactors = FALSE)
flights <- flights[(rowSums(flights[3:7], na.rm = TRUE) > 0),] # removes blank row
flights$X[flights$X == ""] <- NA
flights$X <- na.locf(flights$X) #copy forwards the column name for next blank cell
colnames(flights)[c(1,2)] <- c("Airline", "Flight_Status")
flights$Flight_Status <- str_trim(flights$Flight_Status)
flights <- flights %>%
gather(Airline, Flight_Status, Los.Angeles:Seattle)
colnames(flights)[c(3,4)] <- c("Location", "Counts")
kable(flights, align = "c")
| Airline | Flight_Status | Location | Counts |
|---|---|---|---|
| ALASKA | on time | Los.Angeles | 497 |
| ALASKA | delayed | Los.Angeles | 62 |
| AM WEST | on time | Los.Angeles | 694 |
| AM WEST | delayed | Los.Angeles | 117 |
| ALASKA | on time | Phoenix | 221 |
| ALASKA | delayed | Phoenix | 12 |
| AM WEST | on time | Phoenix | 4840 |
| AM WEST | delayed | Phoenix | 415 |
| ALASKA | on time | San.Diego | 212 |
| ALASKA | delayed | San.Diego | 20 |
| AM WEST | on time | San.Diego | 383 |
| AM WEST | delayed | San.Diego | 65 |
| ALASKA | on time | San.Francisco | 503 |
| ALASKA | delayed | San.Francisco | 102 |
| AM WEST | on time | San.Francisco | 320 |
| AM WEST | delayed | San.Francisco | 129 |
| ALASKA | on time | Seattle | 1841 |
| ALASKA | delayed | Seattle | 305 |
| AM WEST | on time | Seattle | 201 |
| AM WEST | delayed | Seattle | 61 |
delayed and the on time flights status. I set the colors for the graph using RColorBrewer using the Paired palette and using the blue and orange colors .ggplot(flights, aes(x = Airline, y = Counts, fill = Flight_Status, group = Airline)) +
geom_bar(stat="identity", position=position_dodge(), colour="black") +
scale_fill_manual(values = rev(colors)) +
facet_grid(~Location) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
flights_percent <- ddply(flights, ~Airline + Location, summarize, 'All Flights' = sum(Counts))
flights_delayed <- flights[(flights$Flight_Status == "delayed"),]
flights_delayed <- flights_delayed[c(1,3,4)]
colnames(flights_delayed)[c(3)] <- c("Delayed Counts")
flights_delayed <- merge(flights_percent, flights_delayed, by = c("Airline", "Location"), all = TRUE)
flights_delayed$Percent_delayed <- flights_delayed$`Delayed Counts` / flights_delayed$`All Flights`
Los.Angeles, Phoenix, San.Diego, San.Francisco, Seattle)ggplot(flights_delayed, aes(x = Airline, y = Percent_delayed, fill = Airline)) +
scale_y_continuous(labels = percent, limits = c(0,.5)) +
geom_violin(trim = TRUE) +
scale_fill_manual(values = rev(colors)) +
geom_boxplot(width=0.1, fill = "white") +
geom_jitter(width=0.1) +
theme_minimal() +
theme(legend.position = "top") +
ylab("Percent of Flights Delayed\n")