#Tidying and Transforming Data
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
Data was entered in a csv file called thomps_HW5.csv
CSV read into R from github.
url <- "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/thomps_HW5.csv"
dfuntidy <- read_csv(url)
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_double(),
## Phoenix = col_double(),
## `San Diego` = col_double(),
## `San Francisco` = col_double(),
## Seattle = col_double()
## )
dfuntidy
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
tbuntidy<-tbl_df(dfuntidy)
tbuntidy
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
We transform the data by first reshaping a wide table to a tidy table with variables
as columns and observations as rows.
tblTidy<-gather(dfuntidy,"Destinations","Flights",3:7)
tblTidy
## # A tibble: 25 x 4
## X1 X2 Destinations Flights
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 <NA> delayed Los Angeles 62
## 3 <NA> <NA> Los Angeles NA
## 4 AM WEST on time Los Angeles 694
## 5 <NA> delayed Los Angeles 117
## 6 ALASKA on time Phoenix 221
## 7 <NA> delayed Phoenix 12
## 8 <NA> <NA> Phoenix NA
## 9 AM WEST on time Phoenix 4840
## 10 <NA> delayed Phoenix 415
## # ... with 15 more rows
Next, we identify blanks and select all rows without blank values only, rename columns,
fill in missing info, create and match a new destination column with counts, and split
the status column.
blanks <- seq(3,nrow(dfuntidy),3)
clean <- dfuntidy %>% slice(-blanks) %>% rename("airline" = X1, "status" = X2)
for (i in seq(from=2, to=nrow(clean), by=2)) {
clean[i,1] <- clean[i-1,1]
}
cleaner <- clean %>% gather(`Los Angeles`:`Seattle`, key = destination, value = counts)
tidy <- cleaner %>% spread(status, counts)
The data is now tidy. Every column is a variable, where each row represents an observation,
and every value has a cell.
tidy
## # A tibble: 10 x 4
## airline destination delayed `on time`
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
Using the tidy table with groups we can compare both airlines arrival times. The table
shows that AM West has more delayed flights than Alaska, but also has more on time. Since
AM West has more flights than Alaska, we can compare is using ratios. Using pipes and mutate,
We add a column of total flights and arrange them in descending order
flights <- tidy %>% mutate(totalflights = delayed + `on time`) %>% arrange(desc(totalflights, 'on time', delayed))
flights
## # A tibble: 10 x 5
## airline destination delayed `on time` totalflights
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 AM WEST Phoenix 415 4840 5255
## 2 ALASKA Seattle 305 1841 2146
## 3 AM WEST Los Angeles 117 694 811
## 4 ALASKA San Francisco 102 503 605
## 5 ALASKA Los Angeles 62 497 559
## 6 AM WEST San Francisco 129 320 449
## 7 AM WEST San Diego 65 383 448
## 8 AM WEST Seattle 61 201 262
## 9 ALASKA Phoenix 12 221 233
## 10 ALASKA San Diego 20 212 232
To compare arrival ratios we can mutate our final table again to add a ratio column
for delayed flights (dratio) and on time flights (otratio)
flights2 <- flights %>% mutate(dratio = delayed/totalflights, otratio = 1-dratio)
flights2
## # A tibble: 10 x 7
## airline destination delayed `on time` totalflights dratio otratio
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AM WEST Phoenix 415 4840 5255 0.0790 0.921
## 2 ALASKA Seattle 305 1841 2146 0.142 0.858
## 3 AM WEST Los Angeles 117 694 811 0.144 0.856
## 4 ALASKA San Francisco 102 503 605 0.169 0.831
## 5 ALASKA Los Angeles 62 497 559 0.111 0.889
## 6 AM WEST San Francisco 129 320 449 0.287 0.713
## 7 AM WEST San Diego 65 383 448 0.145 0.855
## 8 AM WEST Seattle 61 201 262 0.233 0.767
## 9 ALASKA Phoenix 12 221 233 0.0515 0.948
## 10 ALASKA San Diego 20 212 232 0.0862 0.914
From the ratio shown above we discover that Alaska has more delayed flights, with 14.2% of
its flights being delayed against 7.9% of AM West flights being delayed. We can also confirm
Alaska has more on time flights overall by grouping and descending the on time ratios by destination.
ot_flights <- flights2 %>%
arrange(destination, desc(otratio)) %>%
group_by(destination) %>%
select(-delayed:-dratio)
ot_flights
## # A tibble: 10 x 3
## # Groups: destination [5]
## airline destination otratio
## <chr> <chr> <dbl>
## 1 ALASKA Los Angeles 0.889
## 2 AM WEST Los Angeles 0.856
## 3 ALASKA Phoenix 0.948
## 4 AM WEST Phoenix 0.921
## 5 ALASKA San Diego 0.914
## 6 AM WEST San Diego 0.855
## 7 ALASKA San Francisco 0.831
## 8 AM WEST San Francisco 0.713
## 9 ALASKA Seattle 0.858
## 10 AM WEST Seattle 0.767
The ggplot graph supports our analysis of flight arrival times. Flights with Alaska
face less delays than flights with AM West.
ggplot(data=ot_flights, aes(x=reorder(destination, -otratio), y=otratio, fill=airline)) +
geom_bar(stat="identity", position=position_dodge()) +
scale_y_continuous(labels = percent, breaks=seq(0,1,.1)) +
scale_fill_brewer(palette="Blues") + theme_minimal() +
labs(title="% of on time flights by destination", y = " % On Time Flights ", x = "Destination")