df <- read.csv("https://raw.githubusercontent.com/ajbentley/cuny_ms_ds/master/607/week5_hw_607_data.csv", header = TRUE)
df
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Use tidyr and dplyr as needed to tidy and transform your data
There are two ways to set this up, specifically the first two columns. Either it can be by airline and then by city or by city then by airline. In practice the decision would be driven by the client and the question.
The most likely questions would be:
- What airline is best at keeping its schedule?
- What airport is best at keeping traffic moving smoothly?
As the next part of the assignment asks to compare arrival delays I'll order as:
Planned Data Frame Format
## Airline City On.time Delayed
## 1 ALASKA LA ot count delayed count
## 2 AMWEST SD ot count delayed count
1 - Remove blank lines from the data
df <- df %>% na.omit()
df
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
2 - Fill in the empty cells in the first column with the values above it by first changing the blanks to NA and then using fill
df <- df %>%
mutate(X = na_if(X, "")) %>%
fill(X)
df
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
3 - Move the cities into a column
df <- df %>%
gather("City", "time", 3:7)
df
## X X.1 City time
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AMWEST on time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST on time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AMWEST on time San.Diego 383
## 12 AMWEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AMWEST on time San.Francisco 320
## 16 AMWEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST on time Seattle 201
## 20 AMWEST delayed Seattle 61
4 - spread X.1 (on time / delayed) across two columns
df <- df %>%
spread("X.1", "time")
df
## X City delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los.Angeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST San.Diego 65 383
## 9 AMWEST San.Francisco 129 320
## 10 AMWEST Seattle 61 201
5 - change X to Airline and capitalize delayed and on time
df <- df %>%
rename("Airline" = X, "Delayed" = delayed, "On time" = `on time`)
df
## Airline City Delayed On time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los.Angeles 117 694
## 7 AMWEST Phoenix 415 4840
## 8 AMWEST San.Diego 65 383
## 9 AMWEST San.Francisco 129 320
## 10 AMWEST Seattle 61 201
Now let’s try to do that with one tidy statement
df2 <- read.csv("https://raw.githubusercontent.com/ajbentley/cuny_ms_ds/master/607/week5_hw_607_data.csv", header = TRUE)
df2 <- df2 %>% na.omit() %>% # remove blank lines
mutate(X = na_if(X, "")) %>% # change blanks in Airline to NA
fill(X) %>% # fill blanks in Airline with value in cell above
gather("City", "time", 3:7) %>% # move cities into a column
spread("X.1", "time") %>% # spread on time and delayed into columns
rename("Airline" = X, # change first column name to Airline
"Delayed" = delayed, # capitalize delayed
"On time" = `on time`) # capitalize on time
head(df2)
## Airline City Delayed On time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los.Angeles 117 694
Planned Data Frame Format
## City ALASKA AMWEST Diff Pct.Diff
## 1 LA Del.time Del.time AL-AM AL/AM-1
## 2 SD Del.time Del.time AL-AM AL/AM-1
df_d <- df2 %>%
arrange(City) %>% # order by City
select(City, Airline, Delayed) %>% # move City to first column and drop On Time column
spread("Airline", "Delayed") %>% # make each airline a col with delay info beneath
mutate(Difference = ALASKA - AMWEST) %>% # make a col with the tot diff between airlines
mutate(Pct.Difference = (ALASKA / AMWEST) -1) %>% # make a col with the ptc diff between airlines
mutate(rounded = round(Pct.Difference, 2)) %>% # make a col with the pct diff rounded to 2 places
select(-('Pct.Difference')) %>% # drop the original pct. diff column
rename("Pct. Difference" = rounded) # rename the rounded column to Pct. Difference
df_d
## City ALASKA AMWEST Difference Pct. Difference
## 1 Los.Angeles 62 117 -55 -0.47
## 2 Phoenix 12 415 -403 -0.97
## 3 San.Diego 20 65 -45 -0.69
## 4 San.Francisco 102 129 -27 -0.21
## 5 Seattle 305 61 244 4.00
Planned Data Frame Format
## City ALASKA.On.time ALASKA.Delayed ALASKA.Ratio AMWEST.On.time
## 1 LA OT.Time Del.Time Del.Time / OT.Time OT.Time
## 2 PHX OT.Time Del.Time Del.Time / OT.Time OT.Time
## AMWEST.Delayed AMWEST.Ratio Difference
## 1 Del.Time Del.Time / OT.Time ALASKA Ratio - AMWEST Ratio
## 2 Del.Time Del.Time / OT.Time ALASKA Ratio - AMWEST Ratio
head(df2)
## Airline City Delayed On time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AMWEST Los.Angeles 117 694
df_r <- df2 %>%
arrange(City, Airline) %>% # order by City and Airline
select(City, Airline, Delayed, 'On time') %>% # move City to first column
rename("On.Time" = `On time`) %>% # change On time to easier name to work with
gather("Del/OT", 'Count', 3:4) %>% # create column with both delay and ontime #s
unite(temp1, "Airline", "Del/OT", sep = ".") %>% # merge airport name with delay or ontime
spread("temp1", "Count") %>% # make cols for each airport/time pair
mutate(ALASKA.Ratio = # make ALASKA ratio column
ALASKA.On.Time / ALASKA.Delayed) %>%
mutate(AMWEST.Ratio = # make AMWEST ratio column
AMWEST.On.Time / AMWEST.Delayed) %>%
mutate(Alaska.Ratio = round(ALASKA.Ratio, 1)) %>% # round Alaska ratio column to 1 decimal
mutate(AMW.Ratio = round(AMWEST.Ratio, 1)) %>% # round AMWEST ratio column to 1 decimal
select('City','ALASKA.On.Time', 'ALASKA.Delayed', # reorder columns
'Alaska.Ratio', 'AMWEST.On.Time',
'AMWEST.Delayed', 'AMW.Ratio')
df_r
## City ALASKA.On.Time ALASKA.Delayed Alaska.Ratio AMWEST.On.Time
## 1 Los.Angeles 497 62 8.0 694
## 2 Phoenix 221 12 18.4 4840
## 3 San.Diego 212 20 10.6 383
## 4 San.Francisco 503 102 4.9 320
## 5 Seattle 1841 305 6.0 201
## AMWEST.Delayed AMW.Ratio
## 1 117 5.9
## 2 415 11.7
## 3 65 5.9
## 4 129 2.5
## 5 61 3.3
I actually don’t like the way this looks–too busy. I’m going to just show the two ratios.
## City Alaska.Ratio AMW.Ratio
## 1 Los.Angeles 8.0 5.9
## 2 Phoenix 18.4 11.7
## 3 San.Diego 10.6 5.9
## 4 San.Francisco 4.9 2.5
## 5 Seattle 6.0 3.3