fullData <- read.csv2(file = "https://raw.githubusercontent.com/san123i/CUNY/master/Semester1/607/Week5/data.csv", header = T, sep = ",", na.strings = c("","NA"), blank.lines.skip = TRUE, stringsAsFactors = F)
names(fullData)[1] <- 'Airline'
fullData_Long <- gather(fullData, key="City", value="FlightCount", 3:7)
head(fullData_Long)
## Airline Status City FlightCount
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
1. gather -> Convert data from Wide format to Long format
2. select -> Select only a limited set of columns
3. group_by -> Group the data in a tbl/df mentioning selected columns
4. summarize -> Generate sumarized stats
5. filter -> Filter the tbl/df by applying conditions on columns
6. mutate -> Modify the column data or create new columns
7. left_join -> Apply left join on different datasets
airline_city_total <- fullData_Long %>% select(Airline, City, FlightCount) %>% group_by(Airline, City) %>% summarize(totalcount=sum(FlightCount))
fullData_long_totalcount <- left_join(fullData_Long, airline_city_total, by=c("Airline","City"))
head(fullData_long_totalcount)
## Airline Status City FlightCount totalcount
## 1 ALASKA on time Los.Angeles 497 559
## 2 ALASKA delayed Los.Angeles 62 559
## 3 AM WEST on time Los.Angeles 694 811
## 4 AM WEST delayed Los.Angeles 117 811
## 5 ALASKA on time Phoenix 221 233
## 6 ALASKA delayed Phoenix 12 233
city_airline_ontimeRatio <- fullData_long_totalcount %>% select(Airline, Status, City, FlightCount, totalcount) %>% mutate(ratio=FlightCount/totalcount) %>% filter(Status=='on time') %>% group_by(City, Airline) %>% select(City, Airline, ratio)
city_airline_ontimeRatio
## # A tibble: 10 x 3
## # Groups: City, Airline [10]
## City Airline ratio
## <chr> <chr> <dbl>
## 1 Los.Angeles ALASKA 0.889
## 2 Los.Angeles AM WEST 0.856
## 3 Phoenix ALASKA 0.948
## 4 Phoenix AM WEST 0.921
## 5 San.Diego ALASKA 0.914
## 6 San.Diego AM WEST 0.855
## 7 San.Fransisco ALASKA 0.831
## 8 San.Fransisco AM WEST 0.713
## 9 Seattle ALASKA 0.858
## 10 Seattle AM WEST 0.767
final_transformed_data <- mutate(city_airline_ontimeRatio, ontime_ratio=ratio, delay_ratio=1-ratio) %>% select(City, Airline, ontime_ratio, delay_ratio)
final_transformed_data
## # A tibble: 10 x 4
## # Groups: City, Airline [10]
## City Airline ontime_ratio delay_ratio
## <chr> <chr> <dbl> <dbl>
## 1 Los.Angeles ALASKA 0.889 0.111
## 2 Los.Angeles AM WEST 0.856 0.144
## 3 Phoenix ALASKA 0.948 0.0515
## 4 Phoenix AM WEST 0.921 0.0790
## 5 San.Diego ALASKA 0.914 0.0862
## 6 San.Diego AM WEST 0.855 0.145
## 7 San.Fransisco ALASKA 0.831 0.169
## 8 San.Fransisco AM WEST 0.713 0.287
## 9 Seattle ALASKA 0.858 0.142
## 10 Seattle AM WEST 0.767 0.233
ggplot(fullData_long_totalcount, aes(x=City, y = FlightCount, fill=Airline)) + geom_bar(stat="identity", position = position_dodge())
ggplot(final_transformed_data, aes(x=City, y = ontime_ratio, fill=Airline)) + geom_bar(stat="identity", position = position_dodge())
ggplot(final_transformed_data, aes(x=City, y = delay_ratio, fill=Airline)) + geom_bar(stat="identity", position = position_dodge())