library ("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library ("tidyr")
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
url<-"https://raw.githubusercontent.com/uplotnik/Data607/master/Airline-Safety.csv"
a<- read.csv(url)
kable(head(a, 10))
| airline | avail_seat_km_per_week | incidents_85_99 | fatal_accidents_85_99 | fatalities_85_99 | incidents_00_14 | fatal_accidents_00_14 | fatalities_00_14 |
|---|---|---|---|---|---|---|---|
| Aer Lingus | 320906734 | 2 | 0 | 0 | 0 | 0 | 0 |
| Aeroflot* | 1197672318 | 76 | 14 | 128 | 6 | 1 | 88 |
| Aerolineas Argentinas | 385803648 | 6 | 0 | 0 | 1 | 0 | 0 |
| Aeromexico* | 596871813 | 3 | 1 | 64 | 5 | 0 | 0 |
| Air Canada | 1865253802 | 2 | 0 | 0 | 2 | 0 | 0 |
| Air France | 3004002661 | 14 | 4 | 79 | 6 | 2 | 337 |
| Air India* | 869253552 | 2 | 1 | 329 | 4 | 1 | 158 |
| Air New Zealand* | 710174817 | 3 | 0 | 0 | 5 | 1 | 7 |
| Alaska Airlines* | 965346773 | 5 | 0 | 0 | 5 | 1 | 88 |
| Alitalia | 698012498 | 7 | 2 | 50 | 4 | 0 | 0 |
##Rename columns for easy manipulation
dataset <- a %>%
rename("Incidents. 1985-1999"= incidents_85_99,
"Fatal accidents. 1985-1999" = fatal_accidents_85_99,
"Fatalities. 1985-1999"= fatalities_85_99,
"Incidents. 2000-2014" = incidents_00_14,
"Fatal accidents. 2000-2014" = fatal_accidents_00_14,
"Fatalities. 2000-2014" = fatalities_00_14)
head(dataset,5)
## airline avail_seat_km_per_week Incidents. 1985-1999
## 1 Aer Lingus 320906734 2
## 2 Aeroflot* 1197672318 76
## 3 Aerolineas Argentinas 385803648 6
## 4 Aeromexico* 596871813 3
## 5 Air Canada 1865253802 2
## Fatal accidents. 1985-1999 Fatalities. 1985-1999 Incidents. 2000-2014
## 1 0 0 0
## 2 14 128 6
## 3 0 0 1
## 4 1 64 5
## 5 0 0 2
## Fatal accidents. 2000-2014 Fatalities. 2000-2014
## 1 0 0
## 2 1 88
## 3 0 0
## 4 0 0
## 5 0 0
data2 <- dataset %>%
gather(Episode, Quantity, 'Incidents. 1985-1999' :'Fatalities. 2000-2014') %>%
arrange(airline)
head(data2,5)
## airline avail_seat_km_per_week Episode Quantity
## 1 Aer Lingus 320906734 Incidents. 1985-1999 2
## 2 Aer Lingus 320906734 Fatal accidents. 1985-1999 0
## 3 Aer Lingus 320906734 Fatalities. 1985-1999 0
## 4 Aer Lingus 320906734 Incidents. 2000-2014 0
## 5 Aer Lingus 320906734 Fatal accidents. 2000-2014 0
data3<-data2 %>%
separate(Episode, into = c("Episode", "YEAR"), sep = "\\. " )
head(data3,5)
## airline avail_seat_km_per_week Episode YEAR Quantity
## 1 Aer Lingus 320906734 Incidents 1985-1999 2
## 2 Aer Lingus 320906734 Fatal accidents 1985-1999 0
## 3 Aer Lingus 320906734 Fatalities 1985-1999 0
## 4 Aer Lingus 320906734 Incidents 2000-2014 0
## 5 Aer Lingus 320906734 Fatal accidents 2000-2014 0
tx<-aggregate(data3$Quantity, by=list(Episode=data3$Episode, Year=data3$YEAR), FUN=sum) %>%
group_by(Episode,Year) %>%
summarise(count=(x)) %>%
mutate(perc=count/sum(count))
##rename column
tx<- dplyr::rename(tx, 'Total'='count', 'Percent'='perc')
head (tx,5 )
## # A tibble: 5 x 4
## # Groups: Episode [3]
## Episode Year Total Percent
## <chr> <chr> <int> <dbl>
## 1 Fatal accidents 1985-1999 122 0.767
## 2 Fatal accidents 2000-2014 37 0.233
## 3 Fatalities 1985-1999 6295 0.669
## 4 Fatalities 2000-2014 3109 0.331
## 5 Incidents 1985-1999 402 0.635
p<-ggplot(tx, aes(x=Episode, y=Total, group=Year)) +
geom_line(aes(color=Year))+
geom_point(aes(color=Year))
p
total<-aggregate(data3$Quantity, by=list(Years=data3$YEAR), FUN=sum) %>%
mutate(pct_change = ((x/lag(x) - 1) * 100))
##rename column
total<- dplyr::rename(total, 'Total'='x', 'Percentage change'='pct_change')
total
## Years Total Percentage change
## 1 1985-1999 6819 NA
## 2 2000-2014 3377 -50.47661
The total quantity of episodes dropped on almost 51% after 1999.
ggplot(tx, aes(x = Episode, y = Percent, fill = Year, label = scales::percent(Percent))) +
geom_col(position = 'dodge') +
geom_text(position = position_dodge(width = .9),
vjust = -0.5,
size = 3) +
scale_y_continuous(labels = scales::percent)+ theme(axis.text=element_text(angle=90))+ ggtitle("Total Episodes by Years")
newdata <- data3 %>%
filter(YEAR=="1985-1999")
head(newdata,5)
## airline avail_seat_km_per_week Episode YEAR Quantity
## 1 Aer Lingus 320906734 Incidents 1985-1999 2
## 2 Aer Lingus 320906734 Fatal accidents 1985-1999 0
## 3 Aer Lingus 320906734 Fatalities 1985-1999 0
## 4 Aeroflot* 1197672318 Incidents 1985-1999 76
## 5 Aeroflot* 1197672318 Fatal accidents 1985-1999 14
group_by(newdata, airline) %>% filter(Quantity == min(Quantity)) %>% arrange(Quantity) %>% select (airline,Episode,Quantity)
## # A tibble: 79 x 3
## # Groups: airline [56]
## airline Episode Quantity
## <fct> <chr> <int>
## 1 Aer Lingus Fatal accidents 0
## 2 Aer Lingus Fatalities 0
## 3 Aerolineas Argentinas Fatal accidents 0
## 4 Aerolineas Argentinas Fatalities 0
## 5 Air Canada Fatal accidents 0
## 6 Air Canada Fatalities 0
## 7 Air New Zealand* Fatal accidents 0
## 8 Air New Zealand* Fatalities 0
## 9 Alaska Airlines* Fatal accidents 0
## 10 Alaska Airlines* Fatalities 0
## # ... with 69 more rows
newdata %>%
group_by(Episode, YEAR) %>%
summarise(
mean = mean(Quantity), max=max(Quantity))%>%
arrange(Episode)
## # A tibble: 3 x 4
## # Groups: Episode [3]
## Episode YEAR mean max
## <chr> <chr> <dbl> <int>
## 1 Fatal accidents 1985-1999 2.18 14
## 2 Fatalities 1985-1999 112. 535
## 3 Incidents 1985-1999 7.18 76
total<-aggregate(newdata$Quantity, by=list(Carrier=newdata$airline, Seats=newdata$avail_seat_km_per_week), FUN=sum)
##rename column
total<- dplyr::rename(total, 'Total'='x')%>%
arrange(desc(Total))
head (total,15 )
## Carrier Seats Total
## 1 China Airlines 813216487 553
## 2 Japan Airlines 1574217531 524
## 3 Delta / Northwest* 6525658894 443
## 4 Korean Air 1734522605 442
## 5 United / Continental* 7139291291 346
## 6 Air India* 869253552 332
## 7 Avianca 396922563 331
## 8 Saudi Arabian 859673901 322
## 9 Thai Airways 1702802250 320
## 10 Egyptair 557699891 293
## 11 Garuda Indonesia 613356665 273
## 12 US Airways / America West* 2455687887 247
## 13 Pakistan International 348563137 245
## 14 SWISS* 792601299 232
## 15 Aeroflot* 1197672318 218
g2<-ggplot(data=total, aes(x=Carrier, y=Total, group=10)) +
geom_line(arrow = arrow())+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
labs( x="Carrier", y="Total")+ggtitle("Total Episodes 1985-1999")
g2
ggplot(newdata, aes(airline, Quantity, color = Episode)) +
geom_point()+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
newdata1 <- data3 %>%
filter(YEAR=="2000-2014")
head(newdata1,5)
## airline avail_seat_km_per_week Episode YEAR Quantity
## 1 Aer Lingus 320906734 Incidents 2000-2014 0
## 2 Aer Lingus 320906734 Fatal accidents 2000-2014 0
## 3 Aer Lingus 320906734 Fatalities 2000-2014 0
## 4 Aeroflot* 1197672318 Incidents 2000-2014 6
## 5 Aeroflot* 1197672318 Fatal accidents 2000-2014 1
newdata1 %>%
group_by(Episode, YEAR) %>%
summarise(
mean = mean(Quantity), max=max(Quantity))%>%
arrange(Episode)
## # A tibble: 3 x 4
## # Groups: Episode [3]
## Episode YEAR mean max
## <chr> <chr> <dbl> <int>
## 1 Fatal accidents 2000-2014 0.661 3
## 2 Fatalities 2000-2014 55.5 537
## 3 Incidents 2000-2014 4.12 24
group_by(newdata1, airline) %>% filter(Quantity == min(Quantity)) %>% arrange(airline) %>% select (airline,Episode,Quantity)
## # A tibble: 101 x 3
## # Groups: airline [56]
## airline Episode Quantity
## <fct> <chr> <int>
## 1 Aer Lingus Incidents 0
## 2 Aer Lingus Fatal accidents 0
## 3 Aer Lingus Fatalities 0
## 4 Aeroflot* Fatal accidents 1
## 5 Aerolineas Argentinas Fatal accidents 0
## 6 Aerolineas Argentinas Fatalities 0
## 7 Aeromexico* Fatal accidents 0
## 8 Aeromexico* Fatalities 0
## 9 Air Canada Fatal accidents 0
## 10 Air Canada Fatalities 0
## # ... with 91 more rows
total1<-aggregate(newdata1$Quantity, by=list(Carrier=newdata$airline, Seats=newdata$avail_seat_km_per_week), FUN=sum)
##rename column
total1<- dplyr::rename(total1, 'Total'='x')%>%
arrange(desc(Total))
head (total1,15 )
## Carrier Seats Total
## 1 Malaysia Airlines 1039171244 542
## 2 American* 5228357340 436
## 3 Air France 3004002661 345
## 4 Kenya Airways 277414794 287
## 5 China Airlines 813216487 228
## 6 TAM 1509195646 197
## 7 Air India* 869253552 163
## 8 Gulf Air 301379762 147
## 9 United / Continental* 7139291291 125
## 10 SAS* 682971852 117
## 11 Ethiopian Airlines 488560643 99
## 12 Aeroflot* 1197672318 95
## 13 Alaska Airlines* 965346773 94
## 14 Turkish Airlines 1946098294 94
## 15 Singapore Airlines 2376857805 86
g2<-ggplot(data=total1, aes(x=Carrier, y=Total, group=10)) +
geom_line(arrow = arrow())+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
labs( x="Carrier", y="Total")+ggtitle("Total Episodes 2000-2014")
g2
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggplot(newdata1, aes(airline, Quantity, color = Episode)) +
geom_point()+ theme(axis.text.x = element_text(angle = 90, hjust = 1))