Extract and prepare data

library ("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library ("tidyr")
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
url<-"https://raw.githubusercontent.com/uplotnik/Data607/master/Airline-Safety.csv"
a<- read.csv(url)
kable(head(a, 10))
airline avail_seat_km_per_week incidents_85_99 fatal_accidents_85_99 fatalities_85_99 incidents_00_14 fatal_accidents_00_14 fatalities_00_14
Aer Lingus 320906734 2 0 0 0 0 0
Aeroflot* 1197672318 76 14 128 6 1 88
Aerolineas Argentinas 385803648 6 0 0 1 0 0
Aeromexico* 596871813 3 1 64 5 0 0
Air Canada 1865253802 2 0 0 2 0 0
Air France 3004002661 14 4 79 6 2 337
Air India* 869253552 2 1 329 4 1 158
Air New Zealand* 710174817 3 0 0 5 1 7
Alaska Airlines* 965346773 5 0 0 5 1 88
Alitalia 698012498 7 2 50 4 0 0

Rename columns

##Rename columns for easy manipulation 
dataset <- a %>%
  rename("Incidents. 1985-1999"= incidents_85_99,
         "Fatal accidents. 1985-1999" = fatal_accidents_85_99,
         "Fatalities. 1985-1999"= fatalities_85_99,
         "Incidents. 2000-2014" = incidents_00_14,
         "Fatal accidents. 2000-2014" = fatal_accidents_00_14,
        "Fatalities. 2000-2014" = fatalities_00_14)
head(dataset,5)
##                 airline avail_seat_km_per_week Incidents. 1985-1999
## 1            Aer Lingus              320906734                    2
## 2             Aeroflot*             1197672318                   76
## 3 Aerolineas Argentinas              385803648                    6
## 4           Aeromexico*              596871813                    3
## 5            Air Canada             1865253802                    2
##   Fatal accidents. 1985-1999 Fatalities. 1985-1999 Incidents. 2000-2014
## 1                          0                     0                    0
## 2                         14                   128                    6
## 3                          0                     0                    1
## 4                          1                    64                    5
## 5                          0                     0                    2
##   Fatal accidents. 2000-2014 Fatalities. 2000-2014
## 1                          0                     0
## 2                          1                    88
## 3                          0                     0
## 4                          0                     0
## 5                          0                     0

Gather columns

data2 <- dataset %>%
  gather(Episode, Quantity, 'Incidents. 1985-1999' :'Fatalities. 2000-2014') %>%
  arrange(airline)
 head(data2,5) 
##      airline avail_seat_km_per_week                    Episode Quantity
## 1 Aer Lingus              320906734       Incidents. 1985-1999        2
## 2 Aer Lingus              320906734 Fatal accidents. 1985-1999        0
## 3 Aer Lingus              320906734      Fatalities. 1985-1999        0
## 4 Aer Lingus              320906734       Incidents. 2000-2014        0
## 5 Aer Lingus              320906734 Fatal accidents. 2000-2014        0

Separate column

data3<-data2 %>% 
  separate(Episode, into = c("Episode", "YEAR"), sep = "\\. " )
head(data3,5)
##      airline avail_seat_km_per_week         Episode      YEAR Quantity
## 1 Aer Lingus              320906734       Incidents 1985-1999        2
## 2 Aer Lingus              320906734 Fatal accidents 1985-1999        0
## 3 Aer Lingus              320906734      Fatalities 1985-1999        0
## 4 Aer Lingus              320906734       Incidents 2000-2014        0
## 5 Aer Lingus              320906734 Fatal accidents 2000-2014        0

Sum Episodes

tx<-aggregate(data3$Quantity, by=list(Episode=data3$Episode, Year=data3$YEAR), FUN=sum) %>%
  group_by(Episode,Year) %>% 
  summarise(count=(x)) %>% 
  mutate(perc=count/sum(count))
 ##rename column
tx<- dplyr::rename(tx, 'Total'='count', 'Percent'='perc')
head (tx,5 )
## # A tibble: 5 x 4
## # Groups:   Episode [3]
##   Episode         Year      Total Percent
##   <chr>           <chr>     <int>   <dbl>
## 1 Fatal accidents 1985-1999   122   0.767
## 2 Fatal accidents 2000-2014    37   0.233
## 3 Fatalities      1985-1999  6295   0.669
## 4 Fatalities      2000-2014  3109   0.331
## 5 Incidents       1985-1999   402   0.635

Total Episodes by Years

p<-ggplot(tx, aes(x=Episode, y=Total, group=Year)) +
  geom_line(aes(color=Year))+
  geom_point(aes(color=Year))
p

total<-aggregate(data3$Quantity, by=list(Years=data3$YEAR), FUN=sum) %>%
  mutate(pct_change = ((x/lag(x) - 1) * 100))

 ##rename column
total<- dplyr::rename(total, 'Total'='x', 'Percentage change'='pct_change')
total
##       Years Total Percentage change
## 1 1985-1999  6819                NA
## 2 2000-2014  3377         -50.47661

The total quantity of episodes dropped on almost 51% after 1999.

    ggplot(tx, aes(x = Episode, y = Percent, fill = Year, label = scales::percent(Percent))) + 
    geom_col(position = 'dodge') + 
    geom_text(position = position_dodge(width = .9),    
              vjust = -0.5,    
              size = 3) + 
    scale_y_continuous(labels = scales::percent)+  theme(axis.text=element_text(angle=90))+  ggtitle("Total Episodes by Years")

Analyse flight episodes 1985-1999

newdata <- data3 %>% 
  filter(YEAR=="1985-1999") 
  head(newdata,5)
##      airline avail_seat_km_per_week         Episode      YEAR Quantity
## 1 Aer Lingus              320906734       Incidents 1985-1999        2
## 2 Aer Lingus              320906734 Fatal accidents 1985-1999        0
## 3 Aer Lingus              320906734      Fatalities 1985-1999        0
## 4  Aeroflot*             1197672318       Incidents 1985-1999       76
## 5  Aeroflot*             1197672318 Fatal accidents 1985-1999       14

The safest airlines in 1985-1999

group_by(newdata, airline)  %>% filter(Quantity == min(Quantity)) %>% arrange(Quantity)  %>% select (airline,Episode,Quantity)
## # A tibble: 79 x 3
## # Groups:   airline [56]
##    airline               Episode         Quantity
##    <fct>                 <chr>              <int>
##  1 Aer Lingus            Fatal accidents        0
##  2 Aer Lingus            Fatalities             0
##  3 Aerolineas Argentinas Fatal accidents        0
##  4 Aerolineas Argentinas Fatalities             0
##  5 Air Canada            Fatal accidents        0
##  6 Air Canada            Fatalities             0
##  7 Air New Zealand*      Fatal accidents        0
##  8 Air New Zealand*      Fatalities             0
##  9 Alaska Airlines*      Fatal accidents        0
## 10 Alaska Airlines*      Fatalities             0
## # ... with 69 more rows
newdata %>%
  group_by(Episode, YEAR) %>%
  summarise(
    mean = mean(Quantity), max=max(Quantity))%>%
  arrange(Episode)
## # A tibble: 3 x 4
## # Groups:   Episode [3]
##   Episode         YEAR        mean   max
##   <chr>           <chr>      <dbl> <int>
## 1 Fatal accidents 1985-1999   2.18    14
## 2 Fatalities      1985-1999 112.     535
## 3 Incidents       1985-1999   7.18    76

Total episodes by airlines

total<-aggregate(newdata$Quantity, by=list(Carrier=newdata$airline, Seats=newdata$avail_seat_km_per_week), FUN=sum)
 ##rename column
total<- dplyr::rename(total, 'Total'='x')%>%
 arrange(desc(Total))
head (total,15 )
##                       Carrier      Seats Total
## 1              China Airlines  813216487   553
## 2              Japan Airlines 1574217531   524
## 3          Delta / Northwest* 6525658894   443
## 4                  Korean Air 1734522605   442
## 5       United / Continental* 7139291291   346
## 6                  Air India*  869253552   332
## 7                     Avianca  396922563   331
## 8               Saudi Arabian  859673901   322
## 9                Thai Airways 1702802250   320
## 10                   Egyptair  557699891   293
## 11           Garuda Indonesia  613356665   273
## 12 US Airways / America West* 2455687887   247
## 13     Pakistan International  348563137   245
## 14                     SWISS*  792601299   232
## 15                  Aeroflot* 1197672318   218

Visualization

g2<-ggplot(data=total, aes(x=Carrier, y=Total, group=10)) +
  geom_line(arrow = arrow())+
  geom_point()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  labs( x="Carrier", y="Total")+ggtitle("Total Episodes 1985-1999")
g2

ggplot(newdata, aes(airline, Quantity, color = Episode)) +
  geom_point()+  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Analyse fliht episodes 2000-2014

newdata1 <- data3 %>% 
  filter(YEAR=="2000-2014") 
  head(newdata1,5)
##      airline avail_seat_km_per_week         Episode      YEAR Quantity
## 1 Aer Lingus              320906734       Incidents 2000-2014        0
## 2 Aer Lingus              320906734 Fatal accidents 2000-2014        0
## 3 Aer Lingus              320906734      Fatalities 2000-2014        0
## 4  Aeroflot*             1197672318       Incidents 2000-2014        6
## 5  Aeroflot*             1197672318 Fatal accidents 2000-2014        1
newdata1 %>%
  group_by(Episode, YEAR) %>%
  summarise(
    mean = mean(Quantity), max=max(Quantity))%>%
  arrange(Episode)
## # A tibble: 3 x 4
## # Groups:   Episode [3]
##   Episode         YEAR        mean   max
##   <chr>           <chr>      <dbl> <int>
## 1 Fatal accidents 2000-2014  0.661     3
## 2 Fatalities      2000-2014 55.5     537
## 3 Incidents       2000-2014  4.12     24

The safest airlines 2000-2014

group_by(newdata1, airline)  %>% filter(Quantity == min(Quantity)) %>% arrange(airline)  %>% select (airline,Episode,Quantity)
## # A tibble: 101 x 3
## # Groups:   airline [56]
##    airline               Episode         Quantity
##    <fct>                 <chr>              <int>
##  1 Aer Lingus            Incidents              0
##  2 Aer Lingus            Fatal accidents        0
##  3 Aer Lingus            Fatalities             0
##  4 Aeroflot*             Fatal accidents        1
##  5 Aerolineas Argentinas Fatal accidents        0
##  6 Aerolineas Argentinas Fatalities             0
##  7 Aeromexico*           Fatal accidents        0
##  8 Aeromexico*           Fatalities             0
##  9 Air Canada            Fatal accidents        0
## 10 Air Canada            Fatalities             0
## # ... with 91 more rows

Total episodes by airlines

total1<-aggregate(newdata1$Quantity, by=list(Carrier=newdata$airline, Seats=newdata$avail_seat_km_per_week), FUN=sum)
 ##rename column
total1<- dplyr::rename(total1, 'Total'='x')%>%
  arrange(desc(Total))
head (total1,15 )
##                  Carrier      Seats Total
## 1      Malaysia Airlines 1039171244   542
## 2              American* 5228357340   436
## 3             Air France 3004002661   345
## 4          Kenya Airways  277414794   287
## 5         China Airlines  813216487   228
## 6                    TAM 1509195646   197
## 7             Air India*  869253552   163
## 8               Gulf Air  301379762   147
## 9  United / Continental* 7139291291   125
## 10                  SAS*  682971852   117
## 11    Ethiopian Airlines  488560643    99
## 12             Aeroflot* 1197672318    95
## 13      Alaska Airlines*  965346773    94
## 14      Turkish Airlines 1946098294    94
## 15    Singapore Airlines 2376857805    86

Visualization

g2<-ggplot(data=total1, aes(x=Carrier, y=Total, group=10)) +
  geom_line(arrow = arrow())+
  geom_point()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  labs( x="Carrier", y="Total")+ggtitle("Total Episodes 2000-2014")
g2

theme(axis.text.x = element_text(angle = 90, hjust = 1))+

ggplot(newdata1, aes(airline, Quantity, color = Episode)) +
  geom_point()+  theme(axis.text.x = element_text(angle = 90, hjust = 1))