Summary

This short analysis is done for a self-training purpose. The dataset was found on Gapminder. The data origin is from the Center for Research on the Epidemiology of Disasters.

data source: https://www.gapminder.org/data/
data file: https://spreadsheets.google.com/pub?key=rSv5aMDwESiKg-yA__-tRFg&output=xls

The indicator represents the number of people killed in air accidents during the given year.

Dataset first overview

Please note that I saved the excel file as CSV on my computer.

After loading the file and having a quick view at the data, four small issues occure:

  • each column name (except the first one) start by “X”

  • last column named “X” is empty

  • last row is empty

  • columns are not variables

library(ggplot2)
library(dplyr)
library(tidyr)
library(ggrepel)
data.source <- read.csv("C:/Users/marc/Desktop/Data/160817_plane crash/indicator_air accident killed.csv", sep = ";", header= TRUE, stringsAsFactors = FALSE)
data.source <- tbl_df(data.source)
dim(data.source)  # number of rows and columns
## [1] 134  41
colnames(data.source) 
##  [1] "Air.accidents.killed" "X1970"                "X1971"               
##  [4] "X1972"                "X1973"                "X1974"               
##  [7] "X1975"                "X1976"                "X1977"               
## [10] "X1978"                "X1979"                "X1980"               
## [13] "X1981"                "X1982"                "X1983"               
## [16] "X1984"                "X1985"                "X1986"               
## [19] "X1987"                "X1988"                "X1989"               
## [22] "X1990"                "X1991"                "X1992"               
## [25] "X1993"                "X1994"                "X1995"               
## [28] "X1996"                "X1997"                "X1998"               
## [31] "X1999"                "X2000"                "X2001"               
## [34] "X2002"                "X2003"                "X2004"               
## [37] "X2005"                "X2006"                "X2007"               
## [40] "X2008"                "X"
head(data.source[,1:5], 5) #first rows and columns
## # A tibble: 5 x 5
##   Air.accidents.killed X1970 X1971 X1972 X1973
##                  <chr> <int> <int> <int> <int>
## 1          Afghanistan     0     0     0     0
## 2              Algeria     0     0     0     0
## 3               Angola     0     0     0     0
## 4            Argentina     0     0     0     0
## 5              Armenia     0     0     0     0
tail(data.source[,1:5],5)
## # A tibble: 5 x 5
##   Air.accidents.killed X1970 X1971 X1972 X1973
##                  <chr> <int> <int> <int> <int>
## 1         West Germany     0     0     0     0
## 2          Yemen, Rep.     0     0     0     0
## 3           Yugoslavia     0     0     0     0
## 4               Zambia     0     0     0     0
## 5                         NA    NA    NA    NA

Data cleaning

#Removing last empty row and column
data.cleaned <- data.source[-134,-41]

#Moving columns into Rows
data.cleaned <- gather(data.cleaned, year, people.killed, -Air.accidents.killed)
head(data.cleaned,5)
## # A tibble: 5 x 3
##   Air.accidents.killed  year people.killed
##                  <chr> <chr>         <int>
## 1          Afghanistan X1970             0
## 2              Algeria X1970             0
## 3               Angola X1970             0
## 4            Argentina X1970             0
## 5              Armenia X1970             0
colnames(data.cleaned) <- c("country", "year", "people.killed")

#remove the "X" in front of each year
data.cleaned$year <- gsub("X", "", data.cleaned$year)
data.cleaned$year <- as.numeric(data.cleaned$year)

Number of people killed per year

This dataset is from 1970 to 2008

group.year <- group_by(data.cleaned, year)
ppl.killed.per.year <- summarize(group.year, nb.death = sum(people.killed)) 

ggplot(ppl.killed.per.year, aes(x=year, y= nb.death, group=1, label=nb.death)) +
      geom_point(size=5, color= "#008080")+
      geom_line(size= 1.2, color= "#008080")+
      theme_bw()+
      geom_label_repel(color = "black", 
                       box.padding = unit(0.25, "lines"))+
      ylab("Number of people killed") +
      ggtitle("Number of people killed by air accidents per year for all countries")+
      scale_x_continuous(breaks= seq(1970,2008,3))

1996 stands out as being a tragic year. We can check to see in which countries were the main incidents this year.

year.1996 <- filter(data.cleaned, year == 1996 & people.killed != 0) %>%
      arrange(desc(people.killed))      
knitr::kable(head(year.1996,10))
country year people.killed
Congo, Dem. Rep. 1996 432
India 1996 373
United States 1996 361
Peru 1996 216
Dominican Rep. 1996 189
Nigeria 1996 154
Sudan 1996 144
Norway 1996 143
Comoros 1996 127
Brazil 1996 98

Top 15 countries with the most people killed in air accidents

killed.country <- group_by(data.cleaned, country) %>% 
      summarize(death = sum(people.killed)) %>%
      arrange(desc(death))

head15 <- head(killed.country,15)


ggplot(head15, aes(x=country, y=death)) +
      geom_bar(stat = "identity", fill= "#008080") +
      coord_flip()+
      theme_bw()

Top 15 countries with the least people killed in air accidents

bottom15 <- tail(killed.country,15)

ggplot(bottom15, aes(x=country, y=death)) +
      geom_bar(stat = "identity", fill= "#008080") +
      coord_flip()+
      theme_bw()

Worst incident(s) for a given year and country

A small research on Wikipedia indicates that the worst incident in the canari island is related to “The Tenerife airport disaster [which was] was a fatal runway collision between two Boeing 747s on Sunday, March 27, 1977”

knitr::kable(head(arrange(data.cleaned,desc(people.killed)),5))
country year people.killed
Canary Is 1977 562
Japan 1985 520
Congo, Dem. Rep. 1996 432
Russia 1994 431
Indonesia 1997 379