Introduction

The Dataset

This dataset contains a great deal of information about terrorism. Its records and columns include the date, time, location, number of hostages, killed and wounded, whether there was a ransom, the outcome, whether it was a suicide attack, claims, and the weapons used.

Missing data

There is a significant amount of missing information. For example, there are missing longitude values and missing latitude values. A type of claim is recorded for only some events, and only some ransom notes are recorded.

Shark, Wikimedia Commons, public domain

Read the data

# Load naniar, which provides the missing-data summary used below.
library(naniar)
# Per-variable count (n_miss) and percentage (pct_miss) of missing values in df.
miss_var_summary(df)
## # A tibble: 26 x 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 Time       3403    52.4 
##  2 Species    2923    45.0 
##  3 Age        2872    44.3 
##  4 Day         875    13.5 
##  5 isFatal     619     9.54
##  6 Sex         567     8.74
##  7 Activity    554     8.54
##  8 Location    549     8.46
##  9 Month       506     7.80
## 10 Area        465     7.17
## # ... with 16 more rows
# Local shark image, converted to a data URI so it can serve as a bar symbol.
qomo <- "./images/shark.png"
sk <- png::readPNG(qomo)
ski <- raster2uri(as.raster(sk))
# Remote shark image used as the symbol of the second bar.
kili <- "https://cdn.pixabay.com/photo/2017/05/16/10/10/shark-2317422__340.png"

# Number of records per isFatal value, ignoring records where it is missing.
plt <- df %>%
  select(Country, isFatal) %>%
  filter(!is.na(isFatal)) %>%
  group_by(isFatal) %>%
  summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)

# One row per bar: label, height (first and second count in plt) and symbol.
data <- data.frame(
  x = c("non Fatal", "Fatal"),
  value = plt$count[1:2],
  symbol = paste0("image://", c(ski, kili))
)

# Pictorial bar chart with value labels and no legend.
data %>%
  e_charts(x) %>%
  e_pictorial(value, symbol) %>%
  e_labels(fontSize = 10) %>%
  e_legend(FALSE)

Shark Attacks

If we now look at the split by country, we see that there are a few countries where the number of people killed in terrorist attacks is very large. Some of them have experienced a recent massive increase, like Iraq, Syria and Afghanistan, while others have a long history, like Peru, Colombia, India and Pakistan.

# Treemap with one rectangle per Country, sized by the isFatal column.
treemap(
  df,
  index = "Country",
  vSize = "isFatal",
  palette = "Reds",
  title = "Shark Attack",
  fontsize.title = 14
)

Attacks by countries and years

Let’s now look at both countries and years at the same time. Besides the number of people killed, we also add the number of wounded (the color is proportional to the number of wounded).

# Two-level treemap: Country, then Year within each country.
# Rectangles are sized by the isFatal column; no colour variable is set
# (an earlier vColor = "nwound" setting is disabled).
treemap(
  df,
  index = c("Country", "Year"),
  type = "value",
  vSize = "isFatal",
  palette = "RdBu",
  title = "Victims in Shark Attacks  (Countries/Years) - size is proportional with the number of Attacks",
  title.legend = "Number of Attacks",
  fontsize.title = 13
)

# Horizontal bars for the years with more than 100 recorded attacks,
# ordered by the number of attacks.
df %>%
  filter(!is.na(Year)) %>%
  group_by(Year) %>%
  filter(n() > 100) %>%
  count() %>%
  ggplot() +
  geom_col(aes(x = reorder(Year, n), y = n, fill = Year)) +
  coord_flip() +
  theme_bw() +
  labs(x = "Years", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("Years with the highest number of Attacks \n by Shark Attack until 2019") +
  scale_y_continuous(breaks = seq(0, 300, by = 20))

# Attacks per year over every year present in the data.
df %>%
  filter(!is.na(Year)) %>%
  group_by(Year) %>%
  count() %>%
  ggplot() +
  geom_line(aes(x = Year, y = n), color = "red") +
  theme_bw() +
  labs(x = "Years", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("All time Attacks by Year")

# Same line chart, restricted to years after 1900.
df %>%
  filter(!is.na(Year), Year > 1900) %>%
  group_by(Year) %>%
  count() %>%
  ggplot() +
  geom_line(aes(x = Year, y = n), color = "red") +
  theme_bw() +
  labs(x = "Years", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle(" Attacks by Year since 1900")

Fatal Attacks

Let’s look at the number of deaths, grouped by the countries with the highest totals.

# Fatal attacks per country, drawn as labelled horizontal bars.
# The n() > 30 test runs on the whole country group, so it selects countries
# with more than 30 recorded attacks overall (fatal or not); only the fatal
# rows of those countries are then counted.
df %>%
  filter(!is.na(Country)) %>%
  group_by(Country) %>%
  filter(n() > 30, isFatal == 1) %>%
  count() %>%
  ggplot(aes(x = reorder(Country, n), y = n, fill = Country)) +
  geom_col() +
  coord_flip() +
  theme_bw() +
  geom_text(aes(label = n), color = "white", hjust = 1.2, size = 3.5) +
  labs(x = "", y = "Number of Deaths") +
  theme(legend.position = "none") +
  ggtitle("Countries with the highest number of deaths \n by Shark Attack until 2019") +
  scale_y_continuous(breaks = seq(0, 300, by = 20))

### Deaths by Year since 1800

# Number of fatal attacks per year, for years after 1800.
# `Year > 1800` already excludes Year == 0, so no separate `Year != 0` test
# is needed.
df_fatal_year <- df %>%
  filter(Year > 1800, isFatal == 1) %>%
  group_by(Year) %>%
  summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)

# Vertical bar chart of deaths per year.
# Bars are coloured by Year from a random sample of named R colours, so the
# palette differs between runs unless a seed is set beforehand.
plt <- plot_ly(
  df_fatal_year,
  x = ~Year, y = ~count,
  color = ~Year, colors = sample(colours(), 180),
  type = "bar", orientation = "v",
  hoverinfo = "text",
  text = ~paste("Year: ", Year, "<br>count: ", count)
) %>%
  layout(
    showlegend = FALSE,
    # Each layout attribute is given exactly once; the axis titles match the
    # x = Year / y = count mapping above.
    title = "Deaths by Year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "count"),
    # Small grey caption anchored to the bottom-right corner of the plot area.
    annotations = list(
      x = 1, y = -0.1, text = "Death Since 1800",
      showarrow = FALSE, xref = "paper", yref = "paper",
      xanchor = "right", yanchor = "auto", xshift = 0, yshift = 0,
      font = list(size = 10, color = "grey")
    )
  ) %>%
  highlight("plotly_click")

# The colour scale only encodes Year again, so its colour bar is hidden.
hide_colorbar(plt)

Type of attack

# NOTE(review): this overwrites df itself, so every later statement that uses
# df only sees the rows kept here (filter() keeps rows where lat != 0 is TRUE).
df <- df %>% filter(lat != 0)

# Clustered markers, one per remaining record, each with an HTML popup
# showing date, place, case number, coordinates and species.
leaflet(data = df) %>%
  addTiles() %>%
  addMarkers(
    lng = ~long,
    lat = ~lat,
    clusterOptions = markerClusterOptions(),
    popup = ~paste(
      "<strong>Date: </strong>", Month, "/", Day, "/", Year,
      "<br><br><strong>Place: </strong>", Location, "-", Country,
      "<br><strong>Case NUmber: </strong>", Case.Number,
      "<br><strong>Latitude: </strong>", lat,
      "<br><strong>Longitude: </strong>", long,
      "<br><strong>Specie: </strong>", Species
    )
  )

Species

# Fixed seed so the word cloud layout is reproducible.
set.seed(1234)
# Frequency of each recorded Species value; rows with a missing Species are
# dropped with an explicit is.na() test.
cd2 <- df %>%
  filter(!is.na(Species)) %>%
  count(Species)
# Word cloud of the species strings that occur at least 9 times.
wordcloud(
  words = cd2$Species, freq = cd2$n, min.freq = 9,
  colors = brewer.pal(8, "Dark2")
)

# Split the free-text Species column into one lower-level token per row
# (column `word`), remembering the source row in `linenumber`, and drop
# tokens that appear in stop_words.
species_txt <- df %>%
  filter(!is.na(Species)) %>%
  mutate(linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, Species) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Lower-cased common names from the `species` table, used as the lookup
# vocabulary for species tokens. Only the names are kept: concatenating the
# whole table with c() would yield a list mixing entire columns with names.
vec_species <- tolower(species$Common_name)

# Flag which entries of `field` are known species names.
# `field` is lower-cased and looked up in the global `vec_species`; the result
# is a logical vector with one element per entry of `field`.
is_specie <- function(field) {
  match(tolower(field), vec_species, nomatch = 0L) > 0L
}

# Keep only the tokens that is_specie() recognises as species names.
WordList <- species_txt %>%
  mutate(ident_specie = is_specie(word)) %>%
  filter(ident_specie) %>%
  select(word)

# Horizontal bar chart of how often each recognised species token occurs.
WordList %>%
  count(word, sort = TRUE) %>%
  # filter(n > 5) %>%  # optional: keep only the more frequent tokens
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

We can observe that Bombing/Explosion was always the most frequent type of attack, followed by Armed Assault. In recent years the number of such events has increased dramatically, and they now account for the majority of events.

### Gender

# Share of each Sex value among records with a known isFatal. Despite its
# name, perc_fatal is n / sum(n) * 100 over the filtered rows, i.e. the
# percentage of records per Sex, not a fatality rate.
df %>%  filter(!is.na(isFatal)) %>% count(Sex) %>% mutate(perc_fatal = n / sum(n) * 100)
##    Sex    n  perc_fatal
## 1    F  405 11.64797239
## 2  lli    1  0.02876043
## 3    M 2827 81.30572332
## 4    N    1  0.02876043
## 5 <NA>  243  6.98878343
# Same breakdown by Sex, restricted to fatal records.
df %>% filter (isFatal == TRUE) %>% count(Sex) %>% mutate(perc_fatal = n / sum(n) * 100)
##    Sex   n perc_fatal
## 1    F  41   6.972789
## 2    M 520  88.435374
## 3 <NA>  27   4.591837
# Same breakdown by Sex, restricted to non-fatal records.
df %>% filter (isFatal != TRUE) %>% count(Sex) %>% mutate(perc_fatal = n / sum(n) * 100)
##    Sex    n  perc_fatal
## 1    F  364 12.59951540
## 2  lli    1  0.03461405
## 3    M 2307 79.85462098
## 4    N    1  0.03461405
## 5 <NA>  216  7.47663551
# Overall tally of isFatal values, including missing ones.
df %>% count(isFatal)
##   isFatal    n
## 1       0 2889
## 2       1  588
## 3      NA  368
# Number of records whose Sex is / is not "F" (NA when Sex is missing).
df %>% count(Sex=="F")
##   Sex == "F"    n
## 1      FALSE 3091
## 2       TRUE  450
## 3         NA  304
# Number of records whose Sex is / is not "M" (NA when Sex is missing).
df %>% count(Sex=="M")
##   Sex == "M"    n
## 1      FALSE  453
## 2       TRUE 3088
## 3         NA  304
# isFatal tally for male victims, missing isFatal included.
df %>% filter(Sex=="M") %>% count(isFatal)
##   isFatal    n
## 1       0 2307
## 2       1  520
## 3      NA  261
# isFatal tally for female victims, missing isFatal included.
df %>% filter(Sex=="F") %>% count(isFatal)
##   isFatal   n
## 1       0 364
## 2       1  41
## 3      NA  45
# isFatal tally for male victims, records with a known isFatal only.
df %>% filter(Sex=="M" & !is.na(isFatal)) %>% count(isFatal)
##   isFatal    n
## 1       0 2307
## 2       1  520
# isFatal tally for female victims, records with a known isFatal only.
df %>% filter(Sex=="F" & !is.na(isFatal)) %>% count(isFatal)
##   isFatal   n
## 1       0 364
## 2       1  41

### Activity

# Fixed seed so the word cloud layout is reproducible.
set.seed(1234)
# Frequency of each recorded Activity, most frequent first; rows with a
# missing Activity are dropped with an explicit is.na() test.
cd2 <- df %>%
  filter(!is.na(Activity)) %>%
  count(Activity, sort = TRUE)
# Word cloud of the activities that occur at least 5 times.
wordcloud(
  words = cd2$Activity, freq = cd2$n, min.freq = 5,
  colors = brewer.pal(8, "Dark2")
)

# Labelled horizontal bars for the activities with more than 10 attacks.
cd2 %>%
  filter(n > 10) %>%
  mutate(Activity = reorder(Activity, n)) %>%
  ggplot(aes(Activity, n, fill = Activity)) +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = n), color = "white", hjust = 1.1, size = 3) +
  labs(x = "", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("Activities with the highest Shark Attacks") +
  scale_y_continuous(breaks = seq(0, 800, by = 100))

Type

# Number of records for each value of Type, missing values included.
count(df, Type)
##            Type    n
## 1          Boat    4
## 2       Invalid  328
## 3      Provoked  348
## 4  Questionable    7
## 5  Sea Disaster   51
## 6   Unconfirmed    1
## 7    Unprovoked 2886
## 8    Unverified    1
## 9    Watercraft  218
## 10         <NA>    1
  # Labelled horizontal bars with the number of attacks per Type, largest on
  # top. Rows with a missing Type are dropped with an explicit is.na() test,
  # and the title names the variable that is actually plotted.
  df %>%
    filter(!is.na(Type)) %>%
    count(Type, sort = TRUE) %>%
    mutate(Type = reorder(Type, n)) %>%
    ggplot(aes(Type, n, fill = Type)) +
    geom_col() +
    coord_flip() +
    geom_text(aes(label = n), color = "white", hjust = 1.1, size = 3) +
    labs(x = "", y = "Number of Attacks") +
    theme(legend.position = "none") +
    ggtitle("Types with the highest Shark Attacks") +
    scale_y_continuous(breaks = seq(0, 3000, by = 1000))

  # Fatal attacks only: labelled horizontal bars with the number of fatal
  # attacks per Type. Rows with a missing Type are dropped with an explicit
  # is.na() test, and the title says that only fatal attacks are shown.
  df %>%
    filter(!is.na(Type), isFatal == TRUE) %>%
    count(Type, sort = TRUE) %>%
    mutate(Type = reorder(Type, n)) %>%
    ggplot(aes(Type, n, fill = Type)) +
    geom_col() +
    coord_flip() +
    geom_text(aes(label = n), color = "white", hjust = 1.1, size = 3) +
    labs(x = "", y = "Number of Attacks") +
    theme(legend.position = "none") +
    ggtitle("Types with the highest fatal Shark Attacks") +
    scale_y_continuous(breaks = seq(0, 3000, by = 1000))

# Tally of records by whether Country equals "USA" (TRUE / FALSE / NA).
tot_usa <- df %>% count(Country == "USA")
# Number of USA records, used as the denominator of the monthly share.
TOT <- nrow(filter(df, Country == "USA"))
# USA attacks per month, most frequent first; perc is the share of all USA
# records, truncated to a whole percentage by as.integer().
df %>%
  filter(Country == "USA") %>%
  group_by(Month) %>%
  count(sort = TRUE) %>%
  mutate(perc = as.integer(n / TOT * 100))
## # A tibble: 13 x 3
## # Groups:   Month [13]
##    Month     n  perc
##    <int> <int> <int>
##  1     7   310    16
##  2     8   295    15
##  3     9   250    13
##  4     6   208    11
##  5    10   157     8
##  6     5   137     7
##  7     4   135     7
##  8    11    97     5
##  9     3    79     4
## 10    NA    64     3
## 11     2    53     2
## 12    12    45     2
## 13     1    34     1
# Attacks per month recorded for Australia, most frequent month first.
df %>%
  filter(Country == "AUSTRALIA") %>%
  group_by(Month) %>%
  count(sort = TRUE)
## # A tibble: 13 x 2
## # Groups:   Month [13]
##    Month     n
##    <int> <int>
##  1     1   145
##  2    12   112
##  3     2   108
##  4    11    88
##  5     3    80
##  6     4    78
##  7    10    63
##  8     6    45
##  9     5    39
## 10     9    39
## 11     7    32
## 12     8    32
## 13    NA    32
# Attacks per month recorded for South Africa, most frequent month first.
df %>%
  filter(Country == "SOUTH AFRICA") %>%
  group_by(Month) %>%
  count(sort = TRUE)
## # A tibble: 13 x 2
## # Groups:   Month [13]
##    Month     n
##    <int> <int>
##  1     1    81
##  2    12    53
##  3     3    47
##  4     2    42
##  5     4    36
##  6    11    30
##  7     7    27
##  8     5    26
##  9    NA    25
## 10    10    24
## 11     6    19
## 12     8    18
## 13     9    13
# Attacks per month among records with a known isFatal, most frequent first.
df %>%
  filter(!is.na(isFatal)) %>%
  group_by(Month) %>%
  count(sort = TRUE)
## # A tibble: 13 x 2
## # Groups:   Month [13]
##    Month     n
##    <int> <int>
##  1     7   410
##  2     8   361
##  3     9   336
##  4     6   296
##  5     1   273
##  6     4   268
##  7    10   257
##  8    12   230
##  9    11   229
## 10     5   221
## 11     3   220
## 12     2   217
## 13    NA   159
# Number of records whose isFatal value is known.
count(filter(df, !is.na(isFatal)))
##      n
## 1 3477
# Number of fatal records. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
df %>% filter(isFatal == TRUE) %>% count()
##     n
## 1 588
# Number of non-fatal records. FALSE is spelled out because F is an ordinary
# variable that can be reassigned.
df %>% filter(isFatal == FALSE) %>% count()
##      n
## 1 2889