This dataset contains a great deal of information about terrorism. Its records include columns for the date, time, location, number of hostages, number killed and wounded, whether there was a ransom, the outcome, whether it was a suicide attack, claims, and weapons used.
There is a significant amount of missing information. For example, some longitude values and some latitude values are missing. A type of claim is recorded for only some of the events, and only some ransom notes are recorded.
Shark, Wikimedia Commons, public domain
## # A tibble: 26 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Time 3403 52.4
## 2 Species 2923 45.0
## 3 Age 2872 44.3
## 4 Day 875 13.5
## 5 isFatal 619 9.54
## 6 Sex 567 8.74
## 7 Activity 554 8.54
## 8 Location 549 8.46
## 9 Month 506 7.80
## 10 Area 465 7.17
## # ... with 16 more rows
# Pictorial bar chart comparing the number of non-fatal and fatal attacks,
# with one shark picture per bar used as the bar symbol.

# Local shark picture, read as PNG and converted to a URI for use as a symbol.
qomo <- "./images/shark.png"
sk <- png::readPNG(qomo)
ski <- raster2uri(as.raster(sk))
# Remote shark picture, referenced directly by URL.
kili <- "https://cdn.pixabay.com/photo/2017/05/16/10/10/shark-2317422__340.png"

# Attack counts per isFatal value; records with an unknown outcome are dropped.
plt <- df %>%
  filter(!is.na(isFatal)) %>%
  group_by(isFatal) %>%
  summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)

# One row per bar. The first two rows of `plt` supply the values; this assumes
# isFatal is coded so that the non-fatal group sorts first (0 before 1) —
# TODO confirm against the data.
data <- data.frame(
  x = c("non Fatal", "Fatal"),
  value = plt$count[1:2],
  symbol = paste0("image://", c(ski, kili))
)

data %>%
  e_charts(x) %>%
  e_pictorial(value, symbol) %>%
  e_labels(fontSize = 10) %>%
  e_legend(FALSE)

# Let’s now look at both countries and years at the same time. We also add,
# besides the number of killed people, the number of wounded (color is
# proportional with the number of wounded).
# NOTE(review): in the treemap call that follows, `vColor` is commented out,
# so the statement about colour may be out of date — confirm.
# Treemap of countries subdivided by year; rectangle size is driven by the
# `isFatal` column.
# NOTE(review): the title and legend speak of "Attacks" while vSize is the
# fatality flag — confirm which measure is intended.
treemap(
  df,
  index = c("Country", "Year"),
  type = "value",
  vSize = "isFatal",
  # vColor = "nwound",
  palette = "RdBu",
  title = "Victims in Shark Attacks (Countries/Years) - size is proportional with the number of Attacks",
  title.legend = "Number of Attacks",
  fontsize.title = 13
)

# Horizontal bar chart of the years that have more than 100 recorded attacks.
df %>%
  group_by(Year) %>%
  filter(n() > 100 & !is.na(Year)) %>%
  count() %>%
  ggplot() +
  geom_col(aes(x = reorder(Year, n), y = n, fill = Year)) +
  coord_flip() +
  theme_bw() +
  labs(x = "Years", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("Years with the highest number of Attacks \n by Shark Attack until 2019") +
  scale_y_continuous(breaks = seq(0, 300, 20))

# Line chart of the number of attacks per year over the whole data set.
df %>%
  group_by(Year) %>%
  filter(!is.na(Year)) %>%
  count() %>%
  ggplot() +
  geom_line(aes(x = Year, y = n), color = "red") +
  theme_bw() +
  labs(x = "Years", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("All time Attacks by Year")

# Same line chart, restricted to years after 1900.
df %>%
  group_by(Year) %>%
  filter(!is.na(Year) & Year > 1900) %>%
  count() %>%
  ggplot() +
  geom_line(aes(x = Year, y = n), color = "red") +
  theme_bw() +
  labs(x = "Years", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle(" Attacks by Year since 1900")

# Let’s look at the deaths of people, grouped by the countries with the
# highest counts.
# Horizontal bar chart of fatal attacks per country. A country is kept only
# when its group holds more than 30 records; n() is evaluated on the whole
# Country group, not on the fatal rows alone.
df %>%
  group_by(Country) %>%
  filter(isFatal == 1, n() > 30, !is.na(Country)) %>%
  count() %>%
  ggplot(mapping = aes(x = reorder(Country, n), y = n, fill = Country)) +
  geom_col() +
  geom_text(mapping = aes(label = n), colour = "white", hjust = 1.2, size = 3.5) +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 300, 20)) +
  ggtitle("Countries with the highest number of deaths \n by Shark Attack until 2019") +
  labs(x = "", y = "Number of Deaths") +
  theme_bw() +
  theme(legend.position = "none")
### Deaths by Year since 1800
# Number of fatal attacks per year, for years after 1800.
df_fatal_year <- df %>%
  filter(Year != 0, Year > 1800) %>%
  group_by(Year) %>%
  filter(isFatal == 1) %>%
  summarise(count = n())
## `summarise()` ungrouping output (override with `.groups` argument)

# Interactive bar chart of yearly deaths. Bars take their colours from 180
# randomly sampled base colour names; no seed is set in this chunk, so the
# palette can differ between runs.
# layout() previously also received a second title/xaxis/yaxis set
# ("Attacks by Country", "Country", "count") that contradicted this chart;
# only the labels matching the plotted data are kept.
plt <- plot_ly(
  df_fatal_year,
  x = ~Year, y = ~count,
  color = ~Year, colors = sample(colours(), 180),
  type = "bar", orientation = "v",
  hoverinfo = "text",
  text = ~paste("Year: ", Year, "<br>count: ", count)
) %>%
  layout(
    showlegend = FALSE,
    title = "Deaths by Year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "count"),
    annotations = list(
      x = 1, y = -0.1, text = "Death Since 1800",
      showarrow = FALSE, xref = "paper", yref = "paper",
      xanchor = "right", yanchor = "auto", xshift = 0, yshift = 0,
      font = list(size = 10, color = "grey")
    )
  ) %>%
  highlight("plotly_click")
hide_colorbar(plt)

# Keep only records with a usable latitude: filter() drops rows where the
# condition is FALSE or NA. This overwrites `df`, so everything that follows
# works on this subset only.
df <- filter(df, lat != 0)
# Interactive map with one clustered marker per record. Each popup shows the
# date, place, case number, coordinates and species of the record.
leaflet(data = df) %>%
  addTiles() %>%
  addMarkers(
    lat = df$lat, lng = df$long,
    clusterOptions = markerClusterOptions(),
    popup = paste(
      "<strong>Date: </strong>", df$Month, "/", df$Day, "/", df$Year,
      "<br><br><strong>Place: </strong>", df$Location, "-", df$Country,
      "<br><strong>Case NUmber: </strong>", df$Case.Number,
      "<br><strong>Latitude: </strong>", df$lat,
      "<br><strong>Longitude: </strong>", df$long,
      "<br><strong>Specie: </strong>", df$Species
    )
  )

# Fix the random seed before the word cloud that follows.
set.seed(1234)
# Frequency of each free-text Species entry, ignoring missing values.
# The filter uses !is.na(); the earlier `Species != is.na(Species)` compared
# strings with a logical and only dropped NAs by accident.
cd2 <- df %>%
  filter(!is.na(Species)) %>%
  count(Species)
wordcloud(
  words = cd2$Species, freq = cd2$n, min.freq = 9,
  colors = brewer.pal(8, "Dark2")
)

# Split the Species text into one lower-level token per row and remove stop
# words. `by = "word"` makes explicit the join key that was previously
# inferred (and reported with a "Joining, by" message).
species_txt <- df %>%
  filter(!is.na(Species)) %>%
  mutate(linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, Species) %>%
  anti_join(stop_words, by = "word")
# Lower-cased vocabulary of shark common names, used to recognise species
# tokens. Piping `species` into c() used to prepend the whole table, which
# produced a list rather than a character vector.
# NOTE(review): `species` is defined outside this excerpt; assumed to be a
# table with a `Common_name` column — confirm.
vec_species <- tolower(species$Common_name)
# Element-wise membership test: for each element of `field`, TRUE when its
# lower-cased form is found in the global `vec_species`, FALSE otherwise.
is_specie <- function(field) {
  is.element(tolower(field), vec_species)
}
# Keep only the tokens recognised by is_specie(), as a one-column table.
WordList <- species_txt %>%
  filter(is_specie(word)) %>%
  select(word)
# Horizontal bar chart of how often each recognised species token occurs,
# ordered by frequency.
WordList %>%
  count(word, sort = TRUE) %>%
  # filter(n > 5) %>%  # optional: keep only tokens seen more than 5 times
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

# We can observe that Bombing/Explosion was always the most frequent type of
# attack, followed by Armed Assault. In recent years, the number of such
# events increased dramatically, accounting for the majority of the events in
# recent years.
###Gender
## Sex n perc_fatal
## 1 F 405 11.64797239
## 2 lli 1 0.02876043
## 3 M 2827 81.30572332
## 4 N 1 0.02876043
## 5 <NA> 243 6.98878343
## Sex n perc_fatal
## 1 F 41 6.972789
## 2 M 520 88.435374
## 3 <NA> 27 4.591837
## Sex n perc_fatal
## 1 F 364 12.59951540
## 2 lli 1 0.03461405
## 3 M 2307 79.85462098
## 4 N 1 0.03461405
## 5 <NA> 216 7.47663551
## isFatal n
## 1 0 2889
## 2 1 588
## 3 NA 368
## Sex == "F" n
## 1 FALSE 3091
## 2 TRUE 450
## 3 NA 304
## Sex == "M" n
## 1 FALSE 453
## 2 TRUE 3088
## 3 NA 304
## isFatal n
## 1 0 2307
## 2 1 520
## 3 NA 261
## isFatal n
## 1 0 364
## 2 1 41
## 3 NA 45
## isFatal n
## 1 0 2307
## 2 1 520
## isFatal n
## 1 0 364
## 2 1 41
###Activity
# Fix the random seed before drawing the word cloud.
set.seed(1234)

# Frequency of each Activity value, most frequent first, ignoring missing
# values. The filter uses !is.na(); the earlier `Activity != is.na(Activity)`
# compared strings with a logical and only dropped NAs by accident.
cd2 <- df %>%
  filter(!is.na(Activity)) %>%
  count(Activity, sort = TRUE)
wordcloud(
  words = cd2$Activity, freq = cd2$n, min.freq = 5,
  colors = brewer.pal(8, "Dark2")
)

# Horizontal bar chart of the activities recorded more than 10 times.
cd2 %>%
  filter(n > 10) %>%
  mutate(Activity = reorder(Activity, n)) %>%
  ggplot(aes(Activity, n, fill = Activity)) +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = n), color = "white", hjust = 1.1, size = 3) +
  labs(x = "", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("Activities with the highest Shark Attacks") +
  scale_y_continuous(breaks = seq(0, 800, 100))
## Type n
## 1 Boat 4
## 2 Invalid 328
## 3 Provoked 348
## 4 Questionable 7
## 5 Sea Disaster 51
## 6 Unconfirmed 1
## 7 Unprovoked 2886
## 8 Unverified 1
## 9 Watercraft 218
## 10 <NA> 1
# Horizontal bar chart of the number of records per attack Type, ignoring
# missing values. The filter uses !is.na(); the earlier `Type != is.na(Type)`
# compared strings with a logical and only dropped NAs by accident.
# NOTE(review): the title still says "Activities" although the bars are attack
# types — it looks copied from the activity chart; left unchanged pending
# confirmation.
df %>%
  filter(!is.na(Type)) %>%
  count(Type, sort = TRUE) %>%
  mutate(Type = reorder(Type, n)) %>%
  ggplot(aes(Type, n, fill = Type)) +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = n), color = "white", hjust = 1.1, size = 3) +
  labs(x = "", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("Activities with the highest Shark Attacks") +
  scale_y_continuous(breaks = seq(0, 3000, 1000))
# Fatal
# Horizontal bar chart of the number of fatal records per attack Type.
# Missing types are removed with !is.na() (the earlier `Type != is.na(Type)`
# only worked by accident), and the fatality flag is compared with 1, as in
# the other fatal-only filters of this analysis.
# NOTE(review): the title still says "Activities" although the bars are attack
# types — it looks copied from the activity chart; left unchanged pending
# confirmation.
df %>%
  filter(!is.na(Type), isFatal == 1) %>%
  count(Type, sort = TRUE) %>%
  mutate(Type = reorder(Type, n)) %>%
  ggplot(aes(Type, n, fill = Type)) +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = n), color = "white", hjust = 1.1, size = 3) +
  labs(x = "", y = "Number of Attacks") +
  theme(legend.position = "none") +
  ggtitle("Activities with the highest Shark Attacks") +
  scale_y_continuous(breaks = seq(0, 3000, 1000))

# Record counts split by whether the country is the USA (FALSE / TRUE / NA).
tot_usa <- count(df, Country == "USA")
# Total number of records whose country is the USA.
TOT <- nrow(filter(df, Country == "USA"))

# USA records per month, most frequent month first, with each month's share
# of the USA total as a percentage truncated to an integer.
df %>%
  filter(Country == "USA") %>%
  group_by(Month) %>%
  count(sort = TRUE) %>%
  mutate(perc = as.integer(n / TOT * 100))
## # A tibble: 13 x 3
## # Groups: Month [13]
## Month n perc
## <int> <int> <int>
## 1 7 310 16
## 2 8 295 15
## 3 9 250 13
## 4 6 208 11
## 5 10 157 8
## 6 5 137 7
## 7 4 135 7
## 8 11 97 5
## 9 3 79 4
## 10 NA 64 3
## 11 2 53 2
## 12 12 45 2
## 13 1 34 1
## # A tibble: 13 x 2
## # Groups: Month [13]
## Month n
## <int> <int>
## 1 1 145
## 2 12 112
## 3 2 108
## 4 11 88
## 5 3 80
## 6 4 78
## 7 10 63
## 8 6 45
## 9 5 39
## 10 9 39
## 11 7 32
## 12 8 32
## 13 NA 32
## # A tibble: 13 x 2
## # Groups: Month [13]
## Month n
## <int> <int>
## 1 1 81
## 2 12 53
## 3 3 47
## 4 2 42
## 5 4 36
## 6 11 30
## 7 7 27
## 8 5 26
## 9 NA 25
## 10 10 24
## 11 6 19
## 12 8 18
## 13 9 13
## # A tibble: 13 x 2
## # Groups: Month [13]
## Month n
## <int> <int>
## 1 7 410
## 2 8 361
## 3 9 336
## 4 6 296
## 5 1 273
## 6 4 268
## 7 10 257
## 8 12 230
## 9 11 229
## 10 5 221
## 11 3 220
## 12 2 217
## 13 NA 159
## n
## 1 3477
## n
## 1 588
## n
## 1 2889