##Global Terrorism Database Analysis by Dawn Daras MS
## This is an exploratory data analysis (EDA) of the Global Terrorism Database (GTD), which is publicly available and maintained by the University of Maryland: GTD Univ of MD
## Here is the codebook for the dataset: GTD Codebook
## Downloadable shapefile for the world map: World shapefile
library(dplyr) # for data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr) # for data manipulation
library(ggplot2) # for data visualization
library(corrplot) # for correlations
## corrplot 0.92 loaded
library(Rtsne) # for tsne plotting
library(geomtextpath)
library(hrbrthemes)
library(ggplot2)
library(repr)
library(RColorBrewer)
library(e1071)
library(C50)
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(nnet)
library(tidyr)
library(lares)
##
## Attaching package: 'lares'
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:hrbrthemes':
##
## scale_x_comma, scale_x_percent, scale_y_comma, scale_y_percent
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:lares':
##
## impute
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
## The following object is masked from 'package:DMwR':
##
## join
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(leaflet)
library(sf)
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1; sf_use_s2() is TRUE
library(DT)
library(descr)
##
## Attaching package: 'descr'
## The following object is masked from 'package:lares':
##
## crosstab
library(gt)
##
## Attaching package: 'gt'
## The following object is masked from 'package:Hmisc':
##
## html
library(htmltools)
library(plotly)
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(WDI)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::arrange() masks plyr::arrange(), dplyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ plyr::count() masks dplyr::count()
## ✖ plyr::desc() masks dplyr::desc()
## ✖ plyr::failwith() masks dplyr::failwith()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ plyr::id() masks dplyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ plotly::mutate() masks plyr::mutate(), dplyr::mutate()
## ✖ plotly::rename() masks plyr::rename(), dplyr::rename()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ plotly::summarise() masks plyr::summarise(), dplyr::summarise()
## ✖ plyr::summarize() masks Hmisc::summarize(), dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vembedr)
##
## Attaching package: 'vembedr'
##
## The following object is masked from 'package:lubridate':
##
## hms
# Load the raw GTD export; text columns stay as character vectors.
df <- read.csv(
  file = "/cloud/project/gtd.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
#EDAs - Exploratory Data Analysis
##Explanation of Selected Variables:
## success - Success of a terrorist strike
## suicide - 1 = “Yes” The incident was a suicide attack. 0 = “No” There is no indication that the incident was a suicide attack
## attacktype1 - The general method of attack
## attacktype1_txt - The general method of attack and broad class of tactics used
## targtype1_txt - The general type of target/victim
## targsubtype1_txt - The more specific target category
## target1 - The specific person, building, or installation that was targeted and/or victimized
## natlty1_txt - The nationality of the target that was attacked
## gname - The name of the group that carried out the attack
## gsubname - Additional details about the group that carried out the attack, such as factions
## nperps - The total number of terrorists participating in the incident
## weaptype1_txt - General type of weapon used in the incident
## weapsubtype1_txt - More specific value for most of the weapon types
## nkill - The number of total confirmed fatalities for the incident
## nkillus - The number of U.S. citizens who died as a result of the incident
##Selecting only the variables that you wish to keep
# Keep only the columns used in the rest of the analysis.
df2 <- select(
  df,
  eventid, iyear, success, imonth, iday,
  country_txt, region_txt, suicide,
  latitude, longitude,
  attacktype1_txt, targtype1_txt, target1, nkill
)
# Peek at the first row to confirm the selection.
head(df2, n = 1)
## eventid iyear success imonth iday country_txt
## 1 1.97e+11 1970 1 7 2 Dominican Republic
## region_txt suicide latitude longitude attacktype1_txt
## 1 Central America & Caribbean 0 18.45679 -69.95116 Assassination
## targtype1_txt target1 nkill
## 1 Private Citizens & Property Julio Guzman 1
# Store the event identifier as a categorical label instead of a number.
df2[["eventid"]] <- as.factor(df2[["eventid"]])
class(df2[["eventid"]])
## [1] "factor"
# Show the structure (column types and first values) of the trimmed data.
str(df2)
## 'data.frame': 181691 obs. of 14 variables:
## $ eventid : Factor w/ 181691 levels "197000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ iyear : int 1970 1970 1970 1970 1970 1970 1970 1970 1970 1970 ...
## $ success : int 1 1 1 1 1 1 0 1 1 1 ...
## $ imonth : int 7 0 1 1 1 1 1 1 1 1 ...
## $ iday : int 2 0 0 0 0 1 2 2 2 3 ...
## $ country_txt : chr "Dominican Republic" "Mexico" "Philippines" "Greece" ...
## $ region_txt : chr "Central America & Caribbean" "North America" "Southeast Asia" "Western Europe" ...
## $ suicide : int 0 0 0 0 0 0 0 0 0 0 ...
## $ latitude : num 18.5 19.4 15.5 38 33.6 ...
## $ longitude : num -70 -99.1 120.6 23.8 130.4 ...
## $ attacktype1_txt: chr "Assassination" "Hostage Taking (Kidnapping)" "Assassination" "Bombing/Explosion" ...
## $ targtype1_txt : chr "Private Citizens & Property" "Government (Diplomatic)" "Journalists & Media" "Government (Diplomatic)" ...
## $ target1 : chr "Julio Guzman" "Nadine Chaval, daughter" "Employee" "U.S. Embassy" ...
## $ nkill : int 1 0 1 NA NA 0 0 0 0 0 ...
# Per-column summary statistics for the selected variables.
# NOTE(review): in the printed output below, the longitude minimum lies far
# outside the valid -180..180 range, and latitude, longitude and nkill all
# contain NA's — these rows likely need cleaning before mapping; confirm.
summary(df2)
## eventid iyear success imonth
## 197000000001: 1 Min. :1970 Min. :0.0000 Min. : 0.000
## 197000000002: 1 1st Qu.:1991 1st Qu.:1.0000 1st Qu.: 4.000
## 197001000001: 1 Median :2009 Median :1.0000 Median : 6.000
## 197001000002: 1 Mean :2003 Mean :0.8896 Mean : 6.467
## 197001000003: 1 3rd Qu.:2014 3rd Qu.:1.0000 3rd Qu.: 9.000
## 197001010002: 1 Max. :2017 Max. :1.0000 Max. :12.000
## (Other) :181685
## iday country_txt region_txt suicide
## Min. : 0.00 Length:181691 Length:181691 Min. :0.00000
## 1st Qu.: 8.00 Class :character Class :character 1st Qu.:0.00000
## Median :15.00 Mode :character Mode :character Median :0.00000
## Mean :15.51 Mean :0.03651
## 3rd Qu.:23.00 3rd Qu.:0.00000
## Max. :31.00 Max. :1.00000
##
## latitude longitude attacktype1_txt targtype1_txt
## Min. :-53.16 Min. :-86185896 Length:181691 Length:181691
## 1st Qu.: 11.51 1st Qu.: 5 Class :character Class :character
## Median : 31.47 Median : 43 Mode :character Mode :character
## Mean : 23.50 Mean : -459
## 3rd Qu.: 34.69 3rd Qu.: 69
## Max. : 74.63 Max. : 179
## NA's :4556 NA's :4557
## target1 nkill
## Length:181691 Min. : 0.000
## Class :character 1st Qu.: 0.000
## Mode :character Median : 0.000
## Mean : 2.403
## 3rd Qu.: 2.000
## Max. :1570.000
## NA's :10313
##Let’s look at number of terror attacks by year
##Then we’ll look at attacks by region X year
# Count incidents in each world region and print the resulting table.
region <- tally(group_by(df2, region_txt))
print(region)
## # A tibble: 12 × 2
## region_txt n
## <chr> <int>
## 1 Australasia & Oceania 282
## 2 Central America & Caribbean 10344
## 3 Central Asia 563
## 4 East Asia 802
## 5 Eastern Europe 5144
## 6 Middle East & North Africa 50474
## 7 North America 3456
## 8 South America 18978
## 9 South Asia 44974
## 10 Southeast Asia 12485
## 11 Sub-Saharan Africa 17550
## 12 Western Europe 16639
# Bucket each incident's year into a decade label stored in df2$year.
# iyear is an integer column, so it is matched against integer ranges instead
# of hand-typed character vectors (the previous lists relied on implicit
# integer -> character coercion and repeated "1982" and "1992").
# Years outside 1970-2020 match no branch and become NA.
# NOTE(review): 2020 is deliberately kept in the "2010's" bucket to preserve
# the existing labels; the data summarised in this file ends in 2017.
df2$year <- case_when(
  df2$iyear %in% 1970:1979 ~ "1970's",
  df2$iyear %in% 1980:1989 ~ "1980's",
  df2$iyear %in% 1990:1999 ~ "1990's",
  df2$iyear %in% 2000:2009 ~ "2000's",
  df2$iyear %in% 2010:2020 ~ "2010's"
)
# Count incidents for every (success flag, year) combination.
agg_df <- aggregate(
  df2$iyear,
  by = list(df2$success, df2$iyear),
  FUN = length
)
# Keep only the rows for successful attacks (success flag equal to 1).
agg_df <- subset(agg_df, Group.1 == 1)
head(agg_df)
## Group.1 Group.2 x
## 2 1 1970 549
## 4 1 1971 420
## 6 1 1972 452
## 8 1 1973 433
## 10 1 1974 545
## 12 1 1975 705
# Give the three aggregate columns descriptive names in one assignment.
names(agg_df)[1:3] <- c("Group", "Year", "Attacks")
head(agg_df, n = 5)
## Group Year Attacks
## 2 1 1970 549
## 4 1 1971 420
## 6 1 1972 452
## 8 1 1973 433
## 10 1 1974 545
## Building a trendline of successful attacks per year, from 1970 to 2017 (the last year in the data)
# Line chart of attacks per year (agg_df holds one row per year).
# The colour is a fixed constant, so it is set as a geom parameter. Wrapping
# 'red' in aes() mapped it as a one-level data variable, which drew the line
# in the default discrete palette colour and added a meaningless "red" legend.
agg_df %>%
  ggplot(aes(x = Year, y = Attacks, group = 1)) +
  geom_line(colour = "red")
## The trendline shows a sharp overall increase in global terrorist attacks after 2000, with an especially steep rise after 2010. Let’s drill in and see where these attacks are occurring.
# Count incidents for every region / decade pair.
attacks <- tally(group_by(df2, region_txt, year))
head(attacks, n = 5)
## # A tibble: 5 × 3
## # Groups: region_txt [1]
## region_txt year n
## <chr> <chr> <int>
## 1 Australasia & Oceania 1970's 16
## 2 Australasia & Oceania 1980's 78
## 3 Australasia & Oceania 1990's 113
## 4 Australasia & Oceania 2000's 28
## 5 Australasia & Oceania 2010's 47
# Rename the three columns for presentation.
colnames(attacks)[1:3] <- c("Region", "Decade", "Attacks")
head(attacks, n = 5)
## # A tibble: 5 × 3
## # Groups: Region [1]
## Region Decade Attacks
## <chr> <chr> <int>
## 1 Australasia & Oceania 1970's 16
## 2 Australasia & Oceania 1980's 78
## 3 Australasia & Oceania 1990's 113
## 4 Australasia & Oceania 2000's 28
## 5 Australasia & Oceania 2010's 47
##Building an interactive data table with exportable buttons for attacks by decades and regions
# Interactive table of the attacks data; the Buttons extension adds
# copy / CSV / Excel / print / PDF export controls.
datatable(
  data = attacks,
  extensions = "Buttons",
  options = list(
    dom = "Bfrtip",
    buttons = c("copy", "csv", "excel", "print", "pdf")
  )
)
##Building a map
## We are going to use the world map data returned by ggplot2's map_data()
# Polygon outline data for the world map.
mapdata <- map_data("world")
# Sum the 0/1 success flag per country, i.e. count successful attacks.
# .groups = "drop" returns an ungrouped result.
df3 <- summarise(
  group_by(df2, country_txt),
  sum_attacks = sum(success),
  .groups = "drop"
)
#df3 <- df3[ -c(1) ]
head(df3, n = 5)
## # A tibble: 5 × 2
## country_txt sum_attacks
## <chr> <int>
## 1 Afghanistan 11141
## 2 Albania 64
## 3 Algeria 2561
## 4 Andorra 1
## 5 Angola 486
# Rename the country column to "region", the key used when joining to mapdata.
names(df3)[1] <- "region"
head(df3, n = 1)
## # A tibble: 1 × 2
## region sum_attacks
## <chr> <int>
## 1 Afghanistan 11141
# Attach the world-map polygon rows to each country's attack count, matching
# on the shared "region" column. left_join keeps every row of df3.
# NOTE(review): the join is on raw country-name strings; any country spelled
# differently in the two sources would get NA polygon columns — confirm the
# names line up (e.g. with anti_join) before trusting the map.
mapjoin <- df3 %>%
  left_join(mapdata, by = "region")
head(mapjoin, n = 5)
## # A tibble: 5 × 7
## region sum_attacks long lat group order subregion
## <chr> <int> <dbl> <dbl> <dbl> <int> <chr>
## 1 Afghanistan 11141 74.9 37.2 2 12 <NA>
## 2 Afghanistan 11141 74.8 37.2 2 13 <NA>
## 3 Afghanistan 11141 74.8 37.2 2 14 <NA>
## 4 Afghanistan 11141 74.7 37.3 2 15 <NA>
## 5 Afghanistan 11141 74.7 37.3 2 16 <NA>
# Choropleth drawn from the joined polygons, filled by sum_attacks.
# The polygon outline width is set with `linewidth`: using `size` for lines
# was deprecated in ggplot2 3.4.0 and raised a deprecation warning here.
mapjoin %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = sum_attacks), color = "grey", linewidth = 0.2) +
  labs(fill = "Terrorist Attacks per Region") +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Embed the video at this YouTube URL (embed_url comes from vembedr).
embed_url("https://youtu.be/gWK1sG3spiE?si=TDrkogh7SrbYkTMW")