The Global Terrorism Database (GTD) is an unclassified database of terrorist attacks around the world.

In this post I walk through how I applied correspondence analysis and topic modelling to a subset of the GTD to uncover hidden patterns.

Note: the datasets used for these tasks were subsetted with SQLite3 queries in Database Browser, so you should be able to reproduce them.
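
For reference, the same kind of subset can also be built from R with DBI and RSQLite instead of Database Browser. The sketch below is only an illustration: the database file name, table name and the exact binning are assumptions, not the precise queries I ran.

# Hypothetical example: building the region fatality-frequency subset from a
# SQLite copy of the GTD (file name, table name and binning are assumptions)
library(DBI)
library(RSQLite)

con <- dbConnect(SQLite(), "gtd.sqlite")
regFreq <- dbGetQuery(con, "
  SELECT region_txt,
         region,
         CASE WHEN nkill >= 30 THEN '>=30'
              WHEN nkill >= 10 THEN '10-29'
              WHEN nkill >= 3  THEN '3-9'
              ELSE '1-2' END AS FatalityLevel,
         COUNT(*) AS Count
  FROM gtd
  WHERE nkill >= 1
  GROUP BY region_txt, region, FatalityLevel
")
dbDisconnect(con)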

# Loading the necessary libraries
library(dplyr)     # data manipulation
library(reshape2)  # data reshaping (dcast)
library(ca)        # correspondence analysis
library(stringr)   # string manipulation
library(tidyverse) # collection of tidyverse packages (tidyr, purrr, tibble, ...)
library(magrittr)  # provides the %<>% assignment pipe used below
library(tidytext)  # text mining helpers (unnest_tokens, bind_tf_idf, ...)
library(ggplot2)   # data visualization

Simple Correspondence Analysis

First, I wrote a utility function that performs CA on a contingency table and collects the values needed for plotting. CA decomposes the chi-square association between the rows and columns of the table into a few dimensions, so both can be displayed in the same low-dimensional map. The function is as follows.

names <- c("1-2", "3-9", "10-29", ">=30") # vector of fatality levels (column order used below)

# function to perform CA and collect the values needed for plotting
simple_ca <- function(df, formula) {
  
  regCT <- dcast(df, formula) # reshape into a wide contingency table using the supplied formula
  
  regCT[is.na(regCT)] <- 0 # replace missing cells with zero counts
  
  fit <- ca(regCT[, names]) # run CA on the fatality-level columns
  
  tmp <- data.frame(name=regCT$txt, chsqdist=fit$rowdist, fit$rowcoord,
                    inertia=fit$rowinertia) # row coordinates, chi-square distances and inertias
  
  tmp2 <- data.frame(name=c("Very Low", "Low", "High", "Very High"),
                     chsqdist=fit$coldist, fit$colcoord, inertia=fit$colinertia) # column (fatality-level) values, labelled in the same order as `names`
  
  
  tmp3 <- rbind(tmp, tmp2) # stack the row and column results
  
  plot(fit) # draw the default CA biplot
  
  return(tmp3) # return the combined coordinates
}

Now let’s load the data frame of fatality frequencies by region.

regDf <- read.csv("../../data/sub_data/Fatality Freq by Region.csv") # loading the region data
names(regDf) <- c("txt", "Region", "FatalityLevel", "Count") # assigning custom colnames
 
head(regDf) # printing the 1st few rows
##                          txt Region FatalityLevel Count
## 1 Middle East & North Africa     10          >=30 26696
## 2 Middle East & North Africa     10         10-29  2578
## 3 Middle East & North Africa     10           3-9  8513
## 4 Middle East & North Africa     10           1-2 18836
## 5                 South Asia      6          >=30 24778
## 6                 South Asia      6         10-29  2310

Performing the analysis and writing the result to a CSV file:

rfDf <- simple_ca(regDf, Region+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.

head(rfDf) # printing the 1st few rows
##                          name  chsqdist       Dim1       Dim2        Dim3
## 1               North America 0.5901336 -2.4730326  0.9050432  1.20742588
## 2 Central America & Caribbean 0.2520180 -0.7905412  1.8203108 -1.90556258
## 3               South America 0.2525573 -0.9908017  1.0094568  1.29704173
## 4                   East Asia 0.5489721 -2.1769695  2.1315703  0.08235646
## 5              Southeast Asia 0.1876448 -0.3480425 -1.8610862 -0.51412768
## 6                  South Asia 0.1397336  0.5813942 -0.1111881  1.16382265
##       inertia
## 1 0.006441518
## 2 0.003266065
## 3 0.006209175
## 4 0.001243204
## 5 0.002462793
## 6 0.004966294
write.csv(rfDf, "../../data/sub_data/CA.csv") # saving it to a CSV file
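
The plot(fit) biplot is drawn inside simple_ca(), but the fitted ca object itself is not returned, so you cannot check afterwards how much inertia each dimension captures. Below is a minimal sketch of that check for the region table, rebuilding the contingency table the same way the function does.

# Rebuild the region-by-fatality contingency table as simple_ca() does,
# then inspect how much inertia each CA dimension explains.
regCT <- dcast(regDf, Region + txt ~ FatalityLevel, value.var = "Count")
regCT[is.na(regCT)] <- 0
fit <- ca(regCT[, names])

summary(fit)                        # principal inertias (eigenvalues) and their percentages
round(fit$sv^2 / sum(fit$sv^2), 3)  # the same shares computed from the singular values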

Similarly for the other data frames:

# CA on country data
cntryDf <- read.csv("../../data/sub_data/Fatality Freq by Country.csv") # loading the country data
names(cntryDf) <- c("txt", "Country", "FatalityLevel", "Count") # naming the columns

head(cntryDf) # printing the 1st few rows
##           txt Country FatalityLevel Count
## 1        Iraq      95          >=30 10035
## 2        Iraq      95         10-29  1331
## 3        Iraq      95           3-9  4825
## 4        Iraq      95           1-2 10567
## 5 Afghanistan       4          >=30  5027
## 6 Afghanistan       4         10-29  1182
cfDf <- simple_ca(cntryDf, Country+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.

head(cfDf) # printing the 1st few rows
##          name  chsqdist       Dim1        Dim2       Dim3      inertia
## 1 Afghanistan 0.5302638 -1.5352536  0.48278188  1.5341965 0.0236973079
## 2     Algeria 0.5051409 -1.4748128  0.29122546 -1.3498472 0.0036535884
## 3      Angola 0.3881101  0.2339624  2.39910853 -0.2954537 0.0003921405
## 4   Argentina 0.6452989  1.9020060  0.42560238 -0.1516066 0.0017680523
## 5  Bangladesh 0.3776133  1.1185291 -0.07352067  0.1141011 0.0012624282
## 6     Bolivia 0.7434582  2.1563244  0.90637586  0.9785849 0.0009107626
# CA on weapons data 
weapDf <- read.csv("../../data/sub_data/Fatality Freq by Weapon Type1.csv") # loading the weapons data
names(weapDf) <- c("txt", "Weapon", "FatalityLevel", "Count") # naming the columns

head(weapDf) # printing the 1st few rows
##          txt Weapon FatalityLevel Count
## 1 Explosives      6          >=30 65358
## 2 Explosives      6         10-29  3362
## 3 Explosives      6           3-9 11743
## 4 Explosives      6           1-2 20232
## 5   Firearms      5          >=30 21184
## 6   Firearms      5         10-29  3402
wfDf <- simple_ca(weapDf, Weapon+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.

head(wfDf) # printing the 1st few rows
##           name  chsqdist       Dim1       Dim2        Dim3      inertia
## 1   Biological 0.6044652 -1.4672389  4.7793579  1.90321770 6.513916e-05
## 2     Chemical 0.5492654 -1.5131465  0.9121140  3.01371156 5.154430e-04
## 3 Radiological 0.9075066 -2.4809896  2.6745848  1.38777316 5.302002e-05
## 4     Firearms 0.4615347  1.2813150  0.4207661 -0.03954476 6.871318e-02
## 5   Explosives 0.2093462 -0.5800027 -0.2225406 -0.69509908 2.185420e-02
## 6 Fake Weapons 0.8013542 -2.1522056  3.3572139  1.55494436 1.176651e-04
# CA on target data
targDf <- read.csv("../../data/sub_data/Fatality Freq by Target Type1.csv") # loading the target data
names(targDf) <- c("txt", "Target", "FatalityLevel", "Count") # naming the columns

head(targDf) # printing the 1st few rows
##                           txt Target FatalityLevel Count
## 1 Private Citizens & Property     14          >=30 22480
## 2 Private Citizens & Property     14         10-29  2668
## 3 Private Citizens & Property     14           3-9  8172
## 4 Private Citizens & Property     14           1-2 16374
## 5                    Military      4          >=30 13782
## 6                    Military      4         10-29  2444
tfDf <- simple_ca(targDf, Target+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.

head(tfDf) # printing the 1st few rows
##                   name  chsqdist       Dim1        Dim2        Dim3     inertia
## 1             Business 0.3714388 -1.1760741 -0.09079016  0.16603061 0.014945128
## 2 Government (General) 0.2196121 -0.2882984  1.74324648 -1.01206440 0.005554735
## 3               Police 0.2919282  0.8835164  0.56809336  1.80080243 0.011528746
## 4             Military 0.3051099  0.8303005 -1.36652390 -0.69102824 0.014803010
## 5     Abortion Related 0.8508017 -2.6784825 -0.81262648  0.68811837 0.001010887
## 6  Airports & Aircraft 0.5971910 -1.8583039 -0.98825133  0.08868291 0.002507910
# CA on attack data
atckDf <- read.csv("../../data/sub_data/Fatality Freq by Attack Type1.csv") # loading the attack data
names(atckDf) <- c("txt", "Attack", "FatalityLevel", "Count") # naming columns

head(atckDf) # printing 1st few rows
##                 txt Attack FatalityLevel Count
## 1 Bombing/Explosion      3          >=30 62847
## 2 Bombing/Explosion      3         10-29  3073
## 3 Bombing/Explosion      3           3-9 10885
## 4 Bombing/Explosion      3           1-2 18867
## 5     Armed Assault      2          >=30 16607
## 6     Armed Assault      2         10-29  3422
afDf <- simple_ca(atckDf, Attack+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.

head(afDf) # printing the 1st few rows
##                                  name  chsqdist       Dim1        Dim2
## 1                       Assassination 0.8670784 -1.9079964 -2.06851035
## 2                       Armed Assault 0.4130508 -0.9158321  0.95971546
## 3                   Bombing/Explosion 0.2238941  0.5592376 -0.05638744
## 4                           Hijacking 0.5407579  1.3071356 -0.71001150
## 5 Hostage Taking (Barricade Incident) 0.4888697  1.1890336  0.39193440
## 6         Hostage Taking (Kidnapping) 0.2566760  0.5835999 -0.51511107
##         Dim3     inertia
## 1 -0.2217367 0.077940993
## 2 -0.1557028 0.040257662
## 3  0.8278074 0.023750199
## 4 -0.8120140 0.001078845
## 5 -4.7425356 0.001339768
## 6 -1.6839535 0.004317435
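
As a quick alternative to the base plot(fit) biplot drawn inside simple_ca(), the combined coordinates it returns can also be plotted with ggplot2. A small sketch using the attack-type result (not part of the original workflow):

# Scatter of the first two CA dimensions for attack types and fatality levels
ggplot(afDf, aes(Dim1, Dim2, label = name)) +
  geom_point(colour = "steelblue") +  # one point per row/column category
  geom_text(vjust = -0.7, size = 3) + # label each point with its category name
  labs(title = "Attack type vs. fatality level (CA)",
       x = "Dimension 1", y = "Dimension 2")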

Topic Modelling

For topic modelling I chose only the attack summary, weapon detail and attack motive fields, restricted to the past two decades.

rawDf <- as_tibble(read.csv("../../data/sub_data/topics.csv")) # loading the topic-modelling subset

rawDf$iyear <- as.integer(as.character(rawDf$iyear)) # converting year to integer

rawDf$iyear <- factor(rawDf$iyear) # changing year to factor

rawDf <- rawDf %>% mutate_if(is.character, list(~na_if(., ""))) # converting empty strings to NA

rawDf # printing 1st few rows
## # A tibble: 131,350 x 5
##     eventid iyear summary               motive            weapdetail            
##       <dbl> <fct> <chr>                 <chr>             <chr>                 
##  1  2.00e11 2000  01/01/2000: In the f~ Unknown           <NA>                  
##  2  2.00e11 2000  01/01/2000: In the s~ Unknown           <NA>                  
##  3  2.00e11 2000  01/01/2000: Members ~ Unknown           A sniper rifle was us~
##  4  2.00e11 2000  01/01/2000: A Romany~ Unknown           <NA>                  
##  5  2.00e11 2000  01/01/2000: Over six~ It was possible ~ <NA>                  
##  6  2.00e11 2000  01/01/2000: In a ser~ An attempt to di~ Unknown gun types, ex~
##  7  2.00e11 2000  01/01/2000: An Oil a~ Unknown           <NA>                  
##  8  2.00e11 2000  01/01/2000: A Serb w~ Unknown           <NA>                  
##  9  2.00e11 2000  01/01/2000: A pipe b~ Unknown           <NA>                  
## 10  2.00e11 2000  01/01/2000: A Civil ~ Unknown           36 petrol bombs were ~
## # ... with 131,340 more rows
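
The topics.csv subset already covers only 2000 onwards, so no extra filtering is needed here; if you rebuild it from the full GTD export, a filter along these lines (a sketch) keeps just the last two decades:

rawDf <- rawDf %>%
  filter(as.integer(as.character(iyear)) >= 2000) # keep years 2000 and later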

Performing topic modelling on the attack summaries and extracting the required data:

## Topic modelling on Summary
SumDf <- rawDf %>% select(iyear, summary) %>% drop_na() # creating a subset of raw data

# cleaning the strings
SumDf$summary %<>%  
  str_replace_all("[^A-Za-z]", " ") %>% # keep letters only (drop digits and punctuation)
  str_replace_all("[\\s]+", " ") %>%    # collapse repeated whitespace
  tolower()                             # convert to lowercase

# tokenization
SumDf %<>% 
  mutate(line = row_number()) %>%  # add a row id per summary
  unnest_tokens(word, summary) %>% # split the summaries into words
  anti_join(stop_words) %>%        # remove stop words
  filter(word != "responsibility" & word != "claimed") # drop reporting boilerplate words
## Joining, by = "word"
# calculating tf-idf
Sum_tfIdf <- SumDf %>% 
  count(iyear, word, sort = TRUE) %>% # count each word by year 
  bind_tf_idf(word, iyear, n) %>%     # tf-idf of each word within each year
  arrange(-tf_idf) %>%                # arrange in descending order of tf-idf
  group_by(iyear) %>% distinct(n, .keep_all = TRUE) %>% # keep one word per distinct count to thin out ties
  top_n(10) %>% ungroup()             # keep the top 10 per year
## Selecting by tf_idf
# plotting the tf-idf
g1 <- Sum_tfIdf %>%
  mutate(word=reorder_within(word, tf_idf, iyear)) %>% # reorder words within each year
  ggplot(aes(word, tf_idf, fill=iyear)) +              # initializing the ggplot object
  geom_col(alpha=0.8, show.legend = F) +               # column/bar chart
  facet_wrap(~iyear, scales = "free", ncol = 5) +      # one panel per year with free scales
  scale_x_reordered() +                                # strip the reorder_within suffix from labels
  coord_flip()                                         # flip the coordinates
g1 

gd1 <- ggplot_build(g1)$data # getting the plot data

glimpse(gd1) # printing the plot data
## List of 1
##  $ :'data.frame':    200 obs. of  14 variables:
##   ..$ fill       : chr [1:200] "#F8766D" "#F8766D" "#F8766D" "#F8766D" ...
##   ..$ x          : 'mapped_discrete' int [1:200] 10 9 8 7 6 5 4 3 2 1 ...
##   ..$ y          : num [1:200] 0.00168 0.00145 0.00128 0.00127 0.00127 ...
##   ..$ PANEL      : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ group      : int [1:200] 153 137 124 121 120 112 92 66 65 48 ...
##   ..$ flipped_aes: logi [1:200] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..$ ymin       : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ ymax       : num [1:200] 0.00168 0.00145 0.00128 0.00127 0.00127 ...
##   ..$ xmin       : 'mapped_discrete' num [1:200] 9.55 8.55 7.55 6.55 5.55 4.55 3.55 2.55 1.55 0.55 ...
##   ..$ xmax       : 'mapped_discrete' num [1:200] 10.45 9.45 8.45 7.45 6.45 ...
##   ..$ colour     : logi [1:200] NA NA NA NA NA NA ...
##   ..$ size       : num [1:200] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##   ..$ linetype   : num [1:200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ alpha      : num [1:200] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
write.csv(gd1, "../../data/sub_data/topic modellling/topic.csv") # saving the plot data
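
The pipeline above ranks words by tf-idf rather than fitting a probabilistic topic model. If you want the latter, a minimal LDA sketch is shown below; it assumes the topicmodels and tm packages are installed (they are not used anywhere else in this post) and treats each year's summaries as one document.

# Minimal LDA sketch on the tokenized summaries (assumes topicmodels + tm are installed)
library(topicmodels)

sum_dtm <- SumDf %>%
  count(iyear, word) %>%   # word counts per year
  cast_dtm(iyear, word, n) # tidy counts -> DocumentTermMatrix (one document per year)

sum_lda <- LDA(sum_dtm, k = 4, control = list(seed = 1234)) # fit a 4-topic model

tidy(sum_lda, matrix = "beta") %>% # per-topic word probabilities
  group_by(topic) %>%
  top_n(10, beta) %>%              # top 10 words for each topic
  ungroup() %>%
  arrange(topic, -beta)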

Similarly for the other data frames:

## Topic modelling on terrorist motives
motDf <- rawDf %>% select(iyear, motive) %>% drop_na() # Creating a subset

motDf <- motDf[(motDf$motive!="Unknown"),] # removing unknown values

# text cleaning 
motDf$motive %<>%  
  str_replace_all("[^A-Za-z]", " ") %>% # keep letters only
  str_replace_all("[\\s]+", " ") %>%    # collapse repeated whitespace
  tolower()                             # convert to lowercase

# tokenization
motDf %<>% 
  mutate(line = row_number()) %>% # add a row id per motive
  unnest_tokens(word, motive) %>% # split the motives into words
  anti_join(stop_words)           # remove stop words
## Joining, by = "word"
# calculating tf-idf 
mot_tfIdf <- motDf %>% 
  count(iyear, word, sort = TRUE) %>% # count each word by year
  bind_tf_idf(word, iyear, n) %>%     # tf-idf of each word within each year
  arrange(-tf_idf) %>%                # arrange in descending order of tf-idf
  group_by(iyear) %>% distinct(n, .keep_all = TRUE) %>% # keep one word per distinct count to thin out ties
  top_n(10) %>% ungroup()             # keep the top 10 per year
## Selecting by tf_idf
# plotting the motives
g2 <- mot_tfIdf %>%
  mutate(word=reorder_within(word, tf_idf, iyear)) %>% # reorder words within each year
  ggplot(aes(word, tf_idf, fill=iyear)) +              # initializing the ggplot
  geom_col(alpha=0.8, show.legend = F) +               # column chart
  facet_wrap(~iyear, scales = "free", ncol = 5) +      # one panel per year with free scales 
  scale_x_reordered() +                                # strip the reorder_within suffix from labels
  coord_flip()                                         # flip the coordinates 
g2 

gd2 <- ggplot_build(g2)$data # extract the plot data

glimpse(gd2) # print the data
## List of 1
##  $ :'data.frame':    200 obs. of  14 variables:
##   ..$ fill       : chr [1:200] "#F8766D" "#F8766D" "#F8766D" "#F8766D" ...
##   ..$ x          : 'mapped_discrete' int [1:200] 10 9 8 7 6 5 4 3 2 1 ...
##   ..$ y          : num [1:200] 0.0035 0.00333 0.00314 0.0031 0.00292 ...
##   ..$ PANEL      : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ group      : int [1:200] 90 82 76 75 69 63 54 46 45 41 ...
##   ..$ flipped_aes: logi [1:200] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..$ ymin       : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ ymax       : num [1:200] 0.0035 0.00333 0.00314 0.0031 0.00292 ...
##   ..$ xmin       : 'mapped_discrete' num [1:200] 9.55 8.55 7.55 6.55 5.55 4.55 3.55 2.55 1.55 0.55 ...
##   ..$ xmax       : 'mapped_discrete' num [1:200] 10.45 9.45 8.45 7.45 6.45 ...
##   ..$ colour     : logi [1:200] NA NA NA NA NA NA ...
##   ..$ size       : num [1:200] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##   ..$ linetype   : num [1:200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ alpha      : num [1:200] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
## Topic modelling on weapon detail
weapDf <- rawDf %>% select(iyear, weapdetail) %>% drop_na() # creating a subset

# text cleaning 
weapDf$weapdetail %<>%  
  str_replace_all("[^A-Za-z]", " ") %>% # keep letters only
  str_replace_all("[\\s]+", " ") %>%    # collapse repeated whitespace
  tolower()                             # convert to lowercase

# tokenization
weapDf %<>% 
  mutate(line = row_number()) %>% # creating a row number
  unnest_tokens(word, weapdetail) %>% # unnesting into words
  anti_join(stop_words) # removing stopwords
## Joining, by = "word"
# calculating tf-idf
weap_tfIdf <- weapDf %>% 
  count(iyear, word, sort = TRUE) %>% # count each word by year
  bind_tf_idf(word, iyear, n) %>%     # tf-idf of each word within each year
  arrange(-tf_idf) %>%                # order by descending tf-idf
  group_by(iyear) %>% distinct(n, .keep_all = TRUE) %>% # keep one word per distinct count to thin out ties
  top_n(10) %>% ungroup()             # keep the top 10 per year
## Selecting by tf_idf
# plotting the tf-idf
g3 <- weap_tfIdf %>%
  mutate(word=reorder_within(word, tf_idf, iyear)) %>% # reorder words within each year
  ggplot(aes(word, tf_idf, fill=iyear)) +              # initialize ggplot
  geom_col(alpha=0.8, show.legend = F) +               # column chart
  facet_wrap(~iyear, scales = "free", ncol = 5) +      # one panel per year with free scales
  scale_x_reordered() +                                # strip the reorder_within suffix from labels
  coord_flip()                                         # flip the coordinates
g3

gd3 <- ggplot_build(g3)$data # extract the plot data

glimpse(gd3) # printing the data
## List of 1
##  $ :'data.frame':    200 obs. of  14 variables:
##   ..$ fill       : chr [1:200] "#F8766D" "#F8766D" "#F8766D" "#F8766D" ...
##   ..$ x          : 'mapped_discrete' int [1:200] 10 9 8 7 6 5 4 3 2 1 ...
##   ..$ y          : num [1:200] 0.01297 0.00774 0.00607 0.00541 0.00479 ...
##   ..$ PANEL      : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ group      : int [1:200] 189 176 161 154 147 139 128 114 113 104 ...
##   ..$ flipped_aes: logi [1:200] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..$ ymin       : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ ymax       : num [1:200] 0.01297 0.00774 0.00607 0.00541 0.00479 ...
##   ..$ xmin       : 'mapped_discrete' num [1:200] 9.55 8.55 7.55 6.55 5.55 4.55 3.55 2.55 1.55 0.55 ...
##   ..$ xmax       : 'mapped_discrete' num [1:200] 10.45 9.45 8.45 7.45 6.45 ...
##   ..$ colour     : logi [1:200] NA NA NA NA NA NA ...
##   ..$ size       : num [1:200] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##   ..$ linetype   : num [1:200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ alpha      : num [1:200] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...

Finally, I combined these exported files using Tableau joins to obtain my final data frame.
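
If you would rather stay in R than join in Tableau, one way to stack the five CA outputs into a single table is sketched below (the combined file name is hypothetical, and this is not the exact join performed in Tableau).

# Stack the per-facet CA results into one table (rough stand-in for the Tableau join;
# "CA_combined.csv" is a hypothetical file name)
caAll <- bind_rows(
  Region  = rfDf,
  Country = cfDf,
  Weapon  = wfDf,
  Target  = tfDf,
  Attack  = afDf,
  .id = "facet"
)
write.csv(caAll, "../../data/sub_data/CA_combined.csv", row.names = FALSE)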