The Global Terrorism Database or simply called GTD is an unclassified database for terrorist attacks in the world.
In this post we are going to see how I performed the correspondance analysis and Topic modelling to a subset of GTD dataset to uncover hidden patterns.
Note: The datset used to perform these tasks are subsetted using SQLite3 queries in Database Browser.I hope you can reproduce them.
# Loading the necessary libraries
library(dplyr) # data manupulation library
library(reshape2) # used to reshaping the data
library(ca) # to perform correspondance analysis
library(stringr) # string manupulation library
library(tidyverse) # collection of tidymodels packages
library(tidytext) # used for text mining
library(ggplot2) # used for data visualization
Firstly I wrote a utitlit function to perform ca and obtain the required values to plot a graph.The function goes as follows.
names <- c("1-2", "3-9", "10-29" ,">=30") # creating a vector of fatality levels
# creating a fucntion to perform CA
simple_ca <- function(df, formula) {
regCT <- dcast(df, formula) # dcasting the data by provided formula
regCT[is.na(regCT)] <- 0 # assigning null a zero value
fit <- ca(regCT[, names]) # performing the CA
tmp <- data.frame(name=regCT$txt,chsqdist=fit$rowdist, fit$rowcoord,
inertia=fit$rowinertia) # creating a dataframe of necessary values
tmp2 <- data.frame(name=c("Very High" ,"Very Low", "High", "Low"),
chsqdist=fit$coldist, fit$colcoord, inertia=fit$colinertia) # getting the CA values
tmp3 <- rbind(tmp, tmp2) # row binding the dataframes
plot(fit) # plotting the cA
return(tmp3) # returning the required data
}
Now let’s load the dataframe for regions
regDf <- read.csv("../../data/sub_data/Fatality Freq by Region.csv") # loading the region data
names(regDf) <- c("txt", "Region", "FatalityLevel", "Count") # assigning custom colnames
head(regDf) # printing the 1st few rows
## txt Region FatalityLevel Count
## 1 Middle East & North Africa 10 >=30 26696
## 2 Middle East & North Africa 10 10-29 2578
## 3 Middle East & North Africa 10 3-9 8513
## 4 Middle East & North Africa 10 1-2 18836
## 5 South Asia 6 >=30 24778
## 6 South Asia 6 10-29 2310
performing the analysis and writing the files into a csv
rfDf <- simple_ca(regDf, Region+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.
head(rfDf) # printing the 1st few rows
## name chsqdist Dim1 Dim2 Dim3
## 1 North America 0.5901336 -2.4730326 0.9050432 1.20742588
## 2 Central America & Caribbean 0.2520180 -0.7905412 1.8203108 -1.90556258
## 3 South America 0.2525573 -0.9908017 1.0094568 1.29704173
## 4 East Asia 0.5489721 -2.1769695 2.1315703 0.08235646
## 5 Southeast Asia 0.1876448 -0.3480425 -1.8610862 -0.51412768
## 6 South Asia 0.1397336 0.5813942 -0.1111881 1.16382265
## inertia
## 1 0.006441518
## 2 0.003266065
## 3 0.006209175
## 4 0.001243204
## 5 0.002462793
## 6 0.004966294
write.csv(rfDf, "../../data/sub_data/CA.xlsx") #saving it to a csv
Similarly with other dataframes
# CA on country data
cntryDf <- read.csv("../../data/sub_data/Fatality Freq by Country.csv") #loading the country data
names(cntryDf) <- c("txt", "Country", "FatalityLevel", "Count") # naming the columns
head(cntryDf) # prinitng the 1st few rows
## txt Country FatalityLevel Count
## 1 Iraq 95 >=30 10035
## 2 Iraq 95 10-29 1331
## 3 Iraq 95 3-9 4825
## 4 Iraq 95 1-2 10567
## 5 Afghanistan 4 >=30 5027
## 6 Afghanistan 4 10-29 1182
cfDf <- simple_ca(cntryDf, Country+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.
head(cfDf) # printing the 1st few rows
## name chsqdist Dim1 Dim2 Dim3 inertia
## 1 Afghanistan 0.5302638 -1.5352536 0.48278188 1.5341965 0.0236973079
## 2 Algeria 0.5051409 -1.4748128 0.29122546 -1.3498472 0.0036535884
## 3 Angola 0.3881101 0.2339624 2.39910853 -0.2954537 0.0003921405
## 4 Argentina 0.6452989 1.9020060 0.42560238 -0.1516066 0.0017680523
## 5 Bangladesh 0.3776133 1.1185291 -0.07352067 0.1141011 0.0012624282
## 6 Bolivia 0.7434582 2.1563244 0.90637586 0.9785849 0.0009107626
# CA on weapons data
weapDf <- read.csv("../../data/sub_data/Fatality Freq by Weapon Type1.csv") #loading the weapons data
names(weapDf) <- c("txt", "Weapon", "FatalityLevel", "Count") # naming the columns
head(weapDf) # printing the 1st few rows
## txt Weapon FatalityLevel Count
## 1 Explosives 6 >=30 65358
## 2 Explosives 6 10-29 3362
## 3 Explosives 6 3-9 11743
## 4 Explosives 6 1-2 20232
## 5 Firearms 5 >=30 21184
## 6 Firearms 5 10-29 3402
wfDf <- simple_ca(weapDf, Weapon+txt~FatalityLevel) # performing cA
## Using Count as value column: use value.var to override.
head(wfDf) # prinitng the 1st few rows
## name chsqdist Dim1 Dim2 Dim3 inertia
## 1 Biological 0.6044652 -1.4672389 4.7793579 1.90321770 6.513916e-05
## 2 Chemical 0.5492654 -1.5131465 0.9121140 3.01371156 5.154430e-04
## 3 Radiological 0.9075066 -2.4809896 2.6745848 1.38777316 5.302002e-05
## 4 Firearms 0.4615347 1.2813150 0.4207661 -0.03954476 6.871318e-02
## 5 Explosives 0.2093462 -0.5800027 -0.2225406 -0.69509908 2.185420e-02
## 6 Fake Weapons 0.8013542 -2.1522056 3.3572139 1.55494436 1.176651e-04
# CA on target data
targDf <- read.csv("../../data/sub_data/Fatality Freq by Target Type1.csv") # loading the target data
names(targDf) <- c("txt", "Target", "FatalityLevel", "Count") # naming the columns
head(targDf) # printing the 1st few rows
## txt Target FatalityLevel Count
## 1 Private Citizens & Property 14 >=30 22480
## 2 Private Citizens & Property 14 10-29 2668
## 3 Private Citizens & Property 14 3-9 8172
## 4 Private Citizens & Property 14 1-2 16374
## 5 Military 4 >=30 13782
## 6 Military 4 10-29 2444
tfDf <- simple_ca(targDf, Target+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.
head(tfDf) # printing the 1st few rows
## name chsqdist Dim1 Dim2 Dim3 inertia
## 1 Business 0.3714388 -1.1760741 -0.09079016 0.16603061 0.014945128
## 2 Government (General) 0.2196121 -0.2882984 1.74324648 -1.01206440 0.005554735
## 3 Police 0.2919282 0.8835164 0.56809336 1.80080243 0.011528746
## 4 Military 0.3051099 0.8303005 -1.36652390 -0.69102824 0.014803010
## 5 Abortion Related 0.8508017 -2.6784825 -0.81262648 0.68811837 0.001010887
## 6 Airports & Aircraft 0.5971910 -1.8583039 -0.98825133 0.08868291 0.002507910
# CA on attack data
atckDf <- read.csv("../../data/sub_data/Fatality Freq by Attack Type1.csv") # loading the attack data
names(atckDf) <- c("txt", "Attack", "FatalityLevel", "Count") # naming columns
head(atckDf) # printing 1st few rows
## txt Attack FatalityLevel Count
## 1 Bombing/Explosion 3 >=30 62847
## 2 Bombing/Explosion 3 10-29 3073
## 3 Bombing/Explosion 3 3-9 10885
## 4 Bombing/Explosion 3 1-2 18867
## 5 Armed Assault 2 >=30 16607
## 6 Armed Assault 2 10-29 3422
afDf <- simple_ca(atckDf, Attack+txt~FatalityLevel) # performing CA
## Using Count as value column: use value.var to override.
head(afDf) # printing the 1st few rows
## name chsqdist Dim1 Dim2
## 1 Assassination 0.8670784 -1.9079964 -2.06851035
## 2 Armed Assault 0.4130508 -0.9158321 0.95971546
## 3 Bombing/Explosion 0.2238941 0.5592376 -0.05638744
## 4 Hijacking 0.5407579 1.3071356 -0.71001150
## 5 Hostage Taking (Barricade Incident) 0.4888697 1.1890336 0.39193440
## 6 Hostage Taking (Kidnapping) 0.2566760 0.5835999 -0.51511107
## Dim3 inertia
## 1 -0.2217367 0.077940993
## 2 -0.1557028 0.040257662
## 3 0.8278074 0.023750199
## 4 -0.8120140 0.001078845
## 5 -4.7425356 0.001339768
## 6 -1.6839535 0.004317435
I have choosen the data of attack summary, weapon details and motive of attack only for past 2 decades.
rawDf <- tibble(read.csv("../../data/sub_data/topics.csv")) # loading the topics modeling subset
rawDf$iyear <- as.integer(as.character(rawDf$iyear)) # converting year to integer
rawDf$iyear <- factor(rawDf$iyear) # changing year to factor
rawDf <- rawDf %>% mutate_if(is.character, list(~na_if(.,""))) # converting blanks to NULL
rawDf # printing 1st few rows
## # A tibble: 131,350 x 5
## eventid iyear summary motive weapdetail
## <dbl> <fct> <chr> <chr> <chr>
## 1 2.00e11 2000 01/01/2000: In the f~ Unknown <NA>
## 2 2.00e11 2000 01/01/2000: In the s~ Unknown <NA>
## 3 2.00e11 2000 01/01/2000: Members ~ Unknown A sniper rifle was us~
## 4 2.00e11 2000 01/01/2000: A Romany~ Unknown <NA>
## 5 2.00e11 2000 01/01/2000: Over six~ It was possible ~ <NA>
## 6 2.00e11 2000 01/01/2000: In a ser~ An attempt to di~ Unknown gun types, ex~
## 7 2.00e11 2000 01/01/2000: An Oil a~ Unknown <NA>
## 8 2.00e11 2000 01/01/2000: A Serb w~ Unknown <NA>
## 9 2.00e11 2000 01/01/2000: A pipe b~ Unknown <NA>
## 10 2.00e11 2000 01/01/2000: A Civil ~ Unknown 36 petrol bombs were ~
## # ... with 131,340 more rows
Performing topic modeling on attack Summary and extracting required data
## Topic modelling on Summary
SumDf <- rawDf %>% select(iyear, summary) %>% drop_na() # creating a subset of raw data
# cleaning the string
SumDf$summary %<>%
str_replace_all("[^A-z]", " ") %>% # excluding numericals
str_replace_all("[\\s]+", " ") %>% # excluding whitespace
tolower() # converting to lowercase
# tokenization
SumDf %<>%
mutate(line = row_number()) %>% # generating wor numbers
unnest_tokens(word, summary) %>% # unnesting into words
anti_join(stop_words) %>% # removing stop words
filter(word != "responsibility" & word!="claimed") # removing outliers
## Joining, by = "word"
# calculating frequency
Sum_tfIdf <- SumDf %>%
count(iyear, word, sort = TRUE) %>% # count each word by year
bind_tf_idf(word, iyear, n) %>%
arrange(-tf_idf) %>% # arrange in the order of tf-idf
group_by(iyear) %>% distinct(n, .keep_all = TRUE) %>% # extract distinct count
top_n(10) %>% ungroup() # filter top 10
## Selecting by tf_idf
# plotting the tf-idf
g1 <- Sum_tfIdf %>%
mutate(word=reorder_within(word, tf_idf, iyear))%>% # reorder words
ggplot(aes(word, tf_idf, fill=iyear)) + # initializing the gggplot object
geom_col(alpha=0.8, show.legend = F) + # creating a column/bar chart
facet_wrap(~iyear, scales = "free", ncol = 5) + # wraping the year with free scaling
scale_x_reordered() + # reordering scales
coord_flip() # flipping the coordinates
g1
gd1 <- ggplot_build(g1)$data # getting the plot data
glimpse(gd1) # printing the plot data
## List of 1
## $ :'data.frame': 200 obs. of 14 variables:
## ..$ fill : chr [1:200] "#F8766D" "#F8766D" "#F8766D" "#F8766D" ...
## ..$ x : 'mapped_discrete' int [1:200] 10 9 8 7 6 5 4 3 2 1 ...
## ..$ y : num [1:200] 0.00168 0.00145 0.00128 0.00127 0.00127 ...
## ..$ PANEL : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..$ group : int [1:200] 153 137 124 121 120 112 92 66 65 48 ...
## ..$ flipped_aes: logi [1:200] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..$ ymin : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ ymax : num [1:200] 0.00168 0.00145 0.00128 0.00127 0.00127 ...
## ..$ xmin : 'mapped_discrete' num [1:200] 9.55 8.55 7.55 6.55 5.55 4.55 3.55 2.55 1.55 0.55 ...
## ..$ xmax : 'mapped_discrete' num [1:200] 10.45 9.45 8.45 7.45 6.45 ...
## ..$ colour : logi [1:200] NA NA NA NA NA NA ...
## ..$ size : num [1:200] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## ..$ linetype : num [1:200] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ alpha : num [1:200] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
write.csv(gd1, "../../data/sub_data/topic modellling/topic.csv") # saving the plot data
similarly you can do for other data frames
## Topic Modeling Terrorists Motive
motDf <- rawDf %>% select(iyear, motive) %>% drop_na() # Creating a subset
motDf <- motDf[(motDf$motive!="Unknown"),] # removing unknown values
# text cleaning
motDf$motive %<>%
str_replace_all("[^A-z]", " ") %>% # extract only aphabets
str_replace_all("[\\s]+", " ") %>% # removing white spaces
tolower() # converting to lowercase
# tokenization
motDf %<>%
mutate(line = row_number()) %>% # creating s row number
unnest_tokens(word, motive) %>% # unnesting words
anti_join(stop_words) # removing stop words
## Joining, by = "word"
# calculating tf-idf
mot_tfIdf <- motDf %>%
count(iyear, word, sort = TRUE) %>% # counting words by year
bind_tf_idf(word, iyear, n) %>%
arrange(-tf_idf) %>% # arrange by tf-idf value
group_by(iyear) %>% distinct(n, .keep_all = TRUE) %>% # extract distinct count
top_n(10) %>% ungroup() # filter top 10
## Selecting by tf_idf
# plotting the motives
g2 <- mot_tfIdf %>%
mutate(word=reorder_within(word, tf_idf, iyear))%>% # order by tf-idf
ggplot(aes(word, tf_idf, fill=iyear)) + # initializing the ggplot
geom_col(alpha=0.8, show.legend = F) + # creating colum chart
facet_wrap(~iyear, scales = "free", ncol = 5) + # adding facet by year with free scale
scale_x_reordered() + # re-order scaling
coord_flip() # filp the coordinates
g2
gd2 <- ggplot_build(g2)$data # extract the plot data
glimpse(gd2) # print the data
## List of 1
## $ :'data.frame': 200 obs. of 14 variables:
## ..$ fill : chr [1:200] "#F8766D" "#F8766D" "#F8766D" "#F8766D" ...
## ..$ x : 'mapped_discrete' int [1:200] 10 9 8 7 6 5 4 3 2 1 ...
## ..$ y : num [1:200] 0.0035 0.00333 0.00314 0.0031 0.00292 ...
## ..$ PANEL : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..$ group : int [1:200] 90 82 76 75 69 63 54 46 45 41 ...
## ..$ flipped_aes: logi [1:200] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..$ ymin : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ ymax : num [1:200] 0.0035 0.00333 0.00314 0.0031 0.00292 ...
## ..$ xmin : 'mapped_discrete' num [1:200] 9.55 8.55 7.55 6.55 5.55 4.55 3.55 2.55 1.55 0.55 ...
## ..$ xmax : 'mapped_discrete' num [1:200] 10.45 9.45 8.45 7.45 6.45 ...
## ..$ colour : logi [1:200] NA NA NA NA NA NA ...
## ..$ size : num [1:200] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## ..$ linetype : num [1:200] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ alpha : num [1:200] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
## Topics modeling Weapons detail
weapDf <- rawDf %>% select(iyear, weapdetail) %>% drop_na() # creating a subset
# text cleaning
weapDf$weapdetail %<>%
str_replace_all("[^A-z]", " ") %>% # filter alphabets
str_replace_all("[\\s]+", " ") %>% # removing white space
tolower() # converting to lower
# tokenization
weapDf %<>%
mutate(line = row_number()) %>% # creating a row number
unnest_tokens(word, weapdetail) %>% # unnesting into words
anti_join(stop_words) # removing stopwords
## Joining, by = "word"
# creating the frequency
weap_tfIdf <- weapDf %>%
count(iyear, word, sort = TRUE) %>% # groupby year and count words
bind_tf_idf(word, iyear, n) %>%
arrange(-tf_idf) %>% # order by tf-idf
group_by(iyear) %>% distinct(n, .keep_all = TRUE) %>% # extract distinct count
top_n(10) %>% ungroup() # filter top 10
## Selecting by tf_idf
# plotting the tf-idf
g3 <- weap_tfIdf %>%
mutate(word=reorder_within(word, tf_idf, iyear))%>% # reorder by year and tf-tdf
ggplot(aes(word, tf_idf, fill=iyear)) + # initialize ggplot
geom_col(alpha=0.8, show.legend = F) + # create a column chart
facet_wrap(~iyear, scales = "free", ncol = 5) + # wraping year with free scaling
scale_x_reordered() + # reorder scales
coord_flip() # flipping the coordinates
g3
gd3 <- ggplot_build(g3)$data # extract the plot data
glimpse(gd3) # printing the data
## List of 1
## $ :'data.frame': 200 obs. of 14 variables:
## ..$ fill : chr [1:200] "#F8766D" "#F8766D" "#F8766D" "#F8766D" ...
## ..$ x : 'mapped_discrete' int [1:200] 10 9 8 7 6 5 4 3 2 1 ...
## ..$ y : num [1:200] 0.01297 0.00774 0.00607 0.00541 0.00479 ...
## ..$ PANEL : Factor w/ 20 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..$ group : int [1:200] 189 176 161 154 147 139 128 114 113 104 ...
## ..$ flipped_aes: logi [1:200] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..$ ymin : num [1:200] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ ymax : num [1:200] 0.01297 0.00774 0.00607 0.00541 0.00479 ...
## ..$ xmin : 'mapped_discrete' num [1:200] 9.55 8.55 7.55 6.55 5.55 4.55 3.55 2.55 1.55 0.55 ...
## ..$ xmax : 'mapped_discrete' num [1:200] 10.45 9.45 8.45 7.45 6.45 ...
## ..$ colour : logi [1:200] NA NA NA NA NA NA ...
## ..$ size : num [1:200] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## ..$ linetype : num [1:200] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ alpha : num [1:200] 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 0.8 ...
After all of these I have combined them using Tableau joins to obtain my final dataframe.