The main aim of this analytics project is to uncover hidden trends and previously unknown information about the terrorist attacks that took place around the world from 1970 to 2016: why they happened, where they happened, which countries were most affected, which entities the terrorist groups targeted most, which terrorist groups were the most violent and active worldwide, and more.

The analysis then drills down into specific countries that have been heavily affected by terrorist activity in the past, such as India, the USA and the UK.

#analyzing the GLobal Terrorist Activities
require(data.table)
require(readr) #to read data faster
require(ggplot2)
#Data preprocessing and Transformation packages
require(dplyr)
require(tidyr)


# Load the Global Terrorism Database export into `terror`; every later
# data frame in this script is derived from it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
terror<-read_csv("F:/globalterrorismdb_0617dist.csv")

#checking the structure of data
#str(terror)
#summary(terror)
#a wide data set with lots of columns


#---------------------

Starting with Descriptive Analytics

# How many attacks succeeded? (`success`: 0 = failed, 1 = succeeded)
table(terror$success)
## 
##      0      1 
##  17649 152701
# Turn the tally into a two-column data frame (Attack label, count) for plotting.
success <- as.data.frame(table(terror$success))
success$Var1 <- ifelse(success$Var1 == 1, "Successful", "Unsuccessful")
names(success) <- c("Attack", "count")
# Almost 90% of all attacks were successful (152701 of 170350).
# Title corrected: spelling of "Distribution", and the year range now matches
# the 1970-2016 span of the data set used throughout this script.
ggplot(success, aes(x = Attack, y = count)) +
  geom_col(width = 0.5, color = "black", fill = "purple", alpha = 0.5) +
  labs(
    x = "Attack",
    y = "Count",
    title = "Distribution of Successful and Unsuccessful attacks from 1970-2016"
  )

Most of the attacks in the past were successfully executed.


Analyzing what types of Attacks were done

Aggregating and summarising data.

# Tally of incidents per attack type.
table(terror$attacktype1_txt)
## 
##                       Armed Assault                       Assassination 
##                               40223                               18402 
##                   Bombing/Explosion      Facility/Infrastructure Attack 
##                               83073                                9581 
##                           Hijacking Hostage Taking (Barricade Incident) 
##                                 598                                 902 
##         Hostage Taking (Kidnapping)                     Unarmed Assault 
##                               10233                                 913 
##                             Unknown 
##                                6425
# Bombing/Explosion is the most common attack type.

# Cross-tabulate attack type against outcome, widen it to one row per attack
# type (columns: failed count, successful count), then append the share of
# successful and unsuccessful attacks as percentages rounded to 2 decimals.
attackdf <- data.frame(table(terror$attacktype1_txt, terror$success)) %>%
  spread(key = Var2, value = Freq) %>%
  setNames(c("Type_of_Attack", "Unsuccessful", "Successful")) %>%
  mutate(
    PerSuccessful = round((Successful / (Unsuccessful + Successful)) * 100, 2),
    PerUnsuccessful = round((Unsuccessful / (Unsuccessful + Successful)) * 100, 2)
  )

attackdf
##                        Type_of_Attack Unsuccessful Successful
## 1                       Armed Assault         1938      38285
## 2                       Assassination         4279      14123
## 3                   Bombing/Explosion         9520      73553
## 4      Facility/Infrastructure Attack          549       9032
## 5                           Hijacking           75        523
## 6 Hostage Taking (Barricade Incident)            8        894
## 7         Hostage Taking (Kidnapping)          232      10001
## 8                     Unarmed Assault          130        783
## 9                             Unknown          918       5507
##   PerSuccessful PerUnsuccessful
## 1         95.18            4.82
## 2         76.75           23.25
## 3         88.54           11.46
## 4         94.27            5.73
## 5         87.46           12.54
## 6         99.11            0.89
## 7         97.73            2.27
## 8         85.76           14.24
## 9         85.71           14.29
# Pie chart of successful-attack counts by attack type.
# Step 1: a single stacked bar (x is a constant) whose segments are the counts.
plot <- ggplot(attackdf, aes(x = "", y = Successful, fill = Type_of_Attack)) +
  geom_bar(width = 1, stat = "identity") +
  theme(
    axis.line = element_blank(),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Pie Chart of Count of the Types of Successful Attacks",
    x = NULL,
    y = NULL,
    fill = "Type of Attack"
  )

# Step 2: wrap the bar around a circle to obtain the pie.
# The segments are coloured through the `fill` aesthetic, so the Brewer palette
# has to be attached to the fill scale; the previous colour scale had no
# aesthetic to act on and the palette was silently ignored.
plot + coord_polar(theta = "y", start = 0) + scale_fill_brewer(palette = "Set1")

# Horizontal bar chart of the success percentage per attack type,
# bars sorted by that percentage.
theme_set(theme_grey())

ggplot(
  attackdf,
  aes(x = reorder(Type_of_Attack, PerSuccessful), y = PerSuccessful)
) +
  geom_col(color = "black", fill = "red", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Barplot of Types of Attacks and Successful attacks",
    x = "Type of Attack",
    y = "Percentage of Successful Attacks"
  )

As the plot shows, the most successful terror attacks were hostage takings, kidnappings, armed assaults and bombings.


Analyzing the countries where the terrorist attacks took place

# One row per country: failed count, successful count and their total.
# The outcome x country table is widened so each outcome becomes a column.
countrydf <- data.frame(table(terror$success, terror$country_txt)) %>%
  spread(Var1, Freq) %>%
  setNames(c("Country", "Unsuccessful", "Successful")) %>%
  mutate(TotalAttacks = (Successful + Unsuccessful))
#renaming the columns


# Top 20 countries by number of successfully executed attacks.
# countrydf holds one row per country, so the per-country mean simply carries
# that country's count through under a new column name.
Topcountrydf <- countrydf %>%
  group_by(Country) %>%
  summarise(Mean_Successfull = mean(Successful)) %>%
  top_n(20) %>%
  arrange(desc(Mean_Successfull))
## Selecting by Mean_Successfull
# Top 20 countries by number of unsuccessful attack attempts (same recipe).
TopFailsAttacks <- countrydf %>%
  group_by(Country) %>%
  summarise(Mean_Unsuccessfull = mean(Unsuccessful)) %>%
  top_n(20) %>%
  arrange(desc(Mean_Unsuccessfull))
## Selecting by Mean_Unsuccessfull
# Countries with few attacks: keep those with 20 to 100 attacks in total,
# sorted ascending by that total.
# NOTE(review): the frame is still grouped by Country when top_n(30) runs, so
# the ranking happens inside one-row groups - confirm whether a global
# top 30 was intended.
LeastAttackdf <- countrydf %>%
  group_by(Country) %>%
  filter(TotalAttacks %in% seq(20, 100)) %>%
  select(TotalAttacks) %>%
  top_n(30) %>%
  arrange(TotalAttacks)
## Adding missing grouping variables: `Country`
## Selecting by TotalAttacks
# New Zealand, Malaysia, Hong Kong, Cuba, UAE etc. are countries with very few
# terrorist attacks (fewer than 100 attempts).


# Bar plot: top countries by successful attacks.
# Axis labels use 1970-2016 to match the span of the data set.
theme_set(theme_classic())
ggplot(
  Topcountrydf,
  aes(x = reorder(Country, Mean_Successfull), y = Mean_Successfull)
) +
  geom_col(color = "black", fill = "#1111F7") +
  coord_flip() +
  labs(x = "Countries", y = "Successful Terrorist Attacks from 1970-2016")

# Bar plot: countries with the most unsuccessful attacks.
# geom_col() always plots the y values as given; the `stat` argument it used to
# receive here was discarded with an "Ignoring unknown parameters" warning,
# so it has been removed.
theme_set(theme_bw())
ggplot(
  TopFailsAttacks,
  aes(x = reorder(Country, Mean_Unsuccessfull), y = Mean_Unsuccessfull)
) +
  geom_col(color = "black", fill = "#DEFD16") +
  coord_flip() +
  labs(x = "Countries", y = "Unsuccessful Terrorist Attacks from 1970-2016") +
  scale_y_continuous(limits = c(0, 2100), breaks = seq(0, 2100, 200))

# Countries with the least terrorist activity: from the 20-100 attack band in
# LeastAttackdf keep those with more than 50 attacks.

leastFilter <- LeastAttackdf %>%
  filter(TotalAttacks > 50) %>%
  top_n(20)
## Selecting by TotalAttacks
# Bar plot for countries having more than 50 attacks.
# geom_col() plots TotalAttacks as given; the `stat = "count"` argument it
# used to receive was discarded with a warning, so it has been removed.
# The axis label uses 1970-2016 to match the span of the data set.
theme_set(theme_bw())
ggplot(
  leastFilter,
  aes(x = reorder(Country, -TotalAttacks), y = TotalAttacks)
) +
  geom_col(color = "black", fill = "#DEFD16") +
  coord_flip() +
  labs(
    x = "Countries with Least Terrorist Attacks",
    y = "Total Terrorist Attacks from 1970-2016"
  )


What were the targets of the terrorists?

# Incident count per target type, most frequent first.
worldTargetdf <- terror$targtype1_txt %>%
  table() %>%
  data.frame() %>%
  arrange(desc(Freq))

# Horizontal bars, one per target type, sorted by frequency.
theme_set(theme_bw())
ggplot(worldTargetdf, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_col(fill = "purple") +
  coord_flip()

As we can see from the plot above, the most targeted entities were citizens, followed by the military, police, government and businesses.


Creating a new Dataframe

# Countries to drill into.
countryList <- c("United States", "United Kingdom")

# Incidents in those countries, reduced to the columns used further below.
SpecificTargdf <- terror %>%
  filter(country_txt %in% countryList) %>%
  select(
    success, target1, targtype1_txt, attacktype1_txt,
    country_txt, city, iyear, gname, nkill
  )

The above data frame contains only attacks on two major developed countries, the United States of America and the United Kingdom, both of which were in the top-20 list of countries with the most successfully executed terrorist attacks.


Analyzing attack in USA

Creating a data frame with only some targetted variables.

# US incidents with no missing value in any of the selected columns.
# NOTE(review): na.omit() discards a row when *any* column is NA, not only
# nkill - confirm that is intended for the per-city and per-year counts below.
UStarg <- SpecificTargdf %>%
  na.omit() %>%
  filter(country_txt == "United States")
summary(UStarg$nkill)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.000    0.000    0.000    1.379    0.000 1383.000

City with most successful attacks

# Failed / successful attack counts for every US city (one row per city).
uscitydf <- data.frame(table(UStarg$city, UStarg$success)) %>%
  spread(Var2, Freq) %>%
  setNames(c("city", "unsuccessful", "success"))

# The 30 cities with the most successful attacks, highest first.
uscitysuccess <- uscitydf %>%
  group_by(city) %>%
  summarise(success = mean(success)) %>%
  arrange(desc(success)) %>%
  top_n(30)
## Selecting by success
# Most attacks are in New York.
theme_set(theme_minimal())
ggplot(uscitysuccess, aes(x = reorder(city, success), y = success)) +
  geom_col(fill = "#131D75") +
  coord_flip() +
  labs(x = "City", y = "Number of successfull attacks")

Time series of the number of successful attacks per year in the USA

Creating a dataframe with year and number of attacks.

theme_set(theme_bw())

# Failed / successful US attack counts per year (one row per year).
# `year` is a factor here because it comes from table().
yearsuccess <- data.frame(table(UStarg$iyear, UStarg$success)) %>%
  spread(Var2, Freq)
names(yearsuccess) <- c("year", "fail", "success")

# Numeric year for the time-series x axis, recovered from the factor labels.
# The previous hand-built sequence (1970-1992, 1994-2016) only lined up when
# exactly those 46 years were present, and it stored a whole data frame in the
# `attack` column; deriving the values from the data keeps every row aligned
# with its own year and yields a plain numeric column.
year <- data.frame(year = as.numeric(as.character(yearsuccess$year)))

# Add the numeric year to yearsuccess as column `attack`.
yearsuccess$attack <- year$year







# Time series: successful attacks in the USA per year (points joined by a line).
success <- ggplot(yearsuccess, aes(x = attack, y = success)) +
  geom_point(color = "#E80110", size = 2) +
  geom_line(color = "#E80110") +
  scale_x_continuous(limits = c(1970, 2016), breaks = seq(1970, 2016, 4)) +
  labs(
    x = "Year",
    y = "Number of successful attacks",
    title = "Time series of attacks in USA"
  )

success

# Same chart for failed attacks.
fail <- ggplot(yearsuccess, aes(x = attack, y = fail)) +
  geom_point(color = "#E18001", size = 2) +
  geom_line(color = "#E18001") +
  scale_x_continuous(limits = c(1970, 2016), breaks = seq(1970, 2016, 4)) +
  labs(
    x = "Year",
    y = "Number of failed attacks",
    title = "Time series of failed attacks in USA"
  )

fail


Analyzing attacks in India

I will create a new data frame for India by filtering and selecting only the relevant columns from the original data frame.

# All terrorist attacks in India, reduced to the columns analysed below.
indiaAttack <- terror %>%
  filter(country_txt == "India") %>%
  select(
    iyear, provstate, city, summary, location, success,
    attacktype1_txt, targtype1_txt, gname, weaptype1_txt, nkill
  )

# Which state has the most attacks? (raw state labels, before clean-up)
table(indiaAttack$provstate)
## 
##    Andhra pradesh    Andhra Pradesh Arunachal Pradesh             Assam 
##                41               234                17              1120 
##             Bihar        Chandigarh      Chhattisgarh             Delhi 
##               633                47               835               208 
##               Goa           Gujarat           Haryana  Himachal Pradesh 
##                 5                81                47                24 
## Jammu and Kashmir         Jharkhand         Karnataka            Kerala 
##              2197               801                67                78 
##    Madhya Pradesh       Maharashtra           Manipur         Meghalaya 
##                72               280              1011               259 
##           Mizoram          Nagaland            Odisha            Orissa 
##                26               106               374               221 
##        Puducherry            Punjab         Rajasthan            Sikkim 
##                 2               939                42                 4 
##        Tamil Nadu         Telangana           Tripura           Unknown 
##               158                16               114               155 
##     Uttar Pradesh       Uttaranchal       West Bengal 
##               180                23               561
# The raw table above lists two states under two spellings each
# ("Andhra pradesh"/"Andhra Pradesh" and "Orissa"/"Odisha"), which splits
# their counts across separate bars. Merge the variants before any per-state
# aggregation; all other labels are left untouched.
indiaAttack <- indiaAttack %>%
  mutate(
    provstate = recode(
      provstate,
      "Andhra pradesh" = "Andhra Pradesh",
      "Orissa" = "Odisha"
    )
  )

# Failed / successful attack counts per state (one row per state).
indiastate <- data.frame(table(indiaAttack$provstate, indiaAttack$success))

indiastate <- indiastate %>% spread(Var2, Freq)
names(indiastate) <- c("state", "Unsuccessful", "successful")

# States vs number of successful attacks, sorted by that number.
ggplot(indiastate, aes(x = reorder(state, successful), y = successful)) +
  geom_col(color = "black", fill = "blue", alpha = 0.6) +
  coord_flip() +
  scale_y_continuous(limits = c(0, 2000), breaks = seq(0, 2000, 400)) +
  labs(x = "States", y = "Successful terrorist attacks")

#most successfull terrorist attacks in Jammu and Kashmir
#We have Punjab , J and K and Assam on the top with most Successfull attacks
  
# Total fatalities per state.
# geom_col() stacks the per-incident nkill values, so each bar's height is the
# state's *sum*; reorder() defaults to the mean, which sorted the bars by a
# different quantity than the one drawn. Ordering by sum makes the two agree.
# The axis label uses 1970-2016 to match the span of the data set.
# NOTE(review): na.omit() drops a row when any selected column is NA (summary,
# location, ...), not only nkill - confirm this does not discard valid kills.
ggplot(
  na.omit(indiaAttack),
  aes(x = reorder(provstate, nkill, FUN = sum), y = nkill)
) +
  geom_col(fill = "#E63B10") +
  coord_flip() +
  labs(x = "State", y = "Number of Kills from 1970-2016")

# Number of recorded attacks per state.
# geom_bar() counts rows, so the states are ordered by that same row count;
# the previous ordering key (mean nkill) is NA for any state with a missing
# nkill and did not reflect what the bars show.
ggplot(
  indiaAttack,
  aes(x = reorder(provstate, provstate, FUN = length))
) +
  geom_bar(fill = "green", color = "black") +
  coord_flip() +
  labs(x = "State", y = "Number of attacks")

Punjab, J&K and Assam are at the top with the most successful terrorist attacks, and the maximum number of fatalities due to terrorist activities occurred in Chhattisgarh, Assam and J&K as well.

# Failed / successful attack counts for every Indian city (one row per city).
citydf <- data.frame(table(indiaAttack$city, indiaAttack$success)) %>%
  spread(Var2, Freq) %>%
  setNames(c("city", "unsuccess", "success"))

# The 25 cities with the highest count of successful attacks, highest first.
cityTopsuccess <- citydf %>%
  group_by(city) %>%
  summarise(success = mean(success)) %>%
  arrange(desc(success)) %>%
  top_n(25)
## Selecting by success
# Plot of the cities with the most successful attacks.
ggplot(cityTopsuccess, aes(x = reorder(city, success), y = success)) +
  geom_col(fill = "#FE3C01") +
  coord_flip() +
  scale_y_continuous(limits = c(0, 600), breaks = seq(0, 600, 100)) +
  labs(x = "city name", y = "Count of Successful terrorist attacks")


Checking the types of attacks that occurred in India

# Attack type x outcome counts for India, widened to one row per attack type
# (column "0" = failed, column "1" = successful).
india_typeAttack <- with(
  indiaAttack,
  data.frame(table(attacktype1_txt, success))
) %>%
  spread(success, Freq)
india_typeAttack
##                       attacktype1_txt    0    1
## 1                       Armed Assault  194 2767
## 2                       Assassination  204  954
## 3                   Bombing/Explosion 1033 3483
## 4      Facility/Infrastructure Attack   23  770
## 5                           Hijacking    4   33
## 6 Hostage Taking (Barricade Incident)    1   27
## 7         Hostage Taking (Kidnapping)   15 1060
## 8                     Unarmed Assault    1  106
## 9                             Unknown   23  280
# Bombings are the most frequent attack type in India, followed by armed assaults.
names(india_typeAttack) <- c("type", "fail", "success")

# Success and failure rates in percent, rounded to 2 decimals.
india_typeAttack <- india_typeAttack %>%
  mutate(
    successRate = round((success / (success + fail)) * 100, 2),
    failRate = round((fail / (success + fail)) * 100, 2)
  )

# Horizontal bars of the success rate per attack type, sorted by rate.
ggplot(
  india_typeAttack,
  aes(y = successRate, x = reorder(type, successRate))
) +
  geom_col(width = 0.8, fill = "#0000A4", alpha = 0.7) +
  coord_flip() +
  labs(x = "Type of attack", y = "Success Rate of Attacks(in %)")

The surprising thing is that Bombings/Explosions have the highest count among all types of terror attacks, yet their success rate is comparatively low. The attack type with the highest success rate is unarmed assault.


Analyzing the Target of the Terrorist activities

# Target type x outcome counts for India, one row per target type, with the
# total number of attacks and the success / failure rates in percent.
target_india <- with(indiaAttack, data.frame(table(targtype1_txt, success))) %>%
  spread(success, Freq) %>%
  setNames(c("target", "fail", "success")) %>%
  mutate(
    total = fail + success,
    successRate = round((success / (success + fail)) * 100, 2),
    failRate = round((fail / (success + fail)) * 100, 2)
  )

# Total attacks per target type, sorted by total.
ggplot(target_india, aes(x = reorder(target, total), y = total)) +
  geom_bar(stat = "identity", color = "white", fill = "#FE0202") +
  coord_flip() +
  labs(x = "Target Type ", y = "Total number of terror attacks")

# Most attacks targeted private citizens and property, followed by police
# and government.


# --------- Most successful targets ---------- #
theme_set(theme_classic())

# Success rate per target type, sorted by rate.
ggplot(target_india, aes(x = reorder(target, successRate), y = successRate)) +
  geom_col(width = 0.7, fill = "#F22424", alpha = 0.7) +
  coord_flip() +
  labs(x = "Target of Terror attacks", y = "% of Successul Attacks")


Time series analysis of Number of kills over the Years

I will create a dataframe consisting of the year and the total number of kills for that particular year.

# Total number of kills per year in India: group the incidents by year and
# sum nkill within each year.
# NOTE(review): na.omit() drops a row when any selected column is NA (summary,
# location, ...), not only nkill - confirm the yearly totals are not understated.
yearkillsIndia <- indiaAttack %>%
  na.omit() %>%
  group_by(iyear) %>%
  summarise(nkills = sum(nkill))

# Static time series: yearly kills as points joined by a line.
theme_set(theme_bw())
ggplot(yearkillsIndia, aes(x = iyear, y = nkills)) +
  geom_point(color = "purple", size = 2) +
  geom_line(color = "#9124F2") +
  scale_x_continuous(limits = c(1975, 2016), breaks = seq(1975, 2016, 4)) +
  labs(x = "Year", y = "number of Kills")

# Highest number of kills is in the year 2010.

# The same series as an interactive dygraphs chart with hover highlighting.
require(dygraphs)
## Loading required package: dygraphs
dygraph(yearkillsIndia) %>%
  dyHighlight(
    highlightCircleSize = 5,
    highlightSeriesBackgroundAlpha = 0.2,
    hideOnMouseOut = FALSE
  )

The above time series plot shows that the highest number of kills were in year 2010.


Group names associated with Terrorist activities

Creating a new dataframe which is grouped by the Group name .

# Indian incidents grouped by perpetrator group, limited to the columns
# needed for the group-level analysis.
gnamedf <- indiaAttack %>%
  group_by(gname) %>%
  select(success, attacktype1_txt, targtype1_txt, nkill, iyear, provstate, city)

# Count incidents per (group, outcome, attack type), then widen the outcome
# so each row is one (group, attack type) pair with failed / successful counts.
mostSuccessGroup <- as.data.frame(
  table(gnamedf$gname, gnamedf$success, gnamedf$attacktype1_txt)
) %>%
  spread(Var2, Freq) %>%
  setNames(c("gname", "attacktype", "Unsuccessful", "success"))

# Sort by successful count (highest first), drop the "Unknown" group and keep
# only rows with more than 20 successful attacks.
# NOTE(review): each row is a (group, attack type) pair, so the > 20 threshold
# applies per attack type rather than to a group's overall total - confirm
# this is the intended cut.
SuccessGroup <- mostSuccessGroup %>%
  group_by(gname) %>%
  select(success, Unsuccessful, attacktype) %>%
  arrange(desc(success)) %>%
  filter(gname != "Unknown", success > 20)

# Bar plot of groups vs number of successful attacks, coloured by attack type.
# A group can contribute several rows (one per attack type) and geom_col()
# stacks them, so the bar length is the group's *sum*. reorder() defaults to
# the mean, which ordered the groups by a different quantity than the one
# drawn; ordering by sum makes the bars appear in order of their length.
theme_set(theme_bw())
p <- ggplot(
  SuccessGroup,
  aes(x = reorder(gname, success, FUN = sum), y = success)
) +
  geom_col(aes(fill = attacktype)) +
  coord_flip() +
  labs(
    x = "Group Name",
    y = "Number of Successful attacks",
    title = "Groups and their successful attacks",
    fill = "Type of attack"
  ) +
  scale_y_continuous(breaks = seq(0, 2000, 200))

p + scale_fill_brewer(palette = "Dark2")


The most used Weapon and weapon types used in Attacks

Now let’s analyze the weapons most often used by the extremists to execute the attacks. We need to create a separate data frame for the arms and weapons used.

# Incidents by known groups (the "Unknown" group removed) with outcome, kills
# and weapon type, sorted by kills per incident, highest first.
# NOTE(review): na.omit() drops a row when any selected column is NA (summary,
# location, ...), not only nkill - confirm the kill totals are not understated.
weapondf <- indiaAttack %>%
  na.omit() %>%
  filter(gname != "Unknown") %>%
  group_by(gname) %>%
  select(success, nkill, weaptype1_txt) %>%
  arrange(desc(nkill))
## Adding missing grouping variables: `gname`
# The 20 groups with the highest total number of kills.
mostkilldf <- weapondf %>%
  group_by(gname) %>%
  summarise(sum_kill = sum(nkill)) %>%
  arrange(desc(sum_kill)) %>%
  top_n(20)
## Selecting by sum_kill
# Total kills per terrorist group, sorted by total.
theme_set(theme_bw())
ggplot(mostkilldf, aes(x = reorder(gname, sum_kill), y = sum_kill)) +
  geom_col(fill = "red") +
  coord_flip() +
  labs(
    x = "Terrorist group",
    y = "total number of kills",
    title = "Plot of total kills vs Terrorist Group"
  )

# Total kills per weapon type, keeping only weapon types with at least one
# kill, highest total first.
weaponKill <- weapondf %>%
  group_by(weaptype1_txt) %>%
  summarise(sum_kill = sum(nkill)) %>%
  filter(sum_kill > 0) %>%
  arrange(desc(sum_kill))
weaponKill
## # A tibble: 5 x 2
##               weaptype1_txt sum_kill
##                       <chr>    <int>
## 1                  Firearms     2271
## 2 Explosives/Bombs/Dynamite     1304
## 3                   Unknown      357
## 4                     Melee      238
## 5                Incendiary       48