The main aim of this analytics project is to uncover hidden trends and previously unknown information about the terrorist attacks that took place around the world from 1970 to 2016: why they happened, where they happened, which countries were most affected, which entities the terrorist groups targeted most often, which terrorist groups were the most violent and active worldwide, and more.
The analysis then drills down into specific countries that have been heavily affected by terrorist activity in the past, such as India, the USA and the UK.
# Analyzing global terrorist activities
require(data.table)
require(readr) #to read data faster
require(ggplot2)
#Data preprocessing and Transformation packages
require(dplyr)
require(tidyr)
# Load the Global Terrorism Database extract.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
terror <- read_csv("F:/globalterrorismdb_0617dist.csv")
# checking the structure of data
# str(terror)
# summary(terror)
# a wide data set with lots of columns
#---------------------
# checking how many terrorist attacks were successful
table(terror$success)
##
## 0 1
## 17649 152701
# Same tabulation as a data frame (Var1 = success flag, Freq = count)
success <- as.data.frame(table(terror$success))
# Replace the 0/1 flag with a readable label for the plot axis
success$Var1 <- ifelse(success$Var1 == 1, "Successful", "Unsuccessful")
names(success) <- c("Attack", "count")
# almost 90% of all attacks were successful (152701 of 170350)
# Title corrected: the data runs through 2016 (the year axes used further
# down in this script extend to 2016), not 2015.
ggplot(success, aes(x = Attack, y = count)) +
  geom_col(width = 0.5, color = "black", fill = "purple", alpha = 0.5) +
  labs(
    x = "Attack",
    y = "Count",
    title = "Distribution of Successful and Unsuccessful attacks from 1970-2016"
  )
Most of the attacks in the past were successfully executed.
Aggregating and summarising data.
table(terror$attacktype1_txt)
##
## Armed Assault Assassination
## 40223 18402
## Bombing/Explosion Facility/Infrastructure Attack
## 83073 9581
## Hijacking Hostage Taking (Barricade Incident)
## 598 902
## Hostage Taking (Kidnapping) Unarmed Assault
## 10233 913
## Unknown
## 6425
# Most common terror attack is Bombing/Explosion
# Successful attacks by attack type, with the percentage of successful and
# unsuccessful attempts for each type.
# Cross-tabulate attack type against the success flag (long format:
# Var1 = attack type, Var2 = success flag, Freq = count)
attackdf <- data.frame(table(terror$attacktype1_txt,
                             terror$success))
# Spread the success flag into one column per outcome ("0", then "1")
attackdf <- attackdf %>% spread(key = Var2, value = Freq)
colnames(attackdf) <- c("Type_of_Attack", "Unsuccessful", "Successful")
# Add the percentage of successful and unsuccessful attacks per type
attackdf <- attackdf %>%
  mutate(
    PerSuccessful = round((Successful / (Unsuccessful + Successful)) * 100, 2),
    PerUnsuccessful = round((Unsuccessful / (Unsuccessful + Successful)) * 100, 2)
  )
attackdf
## Type_of_Attack Unsuccessful Successful
## 1 Armed Assault 1938 38285
## 2 Assassination 4279 14123
## 3 Bombing/Explosion 9520 73553
## 4 Facility/Infrastructure Attack 549 9032
## 5 Hijacking 75 523
## 6 Hostage Taking (Barricade Incident) 8 894
## 7 Hostage Taking (Kidnapping) 232 10001
## 8 Unarmed Assault 130 783
## 9 Unknown 918 5507
## PerSuccessful PerUnsuccessful
## 1 95.18 4.82
## 2 76.75 23.25
## 3 88.54 11.46
## 4 94.27 5.73
## 5 87.46 12.54
## 6 99.11 0.89
## 7 97.73 2.27
## 8 85.76 14.24
## 9 85.71 14.29
# Plotting: a single stacked bar whose segments are the successful-attack
# counts per type
plot <- ggplot(attackdf, aes(x = "", y = Successful, fill = Type_of_Attack)) +
  geom_bar(width = 1, stat = "identity") +
  theme(axis.line = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  labs(title = "Pie Chart of Count of the Types of Successful Attacks",
       x = NULL, y = NULL, fill = "Type of Attack")
# Final plot: polar coordinates turn the stacked bar into a pie chart.
# The slices are mapped to `fill`, so the palette has to be a fill scale;
# the colour scale used previously had no effect on this plot.
plot + coord_polar(theta = "y", start = 0) + scale_fill_brewer(palette = "Set1")
# Bar plot of the success percentage per attack type, highest at the top
theme_set(theme_grey())
ggplot(attackdf,
       aes(x = reorder(Type_of_Attack, PerSuccessful), y = PerSuccessful)) +
  geom_col(color = "black", fill = "red", alpha = 0.6) +
  coord_flip() +
  labs(title = "Barplot of Types of Attacks and Successful attacks",
       x = "Type of Attack", y = "Percentage of Successful Attacks")
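As the plot shows, the attack types with the highest success rates were hostage takings (barricade incidents and kidnappings) and armed assaults; bombings, the most common attack type, also succeeded in most cases.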
# Cross-tabulate the success flag against country (long format:
# Var1 = success flag, Var2 = country, Freq = count)
countrydf <- data.frame(table(terror$success, terror$country_txt))
# Spreading the data frame: one row per country, one column per outcome
countrydf <- countrydf %>% spread(Var1, Freq)
# renaming the columns
names(countrydf) <- c("Country", "Unsuccessful", "Successful")
countrydf <- countrydf %>% mutate(TotalAttacks = (Successful + Unsuccessful))
# Top 20 countries by successfully executed terror attacks.
# countrydf already has one row per country, so the per-country mean is just
# that country's count; the ranking column is named explicitly instead of
# relying on top_n() picking the last column.
Topcountrydf <- countrydf %>%
  group_by(Country) %>%
  summarise(Mean_Successfull = mean(Successful)) %>%
  top_n(20, Mean_Successfull) %>%
  arrange(desc(Mean_Successfull))
# Top 20 countries by unsuccessful terror attack attempts
TopFailsAttacks <- countrydf %>%
  group_by(Country) %>%
  summarise(Mean_Unsuccessfull = mean(Unsuccessful)) %>%
  top_n(20, Mean_Unsuccessfull) %>%
  arrange(desc(Mean_Unsuccessfull))
# Countries with few terrorist attacks: between 20 and 100 in total,
# fewest first.
# The earlier version applied top_n(30) while still grouped by Country; with
# one row per group that call could never drop a row, so it has been removed
# without changing the resulting rows.
LeastAttackdf <- countrydf %>%
  filter(TotalAttacks >= 20, TotalAttacks <= 100) %>%
  select(Country, TotalAttacks) %>%
  arrange(TotalAttacks)
# New Zealand, Malaysia, Hong Kong, Cuba, UAE etc. are countries with very few
# terrorist attacks (fewer than 100 attempts).
# Plotting the bar plots
theme_set(theme_classic())
ggplot(Topcountrydf,
       aes(x = reorder(Country, Mean_Successfull), y = Mean_Successfull)) +
  geom_col(color = "black", fill = "#1111F7") +
  coord_flip() +
  labs(x = "Countries", y = "Successful Terrorist Attacks from 1970-2016")
# Countries with most unsuccessful terror attacks
theme_set(theme_bw())
# geom_col() has no `stat` argument (it always plots the values as given), so
# the argument that triggered "Ignoring unknown parameters: stat" is gone.
ggplot(TopFailsAttacks,
       aes(x = reorder(Country, Mean_Unsuccessfull), y = Mean_Unsuccessfull)) +
  geom_col(color = "black", fill = "#DEFD16") +
  coord_flip() +
  labs(x = "Countries", y = "Unsuccessful Terrorist Attacks from 1970-2016") +
  scale_y_continuous(limits = c(0, 2100), breaks = seq(0, 2100, 200))
# Countries with least terrorist activities: of the 20-100 band above, keep
# those with more than 50 attacks (the grouped top_n(20) that followed was a
# no-op for the same reason as above).
leastFilter <- LeastAttackdf %>%
  filter(TotalAttacks > 50)
# Bar plot for countries having attacks > 50
theme_set(theme_bw())
ggplot(leastFilter,
       aes(x = reorder(Country, -TotalAttacks), y = TotalAttacks)) +
  geom_col(color = "black", fill = "#DEFD16") +
  coord_flip() +
  labs(x = "Countries with Least Terrorist Attacks",
       y = "Total Terrorist Attacks from 1970-2016")
# Tally incidents per target category (Var1 = category, Freq = count),
# most frequently hit category first
worldTargetdf <- arrange(
  data.frame(table(terror$targtype1_txt)),
  desc(Freq)
)
theme_set(theme_bw())
# Horizontal bar chart of the tallies, bars ordered by count
ggplot(data = worldTargetdf, mapping = aes(y = Freq, x = reorder(Var1, Freq))) +
  geom_col(fill = "purple") +
  coord_flip()
As the plot above shows, the most frequently targeted entities were citizens, followed by the military, police, government and businesses.
# The two countries examined in detail
countryList <- c("United States", "United Kingdom")
# Keep only incidents recorded in those countries, then narrow the data down
# to the columns used by the country-level analysis
SpecificTargdf <- terror %>%
  filter(country_txt %in% countryList) %>%
  select(
    success, target1, targtype1_txt, attacktype1_txt,
    country_txt, city, iyear, gname, nkill
  )
The data frame above contains only attacks in two major developed countries, the United States of America and the United Kingdom, both of which were in the top 20 list of countries with the most successfully executed terrorist attacks.
Creating a data frame with only some targeted variables.
# US incidents only.
# na.omit() removes every row that has an NA in any of the selected columns,
# so everything below is based on complete rows only.
# NOTE(review): confirm that dropping rows for an NA in e.g. target1 or nkill
# is intended when counting attacks per city.
UStarg <- filter(na.omit(SpecificTargdf), country_txt == "United States")
# Distribution of fatalities per incident
summary(UStarg$nkill)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.379 0.000 1383.000
# City x success-flag counts, widened to one row per city
uscitydf <- spread(
  data.frame(table(UStarg$city, UStarg$success)),
  Var2, Freq
)
names(uscitydf) <- c("city", "unsuccessful", "success")
# The 30 cities with the most successful attacks, largest first
uscitysuccess <- uscitydf %>%
  group_by(city) %>%
  summarise(success = mean(success)) %>%
  arrange(desc(success)) %>%
  top_n(30)
## Selecting by success
# most attacks in New York
theme_set(theme_minimal())
ggplot(data = uscitysuccess, mapping = aes(y = success, x = reorder(city, success))) +
  geom_col(fill = "#131D75") +
  coord_flip() +
  labs(y = "Number of successfull attacks", x = "City")
Creating a dataframe with year and number of attacks.
theme_set(theme_bw())
# Cross-tabulate year against the success flag for the US incidents, then
# widen to one row per year with a column per outcome
yearsuccess <- data.frame(table(UStarg$iyear, UStarg$success))
yearsuccess <- yearsuccess %>% spread(Var2, Freq)
names(yearsuccess) <- c("year", "fail", "success")
# Numeric year column for the x axis.
# table() returns the years as factor labels, so convert those labels back to
# numbers. This replaces a hard-coded 1970-1992 / 1994-2016 sequence that was
# assigned as a nested data frame and would break (or mislabel rows) whenever
# the set of years present in the data differed from it.
year <- data.frame(year = as.numeric(as.character(yearsuccess$year)))
# adding the numeric years to the yearsuccess data frame
yearsuccess$attack <- year$year
# Time series of successful attacks.
# Note: this reuses the name `success`, replacing the data frame of that name
# built earlier in the script.
success <- ggplot(data = yearsuccess, aes(x = attack, y = success)) +
  geom_point(color = "#E80110", size = 2) +
  geom_line(color = "#E80110") +
  scale_x_continuous(limits = c(1970, 2016), breaks = seq(1970, 2016, 4)) +
  labs(x = "Year", y = "Number of successful attacks",
       title = "Time series of attacks in USA")
success
# Time series of failed attacks
fail <- ggplot(data = yearsuccess, aes(x = attack, y = fail)) +
  geom_point(color = "#E18001", size = 2) +
  geom_line(color = "#E18001") +
  scale_x_continuous(limits = c(1970, 2016), breaks = seq(1970, 2016, 4)) +
  labs(x = "Year", y = "Number of failed attacks",
       title = "Time series of failed attacks in USA")
fail
I will create a new data frame for India by filtering and selecting only the relevant columns from the original data frame.
# All the terrorist attacks in INDIA
indiaAttack <- terror %>%
  filter(country_txt == "India") %>%
  select(iyear, provstate, city, summary, location, success, attacktype1_txt,
         targtype1_txt, gname, weaptype1_txt, nkill)
# Which state has most attacks
table(indiaAttack$provstate)
##
## Andhra pradesh Andhra Pradesh Arunachal Pradesh Assam
## 41 234 17 1120
## Bihar Chandigarh Chhattisgarh Delhi
## 633 47 835 208
## Goa Gujarat Haryana Himachal Pradesh
## 5 81 47 24
## Jammu and Kashmir Jharkhand Karnataka Kerala
## 2197 801 67 78
## Madhya Pradesh Maharashtra Manipur Meghalaya
## 72 280 1011 259
## Mizoram Nagaland Odisha Orissa
## 26 106 374 221
## Puducherry Punjab Rajasthan Sikkim
## 2 939 42 4
## Tamil Nadu Telangana Tripura Unknown
## 158 16 114 155
## Uttar Pradesh Uttaranchal West Bengal
## 180 23 561
# The tabulation above shows two states recorded under two spellings each
# ("Andhra pradesh" / "Andhra Pradesh" and "Odisha" / "Orissa"), which splits
# their counts across two rows. Merge them before any per-state analysis:
# Andhra Pradesh becomes 41 + 234 = 275 and Odisha 374 + 221 = 595.
indiaAttack <- indiaAttack %>%
  mutate(provstate = recode(provstate,
                            "Andhra pradesh" = "Andhra Pradesh",
                            "Orissa" = "Odisha"))
# State x success-flag counts, widened to one row per state
indiastate <- data.frame(table(indiaAttack$provstate,
                               indiaAttack$success))
indiastate <- indiastate %>% spread(Var2, Freq)
names(indiastate) <- c("state", "Unsuccessful", "successful")
# plotting states and number of successful attacks
ggplot(indiastate, aes(x = reorder(state, successful), y = successful)) +
  geom_col(color = "black", fill = "blue", alpha = 0.6) +
  coord_flip() +
  scale_y_continuous(limits = c(0, 2000), breaks = seq(0, 2000, 400)) +
  labs(x = "States", y = "Successful terrorist attacks")
# most successful terrorist attacks in Jammu and Kashmir
# We have Punjab, J and K and Assam on the top with most successful attacks
# number of kills
# Only rows without a fatality count are excluded. The earlier na.omit() on
# the whole data frame also discarded every incident with an NA in an
# unrelated column such as `summary` or `location`.
# geom_col() stacks the incidents, so each bar is the state's total; order the
# bars by that total (reorder() would otherwise use the per-incident mean).
# NOTE(review): re-check any conclusion drawn from the earlier version of
# this plot against the corrected one.
ggplot(filter(indiaAttack, !is.na(nkill)),
       aes(x = reorder(provstate, nkill, FUN = sum), y = nkill)) +
  geom_col(fill = "#E63B10") +
  coord_flip() +
  labs(x = "State", y = "Number of Kills from 1970-2016")
# Number of attacks per state. geom_bar() counts rows, so order the bars by
# that count rather than by mean fatalities.
ggplot(indiaAttack,
       aes(x = reorder(provstate, provstate, FUN = length))) +
  geom_bar(fill = "green", color = "black") +
  coord_flip()
Punjab, Jammu and Kashmir (J&K) and Assam top the list for successful terrorist attacks, and the largest numbers of fatalities from terrorist activity occurred in Chhattisgarh, Assam and J&K.
# Cities with the most terrorist attacks
# City x success-flag counts, widened to one row per city
citydf <- spread(
  data.frame(table(indiaAttack$city, indiaAttack$success)),
  Var2, Freq
)
names(citydf) <- c("city", "unsuccess", "success")
# The 25 cities with the highest count of successful attacks, largest first
cityTopsuccess <- citydf %>%
  group_by(city) %>%
  summarise(success = mean(success)) %>%
  arrange(desc(success)) %>%
  top_n(25)
## Selecting by success
# Horizontal bar chart of those cities
ggplot(data = cityTopsuccess, mapping = aes(y = success, x = reorder(city, success))) +
  geom_col(fill = "#FE3C01") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 600, 100), limits = c(0, 600)) +
  labs(y = "Count of Successful terrorist attacks", x = "city name")
# Attack type x success-flag counts for India (long format)
india_typeAttack <- data.frame(
  table(
    attacktype1_txt = indiaAttack$attacktype1_txt,
    success = indiaAttack$success
  )
)
# Widen: one row per attack type, columns "0" (failed) and "1" (successful)
india_typeAttack <- spread(india_typeAttack, success, Freq)
india_typeAttack
## attacktype1_txt 0 1
## 1 Armed Assault 194 2767
## 2 Assassination 204 954
## 3 Bombing/Explosion 1033 3483
## 4 Facility/Infrastructure Attack 23 770
## 5 Hijacking 4 33
## 6 Hostage Taking (Barricade Incident) 1 27
## 7 Hostage Taking (Kidnapping) 15 1060
## 8 Unarmed Assault 1 106
## 9 Unknown 23 280
# Bombings are the most frequent attack type in India, followed by armed
# assaults.
names(india_typeAttack) <- c("type", "fail", "success")
# Success and failure rates (in percent) per attack type
india_typeAttack <- mutate(
  india_typeAttack,
  successRate = round((success / (success + fail)) * 100, 2),
  failRate = round((fail / (success + fail)) * 100, 2)
)
# Bar plot of the success rate per attack type
ggplot(data = india_typeAttack, mapping = aes(x = reorder(type, successRate), y = successRate)) +
  geom_col(fill = "#0000A4", alpha = 0.7, width = 0.8) +
  coord_flip() +
  labs(y = "Success Rate of Attacks(in %)", x = "Type of attack")
Surprisingly, although Bombings/Explosions have the highest count of all attack types, their success rate is comparatively low. The attack type with the highest success rate is unarmed assault.
# Target type x success-flag counts for India (long format)
target_india <- data.frame(
  table(
    targtype1_txt = indiaAttack$targtype1_txt,
    success = indiaAttack$success
  )
)
# Widen: one row per target type, columns "0" (failed) and "1" (successful)
target_india <- spread(target_india, success, Freq)
names(target_india) <- c("target", "fail", "success")
# Total attacks plus success / failure rates (in percent) per target type
target_india <- mutate(
  target_india,
  total = fail + success,
  successRate = round((success / (success + fail)) * 100, 2),
  failRate = round((fail / (success + fail)) * 100, 2)
)
# Bar plot of the total number of attacks per target type
ggplot(data = target_india, mapping = aes(y = total, x = reorder(target, total))) +
  geom_bar(fill = "#FE0202", color = "white", stat = "identity") +
  coord_flip() +
  labs(y = "Total number of terror attacks", x = "Target Type ")
# Most attacks were aimed at private citizens and property, followed by
# police and government
#--------- Targets with the highest success rate ----------#
theme_set(theme_classic())
ggplot(data = target_india, mapping = aes(y = successRate, x = reorder(target, successRate))) +
  geom_col(fill = "#F22424", alpha = 0.7, width = 0.7) +
  coord_flip() +
  labs(y = "% of Successul Attacks", x = "Target of Terror attacks")
I will create a data frame consisting of the year and the total number of kills for that particular year.
# Total fatalities per year in India: group by year and sum the kills.
# Only rows without a fatality count are excluded. The earlier na.omit() on
# the whole data frame also discarded every incident with an NA in an
# unrelated column such as `summary` or `location`, so the yearly totals were
# computed from a subset of the incidents.
yearkillsIndia <- indiaAttack %>%
  filter(!is.na(nkill)) %>%
  group_by(iyear) %>%
  summarise(nkills = sum(nkill))
# Time series analysis (years outside the 1975-2016 axis limits are not drawn)
theme_set(theme_bw())
ggplot(yearkillsIndia, aes(x = iyear, y = nkills)) +
  geom_point(color = "purple", size = 2) +
  geom_line(color = "#9124F2") +
  scale_x_continuous(limits = c(1975, 2016), breaks = seq(1975, 2016, 4)) +
  labs(x = "Year", y = "number of Kills")
# NOTE(review): the original run reported the highest number of kills in
# 2010, but that was based on the na.omit() subset; re-check the peak year
# against the corrected totals.
# another time series chart using dygraphs
# library() stops with a clear error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(dygraphs)
dygraph(yearkillsIndia) %>%
  dyHighlight(highlightCircleSize = 5,
              highlightSeriesBackgroundAlpha = 0.2,
              hideOnMouseOut = FALSE)
The above time series plot shows that the highest number of kills were in year 2010.
Creating a new data frame which is grouped by the group name.
# India incidents grouped by perpetrator group. gname is selected explicitly
# instead of relying on select() re-adding the grouping column.
gnamedf <- indiaAttack %>%
  group_by(gname) %>%
  select(gname, success, attacktype1_txt, targtype1_txt, nkill, iyear,
         provstate, city)
# Counts for every group x success flag x attack type combination
# (Var1 = group, Var2 = success flag, Var3 = attack type)
mostSuccessGroup <- as.data.frame(table(gnamedf$gname, gnamedf$success,
                                        gnamedf$attacktype1_txt))
# spreading the data frame: one row per group and attack type
mostSuccessGroup <- mostSuccessGroup %>% spread(Var2, Freq)
names(mostSuccessGroup) <- c("gname", "attacktype", "Unsuccessful", "success")
SuccessGroup <- mostSuccessGroup %>%
  select(gname, success, Unsuccessful, attacktype) %>%
  arrange(desc(success))
# Remove unknown groups and keep only group / attack-type combinations with
# more than 20 successful attacks (the threshold applies to each row, i.e. to
# each attack type of a group, not to the group's overall total).
SuccessGroup <- SuccessGroup %>% filter(gname != "Unknown", success > 20)
# Bar plot of groups vs number of successful attacks
theme_set(theme_bw())
# The bars stack a group's attack types, so their length is the group's
# total. Order by that total: reorder() defaults to the mean, which ranked
# groups differently from the bar lengths shown.
p <- ggplot(SuccessGroup,
            aes(x = reorder(gname, success, FUN = sum), y = success)) +
  geom_col(aes(fill = attacktype)) +
  coord_flip() +
  labs(x = "Group Name", y = "Number of Successful attacks",
       title = "Groups and their successful attacks",
       fill = "Type of attack") +
  scale_y_continuous(breaks = seq(0, 2000, 200))
p + scale_fill_brewer(palette = "Dark2")
Now let's analyze the weapons most used by the extremists to execute the attacks. We need to create a separate data frame for the arms and weapons used.
# Incidents by known groups, grouped by group name, deadliest first.
# Only rows without a fatality count are excluded. The earlier na.omit() on
# the whole data frame also discarded every incident with an NA in an
# unrelated column such as `summary` or `location`, so the kill totals below
# were computed from a subset of the incidents.
weapondf <- indiaAttack %>%
  filter(!is.na(nkill), gname != "Unknown") %>%
  group_by(gname) %>%
  select(gname, success, nkill, weaptype1_txt) %>%
  arrange(desc(nkill))
# The 20 groups with the most kills in total
mostkilldf <- weapondf %>%
  group_by(gname) %>%
  summarise(sum_kill = sum(nkill)) %>%
  arrange(desc(sum_kill)) %>%
  top_n(20, sum_kill)
# plot of most kills by which terrorist group
theme_set(theme_bw())
ggplot(mostkilldf, aes(x = reorder(gname, sum_kill), y = sum_kill)) +
  geom_col(fill = "red") +
  coord_flip() +
  labs(x = "Terrorist group", y = "total number of kills",
       title = "Plot of total kills vs Terrorist Group")
# Kills per weapon type (attacks by known groups only), deadliest first;
# weapon types with no recorded kills are left out
weaponKill <- weapondf %>%
  group_by(weaptype1_txt) %>%
  summarise(sum_kill = sum(nkill)) %>%
  filter(sum_kill > 0) %>%
  arrange(desc(sum_kill))
weaponKill
# Output recorded from the original run, i.e. before the NA handling above was
# corrected; re-run to refresh these figures:
## # A tibble: 5 x 2
## weaptype1_txt sum_kill
## <chr> <int>
## 1 Firearms 2271
## 2 Explosives/Bombs/Dynamite 1304
## 3 Unknown 357
## 4 Melee 238
## 5 Incendiary 48