Import Libraries and Data Set

#Data Cleaning
library(DT)
library(highcharter)
library(dplyr)
library(car)
library(visdat)
library(lubridate)

# timeseries
library(forecast)
library(tseries)
library(imputeTS)

# heatmap
library(ggplot2)
library(sf)
library(rvest)
library(plotly)
library(viridis)
library(ggrepel)

#WordCloud
library(ggwordcloud)

  #conflict= read.csv('File Location')

Data Cleaning and Manipulation

All blank values were converted to NA. A `vis_miss` plot helped in understanding where the NA values were. The columns with the most NA values were discarded, and the remaining rows containing NA values were removed. This helped in effectively eliminating missing data. Dates were reformatted into year (YYYY) and month-year (Mon YYYY) formats for the analysis. Ages were converted to age groups. The data contained city-wise locations, which were changed to district/region-wise locations for the heat maps.

# Mark every empty-string cell as NA so it is counted as missing below
blank_cells <- conflict == ""
conflict[blank_cells] <- NA

# Visual overview of missingness before any rows or columns are removed
vis_miss(conflict)

# Optional NA diagnostics (run interactively when needed)
# sum(is.na(conflict))
# colSums(is.na(conflict))
# rowSums(is.na(conflict))

# Drop the ammunition and took_part_in_the_hostilities columns
conflict <- select(conflict, -c(ammunition, took_part_in_the_hostilities))

# Keep only the rows that have no NA in any remaining column
rows_without_na <- complete.cases(conflict)
conflict <- conflict[rows_without_na, ]

# Missingness overview after cleaning
vis_miss(conflict)

# Total values omitted  457

# Parse the raw event dates once, then derive the two labels used later on:
#   DESIRED_FORMAT  - four-digit year ("%Y")
#   DESIRED_FORMAT2 - abbreviated month name plus year ("%b %Y")
# NOTE(review): three candidate orders are offered to the parser, which is
# ambiguous for values such as 03/04/2014 - confirm the layout that
# date_of_event really uses.
df <- conflict %>%
  mutate(
    NEW_FORMAT = parse_date_time(
      date_of_event,
      orders = c("mdy", "dmy", "ymd")
    ),
    DESIRED_FORMAT = format(NEW_FORMAT, "%Y")
  )

# Month-year label. NEW_FORMAT already exists in df, so it is reused here
# instead of parsing date_of_event a second time.
df2 <- df %>%
  mutate(DESIRED_FORMAT2 = format(NEW_FORMAT, "%b %Y"))

# Split age into four ordered groups. cut() uses right-closed intervals,
# so 18 falls in "0-18", 36 in "19-36" and 54 in "37-54"; the result is
# already a factor with the levels in this order.
# NOTE(review): assumes age is numeric - confirm against the loaded data.
df3 <- df2 %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(-Inf, 18, 36, 54, Inf),
      labels = c("0-18", "19-36", "37-54", "> 54")
    )
  )

# Drop the intermediate parsed date-time column; the formatted year and
# month-year columns are kept
df_clean <- select(df3, -NEW_FORMAT)
 

# Exporting a clean excel file for future reference

 #library(writexl)
  #write_xlsx( df_clean,"C:/Users/MyDevice/desiredtitle.xlsx")

Graphical Insights

# Each chart below counts rows per category and hands the summary to
# highcharter. Where a fixed palette is used, the colours are attached as a
# column first: rep_len() recycles or trims the palette to the number of
# categories, so a chart still renders if that number differs from the
# palette length.

# Gender pie chart
custom_colors <- c("#454568", "#785956")

df_clean %>%
  group_by(gender) %>%
  summarise(count = n()) %>%
  mutate(color = rep_len(custom_colors, n())) %>%
  hchart("pie", hcaes(x = gender, y = count, color = color)) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_tooltip(pointFormat = "<b> Count </b>: {point.y} <br>") %>%
  hc_title(text = "Genderwise Count")

# Age group bar chart (bars coloured by their count)
df_clean %>%
  group_by(age_group) %>%
  summarise(count = n()) %>%
  hchart("column", hcaes(x = age_group, y = count, color = count)) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_tooltip(pointFormat = "<b> Count </b>: {point.y} <br>") %>%
  hc_title(text = "Age Groups")

# Citizenship bar chart
custom_colors <- c("#005EB8", "#000000")

df_clean %>%
  group_by(citizenship) %>%
  summarise(count = n()) %>%
  mutate(color = rep_len(custom_colors, n())) %>%
  hchart("column", hcaes(x = citizenship, y = count, color = color)) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_tooltip(pointFormat = "<b> Count </b>: {point.y} <br>") %>%
  hc_title(text = "Countrywise Citizenship Count")

# Killed-by bar chart
custom_colors <- c("#EE2A35", "#005EB8", "#000000")

df_clean %>%
  group_by(killed_by) %>%
  summarise(count = n()) %>%
  mutate(color = rep_len(custom_colors, n())) %>%
  hchart("column", hcaes(x = killed_by, y = count, color = color)) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_tooltip(pointFormat = "<b> Count </b>: {point.y} <br>") %>%
  hc_title(text = "Killed By Counts")

# Year-wise deaths bar chart (bars coloured by their count)
df_clean %>%
  group_by(DESIRED_FORMAT) %>%
  summarise(count = n()) %>%
  hchart("column", hcaes(x = DESIRED_FORMAT, y = count, color = count)) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_tooltip(pointFormat = "<b> Count </b>: {point.y} <br>") %>%
  hc_title(text = "Year Wise Count of Deaths")

#h1= df_clean%>%
 # group_by(type_of_injury) %>%
  #summarise(count = n()) %>%
  #arrange(desc(count)) %>%  
  #head(5) %>%  
  #hchart('pie', hcaes(x =type_of_injury, y = count, color = count)) %>%
  #hc_add_theme(hc_theme_google()) %>%
  #hc_tooltip(pointFormat = '<b> Count </b>: {point.y} <br>') %>%  
  #hc_title(text = "Type of Injuries")

# h1

# Word cloud of injury types: each word is a type_of_injury value, sized and
# coloured by how many rows carry it
# Seed is set before plotting - presumably so the word layout is
# reproducible between runs
set.seed(1000)
w <- df_clean %>%
  group_by(type_of_injury) %>%
  summarise(count = n())

# Columns are referenced by bare name inside aes(); ggplot2 looks them up in
# `w`, which also gives the legends clean titles
ggplot(w, aes(label = type_of_injury, size = count, color = count)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 50) +
  theme_minimal() +
  scale_color_gradient(low = "darkred", high = "maroon")

Heat Maps

Shapefiles contain the coordinates and boundaries of geographical locations and are used to plot heat maps. First, we select the required variables and merge them with the shapefile. Then we plot a heat map using ggplot and plotly. If you hover your cursor over the white border of any district/region, you can see the total death count, the count of Palestinians dead in Palestine, and the count of Israelis dead in Palestine (and similarly for Israel). This condenses what would otherwise require three heat maps per country into a single heat map.

  #is.sh1= read_sf('Israel Shape File Location')
  #pa.sh1= read_sf('Palestine Shape File Location')
### PALESTINE DEATHS

# Per-district totals plus the citizenship breakdown, computed in a single
# summary so that every count stays attached to its own district. Building
# the breakdowns as separately merged tables and pasting their count columns
# together row-by-row would misalign the tooltip whenever a district has no
# rows for one citizenship.
# NOTE(review): "Palestina" and "Israeli" are the citizenship labels this
# analysis filters on - confirm they match the values in
# df_clean$citizenship.
prw <- df_clean %>%
  group_by(event_location_region, event_location_district) %>%
  summarise(
    count = n(),
    palestinian_count = sum(citizenship == "Palestina"),
    israeli_count = sum(citizenship == "Israeli"),
    .groups = "drop"
  ) %>%
  rename(NAME_2 = event_location_district)

# Merge the Palestine shape file with the summary table on district name
# (NAME_2); districts present on only one side are dropped by merge()
map.prw <- merge(pa.sh1, prw, by = "NAME_2")

# Centroid coordinates (columns X and Y) for placing the district labels
points.prw <- cbind(map.prw, st_coordinates(st_centroid(map.prw$geometry)))

# Hover text, one entry per row of map.prw
info <- paste0(
  "\nName:", map.prw$NAME_2,
  "\nTotal Death Count: ", map.prw$count,
  "\nPalestinians dead in Palestine : ", map.prw$palestinian_count,
  "\nIsraelis dead in Palestine : ", map.prw$israeli_count
)
map.prw$info <- info

# Choropleth of total deaths per district. The label aesthetic on geom_sf
# carries the hover text through to plotly.
pa.pd <- ggplot(map.prw) +
  geom_sf(aes(fill = count, label = info), color = "white", size = 1) +
  ggtitle("Deaths in Palestine") +
  geom_text(
    data = points.prw,
    aes(x = X, y = Y, label = NAME_2),
    color = "Red", vjust = -2
  ) +
  scale_fill_viridis_c(option = "A", trans = "sqrt") +
  labs(x = "Longitude", y = "Latitude") +
  theme(plot.margin = margin(1, 1, 1, 1, "cm"))

fig1 <- ggplotly(pa.pd) %>%
  layout(autosize = FALSE, width = 750, height = 750)
fig1

# ISRAEL DEATHS

# Totals and citizenship breakdown per location inside the "Israel" region,
# computed in one summary so the three counts shown in the tooltip always
# belong to the same row.
# NOTE(review): "Palestina" and "Israeli" are the citizenship labels this
# analysis filters on - confirm they match the values in
# df_clean$citizenship.
irw <- df_clean %>%
  filter(event_location_region == "Israel") %>%
  group_by(event_location, event_location_region) %>%
  summarise(
    count = n(),
    israeli_count = sum(citizenship == "Israeli"),
    palestinian_count = sum(citizenship == "Palestina"),
    .groups = "drop"
  ) %>%
  rename(NAME_1 = event_location)

# Merge the Israel shape file with the summary table on NAME_1; locations
# present on only one side are dropped by merge()
map.irw <- merge(is.sh1, irw, by = "NAME_1")

# Centroid coordinates (columns X and Y) for placing the labels
points.irw <- cbind(map.irw, st_coordinates(st_centroid(map.irw$geometry)))

# Hover text, one entry per row of map.irw
info1 <- paste0(
  "\nName: ", map.irw$NAME_1,
  "\nTotal Death Count: ", map.irw$count,
  "\nIsraeli dead in Israel: ", map.irw$israeli_count,
  "\nPalestinians dead in Israel: ", map.irw$palestinian_count
)
map.irw$info1 <- info1

# Choropleth of total deaths. The label aesthetic on geom_sf carries the
# hover text through to plotly.
# NOTE(review): the nudge vectors hold six values, so they assume exactly
# six labelled rows in points.irw - confirm against the merged data.
itd <- ggplot(map.irw) +
  geom_sf(aes(fill = count, label = info1), color = "white", size = 5) +
  scale_fill_viridis(option = "B", trans = "sqrt") +
  ggtitle("Israeli Deaths") +
  geom_text(
    data = points.irw,
    aes(x = X, y = Y, label = NAME_1),
    nudge_x = c(0, -0.4, 0.6, 0, 0.3, -0.4),
    nudge_y = c(0, 0, 0, 0, 0, 0),
    color = "Red"
  ) +
  labs(x = "Longitude", y = "Latitude") +
  theme(
    panel.background = element_rect(
      fill = "grey", size = 0.5,
      linetype = 0, color = "black"
    ),
    plot.margin = margin(1, 1, 1, 1, "cm")
  )

fig <- ggplotly(itd) %>%
  layout(autosize = FALSE, width = 750, height = 750)
fig

Time Series

For the time series analysis, the dates had to be arranged in ascending order. Some months were found to be missing, so their counts were imputed using a Kalman filter. A time series plot, an ADF test, and ACF and PACF plots were produced, followed by forecasting.

# Monthly death counts in ascending date order.
# The "Mon YYYY" label is converted to the first day of that month BEFORE
# grouping, so the grouping column is never modified while grouped (older
# dplyr versions reject that). summarise() returns the groups sorted by
# date, so no separate arrange() is needed.
yrwise <- df_clean %>%
  mutate(
    DESIRED_FORMAT2 = as.Date(
      paste0(DESIRED_FORMAT2, " 01"),
      format = "%b %Y %d"
    )
  ) %>%
  # Rows whose month could not be parsed would break min()/seq() below
  filter(!is.na(DESIRED_FORMAT2)) %>%
  group_by(DESIRED_FORMAT2) %>%
  summarise(count = n())

# To deal with missing months

# First and last month present in the data
min_date <- min(yrwise$DESIRED_FORMAT2)
max_date <- max(yrwise$DESIRED_FORMAT2)

# Complete monthly sequence between them
complete_dates <- seq(from = min_date, to = max_date, by = "month")
complete_dates_df <- data.frame(DESIRED_FORMAT2 = complete_dates)

# Full outer merge: months absent from the data appear with count = NA
yrwise <- merge(yrwise, complete_dates_df, by = "DESIRED_FORMAT2", all = TRUE)

# Fill the NA counts of the newly added months with Kalman-filter
# imputation. Only the count column is passed, since the date column has
# nothing to impute.
# NOTE(review): months with no rows are imputed rather than set to zero -
# confirm this is the intended treatment.
yrwise$count <- na_kalman(yrwise$count)

# Monthly time series of death counts.
# The start is taken from the first month in yrwise and the end follows from
# the number of observations, so the series cannot be silently truncated or
# recycled if the data span changes. For the data this analysis was written
# against, that span is Oct 2000 to Sep 2023.
series_start <- min(yrwise$DESIRED_FORMAT2)
mts <- ts(
  yrwise$count,
  start = c(year(series_start), month(series_start)),
  frequency = 12
)

# Time series plot with a fitted linear trend line
ts.plot(mts, main = "Time Series Plot", xlab = "Time", ylab = "Count of Deaths")

abline(reg = lm(mts ~ time(mts)))

# Augmented Dickey-Fuller test to check stationarity
adf.test(mts)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  mts
## Dickey-Fuller = -6.3336, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary

# Automatic ARIMA order selection
arima_model <- auto.arima(mts)

print(arima_model)

## Series: mts 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##          ma1     mean
##       0.3773  39.9151
## s.e.  0.0575   9.4919
## 
## sigma^2 = 13231:  log likelihood = -1700.36
## AIC=3406.73   AICc=3406.82   BIC=3417.59

# Autocorrelation and partial autocorrelation plots up to 12 lags
acf_result <- acf(mts,
  lag.max = 12, main = "Autocorrelation Function",
  xlab = "Lag", ylab = "ACF"
)

pacf_result <- pacf(mts,
  lag.max = 12, main = "Partial Autocorrelation Function",
  xlab = "Lag", ylab = "PACF"
)

# Forecast the next 12 months from the fitted ARIMA model. Passing the raw
# series instead would let forecast() choose its own default model, which
# would not match the "ARIMA Forecast" title.
forecast_values <- forecast(arima_model, h = 12)

plot(forecast_values,
  main = "ARIMA Forecast",
  xlab = "Time", ylab = "Count of Deaths"
)

Data Set

# Interactive table of the cleaned data, included for the reader's reference
table_options <- list(
  scrollX = TRUE,        # allow horizontal scrolling for wide tables
  scrollY = "500px",     # height of the vertical scroll viewport
  scrollCollapse = TRUE  # let the viewport shrink when few rows are shown
)
datatable(df_clean, options = table_options)

You can go through the detailed report of the analysis at: Israeli-Palestinian Conflict Report