# Chosen state: Texas 
# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

# Read the ACLED event export from CSV
ACLED <- read.csv(file = "ACLED_NAm_2020_2022.csv")

# Data cleaning: every cell holding an empty string is relabelled "unknown"
ACLED <- replace(ACLED, ACLED == "", "unknown")

# Parse event_date from day-abbreviated month-2 digit year text into Date
ACLED[["event_date"]] <- as.Date(ACLED[["event_date"]], format = "%d-%b-%y")

# Keep only the protest events recorded for the chosen state (Texas)
TX_Protests <- ACLED %>%
  filter(admin1 == "Texas", event_type == "Protests")

## Percentage of Texas protests that fall under each protest sub-type
Peaceful <- with(
  TX_Protests,
  sum(sub_event_type == "Peaceful protest") / sum(event_type == "Protests") * 100
)

With_Intervention <- with(
  TX_Protests,
  sum(sub_event_type == "Protest with intervention") / sum(event_type == "Protests") * 100
)

## Collect both percentages in a small two-row data frame used for plotting
Protests_by_Type <- data.frame(
  Type = c("Peaceful", "With_Intervention"),
  Freq = c(Peaceful, With_Intervention)
)
# Bar graph: one bar per protest type, bar height taken directly from Freq
ggplot(Protests_by_Type, aes(x = Type, y = Freq, fill = Type)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Bar Graph of Protest Type",
    x = "Protest Type",
    y = "Frequency"
  ) +
  scale_fill_manual(
    values = c("Peaceful" = "blue", "With_Intervention" = "red")
  ) +
  theme(legend.position = "top")

# Pie chart: a single stacked bar drawn in polar coordinates
ggplot(data = Protests_by_Type, mapping = aes(x = "", y = Freq, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y", start = 0) +
  # Label each slice with its type and the value formatted to one decimal place
  geom_text(
    mapping = aes(label = paste0(Type, ": ", sprintf("%.1f%%", Freq))),
    position = position_stack(vjust = 0.5)
  ) +
  scale_fill_manual(
    values = c("Peaceful" = "purple", "With_Intervention" = "red")
  ) +
  labs(title = "Protest Types Distribution") +
  # Strip axes and background, then put the legend below and center the title
  theme_void() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)
  )

# Count of protests per event date, bars filled by associated actor
# (legend hidden)
ggplot(TX_Protests, aes(x = event_date, fill = assoc_actor_1)) +
  geom_bar() +
  theme(legend.position = "none")

# This chart lets us easily compare the counts of protests on different dates and see any patterns or trends over time.
# Filter data: keep protests whose associated actor is exactly one of the two
# groups of interest and whose year is 2020, 2021 or 2022.
# `%in%` tests each row for membership in the set. Comparing with `==` against
# a vector would instead recycle that vector along the rows, so each row would
# be checked against only one element and matching rows would be dropped.
filtered_TX_Protests <- filter(
  TX_Protests,
  assoc_actor_1 %in% c("Women (United States)", "BLM: Black Lives Matter"),
  year %in% c(2020, 2021, 2022)
)

# Stacked bar chart: one bar per year, split by associated actor
ggplot(data = filtered_TX_Protests) +
  geom_bar(mapping = aes(x = factor(year), fill = assoc_actor_1), position = "stack") +
  # Title covers the same years as the filter above
  labs(title = "Count of Protests in 2020 to 2022 in Texas", x = "Year", y = "Count") +
  theme(legend.position = "bottom") +
  scale_fill_manual(values = c("Women (United States)" = "pink", "BLM: Black Lives Matter" = "blue"))

# I chose a stacked bar graph for the associated actors "Women" and "BLM" because it makes it easy to compare the total count of protests for "Women" and "Black Lives Matter" in each year while also showing the individual contribution of each associated actor. With the bars stacked, you can see the overall distribution of protests across the years and easily identify any patterns or trends. It provides a clear visual representation of the data and makes it easy to compare the relative proportions of the two.
# Question 6:
# event_date - numerical, date format. Distribution - Uniform
# event_type - qualitative, categorical, character 
# Type - quantitative
# data_id - quantitative, int  
# iso - quantitative, int. Distribution - uniform
# event_id_cnty - character, categorical, qualitative. Distribution - uniform
# event_id_no_cnty - quantitative, int. Distribution - normal
# year - quantitative, int. Normal
# time_precision - quantitative, int. Distribution - Uniform
# sub_event_type - qualitative, categorical, character. Distribution - skewed
# actor1 - qualitative, categorical, character. Distribution - skewed
# assoc_actor_1 - qualitative, categorical, character. Distribution - skewed
# inter1 - quantitative, int. Distribution - Uniform
# actor2 - qualitative, categorical, character. Distribution - Uniform
# assoc_actor_2 - qualitative, categorical, character. Distribution - Uniform
# inter2 - quantitative, int. Distribution - Uniform
# interaction - quantitative, int. Distribution - Uniform
# region - qualitative, categorical, character. Distribution - Uniform
# country - qualitative, categorical, character. Distribution - Uniform
# admin1 - qualitative, categorical, character. Distribution - Uniform
# admin2 - qualitative, categorical, character. Distribution - right skewed
# admin3 -  unknown
# location - qualitative, categorical, character. Distribution - normal
# latitude - quantitative, num. Distribution - normal
# longitude - quantitative, num. Distribution - normal
# geo_precision - quantitative, int. Distribution - Uniform
# source - qualitative, categorical, character. Distribution - skewed
# source_scale - qualitative, categorical, character. Distribution - skewed
# notes - qualitative, categorical, character. Distribution - skewed
# fatalities - quantitative, int. Distribution - normal
# timestamp - quantitative, int. Distribution - Uniform
# iso3 - qualitative, categorical, character. Distribution - Uniform
# Question 7:
# Print the compact structure (column names, types, first values) of the
# filtered Texas protest data
utils::str(object = TX_Protests)
## 'data.frame':    1403 obs. of  31 variables:
##  $ data_id         : int  8798018 8798214 8798010 8798077 8798102 8798107 8798110 8798152 8798076 8798153 ...
##  $ iso             : int  840 840 840 840 840 840 840 840 840 840 ...
##  $ event_id_cnty   : chr  "USA38349" "USA38209" "USA38321" "USA38281" ...
##  $ event_id_no_cnty: int  38349 38209 38321 38281 38331 38330 38394 38304 38291 38362 ...
##  $ event_date      : Date, format: "2022-01-20" "2022-01-19" ...
##  $ year            : int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ time_precision  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ event_type      : chr  "Protests" "Protests" "Protests" "Protests" ...
##  $ sub_event_type  : chr  "Peaceful protest" "Peaceful protest" "Peaceful protest" "Peaceful protest" ...
##  $ actor1          : chr  "Protesters (United States)" "Protesters (United States)" "Protesters (United States)" "Protesters (United States)" ...
##  $ assoc_actor_1   : chr  "Students (United States)" "BLM757; African American Group (United States)" "Indivisible Movement; Teachers (United States)" "African American Group (United States); Christian Group (United States)" ...
##  $ inter1          : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ actor2          : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ assoc_actor_2   : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ inter2          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ interaction     : int  60 60 60 60 60 60 60 60 60 60 ...
##  $ region          : chr  "North America" "North America" "North America" "North America" ...
##  $ country         : chr  "United States" "United States" "United States" "United States" ...
##  $ admin1          : chr  "Texas" "Texas" "Texas" "Texas" ...
##  $ admin2          : chr  "Williamson" "Tarrant" "Travis" "Dallas" ...
##  $ admin3          : logi  NA NA NA NA NA NA ...
##  $ location        : chr  "Round Rock" "Arlington" "Austin" "Dallas" ...
##  $ latitude        : num  30.5 32.7 30.3 32.8 31.1 ...
##  $ longitude       : num  -97.7 -97.1 -97.7 -96.8 -97.7 ...
##  $ geo_precision   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ source          : chr  "KETK NBC; Fox 44 (Waco); KOLR10; Fox7 (Austin); KUT90.5; KTAB News; Austin American-Statesman; KLST; CBS Austin"| __truncated__ "Youtube" "CBS Austin; KVUE" "Dallas Observer" ...
##  $ source_scale    : chr  "Subnational-National" "New media" "Subnational" "Subnational" ...
##  $ notes           : chr  "On 20 January 2022, around 50 students walked out of class and protested outside their school in Round Rock (Te"| __truncated__ "On 19 January 2022, armed members of BLM757 gathered for a protest in Arlington (Texas) against Child Protectio"| __truncated__ "On 17 January 2022, people gathered in Austin (Texas) to support voting rights in honor of Martin Luther King J"| __truncated__ "On 17 January 2022, people rallied at the AT&T plaza in Dallas (Texas) to support voting rights and to protest "| __truncated__ ...
##  $ fatalities      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ timestamp       : int  1643146210 1643146211 1643146210 1643146210 1643146210 1643146210 1643146210 1643146210 1643146210 1643146210 ...
##  $ iso3            : chr  "USA" "USA" "USA" "USA" ...