# ASOC708, CODING CHALLENGE 5
# Author:  Heeyoung
# Date:    20220921

### PART 1: LOADING THE DATA ###

# Load the tidyverse and lubridate packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.1
## v readr   2.1.2     v forcats 0.5.1
## Warning: 패키지 'ggplot2'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tibble'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tidyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'stringr'는 R 버전 4.1.3에서 작성되었습니다
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## Warning: 패키지 'lubridate'는 R 버전 4.1.2에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Set your working directory to wherever you put the data
# Print the directory the session started in (for reference only).
getwd()
## [1] "C:/Users/distr/OneDrive/R FILE/asoc708/Coding Challenges"
# NOTE(review): the project root below is a machine-specific absolute path;
# setwd() will fail on any computer where this directory does not exist.
project_dir <- "C:/Users/distr/OneDrive/R FILE/asoc708"
setwd(project_dir)
# Read the entire corpus of SOTU addresses into "speeches" as one string.
corpus_path <- "./Data/stateoftheunion1790-2022.txt"
speeches <- read_file(corpus_path)

# Overwrite speeches with a list that contains each SOTU address as its own element
# Note: the speeches are separated by "***"
# Split the corpus string on the "***" separator, name the first (and only)
# element of the resulting list "text", and convert that list into a tibble
# with a single "text" column holding one piece of the corpus per row.
# Hint: names(df)[col#] <- "variable_name" is the step-by-step equivalent of
# the setNames() call used here.
speeches <- speeches %>%
    str_split("\\*{3}") %>%
    setNames("text") %>%
    as_tibble()

# Trim extra whitespace at the beginning and end of the speeches
# Strip leading and trailing whitespace from every entry of the text column.
speeches <- mutate(speeches, text = str_trim(text))

# We know these are SOTU addresses - get rid of the first line
# Hint: get rid of everything between the start of the string and the first new line
# Hint: you know the special character for new lines
# Hint: I provided you the regex for "everything in between"
# Remove the first line of each entry: everything from the start of the
# string up to and including the first newline.
speeches <- mutate(
    speeches,
    text = str_remove(text, "^(.*)\\n")
)


# Now make a new variable, called president, that extracts the president's name
# Hint: it is now the first line of each speech! You can re-use your previous code!
# Then, delete that line from text
# Hint: this will look just like your previous code!
# Then, remove the new line at the end of president
# The first remaining line of each entry is the president's name: copy it
# into `president` (trimmed, which also removes the trailing newline) and
# then cut that line out of `text`.
#
# Entries with no newline left produce NA for `president`; these are the
# extra observations with no data at the end of the data frame, so they are
# dropped with filter().
speeches <- speeches %>%
    mutate(
        president = str_trim(str_extract(text, "^(.*)\\n")),
        text = str_remove(text, "^(.*)\\n")
    ) %>%
    filter(!is.na(president))


# Now make a new variable, called speechDate, that extracts the date
# Then, delete that line from the text
# Hint: this is now the first line of each speech!
# The date is now the first line of `text`: store that line (newline still
# attached) in `speechDate`, then remove it from `text`.
speeches <- mutate(
    speeches,
    speechDate = str_extract(text, "^(.*)\\n"),
    text = str_remove(text, "^(.*)\\n")
)


# Get the dates into a nice format with lubridate
# Hint: note how the dates are formatted
# Parse the "Month day, year" strings into Date values with lubridate.
#
# The month names in the corpus are English, so the parsing locale is pinned
# to "C" instead of inheriting the session's LC_TIME. Under a non-English
# locale mdy() does not appear to recognise names such as "January" and falls
# back to the digits alone, which would explain 1790 addresses being dated
# 1990-08-17 in the rendered output.
# str_trim() drops the newline still attached to the extracted date line.
speeches <- speeches %>%
    mutate(speechDate = mdy(str_trim(speechDate), locale = "C"))


# Clean up the text variable by getting rid of all of the new lines (replace with a space)
# Then get rid of double spaces (replace with a single space)
# Then clean up any remaining leading and trailing whitespace
# str_squish() replaces every run of whitespace (newlines included) with one
# space and trims both ends, covering all three clean-up steps at once.
# A single pass that swaps "  " for " " leaves a double space behind wherever
# three or more spaces meet (e.g. a line ending in spaces followed by a
# newline), which can hide multi-word phrases from the later counts.
speeches <- speeches %>%
    mutate(text = str_squish(text))
glimpse(speeches)
## Rows: 236
## Columns: 3
## $ text       <chr> "Fellow-Citizens of the Senate and House of Representatives~
## $ president  <chr> "George Washington", "George Washington", "George Washingto~
## $ speechDate <date> 1990-08-17, 1990-08-17, 1791-02-05, 1992-06-17, 1993-03-17~
### PART 2: ANALYSIS ###

# Now we are going to make counts of mentions of the USA, using the following terms:
# "the united states", "these united states", "u.s.a. OR usa", "america", "the union"
# first make a new variable, textClean, that is lowercased
# you will run the rest of your text analyses on textClean
# make 3 new variables - theCount, theseCount, and usaCount that capture mentions of "the united states", "these united states", and "u.s.a. OR usa"
# for each of these, make a new variable - presidentThe, presidentThese, and presidentUSA - that captures the sum of each count variable, by president.
# Per-speech counts of three ways of naming the country, computed on a
# lowercased copy of the text (textClean), followed by per-president totals.
#
# The "usa" alternative is wrapped in word boundaries (\\b): a bare "usa"
# also matches inside ordinary words such as "thousand" or "usage", which
# inflates usaCount. "u.s.a." gets a leading boundary for the same reason.
speeches <- speeches %>%
    mutate(
        textClean = str_to_lower(text),
        theCount = str_count(textClean, "the united states"),
        theseCount = str_count(textClean, "these united states"),
        usaCount = str_count(textClean, "\\bu\\.s\\.a\\.|\\busa\\b")
    ) %>%
    group_by(president) %>%
    mutate(
        presidentThe = sum(theCount),
        presidentThese = sum(theseCount),
        presidentUSA = sum(usaCount)
    ) %>%
    # Drop the grouping so it does not silently affect later steps.
    ungroup()



# "america" and "the union" are a little bit tricky, you'll get lots of false positives
# I've taken the liberty of finding all the false positives in the corpus, they are:
# "latin america", "the americas", "south america", "central america", "united states of america", "american", "americans" for "america" and...
# "unions", "unionized", and "labor union" for "the union"
# make two new character objects called falseposAmerica  and falseposUnion that are regular expressions we could use to identify these false positives
# Regular expressions matching the known false positives. Each phrase is
# wrapped in its own parenthesised group and the groups are joined with "|".
# "americans" is listed before "american" so the longer form is tried first.
falseposAmerica <- paste0(
    "(",
    c(
        "latin america",
        "the americas",
        "south america",
        "central america",
        "united states of america",
        "americans",
        "american"
    ),
    ")",
    collapse = "|"
)
falseposUnion <- paste0(
    "(",
    c("unions", "unionized", "labor union"),
    ")",
    collapse = "|"
)


# now make four variables, americaCount, presidentAmerica, unionCount, and presidentUnion that are the same as the previous ones you made
# be sure to get rid of false positives in the textClean variable using the regular expressions we just made!
# Count "america" and "the union" per speech after deleting the known false
# positives from textClean, then total both counts by president.
#
# Order matters inside the first mutate(): each count is taken immediately
# after the matching false positives have been removed from textClean.
speeches <- speeches %>%
    mutate(
        textClean = str_remove_all(textClean, falseposAmerica),
        americaCount = str_count(textClean, "america"),
        textClean = str_remove_all(textClean, falseposUnion),
        unionCount = str_count(textClean, "the union")
    ) %>%
    group_by(president) %>%
    mutate(
        presidentAmerica = sum(americaCount),
        presidentUnion = sum(unionCount)
    ) %>%
    # Drop the grouping so later steps work on a plain (ungrouped) tibble.
    ungroup()

# Factorize the president variable, maintaining the order of presidents by date
glimpse(speeches)
## Rows: 236
## Columns: 14
## Groups: president [43]
## $ text             <chr> "Fellow-Citizens of the Senate and House of Represent~
## $ president        <chr> "George Washington", "George Washington", "George Was~
## $ speechDate       <date> 1990-08-17, 1990-08-17, 1791-02-05, 1992-06-17, 1993~
## $ textClean        <chr> "fellow-citizens of the senate and house of represent~
## $ theCount         <int> 4, 2, 17, 5, 22, 14, 5, 16, 18, 14, 10, 7, 1, 1, 5, 2~
## $ theseCount       <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ usaCount         <int> 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,~
## $ presidentThe     <int> 85, 85, 85, 85, 85, 85, 85, 85, 49, 49, 49, 49, 30, 3~
## $ presidentThese   <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentUSA     <int> 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6,~
## $ americaCount     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ unionCount       <int> 3, 3, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 2,~
## $ presidentAmerica <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentUnion   <int> 12, 12, 12, 12, 12, 12, 12, 12, 2, 2, 2, 2, 6, 6, 6, ~
# fct_inorder() orders the factor levels by first appearance in the column.
# This presumably matches chronological order because the rows are still in
# the order the speeches had in the corpus file - verify if rows are ever
# re-sorted before this point.
speeches$president <- fct_inorder(speeches$president)



# Now we're going to make a bar chart!
# Make a new dataframe called speeches_trimmed selecting only the following variables:
# president, presidentThe, presidentThese, presidentAmerica, presidentUnion, presidentUSA
# Keep only the president and the five per-president totals. The column
# order is fixed here because the later reshape selects the totals by range.
speeches_trimmed <- select(
    speeches,
    all_of(c(
        "president",
        "presidentThe",
        "presidentThese",
        "presidentAmerica",
        "presidentUnion",
        "presidentUSA"
    ))
)
glimpse(speeches_trimmed)
## Rows: 236
## Columns: 6
## Groups: president [43]
## $ president        <fct> George Washington, George Washington, George Washingt~
## $ presidentThe     <int> 85, 85, 85, 85, 85, 85, 85, 85, 49, 49, 49, 49, 30, 3~
## $ presidentThese   <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentAmerica <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentUnion   <int> 12, 12, 12, 12, 12, 12, 12, 12, 2, 2, 2, 2, 6, 6, 6, ~
## $ presidentUSA     <int> 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6,~
# Now convert speeches_trimmed to "long" format and keep only one row for each president

# Reshape the per-president totals to long format with one row per
# president and search term.
#
# Every speech row carries the same totals for its president, so distinct()
# collapses the table to a single row per president before pivoting.
# ungroup() first, in case the input is still grouped by president.
speeches_trimmed <- speeches_trimmed %>%
    ungroup() %>%
    distinct() %>%
    pivot_longer(
        cols = presidentThe:presidentUSA,
        names_to = "variable",
        values_to = "value"
    )

# Make a bar chart of all of these by president, with different colors for each search term
# Make sure that the bar shows proportions of mentions rather than raw values, and
# rotate the x axis (president name) 90 degrees so we can read it
# Stacked bars per president, one colour per search term.
# position = "fill" rescales each bar to proportions instead of raw counts;
# the x-axis labels are rotated 90 degrees so the names are readable.
speeches_trimmed %>%
    ggplot(aes(x = president, y = value, fill = variable)) +
    geom_col(position = "fill") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))