# ASOC708, CODING CHALLENGE 5
# Author: Heeyoung
# Date: 20220921
### PART 1: LOADING THE DATA ###
# Load the tidyverse and lubridate packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.1
## Warning: 패키지 'ggplot2'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tibble'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tidyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'stringr'는 R 버전 4.1.3에서 작성되었습니다
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: 패키지 'lubridate'는 R 버전 4.1.2에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Set your working directory to wherever you put the data
# Show the current working directory; the "## [1]" line below is the output
# captured when the script was rendered.
getwd()
## [1] "C:/Users/distr/OneDrive/R FILE/asoc708/Coding Challenges"
# Switch to the course folder so that the relative "./Data/..." path used when
# the corpus is read resolves.
# NOTE(review): this absolute path is specific to one machine; anyone else
# running the script has to edit it first.
setwd("C:/Users/distr/OneDrive/R FILE/asoc708")
# Make a variable called "speeches" that contains the entire corpus of SOTU addresses as a string
# Use read_file(filename)
# Build the initial `speeches` tibble in one pipeline:
#   1. read the whole corpus file as a single string,
#   2. cut it at every "***" separator (str_split returns a one-element list
#      holding the character vector of pieces),
#   3. name that single list element "text",
#   4. convert the named list into a one-column tibble, one row per piece.
speeches <- read_file("./Data/stateoftheunion1790-2022.txt") %>%
  str_split("\\*{3}") %>%
  setNames("text") %>%
  as_tibble()
# Trim extra whitespace at the beginning and end of the speeches
# Strip leading/trailing whitespace from each speech and then delete its first
# line: "^(.*)\\n" matches everything from the start of the string up to and
# including the first newline.
speeches <- speeches %>%
  mutate(text = str_remove(str_trim(text), "^(.*)\\n"))
# Now make a new variable, called president, that extracts the president's name
# Hint: it is now the first line of each speech! You can re-use your previous code!
# Then, delete that line from text
# Hint: this will look just like your previous code!
# Then, remove the new line at the end of president
# The president's name is now the first line of each speech. Capture that line
# (without its trailing newline) in `president`, then drop it from `text`.
# Arguments of a single mutate() are evaluated in order, so `text` is still
# intact when `president` is extracted.
speeches <- mutate(
  speeches,
  president = str_trim(str_extract(text, "^(.*)\\n")),
  text = str_remove(text, "^(.*)\\n")
)
# We have some extra observations at the end of the data frame with no data!
# Get rid of those using filter()
# Rows where no president line could be extracted carry no data; keep only the
# rows that have one.
speeches <- filter(speeches, !is.na(president))
# Now make a new variable, called speechDate, that extracts the date
# Then, delete that line from the text
# Hint: this is now the first line of each speech!
# The date is now the first line of each speech: capture it (without the
# trailing newline) in `speechDate` and remove that line from `text`.
speeches <- speeches %>%
  mutate(
    speechDate = str_trim(str_extract(text, "^(.*)\\n")),
    text = str_remove(text, "^(.*)\\n")
  )
# Get the dates into a nice format with lubridate (month-day-year order).
# mdy() interprets month names using the session's LC_TIME locale by default.
# This script was rendered under a Korean locale, and the rendered glimpse()
# output showed 1790s speeches parsed as e.g. 1990-08-17, i.e. the month name
# was skipped and the remaining digits were misread. Pinning the parser to the
# "C" locale makes English month names parse the same way on every machine.
# NOTE(review): assumes the corpus writes dates like "January 8, 1790" --
# confirm against the data file.
speeches <- speeches %>%
  mutate(speechDate = mdy(speechDate, locale = "C"))
# Clean up the text variable by getting rid of all of the new lines (replace with a space)
# Then get rid of double spaces (replace with a single space)
# Then clean up any remaining leading and trailing whitespace
# Flatten each speech onto one line:
#   1. every newline becomes a space,
#   2. every run of two or more spaces collapses to a single space
#      (the previous pattern replaced " " with " ", which changed nothing),
#   3. leftover leading/trailing whitespace is trimmed.
speeches <- speeches %>%
  mutate(
    text = str_replace_all(text, "\\n", " "),
    text = str_replace_all(text, " {2,}", " "),
    text = str_trim(text)
  )
glimpse(speeches)
## Rows: 236
## Columns: 3
## $ text <chr> "Fellow-Citizens of the Senate and House of Representatives~
## $ president <chr> "George Washington", "George Washington", "George Washingto~
## $ speechDate <date> 1990-08-17, 1990-08-17, 1791-02-05, 1992-06-17, 1993-03-17~
### PART 2: ANALYSIS ###
# Now we are going to make counts of mentions of the USA, using the following terms:
# "the united states", "these united states", "u.s.a. OR usa", "america", "the union"
# first make a new variable, textClean, that is lowercased
# you will run the rest of your text analyses on textClean
# make 3 new variables - theCount, theseCount, and usaCount that capture mentions of "the united states", "these united states", and "u.s.a. OR usa"
# for each of these, make a new variable - presidentThe, presidentThese, and presidentUSA that captures the sum of the each count variable, by president.
# Per-speech counts of each search term, computed on a lowercased copy of the
# text, followed by per-president totals of those counts.
#
# The "usa" pattern is anchored with word boundaries (\\b): without them it
# also matches inside ordinary words such as "thousand", inflating usaCount.
# The leading \\b applies to both alternatives; "u.s.a." ends in a literal
# period, so only the bare "usa" alternative needs a trailing boundary.
speeches <- speeches %>%
  mutate(
    textClean = str_to_lower(text),
    theCount = str_count(textClean, "the united states"),
    theseCount = str_count(textClean, "these united states"),
    usaCount = str_count(textClean, "\\b(u\\.s\\.a\\.|usa\\b)")
  ) %>%
  group_by(president) %>%
  mutate(
    presidentThe = sum(theCount),
    presidentThese = sum(theseCount),
    presidentUSA = sum(usaCount)
  ) %>%
  # Grouping is only needed for the sums above; drop it so later steps do not
  # silently operate per president.
  ungroup()
# "america" and "the union" are a little bit tricky, you'll get lots of false positives
# I've taken the liberty of finding all the false positives in the corpus, they are:
# "latin america", "the americas", "south america", "central america", "united states of america", "american", "americans" for "america" and...
# "unions", "unionized", and "labor union" for "the union"
# make two new character objects called falseposAmerica and falseposUnion that are regular expressions we could use to identify these false positives
# Regular expressions matching the known false positives. Each term is wrapped
# in its own group and the groups are joined with "|" (alternation).
# "americans" is listed before "american" so the longer word is tried first.
falseposAmerica <- paste0(
  "(",
  c(
    "latin america",
    "the americas",
    "south america",
    "central america",
    "united states of america",
    "americans",
    "american"
  ),
  ")",
  collapse = "|"
)
falseposUnion <- paste0(
  "(",
  c("unions", "unionized", "labor union"),
  ")",
  collapse = "|"
)
# now make four variables, americaCount, presidentAmerica, unionCount, and presidentUnion that are the same as the previous ones you made
# be sure to get rid of false positives in the textClean variable using the regular expressions we just made!
# Count "america" and "the union" after stripping the known false positives
# from textClean, then total each count per president.
# The mutate() arguments run in order: the "america" false positives are
# removed before americaCount is taken, and the "union" false positives are
# removed before unionCount is taken. Note that textClean is overwritten by
# the stripped version.
speeches <- speeches %>%
  mutate(
    textClean = str_remove_all(textClean, falseposAmerica),
    americaCount = str_count(textClean, "america"),
    textClean = str_remove_all(textClean, falseposUnion),
    unionCount = str_count(textClean, "the union")
  ) %>%
  group_by(president) %>%
  mutate(
    presidentAmerica = sum(americaCount),
    presidentUnion = sum(unionCount)
  ) %>%
  # Grouping is only needed for the sums above; drop it so later steps do not
  # silently operate per president.
  ungroup()
# Factorize the president variable, maintaining the order of presidents by date
glimpse(speeches)
## Rows: 236
## Columns: 14
## Groups: president [43]
## $ text <chr> "Fellow-Citizens of the Senate and House of Represent~
## $ president <chr> "George Washington", "George Washington", "George Was~
## $ speechDate <date> 1990-08-17, 1990-08-17, 1791-02-05, 1992-06-17, 1993~
## $ textClean <chr> "fellow-citizens of the senate and house of represent~
## $ theCount <int> 4, 2, 17, 5, 22, 14, 5, 16, 18, 14, 10, 7, 1, 1, 5, 2~
## $ theseCount <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ usaCount <int> 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,~
## $ presidentThe <int> 85, 85, 85, 85, 85, 85, 85, 85, 49, 49, 49, 49, 30, 3~
## $ presidentThese <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentUSA <int> 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6,~
## $ americaCount <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ unionCount <int> 3, 3, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 2,~
## $ presidentAmerica <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentUnion <int> 12, 12, 12, 12, 12, 12, 12, 12, 2, 2, 2, 2, 6, 6, 6, ~
# Turn `president` into a factor whose levels follow order of first appearance
# in the data rather than alphabetical order.
# With %>%, the left-hand side is passed as factor()'s first argument and is
# also available as `.` inside unique().
speeches[["president"]] <- speeches[["president"]] %>%
  factor(levels = unique(.))
# Now we're going to make a bar chart!
# Make a new dataframe called speeches_trimmed selecting only the following variables:
# president, presidentThe, presidentThese, presidentAmerica, presidentUnion, presidentUSA
# Keep only the president and the five per-president totals. The column order
# given here matters: the later reshape selects the range
# presidentThe:presidentUSA, which spans all five totals only in this order.
speeches_trimmed <- select(
  speeches,
  president, presidentThe, presidentThese,
  presidentAmerica, presidentUnion, presidentUSA
)
speeches_trimmed %>% glimpse()
## Rows: 236
## Columns: 6
## Groups: president [43]
## $ president <fct> George Washington, George Washington, George Washingt~
## $ presidentThe <int> 85, 85, 85, 85, 85, 85, 85, 85, 49, 49, 49, 49, 30, 3~
## $ presidentThese <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentAmerica <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ presidentUnion <int> 12, 12, 12, 12, 12, 12, 12, 12, 2, 2, 2, 2, 6, 6, 6, ~
## $ presidentUSA <int> 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6,~
# Now convert speeches_trimmed to "long" format and keep only one row for each president
# Reshape to long format with one row per president per search term.
# speeches_trimmed still has one row per speech, and every row of a president
# repeats that president's totals; distinct() collapses those repeats so each
# president contributes exactly one row before pivoting (previously the
# duplicates were carried through into the long table).
# ungroup() is a no-op on ungrouped data and guarantees distinct() compares
# whole rows across the full table.
speeches_trimmed <- speeches_trimmed %>%
  ungroup() %>%
  distinct() %>%
  pivot_longer(
    presidentThe:presidentUSA,
    names_to = "variable",
    values_to = "value"
  )
# Make a bar chart of all of these by president, with different colors for each search term
# Make sure that the bar shows proportions of mentions rather than raw values, and
# rotate the x axis (president name) 90 degrees so we can read it
# Stacked bar per president, coloured by search term. position = "fill"
# rescales every stack to height 1, so the bars show each term's share of a
# president's mentions rather than raw counts. The x-axis labels are rotated
# 90 degrees so the names stay readable.
speeches_trimmed %>%
  ggplot(aes(x = president, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "fill") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
