Libraries used

library(rvest)
## Loading required package: xml2
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer

President Trump’s 2018 State of the Union Address

1. Scraping

# Download the White House transcript page and parse it into an HTML document.
website <- read_html("https://www.whitehouse.gov/briefings-statements/president-donald-j-trumps-state-union-address/")

# Extract the text of every node matching the ".editor" CSS selector.
scrape <- html_text(html_nodes(website, ".editor"))
# Drop the share-links boilerplate ("Share:", facebook/twitter/copy-url labels,
# "All News") found at the very beginning of the scraped text.
scrape_cleaned <- gsub(
  pattern = "\n\t\t\t\n\t\n\t\t\t\t\t\t\tShare:\n\t\t\t\t\t\n\t\t\n\t\t\t\n\t\t\t\tshare-this-page-on-facebook\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\tshare-this-page-on-twitter\t\t\t\n\t\t\n\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\tcopy-url-to-your-clipboard\t\t\t\t\n\t\t\t\t\t\n\t\n\n\tAll News\n\n",
  replacement = "",
  x = scrape
)

2. Organizing the Data

# Build a tm corpus from the cleaned transcript text and normalise it before
# counting terms.
review_source <- VectorSource(scrape_cleaned)
corpus <- Corpus(review_source)
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
# Remove stopwords while apostrophes are still present: the English stopword
# list contains contractions such as "don't", which can no longer match once
# removePunctuation() has turned them into "dont".
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
# Collapse whitespace last so the gaps left by the removed words and
# punctuation are cleaned up as well.
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents
# Term counts as a plain matrix (documents in rows, terms in columns).
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

3. Finding the Most Frequent Terms

# Total count of each term across the matrix, most frequent first.
frequency <- sort(colSums(dtm2), decreasing = TRUE)
words <- names(frequency)
# Plot at most the 100 most frequent terms. head() is used instead of
# [1:100] because indexing past the end of a shorter vector pads it with NA.
wordcloud(head(words, 100L), head(frequency, 100L), scale = c(2.75, 0.2))

President Obama’s 2010 State of the Union Address

1. Scraping

# Download the ABC News transcript page and parse it into an HTML document.
website2 <- read_html("https://abcnews.go.com/Politics/State_of_the_Union/state-of-the-union-2010-president-obama-speech-transcript/story?id=9678572")
# Extract the text of every node matching the ".article-copy" CSS selector.
scrape2 <- html_text(html_nodes(website2, ".article-copy"))

2. Organizing the Data

# Build a tm corpus from the scraped transcript text and normalise it before
# counting terms.
review_source2 <- VectorSource(scrape2)
corpus2 <- Corpus(review_source2)
corpus2 <- tm_map(corpus2, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents
# Remove stopwords while apostrophes are still present: the English stopword
# list contains contractions such as "don't", which can no longer match once
# removePunctuation() has turned them into "dont".
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents
corpus2 <- tm_map(corpus2, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents
# Collapse whitespace last so the gaps left by the removed words and
# punctuation are cleaned up as well.
corpus2 <- tm_map(corpus2, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents
# Term counts as a plain matrix (documents in rows, terms in columns).
dtm3 <- DocumentTermMatrix(corpus2)
dtm4 <- as.matrix(dtm3)

3. Finding the Most Frequent Terms

# Total count of each term across the matrix, most frequent first.
frequency2 <- sort(colSums(dtm4), decreasing = TRUE)
words2 <- names(frequency2)
# Plot at most the 100 most frequent terms. head() is used instead of
# [1:100] because indexing past the end of a shorter vector pads it with NA.
wordcloud(head(words2, 100L), head(frequency2, 100L), scale = c(2.75, 0.2))