Libraries used
library(rvest)
## Loading required package: xml2
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
President Trump’s 2018 State of the Union Address
1. Scraping
# Fetch the White House transcript page and extract the text of every node
# matching the ".editor" CSS selector.
website <- read_html(
  "https://www.whitehouse.gov/briefings-statements/president-donald-j-trumps-state-union-address/"
)
scrape <- html_text(html_nodes(website, ".editor"))
# Remove the share-links boilerplate ("Share:", the facebook/twitter/copy-url
# labels and "All News") that sits at the very beginning of the scraped text.
scrape_cleaned <- gsub(
  "\n\t\t\t\n\t\n\t\t\t\t\t\t\tShare:\n\t\t\t\t\t\n\t\t\n\t\t\t\n\t\t\t\tshare-this-page-on-facebook\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\tshare-this-page-on-twitter\t\t\t\n\t\t\n\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\tcopy-url-to-your-clipboard\t\t\t\t\n\t\t\t\t\t\n\t\n\n\tAll News\n\n",
  "",
  scrape
)
2. Organizing the Data
# Wrap the cleaned transcript text in a tm corpus.
review_source <- VectorSource(scrape_cleaned)
corpus <- Corpus(review_source)
# Lower-case first so the text can match the lower-case stop word list.
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
# Remove stop words BEFORE stripping punctuation: the English stop list
# contains contractions such as "don't" and "it's", which can no longer match
# once their apostrophes have been removed.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
# Collapse whitespace last so the gaps left by the removals above are cleaned
# up as well.
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents
# Document-term matrix, then a plain matrix for the column sums that follow.
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)
3. Finding the Most Frequent Terms
# Total count of each term (one matrix column per term), most frequent first.
frequency <- sort(colSums(dtm2), decreasing = TRUE)
words <- names(frequency)
# head() rather than [1:100]: indexing past the end pads the vectors with NA
# when the text has fewer than 100 distinct terms, whereas head() returns only
# the terms that actually exist.
wordcloud(head(words, 100), head(frequency, 100), scale = c(2.75, .2))

President Obama’s 2010 State of the Union Address
1. Scraping
# Fetch the ABC News transcript page and extract the text of every node
# matching the ".article-copy" CSS selector.
website2 <- read_html(
  "https://abcnews.go.com/Politics/State_of_the_Union/state-of-the-union-2010-president-obama-speech-transcript/story?id=9678572"
)
scrape2 <- html_text(html_nodes(website2, ".article-copy"))
2. Organizing the Data
# Wrap the scraped transcript text in a tm corpus.
review_source2 <- VectorSource(scrape2)
corpus2 <- Corpus(review_source2)
# Lower-case first so the text can match the lower-case stop word list.
corpus2 <- tm_map(corpus2, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents
# Remove stop words BEFORE stripping punctuation: the English stop list
# contains contractions such as "don't" and "it's", which can no longer match
# once their apostrophes have been removed.
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents
corpus2 <- tm_map(corpus2, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents
# Collapse whitespace last so the gaps left by the removals above are cleaned
# up as well.
corpus2 <- tm_map(corpus2, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents
# Document-term matrix, then a plain matrix for the column sums that follow.
dtm3 <- DocumentTermMatrix(corpus2)
dtm4 <- as.matrix(dtm3)
3. Finding the Most Frequent Terms
# Total count of each term (one matrix column per term), most frequent first.
frequency2 <- sort(colSums(dtm4), decreasing = TRUE)
words2 <- names(frequency2)
# head() rather than [1:100]: indexing past the end pads the vectors with NA
# when the text has fewer than 100 distinct terms, whereas head() returns only
# the terms that actually exist.
wordcloud(head(words2, 100), head(frequency2, 100), scale = c(2.75, .2))
