Web Scraping - Digital Advertising

# Install rvest only when it is not already available, so that re-running
# (or knitting) the script does not re-download the package every time.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}

library(rvest)
library(tm)
library(SnowballC)
library(wordcloud)
library(RCurl)
library(XML)
library(RColorBrewer)

# URL of the website to be scraped
url <- "https://www.webopedia.com/definitions/digital-advertising/"

# Read the HTML code from the website
webpage <- read_html(url)
class(webpage)

# View() opens a viewer window, so it is only called in an interactive
# session; when the script is knitted or run with Rscript it is skipped
# instead of stopping the run.
if (interactive()) {
  View(webpage)
}

webpage

# Select every <p> (paragraph) node of the page and extract its text
text <- webpage %>%
  html_nodes("p") %>%
  html_text()

# Wrap the scraped character vector in a tm corpus and print its contents
docs <- Corpus(VectorSource(text))
inspect(docs)

# Transformer that replaces every match of `pattern` with a single space
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Patterns are applied one after another, in this exact order.
# NOTE(review): the second "'" and the trailing " -" / "-" entries cannot
# match anything once the earlier passes have run; possibly the repeated
# quote was meant to be a different (typographic) quote character - confirm.
separator_patterns <- c("/", "@", "\\|", "-", ":", "'", "'", " -", "-")
for (separator in separator_patterns) {
  docs <- tm_map(docs, toSpace, separator)
}



# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords.
# The argument must name a language tm ships a stopword list for; the
# previous value "due" is not one, so stopwords() stopped with an error.
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stop words
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords, c("ran", "will", "the", "and", "for", "are"))
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming (disabled)
# docs <- tm_map(docs, stemDocument)

# Term-document matrix of the cleaned corpus (terms in rows)
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)

# Total count of each term across all documents, most frequent first
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)

# Frequency table: one row per term with its total count
d <- data.frame(word = names(v), freq = v)

# Show the ten most frequent terms
head(d, n = 10)

# Fix the random seed so the generated layout is reproducible
set.seed(1234)

# Draw the word cloud from the term/frequency table `d`
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 160,
  random.order = FALSE,
  scale = c(2.5, 0.25),
  rot.per = 0.35,
  colors = brewer.pal(8, "Spectral")
)

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Web Scraping - Digital Advertising

Celeste Pokhrel

6/6/2022

Including Plots