### Install Packages
# Install only the packages that are not already available, so that re-running
# the script does not download and reinstall everything each time.
required_packages <- c(
  "rvest",     # package for harvesting data
  "tidyverse", # packages for cleansing data
  "tm",        # package for text mining
  "wordcloud"  # package for wordcloud creation
)
# requireNamespace() returns FALSE (instead of erroring) for a missing package.
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
### Load Packages
library(rvest)
library(tidyverse)
library(tm)
library(wordcloud)
# Listing URL template; %d is replaced with the page number by sprintf().
url_base <- "https://www.winemag.com/ratings/?s=&drink_type=wine&page=%d"

### Scrape and Extract

# Download listing page `i` and return its reviews as a data frame with the
# character columns wine, excerpt, rating, appellation and price.
#
# Returns NULL, after a warning, when the page cannot be read or when the CSS
# selectors match differing numbers of nodes. The caller's row-binding drops
# NULL results, so one bad page no longer aborts the whole scrape.
scrape_wine_page <- function(i) {
  cat(".") # simple progress indicator
  tryCatch(
    {
      pg <- read_html(sprintf(url_base, i))
      node_text <- function(css) html_text(html_nodes(pg, css))

      fields <- list(
        wine = node_text("h3.title"),
        excerpt = node_text("div.excerpt"),
        rating = gsub(" Points", "", node_text("span.rating")),
        appellation = node_text("span.appellation"),
        price = gsub("\\$", "", node_text("span.price"))
      )

      # data.frame() silently recycles shorter columns when the lengths divide
      # evenly, which would pair values with the wrong wine. Refuse instead.
      counts <- lengths(fields)
      if (length(unique(counts)) > 1) {
        stop(
          "selectors matched differing numbers of nodes (",
          paste(names(counts), counts, sep = "=", collapse = ", "),
          ")",
          call. = FALSE
        )
      }

      data.frame(fields, stringsAsFactors = FALSE)
    },
    error = function(e) {
      warning("Skipping page ", i, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}

# Scrape pages 1 to 50 and stack the per-page data frames into one.
wines <- map_df(1:50, scrape_wine_page)
# Console output (progress indicator): ..................................................
## Print the scraped data set
wines
## Report the class of every column as a named character vector
vapply(wines, class, character(1))
# Console output:
#        wine     excerpt      rating appellation       price
# "character" "character" "character" "character" "character"
## Append numeric copies of the price and rating columns (priceNum, ratingNum)
## to the scraped data; values that are not plain numbers become NA.
wine_price_rating <- cbind(
  wines,
  priceNum = as.numeric(wines$price),
  ratingNum = as.numeric(wines$rating)
)
## Print the extended data set
wine_price_rating
# Bar chart: number of wines at each numeric rating
ggplot(wine_price_rating, aes(x = ratingNum)) +
  geom_bar(fill = "orange")

# Density plot: distribution of the numeric wine prices
ggplot(wine_price_rating, aes(x = priceNum)) +
  geom_density(fill = "purple")

# Box plots of price at each rating level (the box's centre line is the median).
# The groups come from a factor built on the numeric rating so they are ordered
# numerically; the character `rating` column sorts as text, which would place
# "100" before "80". labs() keeps the x-axis title as "rating".
ggplot(wine_price_rating, aes(x = factor(ratingNum), y = priceNum)) +
  geom_boxplot() +
  labs(x = "rating")

### Create a wordcloud
## Place the excerpts into a corpus (one document per review excerpt)
wineCorpus <- VCorpus(VectorSource(wine_price_rating$excerpt))
## Clean the corpus
# content_transformer() keeps each element a text document after tolower().
wineCorpus <- tm_map(wineCorpus, content_transformer(tolower))
# Remove stop words while apostrophes are still present: the English stop-word
# list contains contractions such as "don't", which would no longer match once
# punctuation has been stripped (leaving "dont" in the cloud).
wineCorpus <- tm_map(wineCorpus, removeWords, stopwords("english"))
wineCorpus <- tm_map(wineCorpus, removePunctuation)
## Generate the wordcloud: the 20 most frequent words, most frequent placed
## first (random.order = FALSE), colours drawn at random from the palette.
wordcloud(
  wineCorpus,
  colors = brewer.pal(8, "Dark2"),
  random.color = TRUE,
  random.order = FALSE,
  max.words = 20
)

NA
NA
LS0tCnRpdGxlOiAiTXVsdGlQYWdlIFNjcmFwZSIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpgYGB7cn0KIyMjIEluc3RhbGwgUGFja2FnZXMKCmluc3RhbGwucGFja2FnZXMoInJ2ZXN0IikgICAgICNwYWNrYWdlIGZvciBoYXJ2ZXN0aW5nIGRhdGEKaW5zdGFsbC5wYWNrYWdlcygidGlkeXZlcnNlIikgI3BhY2thZ2VzIGZvciBjbGVhbnNpbmcgZGF0YQppbnN0YWxsLnBhY2thZ2VzKCJ0bSIpICAgICAgICAjIHBhY2thZ2UgZm9yIHRleHQgbWluaW5nCmluc3RhbGwucGFja2FnZXMoIndvcmRjbG91ZCIpICNwYWNrYWdlIGZvciB3b3JkY2xvdWQgY3JlYXRpb24KCiMjIyBMb2FkIFBhY2thZ2VzCmxpYnJhcnkocnZlc3QpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KHdvcmRjbG91ZCkKYGBgCgpgYGB7cn0KIyMjIENyZWF0ZSBiYXNlIFVSTCB3aXRoICVkIGFzIHBsYWNlaG9sZGVyIAp1cmxfYmFzZSA8LSAiaHR0cHM6Ly93d3cud2luZW1hZy5jb20vcmF0aW5ncy8/cz0mZHJpbmtfdHlwZT13aW5lJnBhZ2U9JWQiCmBgYAoKCmBgYHtyfQoKIyMjIFNjcmFwZSBhbmQgRXh0cmFjdAoKbWFwX2RmKDE6NTAsIGZ1bmN0aW9uKGkpIHsgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjcmV0dXJucyBhIGRhdGEgZnJhbWUgd2l0aCBpbmRpdmlkdWFsIGVsZW1lbnRzIDEgdG8gNTAKCmNhdCgiLiIpICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIyBzaW1wbGUgcHJvZ3Jlc3MgaW5kaWNhdG9yIAoKICBwZyA8LSByZWFkX2h0bWwoc3ByaW50Zih1cmxfYmFzZSwgaSkpICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIHJlYWQgYW5kIGl0ZXJhdGUgdGhyb3VnaCB0aGUgYmFzZSB1cmwgYW5kIHN0b3JlIGluIGEgdmFyaWFibGUgY2FsbGVkIHBnCiAgCiAgCiMgIHJlYWQgbm9kZXMsIGNsZWFuIGRhdGEsIHBsYWNlIGludG8gZGF0YSBmcmFtZQoKIGRhdGEuZnJhbWUoCiAgICAgICAgICAgIHdpbmU9aHRtbF90ZXh0KGh0bWxfbm9kZXMocGcsICJoMy50aXRsZSIpKSwgICAgICAgICAgICAgICAgCiAgICAgICAgICAgIGV4Y2VycHQ9aHRtbF90ZXh0KGh0bWxfbm9kZXMocGcsICJkaXYuZXhjZXJwdCIpKSwKICAgICAgICAgICAgcmF0aW5nPWdzdWIoIiBQb2ludHMiLCAiIiwgaHRtbF90ZXh0KGh0bWxfbm9kZXMocGcsICJzcGFuLnJhdGluZyIpKSksCiAgICAgICAgICAgIGFwcGVsbGF0aW9uPWh0bWxfdGV4dChodG1sX25vZGVzKHBnLCAic3Bhbi5hcHBlbGxhdGlvbiIpKSwKICAgICAgICAgICAgcHJpY2U9Z3N1YigiXFwkIiwgIiIsIGh0bWxfdGV4dChodG1sX25vZGVzKHBnLCAic3Bhbi5wcmljZSIpKSksCiAgICAgICAgICAgICAKICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycz1GQUxTRSkKfSkgLT4gd2luZXMKCmBgYApgYGB7cn0KIyMgVmlldyB0aGUgZGF0YXNldAp3
aW5lcwpgYGAKCmBgYHtyfQojIyBFeGFtaW5lIHRoZSBkYXRhIHR5cGVzCnNhcHBseSh3aW5lcywgY2xhc3MpCgpgYGAKCmBgYHtyfQojIyBDb252ZXJ0IHRoZSBwcmljZSBhbmQgcmF0aW5nIGNvbHVtbnMgdG8gbnVtYmVycyBhbmQgc3RvcmUgaW4gYSBkYXRhZnJhbWUKd2luZV9wcmljZV9yYXRpbmc8LWRhdGEuZnJhbWUod2luZXMscHJpY2VOdW09YXMubnVtZXJpYyh3aW5lcyRwcmljZSkscmF0aW5nTnVtPWFzLm51bWVyaWMod2luZXMkcmF0aW5nKSkgCmBgYAoKYGBge3J9CiMjIFZpZXcgdGhlIGRhdGFzZXQKd2luZV9wcmljZV9yYXRpbmcKYGBgCgpgYGB7cn0KI0dlbmVyYXRlIGEgYmFyIHBsb3QgdGhhdCBzaG93cyB0aGUgbnVtYmVyIG9mIHdpbmVzIGF0IGVhY2ggcmF0aW5nIGxldmVsCmdncGxvdChkYXRhPXdpbmVfcHJpY2VfcmF0aW5nKSArIAogIGdlb21fYmFyKGFlcyh4PXJhdGluZ051bSksIGZpbGw9Im9yYW5nZSIpCmBgYApgYGB7cn0KI0dlbmVyYXRlIGEgZGVuc2l0eSBwbG90IHRoYXQgc2hvd3MgdGhlIGRpc3RyaWJ1dGlvbiBvZiB3aW5lcyBhdCBlYWNoIHJhdGluZyBsZXZlbApnZ3Bsb3QoZGF0YT13aW5lX3ByaWNlX3JhdGluZykgKyAKICBnZW9tX2RlbnNpdHkoYWVzKHg9cHJpY2VOdW0pLCBmaWxsPSJwdXJwbGUiKQoKYGBgCmBgYHtyfQojR2VuZXJhdGUgYm94IHBsb3RzIHRoYXQgc2hvdyBtZWRpYW4gcHJpY2UgYXQgZWFjaCByYXRpbmcgbGV2ZWwKZ2dwbG90KHdpbmVfcHJpY2VfcmF0aW5nLCBhZXMoeT1wcmljZU51bSwgeD1yYXRpbmcsKSkrZ2VvbV9ib3hwbG90KCkKYGBgCmBgYHtyfQoKIyMjIENyZWF0ZSBhIHdvcmRjbG91ZAoKIyNQbGFjZSB0aGUgZXhlcnB0cyBpbnRvIGEgY29ycHVzCndpbmVDb3JwdXMgPC0gVkNvcnB1cyhWZWN0b3JTb3VyY2Uod2luZV9wcmljZV9yYXRpbmckZXhjZXJwdCkpCgojI0NsZWFuIHRoZSBjb3JwdXMKd2luZUNvcnB1cyA8LSB0bV9tYXAod2luZUNvcnB1cywgY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkKd2luZUNvcnB1cyA8LSB0bV9tYXAod2luZUNvcnB1cywgcmVtb3ZlUHVuY3R1YXRpb24pCndpbmVDb3JwdXMgPC0gdG1fbWFwKHdpbmVDb3JwdXMsIFBsYWluVGV4dERvY3VtZW50KQp3aW5lQ29ycHVzIDwtIHRtX21hcCh3aW5lQ29ycHVzLCByZW1vdmVXb3Jkcywgc3RvcHdvcmRzKCdlbmdsaXNoJykpCgpgYGAKCmBgYHtyfQojIyBHZW5lcmF0ZSB0aGUgd29yZGNsb3VkCndvcmRjbG91ZCh3aW5lQ29ycHVzLCBjb2xvcnM9YnJld2VyLnBhbCg4LCAiRGFyazIiKSwgIHJhbmRvbS5jb2xvcj0gVFJVRSwgcmFuZG9tLm9yZGVyID0gRkFMU0UsIG1heC53b3JkcyA9IDIwKQoKCmBgYAoK