# Set the knitr chunk option `echo` to TRUE as the default for the chunks
# that follow (per the knitr documentation, `echo = TRUE` includes each
# chunk's source code in the rendered output).
knitr::opts_chunk$set(echo = TRUE)
The final project for DATA 607 aims to find the best breweries in a particular area based on their ratings and the ratings of their beers. This project scrapes the data from the BeerAdvocate website and then plots the breweries on a map.
The motivation behind this project was to find the best beer in my area, and to visualize the breweries on a map to see which areas have more highly rated breweries.
The app first takes the BeerAdvocate URL as an input and then scrapes it to get the following characteristics of each brewery and its beers.
To scrape the data we used the rvest package. The code below loops through all the result pages and gathers the brewery data. After that, we converted it to a data frame for cleaning and transforming.
library(RCurl)
library(XML)
## Warning: package 'XML' was built under R version 3.3.2
library(rvest)
library(knitr)
## Warning: package 'knitr' was built under R version 3.3.2
library(dplyr)
library(stringr)
library(devtools)
# First results page of the listing to scrape; the query string selects
# c_id=US, s_id=CA, brewery=Y, sorted by name.
theURL <- "https://www.beeradvocate.com/place/list/?start=0&&c_id=US&s_id=CA&brewery=Y&sort=name"

# Empty character vectors used by the page loop further down:
#   pageURL               - URL of every results page visited
#   final / final_add     - accumulated brewery values / address strings
#   brewery / brewery_add - per-page scratch values
pageURL <- character(0)
brewery <- character(0)
final <- character(0)
final_add <- character(0)
brewery_add <- character(0)
# Pull the bold-text values out of one parsed results page.
#
# page_data: a parsed HTML document (the page loop passes the result of
#            read_html()).
# Returns a character vector with the text of every node matched by the
# XPath below, minus the first match.
scrape_brewery_links <- function(page_data) {
  bold_xpath <- '//*[contains(concat( " ", @class, " " ), concat( " ", "hr_bottom_light", " " ))]//b | //a//b'
  bold_nodes <- html_nodes(page_data, xpath = bold_xpath)
  bold_text <- html_text(bold_nodes)
  # The first match is discarded; presumably it is page chrome rather than
  # brewery data -- TODO confirm against the live page.
  bold_text[-1]
}
# Pull the address cells out of one parsed results page.
#
# page_data: a parsed HTML document (the page loop passes the result of
#            read_html()).
# Returns a character vector with the text of every node matched by the
# XPath below: elements carrying the "hr_bottom_dark" class that are the
# first child of their parent.
scrape_brewery_address <- function(page_data) {
  address_xpath <- '//*[contains(concat( " ", @class, " " ), concat( " ", "hr_bottom_dark", " " )) and (((count(preceding-sibling::*) + 1) = 1) and parent::*)]'
  address_nodes <- html_nodes(page_data, xpath = address_xpath)
  html_text(address_nodes)
}
#get the total number of pages
# Read the text of every bold node that is the first child of its parent
# inside a <span>; one of them is expected to carry an "of N" result count.
pages <- read_html(theURL) %>%
  html_nodes(xpath = '//span//b[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]') %>%
  html_text()
# regex to convert it into numbers
results <- as.numeric(str_extract(str_extract(pages, "of \\d+"), "\\d+"))
# The selector may match several nodes, and nodes without an "of N" label
# come back as NA. Keep the first usable count so that `results` and
# `page_no` are single numbers, and fail loudly when no count was found
# instead of carrying NA (or a vector) into the page loop.
results <- results[!is.na(results)]
if (length(results) == 0) {
  stop("Could not find the total number of results on ", theURL,
       call. = FALSE)
}
results <- results[1]
# need this conversion for the website and how they name the page numbers:
# the `start=` offset advances in steps of 20 results per page.
page_no <- ceiling(results / 20) # number of pages
page_no_url <- page_no * 20 # for URL string
# Loop through all the pages :
# Visit every results page, record its URL in `pageURL`, and collect the
# brewery values and address strings it yields into `final` / `final_add`.
loop_URL <- "https://www.beeradvocate.com/place/list/?start="
# seq_len() yields an empty sequence for zero pages, whereas 1:page_no
# would iterate over c(1, 0) and request bogus offsets.
page_indices <- seq_len(page_no)
# Collect per-page results in preallocated lists and flatten once at the
# end, instead of re-copying a growing vector on every iteration.
links_by_page <- vector("list", length(page_indices))
address_by_page <- vector("list", length(page_indices))
for (i in page_indices) {
  # Pause one second before each request (presumably to go easy on the
  # server -- confirm before removing).
  Sys.sleep(1)
  # Integer arithmetic keeps the offset from ever being rendered in
  # scientific notation (e.g. "1e+05") inside the URL.
  page_no_url <- (i - 1L) * 20L
  pageURL[i] <- paste0(loop_URL, page_no_url, "&c_id=US&s_id=CA&brewery=Y&sort=name")
  page_data <- read_html(pageURL[i])
  links_by_page[[i]] <- scrape_brewery_links(page_data)
  address_by_page[[i]] <- scrape_brewery_address(page_data)
}
final <- append(final, unlist(links_by_page, use.names = FALSE))
final_add <- append(final_add, unlist(address_by_page, use.names = FALSE))
# Reshape the flat vector of scraped values into one row per brewery (five
# values each), attach the address strings as a sixth column, and convert
# the result to a data frame.
brewery_values <- unlist(final)
# matrix() would silently recycle values (with only a warning) if the count
# were not a multiple of 5, shifting every following row out of alignment.
if (length(brewery_values) %% 5 != 0) {
  stop("Expected 5 scraped values per brewery, but got ",
       length(brewery_values), " values in total.", call. = FALSE)
}
brewery_mat <- matrix(brewery_values, ncol = 5, byrow = TRUE)
# cbind() would recycle or truncate the addresses if the counts differed,
# pairing breweries with the wrong address.
if (nrow(brewery_mat) != length(final_add)) {
  stop("Scraped ", nrow(brewery_mat), " breweries but ", length(final_add),
       " addresses.", call. = FALSE)
}
brewery_final <- cbind(brewery_mat, final_add)
# The string-to-factor default changed in R 4.0.0; state it explicitly so
# the column types do not depend on the R version running the script.
beer_df <- as.data.frame(brewery_final, stringsAsFactors = TRUE)
## V1 V2 V3 V4 V5
## 1 101 North Brewing Company 3.88 2 3.82 7
## 2 1991 Brewery - - - 0
## 3 21st Amendment Brewery 3.9 2 - 0
## 4 21st Amendment Brewery 3.88 260 3.74 65
## 5 2kids Brewing Company 3.65 6 3.56 17
## 6 32 North Brewing Co. 4.42 9 3.73 29
## final_add
## 1 1304 Scott StSte DPetaluma, California, 94954-7100United States(707) 778-8384
## 2 13012 Saticoy StSte 10North Hollywood, California, 91605-3513United States(818) 275-0202
## 3 2010 Williams StSan Leandro, California, 94577-2334United States(510) 595-2111
## 4 563 2nd StSan Francisco, California, 94107-1411United States(415) 369-0900
## 5 8680 Miralani DrSte 123San Diego, California, 92126-6391United States(858) 480-5437
## 6 8655 Production AveSte ASan Diego, California, 92121-2258United States(714) 791-8973
After scraping we need to clean and reformat the data.
# change the column names
colnames(beer_df) <- c("brewery_name", "brewery_rating", "number_of_reviews", "beer_avg", "number_of_beers", "address")
# get our missing values as NA
beer_df[beer_df == "-"] <- NA
# clean up some of the columns and remove extra data
beer_df$address <- as.character(beer_df$address) # change into character data type
library(stringr)
# One pattern shared by extraction and removal so the two cannot drift
# apart: optional parentheses around a 3-digit group, then 3 and 4 digits,
# with optional "." / "-" / space separators.
phone_pattern <- "\\(?\\d{3}\\)?[.-]? *\\d{3}[.-]? *[.-]?\\d{4}"
# str_extract_all() returns a list with one element per address.
beer_df$phone_number <- str_extract_all(beer_df$address, phone_pattern)
# Addresses without a phone number yield a zero-length element; mark them NA.
beer_df$phone_number[lengths(beer_df$phone_number) == 0] <- NA
# remove the phone number tag from the address field for geocoding
beer_df$address <- gsub(phone_pattern, "", beer_df$address)
# The scraped address has its parts run together; re-insert spaces at the
# boundaries: before "United States", lower->upper, digit->upper,
# "."->upper and upper->upper.
beer_df$address <- gsub("United States", " United States", beer_df$address)
beer_df$address <- gsub('([[:lower:]])([[:upper:]])', '\\1 \\2', beer_df$address)
beer_df$address <- gsub('([[:digit:]])([[:upper:]])', '\\1 \\2', beer_df$address)
beer_df$address <- gsub('(\\.)([[:upper:]])', '\\1 \\2', beer_df$address)
beer_df$address <- gsub('([[:upper:]])([[:upper:]])', '\\1 \\2', beer_df$address)
# Convert the rating and review-count columns to numeric. Going through
# as.character() is required: as.numeric() applied directly to a factor
# returns its integer level codes rather than the displayed numbers, and
# the levels()-based idiom returns only NA when the column is already
# character. This form is correct for both column types.
numeric_cols <- c("beer_avg", "brewery_rating", "number_of_reviews")
beer_df[numeric_cols] <- lapply(
  beer_df[numeric_cols],
  function(column) as.numeric(as.character(column))
)
# Breweries ranked by average beer rating: rows with a known review count
# and a known beer average that have at least 100 reviews OR a beer average
# of at least 4, best beer average first (ties broken by review count).
# NOTE(review): this filter combines its thresholds with `|`, while the
# brewery filter below uses `&` -- confirm the difference is intended.
top_beer <- beer_df %>%
  filter(
    !is.na(number_of_reviews),
    !is.na(beer_avg),
    number_of_reviews >= 100 | beer_avg >= 4
  ) %>%
  arrange(desc(beer_avg), desc(number_of_reviews))

# Breweries ranked by brewery rating: rows with a known review count and a
# known brewery rating that have at least 50 reviews AND a rating of at
# least 4, best rating first (ties broken by review count).
top_brewery <- beer_df %>%
  filter(
    !is.na(number_of_reviews),
    !is.na(brewery_rating),
    number_of_reviews >= 50 & brewery_rating >= 4
  ) %>%
  arrange(desc(brewery_rating), desc(number_of_reviews))
library(htmlwidgets)
library(leaflet)
library(ggmap)
library(dplyr)
library(RColorBrewer)
# Look up latitude/longitude for every address with ggmap's geocode()
# (Google source), then attach the coordinates to the matching table.

# Top-beer table
geo <- geocode(location = top_beer[["address"]], output = "latlon", source = "google")
top_beer[["lon"]] <- geo[["lon"]]
top_beer[["lat"]] <- geo[["lat"]]

# Top-brewery table
geo2 <- geocode(location = top_brewery[["address"]], output = "latlon", source = "google")
top_brewery[["lon"]] <- geo2[["lon"]]
top_brewery[["lat"]] <- geo2[["lat"]]
Mapping of breweries based on their beer ratings.
Mapping of the breweries based on the brewery ratings.
Some of the challenges faced were: