# Set echo = TRUE as the default knitr chunk option, so the source code of
# each chunk is shown in the knitted output.
knitr::opts_chunk$set(echo = TRUE)

Introduction

The final project for DATA 607 aims to find the best breweries in a particular area based on their brewery ratings and their beer ratings. The project scrapes the data from the BeerAdvocate website and then plots the breweries on a map.

Motivation

The motivation behind this project was to find the best beer in my area, and also to visualize the breweries on a map to see which areas have more highly rated breweries.

Overview of data gathered

The app first takes a BeerAdvocate URL as input and then scrapes it to get the following characteristics of each brewery and its beers.

Brewery Name.
Brewery ratings.
No. of reviews.
Average beer rating for that particular brewery.
No. of beers in that brewery.
Address of the brewery.

Code to scrape the Data.

To scrape the data we used the rvest package. The code below loops through all the result pages and gathers the brewery data. After that, we converted it to a data frame for cleaning and transforming.

library(RCurl)
library(XML)

## Warning: package 'XML' was built under R version 3.3.2

library(rvest)
library(knitr)

## Warning: package 'knitr' was built under R version 3.3.2

library(dplyr)
library(stringr)
library(devtools)


# Parse the URL
# First page of the brewery listing; it is also the page from which the total
# number of results is read. Query parameters are separated by a single "&",
# matching the page URLs built in the scraping loop.
theURL <- "https://www.beeradvocate.com/place/list/?start=0&c_id=US&s_id=CA&brewery=Y&sort=name"

# Empty character accumulators that are filled while looping over the pages.
pageURL <- character(0)      # URL of every page requested
brewery <- character(0)      # brewery fields of the current page (scratch)
final <- character(0)        # brewery fields collected from all pages
final_add <- character(0)    # brewery addresses collected from all pages
brewery_add <- character(0)  # addresses of the current page (scratch)


# Pull the brewery summary values out of one parsed listing page.
#
# page_data: the parsed page, handed straight to html_nodes().
# Returns the text of every <b> element matched by the XPath below (those
# nested under an element whose class includes "hr_bottom_light", plus any <b>
# nested inside a link), minus the first match. The caller later reshapes the
# collected values five per brewery.
scrape_brewery_links <- function(page_data){
  bold_nodes <- html_nodes(
    page_data,
    xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "hr_bottom_light", " " ))]//b | //a//b'
  )
  bold_text <- html_text(bold_nodes)

  # The first match is discarded (presumably a non-brewery element such as a
  # header -- confirm against the page markup).
  bold_text[-1]
}

# Pull the brewery address text out of one parsed listing page.
#
# page_data: the parsed page, handed straight to html_nodes().
# Returns the text of each element whose class includes "hr_bottom_dark" and
# that is the first child of its parent, one string per matched element.
scrape_brewery_address <- function(page_data){
  address_nodes <- html_nodes(
    page_data,
    xpath='//*[contains(concat( " ", @class, " " ), concat( " ", "hr_bottom_dark", " " )) and (((count(preceding-sibling::*) + 1) = 1) and parent::*)]'
  )
  html_text(address_nodes)
}

# Get the total number of pages.
# Read the first listing page and take the text of the <b> element(s) that are
# the first child inside a <span>; the regex below expects that text to
# contain "of <total>".
pages <- read_html(theURL) %>%
  html_nodes(xpath='//span//b[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]') %>%
  html_text()

# Regex to convert it into numbers.
# The XPath may match several <b> nodes, most of which carry no "of <total>"
# text and therefore parse to NA. Keep the first real count so that page_no is
# always a single number.
results <- as.numeric(str_extract(str_extract(pages, "of \\d+"), "\\d+"))
results <- results[!is.na(results)][1]
if (is.na(results)) {
  stop("Could not read the total number of results from ", theURL,
       call. = FALSE)
}

# Need this conversion for the website and how it names the page numbers:
# the listing is addressed by a start= offset that advances in steps of 20.
page_no <- ceiling(results / 20)  # number of pages
page_no_url <- page_no * 20       # for URL string (recomputed per page in the loop)

# Loop through all the pages:
# request every listing page (start = 0, 20, 40, ...) and collect the brewery
# fields and the addresses found on it.
loop_URL <- "https://www.beeradvocate.com/place/list/?start="

# seq_len() gives an empty sequence when page_no is 0, whereas 1:page_no would
# iterate over c(1, 0) and request pages that do not exist.
page_index <- seq_len(page_no)

# Collect the per-page results in preallocated lists instead of growing the
# result vectors on every iteration.
page_fields <- vector("list", length(page_index))
page_addresses <- vector("list", length(page_index))

for (i in page_index) {
  Sys.sleep(1)  # pause between requests
  page_no_url <- (i - 1) * 20
  pageURL[i] <- paste0(loop_URL, page_no_url, "&c_id=US&s_id=CA&brewery=Y&sort=name")
  page_data <- read_html(pageURL[i])
  page_fields[[i]] <- scrape_brewery_links(page_data)
  page_addresses[[i]] <- scrape_brewery_address(page_data)
}

# Append everything that was scraped, in page order, to the accumulators.
final <- c(final, unlist(page_fields, use.names = FALSE))
final_add <- c(final_add, unlist(page_addresses, use.names = FALSE))

# Reshape the scraped values into one row per brewery (five fields each) and
# attach the address as a sixth column.
fields_per_brewery <- 5

# matrix() recycles its input when the length is not a multiple of ncol, which
# would shift every following row; fail loudly instead.
if (length(final) %% fields_per_brewery != 0) {
  stop("Scraped ", length(final), " brewery fields, which is not a multiple of ",
       fields_per_brewery, "; check the XPath in scrape_brewery_links().",
       call. = FALSE)
}
brewery_mat <- matrix(unlist(final), ncol = fields_per_brewery, byrow = TRUE)

# cbind() recycles or truncates a vector whose length differs from the number
# of rows (with at most a warning), pairing breweries with wrong addresses.
if (nrow(brewery_mat) != length(final_add)) {
  stop("Scraped ", nrow(brewery_mat), " breweries but ", length(final_add),
       " addresses; the two scrapers are out of step.", call. = FALSE)
}
brewery_final <- cbind(brewery_mat, final_add)
beer_df <- as.data.frame(brewery_final)

The results after scraping

##                          V1   V2  V3   V4 V5
## 1 101 North Brewing Company 3.88   2 3.82  7
## 2              1991 Brewery    -   -    -  0
## 3    21st Amendment Brewery  3.9   2    -  0
## 4    21st Amendment Brewery 3.88 260 3.74 65
## 5     2kids Brewing Company 3.65   6 3.56 17
## 6      32 North Brewing Co. 4.42   9 3.73 29
##                                                                                  final_add
## 1            1304 Scott StSte DPetaluma, California, 94954-7100United States(707) 778-8384
## 2 13012 Saticoy StSte 10North Hollywood, California, 91605-3513United States(818) 275-0202
## 3           2010 Williams StSan Leandro, California, 94577-2334United States(510) 595-2111
## 4               563 2nd StSan Francisco, California, 94107-1411United States(415) 369-0900
## 5      8680 Miralani DrSte 123San Diego, California, 92126-6391United States(858) 480-5437
## 6     8655 Production AveSte ASan Diego, California, 92121-2258United States(714) 791-8973

Cleansing the data

After scraping we need to clean and reformat the data.

# change the column names
colnames(beer_df) <- c("brewery_name", "brewery_rating", "number_of_reviews", "beer_avg", "number_of_beers", "address")

# get our missing values as NA (a missing value is scraped as "-")
beer_df[beer_df == "-"] <- NA

# clean up some of the columns and remove extra data
beer_df$address <- as.character(beer_df$address) # change into character data type

library(stringr)
# Phone-number pattern, used both to extract the number and to strip it from
# the address.
phone_pattern <- "\\(?\\d{3}\\)?[.-]? *\\d{3}[.-]? *[.-]?\\d{4}"
beer_df$phone_number <- str_extract_all(beer_df$address, phone_pattern)
# str_extract_all() returns a list; rows without a phone number hold
# character(0). Detect those by length instead of comparing the deparsed list
# against the string "character(0)".
beer_df$phone_number[lengths(beer_df$phone_number) == 0] <- NA

# remove the phone number tag from the address field for geocoding
beer_df$address <- gsub(phone_pattern, "", beer_df$address)
# The scraped address has no separators between its parts, so insert a space
# before "United States" and at each lower->upper, digit->upper,
# period->upper and upper->upper boundary.
beer_df$address <- gsub("United States", " United States", beer_df$address)
beer_df$address <- gsub('([[:lower:]])([[:upper:]])', '\\1 \\2',beer_df$address)
beer_df$address <- gsub('([[:digit:]])([[:upper:]])', '\\1 \\2',beer_df$address)
beer_df$address <- gsub('(\\.)([[:upper:]])', '\\1 \\2',beer_df$address)
beer_df$address <- gsub('([[:upper:]])([[:upper:]])', '\\1 \\2',beer_df$address)

# change the rating and review columns into numerics
# Going through as.character() is correct whether as.data.frame() produced
# factor columns or character columns. Calling as.numeric() directly on a
# factor returns the internal level codes rather than the printed numbers, and
# levels() of a character column is NULL (which would turn every value into
# NA).
beer_df$beer_avg <- as.numeric(as.character(beer_df$beer_avg))
beer_df$brewery_rating <- as.numeric(as.character(beer_df$brewery_rating))
beer_df$number_of_reviews <- as.numeric(as.character(beer_df$number_of_reviews))

# Breweries ranked by average beer rating: keep rows with a known review count
# and a known beer average where the review count is at least 100 OR the beer
# average is at least 4, best beer average first (ties: most reviews first).
# NOTE(review): this selection combines its thresholds with "|" while the
# brewery selection below uses "&" -- confirm the difference is intended.
top_beer <- beer_df %>%
  filter(
    !is.na(number_of_reviews),
    !is.na(beer_avg),
    number_of_reviews >= 100 | beer_avg >= 4
  ) %>%
  arrange(desc(beer_avg), desc(number_of_reviews))

# Breweries ranked by brewery rating: keep rows with a known review count and
# a known brewery rating, with at least 50 reviews AND a rating of at least 4,
# best rating first (ties: most reviews first).
top_brewery <- beer_df %>%
  filter(
    !is.na(number_of_reviews),
    !is.na(brewery_rating),
    number_of_reviews >= 50 & brewery_rating >= 4
  ) %>%
  arrange(desc(brewery_rating), desc(number_of_reviews))

Create location coordinates

library(htmlwidgets)
library(leaflet)
library(ggmap)
library(dplyr)
library(RColorBrewer)

# Geocode the brewery addresses (look up longitude and latitude) through
# ggmap's geocode() with the Google source, then store the coordinates next to
# each row of the two selections.
# NOTE(review): newer ggmap releases need a Google API key registered with
# register_google() before this call works -- confirm for the installed version.

# Breweries selected by beer rating
geo <- geocode(location = top_beer$address, output = "latlon", source = "google")
top_beer[["lon"]] <- geo[["lon"]]
top_beer[["lat"]] <- geo[["lat"]]

# Breweries selected by brewery rating
geo2 <- geocode(location = top_brewery$address, output = "latlon", source = "google")
top_brewery[["lon"]] <- geo2[["lon"]]
top_brewery[["lat"]] <- geo2[["lat"]]

Results

Mapping of breweries based on their beer ratings.

Results

Mapping of the breweries based on the brewery ratings.

Challenges and lessons learned

Some of the challenges faced were:

Address formatting, which caused wrong coordinates to be calculated.
Something new I learned in this project was the leaflet package and ioslides.

Introduction

Motivation

Overview of data gathered

Code to scrape the Data.

The results after scraping

Cleansing the data

Create location coordinates

Results

Results

Results

Challenges and lessons learned

References