This is a data set in which 1.5 million rows of information on various brands of beer has been provided .A dealer wants to understand what kind of product (beer) is being preferred by customers .Now, we will understand this huge data set and pull the information what the dealer wants.
rm(list=ls())
#1.Loading the data set :
setwd ("D:/Raviteja/Raviteja Professional/Data Science/my projects/new-working projects")
beer <- read.csv("beer_reviews.csv", header = TRUE, sep = ",")
library(gridExtra)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
#2.Brief of the data set:
#variables in the data set:
names(beer)
## [1] "brewery_id" "brewery_name" "review_time"
## [4] "review_overall" "review_aroma" "review_appearance"
## [7] "review_profilename" "beer_style" "review_palate"
## [10] "review_taste" "beer_name" "beer_abv"
## [13] "beer_beerid"
summary(beer)
## brewery_id brewery_name
## Min. : 1 Boston Beer Company (Samuel Adams): 39444
## 1st Qu.: 143 Dogfish Head Brewery : 33839
## Median : 429 Stone Brewing Co. : 33066
## Mean : 3130 Sierra Nevada Brewing Co. : 28751
## 3rd Qu.: 2372 Bell's Brewery, Inc. : 25191
## Max. :28003 Rogue Ales : 24083
## (Other) :1402240
## review_time review_overall review_aroma review_appearance
## Min. :8.407e+08 Min. :0.000 Min. :1.000 Min. :0.000
## 1st Qu.:1.173e+09 1st Qu.:3.500 1st Qu.:3.500 1st Qu.:3.500
## Median :1.239e+09 Median :4.000 Median :4.000 Median :4.000
## Mean :1.224e+09 Mean :3.816 Mean :3.736 Mean :3.842
## 3rd Qu.:1.289e+09 3rd Qu.:4.500 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.326e+09 Max. :5.000 Max. :5.000 Max. :5.000
##
## review_profilename beer_style
## northyorksammy: 5817 American IPA : 117586
## BuckeyeNation : 4661 American Double / Imperial IPA : 85977
## mikesgroove : 4617 American Pale Ale (APA) : 63469
## Thorpe429 : 3518 Russian Imperial Stout : 54129
## womencantsail : 3497 American Double / Imperial Stout: 50705
## NeroFiddled : 3488 American Porter : 50477
## (Other) :1561016 (Other) :1164271
## review_palate review_taste
## Min. :1.000 Min. :1.000
## 1st Qu.:3.500 1st Qu.:3.500
## Median :4.000 Median :4.000
## Mean :3.744 Mean :3.793
## 3rd Qu.:4.000 3rd Qu.:4.500
## Max. :5.000 Max. :5.000
##
## beer_name beer_abv
## 90 Minute IPA : 3290 Min. : 0.01
## India Pale Ale : 3130 1st Qu.: 5.20
## Old Rasputin Russian Imperial Stout: 3111 Median : 6.50
## Sierra Nevada Celebration Ale : 3000 Mean : 7.04
## Two Hearted Ale : 2728 3rd Qu.: 8.50
## Arrogant Bastard Ale : 2704 Max. :57.70
## (Other) :1568651 NA's :67785
## beer_beerid
## Min. : 3
## 1st Qu.: 1717
## Median :13906
## Mean :21713
## 3rd Qu.:39441
## Max. :77317
##
sapply(beer, function(x) sum(is.na(x)))
## brewery_id brewery_name review_time
## 0 0 0
## review_overall review_aroma review_appearance
## 0 0 0
## review_profilename beer_style review_palate
## 0 0 0
## review_taste beer_name beer_abv
## 0 0 67785
## beer_beerid
## 0
#Only the variable beer_abv is having missed values which we can ignore as the data is huge.
#Taking out the rows that has missing values
beer= na.omit(beer)
#3.Now, let us see the questions that the dealer have :
#Which brewery produces the strongest beers by ABV%?
#If i have to pick 3 beers to recommend using only this data, which one should i pick?
#Which of the factors (aroma, taste, appearance, palette) are most important in determining the overall quality of a beer?
#Lastly, if I typically enjoy a beer due to its aroma and appearance, which beer style should I try?
#4.Answering questions:
#1.Which brewery produces the strongest beers by ABV%?
# there is a library called "sqldf", using which we can write 'sql' language and pull the required information from the data.
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
top_abv<-sqldf("SELECT brewery_name,AVG(beer_abv) FROM beer GROUP BY brewery_name ORDER BY AVG(beer_abv) DESC")
## Loading required package: tcltk
#Let us see the top 5 breweries that produces the strongest beers by ABV% :
print(head(top_abv,5))
## brewery_name AVG(beer_abv)
## 1 Schorschbräu 19.22882
## 2 Shoes Brewery 15.20000
## 3 Rome Brewing Company 13.84000
## 4 Hurlimann Brewery 13.75000
## 5 Alt-Oberurseler Brauhaus 13.20000
#2.If i have to pick 3 beers to recommend using only this data, which one should i pick?
beer_rating1 <- group_by(beer, beer_name)
m1<- summarise(beer_rating1, rating_mean= mean(review_overall))
beer_by_rating <- arrange(m1,-rating_mean)
head(beer_by_rating,3)
## Source: local data frame [3 x 2]
##
## beer_name rating_mean
## (fctr) (dbl)
## 1 10th Anniversary Strong Belgian 5
## 2 2005 Grand Cru 5
## 3 2006 Barley Wine (Triple IPA) 5
#3. Which of the factors (aroma, taste, appearance, palette) are most important in determining the overall quality of a beer?
#We need to calculate the corelation coefficinets of the above mentioned variables with the overall_rating :
corelations<- cor(beer[ ,c(4,5,6,9,10)])
View(corelations)
library(corrplot)
corrplot(corelations, method="circle")
#So from the above analysis we can suggest to the dealer that the taste is very important in determining the overall_rating
#4.Lastly, if I typically enjoy a beer due to its aroma and appearance, which beer style should I try?
beerdata1<-aggregate( beer$review_aroma,list(b1=beer$beer_style),FUN=mean)
beerdata2<-aggregate( beer$review_appearance,list(b1=beer$beer_style),FUN=mean)
table<- inner_join(beerdata1,beerdata2,by=c('b1'))
library('data.table')
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
setnames(table,"x.x","aroma")
setnames(table,"x.y","appearance")
table1<-table[order(-table$aroma,-table$appearance),]
head(table1,1)
## b1 aroma appearance
## 12 American Double / Imperial Stout 4.161538 4.16432
names(beer)
## [1] "brewery_id" "brewery_name" "review_time"
## [4] "review_overall" "review_aroma" "review_appearance"
## [7] "review_profilename" "beer_style" "review_palate"
## [10] "review_taste" "beer_name" "beer_abv"
## [13] "beer_beerid"
#One more interesting thing is that the people are loving beer a lot, which provides a very good opportunity for any one who wants to come in to this business.See the below graphs:
m5=qplot(x=review_overall,data=beer,binwidth=0.05)+scale_x_continuous(limits=c(1,5),breaks=seq(1,5,0.5))
m2=qplot(x=review_aroma,data=beer,binwidth=0.05)+scale_x_continuous(limits=c(1,5),breaks=seq(1,5,0.5))
m3=qplot(x=review_appearance,data=beer,binwidth=0.05)+scale_x_continuous(limits=c(1,5),breaks=seq(1,5,0.5))
m4=qplot(x=review_taste,data=beer,binwidth=0.05)+scale_x_continuous(limits=c(1,5),breaks=seq(1,5,0.5))
m1=qplot(x=review_palate,data=beer,binwidth=0.05)+scale_x_continuous(limits=c(1,5),breaks=seq(1,5,0.5))
grid.arrange(m1,m2,m3,m4,m5,ncol=2)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 7 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 7 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
#More than half of the users have rated the beer with ratings of 4 and 4.5 for all the qualities of the beer.