Case Study 1

R Markdown

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggpubr)
beers <- read_csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv")

## Rows: 2410 Columns: 7

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Style
## dbl (5): Beer_ID, ABV, IBU, Brewery_id, Ounces

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

breweries <- read_csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv")

## Rows: 558 Columns: 4

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, City, State
## dbl (1): Brew_ID

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

##How many breweries are present in each state?
#This table outlines the quantity of breweries in each state and will be used as the basis of the following charts.
numbystate <- data.frame(breweries %>% count(State))
numbystate

##    State  n
## 1     AK  7
## 2     AL  3
## 3     AR  2
## 4     AZ 11
## 5     CA 39
## 6     CO 47
## 7     CT  8
## 8     DC  1
## 9     DE  2
## 10    FL 15
## 11    GA  7
## 12    HI  4
## 13    IA  5
## 14    ID  5
## 15    IL 18
## 16    IN 22
## 17    KS  3
## 18    KY  4
## 19    LA  5
## 20    MA 23
## 21    MD  7
## 22    ME  9
## 23    MI 32
## 24    MN 12
## 25    MO  9
## 26    MS  2
## 27    MT  9
## 28    NC 19
## 29    ND  1
## 30    NE  5
## 31    NH  3
## 32    NJ  3
## 33    NM  4
## 34    NV  2
## 35    NY 16
## 36    OH 15
## 37    OK  6
## 38    OR 29
## 39    PA 25
## 40    RI  5
## 41    SC  4
## 42    SD  1
## 43    TN  3
## 44    TX 28
## 45    UT  4
## 46    VA 16
## 47    VT 10
## 48    WA 23
## 49    WI 20
## 50    WV  1
## 51    WY  4

numbystate$State <- as.factor(numbystate$State)

#bar graph depicting the number of breweries in each state
numbystate %>% ggplot(aes(x = reorder(State, n), alpha = n, y = n, fill = State)) + 
  geom_bar(stat = 'identity', fill = "steelblue3") + 
  coord_flip() + 
  theme_classic() + 
  theme(legend.position = "none") +
  labs(x = "Count", y = "State") +
  ggtitle("Number of Breweries per State")

#Different chart (polar coordinates) giving a better visualization of relative amount of breweries in each state.
numbystate %>% ggplot(aes(x = State, y = n, fill = State)) + 
  geom_bar(stat = 'identity') + 
  coord_polar() + theme_bw() + 
  theme(legend.position = "none") + 
  labs(x = "State", y = "Count") + ggtitle("Number of Breweries per State")

##Address the missing values in each column.
sum(is.na(breweries))

## [1] 0

#As we can see, the breweries data set holds no missing values, thus no need to eliminate any missing values.

sum(is.na(beers))

## [1] 1072

#The beers dataset has 1072, so we must eliminate noted values.

beers_clean <- beers %>% filter(!is.na(Name)) %>% filter(!is.na(Beer_ID)) %>% 
  filter(!is.na(ABV)) %>% filter(!is.na(IBU)) %>% 
  filter(!is.na(Brewery_id)) %>% filter(!is.na(Style)) %>% 
  filter(!is.na(Ounces))

#Removed 1007 rows due to missing values.

##Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

#In order to join the two data sets successfully, we will need to find a primary key and a foreign key by which to join by. Let's examine the column names.

colnames(breweries)

## [1] "Brew_ID" "Name"    "City"    "State"

colnames(beers_clean)

## [1] "Name"       "Beer_ID"    "ABV"        "IBU"        "Brewery_id"
## [6] "Style"      "Ounces"

#As we can see, Brewery ID would be a good key to join them; however, we must make sure that they have a common spelling. Also, even though 'Name" makes sense for each respective dataset, we will need to change it in order to clarify what the column is indeed naming when joining.

breweries$Brewery = breweries$Name
breweries$Brewery_id = breweries$Brew_ID
head(breweries)

## # A tibble: 6 × 6
##   Brew_ID Name                      City          State Brewery       Brewery_id
##     <dbl> <chr>                     <chr>         <chr> <chr>              <dbl>
## 1       1 NorthGate Brewing         Minneapolis   MN    NorthGate Br…          1
## 2       2 Against the Grain Brewery Louisville    KY    Against the …          2
## 3       3 Jack's Abby Craft Lagers  Framingham    MA    Jack's Abby …          3
## 4       4 Mike Hess Brewing Company San Diego     CA    Mike Hess Br…          4
## 5       5 Fort Point Beer Company   San Francisco CA    Fort Point B…          5
## 6       6 COAST Brewing Company     Charleston    SC    COAST Brewin…          6

#Let's create a new dataframe with the relevant columns from 'breweries' before joining.

breweries_clean <- breweries %>% select(Brewery, City, State, Brewery_id)
join <- inner_join(breweries_clean, beers_clean, by = "Brewery_id")

#From the joined data frame, let's select the relevant columns.

medABVIBU <- join %>% select(State, ABV, IBU)
head(medABVIBU)

## # A tibble: 6 × 3
##   State   ABV   IBU
##   <chr> <dbl> <dbl>
## 1 MN    0.045    50
## 2 MN    0.049    26
## 3 MN    0.048    19
## 4 MN    0.06     38
## 5 MN    0.06     25
## 6 MN    0.056    47

#Having the relevant columns to work with, lets create a new one with the median ABV of each state and call ot 'medABV'.
medABV <- medABVIBU %>% group_by(State) %>% mutate(medianABV = median(ABV)*100) %>% select(State, medianABV) %>% arrange(State) %>% distinct(State, medianABV)
head(medABV)

## # A tibble: 6 × 2
## # Groups:   State [6]
##   State medianABV
##   <chr>     <dbl>
## 1 AK          5.7
## 2 AL          6  
## 3 AR          4  
## 4 AZ          5.5
## 5 CA          5.8
## 6 CO          6.5

#As you can see, there is a row for every beer produced in each state. In order to plot the median correctly, we will need to delete duplicates so that we may only have one row (median) per state.



##Data set is now ready to plot.

medABV %>% ggplot(aes(x = reorder(State, medianABV), y = medianABV, fill = State, alpha = medianABV)) + 
  geom_bar(stat = 'identity', fill = "steelblue3") + 
  coord_flip() +
  theme(legend.position = "none") +
  theme_classic() +
  labs(x = "State", y = "Median ABV") +
  ggtitle("Median ABV by State")

##We will follow a similar process to derive median IBU by State.

medIBU <- medABVIBU %>% 
  group_by(State) %>% 
  mutate(medianIBU = median(IBU)) %>% 
  select(State, medianIBU) %>% 
  arrange(State) %>% 
  distinct(State, medianIBU)

head(medIBU)

## # A tibble: 6 × 2
## # Groups:   State [6]
##   State medianIBU
##   <chr>     <dbl>
## 1 AK           46
## 2 AL           43
## 3 AR           39
## 4 AZ           20
## 5 CA           42
## 6 CO           40

#And plot in a similar manner.

medIBU %>% ggplot(aes(x = reorder(State, medianIBU), y = medianIBU, fill = State, alpha = medianIBU)) + 
  geom_bar(stat = 'identity', fill = "steelblue3") + 
  coord_flip() +
  theme(legend.position = "none") +
  theme_classic() +
  labs(x = "State", y = "Median IBU") +
  ggtitle("Median IBU by State")

##Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?

#We will create a new data frame with relevant columns to explor this question.
maxABVIBU <- join %>% select(Brewery, Name, State, ABV, IBU)

#We need to identify the highest ABV level
max(maxABVIBU$ABV)

## [1] 0.125

maxABVIBU %>% filter(ABV == "0.125")

## # A tibble: 1 × 5
##   Brewery                   Name           State   ABV   IBU
##   <chr>                     <chr>          <chr> <dbl> <dbl>
## 1 Against the Grain Brewery London Balling KY    0.125    80

## We can see that the beer with the highest ABV belongs to the state of Kentucky (KY) with an ABV of 12.5. The beer is "London Balling" by Against the Grain Brewery.

#We follow the same process for IBU.
max(maxABVIBU$IBU)

## [1] 138

maxABVIBU %>% filter(IBU == "138")

## # A tibble: 1 × 5
##   Brewery                 Name                      State   ABV   IBU
##   <chr>                   <chr>                     <chr> <dbl> <dbl>
## 1 Astoria Brewing Company Bitter Bitch Imperial IPA OR    0.082   138

#The beer with the highest IBU belongs to the state of Oregon (OR) with an IBU of 138. The beer is "Bitter Bitch Imperial IPA" by Astoria Brewing Company.

##Comment on the summary statistics and distribution of the ABV variable.
summary(maxABVIBU$ABV)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02700 0.05000 0.05700 0.05992 0.06800 0.12500

#Histogram
maxABVIBU %>% ggplot(aes(x = ABV)) + 
  geom_histogram(fill = "steelblue3") + 
  theme_classic() +
  labs(x = "ABV", y = "") +
  ggtitle("Distribution of ABV")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Boxplot
maxABVIBU %>% ggplot(aes(x = ABV)) + 
  geom_boxplot(fill = "steelblue3") + 
  theme_classic() +
  labs(x = "ABV", y = "") +
  ggtitle("Distribution of ABV")

##Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot.  Make your best judgment of a relationship and EXPLAIN your answer.

#As we saw earlier, the distributions of both the IBU and ABV columns were right skewed, it'd be helpful to apply a log transformation before plotting. In the plot, we'd also like to include a linear regression line as well as its corresponding equation to better examine its relationship. 

maxABVIBU %>% ggplot(aes(x = log(IBU), y = log(ABV))) +
  geom_point(color = "steelblue3") +
  geom_smooth(method = "lm", color = "grey49") +
  stat_regline_equation() +
  theme_classic() +
  labs(x = "IBU", y = "ABV") +
  ggtitle("Relationship between IBU and ABV")

## `geom_smooth()` using formula 'y ~ x'

##From a visual inspection as well as a simple linear regression model, we can say that there is an apparent significant positive relationship between IBU and ABV (p-value = <.0001), with a 0.2-unit increase in ABV for every unit increase in IBU.

Case Study 1

Joaquin

10/8/2021

R Markdown