Introduction

For my data analysis assignment I decided to explore the alcohol consumption by country and region data set from https://data.worldbank.org/indicator/SH.ALC.PCAP.LI. The data sets I used measure consumption in liters per captia for each country and also include each country’s income group. I was interested in finding which regions overall consumed the most alcohol from 2000-2018, finding the top 8 Alcohol consuming countries in 2018 and what their income group was and lastly, I compared consumption from the US, Germany and UK.

Alcohol Consumption in Liters by Region (combined) from 2000-2018

library(readxl)
Alcohol_Consumption_By_Country <- read_excel("~/Desktop/HW5/Alcohol Consumption_By Country.xls")
Alcohol_Consumption_By_Country
## # A tibble: 1,330 × 4
##    `Country Name`              `Country Code`  Year Liters
##    <chr>                       <chr>          <dbl>  <dbl>
##  1 Aruba                       ABW             2000 NA    
##  2 Africa Eastern and Southern AFE             2000  5.01 
##  3 Afghanistan                 AFG             2000 NA    
##  4 Africa Western and Central  AFW             2000  8.09 
##  5 Angola                      AGO             2000  2.76 
##  6 Albania                     ALB             2000  6.57 
##  7 Andorra                     AND             2000 13.3  
##  8 Arab World                  ARB             2000  0.673
##  9 United Arab Emirates        ARE             2000  2.58 
## 10 Argentina                   ARG             2000  8.75 
## # … with 1,320 more rows
Region_Income <- read_excel("~/Desktop/HW5/Region_Income.xlsx")
Region_Income
## # A tibble: 265 × 3
##    `Country Code` Region                     IncomeGroup        
##    <chr>          <chr>                      <chr>              
##  1 ABW            Latin America & Caribbean  High income        
##  2 AFE            <NA>                       <NA>               
##  3 AFG            South Asia                 Low income         
##  4 AFW            <NA>                       <NA>               
##  5 AGO            Sub-Saharan Africa         Lower middle income
##  6 ALB            Europe & Central Asia      Upper middle income
##  7 AND            Europe & Central Asia      High income        
##  8 ARB            <NA>                       <NA>               
##  9 ARE            Middle East & North Africa High income        
## 10 ARG            Latin America & Caribbean  Upper middle income
## # … with 255 more rows

First I used inner_join to combine the data sets by Country Code

library(tidyverse)
Alcohol_Consumption_Region <- inner_join(Alcohol_Consumption_By_Country, Region_Income, by = c("Country Code" = "Country Code"))
Alcohol_Consumption_Region
## # A tibble: 1,325 × 6
##    `Country Name`              `Country Code`  Year Liters Region    IncomeGroup
##    <chr>                       <chr>          <dbl>  <dbl> <chr>     <chr>      
##  1 Aruba                       ABW             2000 NA     Latin Am… High income
##  2 Africa Eastern and Southern AFE             2000  5.01  <NA>      <NA>       
##  3 Afghanistan                 AFG             2000 NA     South As… Low income 
##  4 Africa Western and Central  AFW             2000  8.09  <NA>      <NA>       
##  5 Angola                      AGO             2000  2.76  Sub-Saha… Lower midd…
##  6 Albania                     ALB             2000  6.57  Europe &… Upper midd…
##  7 Andorra                     AND             2000 13.3   Europe &… High income
##  8 Arab World                  ARB             2000  0.673 <NA>      <NA>       
##  9 United Arab Emirates        ARE             2000  2.58  Middle E… High income
## 10 Argentina                   ARG             2000  8.75  Latin Am… Upper midd…
## # … with 1,315 more rows

Then I removed Country Name, Country Code and Income group by using Subset() and Select()

Alcohol_Consumption_Region <- subset(Alcohol_Consumption_Region, select = -c(`Country Name`, `Country Code`, `IncomeGroup`))
Alcohol_Consumption_Region
## # A tibble: 1,325 × 3
##     Year Liters Region                    
##    <dbl>  <dbl> <chr>                     
##  1  2000 NA     Latin America & Caribbean 
##  2  2000  5.01  <NA>                      
##  3  2000 NA     South Asia                
##  4  2000  8.09  <NA>                      
##  5  2000  2.76  Sub-Saharan Africa        
##  6  2000  6.57  Europe & Central Asia     
##  7  2000 13.3   Europe & Central Asia     
##  8  2000  0.673 <NA>                      
##  9  2000  2.58  Middle East & North Africa
## 10  2000  8.75  Latin America & Caribbean 
## # … with 1,315 more rows

I removed missing values using na.omit()

Alcohol_Consumption_Region <- na.omit(Alcohol_Consumption_Region) 
Alcohol_Consumption_Region
## # A tibble: 951 × 3
##     Year Liters Region                    
##    <dbl>  <dbl> <chr>                     
##  1  2000   2.76 Sub-Saharan Africa        
##  2  2000   6.57 Europe & Central Asia     
##  3  2000  13.3  Europe & Central Asia     
##  4  2000   2.58 Middle East & North Africa
##  5  2000   8.75 Latin America & Caribbean 
##  6  2000   4.23 Europe & Central Asia     
##  7  2000   5.13 Latin America & Caribbean 
##  8  2000  11.7  East Asia & Pacific       
##  9  2000  13.1  Europe & Central Asia     
## 10  2000   3.45 Europe & Central Asia     
## # … with 941 more rows

I used the group_by() function to group by Region, Year and Liters. Then I used summarise() to sum total liters per captia for each region. Lastly, I plotted the data using a plotly bar chart

Alcohol_Consumption_Region <- Alcohol_Consumption_Region %>% group_by(Region, Year, Liters) %>% summarize(Liters=sum(Liters))
library(plotly)
Alcohol_Consumption_plot <- plot_ly(Alcohol_Consumption_Region, x=~Year, y= ~Liters, color = ~Region, type = "bar") %>%
layout(title = 'Alcohol Consumption in Liters by Region (combined) from 2000-2018')
Alcohol_Consumption_plot

Alcohol Consumption in Liters per Captia for Top 8 Countries in 2018

For this analysis I wanted to know the top 8 countries with the highest levels of consumption while also looking at similarities in income group. The country with the highest liters per captia in 2018 was categorized as a lower middle income country. I did not find a correlation between alcohol consumption and income group since I only looked at 8 countries.

I started my analysis by using inner_join() to combine the data sets by Country Code

Alcohol_Consumption_Top_8 <- inner_join(Alcohol_Consumption_By_Country, Region_Income, by = c("Country Code" = "Country Code")) 
Alcohol_Consumption_Top_8
## # A tibble: 1,325 × 6
##    `Country Name`              `Country Code`  Year Liters Region    IncomeGroup
##    <chr>                       <chr>          <dbl>  <dbl> <chr>     <chr>      
##  1 Aruba                       ABW             2000 NA     Latin Am… High income
##  2 Africa Eastern and Southern AFE             2000  5.01  <NA>      <NA>       
##  3 Afghanistan                 AFG             2000 NA     South As… Low income 
##  4 Africa Western and Central  AFW             2000  8.09  <NA>      <NA>       
##  5 Angola                      AGO             2000  2.76  Sub-Saha… Lower midd…
##  6 Albania                     ALB             2000  6.57  Europe &… Upper midd…
##  7 Andorra                     AND             2000 13.3   Europe &… High income
##  8 Arab World                  ARB             2000  0.673 <NA>      <NA>       
##  9 United Arab Emirates        ARE             2000  2.58  Middle E… High income
## 10 Argentina                   ARG             2000  8.75  Latin Am… Upper midd…
## # … with 1,315 more rows

Then I used select() to identify the columns I wanted to keep, I used filter() to only use the year 2018 and then I used slice_max to give me the top 8 variables in the Liters column

Alcohol_Consumption_Top_8 <- Alcohol_Consumption_Top_8 %>% 
  select(`Country Name`, Year, Liters, IncomeGroup) %>% 
  filter(Year == 2018) %>% 
  slice_max(Liters, n=10) %>%
  na.omit()
Alcohol_Consumption_Top_8
## # A tibble: 8 × 4
##   `Country Name`  Year Liters IncomeGroup        
##   <chr>          <dbl>  <dbl> <chr>              
## 1 Eswatini        2018   20.5 Lower middle income
## 2 Tuvalu          2018   15.1 Upper middle income
## 3 Cayman Islands  2018   14.4 High income        
## 4 Lesotho         2018   13.2 Lower middle income
## 5 Cyprus          2018   12.9 High income        
## 6 India           2018   12.9 Lower middle income
## 7 Lithuania       2018   12.8 High income        
## 8 Burkina Faso    2018   12.6 Low income

I plotted the data set using a bar chart

Alcohol_Consumption_8_plot <- plot_ly(data = Alcohol_Consumption_Top_8, x = ~ `Country Name`, y = ~ Liters, color = ~IncomeGroup) %>%
  add_bars() %>%
  layout(title = 'Alcohol Consumption in Liters for Top 8 Countries in 2018', xaxis = list(title = 'Country'), yaxis = list(title='Liters per captia'))
Alcohol_Consumption_8_plot

Alcohol Consumption Comparison United States, Germany & UK

Finally, I wanted to explore how much alcohol the US consumed in comparison to Germany and the UK. I cleaned the Alcohol_Consumption_By_Country dataset by using select() to only show the columns I was interested in, then I used filter() to filter by Country Name

Alcohol_Consumption_US_Germany_UK <- Alcohol_Consumption_By_Country %>%
  select(`Country Name`, Year, Liters) %>% 
  filter(`Country Name` %in% c("United States", "Germany", "United Kingdom"))
Alcohol_Consumption_US_Germany_UK
## # A tibble: 15 × 3
##    `Country Name`  Year Liters
##    <chr>          <dbl>  <dbl>
##  1 Germany         2000  14.2 
##  2 United Kingdom  2000  13.7 
##  3 United States   2000   9.19
##  4 Germany         2005   1.28
##  5 United Kingdom  2005   6.39
##  6 United States   2005   4.15
##  7 Germany         2010   9.65
##  8 United Kingdom  2010   4.26
##  9 United States   2010   7.03
## 10 Germany         2015   0.49
## 11 United Kingdom  2015   9.61
## 12 United States   2015   2.59
## 13 Germany         2018  11.2 
## 14 United Kingdom  2018   2.75
## 15 United States   2018   9.06

I used a line plot to compare alcohol consumption for the three countries from 2000 to 2018. Germany had the highest alcohol consumption throughout the years followed by the UK and the US

Alcohol_Consumption_US_Germany_UK_plot <- plot_ly(data = Alcohol_Consumption_US_Germany_UK, x = ~ Year, y = ~ Liters, color = ~`Country Name`, mode = 'lines+markers') %>% 
  layout(title = 'Alcohol Consumption Comparison United States, Germany & UK', xaxis = list(title = 'Country'), yaxis = list(title='Liters per captia'))
Alcohol_Consumption_US_Germany_UK_plot