Lab 5

Author

Julie Milligan

options(scipen=999) 
library(tidyverse)
library(socviz)
library(datasetsICR)

PART 0: Useful code for customers{datasetsICR}

Since I am using the customers dataset, I will need to transform the Channel and Region from integer into factor variables, using text for the levels. The data dictionary for the dataset provides the factor values. The code below helps to accomplish this task :-).

## Load the customers dataset and preview it; the chunks that follow rename
## the levels of its Channel and Region variables.
library(datasetsICR)
data("customers")
head(customers, n = 6)

  Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
1       2      3 12669 9656    7561    214             2674       1338
2       2      3  7057 9810    9568   1762             3293       1776
3       2      3  6353 8808    7684   2405             3516       7844
4       1      3 13265 1196    4221   6404              507       1788
5       2      3 22615 5410    7198   3915             1777       5185
6       2      3  9413 8259    5126    666             1795       1451

# Replace the numeric level names of a factor with descriptive labels.
#   f      : a factor whose levels are numeric codes stored as text
#   labels : named character vector, names = old level, values = new level
# Levels that are not named in `labels` are left untouched.
relabel_levels <- function(f, labels) {
  matched <- levels(f) %in% names(labels)
  levels(f)[matched] <- unname(labels[levels(f)[matched]])
  f
}

# CHANNEL VARIABLE
#   new lower-case factor column; codes 1 and 2 become descriptors
customers$channel <- relabel_levels(
  as.factor(customers$Channel),
  c("1" = "Hotel-Restaurant", "2" = "Retail")
)

# REGION VARIABLE
#   new lower-case factor column; codes 1, 2 and 3 become region names
customers$region <- relabel_levels(
  as.factor(customers$Region),
  c("1" = "Lisbon", "2" = "Oporto", "3" = "Other")
)

# DROP THE ORIGINAL INTEGER-CODED COLUMNS TO AVOID CONFUSION
# The columns are removed by name rather than by position, so the result does
# not depend on how many columns the data has or what order they are in.
# any_of() also keeps this step safe to re-run after Channel and Region have
# already been removed. Column names are case-sensitive, so the new
# lower-case `channel` and `region` factors are kept.
customers <- customers %>%
  select(-any_of(c("Channel", "Region")))
head(customers)

  Fresh Milk Grocery Frozen Detergents_Paper Delicassen          channel region
1 12669 9656    7561    214             2674       1338           Retail  Other
2  7057 9810    9568   1762             3293       1776           Retail  Other
3  6353 8808    7684   2405             3516       7844           Retail  Other
4 13265 1196    4221   6404              507       1788 Hotel-Restaurant  Other
5 22615 5410    7198   3915             1777       5185           Retail  Other
6  9413 8259    5126    666             1795       1451           Retail  Other

PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES

library(datasetsICR)
library(dplyr)
library(ggplot2)

# Count the customers in every region/channel combination, then express each
# channel as a share of its own region.
pip1 <- customers %>%
  group_by(region, channel) %>%
  # Drop only the `channel` grouping level, and say so explicitly: the result
  # must stay grouped by region so that sum(N) below is the regional total
  # rather than the grand total.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(
    freq = N / sum(N),            # within-region proportion
    pct = round(freq * 100, 0)    # whole-number percent used as chart labels
  )
pip1

# A tibble: 6 × 5
# Groups:   region [3]
  region channel              N  freq   pct
  <fct>  <fct>            <int> <dbl> <dbl>
1 Lisbon Hotel-Restaurant    59 0.766    77
2 Lisbon Retail              18 0.234    23
3 Oporto Hotel-Restaurant    28 0.596    60
4 Oporto Retail              19 0.404    40
5 Other  Hotel-Restaurant   211 0.668    67
6 Other  Retail             105 0.332    33

# Keep only the summary rows that have no missing values in any column
pip1 <- pip1[complete.cases(pip1), ]

From a quick glance, we can tell that the hotel-restaurant channel accounts for a larger share of customers than retail in every region. Neither Lisbon nor Oporto has the highest number of observations (the “Other” region does), but the highest within-region share, about 77%, comes from Lisbon's hotel-restaurant channel.

PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES

# Title and caption used by the stacked, dodged and faceted charts
p_title <- "Channel by Region"
p_caption <- "customers dataset"

# AS STACKED BAR CHART
# Base plot: region on the x-axis, percent as the bar height, channel as the
# fill colour. Rows with a missing region or channel are left out.
p <- ggplot(
  data = subset(pip1, !(is.na(region) | is.na(channel))),
  mapping = aes(x = region, y = pct, fill = channel)
)

# Stack the channels within each region and print each percent in the middle
# of its segment
p +
  geom_col(position = "stack") +
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5)) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Major Region",
    y = "Percent",
    fill = "Channel"
  )

# AS DODGED BAR CHART
# Same base plot `p`, with the two channels drawn side by side within each
# region and each bar labelled with its percent
p +
  geom_col(position = "dodge2") +
  geom_text(aes(label = pct), position = position_dodge(width = 0.9)) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Major Region",
    y = "Percent",
    fill = "Channel"
  )

# AS FACETED HORIZONTAL BAR CHART
# One panel per region (facet_grid), with horizontal bars (coord_flip).
# The base plot `p` maps x to region. Because the panels are already split by
# region and the fill legend is hidden, that mapping left the two bars in each
# panel with nothing identifying their channel, and every panel repeated all
# three regions on its axis. Overriding x with channel puts the channel names
# on the axis, which is what makes hiding the fill legend safe.
p + aes(x = channel) +
  geom_col(position = "dodge2") +
  geom_text(aes(label = pct), position = position_dodge2(width = 1)) +
  labs(x = NULL,
       y = "Percent",
       fill = "Channel",
       title = p_title,
       caption = p_caption,
       subtitle = "As a faceted horizontal bar chart") +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ region)

Each of the three bar charts shows the relationship between the two customer channels and their respective regions. Rather than showing raw counts, the charts are set up to show the percent of customers in each channel within a region.

PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE – Fresh and Grocery

# Per-region summary: number of customers, mean annual spending on Fresh and
# on Grocery (missing values ignored), and each region's share of customers.
pip2 <- customers %>%
  group_by(region) %>%
  summarize(
    N = n(),
    fresh_mean = mean(Fresh, na.rm = TRUE),
    grocery_mean = mean(Grocery, na.rm = TRUE)
  ) %>%
  mutate(
    freq = N / sum(N),
    pct = round(100 * freq, 0)
  )
pip2

# A tibble: 3 × 6
  region     N fresh_mean grocery_mean  freq   pct
  <fct>  <int>      <dbl>        <dbl> <dbl> <dbl>
1 Lisbon    77     11102.        7403. 0.175    18
2 Oporto    47      9888.        9219. 0.107    11
3 Other    316     12533.        7896. 0.718    72

I chose to use “region” as the categorical variable for this part along with “fresh” and “grocery” as the two continuous variables. The code above produces the means of both continuous variables in each of the three regions.

PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE

# Scatterplot of the regional means: Grocery on x, Fresh on y, one point per
# region, coloured by region.
p <- ggplot(pip2,
            mapping = aes(x = grocery_mean, y = fresh_mean, color = region))

p + geom_point(size = 5) +
  # Shaded box around the area where the three regional means fall
  annotate(geom = "rect", xmin = 7000, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # Note placed just under the box, right-aligned with the box's right edge.
  # It is positioned in data units: anchoring it at (0, 0) and pushing it
  # across the panel with large negative hjust/vjust multiples made its
  # position depend on the rendered text size and the figure dimensions.
  annotate(geom = "text", x = 9545, y = 8700, hjust = 1, vjust = 1,
           label = "Higher annual amounts\nspent from each region.") +
  # Keep both axes starting at zero on purpose (this used to be only a side
  # effect of the note being anchored at the origin).
  expand_limits(x = 0, y = 0) +
  labs(y = "Fresh Goods", x = "Grocery",
       title = "Average Annual Spending on Fresh Goods and Groceries by Region",
       subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.",
       caption = "customers dataset{datasetsICR}")

The chart above is a simple scatterplot that plots the two continuous variables for the three different regions. In this plot, the three main points are relatively close together and because of this, I wanted to use the annotate() function with “geom = ‘rect’” to highlight the general area that they reside in. Under the highlighted section you will find a quick and simple interpretation of the data that was created using annotate() with “geom = ‘text’”.

PART 5: LEGEND AND GUIDES

# Add a plain character copy of the region factor and sort the rows by it
pip2[["region.c"]] <- as.character(pip2[["region"]])
pip2 <- pip2[order(pip2[["region.c"]]), ]
pip2

# A tibble: 3 × 7
  region     N fresh_mean grocery_mean  freq   pct region.c
  <fct>  <int>      <dbl>        <dbl> <dbl> <dbl> <chr>   
1 Lisbon    77     11102.        7403. 0.175    18 Lisbon  
2 Oporto    47      9888.        9219. 0.107    11 Oporto  
3 Other    316     12533.        7896. 0.718    72 Other

# Same scatterplot, coloured by the character copy of region, with a styled
# legend drawn inside the plotting area.
p <- ggplot(pip2, aes(x = grocery_mean, y = fresh_mean, color = region.c))

p + geom_point(size = 5) +
  # Shaded box around the area where the three regional means fall
  annotate(geom = "rect", xmin = 7000, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # Note placed to the left of the box, vertically centred on it, because the
  # legend occupies the space under the box. It is positioned in data units:
  # anchoring it at (0, 0) and pushing it with large negative hjust/vjust
  # multiples made its position depend on the rendered text size and the
  # figure dimensions.
  annotate(geom = "text", x = 6800, y = 11150, hjust = 1, vjust = 0.5,
           label = "Higher annual amounts\nspent from each region.") +
  # Keep both axes starting at zero on purpose (this used to be only a side
  # effect of the note being anchored at the origin); the legend position
  # below was chosen for that layout.
  expand_limits(x = 0, y = 0) +
  labs(y = "Fresh Goods",
       x = "Grocery",
       color = "Major \nRegions",
       title = "Average Annual Spending on Fresh Goods and Groceries by Region",
       subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.",
       caption = "customers dataset{datasetsICR}") +
  # NOTE(review): a numeric legend.position is reported as deprecated from
  # ggplot2 3.5.0 in favour of legend.position = "inside" plus
  # legend.position.inside; left unchanged because the installed ggplot2
  # version is not visible here - confirm before switching.
  theme(legend.title = element_text(color = "gray40", size = 14, face = "bold"),
        legend.position = c(x = 0.88, y = .36))

Very similar to the chart from part 4, you will find the same data points highlighted using the same function. However, this scatterplot contains a legend that is inside the chart rather than on the outside. I placed this legend directly under the data points for an easy read. However, because of the placement I chose for the legend, I had to move the data point note to the left of the data rather than directly under.

PART 6: DATA LABELS VS LEGEND

# Same scatterplot, with each point labelled by its region instead of using a
# legend.
p <- ggplot(pip2, aes(x = grocery_mean, y = fresh_mean, color = region.c))

p + geom_point(size = 5) +
  # Shaded box, extended to the left so the region labels fit inside it
  annotate(geom = "rect", xmin = 6300, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # Region name printed to the left of each point
  geom_text(mapping = aes(label = region), hjust = 1.3, size = 4) +
  # Note placed just under the box, right-aligned with the box's right edge.
  # It is positioned in data units: anchoring it at (0, 0) and pushing it
  # across the panel with large negative hjust/vjust multiples made its
  # position depend on the rendered text size and the figure dimensions.
  annotate(geom = "text", x = 9545, y = 8700, hjust = 1, vjust = 1,
           label = "Higher annual amounts\nspent from each region.") +
  # Keep both axes starting at zero on purpose (this used to be only a side
  # effect of the note being anchored at the origin).
  expand_limits(x = 0, y = 0) +
  labs(y = "Fresh Goods",
       x = "Grocery",
       title = "Average Annual Spending on Fresh Goods and Groceries by Region",
       subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.",
       caption = "customers dataset{datasetsICR}") +
  # The direct labels replace the legend
  theme(legend.position = "none")

Combining all the characteristics from the two previous scatterplots, you will find another very similar chart above. Rather than having a legend at all, I opted to label each individual point with their respective region instead. Because of this, I had to expand the highlighted area ever so slightly and ended up placing the data point note back under that open region.

PART 7: INTERPRETATION

# Final scatterplot: direct region labels, no legend, and a two-line subtitle
# that states the interpretation.
p <- ggplot(pip2, aes(x = grocery_mean, y = fresh_mean, color = region.c))

p + geom_point(size = 5) +
  # Shaded box, extended to the left so the region labels fit inside it
  annotate(geom = "rect", xmin = 6300, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # Region name printed to the left of each point
  geom_text(mapping = aes(label = region), hjust = 1.3, size = 4) +
  # Note placed just under the box, right-aligned with the box's right edge.
  # It is positioned in data units: anchoring it at (0, 0) and pushing it
  # across the panel with large negative hjust/vjust multiples made its
  # position depend on the rendered text size and the figure dimensions.
  annotate(geom = "text", x = 9545, y = 8700, hjust = 1, vjust = 1,
           label = "Higher annual amounts\nspent from each region.") +
  # Keep both axes starting at zero on purpose (this used to be only a side
  # effect of the note being anchored at the origin).
  expand_limits(x = 0, y = 0) +
  labs(y = "Fresh Goods",
       x = "Grocery",
       title = "Average Annual Spending on Fresh Goods and Groceries by Region",
       subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.\nOporto spends more on groceries while other regions put more money towards fresh goods.",
       caption = "customers dataset{datasetsICR}") +
  # The direct labels replace the legend
  theme(legend.position = "none")

With this final scatterplot, we can interpret a few things. We can tell by the highlighted area that each individual region generally spends a higher amount annually on the two different types of goods. However, there are two regions that focus their spending more on one good than the other. Oporto tends to spend a higher amount annually on groceries, while the “other” region category spends much more on fresh goods. Lisbon spends the least on groceries and less on fresh goods than the “other” regions (though more than Oporto), but those amounts can still be considered high.