Lab#5

Author

Jose Gabriel Rodriguez

# Strongly penalise scientific notation so numbers print in fixed form.
options(scipen = 999)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(socviz)
# Attach datasetsICR once (the duplicate library() call was a no-op) and load
# its `customers` data set. Naming the package keeps data() from picking up a
# same-named data set from any other attached package.
library(datasetsICR)
data(customers, package = "datasetsICR")
# Preview the first rows of the raw data.
head(customers)
  Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
1       2      3 12669 9656    7561    214             2674       1338
2       2      3  7057 9810    9568   1762             3293       1776
3       2      3  6353 8808    7684   2405             3516       7844
4       1      3 13265 1196    4221   6404              507       1788
5       2      3 22615 5410    7198   3915             1777       5185
6       2      3  9413 8259    5126    666             1795       1451
# Recode the integer-coded Channel and Region columns into labelled factors
# (new lower-case columns `channel` and `region`), then drop the original
# coded columns to avoid confusion.
#
# The coded columns are dropped by name rather than by position, so the step
# does not depend on how many columns the data frame has or on their order.
customers <- customers %>%
  mutate(
    # CHANNEL VARIABLE: 1 = Hotel-Restaurant, 2 = Retail
    channel = factor(
      Channel,
      levels = c(1, 2),
      labels = c("Hotel-Restaurant", "Retail")
    ),
    # REGION VARIABLE: 1 = Lisbon, 2 = Oporto, 3 = Other
    region = factor(
      Region,
      levels = c(1, 2, 3),
      labels = c("Lisbon", "Oporto", "Other")
    )
  ) %>%
  # DROP THE ORIGINAL CODED COLUMNS TO AVOID CONFUSION
  select(-Channel, -Region)
head(customers)
  Fresh Milk Grocery Frozen Detergents_Paper Delicassen          channel region
1 12669 9656    7561    214             2674       1338           Retail  Other
2  7057 9810    9568   1762             3293       1776           Retail  Other
3  6353 8808    7684   2405             3516       7844           Retail  Other
4 13265 1196    4221   6404              507       1788 Hotel-Restaurant  Other
5 22615 5410    7198   3915             1777       5185           Retail  Other
6  9413 8259    5126    666             1795       1451           Retail  Other

PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES

# Count customers in every channel x region cell. summarise() peels off the
# last grouping variable (region), so the result is still grouped by channel
# and the shares computed below are taken within each channel.
by_channel_region <- group_by(customers, channel, region)
pip1 <- by_channel_region %>%
  summarise(N = n()) %>%
  mutate(
    freq = N / sum(N),        # share of the channel's customers in this region
    pct = round(freq * 100)   # the same share as a whole-number percent
  )
`summarise()` has grouped output by 'channel'. You can override using the
`.groups` argument.
# Print the channel-by-region summary table.
pip1
# A tibble: 6 × 5
# Groups:   channel [2]
  channel          region     N   freq   pct
  <fct>            <fct>  <int>  <dbl> <dbl>
1 Hotel-Restaurant Lisbon    59 0.198     20
2 Hotel-Restaurant Oporto    28 0.0940     9
3 Hotel-Restaurant Other    211 0.708     71
4 Retail           Lisbon    18 0.127     13
5 Retail           Oporto    19 0.134     13
6 Retail           Other    105 0.739     74

PART 2: Create stacked and dodged bar charts: Two Categorical Variables

# Stacked Bar Chart

# Title and caption shared by all three bar charts.
p_title <- "Channel by Region"
p_caption <- "customers dataset"

# Rows with a missing region or channel are excluded from every chart.
pip1_complete <- subset(pip1, !is.na(region) & !is.na(channel))

# Base plot reused by the dodged and faceted charts further down:
# region on the x-axis, one fill colour per channel.
p <- ggplot(data = pip1_complete,
            aes(x = region, y = pct, fill = channel))

# AS STACKED BAR CHART
# `pct` is the share of each region *within* a channel, so the percentages
# only add up to 100 when they are stacked per channel. Stacking them per
# region (x = region, fill = channel) yields bars whose totals are not
# percentages of anything, so the stacked chart uses channel on the x-axis
# and fills by region.
p_stacked <- ggplot(data = pip1_complete,
                    aes(x = channel, y = pct, fill = region))

p_stacked +
  geom_col(position = "stack") +
  labs(x = "Channel", y = "Percent", fill = "Region",
       title = p_title, caption = p_caption,
       subtitle = "As a stacked bar chart") +
  # Centre each percentage label inside its own segment.
  geom_text(aes(label = pct), position = position_stack(vjust = .5))

# Dodged Bar Chart
# Bars for the two channels sit side by side within each region; every bar
# carries its percentage as a text label.
p +
  geom_col(position = "dodge2") +
  geom_text(
    mapping = aes(label = pct),
    position = position_dodge(width = 0.9)
  ) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Region",
    y = "Percent",
    fill = "Channel"
  )

# Faceted Horizontal Bar Chart
# One panel per channel with horizontal bars per region. The fill legend is
# hidden because the panel strips already name the channel.
p +
  geom_col(position = "dodge2") +
  geom_text(
    mapping = aes(label = pct),
    position = position_dodge2(width = 0.5)
  ) +
  facet_grid(cols = vars(channel)) +
  coord_flip() +
  guides(fill = "none") +
  labs(
    title = p_title,
    subtitle = "As a faceted horizontal bar chart",
    caption = p_caption,
    x = NULL,
    y = "Percent",
    fill = "Channel"
  )

PART 3: Practice using pipes (dplyr) to summarize data: Two Continuous Variables and One Categorical

# Per-channel customer count and mean spend on Fresh and Delicassen products.
channel_means <- customers %>%
  group_by(channel) %>%
  summarise(
    N = n(),
    fresh_mean = mean(Fresh, na.rm = TRUE),
    delicassen_mean = mean(Delicassen, na.rm = TRUE)
  )

# summarise() returned an ungrouped table, so these shares are taken over all
# customers rather than within a channel.
pip2 <- mutate(
  channel_means,
  freq = N / sum(N),
  pct = round(freq * 100)
)
pip2
# A tibble: 2 × 6
  channel              N fresh_mean delicassen_mean  freq   pct
  <fct>            <int>      <dbl>           <dbl> <dbl> <dbl>
1 Hotel-Restaurant   298     13476.           1416. 0.677    68
2 Retail             142      8904.           1753. 0.323    32

PART 4: Create a scatterplot: Two Continuous Variables and One Categorical

# Scatterplot of the per-channel means: average Fresh spend on the x-axis,
# average Delicassen spend on the y-axis, one coloured point per channel.
p <- ggplot(
  data = subset(
    pip2,
    !is.na(fresh_mean) & !is.na(channel) & !is.na(delicassen_mean)
  ),
  aes(x = fresh_mean, y = delicassen_mean, color = channel)
) +
  geom_point(size = 5) +
  # The quoted figure is taken from pip2 instead of being typed in by hand,
  # so the note cannot drift from the plotted data.
  annotate(
    geom = "text", x = 1000, y = 1700, hjust = 0,
    label = sprintf(
      "Delicassen products reach their\nhighest spending point at $%.1f",
      max(pip2$delicassen_mean, na.rm = TRUE)
    )
  ) +
  labs(
    y = "Average Spend in Delicassen",
    x = "Average Spend in Fresh Products",
    title = "Average Spending in Delicassen and Fresh Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}"
  )
p

PART 5: LEGEND AND GUIDES

# Keep a plain-character copy of the channel factor and sort the summary
# rows alphabetically by it.
pip2[["channel.c"]] <- as.character(pip2[["channel"]])
row_order <- order(pip2[["channel.c"]])
pip2 <- pip2[row_order, ]
pip2
# A tibble: 2 × 7
  channel              N fresh_mean delicassen_mean  freq   pct channel.c       
  <fct>            <int>      <dbl>           <dbl> <dbl> <dbl> <chr>           
1 Hotel-Restaurant   298     13476.           1416. 0.677    68 Hotel-Restaurant
2 Retail             142      8904.           1753. 0.323    32 Retail          
# Scatterplot of the per-channel means with a styled legend placed inside the
# panel, coordinate labels on each point and an explanatory note.
p <- ggplot(pip2, aes(x = fresh_mean, y = delicassen_mean, color = channel.c))
p +
  geom_point(size = 5) +
  # Label each point with its (fresh, delicassen) means to one decimal place.
  geom_text(
    aes(label = sprintf("(%.1f, %.1f)", fresh_mean, delicassen_mean)),
    vjust = 1, hjust = 1
  ) +
  # The quoted figure is taken from pip2 instead of being typed in by hand.
  annotate(
    geom = "text", x = 6000, y = 1700, hjust = 0,
    label = sprintf(
      "Delicassen products reach their\nhighest spending point at $%.1f",
      max(pip2$delicassen_mean, na.rm = TRUE)
    )
  ) +
  labs(
    title = "Average Spending in Fresh and Delicassen Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}",
    y = "Average Spend in Delicassen Products",
    x = "Average Spend in Fresh Products",
    color = "Channel"
  ) +
  theme(
    legend.title = element_text(color = "black", size = 12),
    # Legend position in relative panel coordinates (top-right corner).
    # NOTE(review): numeric legend.position is deprecated from ggplot2 3.5.0
    # in favour of legend.position.inside; kept as-is for the 3.4.x in use.
    legend.position = c(x = 0.89, y = 0.89),
    panel.background = element_rect(fill = "white"),
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_line(colour = "lightgray")
  )

PART 6: DATA LABELS VS LEGEND

# Scatterplot of the per-channel means where each point is labelled directly
# with its channel name, so the colour legend is switched off.
p <- ggplot(pip2, aes(x = fresh_mean, y = delicassen_mean, color = channel.c))
p +
  geom_point(size = 5) +
  # Direct data labels replace the legend; hjust pushes them left of the point.
  geom_text(mapping = aes(label = channel), hjust = 1.4, size = 3) +
  # The quoted figure is taken from pip2 instead of being typed in by hand.
  annotate(
    geom = "text", x = 6000, y = 1700, hjust = 0,
    label = sprintf(
      "Delicassen products reach their\nhighest spending point at $%.1f",
      max(pip2$delicassen_mean, na.rm = TRUE)
    )
  ) +
  labs(
    y = "Average Spend in Delicassen Products",
    x = "Average Spend in Fresh Products",
    title = "Average Spending in Fresh and Delicassen Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}",
    color = "Channel"
  ) +
  theme(legend.position = "none")

PART 7: INTERPRETATION

# Refresh the plain-character copy of the channel factor and sort the summary
# rows alphabetically by it.
pip2[["channel.c"]] <- as.character(pip2[["channel"]])
row_order <- order(pip2[["channel.c"]])
pip2 <- pip2[row_order, ]
pip2
# A tibble: 2 × 7
  channel              N fresh_mean delicassen_mean  freq   pct channel.c       
  <fct>            <int>      <dbl>           <dbl> <dbl> <dbl> <chr>           
1 Hotel-Restaurant   298     13476.           1416. 0.677    68 Hotel-Restaurant
2 Retail             142      8904.           1753. 0.323    32 Retail          
# Final scatterplot used for the interpretation: per-channel means with a
# legend inside the panel, coordinate labels on each point and a note.
p <- ggplot(pip2, aes(x = fresh_mean, y = delicassen_mean, color = channel.c))
p +
  geom_point(size = 5) +
  # Label each point with its (fresh, delicassen) means to one decimal place.
  geom_text(
    aes(label = sprintf("(%.1f, %.1f)", fresh_mean, delicassen_mean)),
    vjust = 1, hjust = 1
  ) +
  # The quoted figure is taken from pip2 instead of being typed in by hand.
  annotate(
    geom = "text", x = 6000, y = 1700, hjust = 0,
    label = sprintf(
      "Delicassen products reach their\nhighest spending point at $%.1f",
      max(pip2$delicassen_mean, na.rm = TRUE)
    )
  ) +
  labs(
    title = "Average Spending in Fresh and Delicassen Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}",
    y = "Average Spend in Delicassen Products",
    x = "Average Spend in Fresh Products",
    color = "Channel"
  ) +
  theme(
    legend.title = element_text(color = "black", size = 12),
    # Legend position in relative panel coordinates (top-right corner).
    # NOTE(review): numeric legend.position is deprecated from ggplot2 3.5.0
    # in favour of legend.position.inside; kept as-is for the 3.4.x in use.
    legend.position = c(x = 0.89, y = 0.89),
    panel.background = element_rect(fill = "white"),
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_line(colour = "lightgray")
  )

Insights

The average spending for Fresh products in Hotel-Restaurant and Retail is $13,475.6 and $8,904.3 respectively.

The average spending for Delicassen products in Hotel-Restaurant and Retail is about $1,416 and $1,753.4, respectively.

The visual shows that spending on Fresh products is significantly higher than spending on Delicassen products, regardless of channel. It also shows that the combined average spending across the two product categories (Fresh and Delicassen) is higher in the Hotel-Restaurant channel.