LAB 5

Author

Gabriela Peralta

Categorical Variables Analysis

library(magrittr)
library(datasetsICR)
library(dplyr)
library(ggplot2)
# Load the `customers` dataset (presumably shipped with datasetsICR, which is
# attached above -- confirm) and preview its first rows.
data(customers)
head(customers)
  Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
1       2      3 12669 9656    7561    214             2674       1338
2       2      3  7057 9810    9568   1762             3293       1776
3       2      3  6353 8808    7684   2405             3516       7844
4       1      3 13265 1196    4221   6404              507       1788
5       2      3 22615 5410    7198   3915             1777       5185
6       2      3  9413 8259    5126    666             1795       1451
# Channel variable: recode the numeric codes as a labelled factor
# (1 = Hotel-Restaurant, 2 = Retail). Any other code would become NA.
customers$channel <- factor(
  customers$Channel,
  levels = c(1, 2),
  labels = c("Hotel-Restaurant", "Retail")
)

# Region variable: recode the numeric codes as a labelled factor
# (1 = Lisbon, 2 = Oporto, 3 = Other). Any other code would become NA.
customers$region <- factor(
  customers$Region,
  levels = c(1, 2, 3),
  labels = c("Lisbon", "Oporto", "Other")
)

# Drop the original coded columns to avoid confusion with the new factors.
# They are removed by name rather than by position, so the selection does not
# depend on how many columns the data frame has or on their order.
customers <- customers %>% select(-Channel, -Region)
head(customers)
  Fresh Milk Grocery Frozen Detergents_Paper Delicassen          channel region
1 12669 9656    7561    214             2674       1338           Retail  Other
2  7057 9810    9568   1762             3293       1776           Retail  Other
3  6353 8808    7684   2405             3516       7844           Retail  Other
4 13265 1196    4221   6404              507       1788 Hotel-Restaurant  Other
5 22615 5410    7198   3915             1777       5185           Retail  Other
6  9413 8259    5126    666             1795       1451           Retail  Other
# Frequency of every channel/region combination and its share within the
# channel. `.groups = "drop_last"` states explicitly that the result stays
# grouped by channel only, so sum(count) in mutate() is the channel total and
# the percentages add up to 100 within each channel (not across the whole
# table). It also silences the regrouping message summarise() would print.
summary_data <- customers %>%
  group_by(channel, region) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(percent = count / sum(count) * 100)

print(summary_data)
# A tibble: 6 × 4
# Groups:   channel [2]
  channel          region count percent
  <fct>            <fct>  <int>   <dbl>
1 Hotel-Restaurant Lisbon    59   19.8 
2 Hotel-Restaurant Oporto    28    9.40
3 Hotel-Restaurant Other    211   70.8 
4 Retail           Lisbon    18   12.7 
5 Retail           Oporto    19   13.4 
6 Retail           Other    105   73.9 
# Side-by-side bars: within-channel percentage for each region
ggplot(summary_data, aes(x = channel, y = percent, fill = region)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    x = "Channel",
    y = "Percent",
    fill = "Region"
  )

Bar Chart Comparison

# Stacked bars: one bar per channel, split into its regional percentages
ggplot(summary_data, aes(x = channel, y = percent, fill = region)) +
  geom_col() +
  theme_minimal() +
  labs(
    x = "Channel",
    y = "Percent",
    fill = "Region",
    caption = "Source: datasetsICR" # data source shown below the plot
  )

# Dodged bars: regions drawn next to each other within every channel
ggplot(summary_data, aes(x = channel, y = percent, fill = region)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    x = "Channel",
    y = "Percent",
    fill = "Region",
    caption = "Source: datasetsICR" # data source shown below the plot
  )

# Faceted horizontal bar chart: one panel per region (stacked in a single
# column), with one horizontal bar per channel inside each panel.
ggplot(data = summary_data, aes(x = channel, y = percent, fill = region)) +
  geom_bar(stat = "identity") +
  labs(x = "Channel", y = "Percent", fill = "Region",
       caption = "Source: datasetsICR") +  # Add the source here
  # Each panel gets its own percent scale
  facet_wrap(~region, ncol = 1, scales = "free_y") +
  # Flip the coordinates so the bars run horizontally; the axis titles set in
  # labs() follow their aesthetics, so "Percent" ends up on the horizontal axis
  coord_flip() +
  theme_minimal() +
  # The facet strips already name the region, so the fill legend is redundant
  theme(legend.position = "none")

Continuous Variables Analysis

# Per-region averages of the two continuous variables (Grocery and Frozen)
summary_data_part3 <- summarise(
  group_by(customers, region),
  mean_grocery = mean(Grocery),
  mean_frozen = mean(Frozen)
)

print(summary_data_part3)
# A tibble: 3 × 3
  region mean_grocery mean_frozen
  <fct>         <dbl>       <dbl>
1 Lisbon        7403.       3000.
2 Oporto        9219.       4045.
3 Other         7896.       2945.

Scatterplot Visualization

# Coordinates (in data units) of the point where the annotation is anchored
chosen_x <- 1000
chosen_y <- 500

# Scatterplot using the summarized dataset from Part 3
ggplot(
  data = summary_data_part3,
  aes(x = mean_grocery, y = mean_frozen, color = region)
) +
  geom_point() +
  labs(x = "Mean of Grocery", y = "Mean of Frozen", color = "Region") +
  theme_minimal() +
  # Annotate text at the chosen point. hjust/vjust are expressed in multiples
  # of the label's own width/height, so small values keep the label right next
  # to the anchor regardless of the figure size.
  annotate(
    "text",
    x = chosen_x, y = chosen_y,
    label = "Oporto is the leader of Grocery and Frozen consumption",
    size = 4, color = "black",
    hjust = -0.1, vjust = -0.2
  )

Legend and Guide Enhancements

# Scatterplot using the summarized dataset from Part 3, now with a title and
# a customised colour legend
ggplot(
  data = summary_data_part3,
  aes(x = mean_grocery, y = mean_frozen, color = region)
) +
  geom_point() +
  labs(x = "Mean of Grocery", y = "Mean of Frozen", color = "Region") +
  ggtitle("Comparison of consumers behavior by Region") +  # Add title here
  theme_minimal() +
  annotate(
    "text",
    x = chosen_x, y = chosen_y,
    label = "Oporto is the leader of Grocery and Frozen consumption",
    size = 4, color = "black",
    hjust = -0.1, vjust = -0.2
  ) +
  # Legend styling: title placed above the keys in italics, smaller centred
  # labels. No trailing comma after `guide`, so no empty argument is passed on
  # to the scale constructor.
  scale_color_discrete(
    name = "Region",
    breaks = c("Lisbon", "Oporto", "Other"),
    labels = c("Lisbon", "Oporto", "Other"),
    guide = guide_legend(
      title.position = "top",
      title.theme = element_text(face = "italic", size = 12),
      label.theme = element_text(size = 10),
      label.hjust = 0.5
    )
  ) +
  theme(legend.position = "right")

Data Labeling Techniques

# Scatterplot of the Part 3 summary with direct point labels instead of a legend
ggplot(summary_data_part3, aes(x = mean_grocery, y = mean_frozen)) +
  geom_point(aes(color = region), size = 3) +
  geom_text(
    aes(label = region),
    # Shift labels down by 0.5 data units (very small next to means in the
    # thousands)
    nudge_y = -0.5,
    size = 3,
    vjust = 1,
    # Oporto and Lisbon labels sit to the left of their point, Other to the right
    hjust = ifelse(
      summary_data_part3$region %in% c("Oporto", "Lisbon"),
      1.3,
      -0.3
    )
  ) +
  annotate(
    "text",
    x = chosen_x, y = chosen_y,
    label = "Oporto is the leader of Grocery and Frozen consumption",
    size = 4, color = "black",
    hjust = -0.1, vjust = -0.3
  ) +
  labs(
    title = "Comparison of consumers behavior by Region",
    x = "Mean of Grocery",
    y = "Mean of Frozen"
  ) +
  theme_minimal() +
  theme(
    # The points are labelled directly, so the colour legend is hidden
    legend.position = "none",
    # Extra margin so that labels near the edges are fully visible
    plot.margin = margin(1, 1, 1, 1, "cm")
  )

Interpretation

Categorical Variables Analysis

  • I have been given the dataset “Customers”, which contains 440 observations and 8 variables. Of these variables, I will be working with 2 categorical variables (Channel and Region) and 2 continuous variables (Grocery and Frozen). This data set is intended to study consumer behavior related to these 4 variables. At first sight, it is easy to see that Grocery products are in higher demand among consumers than Frozen products.

  • To study the data in a little more depth, I created a bar chart of both categorical variables (the two continuous variables are analyzed in a later section). “Channel” represents the way sales are distributed, while the fill is the “Region” where they are distributed. The “Channel” variable has 2 categories: Hotel-Restaurant and Retail, while “Region” has 3 categories: Lisbon, Oporto, and Other. It is highly noticeable that most of the sales happen in the “Other” region in both channels, with its largest share in the Retail category. It is also notable that for the “Hotel-Restaurant” category Lisbon is the second leading region, while for the “Retail” category Oporto is the second leading region.

Bar Chart Comparison

  • This section uses 3 types of charts to compare the same data, and it is intended to find the chart that best fits and best describes the data. These charts are: a stacked bar chart, a dodged bar chart, and a faceted horizontal bar chart. I like both the dodged and the faceted bar charts. The faceted bar chart seems more professional and polished, but it takes more time to understand. It uses percentages to compare the sales in the different regions across the different channels. On the other hand, the dodged bar chart is simple and easy to follow: since the bars are next to each other, it is easy to identify the differences, which avoids any type of confusion.

Continuous Variables Analysis

  • In the continuous variables Grocery and Frozen, there are spending patterns, regional disparities, market potential and consumer behavior that can be easily identified.

  • Spending Patterns: Customers in Oporto appear to spend more on both grocery and frozen products compared to Lisbon and Other regions. This suggests that Oporto might have a higher demand or preference for these types of products.

  • Regional Disparities: Mean grocery spending is lowest in Lisbon, while mean frozen spending is lowest in the Other region (just below Lisbon); both regions are well below Oporto on both products. This could indicate variations in consumer preferences, income levels, or market characteristics between regions.

  • Market Potential: Higher mean spending in certain regions, such as Oporto, may indicate greater market potential for grocery and frozen products. Businesses may consider targeting these regions for marketing efforts or expanding their product offerings to cater to the preferences of consumers in those areas.

  • Consumer Behavior: The differences in mean spending across regions could reflect underlying differences in consumer behavior, lifestyle, or cultural factors. Identifying these factors can help businesses tailor their marketing strategies and product offerings to better meet the needs of customers in each region.

Scatterplot Visualization

  • The means of the variables can be plotted in a scatter plot to make them easier to understand. In the graph, it stands out that the Grocery means are all above 7,000, so the points look close to each other on the X axis; on the Y axis, which is Frozen, the difference between regions is bigger, with Oporto as the leader, followed by Lisbon and lastly Other, right underneath Lisbon. Although the Grocery means reach well above 7,500, the highest Frozen mean is barely above 4,000.

Legend and Guide Enhancements

  • Adding an annotation helps add insight to the graph. Usually a single annotation works best, so that the graph does not look crowded and confuse the audience. It can be placed wherever it fits best in the graph and makes the most sense.

Data Labeling Techniques

  • The final presentation of the graph includes a title, accompanied by labels next to the dots instead of a legend, which is intended to make it easier for the audience to follow the image without having to do a lot of work. The graph uses the mean for Grocery on the x axis and the mean for Frozen on the y axis, and the points are colored by region. Green means “Oporto”, orange-coral means “Lisbon” and blue means “Other”.