options(scipen=999)
library(tidyverse)
library(socviz)
library(datasetsICR)
Lab 5
PART 0: Useful code for customers{datasetsICR}
Since I am using the customers dataset, I will need to transform the Channel and Region from integer into factor variables, using text for the levels. The data dictionary for the dataset provides the factor values. The code below helps to accomplish this task :-).
## THIS CODE IS TO RENAME THE LEVELS FOR THE CHANNEL AND REGION VARIABLES IN THE CUSTOMER DATASET
library(datasetsICR)
data(customers)
head(customers)
Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
1 2 3 12669 9656 7561 214 2674 1338
2 2 3 7057 9810 9568 1762 3293 1776
3 2 3 6353 8808 7684 2405 3516 7844
4 1 3 13265 1196 4221 6404 507 1788
5 2 3 22615 5410 7198 3915 1777 5185
6 2 3 9413 8259 5126 666 1795 1451
# Recode the integer-coded Channel and Region columns of `customers` as
# labelled factors (`channel`, `region`), then drop the integer originals.
#   Channel: 1 = Hotel-Restaurant, 2 = Retail
#   Region:  1 = Lisbon, 2 = Oporto, 3 = Other
# A code outside the listed levels becomes NA instead of surviving as an
# unlabelled level.

# CHANNEL VARIABLE
customers$channel <- factor(
  customers$Channel,
  levels = c(1, 2),
  labels = c("Hotel-Restaurant", "Retail")
)

# REGION VARIABLE
customers$region <- factor(
  customers$Region,
  levels = c(1, 2, 3),
  labels = c("Lisbon", "Oporto", "Other")
)

# DROP THE ORIGINAL INTEGER COLUMNS TO AVOID CONFUSION
# Dropped by name rather than by position so this step keeps working if the
# column layout of the dataset changes.
customers <- customers %>% select(-Channel, -Region)
head(customers)
Fresh Milk Grocery Frozen Detergents_Paper Delicassen channel region
1 12669 9656 7561 214 2674 1338 Retail Other
2 7057 9810 9568 1762 3293 1776 Retail Other
3 6353 8808 7684 2405 3516 7844 Retail Other
4 13265 1196 4221 6404 507 1788 Hotel-Restaurant Other
5 22615 5410 7198 3915 1777 5185 Retail Other
6 9413 8259 5126 666 1795 1451 Retail Other
PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES
library(datasetsICR)
library(dplyr)
library(ggplot2)
# Cross-tabulate customers by region and channel.
#   N    - number of customers in each region/channel cell
#   freq - the cell's share of its own region's customers
#   pct  - freq as a whole-number percentage
pip1 <- customers %>%
  group_by(region, channel) %>%
  # Drop only the innermost grouping (channel), and say so explicitly: the
  # result stays grouped by region, so sum(N) below is a per-region total.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
pip1
# A tibble: 6 × 5
# Groups: region [3]
region channel N freq pct
<fct> <fct> <int> <dbl> <dbl>
1 Lisbon Hotel-Restaurant 59 0.766 77
2 Lisbon Retail 18 0.234 23
3 Oporto Hotel-Restaurant 28 0.596 60
4 Oporto Retail 19 0.404 40
5 Other Hotel-Restaurant 211 0.668 67
6 Other Retail 105 0.332 33
pip1 <- na.omit(pip1)
At a quick glance, we can tell that the hotel-restaurant channel has a higher frequency than retail in every region. Neither Lisbon nor Oporto has the highest number of observations (the Other region does), but the highest within-region share belongs to Lisbon's hotel-restaurant channel (77%).
PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES
# Title and caption shared by the bar charts built from `pip1`.
p_caption <- "customers dataset"
p_title <- "Channel by Region"
# AS STACKED BAR CHART
# Base plot reused by the bar charts: region on the x axis, percentage on the
# y axis, bars filled by channel. Rows missing either factor are left out.
p <- ggplot(
  data = subset(pip1, !(is.na(region) | is.na(channel))),
  mapping = aes(x = region, y = pct, fill = channel)
)
# Stacked bars: one bar per region, split by channel, with each segment's
# percentage printed at the segment's vertical midpoint.
p +
  geom_col(position = "stack") +
  geom_text(
    mapping = aes(label = pct),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Major Region",
    y = "Percent",
    fill = "Channel"
  )
# AS DODGED BAR CHART
# Dodged bars: the channels sit side by side within each region, each bar
# labelled with its percentage.
p +
  geom_col(position = "dodge2") +
  geom_text(
    mapping = aes(label = pct),
    position = position_dodge(width = 0.9)
  ) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Major Region",
    y = "Percent",
    fill = "Channel"
  )
# AS FACETED HORIZONTAL BAR CHART
# Horizontal bars, one panel per region.
# The fill legend is switched off below, so channel must be readable from the
# axis: x is remapped from region (already shown in the facet strips) to
# channel. Otherwise the bars in each panel could only be told apart by a
# colour that has no key.
p +
  aes(x = channel) +
  geom_col(position = "dodge2") +
  labs(
    x = NULL,
    y = "Percent",
    fill = "Channel",
    title = p_title,
    caption = p_caption,
    subtitle = "As a faceted horizontal bar chart"
  ) +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ region) +
  geom_text(aes(label = pct), position = position_dodge2(width = 1))
Each of the three bar charts shows the relationship between the two different channels and their respective regions. Rather than showing frequency, the charts are set up to show the percent of each customer channel.
PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE – Fresh and Grocery
# Per-region summary of the two continuous spending variables.
#   N            - number of customers in the region
#   fresh_mean   - mean of Fresh, ignoring missing values
#   grocery_mean - mean of Grocery, ignoring missing values
#   freq, pct    - the region's share of all customers (proportion, rounded %)
pip2 <- customers %>%
  group_by(region) %>%
  summarize(
    N = n(),
    fresh_mean = mean(Fresh, na.rm = TRUE),
    grocery_mean = mean(Grocery, na.rm = TRUE)
  )
pip2 <- mutate(
  pip2,
  freq = N / sum(N),
  pct = round(100 * freq, 0)
)
pip2
# A tibble: 3 × 6
region N fresh_mean grocery_mean freq pct
<fct> <int> <dbl> <dbl> <dbl> <dbl>
1 Lisbon 77 11102. 7403. 0.175 18
2 Oporto 47 9888. 9219. 0.107 11
3 Other 316 12533. 7896. 0.718 72
I chose to use “region” as the categorical variable for this part along with “fresh” and “grocery” as the two continuous variables. The code above produces the means of both continuous variables in each of the three regions.
PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE
# Scatterplot of the regional means in `pip2`: mean Grocery spend on x, mean
# Fresh spend on y, one point per region coloured by region.
p <- ggplot(
  pip2,
  mapping = aes(x = grocery_mean, y = fresh_mean, color = region)
)
p + geom_point(size = 5) +
  # Shaded rectangle around the area that contains the three points.
  annotate(geom = "rect", xmin = 7000, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # NOTE(review): the note is anchored at (0, 0) and pushed into view with
  # hjust/vjust, so where it lands depends on the rendered text size; the
  # anchor presumably also stretches both axes to include 0 - confirm before
  # moving it to plain data coordinates.
  annotate(geom = "text", x = 0, y = 0,
           label = "Higher annual amounts\n spent from each region.",
           hjust = -1.9, vjust = -5) +
  labs(
    y = "Fresh Goods",
    x = "Grocery",
    title = "Average Annual Spending on Fresh Goods and Groceries by Region",
    subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.",
    caption = "customers dataset{datasetsICR}"
  )
The chart above is a simple scatterplot that incorporates the two continuous variables by the three different regions. In this plot, the three main points are relatively close together and because of this, I wanted to use the “geom = ‘rect’” function to highlight the general area that they reside in. Under the highlighted section you will find a quick and simple interpretation of the data that was created using the “geom = ‘text’” function.
PART 5: LEGEND AND GUIDES
# Add a plain character copy of the region factor (`region.c`) and put the
# rows in the order given by sorting that column.
pip2 <- mutate(pip2, region.c = as.character(region))
pip2 <- slice(pip2, order(region.c))
pip2
# A tibble: 3 × 7
region N fresh_mean grocery_mean freq pct region.c
<fct> <int> <dbl> <dbl> <dbl> <dbl> <chr>
1 Lisbon 77 11102. 7403. 0.175 18 Lisbon
2 Oporto 47 9888. 9219. 0.107 11 Oporto
3 Other 316 12533. 7896. 0.718 72 Other
# Same scatterplot of regional means, now coloured by the character column
# `region.c`, with a titled legend drawn inside the plotting area.
p <- ggplot(pip2, aes(x = grocery_mean, y = fresh_mean, color = region.c))
p + geom_point(size = 5) +
  # Shaded rectangle around the area that contains the three points.
  annotate(geom = "rect", xmin = 7000, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # NOTE(review): the note is anchored at (0, 0) and pushed into view with
  # hjust/vjust, so where it lands depends on the rendered text size -
  # confirm before moving it to plain data coordinates.
  annotate(geom = "text", x = 0, y = 0,
           label = "Higher annual amounts\n spent from each region.",
           hjust = -1.4, vjust = -7.15) +
  labs(
    y = "Fresh Goods",
    x = "Grocery",
    color = "Major \nRegions",
    title = "Average Annual Spending on Fresh Goods and Groceries by Region",
    subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.",
    caption = "customers dataset{datasetsICR}"
  ) +
  # NOTE(review): newer ggplot2 releases may expect inside-plot legend
  # coordinates via `legend.position.inside` - confirm against the installed
  # version.
  theme(legend.title = element_text(color = "gray40", size = 14, face = "bold"),
        legend.position = c(x = 0.88, y = .36))
Very similar to the chart from part 4, you will find the same data points highlighted using the same function. However, this scatterplot contains a legend that is inside the chart rather than on the outside. I placed this legend directly under the data points for an easy read. However, because of the placement I chose for the legend, I had to move the data point note to the left of the data rather than directly under.
PART 6: DATA LABELS VS LEGEND
# Scatterplot of regional means with each point labelled by its region
# instead of a legend (the legend is switched off at the end).
p <- ggplot(pip2, aes(x = grocery_mean, y = fresh_mean, color = region.c))
p + geom_point(size = 5) +
  # Shaded rectangle, widened on the left to leave room for the point labels.
  annotate(geom = "rect", xmin = 6300, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # Region name printed to the left of each point.
  geom_text(mapping = aes(label = region), hjust = 1.3, size = 4) +
  # NOTE(review): the note is anchored at (0, 0) and pushed into view with
  # hjust/vjust, so where it lands depends on the rendered text size -
  # confirm before moving it to plain data coordinates.
  annotate(geom = "text", x = 0, y = 0,
           label = "Higher annual amounts\n spent from each region.",
           hjust = -2.37, vjust = -5) +
  labs(
    y = "Fresh Goods",
    x = "Grocery",
    title = "Average Annual Spending on Fresh Goods and Groceries by Region",
    subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.",
    caption = "customers dataset{datasetsICR}"
  ) +
  theme(legend.position = "none")
Combining all the characteristics from the two previous scatterplots, you will find another very similar chart above. Rather than having a legend at all, I opted to label each individual point with their respective region instead. Because of this, I had to expand the highlighted area ever so slightly and ended up placing the data point note back under that open region.
PART 7: INTERPRETATION
# Final scatterplot of regional means: labelled points, no legend, and a
# two-line subtitle carrying the interpretation.
p <- ggplot(pip2, aes(x = grocery_mean, y = fresh_mean, color = region.c))
p + geom_point(size = 5) +
  # Shaded rectangle around the three points and their labels.
  annotate(geom = "rect", xmin = 6300, xmax = 9545,
           ymin = 9000, ymax = 13300, fill = "red", alpha = 0.15) +
  # Region name printed to the left of each point.
  geom_text(mapping = aes(label = region), hjust = 1.3, size = 4) +
  # NOTE(review): the note is anchored at (0, 0) and pushed into view with
  # hjust/vjust, so where it lands depends on the rendered text size -
  # confirm before moving it to plain data coordinates.
  annotate(geom = "text", x = 0, y = 0,
           label = "Higher annual amounts\n spent from each region.",
           hjust = -2.37, vjust = -5) +
  labs(
    y = "Fresh Goods",
    x = "Grocery",
    title = "Average Annual Spending on Fresh Goods and Groceries by Region",
    subtitle = "Each region spends a higher respective amount on both fresh goods and groceries combined.\nOporto spends more on groceries while other regions put more money towards fresh goods.",
    caption = "customers dataset{datasetsICR}"
  ) +
  theme(legend.position = "none")
With this final scatterplot, we can interpret a few things. We can tell by the highlighted area that each individual region generally spends a higher amount annually on the two different types of goods. However, there are two regions who focus their spending more on one good than the other. Oporto tends to spend a higher amount annually on their groceries while the “other” region category spends much more on fresh goods. Lisbon can be seen to spend a lesser amount on both groceries and fresh goods but that amount can still be considered high.