Lab 5

library(datasetsICR)
data(customers)   # loads the customers data frame; assigning data()'s return value would only capture the dataset name
library(dplyr)
options(scipen=999)
library(tidyverse)
table(customers$Region)

  1   2   3 
 77  47 316 

table(customers$Channel)

  1   2 
298 142 
summary(customers)
    Channel          Region          Fresh             Milk      
 Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
 1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
 Median :1.000   Median :3.000   Median :  8504   Median : 3627  
 Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
 3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
 Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
    Grocery          Frozen        Detergents_Paper    Delicassen     
 Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
 1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
 Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
 Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
 3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
 Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0  
str(customers)
'data.frame':	440 obs. of  8 variables:
 $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
 $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
 $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
 $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
 $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
 $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
 $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
head(customers)
  Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
1       2      3 12669 9656    7561    214             2674       1338
2       2      3  7057 9810    9568   1762             3293       1776
3       2      3  6353 8808    7684   2405             3516       7844
4       1      3 13265 1196    4221   6404              507       1788
5       2      3 22615 5410    7198   3915             1777       5185
6       2      3  9413 8259    5126    666             1795       1451
# CHANNEL VARIABLE
customers$channel <- as.factor(customers$Channel)
levels(customers$channel)[levels(customers$channel)=='1'] <- "Hotel-Restaurant"
levels(customers$channel)[levels(customers$channel)=='2'] <- "Retail"
# REGION VARIABLE
customers$region <- as.factor(customers$Region)
levels(customers$region)[levels(customers$region)=='1'] <- "Lisbon"
levels(customers$region)[levels(customers$region)=='2'] <- "Oporto"
levels(customers$region)[levels(customers$region)=='3'] <- "Other"
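The same labeling can be done in one step with factor(); this is only an equivalent sketch of the assignments above, assuming the same codings (1 = Hotel-Restaurant, 2 = Retail; 1 = Lisbon, 2 = Oporto, 3 = Other):
# Alternative one-step recode of the numeric codes into labelled factors
customers$channel <- factor(customers$Channel, levels = c(1, 2),
                            labels = c("Hotel-Restaurant", "Retail"))
customers$region  <- factor(customers$Region, levels = c(1, 2, 3),
                            labels = c("Lisbon", "Oporto", "Other"))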
customers <- customers %>% select(3:10)
head(customers)
  Fresh Milk Grocery Frozen Detergents_Paper Delicassen          channel region
1 12669 9656    7561    214             2674       1338           Retail  Other
2  7057 9810    9568   1762             3293       1776           Retail  Other
3  6353 8808    7684   2405             3516       7844           Retail  Other
4 13265 1196    4221   6404              507       1788 Hotel-Restaurant  Other
5 22615 5410    7198   3915             1777       5185           Retail  Other
6  9413 8259    5126    666             1795       1451           Retail  Other
PART 1: SUMMARIZE DATA: TWO CATEGORICAL VARIABLES
pip1 <- customers %>%
group_by(channel, region) %>%
summarize(N = n()) %>%
mutate(freq = N/sum(N),
pct = round((freq*100),0))
pip1
# A tibble: 6 × 5
# Groups:   channel [2]
  channel          region     N   freq   pct
  <fct>            <fct>  <int>  <dbl> <dbl>
1 Hotel-Restaurant Lisbon    59 0.198     20
2 Hotel-Restaurant Oporto    28 0.0940     9
3 Hotel-Restaurant Other    211 0.708     71
4 Retail           Lisbon    18 0.127     13
5 Retail           Oporto    19 0.134     13
6 Retail           Other    105 0.739     74
Because summarize() keeps the result grouped by channel, freq and pct are computed within each channel. The combination of “Hotel-Restaurant” and the “Other” region is the largest cell, with 211 observations, which is about 71% of the Hotel-Restaurant channel (and roughly 48% of all 440 customers).
Within the “Hotel-Restaurant” channel, the “Other” region (211) far outweighs “Lisbon” (59) and “Oporto” (28).
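If each cell's share of the whole dataset is wanted instead, dropping the grouping before the mutate() gives that; a sketch reusing the pipeline above (pip1_total is an illustrative name):
# Percentages of all 440 customers rather than within-channel percentages
pip1_total <- customers %>%
  group_by(channel, region) %>%
  summarize(N = n(), .groups = "drop") %>%   # drop the grouping so sum(N) is the full 440
  mutate(freq = N/sum(N),
         pct = round((freq*100),0))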
PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES
p_title <- "Region by Channel"
p_caption <- "customers dataset"
# AS STACKED BAR CHART
p <- ggplot(data = subset(pip1, !is.na(channel) & !is.na(region)),
aes(x=channel, y=pct, fill = region))
p + geom_col(position = "stack") +
labs(x="Channel", y="Percent", fill = "Region",
title = p_title, caption = p_caption,
subtitle = "As a stacked bar chart") +
geom_text(aes(label=pct), position = position_stack(vjust=.5))

# AS DODGED BAR CHART
p + geom_col(position = "dodge2") +
labs(x="Channel", y="Percent", fill = "Region",
title = p_title, caption = p_caption,
subtitle = "As a dodged bar chart") +
geom_text(aes(label = pct), position = position_dodge(width = .9))

# AS FACETED HORIZONTAL BAR CHART
p + geom_col(position = "dodge2") +
labs(x=NULL, y="Percent", fill = "Region",
title = p_title, caption = p_caption,
subtitle = "As a faceted horizontal bar chart") +
guides(fill = "none") +
coord_flip() +
facet_grid(~ channel) +
geom_text(aes(label = pct), position = position_dodge2(width = 1))

PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE
pip2 <- customers %>%
group_by(region) %>%
summarize(N = n(),
Milk_mean = mean(Milk, na.rm=TRUE),
Frozen_mean = mean(Frozen, na.rm=TRUE)) %>%
mutate(freq = N/sum(N),
pct = round((freq*100),0))
pip2
# A tibble: 3 × 6
  region     N Milk_mean Frozen_mean  freq   pct
  <fct>  <int>     <dbl>       <dbl> <dbl> <dbl>
1 Lisbon    77     5486.       3000. 0.175    18
2 Oporto    47     5088.       4045. 0.107    11
3 Other    316     5977.       2945. 0.718    72
The “Other” region dominates the sample (72% of customers) and has the highest average spending on milk but the lowest average spending on frozen items. The Oporto region has the highest average spending on frozen items.
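The claim behind the annotation in Part 4 below, that Oporto's average frozen spending is more than a thousand dollars higher than either other region, can be checked directly from pip2 (a small sketch; frozen_gap is an illustrative column name):
# Gap between each region's average frozen spending and the lowest-spending region
pip2 %>%
  mutate(frozen_gap = Frozen_mean - min(Frozen_mean)) %>%
  select(region, Frozen_mean, frozen_gap)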
PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE
p <- ggplot(pip2, aes(x=Milk_mean, y=Frozen_mean, color=region))
p + geom_point(size=6) +
annotate(geom = "text", x = 157, y=33,
label = "Oporto Spends over a Thousand dollars more on Frozen Items than other regions", hjust=0.015) +
labs(y="Average Spent on Frozen Items", x="Average Spent on Milk.",
title="Annual Spending on Milk and Frozen Items by Region",
subtitle = "Axis are money spent annually on Milk and Frozen Items",
caption <- "customers dataset{ICRdatasets}")PART 5: LEGEND AND GUIDES
pip2$region.c <- as.character(pip2$region)
pip2 <- pip2[order(pip2$region.c),]
pip2
# A tibble: 3 × 7
  region     N Milk_mean Frozen_mean  freq   pct region.c
  <fct>  <int>     <dbl>       <dbl> <dbl> <dbl> <chr>   
1 Lisbon    77     5486.       3000. 0.175    18 Lisbon  
2 Oporto    47     5088.       4045. 0.107    11 Oporto  
3 Other    316     5977.       2945. 0.718    72 Other   
p <- ggplot(pip2, aes(x=Milk_mean, y=Frozen_mean, color=region.c))
p + geom_point(size=6) +
annotate(geom = "text", x = 1.6, y=150,
label = "Both Lisbon and Oporto spend less on Milk than other regions", hjust=0) +
labs(y="Average Spent on Frozen Items", x="Average Spent on Milk",
title="Annual Spending on Milk and Frozen Items by Region",
subtitle = "Axis are money spent annually on Milk and Frozen Items",
color = "Region") +
theme(legend.title = element_text(color="black", size=15, face="bold"),
legend.position = c(x=0.1, y=.7),
panel.grid.major = element_line(color = "black", linetype = "dashed"),
panel.grid.minor = element_blank())PART 6: GRAPH WITH DATA LABELS
p <- ggplot(pip2, aes(x=Milk_mean, y=Frozen_mean, color=region.c))
p + geom_point(size=5) +
geom_text(mapping = aes(label=region), hjust=1.3, size=3) +
annotate(geom = "text", x = 1.6, y=58,
label = "Axis are money spent annually on Milk and Frozen Items", hjust=0) +
labs(y="Average Spent on Frozen Items", x="Average Spent on Milk",
title="Annual Spending on Milk and Frozen Items by Region",
subtitle = "Both Lisbon and Oporto spend less on Milk than other regions",
color = "Region") +
theme(legend.position = "none")PART 7: INTERPRETATIONS AND INSIGHTS
The combination of “Hotel-Restaurant” and the “Other” region is the largest cell, with 211 observations; that is about 71% of the Hotel-Restaurant channel and roughly 48% of the full dataset. This suggests a significant presence of hotel and restaurant establishments in regions other than Lisbon and Oporto.
Combining both channels, the “Other” region stands out with the highest count: 316 of the 440 customers (71.8%).
“Hotel-Restaurant” shows a much higher frequency in the “Other” region (211) than in “Lisbon” (59) or “Oporto” (28), suggesting regional variation in where hotel and restaurant establishments are located.
The “Retail” channel follows the same pattern: its largest cell is also the “Other” region (105 observations, 73.9% of Retail customers), so the skew toward “Other” is not specific to one channel.
Because the “Other” region contributes the most to the overall dataset (316 observations, 71.8%), it will dominate any analysis or interpretation of the data and should be weighted accordingly.
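The within-channel and whole-dataset percentages quoted above can be reproduced in one pipeline (a sketch using count(); pct_within and pct_total are illustrative column names):
# Cross-tabulate channel and region, then compute both kinds of percentage
customers %>%
  count(channel, region, name = "N") %>%
  group_by(channel) %>%
  mutate(pct_within = round(100 * N / sum(N), 1)) %>%   # share of each channel, e.g. 70.8% of Hotel-Restaurant
  ungroup() %>%
  mutate(pct_total = round(100 * N / sum(N), 1))        # share of all 440 customers, e.g. 48.0%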
RESOURCES
OpenAI. GPT-3.5. https://platform.openai.com/models/gpt-3.5