Lab5

Customers dataset

Author

Abigail Russell

# Print large numbers in full rather than in scientific notation.
options(scipen = 999)

# Packages: datasetsICR supplies the data, tidyverse the pipes and plots.
library(datasetsICR)
library(tidyverse)
library(socviz)

# Load the customers data and preview the first six rows.
data("customers", package = "datasetsICR")
head(customers, n = 6L)

  Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
1       2      3 12669 9656    7561    214             2674       1338
2       2      3  7057 9810    9568   1762             3293       1776
3       2      3  6353 8808    7684   2405             3516       7844
4       1      3 13265 1196    4221   6404              507       1788
5       2      3 22615 5410    7198   3915             1777       5185
6       2      3  9413 8259    5126    666             1795       1451

# CHANNEL VARIABLE
#   Channel is stored as the codes 1 and 2; build a labelled factor in a new
#   lower-case column and leave the numeric column as it is.
customers$channel <- factor(
  customers$Channel,
  levels = c(1, 2),
  labels = c("Hotel-Restaurant", "Retail")
)

# REGION VARIABLE
#   Same treatment for Region, which is stored as the codes 1, 2 and 3.
customers$region <- factor(
  customers$Region,
  levels = c(1, 2, 3),
  labels = c("Lisbon", "Oporto", "Other")
)


# Drop the original numeric code columns now that labelled factors exist.
# Selecting by name instead of by position (3:10) does not depend on column
# order, and any_of() lets this step be re-run without error once the two
# columns are already gone.
customers <- customers %>%
  select(-any_of(c("Channel", "Region")))
head(customers)

  Fresh Milk Grocery Frozen Detergents_Paper Delicassen          channel region
1 12669 9656    7561    214             2674       1338           Retail  Other
2  7057 9810    9568   1762             3293       1776           Retail  Other
3  6353 8808    7684   2405             3516       7844           Retail  Other
4 13265 1196    4221   6404              507       1788 Hotel-Restaurant  Other
5 22615 5410    7198   3915             1777       5185           Retail  Other
6  9413 8259    5126    666             1795       1451           Retail  Other

#  Categorical variables: channel and region
#  Numerical variables: Milk and Delicassen

Part 1: Summarizing two categorical variables using Pipes

library(socviz)

# Count the customers in every channel x region cell, then express each cell
# as a share of its channel.
pip1 <- customers %>%
  group_by(channel, region) %>%
  # "drop_last" keeps the result grouped by channel only, so sum(N) in the
  # next step is the channel total. Stating it explicitly also silences the
  # regrouping message summarize() prints otherwise.
  summarize(N = n(), .groups = "drop_last") %>%
  mutate(
    freq = N / sum(N),
    pct = round(freq * 100, 0)
  )
pip1

# A tibble: 6 × 5
# Groups:   channel [2]
  channel          region     N   freq   pct
  <fct>            <fct>  <int>  <dbl> <dbl>
1 Hotel-Restaurant Lisbon    59 0.198     20
2 Hotel-Restaurant Oporto    28 0.0940     9
3 Hotel-Restaurant Other    211 0.708     71
4 Retail           Lisbon    18 0.127     13
5 Retail           Oporto    19 0.134     13
6 Retail           Other    105 0.739     74

Using pipes to summarize the two categorical variables shows that the combination with the highest within-channel percentage is Retail (channel) with Other (region): the Other region accounts for 74% of Retail customers, meaning that the majority of Retail customers come from a region other than Lisbon or Oporto. The second largest is Hotel-Restaurant with Other, where the Other region accounts for 71% of Hotel-Restaurant customers, meaning that most purchases made through the Hotel-Restaurant channel also come from a region not named in the data. This Hotel-Restaurant/Other grouping also contains the largest number of observations in the whole dataset (211).

Part 2: Stacked, dodged, and horizontal (without a legend) bar charts.

# Title and caption shared by the bar charts.
p_title <- "Customers purchases through Channel by Region"
p_caption <- "customers dataset {datasetsICR}"

# Stacked Bar Chart
# `p` is the base plot (channel on x, percent on y, filled by region) that
# the bar charts that follow build on.
p <- pip1 %>%
  filter(!is.na(channel), !is.na(region)) %>%
  ggplot(aes(x = channel, y = pct, fill = region))

p +
  geom_col(position = "stack") +
  # Centre each percentage label inside its stacked segment.
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5)) +
  labs(
    x = "Purchasing Channel",
    y = "Percent",
    fill = "Region",
    title = p_title,
    subtitle = "The Other Region Accompanies the Largest Percentage of Customers Purchases by Channel",
    caption = p_caption
  )

# Dodged bar chart: the regions sit side by side within each channel.
p +
  geom_col(position = "dodge2") +
  # Dodge the labels by the same amount as the bars so they line up.
  geom_text(aes(label = pct), position = position_dodge(width = 0.9)) +
  labs(
    x = "Purchasing Channel",
    y = "Percent",
    fill = "Region",
    title = p_title,
    subtitle = "The Other Region Accompanies the Largest Percentage of Customers Purchases by Channel",
    caption = p_caption
  )

# Faceted horizontal bar chart, one panel per channel, drawn without a legend.
# Region is mapped to x (shown on the vertical axis after coord_flip()) so
# every bar is named on the axis itself. With the fill legend hidden, keeping
# channel on x would leave the three bars in each panel unidentifiable and
# reserve an empty axis slot for the other channel.
p + aes(x = region) +
  geom_col(position = "dodge2") +
  geom_text(aes(label = pct), position = position_dodge2(width = 1)) +
  labs(
    x = NULL,
    y = "Percent",
    fill = "Region",
    title = p_title,
    subtitle = "The Other Region Accompanies the Largest Percentage of Customers Purchases by Channel",
    caption = p_caption
  ) +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ channel)

Insights from the stacked, dodged, and faceted horizontal bar charts showing customers' purchases through channel by region: the largest percentage within each channel belongs to the Other region. This is because the region categories are very uneven: Other is the largest group, totaling more than Lisbon and Oporto combined. Other accounts for 71% of the Hotel-Restaurant category and 74% of the Retail category, meaning that Other is the largest group of customers for both purchasing channels. I believe that the stacked bar chart conveys this information most clearly.

Part 3: Using pipes to summarize two continuous variables and one categorical variable

# Per-region summary: customer count, mean Milk and Delicassen spending, and
# each region's share of all customers.
pip2 <- customers %>%
  group_by(region) %>%
  summarize(
    N = n(),
    across(
      c(Milk, Delicassen),
      function(x) mean(x, na.rm = TRUE),
      .names = "{.col}_mean"
    )
  ) %>%
  # The summary has one row per region and is no longer grouped, so the
  # proportions are taken over all customers.
  mutate(
    freq = prop.table(N),
    pct = round(freq * 100)
  )
pip2

# A tibble: 3 × 6
  region     N Milk_mean Delicassen_mean  freq   pct
  <fct>  <int>     <dbl>           <dbl> <dbl> <dbl>
1 Lisbon    77     5486.           1355. 0.175    18
2 Oporto    47     5088.           1160. 0.107    11
3 Other    316     5977.           1621. 0.718    72

# Cross-check the per-region counts with a plain frequency table.
table(customers[["region"]])


Lisbon Oporto  Other 
    77     47    316

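Among the regions, Other has the highest share of customers: 72% of all customers are in the Other region, compared with 18% in Lisbon and 11% in Oporto. Note that these percentages describe how customers are distributed across regions, not how much Milk or Delicassen they buy; the two continuous variables are summarized by their means, which are also highest in the Other region (about 5,977 for Milk and 1,621 for Delicassen). The large share for Other makes sense because it is the most common region among the observations.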

Part 4: Scatterplot for the continuous and categorical variables

# Scatterplot of Milk against Delicassen, one panel per region, with both
# axes on a log10 scale.
# The axis labels follow the aesthetics (x = Milk, y = Delicassen). Each point
# is one row of `customers`, not an average, so the labels do not say
# "Average".
p2 <- ggplot(customers, aes(x = Milk, y = Delicassen, color = region))
p2 +
  geom_point(size = 5, na.rm = TRUE) +
  labs(
    x = "Milk Purchases",
    y = "Delicassen Purchases",
    title = "Milk and Delicassen by Region",
    subtitle = "Both axes on a log10 scale, one panel per region",
    # The caption has to be passed with `=`; `<-` would assign a global
    # variable instead of naming the argument.
    caption = "customers dataset {datasetsICR}"
  ) +
  facet_wrap(~ region) +
  scale_x_log10() +
  scale_y_log10()

Part 5: Legends and Guides

give the legend a title

# The legend already lists the regions in factor-level order (Lisbon, Oporto, Other), so I am only going to change its title; other than that I believe the legend is adequate for interpreting the graph.
library(ggplot2)
# Same scatterplot as before, now with a titled legend placed under the plot.
# The axis labels follow the aesthetics (x = Milk, y = Delicassen). Each point
# is one row of `customers`, not an average, so the labels do not say
# "Average".
p2 <- ggplot(customers, aes(x = Milk, y = Delicassen, color = region))
p2 +
  geom_point(size = 5, na.rm = TRUE) +
  labs(
    x = "Milk Purchases",
    y = "Delicassen Purchases",
    color = "Customer Region",
    title = "Milk and Delicassen by Region",
    subtitle = "Both axes on a log10 scale, one panel per region",
    # The caption has to be passed with `=`; `<-` would assign a global
    # variable instead of naming the argument.
    caption = "customers dataset {datasetsICR}"
  ) +
  facet_wrap(~ region) +
  scale_x_log10() +
  scale_y_log10() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(color = "gray50", size = 14, face = "bold")
  )

Data Labels

library(ggplot2)
# Scatterplot with a region text label next to every point; the legend is
# hidden because the labels and the facets already identify the region.
# The axis labels follow the aesthetics (x = Milk, y = Delicassen).
p <- ggplot(customers, aes(x = Milk, y = Delicassen, color = region))
p +
  geom_point(size = 5, na.rm = TRUE) +
  # hjust > 1 pushes each label to the left of its point.
  geom_text(mapping = aes(label = region), hjust = 1.2, size = 3) +
  labs(
    x = "Milk Purchases",
    y = "Delicassen Purchases",
    title = "Milk and Delicassen by Region",
    subtitle = "The Other Region is Positively Correlated with Outliers Associated with Higher Delicassen Purchases.",
    color = "Region"
  ) +
  theme(legend.position = "none") +
  facet_wrap(~ region) +
  scale_x_log10() +
  scale_y_log10()

These scatterplots show the relationship between Milk and Delicassen purchases, split by region. We can see that the Other region contains the majority of the observations in the dataset. Both the x and y axes are on a log base 10 scale. The pattern is consistent: customers who buy a lot of Milk also tend to buy a lot from the Delicassen category. There are a few outliers in the Other and Lisbon regions that buy more Delicassen than Milk. Otherwise the observations are tightly clustered, which raises the question of whether other factors (like age or income) contribute to the relationship.

Part 1: Summarizing two categorical variables using Pipes

Part 2: Stacked, dodged, and horizontal (without a legend) bar charts.

Part 3: Using pipes to summarize two continuous variables and one categorical variable

Part 4: Scatterplot for the continuous and categorical variables

Part 5: Legends and Guides

Data Labels

End