Lab 5

library(datasetsICR)
data(customers)   # loads the customers data frame; assigning data()'s return value would only capture the dataset name
library(dplyr)
options(scipen=999)
library(tidyverse)
table(customers$Region)

  1   2   3 
 77  47 316 

table(customers$Channel)

  1   2 
298 142 
summary(customers)
    Channel          Region          Fresh             Milk      
 Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
 1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
 Median :1.000   Median :3.000   Median :  8504   Median : 3627  
 Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
 3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
 Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
    Grocery          Frozen        Detergents_Paper    Delicassen     
 Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
 1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
 Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
 Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
 3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
 Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0  
str(customers)
'data.frame':	440 obs. of  8 variables:
 $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
 $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
 $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
 $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
 $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
 $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
 $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
head(customers)
  Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
1       2      3 12669 9656    7561    214             2674       1338
2       2      3  7057 9810    9568   1762             3293       1776
3       2      3  6353 8808    7684   2405             3516       7844
4       1      3 13265 1196    4221   6404              507       1788
5       2      3 22615 5410    7198   3915             1777       5185
6       2      3  9413 8259    5126    666             1795       1451
# CHANNEL VARIABLE
customers$channel <- as.factor(customers$Channel)
levels(customers$channel)[levels(customers$channel)=='1'] <- "Hotel-Restaurant"
levels(customers$channel)[levels(customers$channel)=='2'] <- "Retail"
# REGION VARIABLE
customers$region <- as.factor(customers$Region)
levels(customers$region)[levels(customers$region)=='1'] <- "Lisbon"
levels(customers$region)[levels(customers$region)=='2'] <- "Oporto"
levels(customers$region)[levels(customers$region)=='3'] <- "Other"
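The same labeling can be done in one step with factor(); this is only an equivalent sketch of the assignments above, assuming the same codings (1 = Hotel-Restaurant, 2 = Retail; 1 = Lisbon, 2 = Oporto, 3 = Other):
# Alternative one-step recode of the numeric codes into labelled factors
customers$channel <- factor(customers$Channel, levels = c(1, 2),
                            labels = c("Hotel-Restaurant", "Retail"))
customers$region  <- factor(customers$Region, levels = c(1, 2, 3),
                            labels = c("Lisbon", "Oporto", "Other"))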
customers <- customers %>% select(3:10)
head(customers)
  Fresh Milk Grocery Frozen Detergents_Paper Delicassen          channel region
1 12669 9656    7561    214             2674       1338           Retail  Other
2  7057 9810    9568   1762             3293       1776           Retail  Other
3  6353 8808    7684   2405             3516       7844           Retail  Other
4 13265 1196    4221   6404              507       1788 Hotel-Restaurant  Other
5 22615 5410    7198   3915             1777       5185           Retail  Other
6  9413 8259    5126    666             1795       1451           Retail  Other
PART 1: SUMMARIZE DATA: TWO CATEGORICAL VARIABLES
pip1 <- customers %>%
group_by(channel, region) %>%
summarize(N = n()) %>%
mutate(freq = N/sum(N),
pct = round((freq*100),0))
pip1
# A tibble: 6 × 5
# Groups:   channel [2]
  channel          region     N   freq   pct
  <fct>            <fct>  <int>  <dbl> <dbl>
1 Hotel-Restaurant Lisbon    59 0.198     20
2 Hotel-Restaurant Oporto    28 0.0940     9
3 Hotel-Restaurant Other    211 0.708     71
4 Retail           Lisbon    18 0.127     13
5 Retail           Oporto    19 0.134     13
6 Retail           Other    105 0.739     74
Because summarize() keeps the result grouped by channel, freq and pct are computed within each channel. The combination of “Hotel-Restaurant” and the “Other” region is the largest cell, with 211 observations, which is about 71% of the Hotel-Restaurant channel (and roughly 48% of all 440 customers).
Within the “Hotel-Restaurant” channel, the “Other” region (211) far outweighs “Lisbon” (59) and “Oporto” (28).
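If each cell's share of the whole dataset is wanted instead, dropping the grouping before the mutate() gives that; a sketch reusing the pipeline above (pip1_total is an illustrative name):
# Percentages of all 440 customers rather than within-channel percentages
pip1_total <- customers %>%
  group_by(channel, region) %>%
  summarize(N = n(), .groups = "drop") %>%   # drop the grouping so sum(N) is the full 440
  mutate(freq = N/sum(N),
         pct = round((freq*100),0))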
PART 2: CREATE STACKED AND DODGED BAR CHARTS FROM 2 CATEGORICAL VARIABLES
p_title <- "Region by Channel"
p_caption <- "customers dataset"
# AS STACKED BAR CHART
p <- ggplot(data = subset(pip1, !is.na(channel) & !is.na(region)),
aes(x=channel, y=pct, fill = region))
p + geom_col(position = "stack") +
labs(x="Channel", y="Percent", fill = "Region",
title = p_title, caption = p_caption,
subtitle = "As a stacked bar chart") +
geom_text(aes(label=pct), position = position_stack(vjust=.5))

# AS DODGED BAR CHART
p + geom_col(position = "dodge2") +
labs(x="Channel", y="Percent", fill = "Region",
title = p_title, caption = p_caption,
subtitle = "As a dodged bar chart") +
geom_text(aes(label = pct), position = position_dodge(width = .9))

# AS FACETED HORIZONTAL BAR CHART
p + geom_col(position = "dodge2") +
labs(x=NULL, y="Percent", fill = "Region",
title = p_title, caption = p_caption,
subtitle = "As a faceted horizontal bar chart") +
guides(fill = "none") +
coord_flip() +
facet_grid(~ channel) +
geom_text(aes(label = pct), position = position_dodge2(width = 1))

PART 3: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CONTINUOUS & ONE CATEGORICAL VARIABLE
pip2 <- customers %>%
group_by(region) %>%
summarize(N = n(),
Milk_mean = mean(Milk, na.rm=TRUE),
Frozen_mean = mean(Frozen, na.rm=TRUE)) %>%
mutate(freq = N/sum(N),
pct = round((freq*100),0))
pip2
# A tibble: 3 × 6
  region     N Milk_mean Frozen_mean  freq   pct
  <fct>  <int>     <dbl>       <dbl> <dbl> <dbl>
1 Lisbon    77     5486.       3000. 0.175    18
2 Oporto    47     5088.       4045. 0.107    11
3 Other    316     5977.       2945. 0.718    72
The “Other” region dominates the sample (72% of customers) and has the highest average spending on milk but the lowest average spending on frozen items. The Oporto region has the highest average spending on frozen items.
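The claim behind the annotation in Part 4 below, that Oporto's average frozen spending is more than a thousand dollars higher than either other region, can be checked directly from pip2 (a small sketch; frozen_gap is an illustrative column name):
# Gap between each region's average frozen spending and the lowest-spending region
pip2 %>%
  mutate(frozen_gap = Frozen_mean - min(Frozen_mean)) %>%
  select(region, Frozen_mean, frozen_gap)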
PART 4: SCATTERPLOT WITH A THIRD CATEGORICAL VARIABLE
p <- ggplot(pip2, aes(x=Milk_mean, y=Frozen_mean, color=region))
p + geom_point(size=6) +
annotate(geom = "text", x = 157, y=33,
label = "Oporto Spends over a Thousand dollars more on Frozen Items than other regions", hjust=0.015) +
labs(y="Average Spent on Frozen Items", x="Average Spent on Milk.",
title="Annual Spending on Milk and Frozen Items by Region",
subtitle = "Axis are money spent annually on Milk and Frozen Items",
caption <- "customers dataset{ICRdatasets}")PART 5: LEGEND AND GUIDES
pip2$region.c <- as.character(pip2$region)
pip2 <- pip2[order(pip2$region.c),]
pip2
# A tibble: 3 × 7
  region     N Milk_mean Frozen_mean  freq   pct region.c
  <fct>  <int>     <dbl>       <dbl> <dbl> <dbl> <chr>   
1 Lisbon    77     5486.       3000. 0.175    18 Lisbon  
2 Oporto    47     5088.       4045. 0.107    11 Oporto  
3 Other    316     5977.       2945. 0.718    72 Other   
p <- ggplot(pip2, aes(x=Milk_mean, y=Frozen_mean, color=region.c))
p + geom_point(size=6) +
annotate(geom = "text", x = 1.6, y=150,
label = "Both Lisbon and Oporto spend less on Milk than other regions", hjust=0) +
labs(y="Average Spent on Frozen Items", x="Average Spent on Milk",
title="Annual Spending on Milk and Frozen Items by Region",
subtitle = "Axis are money spent annually on Milk and Frozen Items",
color = "Region") +
theme(legend.title = element_text(color="black", size=15, face="bold"),
legend.position = c(x=0.1, y=.7),
panel.grid.major = element_line(color = "black", linetype = "dashed"),
panel.grid.minor = element_blank())PART 6: GRAPH WITH DATA LABELS
p <- ggplot(pip2, aes(x=Milk_mean, y=Frozen_mean, color=region.c))
p + geom_point(size=5) +
geom_text(mapping = aes(label=region), hjust=1.3, size=3) +
annotate(geom = "text", x = 1.6, y=58,
label = "Axis are money spent annually on Milk and Frozen Items", hjust=0) +
labs(y="Average Spent on Frozen Items", x="Average Spent on Milk",
title="Annual Spending on Milk and Frozen Items by Region",
subtitle = "Both Lisbon and Oporto spend less on Milk than other regions",
color = "Region") +
theme(legend.position = "none")PART 7: INTERPRETATIONS AND INSIGHTS
The combination of “Hotel-Restaurant” and the “Other” region is the largest cell, with 211 observations; that is about 71% of the Hotel-Restaurant channel and roughly 48% of the full dataset. This suggests a significant presence of hotel and restaurant establishments in regions other than Lisbon and Oporto.
Combining both channels, the “Other” region stands out with the highest count: 316 of the 440 customers (71.8%).
“Hotel-Restaurant” shows a much higher frequency in the “Other” region (211) than in “Lisbon” (59) or “Oporto” (28), suggesting regional variation in where hotel and restaurant establishments are located.
The “Retail” channel follows the same pattern: its largest cell is also the “Other” region (105 observations, 73.9% of Retail customers), so the skew toward “Other” is not specific to one channel.
Because the “Other” region contributes the most to the overall dataset (316 observations, 71.8%), it will dominate any analysis or interpretation of the data and should be weighted accordingly.
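The within-channel and whole-dataset percentages quoted above can be reproduced in one pipeline (a sketch using count(); pct_within and pct_total are illustrative column names):
# Cross-tabulate channel and region, then compute both kinds of percentage
customers %>%
  count(channel, region, name = "N") %>%
  group_by(channel) %>%
  mutate(pct_within = round(100 * N / sum(N), 1)) %>%   # share of each channel, e.g. 70.8% of Hotel-Restaurant
  ungroup() %>%
  mutate(pct_total = round(100 * N / sum(N), 1))        # share of all 440 customers, e.g. 48.0%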
RESOURCES
OpenAI. GPT-3.5. https://platform.openai.com/models/gpt-3.5