# Recode the numeric Channel / Region codes as labelled factors, then drop
# the original code columns so only the labelled versions remain.

# CHANNEL VARIABLE ----
# Build the factor and its descriptive labels in one step. Level order
# follows the numeric codes. Any code not listed in `levels` becomes NA
# instead of surviving as an unlabelled level.
customers$channel <- factor(
  customers$Channel,
  levels = c(1, 2),
  labels = c("Hotel-Restaurant", "Retail")
)

# REGION VARIABLE ----
customers$region <- factor(
  customers$Region,
  levels = c(1, 2, 3),
  labels = c("Lisbon", "Oporto", "Other")
)

# DROP THE ORIGINAL CODE COLUMNS TO AVOID CONFUSION ----
# Dropped by name rather than by position (3:10) so the result does not
# depend on the column order or column count of `customers`.
customers <- customers %>%
  select(-Channel, -Region)

head(customers)
Fresh Milk Grocery Frozen Detergents_Paper Delicassen channel region
1 12669 9656 7561 214 2674 1338 Retail Other
2 7057 9810 9568 1762 3293 1776 Retail Other
3 6353 8808 7684 2405 3516 7844 Retail Other
4 13265 1196 4221 6404 507 1788 Hotel-Restaurant Other
5 22615 5410 7198 3915 1777 5185 Retail Other
6 9413 8259 5126 666 1795 1451 Retail Other
PART 1: PRACTICE USING PIPES (dplyr) TO SUMMARIZE DATA: TWO CATEGORICAL VARIABLES
PART 2: Create stacked and dodged bar charts: Two Categorical Variables
# Stacked Bar Chart
# Title and caption are stored once so every bar chart built from `p`
# can reuse them.
p_title <- "Channel by Region"
p_caption <- "customers dataset"

# Base plot: pct on the y axis by region, filled by channel. Rows with a
# missing region or channel are excluded before plotting.
p <- ggplot(
  data = subset(pip1, !is.na(region) & !is.na(channel)),
  mapping = aes(x = region, y = pct, fill = channel)
)

# AS STACKED BAR CHART
# Labels use the same stacking position (vjust = 0.5) so each value sits
# in the middle of its own segment.
p +
  geom_col(position = "stack") +
  geom_text(
    mapping = aes(label = pct),
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    title = p_title,
    subtitle = "As a stacked bar chart",
    caption = p_caption,
    x = "Region",
    y = "Percent",
    fill = "Channel"
  )
# Dodged Bar Chart
# Reuses the base plot `p`; bars for each channel are placed side by side
# and the pct labels are dodged with a width of 0.9.
p +
  geom_col(position = "dodge2") +
  geom_text(
    mapping = aes(label = pct),
    position = position_dodge(width = 0.9)
  ) +
  labs(
    title = p_title,
    subtitle = "As a dodged bar chart",
    caption = p_caption,
    x = "Region",
    y = "Percent",
    fill = "Channel"
  )
# Faceted Horizontal Bar Chart
# Reuses the base plot `p`: one panel per channel, axes flipped so the bars
# run horizontally, and the fill legend hidden.
p +
  geom_col(position = "dodge2") +
  geom_text(
    mapping = aes(label = pct),
    position = position_dodge2(width = 0.5)
  ) +
  facet_grid(~ channel) +
  coord_flip() +
  guides(fill = "none") +
  labs(
    title = p_title,
    subtitle = "As a faceted horizontal bar chart",
    caption = p_caption,
    x = NULL,
    y = "Percent",
    fill = "Channel"
  )
PART 3: Practice using pipes (dplyr) to summarize data: Two Continuous Variables and One Categorical
PART 4: Create a scatterplot: Two Continuous Variables and One Categorical
# Scatterplot of fresh_mean (x) against delicassen_mean (y) from pip2,
# coloured by channel. Rows with a missing mean or channel are removed
# before plotting.
p <- ggplot(
  data = subset(
    pip2,
    !is.na(fresh_mean) & !is.na(channel) & !is.na(delicassen_mean)
  ),
  aes(x = fresh_mean, y = delicassen_mean, color = channel)
) +
  geom_point(size = 5) +
  # Free-text callout; hjust = 0 left-aligns the text at x = 1000.
  annotate(
    geom = "text",
    x = 1000,
    y = 1700,
    label = "Delicassen products reach their highest spending point at $1753.4",
    hjust = 0
  ) +
  labs(
    y = "Average Spend in Delicassen",
    x = "Average Spend in Fresh Products",
    title = "Average Spending in Delicassen and Fresh Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}"
  )

# Print the finished plot.
p
# Scatterplot of fresh_mean against delicassen_mean from pip2, coloured by
# channel.c, with each point labelled by its rounded (x, y) coordinates and
# the legend placed inside the panel.
p <- ggplot(pip2, aes(x = fresh_mean, y = delicassen_mean, color = channel.c))

p +
  geom_point(size = 5) +
  # Coordinate label for every point, nudged below-left of the marker.
  geom_text(
    aes(
      label = paste(
        "(", round(fresh_mean, 1), ",", round(delicassen_mean, 1), ")"
      )
    ),
    vjust = 1,
    hjust = 1
  ) +
  annotate(
    geom = "text",
    x = 6000,
    y = 1700,
    label = "Delicassen products reach their highest spending point at $1753.4",
    hjust = 0
  ) +
  labs(
    title = "Average Spending in Fresh and Delicassen Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}",
    y = "Average Spend in Delicassen Products",
    x = "Average Spend in Fresh Products",
    color = "Channel"
  ) +
  # The dangling comma after the last theme() argument was removed; it left
  # an empty argument in the call.
  # NOTE(review): a numeric legend.position is deprecated in ggplot2 >= 3.5.0
  # in favour of legend.position.inside; confirm the installed version
  # before switching.
  theme(
    legend.title = element_text(color = "black", size = 12),
    legend.position = c(x = 0.89, y = .89),
    panel.background = element_rect(fill = "white"),
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_line(colour = "lightgray")
  )
PART 6: DATA LABELS VS LEGEND
# Data-label version: each point is labelled with its channel and the
# colour legend is hidden.
p <- ggplot(pip2, aes(x = fresh_mean, y = delicassen_mean, color = channel.c))

p +
  geom_point(size = 5) +
  # hjust = 1.4 pushes the channel label to the left of its point.
  geom_text(mapping = aes(label = channel), hjust = 1.4, size = 3) +
  annotate(
    geom = "text",
    x = 6000,
    y = 1700,
    label = "Delicassen products reach their highest spending point at $1753.4",
    hjust = 0
  ) +
  labs(
    y = "Average Spend in Delicassen Products",
    x = "Average Spend in Fresh Products",
    title = "Average Spending in Fresh and Delicassen Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}",
    color = "Channel"
  ) +
  theme(legend.position = "none")
# Legend version: points are labelled with their rounded (x, y) coordinates
# and the channel is identified through a legend placed inside the panel.
p <- ggplot(pip2, aes(x = fresh_mean, y = delicassen_mean, color = channel.c))

p +
  geom_point(size = 5) +
  # Coordinate label for every point, nudged below-left of the marker.
  geom_text(
    aes(
      label = paste(
        "(", round(fresh_mean, 1), ",", round(delicassen_mean, 1), ")"
      )
    ),
    vjust = 1,
    hjust = 1
  ) +
  annotate(
    geom = "text",
    x = 6000,
    y = 1700,
    label = "Delicassen products reach their highest spending point at $1753.4",
    hjust = 0
  ) +
  labs(
    title = "Average Spending in Fresh and Delicassen Products by Channel",
    subtitle = "Average spending in fresh products is significantly higher than delicassen products",
    caption = "customers dataset{datasetsICR}",
    y = "Average Spend in Delicassen Products",
    x = "Average Spend in Fresh Products",
    color = "Channel"
  ) +
  # The dangling comma after the last theme() argument was removed; it left
  # an empty argument in the call.
  # NOTE(review): a numeric legend.position is deprecated in ggplot2 >= 3.5.0
  # in favour of legend.position.inside; confirm the installed version
  # before switching.
  theme(
    legend.title = element_text(color = "black", size = 12),
    legend.position = c(x = 0.89, y = .89),
    panel.background = element_rect(fill = "white"),
    axis.line = element_line(colour = "black"),
    panel.grid.minor = element_line(colour = "lightgray")
  )
Insights
The average spending for Fresh products in Hotel-Restaurant and Retail is $13,475.6 and $8,904.3 respectively.
The average spending for Delicassen products in Hotel-Restaurant and Retail is $1,416.7 and $1,753.4, respectively.
The visual shows that spending in Fresh products is significantly higher than spending in Delicassen products, no matter the channel. It also shows that the total average spending between the two variables (Fresh products and Delicassen products) is higher in the Hotel-Restaurant channel.