# Recode the numeric Channel and Region codes into labelled factors, then
# drop the raw code columns so that only the spending variables and the two
# labelled factors remain.

# CHANNEL VARIABLE
# Codes 1 and 2 become descriptive labels. Any other code becomes NA instead
# of surviving as an unlabeled factor level.
customers$channel <- factor(
  customers$Channel,
  levels = c(1, 2),
  labels = c("Hotel-Restaurant", "Retail")
)

# REGION VARIABLE
customers$region <- factor(
  customers$Region,
  levels = c(1, 2, 3),
  labels = c("Lisbon", "Oporto", "Other")
)

# Remove the raw code columns by name rather than by position, so this step
# does not depend on the column order or on the number of columns.
customers <- customers %>%
  select(-Channel, -Region)

head(customers)
Fresh Milk Grocery Frozen Detergents_Paper Delicassen channel region
1 12669 9656 7561 214 2674 1338 Retail Other
2 7057 9810 9568 1762 3293 1776 Retail Other
3 6353 8808 7684 2405 3516 7844 Retail Other
4 13265 1196 4221 6404 507 1788 Hotel-Restaurant Other
5 22615 5410 7198 3915 1777 5185 Retail Other
6 9413 8259 5126 666 1795 1451 Retail Other
# Categorical variables: Channel and Region
# Numerical variables: Milk and Delicassen
Part 1: Summarizing two categorical variables using Pipes
When pipes are used to summarize the categorical variables, the combination with the highest percentage of observations is Retail (from channel) and Other (from region). In this combination the Other region accounts for 74% of Retail, meaning that the majority of retail customers come from a region other than the ones named. The second largest is Other and Hotel-Restaurant, where Other accounts for 71% of Hotel-Restaurant, meaning that most purchases made through the Hotel-Restaurant channel are in a region that is not named. This grouping also contains the largest number of observations in the whole dataset.
Part 2: Stacked, dodged, and horizontal (without a legend) bar charts.
# Title and caption shared by the bar charts in this section.
p_title <- "Customers purchases through Channel by Region"
p_caption <- "customers dataset {datasetsICR}"

# Stacked bar chart
# Base plot: rows of pip1 with a missing channel or region are dropped, and
# pct is drawn per channel, filled by region.
p <- ggplot(
  data = subset(pip1, !is.na(channel) & !is.na(region)),
  mapping = aes(x = channel, y = pct, fill = region)
)

p +
  geom_col(position = "stack") +
  # Print each pct value in the middle of its stacked segment.
  geom_text(
    aes(label = pct),
    position = position_stack(vjust = .5)
  ) +
  labs(
    title = p_title,
    subtitle = "The Other Region Accompanies the Largest Percentage of Customers Purchases by Channel",
    caption = p_caption,
    x = "Purchasing Channel",
    y = "Percent",
    fill = "Region"
  )
# Dodged bar chart: the same base plot `p`, with the regions drawn side by
# side within each channel instead of stacked.
p +
  geom_col(position = "dodge2") +
  # Label every bar with its pct value, dodged to sit over its own bar.
  geom_text(
    aes(label = pct),
    position = position_dodge(width = .9)
  ) +
  labs(
    title = p_title,
    subtitle = "The Other Region Accompanies the Largest Percentage of Customers Purchases by Channel",
    caption = p_caption,
    x = "Purchasing Channel",
    y = "Percent",
    fill = "Region"
  )
# Faceted horizontal bar chart (no legend): one panel per channel, one bar
# per region.
# The legend is switched off, so the region has to be readable from the axis
# itself. The base plot maps x to channel, which is overridden here with
# region; the channel is already conveyed by the facet strips.
p +
  aes(x = region) +
  geom_col(position = "dodge2") +
  labs(
    x = NULL,
    y = "Percent",
    fill = "Region",
    title = p_title,
    caption = p_caption,
    subtitle = "The Other Region Accompanies the Largest Percentage of Customers Purchases by Channel"
  ) +
  guides(fill = "none") +
  coord_flip() +
  facet_grid(~ channel) +
  geom_text(aes(label = pct), position = position_dodge2(width = 1))
Insights from the stacked bar chart, dodged bar chart, and faceted horizontal chart showing customer purchases through channel by region: the largest percentage within each channel belongs to the Other region. This is due to the data having very uneven categories for region; Other is the largest group, totaling more than Lisbon and Oporto combined. Other accounts for 71% of the Hotel-Restaurant category and 74% of the Retail category, meaning that Other is the largest group of customers purchasing through both channels. I believe that the stacked bar chart conveys this information in the clearest way.
Part 3: Using pipes to summarize two continuous and one categorical variable
# A tibble: 3 × 6
region N Milk_mean Delicassen_mean freq pct
<fct> <int> <dbl> <dbl> <dbl> <dbl>
1 Lisbon 77 5486. 1355. 0.175 18
2 Oporto 47 5088. 1160. 0.107 11
3 Other 316 5977. 1621. 0.718 72
# Count the number of customers in each region.
table(customers[["region"]])
Lisbon Oporto Other
77 47 316
Among the combinations of the categorical variable with the two continuous variables, the Other region has the highest percentage of observations. Other accounts for 72% of the observations summarized for Milk and Delicassen, while Lisbon accounts for 18% and Oporto for 11%. Other also has the highest mean Milk (about 5,977) and mean Delicassen (about 1,621) purchases. This makes sense because Other is the largest response among the observations.
Part 4: Scatterplot for the continuous and categorical variables
# Scatterplot of Milk against Delicassen purchases, coloured by region and
# split into one panel per region, with both axes on a log10 scale.
# The legend follows the factor level order of region (Lisbon, Oporto,
# Other); only its title is changed, which is adequate for reading the graph.
library(ggplot2)

p2 <- ggplot(customers, aes(x = Milk, y = Delicassen, color = region))

p2 +
  geom_point(size = 5, na.rm = TRUE) +
  labs(
    # Axis labels follow the mapping above: x is Milk, y is Delicassen.
    x = "Milk Purchases",
    y = "Delicassen Purchases",
    color = "Customer Region",
    title = "Milk and Delicassen by Region",
    # TODO: placeholder subtitle; replace with the actual finding.
    subtitle = "Interesting interpretation goes here",
    # Must be passed with `=`; `<-` here would not set the caption.
    caption = "customers dataset {datasetsICR}"
  ) +
  facet_wrap(~ region) +
  scale_x_log10() +
  scale_y_log10() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(color = "gray50", size = 14, face = "bold")
  )
Data Labels
# Scatterplot of Milk against Delicassen purchases with a data label on
# every point, one panel per region, both axes on a log10 scale.
# The legend is hidden because the facet strips and the point labels already
# name the region.
library(ggplot2)

p <- ggplot(customers, aes(x = Milk, y = Delicassen, color = region))

p +
  geom_point(size = 5, na.rm = TRUE) +
  # Label each point with its region, nudged to the left of the point.
  geom_text(mapping = aes(label = region), hjust = 1.2, size = 3) +
  labs(
    # Axis labels follow the mapping above: x is Milk, y is Delicassen.
    x = "Milk Purchases",
    y = "Delicassen Purchases",
    title = "Milk and Delicassen by Region",
    subtitle = "The Other Region is Positively Correlated with Outliers Associated with Higher Delicassen Purchases.",
    color = "Region"
  ) +
  theme(legend.position = "none") +
  facet_wrap(~ region) +
  scale_x_log10() +
  scale_y_log10()
These scatterplots show the relationship between Milk and Delicassen purchases, categorized by region. We can see that the Other region makes up the majority of the observations in the dataset. The x and y axes are on a log base 10 scale. The pattern seems consistent in that people who buy a lot of milk also buy from the Delicassen frequently. There are a few outliers within the Other region and the Lisbon region that buy more from the Delicassen than they do Milk. Otherwise the observations are tightly grouped together, leaving questions about whether other factors (like age or income) contribute to the relationship.