# load data
# Customer table (demographics plus purchase amount); preview the first rows.
customer <- read.csv(
  file = "/cloud/project/business/Customer_Demographics_Purchases.csv"
)
head(customer, n = 6L)
## CustomerID Name Age Gender Income Region PurchaseAmount
## 1 1 Grace Smith 56 Male 4721590 Kigali 388549.1
## 2 2 Daniel Davis 46 Female 2834937 Gulu 676498.1
## 3 3 Emily Jones 32 Female 3457473 Kampala 1712959.8
## 4 4 Grace Lee 60 Female 1112461 Dodoma 430010.1
## 5 5 Michael Perez 25 Male 4727036 Dar es Salaam 160893.4
## 6 6 Ava Johnson 38 Male 4355555 Kigali 156635.0
## ProductCategory RepeatPurchase
## 1 Accessories Yes
## 2 Home Appliances Yes
## 3 Home Appliances Yes
## 4 Home Appliances No
## 5 Electronics Yes
## 6 Home Appliances No
# Customer feedback / retention table; preview the first rows.
feedback <- read.csv(
  file = "/cloud/project/business/Customer_Feedback_Retention.csv"
)
head(feedback, n = 6L)
## CustomerID Name SatisfactionScore ServiceQuality SupportTickets
## 1 803 Michael Perez 3 10 9
## 2 209 Emily Miller 3 2 8
## 3 524 James Thompson 4 3 7
## 4 508 Noah Lopez 1 2 1
## 5 175 Grace Brown 4 1 7
## 6 355 Benjamin Martinez 1 7 2
## RetainedNextYear
## 1 Yes
## 2 Yes
## 3 No
## 4 Yes
## 5 Yes
## 6 Yes
# Product sales transactions table; preview the first rows.
product <- read.csv(
  file = "/cloud/project/business/Product_Sales_Transactions.csv"
)
head(product, n = 6L)
## TransactionID CustomerID ProductID ProductName ProductCategory Price
## 1 1 181 P0211 Tablet Electronics 1152268.4
## 2 2 743 P0048 Router Accessories 1194958.3
## 3 3 301 P0086 Camera Accessories 332285.8
## 4 4 660 P0149 Speaker Electronics 2748024.1
## 5 5 792 P0188 Smartwatch Electronics 318557.1
## 6 6 189 P0372 Smartphone Electronics 1962217.1
## UnitsSold DiscountApplied Season Region
## 1 13 0.04 Q4 Entebbe
## 2 9 0.28 Q4 Mombasa
## 3 7 0.07 Q4 Kampala
## 4 12 0.30 Q1 Entebbe
## 5 5 0.15 Q1 Arusha
## 6 18 0.23 Q2 Kampala
# What is the average of PurchaseAmount by Region?
# step1: install r packages
# Install dplyr only when it is not already available, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# step2: compute the mean PurchaseAmount within each Region
# (missing purchase amounts are skipped; the result is returned ungrouped)
average_purchase <- summarise(
  group_by(customer, Region),
  Avg_Purchase = mean(PurchaseAmount, na.rm = TRUE),
  .groups = "drop"
)
# step3: print results
print(average_purchase)
## # A tibble: 10 × 2
## Region Avg_Purchase
## <chr> <dbl>
## 1 Arusha 1118920.
## 2 Dar es Salaam 1023535.
## 3 Dodoma 970052.
## 4 Entebbe 1017553.
## 5 Gulu 935893.
## 6 Jinja 945965.
## 7 Kampala 996701.
## 8 Kigali 886316.
## 9 Mombasa 1069363.
## 10 Nairobi 1036287.
# step4: visualize results
# Install ggplot2 only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2)
# Bar chart: one bar per region, bar height = average purchase amount.
ggplot(average_purchase, aes(x = Region, y = Avg_Purchase)) +
  geom_col(colour = "black", fill = "lightblue") +
  labs(title = "Average Purchases by Region")

# What is the gender distribution of your customers?
# step1: count the customer rows in each Gender group
library(dplyr)
gender_distribution <- summarise(
  group_by(customer, Gender),
  count = n(),
  .groups = "drop"
)
# step2: print results
print(gender_distribution)
## # A tibble: 2 × 2
## Gender count
## <chr> <int>
## 1 Female 474
## 2 Male 526
# step3: visualize results
# Bar chart: one bar per gender, bar height = number of customers.
library(ggplot2)
ggplot(gender_distribution, aes(x = Gender, y = count)) +
  geom_col(colour = "black", fill = "orange")

# Which ProductCategory is most popular overall and by region?
# step1: join product and customer tables
# Both tables carry ProductCategory and Region, so the join suffixes them:
# `.x` columns come from `product` (left side), `.y` from `customer`.
library(dplyr)
product_customer <- product %>%
  left_join(customer, by = "CustomerID")
# step2: count transactions per (product category, transaction region) pair,
# sort from most to least frequent and keep the first ten pairs
# NOTE(review): only the `.x` (product-side) columns are used here, so the
# join affects the counts only if a CustomerID matches several customer rows
# -- confirm CustomerID is unique in `customer`.
# NOTE(review): this yields the ten largest category/region pairs; it does not
# by itself report one overall winner or one winner per region -- confirm this
# is the intended answer to the question above.
popular_category <- product_customer %>%
  group_by(ProductCategory.x, Region.x) %>%
  summarise(ProductCount = n(), .groups = "drop") %>%
  arrange(desc(ProductCount)) %>%
  slice(seq_len(10))
# step3: print results
print(popular_category)
## # A tibble: 10 × 3
## ProductCategory.x Region.x ProductCount
## <chr> <chr> <int>
## 1 Home Appliances Gulu 53
## 2 Electronics Gulu 49
## 3 Accessories Dodoma 48
## 4 Electronics Nairobi 47
## 5 Home Appliances Dar es Salaam 47
## 6 Electronics Entebbe 45
## 7 Electronics Kampala 44
## 8 Accessories Entebbe 43
## 9 Home Appliances Nairobi 43
## 10 Home Appliances Entebbe 42
# step4: visualize output
# Side-by-side bars: categories on the x axis, one bar per region.
ggplot(
  popular_category,
  aes(x = ProductCategory.x, y = ProductCount, fill = Region.x)
) +
  geom_col(colour = "black", position = "dodge")

# Predict UnitsSold using Price, DiscountApplied, Season, and Region.
# build a linear model
# The formula lists all four predictors named in the task. The earlier version
# used only DiscountApplied and Region, leaving Price and Season out.
# Non-numeric predictors are dummy-coded by lm() against a baseline level.
linear_model <- lm(
  UnitsSold ~ Price + DiscountApplied + Season + Region,
  data = product
)
# print results
# NOTE: any summary output recorded below this call was produced with the
# reduced formula UnitsSold ~ DiscountApplied + Region; re-run the script to
# refresh it.
summary(linear_model)
##
## Call:
## lm(formula = UnitsSold ~ DiscountApplied + Region, data = product)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1028 -4.5971 -0.1062 4.7963 9.3907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.370745 0.575721 18.013 <2e-16 ***
## DiscountApplied -0.334064 1.827831 -0.183 0.855
## RegionDar es Salaam -0.545599 0.717468 -0.760 0.447
## RegionDodoma -0.687980 0.700754 -0.982 0.326
## RegionEntebbe 0.745460 0.699494 1.066 0.287
## RegionGulu -0.319901 0.696987 -0.459 0.646
## RegionJinja 0.312071 0.747319 0.418 0.676
## RegionKampala -0.009295 0.713308 -0.013 0.990
## RegionKigali 0.300138 0.711598 0.422 0.673
## RegionMombasa -0.091010 0.732091 -0.124 0.901
## RegionNairobi -0.479452 0.702825 -0.682 0.495
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.463 on 1189 degrees of freedom
## Multiple R-squared: 0.006299, Adjusted R-squared: -0.002058
## F-statistic: 0.7537 on 10 and 1189 DF, p-value: 0.6738
# Predict if customer will make a repeat purchase based on age, gender & income
# step1: convert repeat purchase to factor so glm() can use it as a binary
# response (for a factor response, the first level is treated as "failure")
customer[["RepeatPurchase"]] <- as.factor(customer[["RepeatPurchase"]])
# step2: build logistic regression model
logistical_model <- glm(
  formula = RepeatPurchase ~ Age + Gender + Income,
  family = "binomial",
  data = customer
)
# step3: print results
summary(logistical_model)
##
## Call:
## glm(formula = RepeatPurchase ~ Age + Gender + Income, family = "binomial",
## data = customer)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.006e-01 2.633e-01 0.382 0.702
## Age 4.298e-03 4.832e-03 0.890 0.374
## GenderMale -1.994e-02 1.301e-01 -0.153 0.878
## Income 6.701e-08 4.960e-08 1.351 0.177
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1337.5 on 999 degrees of freedom
## Residual deviance: 1335.1 on 996 degrees of freedom
## AIC: 1343.1
##
## Number of Fisher Scoring iterations: 4
# Do UnitsSold differ significantly across Season?
# step1: fit a one-way ANOVA of UnitsSold on Season to test whether the
# seasonal means differ
anova_result <- aov(
  UnitsSold ~ Season,
  data = product
)
# step2: print the ANOVA table
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Season 3 76 25.22 0.846 0.469
## Residuals 1196 35640 29.80
# Is there an association between ProductCategory and Season for high-volume
# sales?
# NOTE(review): no volume filter is applied below -- every row of `product`
# is tabulated. Confirm whether a "high volume" subset was intended.
# step1: create a contingency table (rows = product category, cols = season)
tbl <- table(product[["ProductCategory"]], product[["Season"]])
# step2: run Pearson's chi-squared test on the table
chi_result <- chisq.test(tbl)
# step3: print result
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 12.232, df = 6, p-value = 0.05698
# Compare average PurchaseAmount between Male vs Female customers.
# step1: two-sample t-test of PurchaseAmount split by Gender
# (t.test() defaults to the Welch variant, which does not assume equal
# variances in the two groups)
t_result <- t.test(
  PurchaseAmount ~ Gender,
  data = customer
)
# step2: print result
print(t_result)
##
## Welch Two Sample t-test
##
## data: PurchaseAmount by Gender
## t = -0.78701, df = 988.8, p-value = 0.4315
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -98611.64 42156.43
## sample estimates:
## mean in group Female mean in group Male
## 984920.4 1013148.0
# What is the income categorization count?
# step1: establish income categories
# cut() builds right-closed intervals, so with include.lowest = TRUE the bins
# are [300000, 1000000], (1000000, 2000000], ..., (4000000, 5000000].
# Each label therefore starts one unit above the previous upper bound; the
# earlier "1001000-2000000"-style labels suggested gaps that the bins do not
# have. (Labels assume whole-number incomes -- TODO confirm.)
income_category <- cut(
  customer$Income,
  breaks = c(300000, 1000000, 2000000, 3000000, 4000000, 5000000),
  labels = c(
    "300000-1000000",
    "1000001-2000000",
    "2000001-3000000",
    "3000001-4000000",
    "4000001-5000000"
  ),
  include.lowest = TRUE
)
# cut() returns NA for values outside the breaks; report that instead of
# letting such customers slip silently into an NA category.
if (any(is.na(income_category) & !is.na(customer$Income))) {
  warning(
    "Some Income values fall outside 300000-5000000 and were not categorized",
    call. = FALSE
  )
}
# step2: create new column using mutate function
library(dplyr)
customer <- customer %>%
  mutate(IncomeCategory = income_category)
# step3: count customers per income category and region
categorization <- customer %>%
  group_by(IncomeCategory, Region) %>%
  summarise(CustomerCount = n(), .groups = "drop")
# step4: print results
print(categorization)
## # A tibble: 50 × 3
## IncomeCategory Region CustomerCount
## <fct> <chr> <int>
## 1 300000-1000000 Arusha 12
## 2 300000-1000000 Dar es Salaam 16
## 3 300000-1000000 Dodoma 11
## 4 300000-1000000 Entebbe 11
## 5 300000-1000000 Gulu 12
## 6 300000-1000000 Jinja 10
## 7 300000-1000000 Kampala 12
## 8 300000-1000000 Kigali 13
## 9 300000-1000000 Mombasa 13
## 10 300000-1000000 Nairobi 11
## # ℹ 40 more rows
# step5: visualize results
# Side-by-side bars: income categories on the x axis, one bar per region;
# x labels are rotated 45 degrees so the long category names stay readable.
library(ggplot2)
ggplot(
  categorization,
  aes(x = IncomeCategory, y = CustomerCount, fill = Region)
) +
  geom_col(colour = "black", position = "dodge") +
  labs(title = "Income Categorization Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
