# Load the customer demographics/purchases table and preview its first rows.
customer <- read.csv(
  file = "/cloud/project/business/Customer_Demographics_Purchases.csv"
)
head(customer, n = 6L)
##   CustomerID          Name Age Gender  Income        Region PurchaseAmount
## 1          1   Grace Smith  56   Male 4721590        Kigali       388549.1
## 2          2  Daniel Davis  46 Female 2834937          Gulu       676498.1
## 3          3   Emily Jones  32 Female 3457473       Kampala      1712959.8
## 4          4     Grace Lee  60 Female 1112461        Dodoma       430010.1
## 5          5 Michael Perez  25   Male 4727036 Dar es Salaam       160893.4
## 6          6   Ava Johnson  38   Male 4355555        Kigali       156635.0
##   ProductCategory RepeatPurchase
## 1     Accessories            Yes
## 2 Home Appliances            Yes
## 3 Home Appliances            Yes
## 4 Home Appliances             No
## 5     Electronics            Yes
## 6 Home Appliances             No
# Load the customer feedback/retention table and preview its first rows.
feedback <- read.csv(
  file = "/cloud/project/business/Customer_Feedback_Retention.csv"
)
head(feedback, n = 6L)
##   CustomerID              Name SatisfactionScore ServiceQuality SupportTickets
## 1        803     Michael Perez                 3             10              9
## 2        209      Emily Miller                 3              2              8
## 3        524    James Thompson                 4              3              7
## 4        508        Noah Lopez                 1              2              1
## 5        175       Grace Brown                 4              1              7
## 6        355 Benjamin Martinez                 1              7              2
##   RetainedNextYear
## 1              Yes
## 2              Yes
## 3               No
## 4              Yes
## 5              Yes
## 6              Yes
# Load the product sales transactions table and preview its first rows.
product <- read.csv(
  file = "/cloud/project/business/Product_Sales_Transactions.csv"
)
head(product, n = 6L)
##   TransactionID CustomerID ProductID ProductName ProductCategory     Price
## 1             1        181     P0211      Tablet     Electronics 1152268.4
## 2             2        743     P0048      Router     Accessories 1194958.3
## 3             3        301     P0086      Camera     Accessories  332285.8
## 4             4        660     P0149     Speaker     Electronics 2748024.1
## 5             5        792     P0188  Smartwatch     Electronics  318557.1
## 6             6        189     P0372  Smartphone     Electronics 1962217.1
##   UnitsSold DiscountApplied Season  Region
## 1        13            0.04     Q4 Entebbe
## 2         9            0.28     Q4 Mombasa
## 3         7            0.07     Q4 Kampala
## 4        12            0.30     Q1 Entebbe
## 5         5            0.15     Q1  Arusha
## 6        18            0.23     Q2 Kampala
# What is the average of PurchaseAmount by Region?

# step1: install r packages
# Only install dplyr when it is missing, so that re-running the script does
# not reinstall the package (and does not need network access) every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# step2: compute the mean PurchaseAmount within each Region
# (missing amounts are skipped; the result is returned ungrouped)
average_purchase <- summarise(
  group_by(customer, Region),
  Avg_Purchase = mean(PurchaseAmount, na.rm = TRUE),
  .groups = "drop"
)

# step3: display the per-region averages
print(average_purchase)
## # A tibble: 10 × 2
##    Region        Avg_Purchase
##    <chr>                <dbl>
##  1 Arusha            1118920.
##  2 Dar es Salaam     1023535.
##  3 Dodoma             970052.
##  4 Entebbe           1017553.
##  5 Gulu               935893.
##  6 Jinja              945965.
##  7 Kampala            996701.
##  8 Kigali             886316.
##  9 Mombasa           1069363.
## 10 Nairobi           1036287.
# step4: visualize results
# Only install ggplot2 when it is missing, so that re-running the script does
# not reinstall the package (and does not need network access) every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2)
# Bar chart: one bar per region, bar height = mean purchase amount.
ggplot(average_purchase, aes(x = Region, y = Avg_Purchase)) +
  geom_col(colour = "black", fill = "lightblue") +
  labs(title = "Average Purchases by Region")

# What is the gender distribution of your customers?

# step1: tally how many customer rows fall under each Gender value
library(dplyr)
gender_distribution <- summarise(
  group_by(customer, Gender),
  count = n(),
  .groups = "drop"
)

# step2: display the tally
print(gender_distribution)
## # A tibble: 2 × 2
##   Gender count
##   <chr>  <int>
## 1 Female   474
## 2 Male     526
# step3: bar chart of the customer count per gender
library(ggplot2)
ggplot(gender_distribution, aes(x = Gender, y = count)) +
  geom_col(colour = "black", fill = "orange")

# Which ProductCategory is most popular overall and by region?

# step1: attach the customer columns to every transaction via CustomerID
library(dplyr)
product_customer <- product %>%
  left_join(customer, by = "CustomerID")

# step2: count transactions for every (category, region) pair and keep the
# ten most frequent pairs. The ".x" suffix marks the columns that came from
# `product`, the left-hand table of the join.
# NOTE(review): the aggregation only reads product-side columns, so the join
# above does not appear to contribute to these counts -- confirm it is needed.
# NOTE(review): only the by-region ranking is produced here; an overall
# ranking by category alone is not computed -- confirm whether it is required.
popular_category <- product_customer %>%
  group_by(ProductCategory.x, Region.x) %>%
  summarise(ProductCount = n(), .groups = "drop") %>%
  arrange(desc(ProductCount)) %>%
  slice_head(n = 10)

# step3: display the top ten pairs
print(popular_category)
## # A tibble: 10 × 3
##    ProductCategory.x Region.x      ProductCount
##    <chr>             <chr>                <int>
##  1 Home Appliances   Gulu                    53
##  2 Electronics       Gulu                    49
##  3 Accessories       Dodoma                  48
##  4 Electronics       Nairobi                 47
##  5 Home Appliances   Dar es Salaam           47
##  6 Electronics       Entebbe                 45
##  7 Electronics       Kampala                 44
##  8 Accessories       Entebbe                 43
##  9 Home Appliances   Nairobi                 43
## 10 Home Appliances   Entebbe                 42
# step4: dodged bar chart of the top pairs, one bar colour per region
ggplot(
  popular_category,
  aes(x = ProductCategory.x, y = ProductCount, fill = Region.x)
) +
  geom_col(colour = "black", position = "dodge")

# Predict UnitsSold using Price, DiscountApplied, Season, and Region.

# Fit a linear regression of units sold.
# NOTE(review): the formula below only includes DiscountApplied and Region;
# Price and Season from the question above are not in the model -- confirm
# whether leaving them out is intentional.
linear_model <- lm(
  formula = UnitsSold ~ DiscountApplied + Region,
  data = product
)

# Show the coefficient table and fit statistics.
summary(linear_model)
## 
## Call:
## lm(formula = UnitsSold ~ DiscountApplied + Region, data = product)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1028  -4.5971  -0.1062   4.7963   9.3907 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         10.370745   0.575721  18.013   <2e-16 ***
## DiscountApplied     -0.334064   1.827831  -0.183    0.855    
## RegionDar es Salaam -0.545599   0.717468  -0.760    0.447    
## RegionDodoma        -0.687980   0.700754  -0.982    0.326    
## RegionEntebbe        0.745460   0.699494   1.066    0.287    
## RegionGulu          -0.319901   0.696987  -0.459    0.646    
## RegionJinja          0.312071   0.747319   0.418    0.676    
## RegionKampala       -0.009295   0.713308  -0.013    0.990    
## RegionKigali         0.300138   0.711598   0.422    0.673    
## RegionMombasa       -0.091010   0.732091  -0.124    0.901    
## RegionNairobi       -0.479452   0.702825  -0.682    0.495    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.463 on 1189 degrees of freedom
## Multiple R-squared:  0.006299,   Adjusted R-squared:  -0.002058 
## F-statistic: 0.7537 on 10 and 1189 DF,  p-value: 0.6738
# Predict if customer will make a repeat purchase based on age, gender & income

# step1: store RepeatPurchase as a factor for use as the model response
customer[["RepeatPurchase"]] <- as.factor(customer[["RepeatPurchase"]])

# step2: fit a logistic regression (binomial family) on Age, Gender and Income
logistical_model <- glm(
  RepeatPurchase ~ Age + Gender + Income,
  family = "binomial",
  data = customer
)

# step3: show the coefficient table and deviance summary
summary(logistical_model)
## 
## Call:
## glm(formula = RepeatPurchase ~ Age + Gender + Income, family = "binomial", 
##     data = customer)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.006e-01  2.633e-01   0.382    0.702
## Age          4.298e-03  4.832e-03   0.890    0.374
## GenderMale  -1.994e-02  1.301e-01  -0.153    0.878
## Income       6.701e-08  4.960e-08   1.351    0.177
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1337.5  on 999  degrees of freedom
## Residual deviance: 1335.1  on 996  degrees of freedom
## AIC: 1343.1
## 
## Number of Fisher Scoring iterations: 4
# Do UnitsSold differ significantly across Season?

# step1: fit a one-way ANOVA of UnitsSold on Season
anova_result <- aov(formula = UnitsSold ~ Season, data = product)

# step2: show the ANOVA table
summary(anova_result)
##               Df Sum Sq Mean Sq F value Pr(>F)
## Season         3     76   25.22   0.846  0.469
## Residuals   1196  35640   29.80
# Is there an association between ProductCategory and Season for high vol
# sales?
# NOTE(review): no high-volume filter is applied below; every row of `product`
# is tabulated -- confirm whether a UnitsSold threshold was intended.

# step1: cross-tabulate product category against season
tbl <- table(product[["ProductCategory"]], product[["Season"]])

# step2: run the chi-squared test on the contingency table
chi_result <- chisq.test(tbl)

# step3: display the test result
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 12.232, df = 6, p-value = 0.05698
# Compare average PurchaseAmount between Male vs Female customers.

# step1: two-sample t-test of PurchaseAmount split by Gender
# (var.equal = FALSE is the default and gives the Welch test)
t_result <- t.test(
  PurchaseAmount ~ Gender,
  data = customer,
  var.equal = FALSE
)

# step2: display the test result
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  PurchaseAmount by Gender
## t = -0.78701, df = 988.8, p-value = 0.4315
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -98611.64  42156.43
## sample estimates:
## mean in group Female   mean in group Male 
##             984920.4            1013148.0
# what is the income categorization count?

# step1: establish income categories
# Bin edges for customer$Income. cut() builds right-closed intervals by
# default and include.lowest = TRUE also closes the first bin on the left,
# so the bins are [300000, 1000000], (1000000, 2000000], ...,
# (4000000, 5000000].
income_breaks <- c(300000, 1000000, 2000000, 3000000, 4000000, 5000000)

# Labels state the bounds each bin actually covers for whole-number incomes:
# every bin after the first starts one unit above the previous upper edge
# (previously written as "1001000-...", which did not match the intervals).
income_labels <- c(
  "300000-1000000",
  "1000001-2000000",
  "2000001-3000000",
  "3000001-4000000",
  "4000001-5000000"
)

income_category <- cut(
  customer$Income,
  breaks = income_breaks,
  labels = income_labels,
  include.lowest = TRUE
)

# cut() returns NA for values outside the breaks; report that instead of
# letting those customers silently drop into an NA category.
income_out_of_range <- sum(is.na(income_category) & !is.na(customer$Income))
if (income_out_of_range > 0) {
  warning(
    income_out_of_range,
    " Income value(s) fall outside the category breaks and were set to NA",
    call. = FALSE
  )
}

# step2: create new column using mutate function
library(dplyr)
customer <- customer %>%
  mutate(IncomeCategory = income_category)

# step3: count customers in every (income category, region) combination
categorization <- summarise(
  group_by(customer, IncomeCategory, Region),
  CustomerCount = n(),
  .groups = "drop"
)

# step4: display the counts
print(categorization)
## # A tibble: 50 × 3
##    IncomeCategory Region        CustomerCount
##    <fct>          <chr>                 <int>
##  1 300000-1000000 Arusha                   12
##  2 300000-1000000 Dar es Salaam            16
##  3 300000-1000000 Dodoma                   11
##  4 300000-1000000 Entebbe                  11
##  5 300000-1000000 Gulu                     12
##  6 300000-1000000 Jinja                    10
##  7 300000-1000000 Kampala                  12
##  8 300000-1000000 Kigali                   13
##  9 300000-1000000 Mombasa                  13
## 10 300000-1000000 Nairobi                  11
## # ℹ 40 more rows
# step5: dodged bar chart of customer counts per income category, coloured by
# region; x-axis labels are rotated 45 degrees
library(ggplot2)
ggplot(
  categorization,
  aes(x = IncomeCategory, y = CustomerCount, fill = Region)
) +
  geom_col(colour = "black", position = "dodge") +
  labs(title = "Income Categorization Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))