# load dataset
customer <- read.csv("/cloud/project/retail/customer_data.csv")
head(customer)
##   Age Gender Income SpendScore EducationLevel Region PurchasedProduct Rating
## 1  56   Male  41672         19      Bachelors  South                0      1
## 2  46 Female  78217         37            PhD   West                1      5
## 3  32 Female  28279         85      Bachelors   West                1      2
## 4  60 Female  17017          8    High School   East                1      2
## 5  25   Male  56600         28      Bachelors   West                1      1
## 6  38   Male  42469          5      Bachelors  North                1      5
##   CustomerSatisfaction YearsWithCompany
## 1                  Low                5
## 2               Medium               10
## 3                  Low               28
## 4                  Low               10
## 5                  Low                6
## 6               Medium                1
# What are the average and median income levels of customers across different regions?

#Step1: load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# step1: group by region
mean_median <- customer %>%
  group_by(Region)%>%
  summarise(
    avg_income=mean(Income,na.rm=TRUE),
    med_income=median(Income,na.rm=TRUE)
  ,.groups='drop')

# step2: print results
print(mean_median)
## # A tibble: 4 × 3
##   Region avg_income med_income
##   <chr>       <dbl>      <dbl>
## 1 East       48973.     48031 
## 2 North      51949.     51800 
## 3 South      50658.     51371 
## 4 West       51868.     51116.
# step3: visualize output
library(ggplot2)

ggplot(data=mean_median,mapping=aes(x=Region,y=avg_income,fill=med_income))+
  geom_col()

# How does the spend score vary by age group or gender?
# Step1:categorise the age
age_category <- cut(customer$Age,
                    breaks=c(18,29,39,49,59,69),
                    labels=c("18-29","30-39","40-49","50-59","60-69"),
include.lowest='true')

# Step2:create new category column
customer <- customer %>%
  mutate(Age_Category=age_category)

# Step3: calculate the spend score by gender and age category 
spend_demographic <- customer%>%
  group_by(Gender,Age_Category)%>%
  summarise(Avg_score=mean(SpendScore,na.rm=TRUE),.groups='drop')

# Step4: print results
print(spend_demographic)
## # A tibble: 10 × 3
##    Gender Age_Category Avg_score
##    <chr>  <fct>            <dbl>
##  1 Female 18-29             55.6
##  2 Female 30-39             49.2
##  3 Female 40-49             46.8
##  4 Female 50-59             52.4
##  5 Female 60-69             57.1
##  6 Male   18-29             50.2
##  7 Male   30-39             50.9
##  8 Male   40-49             51.8
##  9 Male   50-59             49.4
## 10 Male   60-69             46.7
# Step5: visualize results
ggplot(data=spend_demographic,mapping=aes(x=Age_Category,y=Avg_score,fill=Gender))+
  geom_col()+labs(title='How spend score varies by age group and gender')+
  theme_minimal()

#What is the distribution of education levels across customers?
educ_distribution <- customer %>%
  group_by(EducationLevel)%>%
  summarise(Count=n(),.groups='drop')
print(educ_distribution)
## # A tibble: 4 × 2
##   EducationLevel Count
##   <chr>          <int>
## 1 Bachelors        247
## 2 High School      241
## 3 Masters          262
## 4 PhD              250
#visualise output
ggplot(data=educ_distribution,mapping=aes(x=EducationLevel,y=Count, fill=EducationLevel))+
  geom_col()+labs(title='Distribution of Education Levels')+
  theme_minimal()

#Which regions have the highest number of purchases, and how does this relate to customer satisfaction?

top_regions <- customer%>%
  filter(!is.na(PurchasedProduct),PurchasedProduct==1)%>%
  group_by(Region,CustomerSatisfaction)%>%
  summarise(purchase_count=n(),.groups='drop')
  print(top_regions)
## # A tibble: 12 × 3
##    Region CustomerSatisfaction purchase_count
##    <chr>  <chr>                         <int>
##  1 East   High                             63
##  2 East   Low                              77
##  3 East   Medium                           51
##  4 North  High                             52
##  5 North  Low                              45
##  6 North  Medium                           53
##  7 South  High                             64
##  8 South  Low                              50
##  9 South  Medium                           64
## 10 West   High                             63
## 11 West   Low                              58
## 12 West   Medium                           63
# print results
  ggplot(data=top_regions,mapping=aes(x=Region,y=purchase_count,fill=CustomerSatisfaction))+
    geom_col()+labs(title='Regions with highest purchases')+
    theme_minimal()

#Is there a significant correlation between income and spend score?
correlation_result <- cor(customer$Income,customer$SpendScore,use='complete.obs')
print(correlation_result)
## [1] -0.04283549
#Does the age of customers significantly affect their purchase behavior 

#Step1: use ANOVA for mean spend score across age groups
anova_result <- aov(SpendScore~Age_Category,data=customer)
print (anova_result)
## Call:
##    aov(formula = SpendScore ~ Age_Category, data = customer)
## 
## Terms:
##                 Age_Category Residuals
## Sum of Squares        1377.6  824400.2
## Deg. of Freedom            4       995
## 
## Residual standard error: 28.78442
## Estimated effects may be unbalanced
summary(anova_result)
##               Df Sum Sq Mean Sq F value Pr(>F)
## Age_Category   4   1378   344.4   0.416  0.797
## Residuals    995 824400   828.5
#Step2: visualize output
ggplot(data=anova_result,mapping=aes(x=Age_Category,y=SpendScore))+
  geom_boxplot(outlier.colour = 'red')

#Are there significant differences in income between different education levels?
aov_result <- aov(Income~EducationLevel,data=customer)
print(aov_result)
## Call:
##    aov(formula = Income ~ EducationLevel, data = customer)
## 
## Terms:
##                 EducationLevel    Residuals
## Sum of Squares       355277363 224695052611
## Deg. of Freedom              3          996
## 
## Residual standard error: 15019.9
## Estimated effects may be unbalanced
summary(aov_result)
##                 Df    Sum Sq   Mean Sq F value Pr(>F)
## EducationLevel   3 3.553e+08 118425788   0.525  0.665
## Residuals      996 2.247e+11 225597442
#Visualize results
ggplot(data=aov_result,mapping=aes(x=EducationLevel,y=Income))+
  geom_boxplot(outlier.colour = 'red')

# Is there an association between gender and purchase behavior?

chisquare_result <- chisq.test(customer$Gender,customer$PurchasedProduct)
print(chisquare_result)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  customer$Gender and customer$PurchasedProduct
## X-squared = 0.42833, df = 1, p-value = 0.5128
# summarize result
gender_purchase_df <- customer%>%
  count(Gender,PurchasedProduct)

# visualize output
ggplot(gender_purchase_df,mapping=aes(x=Gender,y=n,fill=factor(PurchasedProduct)))+
  geom_bar(stat='identity',position='stack') +
  labs(title='Association between Gender and Purchase behavior')

#Does region have a significant association with customer satisfaction levels?
chisq_df <- chisq.test(customer$Region,customer$CustomerSatisfaction)
print(chisq_df)
## 
##  Pearson's Chi-squared test
## 
## data:  customer$Region and customer$CustomerSatisfaction
## X-squared = 7.7256, df = 6, p-value = 0.2589
#Summarize the data
region_custsatisfaction_df <- customer %>%
  count(Region, CustomerSatisfaction)

#Visualize output
ggplot(data=region_custsatisfaction_df,mapping=aes(x=Region,y=n,fill=factor(CustomerSatisfaction)))+
  geom_col()+
  labs(title='Association of Region and Customer Satisfaction')

#Is there a relationship between education level and purchased product status?
chisq.df <- chisq.test(customer$EducationLevel,customer$PurchasedProduct)
print(chisq.df)
## 
##  Pearson's Chi-squared test
## 
## data:  customer$EducationLevel and customer$PurchasedProduct
## X-squared = 1.5835, df = 3, p-value = 0.6631
#Summarize data
education_purchase_df <- customer %>%
  count(EducationLevel,PurchasedProduct)

#Visualize result
ggplot(data=education_purchase_df,mapping=aes(x=EducationLevel,y=n,fill=factor(PurchasedProduct)))+
         geom_bar(stat='identity',position='stack')+
  theme(axis.text.x = element_text(angle=45,hjust=1))+
  labs(title='Relationship-Education Level & Purchased Product')

#Can we predict spend score based on income, age, and years with the company?
model <- lm(SpendScore ~ Income + Age + YearsWithCompany, data = customer)
print(model)
## 
## Call:
## lm(formula = SpendScore ~ Income + Age + YearsWithCompany, data = customer)
## 
## Coefficients:
##      (Intercept)            Income               Age  YearsWithCompany  
##        5.693e+01        -8.337e-05        -6.480e-02         5.579e-02
#How does income influence the likelihood of a purchase?
pp_model <- glm(PurchasedProduct~Income,data=customer,family=binomial)
print(pp_model)
## 
## Call:  glm(formula = PurchasedProduct ~ Income, family = binomial, data = customer)
## 
## Coefficients:
## (Intercept)       Income  
##   7.476e-01    2.247e-06  
## 
## Degrees of Freedom: 999 Total (i.e. Null);  998 Residual
## Null Deviance:       1217 
## Residual Deviance: 1216  AIC: 1220
#Does customer satisfaction rating have a significant impact on years with the company?
model_2 <- lm(YearsWithCompany~CustomerSatisfaction,data=customer)
print(model_2)
## 
## Call:
## lm(formula = YearsWithCompany ~ CustomerSatisfaction, data = customer)
## 
## Coefficients:
##                (Intercept)     CustomerSatisfactionLow  
##                    15.0117                     -0.5953  
## CustomerSatisfactionMedium  
##                    -1.3675
# What are the outliers in spend score and income, and how do they differ by region?

#Step1: calculate IQR for spendscore  
iqr_spendscore <- IQR(customer$SpendScore,na.rm=TRUE)
q1_spendscore <- quantile(customer$SpendScore,0.25,na.rm=TRUE)
q3_spendscore <- quantile(customer$SpendScore,0.75,na.rm=TRUE)

#Step2: calculate upper and lower bounds
lower_bound_spendscore <- q1_spendscore - 1.5 * iqr_spendscore
upper_bound_spendscore <- q3_spendscore + 1.5 * iqr_spendscore

#Step3: identify outliers in spendscore
outliers_spendscore <- customer$SpendScore < lower_bound_spendscore | customer$SpendScore > upper_bound_spendscore

#Step4: calculate iqr for income
iqr_income <- IQR(customer$Income,na.rm=TRUE)
q1_income <- quantile(customer$Income,0.25,na.rm=TRUE)
q3_income <- quantile(customer$Income,0.75,na.rm=TRUE)

#Step5: calculate upper and lower bounds
lower_bound_income <- q1_income - 1.5 * iqr_income
upper_bound_income <- q3_income + 1.5 * iqr_income

#Step6: identify outliers in income
outliers_income <- customer$Income < lower_bound_income | customer$Income > upper_bound_income

#Step7: add outlier columns in customer dataset
customer <- customer %>%
  mutate(
    Outliers_SpendScore=outliers_spendscore,
    Outliers_Income=outliers_income
      )

#Step8:group outliers by region
regional_outliers <- customer %>%
  group_by(Region)%>%
  summarise(
    total_outliers_spendscore=sum(Outliers_SpendScore),
    total_outliers_income=sum(Outliers_Income)
  ,.groups='drop')
#Step9: print results
print(regional_outliers)
## # A tibble: 4 × 3
##   Region total_outliers_spendscore total_outliers_income
##   <chr>                      <int>                 <int>
## 1 East                           0                     3
## 2 North                          0                     1
## 3 South                          0                     1
## 4 West                           0                     1
#Step10: visualize spendscore outlier output
ggplot(data=customer,mapping=aes(x=Region,y=SpendScore))+
  geom_boxplot(outlier.colour = "red") +
  labs(title='SpendScore Outliers by Region')

#Step11: visualize income outlier output
ggplot(data=customer,mapping=aes(x=Region,y=Income))+
  geom_boxplot(outlier.colour = 'red') + labs(title="Regional Outliiers by Income")

#Are there any extreme values in years with company that affect purchasing behavior?

# Calculate the years IQR
years_iqr <- IQR(customer$YearsWithCompany,na.rm=TRUE)
q1_years <- quantile(customer$YearsWithCompany,0.25,na.rm=TRUE)
q3_years <- quantile(customer$YearsWithCompany,0.75,na.rm=TRUE)

# Calculate lower and upper bound
years_lower_bound <- q1_years - 1.5 * years_iqr
years_upper_bound <- q3_years + 1.5 * years_iqr

# Establish the years outliers in customer dataset
years_outliers <- customer$YearsWithCompany < years_lower_bound | customer$YearsWithCompany > years_upper_bound

# Create a new years outliers columns
customer <- customer %>%
  mutate(Years_Outliers=years_outliers)

# Group outliers by purchasing behavior
purchase_outliers <- customer %>%
  group_by(customer$PurchasedProduct)%>%
  summarise(Total_years_outliers=sum(Years_Outliers),.groups='drop')

# Print results
print(purchase_outliers)
## # A tibble: 2 × 2
##   `customer$PurchasedProduct` Total_years_outliers
##                         <int>                <int>
## 1                           0                    0
## 2                           1                    0
# Visualize output
ggplot(data=customer,mapping=aes(x=factor(PurchasedProduct),y=YearsWithCompany))+
  geom_boxplot(outlier.colour = 'red')+
  labs(title='Year Outliers that affect Purchasing Behavior')