# load dataset
customer <- read.csv("/cloud/project/retail/customer_data.csv")
head(customer)
## Age Gender Income SpendScore EducationLevel Region PurchasedProduct Rating
## 1 56 Male 41672 19 Bachelors South 0 1
## 2 46 Female 78217 37 PhD West 1 5
## 3 32 Female 28279 85 Bachelors West 1 2
## 4 60 Female 17017 8 High School East 1 2
## 5 25 Male 56600 28 Bachelors West 1 1
## 6 38 Male 42469 5 Bachelors North 1 5
## CustomerSatisfaction YearsWithCompany
## 1 Low 5
## 2 Medium 10
## 3 Low 28
## 4 Low 10
## 5 Low 6
## 6 Medium 1
# What are the average and median income levels of customers across different regions?
#Step1: load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# step1: group by region
mean_median <- customer %>%
group_by(Region)%>%
summarise(
avg_income=mean(Income,na.rm=TRUE),
med_income=median(Income,na.rm=TRUE)
,.groups='drop')
# step2: print results
print(mean_median)
## # A tibble: 4 × 3
## Region avg_income med_income
## <chr> <dbl> <dbl>
## 1 East 48973. 48031
## 2 North 51949. 51800
## 3 South 50658. 51371
## 4 West 51868. 51116.
# step3: visualize output
library(ggplot2)
ggplot(data=mean_median,mapping=aes(x=Region,y=avg_income,fill=med_income))+
geom_col()

# How does the spend score vary by age group or gender?
# Step1:categorise the age
age_category <- cut(customer$Age,
breaks=c(18,29,39,49,59,69),
labels=c("18-29","30-39","40-49","50-59","60-69"),
include.lowest='true')
# Step2:create new category column
customer <- customer %>%
mutate(Age_Category=age_category)
# Step3: calculate the spend score by gender and age category
spend_demographic <- customer%>%
group_by(Gender,Age_Category)%>%
summarise(Avg_score=mean(SpendScore,na.rm=TRUE),.groups='drop')
# Step4: print results
print(spend_demographic)
## # A tibble: 10 × 3
## Gender Age_Category Avg_score
## <chr> <fct> <dbl>
## 1 Female 18-29 55.6
## 2 Female 30-39 49.2
## 3 Female 40-49 46.8
## 4 Female 50-59 52.4
## 5 Female 60-69 57.1
## 6 Male 18-29 50.2
## 7 Male 30-39 50.9
## 8 Male 40-49 51.8
## 9 Male 50-59 49.4
## 10 Male 60-69 46.7
# Step5: visualize results
ggplot(data=spend_demographic,mapping=aes(x=Age_Category,y=Avg_score,fill=Gender))+
geom_col()+labs(title='How spend score varies by age group and gender')+
theme_minimal()

#What is the distribution of education levels across customers?
educ_distribution <- customer %>%
group_by(EducationLevel)%>%
summarise(Count=n(),.groups='drop')
print(educ_distribution)
## # A tibble: 4 × 2
## EducationLevel Count
## <chr> <int>
## 1 Bachelors 247
## 2 High School 241
## 3 Masters 262
## 4 PhD 250
#visualise output
ggplot(data=educ_distribution,mapping=aes(x=EducationLevel,y=Count, fill=EducationLevel))+
geom_col()+labs(title='Distribution of Education Levels')+
theme_minimal()

#Which regions have the highest number of purchases, and how does this relate to customer satisfaction?
top_regions <- customer%>%
filter(!is.na(PurchasedProduct),PurchasedProduct==1)%>%
group_by(Region,CustomerSatisfaction)%>%
summarise(purchase_count=n(),.groups='drop')
print(top_regions)
## # A tibble: 12 × 3
## Region CustomerSatisfaction purchase_count
## <chr> <chr> <int>
## 1 East High 63
## 2 East Low 77
## 3 East Medium 51
## 4 North High 52
## 5 North Low 45
## 6 North Medium 53
## 7 South High 64
## 8 South Low 50
## 9 South Medium 64
## 10 West High 63
## 11 West Low 58
## 12 West Medium 63
# print results
ggplot(data=top_regions,mapping=aes(x=Region,y=purchase_count,fill=CustomerSatisfaction))+
geom_col()+labs(title='Regions with highest purchases')+
theme_minimal()

#Is there a significant correlation between income and spend score?
correlation_result <- cor(customer$Income,customer$SpendScore,use='complete.obs')
print(correlation_result)
## [1] -0.04283549
#Does the age of customers significantly affect their purchase behavior
#Step1: use ANOVA for mean spend score across age groups
anova_result <- aov(SpendScore~Age_Category,data=customer)
print (anova_result)
## Call:
## aov(formula = SpendScore ~ Age_Category, data = customer)
##
## Terms:
## Age_Category Residuals
## Sum of Squares 1377.6 824400.2
## Deg. of Freedom 4 995
##
## Residual standard error: 28.78442
## Estimated effects may be unbalanced
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Age_Category 4 1378 344.4 0.416 0.797
## Residuals 995 824400 828.5
#Step2: visualize output
ggplot(data=anova_result,mapping=aes(x=Age_Category,y=SpendScore))+
geom_boxplot(outlier.colour = 'red')

#Are there significant differences in income between different education levels?
aov_result <- aov(Income~EducationLevel,data=customer)
print(aov_result)
## Call:
## aov(formula = Income ~ EducationLevel, data = customer)
##
## Terms:
## EducationLevel Residuals
## Sum of Squares 355277363 224695052611
## Deg. of Freedom 3 996
##
## Residual standard error: 15019.9
## Estimated effects may be unbalanced
summary(aov_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## EducationLevel 3 3.553e+08 118425788 0.525 0.665
## Residuals 996 2.247e+11 225597442
#Visualize results
ggplot(data=aov_result,mapping=aes(x=EducationLevel,y=Income))+
geom_boxplot(outlier.colour = 'red')

# Is there an association between gender and purchase behavior?
chisquare_result <- chisq.test(customer$Gender,customer$PurchasedProduct)
print(chisquare_result)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: customer$Gender and customer$PurchasedProduct
## X-squared = 0.42833, df = 1, p-value = 0.5128
# summarize result
gender_purchase_df <- customer%>%
count(Gender,PurchasedProduct)
# visualize output
ggplot(gender_purchase_df,mapping=aes(x=Gender,y=n,fill=factor(PurchasedProduct)))+
geom_bar(stat='identity',position='stack') +
labs(title='Association between Gender and Purchase behavior')

#Does region have a significant association with customer satisfaction levels?
chisq_df <- chisq.test(customer$Region,customer$CustomerSatisfaction)
print(chisq_df)
##
## Pearson's Chi-squared test
##
## data: customer$Region and customer$CustomerSatisfaction
## X-squared = 7.7256, df = 6, p-value = 0.2589
#Summarize the data
region_custsatisfaction_df <- customer %>%
count(Region, CustomerSatisfaction)
#Visualize output
ggplot(data=region_custsatisfaction_df,mapping=aes(x=Region,y=n,fill=factor(CustomerSatisfaction)))+
geom_col()+
labs(title='Association of Region and Customer Satisfaction')

#Is there a relationship between education level and purchased product status?
chisq.df <- chisq.test(customer$EducationLevel,customer$PurchasedProduct)
print(chisq.df)
##
## Pearson's Chi-squared test
##
## data: customer$EducationLevel and customer$PurchasedProduct
## X-squared = 1.5835, df = 3, p-value = 0.6631
#Summarize data
education_purchase_df <- customer %>%
count(EducationLevel,PurchasedProduct)
#Visualize result
ggplot(data=education_purchase_df,mapping=aes(x=EducationLevel,y=n,fill=factor(PurchasedProduct)))+
geom_bar(stat='identity',position='stack')+
theme(axis.text.x = element_text(angle=45,hjust=1))+
labs(title='Relationship-Education Level & Purchased Product')

#Can we predict spend score based on income, age, and years with the company?
model <- lm(SpendScore ~ Income + Age + YearsWithCompany, data = customer)
print(model)
##
## Call:
## lm(formula = SpendScore ~ Income + Age + YearsWithCompany, data = customer)
##
## Coefficients:
## (Intercept) Income Age YearsWithCompany
## 5.693e+01 -8.337e-05 -6.480e-02 5.579e-02
#How does income influence the likelihood of a purchase?
pp_model <- glm(PurchasedProduct~Income,data=customer,family=binomial)
print(pp_model)
##
## Call: glm(formula = PurchasedProduct ~ Income, family = binomial, data = customer)
##
## Coefficients:
## (Intercept) Income
## 7.476e-01 2.247e-06
##
## Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
## Null Deviance: 1217
## Residual Deviance: 1216 AIC: 1220
#Does customer satisfaction rating have a significant impact on years with the company?
model_2 <- lm(YearsWithCompany~CustomerSatisfaction,data=customer)
print(model_2)
##
## Call:
## lm(formula = YearsWithCompany ~ CustomerSatisfaction, data = customer)
##
## Coefficients:
## (Intercept) CustomerSatisfactionLow
## 15.0117 -0.5953
## CustomerSatisfactionMedium
## -1.3675
# What are the outliers in spend score and income, and how do they differ by region?
#Step1: calculate IQR for spendscore
iqr_spendscore <- IQR(customer$SpendScore,na.rm=TRUE)
q1_spendscore <- quantile(customer$SpendScore,0.25,na.rm=TRUE)
q3_spendscore <- quantile(customer$SpendScore,0.75,na.rm=TRUE)
#Step2: calculate upper and lower bounds
lower_bound_spendscore <- q1_spendscore - 1.5 * iqr_spendscore
upper_bound_spendscore <- q3_spendscore + 1.5 * iqr_spendscore
#Step3: identify outliers in spendscore
outliers_spendscore <- customer$SpendScore < lower_bound_spendscore | customer$SpendScore > upper_bound_spendscore
#Step4: calculate iqr for income
iqr_income <- IQR(customer$Income,na.rm=TRUE)
q1_income <- quantile(customer$Income,0.25,na.rm=TRUE)
q3_income <- quantile(customer$Income,0.75,na.rm=TRUE)
#Step5: calculate upper and lower bounds
lower_bound_income <- q1_income - 1.5 * iqr_income
upper_bound_income <- q3_income + 1.5 * iqr_income
#Step6: identify outliers in income
outliers_income <- customer$Income < lower_bound_income | customer$Income > upper_bound_income
#Step7: add outlier columns in customer dataset
customer <- customer %>%
mutate(
Outliers_SpendScore=outliers_spendscore,
Outliers_Income=outliers_income
)
#Step8:group outliers by region
regional_outliers <- customer %>%
group_by(Region)%>%
summarise(
total_outliers_spendscore=sum(Outliers_SpendScore),
total_outliers_income=sum(Outliers_Income)
,.groups='drop')
#Step9: print results
print(regional_outliers)
## # A tibble: 4 × 3
## Region total_outliers_spendscore total_outliers_income
## <chr> <int> <int>
## 1 East 0 3
## 2 North 0 1
## 3 South 0 1
## 4 West 0 1
#Step10: visualize spendscore outlier output
ggplot(data=customer,mapping=aes(x=Region,y=SpendScore))+
geom_boxplot(outlier.colour = "red") +
labs(title='SpendScore Outliers by Region')

#Step11: visualize income outlier output
ggplot(data=customer,mapping=aes(x=Region,y=Income))+
geom_boxplot(outlier.colour = 'red') + labs(title="Regional Outliiers by Income")

#Are there any extreme values in years with company that affect purchasing behavior?
# Calculate the years IQR
years_iqr <- IQR(customer$YearsWithCompany,na.rm=TRUE)
q1_years <- quantile(customer$YearsWithCompany,0.25,na.rm=TRUE)
q3_years <- quantile(customer$YearsWithCompany,0.75,na.rm=TRUE)
# Calculate lower and upper bound
years_lower_bound <- q1_years - 1.5 * years_iqr
years_upper_bound <- q3_years + 1.5 * years_iqr
# Establish the years outliers in customer dataset
years_outliers <- customer$YearsWithCompany < years_lower_bound | customer$YearsWithCompany > years_upper_bound
# Create a new years outliers columns
customer <- customer %>%
mutate(Years_Outliers=years_outliers)
# Group outliers by purchasing behavior
purchase_outliers <- customer %>%
group_by(customer$PurchasedProduct)%>%
summarise(Total_years_outliers=sum(Years_Outliers),.groups='drop')
# Print results
print(purchase_outliers)
## # A tibble: 2 × 2
## `customer$PurchasedProduct` Total_years_outliers
## <int> <int>
## 1 0 0
## 2 1 0
# Visualize output
ggplot(data=customer,mapping=aes(x=factor(PurchasedProduct),y=YearsWithCompany))+
geom_boxplot(outlier.colour = 'red')+
labs(title='Year Outliers that affect Purchasing Behavior')
