#step1: identify general directory
getwd()
## [1] "/cloud/project"
#step2: identify specific directory
list.files("/cloud/project/data")
## [1] "marketing_analysis_dataset_1000_rows.csv"
#step3: load data
marketing <- read.csv("/cloud/project/data/marketing_analysis_dataset_1000_rows.csv")
head(marketing)
## Date Customer_ID Age Gender Region Marketing_Channel Product_Category
## 1 2023-01-01 CUST00001 56 Male Central TV Product D
## 2 2023-01-02 CUST00002 69 Male Eastern Search Ads Product A
## 3 2023-01-03 CUST00003 46 Male Western Referral Product C
## 4 2023-01-04 CUST00004 32 Female Central TV Product C
## 5 2023-01-05 CUST00005 60 Male Western TV Product B
## 6 2023-01-06 CUST00006 25 Male Eastern Social Media Product D
## Customer_Segment Ad_Spend Website_Visits Email_Clicks Discount_Percent
## 1 New 114.72 15 1 24.4
## 2 Returning 243.43 11 6 22.6
## 3 Returning 520.47 11 2 27.4
## 4 New 81.60 6 1 25.0
## 5 Returning 605.42 12 4 7.6
## 6 Returning 299.20 8 4 9.9
## Conversion_Rate Sales_Amount Profit Customer_Satisfaction
## 1 0.070 543.63 160.16 3.8
## 2 0.084 755.88 193.27 3.6
## 3 0.057 1615.68 480.31 3.2
## 4 0.044 318.86 66.04 5.0
## 5 0.077 1765.08 449.26 5.0
## 6 0.071 991.89 362.42 4.2
#1. What is the distribution of Customer Age?
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggplot2)
ggplot(data=marketing,mapping=aes(x=Age))+geom_histogram(color='black',fill='lightblue') +
labs(title='Age Distribution')
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#2. percentage of customers by region
#install dplyr package
install.packages('dplyr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#aggregate and group
customer_distribution <- marketing %>%
group_by(Region)%>%
summarise(customer_count=n()) %>%
mutate(
percentage = (customer_count/sum(customer_count))*100
)
print(customer_distribution)
## # A tibble: 4 × 3
## Region customer_count percentage
## <chr> <int> <dbl>
## 1 Central 251 25.1
## 2 Eastern 259 25.9
## 3 Northern 236 23.6
## 4 Western 254 25.4
#visualize output
ggplot(customer_distribution,aes(x=Region, y=percentage,fill=Region))+geom_col(color='black')+
labs(title='Customer Percentage by Region')

#3. Which marketing channel generate the highest total sales?
# Sum the sales and group by marketing channel
top_sales <- marketing %>%
group_by(Marketing_Channel)%>%
summarise(total_sales = sum(Sales_Amount,na.rm=TRUE),groups='drop')%>%
arrange(desc(total_sales))
print(top_sales)
## # A tibble: 5 × 3
## Marketing_Channel total_sales groups
## <chr> <dbl> <chr>
## 1 TV 390828. drop
## 2 Social Media 366227. drop
## 3 Referral 359246. drop
## 4 Email 336144. drop
## 5 Search Ads 328025. drop
# visualise output
ggplot(top_sales,aes(x=Marketing_Channel,y=total_sales))+geom_col(color='red')+
labs(title='Total sales by Channel')

#4. Is there a relationship between website visits and sales?
correlation_result <- cor(marketing$Website_Visits,marketing$Sales_Amount, use = 'complete.obs')
print(correlation_result)
## [1] 0.0268328
#5. Is there a relationship between Email clicks and conversion rate?
correlation2 <- cor(marketing$Email_Clicks,marketing$Conversion_Rate,use='complete.obs')
print(correlation2)
## [1] 0.5159913
#6. Do male and female customers have significantly different average Sales_Amount?
t_result <- t.test(Sales_Amount ~ Gender,data=marketing)
print(t_result)
##
## Welch Two Sample t-test
##
## data: Sales_Amount by Gender
## t = 0.093509, df = 885.55, p-value = 0.9255
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -152.5824 167.8492
## sample estimates:
## mean in group Female mean in group Male
## 1784.470 1776.837
#7. Do returning customers spend more than new customers?
anova_result <- aov(Sales_Amount~Customer_Segment,data=marketing)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Customer_Segment 2 2.590e+06 1295166 0.8 0.45
## Residuals 997 1.615e+09 1619573
#8. Is there an association between Gender and Customer_Segment?
# create contingency table
tbl <- table(marketing$Gender,marketing$Customer_Segment)
# establish degree of association
chi_result <- chisq.test(tbl)
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 0.1019, df = 2, p-value = 0.9503
#9. How does Ad spend influence Sales_Amount?
linear_model <- lm(Sales_Amount ~ Ad_Spend,data=marketing)
summary(linear_model)
##
## Call:
## lm(formula = Sales_Amount ~ Ad_Spend, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1051.2 -193.7 -4.6 154.9 4786.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -243.58319 27.86512 -8.742 <2e-16 ***
## Ad_Spend 3.31934 0.03939 84.276 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 446.8 on 998 degrees of freedom
## Multiple R-squared: 0.8768, Adjusted R-squared: 0.8767
## F-statistic: 7103 on 1 and 998 DF, p-value: < 2.2e-16
#10. How much does every extra website visit increase sales?
linear_model2 <- lm(Sales_Amount ~ Website_Visits,data=marketing)
summary(linear_model2)
##
## Call:
## lm(formula = Sales_Amount ~ Website_Visits, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1575.0 -622.4 -235.4 312.6 15097.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1659.950 147.713 11.238 <2e-16 ***
## Website_Visits 9.987 11.777 0.848 0.397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1273 on 998 degrees of freedom
## Multiple R-squared: 0.00072, Adjusted R-squared: -0.0002813
## F-statistic: 0.7191 on 1 and 998 DF, p-value: 0.3967
#11. Build a model predicting Sales_Amount using Email_Clicks, Discount_Percent and Age
multiple_model <- lm(Sales_Amount~Email_Clicks+Discount_Percent+Age,data=marketing)
summary(multiple_model)
##
## Call:
## lm(formula = Sales_Amount ~ Email_Clicks + Discount_Percent +
## Age, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1619.0 -628.7 -215.4 324.5 14917.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1567.001 161.984 9.674 <2e-16 ***
## Email_Clicks 43.744 20.058 2.181 0.0294 *
## Discount_Percent 5.787 4.696 1.232 0.2182
## Age -1.135 2.690 -0.422 0.6732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1270 on 996 degrees of freedom
## Multiple R-squared: 0.00649, Adjusted R-squared: 0.003498
## F-statistic: 2.169 on 3 and 996 DF, p-value: 0.09011
#12. Build a model predicting Profit.
#a. Establish numeric variables have the strongest linear relationship with Profit
numeric_vars <- marketing[, c("Profit", "Age", "Ad_Spend", "Website_Visits",
"Email_Clicks", "Discount_Percent", "Conversion_Rate",
"Sales_Amount", "Customer_Satisfaction")]
# Correlation matrix
cor_matrix <- cor(numeric_vars, use = "complete.obs")
# View correlations with Profit
cor_with_profit <- cor_matrix["Profit", ]
print(sort(abs(cor_with_profit), decreasing = TRUE))
## Profit Ad_Spend Sales_Amount
## 1.000000000 0.655432882 0.465973611
## Conversion_Rate Email_Clicks Customer_Satisfaction
## 0.051530826 0.049640608 0.030730752
## Website_Visits Discount_Percent Age
## 0.025967045 0.016266962 0.007861774
#b. Build linear model
linear_model3 <- lm(Profit ~ Sales_Amount + Ad_Spend,data=marketing)
summary(linear_model3)
##
## Call:
## lm(formula = Profit ~ Sales_Amount + Ad_Spend, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -562.32 -120.80 -6.23 110.82 702.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 130.34090 11.22358 11.61 <2e-16 ***
## Sales_Amount -0.26047 0.01229 -21.20 <2e-16 ***
## Ad_Spend 1.36922 0.04356 31.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 173.5 on 997 degrees of freedom
## Multiple R-squared: 0.6068, Adjusted R-squared: 0.606
## F-statistic: 769.3 on 2 and 997 DF, p-value: < 2.2e-16
#c. check for multicollinearity
install.packages("car")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(linear_model3)
## Sales_Amount Ad_Spend
## 8.116736 8.116736
# the result above indicates that sales and ad_spend have a strong correlation meaning
# they carry similar information and the model will struggle to separate their individual
# effects-so since ad_spend has a stronger corr i will drop sales
#13. Segment customers into groups based on:Age,Ad_Spend,Website_Visits, Sales_Amount, Profit
# a: Select the variables for clustering
cluster_data <- marketing[, c("Age", "Ad_Spend", "Website_Visits", "Sales_Amount", "Profit")]
# b: Scale the data (important for K-Means)
cluster_scaled <- scale(cluster_data)
# c: Run K-Means with 3 clusters (you can change k)
set.seed(123) # for reproducibility
kmeans_result <- kmeans(cluster_scaled, centers = 3)
# d: Add cluster assignment back to original data
marketing$Cluster <- kmeans_result$cluster
# e: View cluster centers
print(kmeans_result$centers)
## Age Ad_Spend Website_Visits Sales_Amount Profit
## 1 0.07463289 1.4376381 0.03012444 1.1976769 1.4061964
## 2 0.83478811 -0.3975651 -0.07652981 -0.3236251 -0.4018994
## 3 -0.88971854 -0.2888661 0.06368457 -0.2484025 -0.2692254
# f: Check cluster sizes
table(marketing$Cluster)
##
## 1 2 3
## 193 408 399
#14.Forecast the next 3 months sales.
# Load libraries
library(dplyr)
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
install.packages("forecast")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(forecast)
# Step 1: Aggregate sales by month
sales_by_month <- marketing %>%
mutate(Date = as.Date("2024-01-01") + 0:(nrow(marketing)-1)) %>%
group_by(Month = floor_date(Date, "month")) %>%
summarise(Sales = sum(Sales_Amount, na.rm = TRUE))
# Step 2: Create time series
sales_ts <- ts(sales_by_month$Sales,
start = c(2024, 1),
frequency = 12)
# Step 3: Forecast with ETS
fit_ets <- ets(sales_ts)
forecast_ets <- forecast(fit_ets, h = 3)
print(forecast_ets)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Oct 2026 53696.79 44531.57 62862.01 39679.79 67713.78
## Nov 2026 53696.79 44531.57 62862.01 39679.79 67713.78
## Dec 2026 53696.79 44531.57 62862.01 39679.79 67713.78
plot(forecast_ets)

# Step 4: Forecast with ARIMA
fit_arima <- auto.arima(sales_ts)
forecast_arima <- forecast(fit_arima, h = 3)
print(forecast_arima)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Oct 2026 53953.65 44939.44 62967.86 40167.61 67739.69
## Nov 2026 53953.65 44939.44 62967.86 40167.61 67739.69
## Dec 2026 53953.65 44939.44 62967.86 40167.61 67739.69
plot(forecast_arima)

# Step 5: Compare models
accuracy(fit_ets)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 257.9151 6931.555 5620.942 -1.154199 10.55798 0.7421495 -0.1636785
accuracy(fit_arima)
## ME RMSE MAE MPE MAPE MASE
## Training set 4.630236e-12 6926.432 5612.749 -1.639993 10.59343 0.7410677
## ACF1
## Training set -0.1636618
#The forecasting model is predicting that monthly sales will remain relatively
#stable over the next three months.
#Based on historical sales patterns, the ETS (error, trend, seasonality) model predicts monthly sales of
#approximately 53,697 for October, November, and December 2026.
#confidence levels-there is an 80% chance that actual October sales will fall
#between 44,532 and 62,862.
#confidence levels-There is a 95% chance that actual October sales will fall
#between 39,680 and 67,714.
#The same ranges appear for all three months because the model detected no strong
#trend or seasonality in the data.
#he ARIMA model predicts monthly sales of approximately 53,954 over the next three months.
#which is very close to the ETS forecast of 53,697