#step1: identify general directory
getwd()
## [1] "/cloud/project"
#step2: identify specific directory
list.files("/cloud/project/data")
## [1] "marketing_analysis_dataset_1000_rows.csv"
#step3: load data
marketing <- read.csv("/cloud/project/data/marketing_analysis_dataset_1000_rows.csv")
head(marketing)
##         Date Customer_ID Age Gender  Region Marketing_Channel Product_Category
## 1 2023-01-01   CUST00001  56   Male Central                TV        Product D
## 2 2023-01-02   CUST00002  69   Male Eastern        Search Ads        Product A
## 3 2023-01-03   CUST00003  46   Male Western          Referral        Product C
## 4 2023-01-04   CUST00004  32 Female Central                TV        Product C
## 5 2023-01-05   CUST00005  60   Male Western                TV        Product B
## 6 2023-01-06   CUST00006  25   Male Eastern      Social Media        Product D
##   Customer_Segment Ad_Spend Website_Visits Email_Clicks Discount_Percent
## 1              New   114.72             15            1             24.4
## 2        Returning   243.43             11            6             22.6
## 3        Returning   520.47             11            2             27.4
## 4              New    81.60              6            1             25.0
## 5        Returning   605.42             12            4              7.6
## 6        Returning   299.20              8            4              9.9
##   Conversion_Rate Sales_Amount Profit Customer_Satisfaction
## 1           0.070       543.63 160.16                   3.8
## 2           0.084       755.88 193.27                   3.6
## 3           0.057      1615.68 480.31                   3.2
## 4           0.044       318.86  66.04                   5.0
## 5           0.077      1765.08 449.26                   5.0
## 6           0.071       991.89 362.42                   4.2
#1. What is the distribution of Customer Age?
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(ggplot2)
ggplot(data=marketing,mapping=aes(x=Age))+geom_histogram(color='black',fill='lightblue') +
  labs(title='Age Distribution')
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

#2. percentage of customers by region
#install dplyr package
install.packages('dplyr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#aggregate and group
customer_distribution <- marketing %>%
  group_by(Region)%>%
  summarise(customer_count=n()) %>%
  mutate(
    percentage = (customer_count/sum(customer_count))*100
  )
print(customer_distribution)
## # A tibble: 4 × 3
##   Region   customer_count percentage
##   <chr>             <int>      <dbl>
## 1 Central             251       25.1
## 2 Eastern             259       25.9
## 3 Northern            236       23.6
## 4 Western             254       25.4
#visualize output
ggplot(customer_distribution,aes(x=Region, y=percentage,fill=Region))+geom_col(color='black')+
  labs(title='Customer Percentage by Region')

#3. Which marketing channel generate the highest total sales?
# Sum the sales and group by marketing channel
top_sales <- marketing %>%
  group_by(Marketing_Channel)%>%
  summarise(total_sales = sum(Sales_Amount,na.rm=TRUE),groups='drop')%>%
  arrange(desc(total_sales))
print(top_sales)
## # A tibble: 5 × 3
##   Marketing_Channel total_sales groups
##   <chr>                   <dbl> <chr> 
## 1 TV                    390828. drop  
## 2 Social Media          366227. drop  
## 3 Referral              359246. drop  
## 4 Email                 336144. drop  
## 5 Search Ads            328025. drop
# visualise output
ggplot(top_sales,aes(x=Marketing_Channel,y=total_sales))+geom_col(color='red')+
  labs(title='Total sales by Channel')

#4. Is there a relationship between website visits and sales?
correlation_result <- cor(marketing$Website_Visits,marketing$Sales_Amount, use = 'complete.obs')
print(correlation_result)
## [1] 0.0268328
#5. Is there a relationship between Email clicks and conversion rate?
correlation2 <- cor(marketing$Email_Clicks,marketing$Conversion_Rate,use='complete.obs')
print(correlation2)
## [1] 0.5159913
#6. Do male and female customers have significantly different average Sales_Amount?
t_result <- t.test(Sales_Amount ~ Gender,data=marketing)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  Sales_Amount by Gender
## t = 0.093509, df = 885.55, p-value = 0.9255
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -152.5824  167.8492
## sample estimates:
## mean in group Female   mean in group Male 
##             1784.470             1776.837
#7. Do returning customers spend more than new customers?
anova_result <- aov(Sales_Amount~Customer_Segment,data=marketing)
summary(anova_result)
##                   Df    Sum Sq Mean Sq F value Pr(>F)
## Customer_Segment   2 2.590e+06 1295166     0.8   0.45
## Residuals        997 1.615e+09 1619573
#8. Is there an association between Gender and Customer_Segment?
# create contingency table
tbl <- table(marketing$Gender,marketing$Customer_Segment)
# establish degree of association
chi_result <- chisq.test(tbl)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 0.1019, df = 2, p-value = 0.9503
#9. How does Ad spend influence Sales_Amount?
linear_model <- lm(Sales_Amount ~ Ad_Spend,data=marketing)
summary(linear_model)
## 
## Call:
## lm(formula = Sales_Amount ~ Ad_Spend, data = marketing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1051.2  -193.7    -4.6   154.9  4786.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -243.58319   27.86512  -8.742   <2e-16 ***
## Ad_Spend       3.31934    0.03939  84.276   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 446.8 on 998 degrees of freedom
## Multiple R-squared:  0.8768, Adjusted R-squared:  0.8767 
## F-statistic:  7103 on 1 and 998 DF,  p-value: < 2.2e-16
#10. How much does every extra website visit increase sales?
linear_model2 <- lm(Sales_Amount ~ Website_Visits,data=marketing)
summary(linear_model2)
## 
## Call:
## lm(formula = Sales_Amount ~ Website_Visits, data = marketing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1575.0  -622.4  -235.4   312.6 15097.8 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1659.950    147.713  11.238   <2e-16 ***
## Website_Visits    9.987     11.777   0.848    0.397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1273 on 998 degrees of freedom
## Multiple R-squared:  0.00072,    Adjusted R-squared:  -0.0002813 
## F-statistic: 0.7191 on 1 and 998 DF,  p-value: 0.3967
#11. Build a model predicting Sales_Amount using Email_Clicks, Discount_Percent and Age
multiple_model <- lm(Sales_Amount~Email_Clicks+Discount_Percent+Age,data=marketing)
summary(multiple_model)
## 
## Call:
## lm(formula = Sales_Amount ~ Email_Clicks + Discount_Percent + 
##     Age, data = marketing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1619.0  -628.7  -215.4   324.5 14917.8 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1567.001    161.984   9.674   <2e-16 ***
## Email_Clicks       43.744     20.058   2.181   0.0294 *  
## Discount_Percent    5.787      4.696   1.232   0.2182    
## Age                -1.135      2.690  -0.422   0.6732    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1270 on 996 degrees of freedom
## Multiple R-squared:  0.00649,    Adjusted R-squared:  0.003498 
## F-statistic: 2.169 on 3 and 996 DF,  p-value: 0.09011
#12. Build a model predicting Profit.

#a. Establish numeric variables have the strongest linear relationship with Profit
numeric_vars <- marketing[, c("Profit", "Age", "Ad_Spend", "Website_Visits", 
                              "Email_Clicks", "Discount_Percent", "Conversion_Rate", 
                              "Sales_Amount", "Customer_Satisfaction")]

# Correlation matrix
cor_matrix <- cor(numeric_vars, use = "complete.obs")

# View correlations with Profit
cor_with_profit <- cor_matrix["Profit", ]
print(sort(abs(cor_with_profit), decreasing = TRUE))
##                Profit              Ad_Spend          Sales_Amount 
##           1.000000000           0.655432882           0.465973611 
##       Conversion_Rate          Email_Clicks Customer_Satisfaction 
##           0.051530826           0.049640608           0.030730752 
##        Website_Visits      Discount_Percent                   Age 
##           0.025967045           0.016266962           0.007861774
#b. Build linear model
linear_model3 <- lm(Profit ~ Sales_Amount + Ad_Spend,data=marketing)
summary(linear_model3)
## 
## Call:
## lm(formula = Profit ~ Sales_Amount + Ad_Spend, data = marketing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -562.32 -120.80   -6.23  110.82  702.63 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  130.34090   11.22358   11.61   <2e-16 ***
## Sales_Amount  -0.26047    0.01229  -21.20   <2e-16 ***
## Ad_Spend       1.36922    0.04356   31.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.5 on 997 degrees of freedom
## Multiple R-squared:  0.6068, Adjusted R-squared:  0.606 
## F-statistic: 769.3 on 2 and 997 DF,  p-value: < 2.2e-16
#c. check for multicollinearity
install.packages("car")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
vif(linear_model3)
## Sales_Amount     Ad_Spend 
##     8.116736     8.116736
# the result above indicates that sales and ad_spend have a strong correlation meaning
# they carry similar information and the model will struggle to separate their individual
# effects-so since ad_spend has a stronger corr i will drop sales
#13. Segment customers into groups based on:Age,Ad_Spend,Website_Visits, Sales_Amount, Profit
# a: Select the variables for clustering
cluster_data <- marketing[, c("Age", "Ad_Spend", "Website_Visits", "Sales_Amount", "Profit")]

# b: Scale the data (important for K-Means)
cluster_scaled <- scale(cluster_data)

# c: Run K-Means with 3 clusters (you can change k)
set.seed(123)  # for reproducibility
kmeans_result <- kmeans(cluster_scaled, centers = 3)

# d: Add cluster assignment back to original data
marketing$Cluster <- kmeans_result$cluster

# e: View cluster centers
print(kmeans_result$centers)
##           Age   Ad_Spend Website_Visits Sales_Amount     Profit
## 1  0.07463289  1.4376381     0.03012444    1.1976769  1.4061964
## 2  0.83478811 -0.3975651    -0.07652981   -0.3236251 -0.4018994
## 3 -0.88971854 -0.2888661     0.06368457   -0.2484025 -0.2692254
# f: Check cluster sizes
table(marketing$Cluster)
## 
##   1   2   3 
## 193 408 399
#14.Forecast the next 3 months sales.
# Load libraries
library(dplyr)
install.packages("lubridate")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
install.packages("forecast")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.6'
## (as 'lib' is unspecified)
library(forecast)

# Step 1: Aggregate sales by month
sales_by_month <- marketing %>%
  mutate(Date = as.Date("2024-01-01") + 0:(nrow(marketing)-1)) %>%
  group_by(Month = floor_date(Date, "month")) %>%
  summarise(Sales = sum(Sales_Amount, na.rm = TRUE))

# Step 2: Create time series
sales_ts <- ts(sales_by_month$Sales, 
               start = c(2024, 1), 
               frequency = 12)

# Step 3: Forecast with ETS
fit_ets <- ets(sales_ts)
forecast_ets <- forecast(fit_ets, h = 3)
print(forecast_ets)
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Oct 2026       53696.79 44531.57 62862.01 39679.79 67713.78
## Nov 2026       53696.79 44531.57 62862.01 39679.79 67713.78
## Dec 2026       53696.79 44531.57 62862.01 39679.79 67713.78
plot(forecast_ets)

# Step 4: Forecast with ARIMA
fit_arima <- auto.arima(sales_ts)
forecast_arima <- forecast(fit_arima, h = 3)
print(forecast_arima)
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Oct 2026       53953.65 44939.44 62967.86 40167.61 67739.69
## Nov 2026       53953.65 44939.44 62967.86 40167.61 67739.69
## Dec 2026       53953.65 44939.44 62967.86 40167.61 67739.69
plot(forecast_arima)

# Step 5: Compare models
accuracy(fit_ets)
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 257.9151 6931.555 5620.942 -1.154199 10.55798 0.7421495 -0.1636785
accuracy(fit_arima)
##                        ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 4.630236e-12 6926.432 5612.749 -1.639993 10.59343 0.7410677
##                    ACF1
## Training set -0.1636618
#The forecasting model is predicting that monthly sales will remain relatively 
#stable over the next three months.
#Based on historical sales patterns, the ETS (error, trend, seasonality) model predicts monthly sales of 
#approximately 53,697 for October, November, and December 2026.
#confidence levels-there is an 80% chance that actual October sales will fall 
#between 44,532 and 62,862.
#confidence levels-There is a 95% chance that actual October sales will fall 
#between 39,680 and 67,714.
#The same ranges appear for all three months because the model detected no strong 
#trend or seasonality in the data.
#he ARIMA model predicts monthly sales of approximately 53,954 over the next three months.
#which is very close to the ETS forecast of 53,697