library(tidyverse)
library(plotly)
library(lubridate)
library(corrplot)

# Import dataset

# Load the campaign CSV into `dat` with readr (column types are guessed).
# NOTE(review): the absolute path below is specific to one machine — confirm
# it exists before running this script elsewhere.
dat <- read_csv(
  file = "C:/Users/LENOVO/Downloads/nykaa_campaign_data.csv"
)
## Rows: 55555 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Campaign_ID, Campaign_Type, Target_Audience, Channel_Used, Language...
## dbl (9): Duration, Impressions, Clicks, Leads, Conversions, Revenue, Acquisi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the imported data

dat %>% head()
# Show each column's type together with a sample of its values

dat %>% str()
## spc_tbl_ [55,555 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Campaign_ID     : chr [1:55555] "NY-CMP-1000" "NY-CMP-1001" "NY-CMP-1002" "NY-CMP-1003" ...
##  $ Campaign_Type   : chr [1:55555] "Social Media" "Paid Ads" "Influencer" "Email" ...
##  $ Target_Audience : chr [1:55555] "College Students" "Tier 2 City Customers" "Youth" "Working Women" ...
##  $ Duration        : num [1:55555] 21 18 23 18 10 26 21 6 27 11 ...
##  $ Channel_Used    : chr [1:55555] "WhatsApp, YouTube" "YouTube" "WhatsApp, Google, YouTube" "YouTube, Facebook, Instagram" ...
##  $ Impressions     : num [1:55555] 57804 91801 15536 88114 96871 ...
##  $ Clicks          : num [1:55555] 6156 3321 2182 8413 3743 ...
##  $ Leads           : num [1:55555] 3616 1971 952 2231 2060 ...
##  $ Conversions     : num [1:55555] 2355 1357 755 947 1258 ...
##  $ Revenue         : num [1:55555] 1867515 1046247 197055 376906 518296 ...
##  $ Acquisition_Cost: num [1:55555] 111 180.8 90.6 249.1 228.6 ...
##  $ ROI             : num [1:55555] 6.14 3.26 1.88 0.6 0.8 3.09 1.17 1.4 3.73 1.61 ...
##  $ Language        : chr [1:55555] "Hindi" "Hindi" "English" "Hindi" ...
##  $ Engagement_Score: num [1:55555] 20.98 7.24 25.03 13.15 7.29 ...
##  $ Customer_Segment: chr [1:55555] "College Students" "College Students" "College Students" "College Students" ...
##  $ Date            : chr [1:55555] "29-04-2025" "06-04-2025" "14-01-2025" "04-06-2025" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Campaign_ID = col_character(),
##   ..   Campaign_Type = col_character(),
##   ..   Target_Audience = col_character(),
##   ..   Duration = col_double(),
##   ..   Channel_Used = col_character(),
##   ..   Impressions = col_double(),
##   ..   Clicks = col_double(),
##   ..   Leads = col_double(),
##   ..   Conversions = col_double(),
##   ..   Revenue = col_double(),
##   ..   Acquisition_Cost = col_double(),
##   ..   ROI = col_double(),
##   ..   Language = col_character(),
##   ..   Engagement_Score = col_double(),
##   ..   Customer_Segment = col_character(),
##   ..   Date = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Data cleaning

# Convert date column
#
# The raw strings are day-month-year (e.g. "29-04-2025"). Without an explicit
# format, as.Date() matches them against "%Y-%m-%d", reading "29-04-20" as
# year 29, month 4, day 20 and silently ignoring the trailing characters, so
# every row ended up in years 0001-0031. Spell the format out instead.

dat$Date <- as.Date(dat$Date, format = "%d-%m-%Y")

# Count the NA entries in every column

dat %>%
  is.na() %>%
  colSums()
##      Campaign_ID    Campaign_Type  Target_Audience         Duration 
##                0                0                0                0 
##     Channel_Used      Impressions           Clicks            Leads 
##                0                0                0                0 
##      Conversions          Revenue Acquisition_Cost              ROI 
##                0                0                0                0 
##         Language Engagement_Score Customer_Segment             Date 
##                0                0                0                0
# Per-column summary statistics

dat %>% summary()
##  Campaign_ID        Campaign_Type      Target_Audience       Duration    
##  Length:55555       Length:55555       Length:55555       Min.   : 5.00  
##  Class :character   Class :character   Class :character   1st Qu.:11.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :17.00  
##                                                           Mean   :17.45  
##                                                           3rd Qu.:24.00  
##                                                           Max.   :30.00  
##  Channel_Used        Impressions         Clicks          Leads     
##  Length:55555       Min.   : 10001   Min.   :  202   Min.   :  56  
##  Class :character   1st Qu.: 32680   1st Qu.: 2110   1st Qu.: 779  
##  Mode  :character   Median : 55182   Median : 3907   Median :1481  
##                     Mean   : 55088   Mean   : 4688   Mean   :1877  
##                     3rd Qu.: 77515   3rd Qu.: 6688   3rd Qu.:2605  
##                     Max.   :100000   Max.   :14868   Max.   :8876  
##   Conversions      Revenue        Acquisition_Cost        ROI        
##  Min.   :  19   Min.   :   6183   Min.   :    9.08   Min.   :-0.970  
##  1st Qu.: 400   1st Qu.: 177706   1st Qu.:  105.44   1st Qu.: 0.040  
##  Median : 779   Median : 360436   Median :  207.51   Median : 1.240  
##  Mean   :1033   Mean   : 515820   Mean   :  377.35   Mean   : 2.714  
##  3rd Qu.:1414   3rd Qu.: 687423   3rd Qu.:  428.58   3rd Qu.: 3.630  
##  Max.   :6686   Max.   :4579910   Max.   :15473.16   Max.   :74.420  
##    Language         Engagement_Score Customer_Segment        Date           
##  Length:55555       Min.   : 2.60    Length:55555       Min.   :0001-01-20  
##  Class :character   1st Qu.: 8.36    Class :character   1st Qu.:0008-05-20  
##  Mode  :character   Median :13.60    Mode  :character   Median :0015-11-20  
##                     Mean   :13.78                       Mean   :0016-01-04  
##                     3rd Qu.:18.86                       3rd Qu.:0023-07-20  
##                     Max.   :30.91                       Max.   :0031-12-20

# Visualization 1 - Average ROI by campaign type

# Bar chart of the mean ROI per campaign type.
# The data is collapsed to one row per Campaign_Type first, so geom_col()
# draws each bar at the height given by avg_roi.
roi_plot <- dat %>%
  group_by(Campaign_Type) %>%
  summarise(avg_roi = mean(ROI, na.rm = TRUE)) %>%
  ggplot(mapping = aes(Campaign_Type, avg_roi, fill = Campaign_Type)) +
  geom_col() +
  ggtitle("Average ROI by Campaign Type") +
  xlab("Campaign Type") +
  ylab("Average ROI") +
  theme_minimal()

roi_plot

# Visualization 2 - Revenue trend over time

# Line chart of total revenue per date.
# Revenue is summed within each distinct Date value before plotting.
revenue_plot <- dat %>%
  group_by(Date) %>%
  summarise(total_revenue = sum(Revenue, na.rm = TRUE)) %>%
  ggplot(aes(x = Date,
             y = total_revenue)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raises a lifecycle warning.
  geom_line(linewidth = 1) +
  labs(
    title = "Revenue Trend Over Time",
    x = "Date",
    y = "Revenue"
  ) +
  theme_minimal()

revenue_plot

# Visualization 3 - Impressions vs conversions

# Scatter of Conversions against Impressions, one point per row of `dat`,
# coloured by campaign type. Points are semi-transparent (alpha = 0.7) so
# overlapping observations remain visible.
scatter_plot <- dat %>%
  ggplot(mapping = aes(Impressions, Conversions, colour = Campaign_Type)) +
  geom_point(alpha = 0.7, size = 3) +
  ggtitle("Relationship Between Impressions and Conversions") +
  xlab("Impressions") +
  ylab("Conversions") +
  theme_minimal()

scatter_plot

# Visualization 4 - Engagement score by language

# Box plot of Engagement_Score for each Language, one filled box per language.
box_plot <- dat %>%
  ggplot(mapping = aes(Language, Engagement_Score, fill = Language)) +
  geom_boxplot() +
  ggtitle("Distribution of Engagement Scores Across Languages") +
  xlab("Language") +
  ylab("Engagement Score") +
  theme_minimal()

box_plot

# Visualization 5 - Correlation heatmap

# Keep only the numeric performance metrics that go into the correlation

numeric_data <- select(
  dat,
  Impressions, Clicks, Leads, Conversions,
  Revenue, ROI, Engagement_Score
)

# Pairwise correlations, computed on rows with no missing values

cor_matrix <- numeric_data %>%
  cor(use = "complete.obs")
# Draw the upper triangle of the matrix as coloured tiles

corrplot(
  corr = cor_matrix,
  type = "upper",
  method = "color",
  tl.cex = 0.8
)

# Visualization 6 - Customer segment by campaign type

# Stacked bar chart: number of rows per campaign type, with each bar split
# by customer segment. geom_bar() does the counting.
stacked_plot <- dat %>%
  ggplot(mapping = aes(Campaign_Type, fill = Customer_Segment)) +
  geom_bar() +
  ggtitle("Customer Segment Distribution by Campaign Type") +
  xlab("Campaign Type") +
  ylab("Count") +
  theme_minimal()

stacked_plot

# Visualization 7 - Interactive plotly visualization

# Interactive scatter of Revenue against Impressions, coloured by campaign
# type. The `text` formula builds a per-point label with the campaign type,
# ROI and engagement score.
interactive_plot <- dat %>%
  plot_ly(
    type = "scatter",
    mode = "markers",
    x = ~Impressions,
    y = ~Revenue,
    color = ~Campaign_Type,
    text = ~paste(
      "Campaign:", Campaign_Type,
      "<br>ROI:", ROI,
      "<br>Engagement:", Engagement_Score
    )
  )

interactive_plot

# Visualization 8 - Revenue distribution by channel

# Histograms of Revenue (20 bins), one panel per campaign type, with the bars
# filled by the Channel_Used value of the rows they contain.
facet_plot <- dat %>%
  ggplot(mapping = aes(Revenue, fill = Channel_Used)) +
  geom_histogram(bins = 20) +
  facet_wrap(vars(Campaign_Type)) +
  ggtitle("Revenue Distribution Across Different Channels") +
  xlab("Revenue") +
  ylab("Count") +
  theme_minimal()

facet_plot

# Conclusion

## 
## This project analyzed Nykaa marketing campaign performance using multiple visualization techniques.
## The analysis identified relationships between impressions, conversions, engagement, ROI, and revenue trends.
## Interactive and comparative visualizations provided insights into campaign effectiveness across customer segments and channels.