Data Cleaning and EDA

library(dplyr)  # Load the dplyr package

data<-read.csv("skincare_products.csv")

DATA CLEANING

# Check the structure and summary of the dataset
str(data)

## 'data.frame':    1138 obs. of  5 variables:
##  $ product_name : chr  "The Ordinary Natural Moisturising Factors + HA 30ml" "CeraVe Facial Moisturising Lotion SPF 25 52ml" "The Ordinary Hyaluronic Acid 2% + B5 Hydration Support Formula 30ml" "AMELIORATE Transforming Body Lotion 200ml" ...
##  $ product_url  : chr  "https://www.lookfantastic.com/the-ordinary-natural-moisturising-factors-ha-30ml/11396687.html" "https://www.lookfantastic.com/cerave-facial-moisturising-lotion-spf-25-52ml/11798689.html" "https://www.lookfantastic.com/the-ordinary-hyaluronic-acid-2-b5-hydration-support-formula-30ml/11363395.html" "https://www.lookfantastic.com/ameliorate-transforming-body-lotion-200ml/11865352.html" ...
##  $ product_type : chr  "Moisturiser" "Moisturiser" "Moisturiser" "Moisturiser" ...
##  $ clean_ingreds: chr  "['capric triglyceride', 'cetyl alcohol', 'propanediol', 'stearyl alcohol', 'glycerin', 'sodium hyaluronate', 'a"| __truncated__ "['homosalate', 'glycerin', 'octocrylene', 'ethylhexyl', 'salicylate', 'niacinamide', 'silica', 'butyl methoxydi"| __truncated__ "['sodium hyaluronate', 'sodium hyaluronate', 'panthenol', 'ahnfeltia concinna extract', 'glycerin', 'pentylene "| __truncated__ "['ammonium lactate', 'c12-15', 'glycerin', 'prunus amygdalus dulcis', 'ethylhexyl palmitate', 'cetearyl alcohol"| __truncated__ ...
##  $ price        : chr  "£5.20" "£13.00" "£6.20" "£22.50" ...

summary(data)

##  product_name       product_url        product_type       clean_ingreds     
##  Length:1138        Length:1138        Length:1138        Length:1138       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##     price          
##  Length:1138       
##  Class :character  
##  Mode  :character

# Identify missing values
missing_values <- is.na(data)

# Count the number of missing values per variable
colSums(missing_values)

##  product_name   product_url  product_type clean_ingreds         price 
##             0             0             0             0             0

#remove product types related with body care.
# Remove rows where 'product_type' is 'Bath Salts'
data <- data[data$product_type != "Bath Salts", ]

# Remove rows where 'product_type' is 'Bath Oil'
data <- data[data$product_type != "Bath Oil", ]

# Remove rows where 'product_type' is 'Body Wash'
data <- data[data$product_type != "Body Wash", ]

# Convert 'price' column to float
data$price <- as.numeric(gsub("[^0-9.]", "", data$price))

# Print the shape of the dataframe
cat("Number of rows:", nrow(data), "\n")

## Number of rows: 946

cat("Number of columns:", ncol(data), "\n")

## Number of columns: 5

# Create a dataframe with the counts of each product type
Prod_Type <- data.frame(table(data$product_type))
colnames(Prod_Type) <- c("product_type", "count")

# Print the dataframe
print(Prod_Type)

##    product_type count
## 1          Balm    61
## 2      Cleanser   115
## 3    Exfoliator    57
## 4      Eye Care   100
## 5          Mask   124
## 6          Mist    80
## 7   Moisturiser   115
## 8           Oil    76
## 9          Peel    32
## 10        Serum   113
## 11        Toner    73

# Group by 'product_type' and calculate the mean price
PricexProd <- data %>%
  group_by(product_type) %>%
  summarize(mean_price = mean(price)) %>%
  arrange(desc(mean_price))

# Print the result
print(PricexProd)

## # A tibble: 11 × 2
##    product_type mean_price
##    <chr>             <dbl>
##  1 Peel               49.5
##  2 Serum              46.2
##  3 Eye Care           30.7
##  4 Oil                26.1
##  5 Moisturiser        23.9
##  6 Mist               22.9
##  7 Toner              21.2
##  8 Balm               20.9
##  9 Exfoliator         18.9
## 10 Cleanser           17.6
## 11 Mask               15.1

#CREATING NEW VARIABLE "brand"
# Define the specific brand names to consider
specific_brands <- c("Alchimie Forever","Aromatherapy Associates","Balance Me","Bloom & Blossom","Bobbi Brown","Bondi Sands","Bubble T","Burt's Bees",
                     "By Terry","Comfort Zone","Dear, Klairs", "Dr Dennis Gross","Dr. Hauschka","Dr. Brandt","Dr. Hauschka","Dr. PAWPAW","Egyptian Magic",
                     "Elizabeth Arden","Emma Hardie","Erno Laszlo","Estee Lauder","Eve Lom","Face by Skinny Tan","Fade Out","First Aid Beauty",
                     "Frank Body","Goldfaden MD","Holika Holika","Indeed Labs","Instant Effects","Institut Esthederm","James Read","Jo Malone","L.A BRUKET",
                     "La Roche-Posay","Lancer Skincare","Laura Mercier","Little Butterfly","Liz Earle","L'Oreal Paris","Love Boo","Lumene Nordic",
                     "Mama Mio","Manuka Doctor","Molton Brown","Natura Bisse","Neal's Yard","Oh K!","Perricone MD","Pestle & Mortar",
                     "Peter Thomas","Piz Buin","Radical Skincare","Recipe for Men","Revolution Skincare","Salcura Antiac","Sanctuary Spa",
                     "Sarah Chapman","Sea Magik","Skin Doctors","Sol de Janeiro","Spa Magik Organiks","Talika Skintelligence","The Chemistry",
                     "The INKEY","The Ordinary","The Organic Pharmacy","The Ritual","Too Faced","yes to")

# Create a new variable 'brand' by extracting data from 'product_name' based on specific criteria
data$brand <- ""
for (brand in specific_brands) {
  data$brand <- ifelse(grepl(brand, data$product_name), brand, data$brand)
}
data$brand[data$brand == ""] <- gsub("\\s.*", "", data$product_name[data$brand == ""])

#FREQUENCY TABLE FOR VARIABLE PRODUCT TYPE, BRAND
# Create a frequency table for product types
freq_table1 <- table(data$product_type)

# Print the frequency table
print(freq_table1)

## 
##        Balm    Cleanser  Exfoliator    Eye Care        Mask        Mist 
##          61         115          57         100         124          80 
## Moisturiser         Oil        Peel       Serum       Toner 
##         115          76          32         113          73

# Create a frequency table for brand
freq_table2 <- table(data$brand)

# Print the frequency table
print(freq_table2)

## 
##                 A'kin              Acorelle                 Aesop 
##                     2                     1                     4 
##                 AHAVA      Alchimie Forever               Alpha-H 
##                     9                     1                     2 
##                 Ambre            AMELIORATE             Antipodes 
##                     1                     3                     6 
##               APIVITA                   ARK                Armani 
##                    14                     1                     1 
##            AromaWorks               Aurelia                 Avant 
##                     2                     4                     3 
##                 Aveda                 Avene                 Avène 
##                     2                     8                    11 
##            Balance Me                BARBER          bareMinerals 
##                     3                     2                     7 
##                   BBB             BeautyPro               benefit 
##                     3                     5                     3 
##                Benton              Bioderma                 Bloom 
##                     2                     2                     1 
##           Bobbi Brown           Bondi Sands               Bulldog 
##                     6                     6                    13 
##           Burt's Bees              By Terry              Caudalie 
##                     5                     4                    16 
##                CeraVe          Chantecaille              Clinique 
##                    13                     5                    46 
##          Comfort Zone                 COSRX               Crystal 
##                     2                     5                     1 
##               Darphin          Dear, Klairs               DECLÉOR 
##                     8                     2                    20 
##           Dermalogica                   DHC                    Dr 
##                     9                     9                     2 
##       Dr Dennis Gross            Dr. Brandt          Dr. Hauschka 
##                     2                     1                     1 
##            Dr. PAWPAW              Dr.Jart+        Egyptian Magic 
##                     3                     2                     2 
##                Elemis                ELEMIS       Elizabeth Arden 
##                    19                     4                    18 
##           Embryolisse           Emma Hardie           Erno Laszlo 
##                     4                     3                     6 
##                  ESPA                 Estée               Eucerin 
##                     1                    23                     1 
##              Eucerin®               Eve Lom    Face by Skinny Tan 
##                     1                     5                     1 
##              Fade Out               FARMACY               Filorga 
##                     4                     2                     3 
##      First Aid Beauty                 FOREO            Frank Body 
##                    14                     3                     9 
##           Freezeframe              Gallinée               Garnier 
##                     1                     5                    33 
##              GLAMGLOW          Goldfaden MD                 Green 
##                     5                     2                     1 
##         Holika Holika          ilapothecary           Indeed Labs 
##                    23                     1                     6 
##                 INIKA       Instant Effects    Institut Esthederm 
##                     1                     1                     1 
##            James Read                 JASON              Jurlique 
##                     1                     5                     6 
##               KLORANE               L'Oréal         L'Oreal Paris 
##                     1                    25                     1 
##               L’Oréal        La Roche-Posay       Lancer Skincare 
##                     2                    36                     1 
##               Lancôme              Lanolips         Laura Mercier 
##                    17                     4                     2 
##             LIXIRSKIN             Liz Earle             Löwengrip 
##                     1                     9                     3 
##         Lumene Nordic                   MAC                MÁDARA 
##                     6                     2                     2 
##          MAGICSTRIPES              Mama Mio         Manuka Doctor 
##                     2                     6                     5 
##                 Mauli                Mavala            Maybelline 
##                     1                     3                     1 
##                Medik8                Menaji                  MONU 
##                     9                     1                     2 
##              MONUPLUS           Moroccanoil                 Murad 
##                     1                     1                    13 
##                  NARS                 Natio                Natura 
##                     1                     8                     4 
##           Neal's Yard            Neutrogena           Neutrogena® 
##                     6                    12                     1 
##                  NIOD               NIP+FAB                  NUXE 
##                     5                    12                     9 
##                   NYX                 Oh K!            Omorovicza 
##                     1                     5                     7 
##               Origins                 OSKIA                   Pai 
##                    12                     7                     7 
##          Perricone MD       Pestle & Mortar          Peter Thomas 
##                     2                     1                    11 
##            philosophy                  PIXI              Piz Buin 
##                     2                    29                     1 
##                Polaar                  PRAI               Project 
##                     2                     1                     1 
##      Radical Skincare              RapidEye        Recipe for Men 
##                     2                     1                     1 
##                   REN                  RENU   Revolution Skincare 
##                    16                     1                     5 
##                Rodial        Salcura Antiac         Sanctuary Spa 
##                     5                     2                    11 
##         Sarah Chapman             Sea Magik              Shiseido 
##                     4                     1                     2 
##          Skin Doctors         SkinCeuticals    Spa Magik Organiks 
##                     2                     3                     1 
##            StriVectin                 Sukin                   SVR 
##                     1                    14                     2 
## Talika Skintelligence              Tan-Luxe            TanOrganic 
##                     1                     1                     1 
##         The Chemistry             The INKEY          The Ordinary 
##                     3                     7                    17 
##  The Organic Pharmacy            The Ritual             Too Faced 
##                     3                     6                     1 
##               Trilogy              Ultrasun              ULTRASUN 
##                     4                     1                     1 
##                Uriage                 VERSO                 Vichy 
##                     1                     1                     1 
##                 VICHY                Weleda                yes to 
##                     7                     6                     2 
##                Zelens 
##                     5

EXPLORATORY DATA ANALYSIS

library(dplyr)
library(tidyr)
library(ggplot2)

###DATA VISUALIZATION

#TOP 10 BRANDS WITH AVERAGE PRICE
# Calculate the count of each brand
brand_counts <- data %>%
  count(brand) %>%
  arrange(desc(n))

# Average price by brand
avg_price_by_brand <- data %>%
  group_by(brand) %>%
  summarise(avg_price = mean(price, na.rm = TRUE)) %>%
  arrange(desc(avg_price))

# Display top 10 brands
top_10_brands <- head(avg_price_by_brand, 10)

# Create a vertical bar chart of the average prices of the top 10 brands
ggplot(top_10_brands, aes(x = reorder(brand, -avg_price), y = avg_price)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  labs(title = "Top 10 Brands by Average Price", x = "Brand", y = "Average Price") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create the chart: Distribution of product type
chart1 <- ggplot(data, aes(x = product_type)) +
  geom_bar(fill = "lightblue", color = "black") +
  labs(title = "Distribution of Product Types", x = "Product Type", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
print(chart1)

library(wordcloud)
library(tm)

# Word cloud of clean ingredients
clean_ingreds <- unlist(strsplit(data$clean_ingreds, ","))
wordcloud(clean_ingreds, max.words = 50, random.order = FALSE)

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

# Create the boxplot before removing outliers
boxplot_before <- ggplot(data, aes(y = price)) +
  geom_boxplot() +
  labs(title = "Boxplot of Price (Before Removing Outliers)", y = "Price") +
  theme_minimal()
print(boxplot_before)

# Calculate the lower and upper bounds for outliers using the IQR method
q1 <- quantile(data$price, 0.25)
q3 <- quantile(data$price, 0.75)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr

# Remove outliers
data_filtered <- subset(data, price >= lower_bound & price <= upper_bound)

# Create the boxplot after removing outliers
boxplot_after <- ggplot(data_filtered, aes(y = price)) +
  geom_boxplot() +
  labs(title = "Boxplot of Price (After Removing Outliers)", y = "Price") +
  theme_minimal()
print(boxplot_after)

Cross-tabulate

categorical_vars <- c("product_type", "brand")

# Create a cross-tabulation with frequency counts and percentages for each categorical variable
for (var in categorical_vars) {
  cross_tab <- table(data[[var]])

  if (var == "brand") {
    # Get the top 10 brands
    top_10_brands <- names(head(sort(cross_tab, decreasing = TRUE), 10))

    # Filter the cross-tabulation for the top 10 brands
    cross_tab <- cross_tab[top_10_brands]
  }

  # Calculate percentage
  cross_tab_percentage <- prop.table(cross_tab) * 100

  # Convert the cross-tabulation and percentage to data frames
  cross_tab_df <- as.data.frame(cross_tab)
  cross_tab_percentage_df <- as.data.frame(cross_tab_percentage)

  # Format the percentage column with four decimal places and the '%' symbol
  cross_tab_percentage_df$Freq <- paste0(format(cross_tab_percentage_df$Freq, nsmall = 4), "%")

  # Print the cross-tabulation with frequency counts and percentages as a table
  cat("Cross-Tabulation for", var, "\n")
  print(knitr::kable(cross_tab_df))
  cat("\n")
  cat("Percentage for", var, "\n")
  print(knitr::kable(cross_tab_percentage_df))
  cat("\n")
}

## Cross-Tabulation for product_type 
## 
## 
## |Var1        | Freq|
## |:-----------|----:|
## |Balm        |   61|
## |Cleanser    |  115|
## |Exfoliator  |   57|
## |Eye Care    |  100|
## |Mask        |  124|
## |Mist        |   80|
## |Moisturiser |  115|
## |Oil         |   76|
## |Peel        |   32|
## |Serum       |  113|
## |Toner       |   73|
## 
## Percentage for product_type 
## 
## 
## |Var1        |Freq       |
## |:-----------|:----------|
## |Balm        |6.448203%  |
## |Cleanser    |12.156448% |
## |Exfoliator  |6.025370%  |
## |Eye Care    |10.570825% |
## |Mask        |13.107822% |
## |Mist        |8.456660%  |
## |Moisturiser |12.156448% |
## |Oil         |8.033827%  |
## |Peel        |3.382664%  |
## |Serum       |11.945032% |
## |Toner       |7.716702%  |
## 
## Cross-Tabulation for brand 
## 
## 
## |Var1            | Freq|
## |:---------------|----:|
## |Clinique        |   46|
## |La Roche-Posay  |   36|
## |Garnier         |   33|
## |PIXI            |   29|
## |L'Oréal         |   25|
## |Estée           |   23|
## |Holika Holika   |   23|
## |DECLÉOR         |   20|
## |Elemis          |   19|
## |Elizabeth Arden |   18|
## 
## Percentage for brand 
## 
## 
## |Var1            |Freq       |
## |:---------------|:----------|
## |Clinique        |16.911765% |
## |La Roche-Posay  |13.235294% |
## |Garnier         |12.132353% |
## |PIXI            |10.661765% |
## |L'Oréal         |9.191176%  |
## |Estée           |8.455882%  |
## |Holika Holika   |8.455882%  |
## |DECLÉOR         |7.352941%  |
## |Elemis          |6.985294%  |
## |Elizabeth Arden |6.617647%  |

# Create a cross-tabulation with frequency counts and percentages for each categorical variable
for (var in categorical_vars) {
  cross_tab <- table(data[[var]])

  if (var == "brand") {
    # Get the top 10 brands
    top_10_brands <- names(head(sort(cross_tab, decreasing = TRUE), 10))

    # Filter the cross-tabulation for the top 10 brands
    cross_tab <- cross_tab[top_10_brands]
  }

  # Calculate percentage
  cross_tab_percentage <- prop.table(cross_tab) * 100

  # Convert the percentage to a data frame
  cross_tab_percentage_df <- as.data.frame(cross_tab_percentage)

  # Plot the percentage as a bar chart
  p <- ggplot(cross_tab_percentage_df, aes(x = Var1, y = Freq)) +
    geom_bar(stat = "identity", fill = "green") +
    labs(x = var, y = "Percentage") +
    ggtitle(paste0("Percentage for ", var))

  # Print the plot
  print(p)
}

Mode

categorical_vars <- c("product_type", "brand","product_name")

for (var in categorical_vars) {
  if (var == "product_name") {
    next
  }
  
  freq_table <- table(data[[var]])
  mode <- names(freq_table)[which.max(freq_table)]
  
  cat("Mode for", var, ":", mode, "\n")
}

## Mode for product_type : Mask 
## Mode for brand : Clinique

library(ggplot2)

categorical_vars <- c("product_type", "brand")

# Create a cross-tabulation with frequency counts for each categorical variable
for (var in categorical_vars) {
  freq_table <- table(data[[var]])

  # Find the category with the highest count (mode)
  mode <- names(freq_table)[which.max(freq_table)]

  # Convert the frequency table to a data frame
  freq_table_df <- data.frame(Category = names(freq_table), Frequency = as.vector(freq_table))

  # Plot the bar chart
  p <- ggplot(freq_table_df, aes(x = Category, y = Frequency, fill = Category == mode)) +
    geom_bar(stat = "identity") +
    labs(x = var, y = "Frequency") +
    ggtitle(paste0("Frequency Counts for ", var)) +
    scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    guides(fill = FALSE)

  # Print the plot
  print(p)

  # Print the mode
  cat("Mode for", var, ":", mode, "\n\n")
}

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Mode for product_type : Mask

## Mode for brand : Clinique

Chi-square Analysis

#analyze relationship between variables

# Create the contingency table
cont_table <- table(data$clean_ingreds, data$product_type)

# Perform chi-square test of independence
chi_test <- chisq.test(cont_table)

## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect

# Print the test results
print(chi_test)

## 
##  Pearson's Chi-squared test
## 
## data:  cont_table
## X-squared = 9419.6, df = 8970, p-value = 0.0004713

Data Cleaning and EDA

Data Brains

2023-06-17

DATA CLEANING

EXPLORATORY DATA ANALYSIS

Cross-tabulate

Mode

Chi-square Analysis