DATA CLEANING
# Check the structure and summary of the dataset
str(data)
## 'data.frame': 1138 obs. of 5 variables:
## $ product_name : chr "The Ordinary Natural Moisturising Factors + HA 30ml" "CeraVe Facial Moisturising Lotion SPF 25 52ml" "The Ordinary Hyaluronic Acid 2% + B5 Hydration Support Formula 30ml" "AMELIORATE Transforming Body Lotion 200ml" ...
## $ product_url : chr "https://www.lookfantastic.com/the-ordinary-natural-moisturising-factors-ha-30ml/11396687.html" "https://www.lookfantastic.com/cerave-facial-moisturising-lotion-spf-25-52ml/11798689.html" "https://www.lookfantastic.com/the-ordinary-hyaluronic-acid-2-b5-hydration-support-formula-30ml/11363395.html" "https://www.lookfantastic.com/ameliorate-transforming-body-lotion-200ml/11865352.html" ...
## $ product_type : chr "Moisturiser" "Moisturiser" "Moisturiser" "Moisturiser" ...
## $ clean_ingreds: chr "['capric triglyceride', 'cetyl alcohol', 'propanediol', 'stearyl alcohol', 'glycerin', 'sodium hyaluronate', 'a"| __truncated__ "['homosalate', 'glycerin', 'octocrylene', 'ethylhexyl', 'salicylate', 'niacinamide', 'silica', 'butyl methoxydi"| __truncated__ "['sodium hyaluronate', 'sodium hyaluronate', 'panthenol', 'ahnfeltia concinna extract', 'glycerin', 'pentylene "| __truncated__ "['ammonium lactate', 'c12-15', 'glycerin', 'prunus amygdalus dulcis', 'ethylhexyl palmitate', 'cetearyl alcohol"| __truncated__ ...
## $ price : chr "£5.20" "£13.00" "£6.20" "£22.50" ...
summary(data)
## product_name product_url product_type clean_ingreds
## Length:1138 Length:1138 Length:1138 Length:1138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## price
## Length:1138
## Class :character
## Mode :character
# Identify missing values
missing_values <- is.na(data)
# Count the number of missing values per variable
colSums(missing_values)
## product_name product_url product_type clean_ingreds price
## 0 0 0 0 0
#remove product types related with body care.
# Remove rows where 'product_type' is 'Bath Salts'
data <- data[data$product_type != "Bath Salts", ]
# Remove rows where 'product_type' is 'Bath Oil'
data <- data[data$product_type != "Bath Oil", ]
# Remove rows where 'product_type' is 'Body Wash'
data <- data[data$product_type != "Body Wash", ]
# Convert 'price' column to float
data$price <- as.numeric(gsub("[^0-9.]", "", data$price))
# Print the shape of the dataframe
cat("Number of rows:", nrow(data), "\n")
## Number of rows: 946
cat("Number of columns:", ncol(data), "\n")
## Number of columns: 5
# Create a dataframe with the counts of each product type
Prod_Type <- data.frame(table(data$product_type))
colnames(Prod_Type) <- c("product_type", "count")
# Print the dataframe
print(Prod_Type)
## product_type count
## 1 Balm 61
## 2 Cleanser 115
## 3 Exfoliator 57
## 4 Eye Care 100
## 5 Mask 124
## 6 Mist 80
## 7 Moisturiser 115
## 8 Oil 76
## 9 Peel 32
## 10 Serum 113
## 11 Toner 73
# Group by 'product_type' and calculate the mean price
PricexProd <- data %>%
group_by(product_type) %>%
summarize(mean_price = mean(price)) %>%
arrange(desc(mean_price))
# Print the result
print(PricexProd)
## # A tibble: 11 × 2
## product_type mean_price
## <chr> <dbl>
## 1 Peel 49.5
## 2 Serum 46.2
## 3 Eye Care 30.7
## 4 Oil 26.1
## 5 Moisturiser 23.9
## 6 Mist 22.9
## 7 Toner 21.2
## 8 Balm 20.9
## 9 Exfoliator 18.9
## 10 Cleanser 17.6
## 11 Mask 15.1
#CREATING NEW VARIABLE "brand"
# Define the specific brand names to consider
specific_brands <- c("Alchimie Forever","Aromatherapy Associates","Balance Me","Bloom & Blossom","Bobbi Brown","Bondi Sands","Bubble T","Burt's Bees",
"By Terry","Comfort Zone","Dear, Klairs", "Dr Dennis Gross","Dr. Hauschka","Dr. Brandt","Dr. Hauschka","Dr. PAWPAW","Egyptian Magic",
"Elizabeth Arden","Emma Hardie","Erno Laszlo","Estee Lauder","Eve Lom","Face by Skinny Tan","Fade Out","First Aid Beauty",
"Frank Body","Goldfaden MD","Holika Holika","Indeed Labs","Instant Effects","Institut Esthederm","James Read","Jo Malone","L.A BRUKET",
"La Roche-Posay","Lancer Skincare","Laura Mercier","Little Butterfly","Liz Earle","L'Oreal Paris","Love Boo","Lumene Nordic",
"Mama Mio","Manuka Doctor","Molton Brown","Natura Bisse","Neal's Yard","Oh K!","Perricone MD","Pestle & Mortar",
"Peter Thomas","Piz Buin","Radical Skincare","Recipe for Men","Revolution Skincare","Salcura Antiac","Sanctuary Spa",
"Sarah Chapman","Sea Magik","Skin Doctors","Sol de Janeiro","Spa Magik Organiks","Talika Skintelligence","The Chemistry",
"The INKEY","The Ordinary","The Organic Pharmacy","The Ritual","Too Faced","yes to")
# Create a new variable 'brand' by extracting data from 'product_name' based on specific criteria
data$brand <- ""
for (brand in specific_brands) {
data$brand <- ifelse(grepl(brand, data$product_name), brand, data$brand)
}
data$brand[data$brand == ""] <- gsub("\\s.*", "", data$product_name[data$brand == ""])
#FREQUENCY TABLE FOR VARIABLE PRODUCT TYPE, BRAND
# Create a frequency table for product types
freq_table1 <- table(data$product_type)
# Print the frequency table
print(freq_table1)
##
## Balm Cleanser Exfoliator Eye Care Mask Mist
## 61 115 57 100 124 80
## Moisturiser Oil Peel Serum Toner
## 115 76 32 113 73
# Create a frequency table for brand
freq_table2 <- table(data$brand)
# Print the frequency table
print(freq_table2)
##
## A'kin Acorelle Aesop
## 2 1 4
## AHAVA Alchimie Forever Alpha-H
## 9 1 2
## Ambre AMELIORATE Antipodes
## 1 3 6
## APIVITA ARK Armani
## 14 1 1
## AromaWorks Aurelia Avant
## 2 4 3
## Aveda Avene Avène
## 2 8 11
## Balance Me BARBER bareMinerals
## 3 2 7
## BBB BeautyPro benefit
## 3 5 3
## Benton Bioderma Bloom
## 2 2 1
## Bobbi Brown Bondi Sands Bulldog
## 6 6 13
## Burt's Bees By Terry Caudalie
## 5 4 16
## CeraVe Chantecaille Clinique
## 13 5 46
## Comfort Zone COSRX Crystal
## 2 5 1
## Darphin Dear, Klairs DECLÉOR
## 8 2 20
## Dermalogica DHC Dr
## 9 9 2
## Dr Dennis Gross Dr. Brandt Dr. Hauschka
## 2 1 1
## Dr. PAWPAW Dr.Jart+ Egyptian Magic
## 3 2 2
## Elemis ELEMIS Elizabeth Arden
## 19 4 18
## Embryolisse Emma Hardie Erno Laszlo
## 4 3 6
## ESPA Estée Eucerin
## 1 23 1
## Eucerin® Eve Lom Face by Skinny Tan
## 1 5 1
## Fade Out FARMACY Filorga
## 4 2 3
## First Aid Beauty FOREO Frank Body
## 14 3 9
## Freezeframe Gallinée Garnier
## 1 5 33
## GLAMGLOW Goldfaden MD Green
## 5 2 1
## Holika Holika ilapothecary Indeed Labs
## 23 1 6
## INIKA Instant Effects Institut Esthederm
## 1 1 1
## James Read JASON Jurlique
## 1 5 6
## KLORANE L'Oréal L'Oreal Paris
## 1 25 1
## L’Oréal La Roche-Posay Lancer Skincare
## 2 36 1
## Lancôme Lanolips Laura Mercier
## 17 4 2
## LIXIRSKIN Liz Earle Löwengrip
## 1 9 3
## Lumene Nordic MAC MÁDARA
## 6 2 2
## MAGICSTRIPES Mama Mio Manuka Doctor
## 2 6 5
## Mauli Mavala Maybelline
## 1 3 1
## Medik8 Menaji MONU
## 9 1 2
## MONUPLUS Moroccanoil Murad
## 1 1 13
## NARS Natio Natura
## 1 8 4
## Neal's Yard Neutrogena Neutrogena®
## 6 12 1
## NIOD NIP+FAB NUXE
## 5 12 9
## NYX Oh K! Omorovicza
## 1 5 7
## Origins OSKIA Pai
## 12 7 7
## Perricone MD Pestle & Mortar Peter Thomas
## 2 1 11
## philosophy PIXI Piz Buin
## 2 29 1
## Polaar PRAI Project
## 2 1 1
## Radical Skincare RapidEye Recipe for Men
## 2 1 1
## REN RENU Revolution Skincare
## 16 1 5
## Rodial Salcura Antiac Sanctuary Spa
## 5 2 11
## Sarah Chapman Sea Magik Shiseido
## 4 1 2
## Skin Doctors SkinCeuticals Spa Magik Organiks
## 2 3 1
## StriVectin Sukin SVR
## 1 14 2
## Talika Skintelligence Tan-Luxe TanOrganic
## 1 1 1
## The Chemistry The INKEY The Ordinary
## 3 7 17
## The Organic Pharmacy The Ritual Too Faced
## 3 6 1
## Trilogy Ultrasun ULTRASUN
## 4 1 1
## Uriage VERSO Vichy
## 1 1 1
## VICHY Weleda yes to
## 7 6 2
## Zelens
## 5
EXPLORATORY DATA ANALYSIS
library(dplyr)
library(tidyr)
library(ggplot2)
###DATA VISUALIZATION
#TOP 10 BRANDS WITH AVERAGE PRICE
# Calculate the count of each brand
brand_counts <- data %>%
count(brand) %>%
arrange(desc(n))
# Average price by brand
avg_price_by_brand <- data %>%
group_by(brand) %>%
summarise(avg_price = mean(price, na.rm = TRUE)) %>%
arrange(desc(avg_price))
# Display top 10 brands
top_10_brands <- head(avg_price_by_brand, 10)
# Create a vertical bar chart of the average prices of the top 10 brands
ggplot(top_10_brands, aes(x = reorder(brand, -avg_price), y = avg_price)) +
geom_bar(stat = "identity", fill = "lightblue") +
labs(title = "Top 10 Brands by Average Price", x = "Brand", y = "Average Price") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create the chart: Distribution of product type
chart1 <- ggplot(data, aes(x = product_type)) +
geom_bar(fill = "lightblue", color = "black") +
labs(title = "Distribution of Product Types", x = "Product Type", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
print(chart1)

library(wordcloud)
library(tm)
# Word cloud of clean ingredients
clean_ingreds <- unlist(strsplit(data$clean_ingreds, ","))
wordcloud(clean_ingreds, max.words = 50, random.order = FALSE)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

# Create the boxplot before removing outliers
boxplot_before <- ggplot(data, aes(y = price)) +
geom_boxplot() +
labs(title = "Boxplot of Price (Before Removing Outliers)", y = "Price") +
theme_minimal()
print(boxplot_before)

# Calculate the lower and upper bounds for outliers using the IQR method
q1 <- quantile(data$price, 0.25)
q3 <- quantile(data$price, 0.75)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
# Remove outliers
data_filtered <- subset(data, price >= lower_bound & price <= upper_bound)
# Create the boxplot after removing outliers
boxplot_after <- ggplot(data_filtered, aes(y = price)) +
geom_boxplot() +
labs(title = "Boxplot of Price (After Removing Outliers)", y = "Price") +
theme_minimal()
print(boxplot_after)

Cross-tabulate
categorical_vars <- c("product_type", "brand")
# Create a cross-tabulation with frequency counts and percentages for each categorical variable
for (var in categorical_vars) {
cross_tab <- table(data[[var]])
if (var == "brand") {
# Get the top 10 brands
top_10_brands <- names(head(sort(cross_tab, decreasing = TRUE), 10))
# Filter the cross-tabulation for the top 10 brands
cross_tab <- cross_tab[top_10_brands]
}
# Calculate percentage
cross_tab_percentage <- prop.table(cross_tab) * 100
# Convert the cross-tabulation and percentage to data frames
cross_tab_df <- as.data.frame(cross_tab)
cross_tab_percentage_df <- as.data.frame(cross_tab_percentage)
# Format the percentage column with four decimal places and the '%' symbol
cross_tab_percentage_df$Freq <- paste0(format(cross_tab_percentage_df$Freq, nsmall = 4), "%")
# Print the cross-tabulation with frequency counts and percentages as a table
cat("Cross-Tabulation for", var, "\n")
print(knitr::kable(cross_tab_df))
cat("\n")
cat("Percentage for", var, "\n")
print(knitr::kable(cross_tab_percentage_df))
cat("\n")
}
## Cross-Tabulation for product_type
##
##
## |Var1 | Freq|
## |:-----------|----:|
## |Balm | 61|
## |Cleanser | 115|
## |Exfoliator | 57|
## |Eye Care | 100|
## |Mask | 124|
## |Mist | 80|
## |Moisturiser | 115|
## |Oil | 76|
## |Peel | 32|
## |Serum | 113|
## |Toner | 73|
##
## Percentage for product_type
##
##
## |Var1 |Freq |
## |:-----------|:----------|
## |Balm |6.448203% |
## |Cleanser |12.156448% |
## |Exfoliator |6.025370% |
## |Eye Care |10.570825% |
## |Mask |13.107822% |
## |Mist |8.456660% |
## |Moisturiser |12.156448% |
## |Oil |8.033827% |
## |Peel |3.382664% |
## |Serum |11.945032% |
## |Toner |7.716702% |
##
## Cross-Tabulation for brand
##
##
## |Var1 | Freq|
## |:---------------|----:|
## |Clinique | 46|
## |La Roche-Posay | 36|
## |Garnier | 33|
## |PIXI | 29|
## |L'Oréal | 25|
## |Estée | 23|
## |Holika Holika | 23|
## |DECLÉOR | 20|
## |Elemis | 19|
## |Elizabeth Arden | 18|
##
## Percentage for brand
##
##
## |Var1 |Freq |
## |:---------------|:----------|
## |Clinique |16.911765% |
## |La Roche-Posay |13.235294% |
## |Garnier |12.132353% |
## |PIXI |10.661765% |
## |L'Oréal |9.191176% |
## |Estée |8.455882% |
## |Holika Holika |8.455882% |
## |DECLÉOR |7.352941% |
## |Elemis |6.985294% |
## |Elizabeth Arden |6.617647% |
# Create a cross-tabulation with frequency counts and percentages for each categorical variable
for (var in categorical_vars) {
cross_tab <- table(data[[var]])
if (var == "brand") {
# Get the top 10 brands
top_10_brands <- names(head(sort(cross_tab, decreasing = TRUE), 10))
# Filter the cross-tabulation for the top 10 brands
cross_tab <- cross_tab[top_10_brands]
}
# Calculate percentage
cross_tab_percentage <- prop.table(cross_tab) * 100
# Convert the percentage to a data frame
cross_tab_percentage_df <- as.data.frame(cross_tab_percentage)
# Plot the percentage as a bar chart
p <- ggplot(cross_tab_percentage_df, aes(x = Var1, y = Freq)) +
geom_bar(stat = "identity", fill = "green") +
labs(x = var, y = "Percentage") +
ggtitle(paste0("Percentage for ", var))
# Print the plot
print(p)
}


Mode
categorical_vars <- c("product_type", "brand","product_name")
for (var in categorical_vars) {
if (var == "product_name") {
next
}
freq_table <- table(data[[var]])
mode <- names(freq_table)[which.max(freq_table)]
cat("Mode for", var, ":", mode, "\n")
}
## Mode for product_type : Mask
## Mode for brand : Clinique
library(ggplot2)
categorical_vars <- c("product_type", "brand")
# Create a cross-tabulation with frequency counts for each categorical variable
for (var in categorical_vars) {
freq_table <- table(data[[var]])
# Find the category with the highest count (mode)
mode <- names(freq_table)[which.max(freq_table)]
# Convert the frequency table to a data frame
freq_table_df <- data.frame(Category = names(freq_table), Frequency = as.vector(freq_table))
# Plot the bar chart
p <- ggplot(freq_table_df, aes(x = Category, y = Frequency, fill = Category == mode)) +
geom_bar(stat = "identity") +
labs(x = var, y = "Frequency") +
ggtitle(paste0("Frequency Counts for ", var)) +
scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = FALSE)
# Print the plot
print(p)
# Print the mode
cat("Mode for", var, ":", mode, "\n\n")
}
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Mode for product_type : Mask

## Mode for brand : Clinique