library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 4.4.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(tm)
## Warning: package 'tm' was built under R version 4.4.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.4.2
library(Matrix)
## Warning: package 'Matrix' was built under R version 4.4.3
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the cosmetics catalogue into a data.table and print its column layout.
df <- data.table::fread(
  input = "D:/angular/Aafrin/dataset/datasets/cosmetics.csv"
)
dplyr::glimpse(df)
## Rows: 1,472
## Columns: 11
## $ Label <chr> "Moisturizer", "Moisturizer", "Moisturizer", "Moisturizer"…
## $ Brand <chr> "LA MER", "SK-II", "DRUNK ELEPHANT", "LA MER", "IT COSMETI…
## $ Name <chr> "Crème de la Mer", "Facial Treatment Essence", "Protini™ P…
## $ Price <int> 175, 179, 68, 175, 38, 68, 60, 72, 29, 325, 45, 47, 38, 10…
## $ Rank <dbl> 4.1, 4.1, 4.4, 3.8, 4.1, 4.2, 4.2, 4.4, 4.4, 5.0, 4.3, 4.4…
## $ Ingredients <chr> "Algae (Seaweed) Extract, Mineral Oil, Petrolatum, Glyceri…
## $ Combination <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0…
## $ Dry <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1…
## $ Normal <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1…
## $ Oily <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0…
## $ Sensitive <int> 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1…
# Count how many products fall into each product category.
table(df[["Label"]])
##
## Cleanser Eye cream Face Mask Moisturizer Sun protect Treatment
## 281 209 266 298 170 248
# Keep the moisturizers flagged for dry skin, with only the two columns the
# later steps read.
moisturizers_dry <- df[
  Dry == 1 & Label == "Moisturizer",
  list(Name, Ingredients)
]
# Number the retained products 1..N by reference; `id` is the product key
# used by every later step.
moisturizers_dry[, id := seq_len(.N)]
head(moisturizers_dry, n = 6L)
## Name
## <char>
## 1: Crème de la Mer
## 2: Facial Treatment Essence
## 3: Protini™ Polypeptide Cream
## 4: The Moisturizing Soft Cream
## 5: Your Skin But Better™ CC+™ Cream with SPF 50+
## 6: Lala Retro™ Whipped Cream
## Ingredients
## <char>
## 1: Algae (Seaweed) Extract, Mineral Oil, Petrolatum, Glycerin, Isohexadecane, Microcrystalline Wax, Lanolin Alcohol, Citrus Aurantifolia (Lime) Extract, Sesamum Indicum (Sesame) Seed Oil, Eucalyptus Globulus (Eucalyptus) Leaf Oil, Sesamum Indicum (Sesame) Seed Powder, Medicago Sativa (Alfalfa) Seed Powder, Helianthus Annuus (Sunflower) Seedcake, Prunus Amygdalus Dulcis (Sweet Almond) Seed Meal, Sodium Gluconate, Copper Gluconate, Calcium Gluconate, Magnesium Gluconate, Zinc Gluconate, Magnesium Sulfate, Paraffin, Tocopheryl Succinate, Niacin, Water, Beta-Carotene, Decyl Oleate, Aluminum Distearate, Octyldodecanol, Citric Acid, Cyanocobalamin, Magnesium Stearate, Panthenol, Limonene, Geraniol, Linalool, Hydroxycitronellal, Citronellol, Benzyl Salicylate, Citral, Sodium Benzoate, Alcohol Denat., Fragrance.
## 2: Galactomyces Ferment Filtrate (Pitera), Butylene Glycol, Pentylene Glycol, Water, Sodium Benzoate, Methylparaben, Sorbic Acid.
## 3: Water, Dicaprylyl Carbonate, Glycerin, Cetearyl Alcohol, Cetearyl Olivate, Sorbitan Olivate, Sclerocarya Birrea Seed Oil, Bacillus/Soybean/ Folic Acid Ferment Extract, Nymphaea Alba Root Extract, sh-Oligopeptide-1, sh-Oligopeptide-2, sh-Polypeptide-1, sh-Polypeptide-9, sh-Polypeptide-11, Copper Palmitoyl Heptapeptide-14, Heptapeptide-15 Palmitate, Palmitoyl Tetrapeptide-7, Palmitoyl Tripeptide-1, Alanine, Arginine, Glycine, Histidine, Isoleucine, Phenylalanine, Proline, Serine, Threonine, Valine, Acetyl Glutamine, Coconut Alkanes , Coco-Caprylate/Caprate, Sodium Hyaluronate, Aspartic Acid, Linoleic Acid, Linolenic Acid, Lecithin, Butylene Glycol, Polyvinyl Alcohol, Sodium Lactate, Sodium PCA, PCA, Sorbitan Isostearate, Carbomer, Polysorbate 20, Polysorbate 60, Lactic Acid/Glycolic Acid Copolymer, Hydroxyethyl Acrylate/Sodium Acryloyldimethyl Taurate Copolymer, Xanthan Gum, Isomalt, 1,2-Hexanediol, Caprylyl Glycol, Chlorphenesin, Phenoxyethanol, Tocopherol, Sodium Benzoate, Phenylpropanol, Glyceryl Caprylate, Symphytum Officinale Callus Culture Extract.
## 4: Algae (Seaweed) Extract, Cyclopentasiloxane, Petrolatum, Glyceryl Distearate, Phenyl Trimethicone, Butylene Glycol, Hydrogenated Vegetable Oil, Cholesterol, Butyrospermum Parkii (Shea Butter), Steareth-10, Dimethicone, Glyceryl Stearate Se, Polysilicone-11, Sesamum Indicum (Sesame) Seed Oil, Medicago Sativa (Alfalfa) Seed Powder, Helianthus Annuus (Sunflower) Seedcake, Prunus Amygdalus Dulcis (Sweet Almond) Seed Meal, Eucalyptus Globulus (Eucalyptus) Leaf Oil, Sodium Gluconate, Copper Gluconate, Calcium Gluconate, Magnesium Gluconate, Zinc Gluconate, Tocopheryl Succinate, Niacin, Sesamum Indicum (Sesame) Seed Powder, Water, Citrus Aurantifolia (Lime) Peel Extract, Laminaria Digitata Extract, Crithmum Maritimum Extract, Salicornia Herbacea Extract, Plankton Extract, Chlorella Vulgaris Extract, Glycine Soja (Soybean) Seed Extract, Glycerin, Caffeine, Sea Salt/Maris Sal/Sel Marin, Micrococcus Lysate, Diethylhexyl Succinate, Adenosine Phosphate, Creatine, Hydrolyzed Algin, Isocetyl Stearoyl Stearate, Cetyl Alcohol, Sucrose, Acetyl Hexapeptide-8, Glucose Oxidase, Polyacrylamide, Acetyl Carnitine Hcl, Glucose, Caprylic/Capric Triglyceride, C13-14 Isoparaffin, Tocopheryl Acetate, Tetrahexyldecyl Ascorbate, Sodium Pca, Glycosaminoglycans, Urea, Distearyldimonium Chloride, Dipalmitoyl Hydroxyproline, Sodium Hyaluronate, Laureth-7, Lecithin, Trehalose, Polyquaternium-51, Lactoperoxidase, Hydroxypropyl Cyclodextrin, Cyanocobalamin, Pentylene Glycol, Fragrance, Disodium Edta, Bht, Citronellol, Hydroxycitronellal, Geraniol, Linalool, Limonene, Potassium Sorbate, Phenoxyethanol
## 5: Water, Snail Secretion Filtrate, Phenyl Trimethicone, Dimethicone, Butylene Glycol, Butylene Glycol Dicaprylate/Dicaprate, Orbignya Oleifera Seed Oil, Butyloctyl Salicylate, Cetyl Peg/Ppg-10/1 Dimethicone, Cyclopentasiloxane, Cyclohexasiloxane, Magnesium Sulfate, Polyglyceryl-4 Isostearate, Dimethicone/Vinyl Dimethicone Crosspolymer, Aluminum Hydroxide, Hexyl Laurate, Stearic Acid, Calcium Stearate, Caprylyl Glycol, Triethoxycaprylylsilane, Ethylhexylglycerin, Citrus Medica Limonum (Lemon) Peel Oil, Tocopheryl Acetate, Sorbitan Isostearate, Phenoxyethanol, Citrus Aurantium Bergamia (Bergamot) Fruit Oil, 1,2-Hexanediol, Disodium Edta, Citrus Aurantium Dulcis (Orange) Peel Oil, Citrus Aurantifolia (Lime) Oil, Vitis Vinifera (Grape) Seed Oil, Punica Granatum Seed Oil, Pinus Sylvestris Leaf Oil, Persea Gratissima (Avocado) Oil, Niacinamide, Citrus Grandis (Grapefruit) Peel Oil, Cholesterol, Anthemis Nobilis Flower Water, Lactobacillus/Honeysuckle Flower/Licorice Root/Morus Alba Root/Pueraria Lobata Root/Schizandra Chinensis Fruit/Scutellaria Baicalensis Root/Sophora Japonica Flower Extract Ferment Filtrate, Perfluorohexane, Olea Europaea (Olive) Leaf Extract, Glycerin, Eucalyptus Globulus Leaf Oil, Camellia Sinensis Leaf Extract, Chrysanthemum Indicum Flower Extract, Pueraria Lobata Root Extract, Perfluorodecalin, Morus Alba Fruit Extract, Magnolia Kobus Bark Extract, Glycine Soja (Soybean) Sprout Extract, Diospyros Kaki Leaf Extract, Cinnamomum Cassia Bark Extract, Artemisia Princeps Leaf Extract, Pentafluoropropane, Curcuma Longa (Turmeric) Root Extract, Steareth-20, Hydrolyzed Hyaluronic Acid, Colloidal Oatmeal, Hydrolyzed Silk, Citric Acid, Sodium Benzoate, Potassium Sorbate, Aloe Barbadensis Leaf Extract, N-Hydroxysuccinimide, Hydrolyzed Collagen, Caprylhydroxamic Acid, Tocopherol, Thiamine Hcl, Riboflavin, Retinyl Palmitate, Pantothenic Acid, Palmitoyl Oligopeptide, Niacin, Folic Acid, Chrysin, Carnitine Hcl, Biotin, Ascorbic Acid, Palmitoyl Tetrapeptide-7, 
## Chlorhexidine Digluconate. May Contain: Iron Oxides (Ci 77492, Ci 77491, Ci 77499).
## 6: Water, Glycerin, Caprylic/ Capric Triglyceride, Isopropyl Isostearate, Pseudozyma Epicola/Camellia Sinensis Seed Oil/Glucose/Glycine Soja Meal/Malt Extract/Yeast Extract Ferment Filtrate (Pseudozyma Epicola/Camellia Sinensis Seed Oil/Glucose/Yeast Extract Ferment Filtrate), Stearic Acid, Glyceryl Stearate SE, Cetearyl Alcohol, Pentylene Glycol, Plantago Lanceolata Leaf Extract, Adansonia Digitata Seed Oil, Citrullus Lanatus (Watermelon) Seed Oil, Passiflora Edulis Seed Oil, Schinziophyton Rautanenii Kernel Oil, Sclerocarya Birrea Seed Oil, Polyglyceryl-6 Ximenia Americana Seedate, Sodium Hyaluronate Crosspolymer, Ceteareth-20, Trisodium Ethylenediamine Disuccinate, Sodium Hydroxide, Citric Acid, Carbomer, Xanthan Gum, Caprylyl Glycol, Chlorphenesin, Phenoxyethanol, Ethylhexylglycerin.
## id
## <int>
## 1: 1
## 2: 2
## 3: 3
## 4: 4
## 5: 5
## 6: 6
# Split each product's ingredient list into one lower-cased token per row.
#
# The split is on ", " (comma + space) rather than a bare comma so that
# chemical names containing a comma, such as "1,2-hexanediol", stay intact.
tokenized <- moisturizers_dry[, .(id, Ingredients = str_to_lower(Ingredients))]
tokenized <- tokenized[
  ,
  .(Ingredient = unlist(str_split(Ingredients, ", "))),
  by = id
]
# Normalise the tokens so the same ingredient always has the same spelling:
# collapse embedded line breaks and stray spaces, and drop the full stop that
# ends an ingredient list (otherwise "fragrance." and "fragrance" would be
# counted as two different ingredients).
tokenized[, Ingredient := str_remove(str_squish(Ingredient), "\\s*\\.$")]
# An ingredient listed twice for one product should only count once.
tokenized <- unique(tokenized)
head(tokenized)
## id Ingredient
## <int> <char>
## 1: 1 algae (seaweed) extract
## 2: 1 mineral oil
## 3: 1 petrolatum
## 4: 1 glycerin
## 5: 1 isohexadecane
## 6: 1 microcrystalline wax
# Build a binary product-by-ingredient matrix and embed it in two dimensions.
#
# Rows are the factor codes of `tokenized$id`; columns are the factor codes of
# `tokenized$Ingredient`.
# sparseMatrix() adds up the values of repeated (i, j) pairs, so an ingredient
# listed twice for one product would be weighted 2. De-duplicating the index
# pairs first keeps every entry at 0/1 (absent/present).
ingredient_pairs <- unique(cbind(
  i = as.integer(factor(tokenized$id)),
  j = as.integer(factor(tokenized$Ingredient))
))
dtm <- sparseMatrix(
  i = ingredient_pairs[, "i"],
  j = ingredient_pairs[, "j"],
  x = 1
)
dense_dtm <- as.matrix(dtm)
# Products with identical ingredient vectors collapse to a single row here;
# Rtsne() is then given a matrix without duplicated rows.
unique_dtm <- unique(dense_dtm)
# Rtsne() rejects a perplexity above (nrow(X) - 1) / 3. Cap the usual value of
# 30 so the script also runs on smaller product subsets.
tsne_perplexity <- min(30, floor((nrow(unique_dtm) - 1) / 3))
# t-SNE starts from a random layout; fix the seed so re-runs give the same map.
set.seed(42)
tsne_result <- Rtsne(
  unique_dtm,
  dims = 2,
  perplexity = tsne_perplexity,
  verbose = TRUE
)
## Performing PCA
## Read the 183 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.04 seconds (sparsity = 0.729553)!
## Learning embedding...
## Iteration 50: error is 53.873739 (50 iterations in 0.03 seconds)
## Iteration 100: error is 60.200620 (50 iterations in 0.02 seconds)
## Iteration 150: error is 59.334936 (50 iterations in 0.02 seconds)
## Iteration 200: error is 56.400981 (50 iterations in 0.02 seconds)
## Iteration 250: error is 58.904631 (50 iterations in 0.02 seconds)
## Iteration 300: error is 1.291788 (50 iterations in 0.03 seconds)
## Iteration 350: error is 0.831682 (50 iterations in 0.03 seconds)
## Iteration 400: error is 0.681425 (50 iterations in 0.02 seconds)
## Iteration 450: error is 0.651650 (50 iterations in 0.01 seconds)
## Iteration 500: error is 0.627297 (50 iterations in 0.01 seconds)
## Iteration 550: error is 0.617058 (50 iterations in 0.01 seconds)
## Iteration 600: error is 0.611218 (50 iterations in 0.01 seconds)
## Iteration 650: error is 0.611641 (50 iterations in 0.02 seconds)
## Iteration 700: error is 0.609564 (50 iterations in 0.02 seconds)
## Iteration 750: error is 0.605577 (50 iterations in 0.01 seconds)
## Iteration 800: error is 0.607848 (50 iterations in 0.01 seconds)
## Iteration 850: error is 0.602385 (50 iterations in 0.02 seconds)
## Iteration 900: error is 0.600724 (50 iterations in 0.02 seconds)
## Iteration 950: error is 0.600881 (50 iterations in 0.01 seconds)
## Iteration 1000: error is 0.599512 (50 iterations in 0.02 seconds)
## Fitting performed in 0.37 seconds.
# Attach product ids and names to the t-SNE coordinates and draw them as an
# interactive scatter plot (hovering over a point shows the product name).
tsne_df <- data.frame(tsne_result$Y)
# unique() keeps the first occurrence of each distinct row, so the rows of
# `unique_dtm` are exactly the non-duplicated rows of `dense_dtm`, in order.
kept_rows <- which(!duplicated(dense_dtm))
# Matrix row k corresponds to the k-th factor level of `tokenized$id`, i.e. the
# k-th smallest id. Translate row numbers back to ids explicitly instead of
# assuming the two are equal.
row_ids <- sort(unique(tokenized$id))
tsne_df$id <- row_ids[kept_rows]
tsne_df <- merge(tsne_df, moisturizers_dry, by = "id")
plot_ly(
  data = tsne_df,
  x = ~X1,
  y = ~X2,
  type = "scatter",
  mode = "markers",
  text = ~Name,
  marker = list(size = 10)
)
# Compare the ingredient lists of the products with id 1 and id 2.
prod1_ingredients <- tokenized[id == 1][["Ingredient"]]
prod2_ingredients <- tokenized[id == 2][["Ingredient"]]

# Split the two lists into the part unique to each product and the overlap.
only_in_prod1 <- setdiff(prod1_ingredients, prod2_ingredients)
only_in_prod2 <- setdiff(prod2_ingredients, prod1_ingredients)
common <- intersect(prod1_ingredients, prod2_ingredients)

list(
  Common = common,
  Only_in_Product_1 = only_in_prod1,
  Only_in_Product_2 = only_in_prod2
)
## $Common
## [1] "water" "sodium benzoate"
##
## $Only_in_Product_1
## [1] "algae (seaweed) extract"
## [2] "mineral oil"
## [3] "petrolatum"
## [4] "glycerin"
## [5] "isohexadecane"
## [6] "microcrystalline wax"
## [7] "lanolin alcohol"
## [8] "citrus aurantifolia (lime) extract"
## [9] "sesamum indicum (sesame) seed oil"
## [10] "eucalyptus globulus (eucalyptus) leaf oil"
## [11] "sesamum indicum (sesame) seed powder"
## [12] "medicago sativa (alfalfa) seed powder"
## [13] "helianthus annuus (sunflower) seedcake"
## [14] "prunus amygdalus dulcis (sweet almond) seed meal"
## [15] "sodium gluconate"
## [16] "copper gluconate"
## [17] "calcium gluconate"
## [18] "magnesium gluconate"
## [19] "zinc gluconate"
## [20] "magnesium sulfate"
## [21] "paraffin"
## [22] "tocopheryl succinate"
## [23] "niacin"
## [24] "beta-carotene"
## [25] "decyl oleate"
## [26] "aluminum distearate"
## [27] "octyldodecanol"
## [28] "citric acid"
## [29] "cyanocobalamin"
## [30] "magnesium stearate"
## [31] "panthenol"
## [32] "limonene"
## [33] "geraniol"
## [34] "linalool"
## [35] "hydroxycitronellal"
## [36] "citronellol"
## [37] "benzyl salicylate"
## [38] "citral"
## [39] "alcohol denat."
## [40] "fragrance."
##
## $Only_in_Product_2
## [1] "galactomyces ferment filtrate (pitera)"
## [2] "butylene glycol"
## [3] "pentylene glycol"
## [4] "methylparaben"
## [5] "sorbic acid."
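This project demonstrates how data science can support skincare recommendations through content-based filtering on cosmetic ingredient lists. We filtered the catalogue to moisturizers suitable for dry skin, tokenized their ingredient lists, built a sparse product-by-ingredient matrix, applied t-SNE to project the products into two dimensions so that products with similar ingredients appear close together, and used Plotly to draw an interactive scatter plot in which the resulting clusters can be explored.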