Install libraries if needed

# install libraries
# only need to run once ever
# comment it out before knitting
#install.packages("dplyr")
#install.packages("sqldf")

#...whatever other packages you are missing
# don't show warnings while knitting
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

Constants

# constants
my_working_dir_pc <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora"
 # if on a PC

myFile_products <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/products.csv"

myFile_reviews  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/reviews.csv"

myFile_authors  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/authors.csv"

Import data

# import data
setwd(my_working_dir_pc) # on a PC computer
products <- read.csv(myFile_products)
reviews <- read.csv(myFile_reviews)
authors <- read.csv(myFile_authors)

Take a look at the products table

#head(products) # first 6 rows of the products table
str(products) # structure of the products table
## 'data.frame':    8494 obs. of  24 variables:
##  $ product_id        : chr  "P473671" "P473668" "P473662" "P473660" ...
##  $ product_name      : chr  "Fragrance Discovery Set" "La Habana Eau de Parfum" "Rainbow Bar Eau de Parfum" "Kasbah Eau de Parfum" ...
##  $ brand_id          : int  6342 6342 6342 6342 6342 6342 6342 6342 6342 6342 ...
##  $ brand_name        : chr  "19-69" "19-69" "19-69" "19-69" ...
##  $ loves_count       : int  6320 3827 3253 3018 2691 2448 1619 1542 1542 1377 ...
##  $ rating            : num  3.64 4.15 4.25 4.48 3.23 ...
##  $ reviews           : int  11 13 16 21 13 21 13 8 7 8 ...
##  $ size              : chr  "" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" ...
##  $ variation_type    : chr  "" "Size + Concentration + Formulation" "Size + Concentration + Formulation" "Size + Concentration + Formulation" ...
##  $ variation_value   : chr  "" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" ...
##  $ variation_desc    : chr  "" "" "" "" ...
##  $ ingredients       : chr  "['Capri Eau de Parfum:', 'Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) D-Limonene, Linalool, Benzyl Sali"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Ethylhexyl Methoxycinnamate, Ethylhexyl Salicylate, Butyl"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) D-Limonene, Ethylhexyl Methoxycinnamate, Butyl Methoxydib"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Coumarin, Ethylhexyl Methoxycinnamate, Butyl Methoxydiben"| __truncated__ ...
##  $ price_usd         : num  35 195 195 195 195 30 30 195 195 30 ...
##  $ value_price_usd   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sale_price_usd    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ limited_edition   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ new               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ online_only       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ out_of_stock      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sephora_exclusive : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highlights        : chr  "['Unisex/ Genderless Scent', 'Warm &Spicy Scent', 'Woody & Earthy Scent', 'Fresh Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Warm &Spicy Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Woody & Earthy Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Warm &Spicy Scent']" ...
##  $ primary_category  : chr  "Fragrance" "Fragrance" "Fragrance" "Fragrance" ...
##  $ secondary_category: chr  "Value & Gift Sets" "Women" "Women" "Women" ...
##  $ tertiary_category : chr  "Perfume Gift Sets" "Perfume" "Perfume" "Perfume" ...

Take a look at the reviews table

#head(reviews) # first 6 rows of the reviews table
str(reviews) # structure of the reviews table
## 'data.frame':    1536 obs. of  12 variables:
##  $ review_id               : int  1035085 1035086 1035087 1035088 1035089 1035090 1035091 1035092 1035093 1035094 ...
##  $ author_id               : num  5.23e+09 3.54e+10 2.73e+10 1.70e+09 2.69e+09 ...
##  $ product_id              : chr  "P387511" "P387511" "P387511" "P387511" ...
##  $ rating                  : int  5 2 4 4 4 3 2 5 3 3 ...
##  $ is_recommended          : int  1 0 0 1 1 0 0 1 0 1 ...
##  $ helpfulness             : num  1 0.5 1 0.75 0.933 ...
##  $ total_feedback_count    : int  3 4 6 4 15 4 7 2 12 15 ...
##  $ total_neg_feedback_count: int  0 2 0 1 1 2 2 0 7 8 ...
##  $ total_pos_feedback_count: int  3 2 6 3 14 2 5 2 5 7 ...
##  $ submission_date         : chr  "11/29/22" "8/12/22" "2/18/22" "1/12/21" ...
##  $ review_text             : chr  "This cream is pricey, but it is so worth it! I started getting acne in my 20s from a stressful job  and tried a"| __truncated__ "I used this in rotation with the Divine Youth Oil. Neither of these products left me feeling wowed in any way. "| __truncated__ "I really did like this cream but I don’t think I’d buy it again just because of how expensive it is. It smells "| __truncated__ "Received a 5 ml sample of this cream, and it’s great! I love a high end moisturizer that is thick and rich, but"| __truncated__ ...
##  $ review_title            : chr  "My holy grail" "Not for me!" "" "" ...
# convert the column to type "date"
reviews$submission_date <- as.Date(reviews$submission_date, format = "%m/%d/%y")

Take a look at the authors table

# look at the first 6 rows of the authors table
#head(authors) # first 6 rows of the authors table
str(authors)
## 'data.frame':    1526 obs. of  5 variables:
##  $ author_id : num  5.23e+09 3.54e+10 2.73e+10 1.70e+09 2.69e+09 ...
##  $ skin_tone : chr  "" "mediumTan" "light" "light" ...
##  $ eye_color : chr  "brown" "brown" "brown" "brown" ...
##  $ skin_type : chr  "dry" "combination" "combination" "combination" ...
##  $ hair_color: chr  "black" "black" "black" "black" ...

SQL to pull in product info to the reviews table

# pull in product info by inner join
# call the library, sqldf, 
# then the function, which also happens to be called sqldf
reviews_products <- sqldf::sqldf(" 
SELECT r.*, p.brand_name, p.product_name
FROM reviews r INNER JOIN products p ON r.product_id = p.product_id
                      ")

Get distinct values

# get distinct values of brand name and product name
# in the products table
# call the library, dplyr, then the function distinct
dplyr::distinct(reviews_products, brand_name, product_name)

Calculate the means

# calculate means
mean(reviews_products$rating)
## [1] 4.038411
mean(reviews_products[reviews_products$brand_name=='L\'Occitane',]$rating)
## [1] 3.880503
# Need the slash because of the ' in L'Occitane

mean(reviews_products[reviews_products$brand_name=='La Mer',]$rating)
## [1] 4.056645

Review Length

# Review length in characters
reviews_products$review_length <- nchar(reviews_products$review_text)

# Mean review length
mean(reviews_products$review_length, na.rm = TRUE)
## [1] 404.1374

Histogram of Review Length

hist(reviews_products$review_length,
     main = "Distribution of Review Length",
     xlab = "Review Length (Characters)",
     breaks = 30)

# Long VS Short Review

median_length <- median(reviews_products$review_length, na.rm = TRUE)
median_length
## [1] 327
reviews_products$length_group <- ifelse(
  reviews_products$review_length > median_length,
  "Long", "Short"
)

#Count of Long vs Short Reviews
table(reviews_products$length_group)
## 
##  Long Short 
##   763   773
#Proportion of Long VS SHort Reviews
prop.table(table(reviews_products$length_group))
## 
##      Long     Short 
## 0.4967448 0.5032552
#Count by Star Rating
table(reviews_products$rating, reviews_products$length_group)
##    
##     Long Short
##   1   64   103
##   2   53    65
##   3   74    48
##   4  123    88
##   5  449   469

Plot histograms

# plot histograms. Lay out the 3 graphs
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))

hTotal <- hist(reviews_products$rating, breaks = seq(0, 5, by = 1),
           main = 'Distribution of Ratings', xlab = 'Review Ratings (in Stars)')

h1 <- hist(reviews[reviews_products$brand_name == 'L\'Occitane',]$rating, breaks = seq(0, 5, by = 1),
           main = 'Distribution of L\'Occitane Ratings', xlab = 'Review Ratings (in Stars)')

h2 <- hist(reviews[reviews_products$brand_name == 'La Mer',]$rating, breaks = seq(0, 5, by = 1),
           main = 'Distribution of La Mer Ratings', xlab = 'Review Ratings (in Stars)')

Code generated by GPT

# Assuming 'loccitane_reviews' is your DataFrame and 'rating' is the column with the ratings for L'Occitane products
# First, make sure you have the correct data filtered for L'Occitane. Here's a mock-up for loading and filtering data.

# Load the data
#reviews <- read.csv("path_to_reviews.csv")
#products <- read.csv("path_to_products.csv")
my_working_dir_mac <- "~/Desktop/MKTAnalytics/Data/Sephora" # working directory if on a Mac
my_working_dir_pc <- "C:\\Users\\rebec\\OneDrive\\Desktop\\MKTAnalytics\\Data\\Sephora" # if on a PC

myFile_products <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/products.csv"

myFile_reviews  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/reviews.csv"

myFile_authors  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/authors.csv"

#setwd(my_working_dir_mac)
products <- read.csv(myFile_products)
reviews <- read.csv(myFile_reviews)
authors <- read.csv(myFile_authors)

# Merge reviews with products
review_product_data <- merge(reviews, products, by = "product_id")

# Filter for L'Occitane products
loccitane_reviews <- subset(review_product_data, brand_name == "L'Occitane")

# Ensure the rating column is numeric
#loccitane_reviews$rating <- as.numeric(loccitane_reviews$rating)
loccitane_reviews$rating.x <- as.numeric(loccitane_reviews$rating.x)

# Handling possible NAs in rating after conversion
#loccitane_reviews <- loccitane_reviews[!is.na(loccitane_reviews$rating), ]
loccitane_reviews <- loccitane_reviews[!is.na(loccitane_reviews$rating.x), ]

# Plot histogram of ratings
#hist(loccitane_reviews$rating, 
hist(loccitane_reviews$rating.x, 
     breaks = seq(0.5, 5.5, by = 1), # Adjust breaks to suit the range and bin width you want
     main = "Histogram of Ratings for L'Occitane Products",
     xlab = "Ratings",
     ylab = "Frequency",
     col = "blue",
     border = "black")

# Adding a grid for better visualization
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

Vader Sentiment Scores

#install.packages("vader")
library(vader)

# Filter for one-star reviews only
one_star <- subset(reviews_products, rating == 1)

# Remove missing review text
one_star <- one_star[!is.na(one_star$review_text), ]

# Calculate VADER compound sentiment scores
one_star$vader <- vader_df(one_star$review_text)$compound

# Average VADER scores: Long vs Short
aggregate(vader ~ length_group, data = one_star, mean)
# T-test: Long vs Short one-star reviews
t.test(vader ~ length_group, data = one_star)
## 
##  Welch Two Sample t-test
## 
## data:  vader by length_group
## t = 2.2084, df = 114.02, p-value = 0.02922
## alternative hypothesis: true difference in means between group Long and group Short is not equal to 0
## 95 percent confidence interval:
##  0.02096895 0.38630563
## sample estimates:
##  mean in group Long mean in group Short 
##          0.30173437          0.09809709

5 Stars - Long VS Short

# Filter for five-star reviews only
five_star <- subset(reviews_products, rating == 5)

# Remove missing review text
five_star <- five_star[!is.na(five_star$review_text), ]

# Calculate VADER compound sentiment scores
five_star$vader <- vader_df(five_star$review_text)$compound

# Average VADER scores: Long vs Short
aggregate(vader ~ length_group, data = five_star, mean)
# T-test: Long vs Short five-star reviews
t.test(vader ~ length_group, data = five_star)
## 
##  Welch Two Sample t-test
## 
## data:  vader by length_group
## t = 6.7245, df = 900.07, p-value = 3.128e-11
## alternative hypothesis: true difference in means between group Long and group Short is not equal to 0
## 95 percent confidence interval:
##  0.1013724 0.1849338
## sample estimates:
##  mean in group Long mean in group Short 
##           0.8315902           0.6884371