Install libraries if needed

# install libraries
# only need to run once ever
# comment it out before knitting
#install.packages("dplyr")
#install.packages("sqldf")

#...whatever other packages you are missing

# don't show warnings while knitting
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

Constants

# constants
my_working_dir_pc <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora"
 # if on a PC

myFile_products <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/products.csv"

myFile_reviews  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/reviews.csv"

myFile_authors  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/authors.csv"

Import data

# import data
setwd(my_working_dir_pc) # on a PC computer
products <- read.csv(myFile_products)
reviews <- read.csv(myFile_reviews)
authors <- read.csv(myFile_authors)

Take a look at the products table

#head(products) # first 6 rows of the products table
str(products) # structure of the products table

## 'data.frame':    8494 obs. of  24 variables:
##  $ product_id        : chr  "P473671" "P473668" "P473662" "P473660" ...
##  $ product_name      : chr  "Fragrance Discovery Set" "La Habana Eau de Parfum" "Rainbow Bar Eau de Parfum" "Kasbah Eau de Parfum" ...
##  $ brand_id          : int  6342 6342 6342 6342 6342 6342 6342 6342 6342 6342 ...
##  $ brand_name        : chr  "19-69" "19-69" "19-69" "19-69" ...
##  $ loves_count       : int  6320 3827 3253 3018 2691 2448 1619 1542 1542 1377 ...
##  $ rating            : num  3.64 4.15 4.25 4.48 3.23 ...
##  $ reviews           : int  11 13 16 21 13 21 13 8 7 8 ...
##  $ size              : chr  "" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" ...
##  $ variation_type    : chr  "" "Size + Concentration + Formulation" "Size + Concentration + Formulation" "Size + Concentration + Formulation" ...
##  $ variation_value   : chr  "" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" ...
##  $ variation_desc    : chr  "" "" "" "" ...
##  $ ingredients       : chr  "['Capri Eau de Parfum:', 'Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) D-Limonene, Linalool, Benzyl Sali"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Ethylhexyl Methoxycinnamate, Ethylhexyl Salicylate, Butyl"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) D-Limonene, Ethylhexyl Methoxycinnamate, Butyl Methoxydib"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Coumarin, Ethylhexyl Methoxycinnamate, Butyl Methoxydiben"| __truncated__ ...
##  $ price_usd         : num  35 195 195 195 195 30 30 195 195 30 ...
##  $ value_price_usd   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ sale_price_usd    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ limited_edition   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ new               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ online_only       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ out_of_stock      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sephora_exclusive : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ highlights        : chr  "['Unisex/ Genderless Scent', 'Warm &Spicy Scent', 'Woody & Earthy Scent', 'Fresh Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Warm &Spicy Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Woody & Earthy Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Warm &Spicy Scent']" ...
##  $ primary_category  : chr  "Fragrance" "Fragrance" "Fragrance" "Fragrance" ...
##  $ secondary_category: chr  "Value & Gift Sets" "Women" "Women" "Women" ...
##  $ tertiary_category : chr  "Perfume Gift Sets" "Perfume" "Perfume" "Perfume" ...

Take a look at the reviews table

#head(reviews) # first 6 rows of the reviews table
str(reviews) # structure of the reviews table

## 'data.frame':    1536 obs. of  12 variables:
##  $ review_id               : int  1035085 1035086 1035087 1035088 1035089 1035090 1035091 1035092 1035093 1035094 ...
##  $ author_id               : num  5.23e+09 3.54e+10 2.73e+10 1.70e+09 2.69e+09 ...
##  $ product_id              : chr  "P387511" "P387511" "P387511" "P387511" ...
##  $ rating                  : int  5 2 4 4 4 3 2 5 3 3 ...
##  $ is_recommended          : int  1 0 0 1 1 0 0 1 0 1 ...
##  $ helpfulness             : num  1 0.5 1 0.75 0.933 ...
##  $ total_feedback_count    : int  3 4 6 4 15 4 7 2 12 15 ...
##  $ total_neg_feedback_count: int  0 2 0 1 1 2 2 0 7 8 ...
##  $ total_pos_feedback_count: int  3 2 6 3 14 2 5 2 5 7 ...
##  $ submission_date         : chr  "11/29/22" "8/12/22" "2/18/22" "1/12/21" ...
##  $ review_text             : chr  "This cream is pricey, but it is so worth it! I started getting acne in my 20s from a stressful job  and tried a"| __truncated__ "I used this in rotation with the Divine Youth Oil. Neither of these products left me feeling wowed in any way. "| __truncated__ "I really did like this cream but I don’t think I’d buy it again just because of how expensive it is. It smells "| __truncated__ "Received a 5 ml sample of this cream, and it’s great! I love a high end moisturizer that is thick and rich, but"| __truncated__ ...
##  $ review_title            : chr  "My holy grail" "Not for me!" "" "" ...

# convert the column to type "date"
reviews$submission_date <- as.Date(reviews$submission_date, format = "%m/%d/%y")

Take a look at the authors table

# look at the first 6 rows of the authors table
#head(authors) # first 6 rows of the authors table
str(authors)

## 'data.frame':    1526 obs. of  5 variables:
##  $ author_id : num  5.23e+09 3.54e+10 2.73e+10 1.70e+09 2.69e+09 ...
##  $ skin_tone : chr  "" "mediumTan" "light" "light" ...
##  $ eye_color : chr  "brown" "brown" "brown" "brown" ...
##  $ skin_type : chr  "dry" "combination" "combination" "combination" ...
##  $ hair_color: chr  "black" "black" "black" "black" ...

SQL to pull in product info to the reviews table

# pull in product info by inner join
# call the library, sqldf, 
# then the function, which also happens to be called sqldf
reviews_products <- sqldf::sqldf(" 
SELECT r.*, p.brand_name, p.product_name
FROM reviews r INNER JOIN products p ON r.product_id = p.product_id
                      ")

Get distinct values

# get distinct values of brand name and product name
# in the products table
# call the library, dplyr, then the function distinct
dplyr::distinct(reviews_products, brand_name, product_name)

Calculate the means

# calculate means
mean(reviews_products$rating)

## [1] 4.038411

mean(reviews_products[reviews_products$brand_name=='L\'Occitane',]$rating)

## [1] 3.880503

# Need the slash because of the ' in L'Occitane

mean(reviews_products[reviews_products$brand_name=='La Mer',]$rating)

## [1] 4.056645

Review Length

# Review length in characters
reviews_products$review_length <- nchar(reviews_products$review_text)

# Mean review length
mean(reviews_products$review_length, na.rm = TRUE)

## [1] 404.1374

Histogram of Review Length

hist(reviews_products$review_length,
     main = "Distribution of Review Length",
     xlab = "Review Length (Characters)",
     breaks = 30)

# Long VS Short Review

median_length <- median(reviews_products$review_length, na.rm = TRUE)
median_length

## [1] 327

reviews_products$length_group <- ifelse(
  reviews_products$review_length > median_length,
  "Long", "Short"
)

#Count of Long vs Short Reviews
table(reviews_products$length_group)

## 
##  Long Short 
##   763   773

#Proportion of Long VS SHort Reviews
prop.table(table(reviews_products$length_group))

## 
##      Long     Short 
## 0.4967448 0.5032552

#Count by Star Rating
table(reviews_products$rating, reviews_products$length_group)

##    
##     Long Short
##   1   64   103
##   2   53    65
##   3   74    48
##   4  123    88
##   5  449   469

Plot histograms

# plot histograms. Lay out the 3 graphs
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))

hTotal <- hist(reviews_products$rating, breaks = seq(0, 5, by = 1),
           main = 'Distribution of Ratings', xlab = 'Review Ratings (in Stars)')

h1 <- hist(reviews[reviews_products$brand_name == 'L\'Occitane',]$rating, breaks = seq(0, 5, by = 1),
           main = 'Distribution of L\'Occitane Ratings', xlab = 'Review Ratings (in Stars)')

h2 <- hist(reviews[reviews_products$brand_name == 'La Mer',]$rating, breaks = seq(0, 5, by = 1),
           main = 'Distribution of La Mer Ratings', xlab = 'Review Ratings (in Stars)')

Code generated by GPT

# Assuming 'loccitane_reviews' is your DataFrame and 'rating' is the column with the ratings for L'Occitane products
# First, make sure you have the correct data filtered for L'Occitane. Here's a mock-up for loading and filtering data.

# Load the data
#reviews <- read.csv("path_to_reviews.csv")
#products <- read.csv("path_to_products.csv")
my_working_dir_mac <- "~/Desktop/MKTAnalytics/Data/Sephora" # working directory if on a Mac
my_working_dir_pc <- "C:\\Users\\rebec\\OneDrive\\Desktop\\MKTAnalytics\\Data\\Sephora" # if on a PC

myFile_products <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/products.csv"

myFile_reviews  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/reviews.csv"

myFile_authors  <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/authors.csv"

#setwd(my_working_dir_mac)
products <- read.csv(myFile_products)
reviews <- read.csv(myFile_reviews)
authors <- read.csv(myFile_authors)

# Merge reviews with products
review_product_data <- merge(reviews, products, by = "product_id")

# Filter for L'Occitane products
loccitane_reviews <- subset(review_product_data, brand_name == "L'Occitane")

# Ensure the rating column is numeric
#loccitane_reviews$rating <- as.numeric(loccitane_reviews$rating)
loccitane_reviews$rating.x <- as.numeric(loccitane_reviews$rating.x)

# Handling possible NAs in rating after conversion
#loccitane_reviews <- loccitane_reviews[!is.na(loccitane_reviews$rating), ]
loccitane_reviews <- loccitane_reviews[!is.na(loccitane_reviews$rating.x), ]

# Plot histogram of ratings
#hist(loccitane_reviews$rating, 
hist(loccitane_reviews$rating.x, 
     breaks = seq(0.5, 5.5, by = 1), # Adjust breaks to suit the range and bin width you want
     main = "Histogram of Ratings for L'Occitane Products",
     xlab = "Ratings",
     ylab = "Frequency",
     col = "blue",
     border = "black")

# Adding a grid for better visualization
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")

Vader Sentiment Scores

#install.packages("vader")
library(vader)

# Filter for one-star reviews only
one_star <- subset(reviews_products, rating == 1)

# Remove missing review text
one_star <- one_star[!is.na(one_star$review_text), ]

# Calculate VADER compound sentiment scores
one_star$vader <- vader_df(one_star$review_text)$compound

# Average VADER scores: Long vs Short
aggregate(vader ~ length_group, data = one_star, mean)

# T-test: Long vs Short one-star reviews
t.test(vader ~ length_group, data = one_star)

## 
##  Welch Two Sample t-test
## 
## data:  vader by length_group
## t = 2.2084, df = 114.02, p-value = 0.02922
## alternative hypothesis: true difference in means between group Long and group Short is not equal to 0
## 95 percent confidence interval:
##  0.02096895 0.38630563
## sample estimates:
##  mean in group Long mean in group Short 
##          0.30173437          0.09809709

5 Stars - Long VS Short

# Filter for five-star reviews only
five_star <- subset(reviews_products, rating == 5)

# Remove missing review text
five_star <- five_star[!is.na(five_star$review_text), ]

# Calculate VADER compound sentiment scores
five_star$vader <- vader_df(five_star$review_text)$compound

# Average VADER scores: Long vs Short
aggregate(vader ~ length_group, data = five_star, mean)

# T-test: Long vs Short five-star reviews
t.test(vader ~ length_group, data = five_star)

## 
##  Welch Two Sample t-test
## 
## data:  vader by length_group
## t = 6.7245, df = 900.07, p-value = 3.128e-11
## alternative hypothesis: true difference in means between group Long and group Short is not equal to 0
## 95 percent confidence interval:
##  0.1013724 0.1849338
## sample estimates:
##  mean in group Long mean in group Short 
##           0.8315902           0.6884371

Listening and Online Reviews - Sephora

Install libraries if needed

Constants

Import data

Take a look at the products table

Take a look at the reviews table

Take a look at the authors table

SQL to pull in product info to the reviews table

Get distinct values

Calculate the means

Review Length

Histogram of Review Length

Plot histograms

Code generated by GPT

Vader Sentiment Scores

5 Stars - Long VS Short