# install libraries
# only need to run once ever
# comment it out before knitting
#install.packages("dplyr")
#install.packages("sqldf")
#...whatever other packages you are missing
# don't show warnings while knitting
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# constants
my_working_dir_pc <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora"
# if on a PC
myFile_products <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/products.csv"
myFile_reviews <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/reviews.csv"
myFile_authors <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/authors.csv"
# import data
setwd(my_working_dir_pc) # on a PC computer
products <- read.csv(myFile_products)
reviews <- read.csv(myFile_reviews)
authors <- read.csv(myFile_authors)
#head(products) # first 6 rows of the products table
str(products) # structure of the products table
## 'data.frame': 8494 obs. of 24 variables:
## $ product_id : chr "P473671" "P473668" "P473662" "P473660" ...
## $ product_name : chr "Fragrance Discovery Set" "La Habana Eau de Parfum" "Rainbow Bar Eau de Parfum" "Kasbah Eau de Parfum" ...
## $ brand_id : int 6342 6342 6342 6342 6342 6342 6342 6342 6342 6342 ...
## $ brand_name : chr "19-69" "19-69" "19-69" "19-69" ...
## $ loves_count : int 6320 3827 3253 3018 2691 2448 1619 1542 1542 1377 ...
## $ rating : num 3.64 4.15 4.25 4.48 3.23 ...
## $ reviews : int 11 13 16 21 13 21 13 8 7 8 ...
## $ size : chr "" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" ...
## $ variation_type : chr "" "Size + Concentration + Formulation" "Size + Concentration + Formulation" "Size + Concentration + Formulation" ...
## $ variation_value : chr "" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" "3.4 oz/ 100 mL" ...
## $ variation_desc : chr "" "" "" "" ...
## $ ingredients : chr "['Capri Eau de Parfum:', 'Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) D-Limonene, Linalool, Benzyl Sali"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Ethylhexyl Methoxycinnamate, Ethylhexyl Salicylate, Butyl"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) D-Limonene, Ethylhexyl Methoxycinnamate, Butyl Methoxydib"| __truncated__ "['Alcohol Denat. (SD Alcohol 39C), Parfum (Fragrance) Coumarin, Ethylhexyl Methoxycinnamate, Butyl Methoxydiben"| __truncated__ ...
## $ price_usd : num 35 195 195 195 195 30 30 195 195 30 ...
## $ value_price_usd : num NA NA NA NA NA NA NA NA NA NA ...
## $ sale_price_usd : num NA NA NA NA NA NA NA NA NA NA ...
## $ limited_edition : int 0 0 0 0 0 0 0 0 0 0 ...
## $ new : int 0 0 0 0 0 0 0 0 0 0 ...
## $ online_only : int 1 1 1 1 1 1 1 1 1 1 ...
## $ out_of_stock : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sephora_exclusive : int 0 0 0 0 0 0 0 0 0 0 ...
## $ highlights : chr "['Unisex/ Genderless Scent', 'Warm &Spicy Scent', 'Woody & Earthy Scent', 'Fresh Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Warm &Spicy Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Woody & Earthy Scent']" "['Unisex/ Genderless Scent', 'Layerable Scent', 'Warm &Spicy Scent']" ...
## $ primary_category : chr "Fragrance" "Fragrance" "Fragrance" "Fragrance" ...
## $ secondary_category: chr "Value & Gift Sets" "Women" "Women" "Women" ...
## $ tertiary_category : chr "Perfume Gift Sets" "Perfume" "Perfume" "Perfume" ...
#head(reviews) # first 6 rows of the reviews table
str(reviews) # structure of the reviews table
## 'data.frame': 1536 obs. of 12 variables:
## $ review_id : int 1035085 1035086 1035087 1035088 1035089 1035090 1035091 1035092 1035093 1035094 ...
## $ author_id : num 5.23e+09 3.54e+10 2.73e+10 1.70e+09 2.69e+09 ...
## $ product_id : chr "P387511" "P387511" "P387511" "P387511" ...
## $ rating : int 5 2 4 4 4 3 2 5 3 3 ...
## $ is_recommended : int 1 0 0 1 1 0 0 1 0 1 ...
## $ helpfulness : num 1 0.5 1 0.75 0.933 ...
## $ total_feedback_count : int 3 4 6 4 15 4 7 2 12 15 ...
## $ total_neg_feedback_count: int 0 2 0 1 1 2 2 0 7 8 ...
## $ total_pos_feedback_count: int 3 2 6 3 14 2 5 2 5 7 ...
## $ submission_date : chr "11/29/22" "8/12/22" "2/18/22" "1/12/21" ...
## $ review_text : chr "This cream is pricey, but it is so worth it! I started getting acne in my 20s from a stressful job and tried a"| __truncated__ "I used this in rotation with the Divine Youth Oil. Neither of these products left me feeling wowed in any way. "| __truncated__ "I really did like this cream but I don’t think I’d buy it again just because of how expensive it is. It smells "| __truncated__ "Received a 5 ml sample of this cream, and it’s great! I love a high end moisturizer that is thick and rich, but"| __truncated__ ...
## $ review_title : chr "My holy grail" "Not for me!" "" "" ...
# convert the column to type "date"
reviews$submission_date <- as.Date(reviews$submission_date, format = "%m/%d/%y")
# pull in product info by inner join
# call the library, sqldf,
# then the function, which also happens to be called sqldf
reviews_products <- sqldf::sqldf("
SELECT r.*, p.brand_name, p.product_name
FROM reviews r INNER JOIN products p ON r.product_id = p.product_id
")
# get distinct values of brand name and product name
# in the products table
# call the library, dplyr, then the function distinct
dplyr::distinct(reviews_products, brand_name, product_name)
# calculate means
mean(reviews_products$rating)
## [1] 4.038411
mean(reviews_products[reviews_products$brand_name=='L\'Occitane',]$rating)
## [1] 3.880503
# Need the slash because of the ' in L'Occitane
mean(reviews_products[reviews_products$brand_name=='La Mer',]$rating)
## [1] 4.056645
# Review length in characters
reviews_products$review_length <- nchar(reviews_products$review_text)
# Mean review length
mean(reviews_products$review_length, na.rm = TRUE)
## [1] 404.1374
hist(reviews_products$review_length,
main = "Distribution of Review Length",
xlab = "Review Length (Characters)",
breaks = 30)
# Long VS Short Review
median_length <- median(reviews_products$review_length, na.rm = TRUE)
median_length
## [1] 327
reviews_products$length_group <- ifelse(
reviews_products$review_length > median_length,
"Long", "Short"
)
#Count of Long vs Short Reviews
table(reviews_products$length_group)
##
## Long Short
## 763 773
#Proportion of Long VS SHort Reviews
prop.table(table(reviews_products$length_group))
##
## Long Short
## 0.4967448 0.5032552
#Count by Star Rating
table(reviews_products$rating, reviews_products$length_group)
##
## Long Short
## 1 64 103
## 2 53 65
## 3 74 48
## 4 123 88
## 5 449 469
# plot histograms. Lay out the 3 graphs
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
hTotal <- hist(reviews_products$rating, breaks = seq(0, 5, by = 1),
main = 'Distribution of Ratings', xlab = 'Review Ratings (in Stars)')
h1 <- hist(reviews[reviews_products$brand_name == 'L\'Occitane',]$rating, breaks = seq(0, 5, by = 1),
main = 'Distribution of L\'Occitane Ratings', xlab = 'Review Ratings (in Stars)')
h2 <- hist(reviews[reviews_products$brand_name == 'La Mer',]$rating, breaks = seq(0, 5, by = 1),
main = 'Distribution of La Mer Ratings', xlab = 'Review Ratings (in Stars)')
# Assuming 'loccitane_reviews' is your DataFrame and 'rating' is the column with the ratings for L'Occitane products
# First, make sure you have the correct data filtered for L'Occitane. Here's a mock-up for loading and filtering data.
# Load the data
#reviews <- read.csv("path_to_reviews.csv")
#products <- read.csv("path_to_products.csv")
my_working_dir_mac <- "~/Desktop/MKTAnalytics/Data/Sephora" # working directory if on a Mac
my_working_dir_pc <- "C:\\Users\\rebec\\OneDrive\\Desktop\\MKTAnalytics\\Data\\Sephora" # if on a PC
myFile_products <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/products.csv"
myFile_reviews <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/reviews.csv"
myFile_authors <- "C:/Users/risar/Downloads/Lehigh/MSBA - 2025/Spring 2026 Classes/MKT 326/Sephora/Sephora/authors.csv"
#setwd(my_working_dir_mac)
products <- read.csv(myFile_products)
reviews <- read.csv(myFile_reviews)
authors <- read.csv(myFile_authors)
# Merge reviews with products
review_product_data <- merge(reviews, products, by = "product_id")
# Filter for L'Occitane products
loccitane_reviews <- subset(review_product_data, brand_name == "L'Occitane")
# Ensure the rating column is numeric
#loccitane_reviews$rating <- as.numeric(loccitane_reviews$rating)
loccitane_reviews$rating.x <- as.numeric(loccitane_reviews$rating.x)
# Handling possible NAs in rating after conversion
#loccitane_reviews <- loccitane_reviews[!is.na(loccitane_reviews$rating), ]
loccitane_reviews <- loccitane_reviews[!is.na(loccitane_reviews$rating.x), ]
# Plot histogram of ratings
#hist(loccitane_reviews$rating,
hist(loccitane_reviews$rating.x,
breaks = seq(0.5, 5.5, by = 1), # Adjust breaks to suit the range and bin width you want
main = "Histogram of Ratings for L'Occitane Products",
xlab = "Ratings",
ylab = "Frequency",
col = "blue",
border = "black")
# Adding a grid for better visualization
grid(nx = NULL, ny = NULL, col = "gray", lty = "dotted")
#install.packages("vader")
library(vader)
# Filter for one-star reviews only
one_star <- subset(reviews_products, rating == 1)
# Remove missing review text
one_star <- one_star[!is.na(one_star$review_text), ]
# Calculate VADER compound sentiment scores
one_star$vader <- vader_df(one_star$review_text)$compound
# Average VADER scores: Long vs Short
aggregate(vader ~ length_group, data = one_star, mean)
# T-test: Long vs Short one-star reviews
t.test(vader ~ length_group, data = one_star)
##
## Welch Two Sample t-test
##
## data: vader by length_group
## t = 2.2084, df = 114.02, p-value = 0.02922
## alternative hypothesis: true difference in means between group Long and group Short is not equal to 0
## 95 percent confidence interval:
## 0.02096895 0.38630563
## sample estimates:
## mean in group Long mean in group Short
## 0.30173437 0.09809709
# Filter for five-star reviews only
five_star <- subset(reviews_products, rating == 5)
# Remove missing review text
five_star <- five_star[!is.na(five_star$review_text), ]
# Calculate VADER compound sentiment scores
five_star$vader <- vader_df(five_star$review_text)$compound
# Average VADER scores: Long vs Short
aggregate(vader ~ length_group, data = five_star, mean)
# T-test: Long vs Short five-star reviews
t.test(vader ~ length_group, data = five_star)
##
## Welch Two Sample t-test
##
## data: vader by length_group
## t = 6.7245, df = 900.07, p-value = 3.128e-11
## alternative hypothesis: true difference in means between group Long and group Short is not equal to 0
## 95 percent confidence interval:
## 0.1013724 0.1849338
## sample estimates:
## mean in group Long mean in group Short
## 0.8315902 0.6884371