This report presents an analysis of customer reviews for the online clothing retailer Fashion Nova. The dataset includes the reviewer’s name, country of origin, review date and time, rating, and the content of each review, covering 131,914 unique reviews from customers in 182 different countries. This analysis aims to provide valuable insights into consumer behavior, offering the company a clearer understanding of customer preferences. These insights can support strategic decision-making and business improvement efforts.
# import data
# Only blank fields are treated as missing. With the default na.strings = "NA"
# the literal text "NA" is read as a missing value, which is unsafe here:
# NOTE(review): the country column holds two-letter ISO codes, where "NA" is
# Namibia; presumably the NA seen in unique(df$country) is that code being
# swallowed -- confirm against the raw CSV.
df <- read.csv(
  "~/Case Studies/Fashion Nova Case Study/fashionnova_reviews.csv",
  na.strings = ""
)
# import libraries
library('stringr')
library('dplyr')
library('tidyr')
library('magrittr')
library('ggplot2')
library('patchwork')
library('wordcloud')
library('tm')
# preview the first six rows of the raw data
head(df, n = 6L)
## Reviewer.Name Profile.Link Country Review.Count
## 1 Champagne /users/66c78240087b6269ffbcb5fb US 1 review
## 2 Vg customer /users/6618fdb53d4198001210cbe7 VG 3 reviews
## 3 Colleen Burgher /users/64e9595206be1a001244ff73 US 3 reviews
## 4 R.G.M /users/66c58ad1c6ab36352a08f57a US 1 review
## 5 Rosalyn Cousar /users/60ad4b6ef3788e001adbb8e3 US 5 reviews
## 6 Kim Weatherspoon /users/5bea9ff75483f4b8a9d4b063 US 2 reviews
## Review.Date Rating
## 1 2024-08-22T20:24:02.000Z Rated 5 out of 5 stars
## 2 2024-08-21T05:43:11.000Z Rated 5 out of 5 stars
## 3 2024-08-21T17:09:14.000Z Rated 5 out of 5 stars
## 4 2024-08-21T08:36:03.000Z Rated 5 out of 5 stars
## 5 2024-08-22T00:46:16.000Z Rated 3 out of 5 stars
## 6 2024-08-15T07:02:43.000Z Rated 5 out of 5 stars
## Review.Title
## 1 I love ordering from fashion nova
## 2 Top tier content for fashion nova
## 3 Prices and quality of products are…
## 4 Great customer service
## 5 False advertising
## 6 Love the sales
## Review.Text
## 1 I love ordering from fashion nova. The clothes are good quality as long as you read your reviews on the items so you are picking the right sizes you should be comfortable and happy, I even got my BF putting in his orders lol.
## 2 Always amazing clothes and the fast shipping is really good but let’s talk about the CLOTHES omggggg the stretchy fabrics I loveeee . If you need a quick and easy fit fashion nova is the store for you I’m an international customer trust me fashion nova will never disappoint.. love them so much
## 3 Prices and quality of products are GREAT Would love to see a cash refund if I have to return an unused item. Bought extra items since I forget to add expedited shipping, sent an email to change shipping method and I would pay extra but no response so I ended up with double items. Other than that great job.
## 4 Great customer service. I was helped until the Fashion Nova was able to provide a solution to my problem. I appreciate what they did for me. I love their products and I’m a very happy customer who will continue to support and buy from this brand!!!!
## 5 Disappointing experience. You don’t live up to your advertising! BOGO should be applied to all items being purchased. If more than 2 items are part of purchase then BOGO should continue to be applied to whole purchase and all items! The advertising is false and not true. I did not read anywhere that BOGO is only to be applied once during purchase. The company is being shady!!!
## 6 Love the sales, and huge discounts. The variety and quality wins me over every time. Lastly, the ease of returning items if I'm not satisfied gives me confidence when purchasing online.
## Date.of.Experience
## 1 August 22, 2024
## 2 August 18, 2024
## 3 August 21, 2024
## 4 August 20, 2024
## 5 August 21, 2024
## 6 August 14, 2024
# list the column names as read from the CSV
names(df)
## [1] "Reviewer.Name" "Profile.Link" "Country"
## [4] "Review.Count" "Review.Date" "Rating"
## [7] "Review.Title" "Review.Text" "Date.of.Experience"
# normalise column names: lower case, with "_" in place of "."
colnames(df) <- gsub(".", "_", tolower(colnames(df)), fixed = TRUE)
# the raw review_date column holds a full timestamp, so name it accordingly
df <- df %>%
  rename(review_date_time = review_date)
# capitalise the first letter of each word
df$reviewer_name <- str_to_title(df$reviewer_name)
# turn ISO timestamps such as "2024-08-22T20:24:02.000Z" into
# "2024-08-22 20:24:02": swap the "T" separator for a space and drop the
# trailing (fractional seconds +) "Z" without leaving a trailing blank
df$review_date_time <- sub("T", " ", df$review_date_time, fixed = TRUE)
df$review_date_time <- sub("(\\.\\d+)?Z$", "", df$review_date_time)
# see unique values in country column
unique(df$country)
## [1] "US" "VG" "CA" "PT" "MY" "BM" "IQ" "TT" "DE" "GB" "JM" "NG" "PL" "IN" "MX"
## [16] "BS" "AU" "PA" "KN" "KE" "BB" "BR" "FR" "AF" NA "PS" "BZ" "LS" "AE" "SC"
## [31] "VI" "LC" "GR" "XK" "BE" "NL" "GP" "AI" "ZA" "NZ" "KW" "HR" "SR" "DK" "IE"
## [46] "CV" "SG" "AG" "ZM" "CM" "GD" "ES" "CW" "CO" "LT" "TR" "DZ" "MC" "DO" "GY"
## [61] "RU" "CL" "SE" "PR" "TN" "LB" "BO" "DM" "HK" "MQ" "RW" "HT" "PE" "FI" "KY"
## [76] "GE" "LV" "SN" "PK" "TC" "MZ" "TH" "AT" "KR" "IL" "BW" "SA" "GU" "GH" "CH"
## [91] "DJ" "AW" "JP" "NO" "IS" "CI" "IT" "GT" "VC" "VE" "EC" "SV" "MV" "AO" "MU"
## [106] "SL" "CR" "EG" "AR" "HN" "AL" "SS" "SX" "BA" "TZ" "BQ" "MA" "NP" "RE" "MW"
## [121] "NI" "RO" "PF" "ZW" "TW" "PH" "CY" "GL" "VN" "HU" "QA" "UG" "UY" "BH" "RS"
## [136] "ET" "MK" "VU" "MP" "SZ" "AM" "MF" "AZ" "SK" "CZ" "MT" "KH" "SI" "BD" "LU"
## [151] "GF" "ID" "FO" "NC" "OM" "UZ" "BG" "UA" "CD" "CU" "PG" "CG" "KZ" "EE" "ME"
## [166] "LK" "CN" "AS" "LR" "GM" "BL" "JO" "PW" "FK" "SO" "MO" "KG" "PY" "TG" "MD"
## [181] "GI" "BY"
# see unique values in rating column
unique(df$rating)
## [1] "Rated 5 out of 5 stars" "Rated 3 out of 5 stars" "Rated 2 out of 5 stars"
## [4] "Rated 1 out of 5 stars" "Rated 4 out of 5 stars"
# every rating reads "Rated N out of 5 stars"; keep N, the first digit in the
# string, instead of relying on a fixed character position
df$rating <- str_extract(df$rating, "\\d")
# review_count reads "<n> review(s)"; keep the whole leading number so that a
# count with two or more digits is not cut down to its first digit
df$review_count <- str_extract(df$review_count, "^\\d+")
# see structure of data
str(df)
## 'data.frame': 131980 obs. of 9 variables:
## $ reviewer_name : chr "Champagne" "Vg Customer" "Colleen Burgher" "R.g.m" ...
## $ profile_link : chr "/users/66c78240087b6269ffbcb5fb" "/users/6618fdb53d4198001210cbe7" "/users/64e9595206be1a001244ff73" "/users/66c58ad1c6ab36352a08f57a" ...
## $ country : chr "US" "VG" "US" "US" ...
## $ review_count : chr "1" "3" "3" "1" ...
## $ review_date_time : chr "2024-08-22 20:24:02 " "2024-08-21 05:43:11 " "2024-08-21 17:09:14 " "2024-08-21 08:36:03 " ...
## $ rating : chr "5" "5" "5" "5" ...
## $ review_title : chr "I love ordering from fashion nova" "Top tier content for fashion nova " "Prices and quality of products are…" "Great customer service" ...
## $ review_text : chr "I love ordering from fashion nova. The clothes are good quality as long as you read your reviews on the items s"| __truncated__ "Always amazing clothes and the fast shipping is really good but let’s talk about the CLOTHES omggggg the stretc"| __truncated__ "Prices and quality of products are GREAT Would love to see a cash refund if I have to return an unused item. Bo"| __truncated__ "Great customer service. I was helped until the Fashion Nova was able to provide a solution to my problem. I app"| __truncated__ ...
## $ date_of_experience: chr "August 22, 2024" "August 18, 2024" "August 21, 2024" "August 20, 2024" ...
# change data type for review count, rating, date of experience columns and review_date_time
df$review_count <- as.numeric(df$review_count)
df$rating <- as.numeric(df$rating)
# NOTE(review): "%B" matches month names in the session locale; this assumes
# an English locale -- confirm where the report is knitted
df$date_of_experience <- as.Date(df$date_of_experience, format = "%B %d, %Y")
# the raw timestamps carry a "Z" (UTC) suffix, so parse them as UTC instead of
# the session's local time zone; otherwise the date and time derived below can
# disagree with the timestamp by up to a day
df$review_date_time <- as.POSIXct(df$review_date_time,
                                  format = "%Y-%m-%d %H:%M:%S",
                                  tz = "UTC")
# separate review date and review time into separate columns (both in UTC)
df$review_date <- as.Date(df$review_date_time, tz = "UTC")
df$review_time <- format(df$review_date_time, format = "%H:%M:%S", tz = "UTC")
# review duration: whole days between the experience and the review.
# Convert the difftime straight to a number; going through the first character
# of its text form truncates multi-digit values and turns negative ones
# (leading "-") into NA.
# NOTE(review): a negative value means the review is dated before the stated
# experience date; these are kept as-is so they stay visible -- decide whether
# they should be clamped to 0 or dropped.
df$review_duration <- as.numeric(df$review_date - df$date_of_experience,
                                 units = "days")
## Warning: NAs introduced by coercion
# count rows that are exact copies of an earlier row
length(which(duplicated(df)))
## [1] 66
# keep only the first occurrence of every row
df <- distinct(df)
# count missing cells across the whole data frame
length(which(is.na(df)))
## [1] 34245
# generate the exact location of NAs (rows and columns)
na <- which(is.na(df), arr.ind = TRUE)
# a missing review duration is treated as 0 days
df$review_duration[is.na(df$review_duration)] <- 0
# fill missing text with the placeholder "null". Only character columns are
# touched: a string is not a valid replacement for numeric, Date or date-time
# columns (replace_na() rejects the type mismatch), and those columns must keep
# their type for the statistics computed later.
df <- df %>%
  mutate(across(where(is.character), ~ replace_na(.x, "null")))
# keep necessary columns (the profile link is not used in the analysis)
fndf <- df %>%
  select(-profile_link)
# mean star rating over all reviews
mean(fndf[["rating"]])
## [1] 4.275096
# sum of the review_count column
# NOTE(review): review_count appears to be a per-reviewer figure, so this sum
# may not equal the number of reviews in the dataset (that is nrow(fndf)) --
# confirm the column's meaning
sum(fndf[["review_count"]])
## [1] 200234
# mean number of days between experience and review
mean(fndf[["review_duration"]])
## [1] 0.1428885
# number of distinct values in the country column (a missing-value placeholder,
# if present, is counted as one value)
length(unique(fndf[["country"]]))
## [1] 182
# per-column summary of the cleaned data
summary(fndf)
## reviewer_name country review_count
## Length:131914 Length:131914 Min. :1.000
## Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Median :1.000
## Mean :1.518
## 3rd Qu.:2.000
## Max. :9.000
## review_date_time rating review_title
## Min. :2016-11-27 19:54:08.00 Min. :1.000 Length:131914
## 1st Qu.:2021-09-23 00:07:06.50 1st Qu.:4.000 Class :character
## Median :2021-11-24 01:43:13.50 Median :5.000 Mode :character
## Mean :2022-02-15 12:58:40.79 Mean :4.275
## 3rd Qu.:2022-12-16 00:22:19.25 3rd Qu.:5.000
## Max. :2024-08-23 00:59:57.00 Max. :5.000
## review_text date_of_experience review_date
## Length:131914 Min. :2016-11-27 Min. :2016-11-27
## Class :character 1st Qu.:2021-09-22 1st Qu.:2021-09-22
## Mode :character Median :2021-11-24 Median :2021-11-23
## Mean :2022-02-11 Mean :2022-02-14
## 3rd Qu.:2022-12-13 3rd Qu.:2022-12-15
## Max. :2024-08-22 Max. :2024-08-22
## review_time review_duration
## Length:131914 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.1429
## 3rd Qu.:0.0000
## Max. :9.0000
# per-country mean rating (2 d.p.) and summed review_count, ordered so the
# countries with the largest review_count total come first (ties broken by
# higher mean rating)
# NOTE(review): total_review_count adds up the review_count column; if that
# column is a per-reviewer figure, n() would be the number of reviews per
# country -- confirm which is intended
by_country <- group_by(fndf, country)
country_stats <- summarise(
  by_country,
  avg_rating = round(mean(rating), digits = 2),
  total_review_count = sum(review_count)
)
avg_rating_and_rating_count <- arrange(
  country_stats,
  desc(total_review_count),
  desc(avg_rating)
)
head(avg_rating_and_rating_count, n = 6L)
## # A tibble: 6 × 3
## country avg_rating total_review_count
## <chr> <dbl> <dbl>
## 1 US 4.32 183287
## 2 CA 3.78 4877
## 3 GB 2.86 3332
## 4 AU 3.52 928
## 5 JM 4.11 688
## 6 TT 4.23 505
# review totals and average ratings for the 5 countries with the highest
# total review count (the table is already sorted in that order)
top_5_countries_rating_count <- avg_rating_and_rating_count %>%
  head(5)

# dark theme shared by both charts, defined once so the two stay in sync
dark_theme <- theme(
  plot.background = element_rect(fill = "grey15"),
  panel.background = element_rect(fill = "grey9"),
  panel.grid = element_line(colour = "grey15"),
  legend.background = element_rect(fill = "grey15"),
  axis.text.x = element_text(color = "white", size = 8),
  axis.text.y = element_text(color = "white"),
  plot.title = element_text(color = "white"),
  axis.title.x = element_text(color = "white"),
  axis.title.y = element_text(color = "white"),
  legend.text = element_text(color = "white"),
  legend.title = element_text(color = "white")
)

# left panel: total review count per country, value printed above each bar
p1 <- ggplot(top_5_countries_rating_count,
             aes(x = country, y = total_review_count, fill = country)) +
  geom_col() +
  labs(x = "Country", y = "Total Review Count", fill = "Country",
       title = "Top 5 Countries with The Most Reviews") +
  geom_text(aes(label = total_review_count),
            vjust = -0.5, size = 3.5, color = "white") +
  dark_theme

# right panel: average rating for the same countries
p2 <- ggplot(top_5_countries_rating_count,
             aes(x = country, y = avg_rating, fill = country)) +
  geom_col() +
  labs(x = "Country", y = "Average Rating", fill = "Country",
       title = "Average Ratings of The Top 5 Countries with The Most Reviews") +
  geom_text(aes(label = avg_rating),
            vjust = -0.5, size = 3.5, color = "white") +
  dark_theme

# place the two charts side by side (patchwork)
combined1 <- p1 + p2 + plot_layout(ncol = 2)
combined1
# Word clouds of the most frequent terms in review titles and review texts.
# Both follow the same steps, so the shared work lives in two small helpers.

# Build a corpus from a character vector: lower-case it, then strip
# punctuation, English stop words and surplus whitespace.
build_clean_corpus <- function(text) {
  corpus <- Corpus(VectorSource(text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  tm_map(corpus, stripWhitespace)
}

# Total count of every term over all documents, most frequent first.
# The totals are summed from the matrix's sparse (i, v) triplets, which gives
# the same numbers as rowSums(as.matrix(tdm)) without first expanding the
# matrix to a dense terms x documents table (very large for ~130k reviews).
term_totals <- function(tdm) {
  totals <- numeric(nTerms(tdm))
  if (length(tdm$v) > 0) {
    sums <- rowsum(tdm$v, tdm$i)
    totals[as.integer(rownames(sums))] <- sums[, 1]
  }
  names(totals) <- Terms(tdm)
  sort(totals, decreasing = TRUE)
}

# word cloud for review title
corpus_title <- build_clean_corpus(fndf$review_title)
tdm_title <- TermDocumentMatrix(corpus_title)
word_freqs_title <- term_totals(tdm_title)
data_title <- data.frame(word = names(word_freqs_title),
                         freq = word_freqs_title)
wordcloud(words = data_title$word, freq = data_title$freq, max.words = 250,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))

# word cloud for review_text
corpus_review <- build_clean_corpus(fndf$review_text)
tdm_review <- TermDocumentMatrix(corpus_review)
# Keep only terms that appear in at least 1% of documents
tdm_review <- removeSparseTerms(tdm_review, 0.99)
word_freqs_review <- term_totals(tdm_review)
data_review <- data.frame(word = names(word_freqs_review),
                          freq = word_freqs_review)
wordcloud(words = data_review$word, freq = data_review$freq, max.words = 250,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))