Final Project

Author

Hana Rose

library(jpeg)

Warning: package 'jpeg' was built under R version 4.4.3

library(grid)

# Read the header photo into memory and draw it on the current graphics device
img <- readJPEG("ice cream.jpg")
grid.raster(img)

# Overlay the image credit at x = 0.5, y = 0.02 in small black text
grid.text(
  "Source: Pexels. (2025). Pexels.com; Pexels. https://www.pexels.com/search/ice%20cream/",
  x = 0.5,
  y = 0.02,
  gp = gpar(fontsize = 10, col = "black")
)

Introduction

The reviews dataset documents customer ratings of ice cream flavors over the years, using data compiled from Ben & Jerry’s, Häagen-Dazs, Breyers, and Talenti. This data was likely collected from the publicly available customer reviews on the official websites of these ice cream brands. Each product on these sites has reviews, including star ratings, text commentary, and helpfulness votes. This dataset was sourced from the data repository Kaggle, and while the exact data collection methodology is not known, the author states the data was collected directly from the official brand websites, implying it was compiled through web scraping or manual extraction from review pages. For this project, I will be exploring the variables:

stars (quantitative)
date (quantitative)
helpful_yes (quantitative)
helpful_no (quantitative)
author (categorical)

The question I want to ask is: “Do the star ratings of reviews influence their helpfulness ratings?” I will attempt to connect this data with real-world psychology, navigating phenomena like emotional arousal and extremity bias in exploring online reviews.

I chose this dataset because, in addition to my love for ice cream, I wanted to do something lighthearted and fun. With the semester coming to a close and summer approaching, a dataset about ice cream is a sweet note to end on.

Load Libraries

library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.4.3

Warning: package 'ggplot2' was built under R version 4.4.3

Warning: package 'readr' was built under R version 4.4.3

Warning: package 'dplyr' was built under R version 4.4.3

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(ggthemes)

Warning: package 'ggthemes' was built under R version 4.4.3

library(plotly)

Warning: package 'plotly' was built under R version 4.4.3


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

library(highcharter)

Warning: package 'highcharter' was built under R version 4.4.3

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo

library(ggplot2)
library(dplyr)
library(extrafont)

Registering fonts with R

# NOTE(review): evaluated as plain R, these two lines only create variables
# named `warning` and `message`. They look like intended knitr chunk options
# (the package warnings above were still printed) — confirm and move them to
# the chunk header if so.
warning=FALSE
message=FALSE

Load Dataset

# Read the reviews CSV through its full path instead of calling setwd(), so
# the session's working directory is left untouched.
# NOTE(review): the directory is still machine-specific — consider a
# project-relative path so the report renders on other machines.
data_dir <- "C:/Users/Hana Rose/OneDrive/Data 110"
reviews <- read_csv(file.path(data_dir, "reviews.csv")) # Read the data in CSV format

Rows: 7943 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): key, author, date, title, text
dbl (3): stars, helpful_yes, helpful_no

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print a compact overview of the freshly loaded data frame
glimpse(reviews) # Take a glimpse at the dataset to understand its structure

Rows: 7,943
Columns: 8
$ key         <chr> "0_bj", "0_bj", "0_bj", "0_bj", "0_bj", "0_bj", "0_bj", "0…
$ author      <chr> "Ilovebennjerry", "Sweettooth909", "LaTanga71", "chicago22…
$ date        <chr> "4/15/2017", "1/5/2020", "4/26/2018", "1/14/2018", "7/24/2…
$ stars       <dbl> 3, 5, 3, 5, 1, 2, 3, 3, 2, 3, 2, 1, 3, 1, 3, 5, 5, 5, 4, 5…
$ title       <chr> "Not enough brownies!", "I’m OBSESSED with this pint!", "M…
$ helpful_yes <dbl> 10, 3, 5, 24, 1, 3, 3, 4, 8, 1, 0, 0, 3, 21, 0, 0, 1, 0, 2…
$ helpful_no  <dbl> 3, 0, 2, 1, 5, 1, 3, 1, 6, 1, 1, 2, 2, 39, 1, 0, 1, 0, 1, …
$ text        <chr> "Super good, don't get me wrong. But I came for the carame…

Remove NAs

# Keep only rows that have no missing value in any column.
# drop_na() acts on the data flowing through the pipe; the previous
# filter(complete.cases(reviews)) looked up the global `reviews` object
# instead, which would break as soon as an earlier pipe step changed the rows.
# NOTE(review): this also discards rows whose only NA is in a column the
# analysis may not need (e.g. title/text) — confirm that is intended.
reviews <- reviews %>%
  drop_na()

Create a Variable for Net Helpfulness

# Derive net helpfulness per review: "helpful" votes minus "not helpful" votes
reviews <- mutate(reviews, net_helpfulness = helpful_yes - helpful_no)

# Inspect the data frame again to confirm the new column is present
glimpse(reviews)

Rows: 5,204
Columns: 9
$ key             <chr> "0_bj", "0_bj", "0_bj", "0_bj", "0_bj", "0_bj", "0_bj"…
$ author          <chr> "Ilovebennjerry", "Sweettooth909", "LaTanga71", "chica…
$ date            <chr> "4/15/2017", "1/5/2020", "4/26/2018", "1/14/2018", "7/…
$ stars           <dbl> 3, 5, 3, 5, 1, 2, 3, 3, 2, 3, 2, 1, 3, 3, 5, 5, 5, 4, …
$ title           <chr> "Not enough brownies!", "I’m OBSESSED with this pint!"…
$ helpful_yes     <dbl> 10, 3, 5, 24, 1, 3, 3, 4, 8, 1, 0, 0, 3, 0, 0, 1, 0, 2…
$ helpful_no      <dbl> 3, 0, 2, 1, 5, 1, 3, 1, 6, 1, 1, 2, 2, 1, 0, 1, 0, 1, …
$ text            <chr> "Super good, don't get me wrong. But I came for the ca…
$ net_helpfulness <dbl> 7, 3, 3, 23, -4, 2, 0, 3, 2, 0, -1, -2, 1, -1, 0, 0, 0…

Summarize Average Net Helpfulness by Stars

# Mean net helpfulness for each star rating, computed from `reviews`
avg_helpfulness <- summarize(
  group_by(reviews, stars), # One group per star rating
  avg_net_helpfulness = mean(net_helpfulness, na.rm = TRUE) # Ignore NA values
)

Linear Regression Model

# Fit a simple linear regression of net helpfulness on star rating
model <- lm(formula = net_helpfulness ~ stars, data = reviews)

# Print the coefficient table, residual summary, and fit statistics
summary(model)


Call:
lm(formula = net_helpfulness ~ stars, data = reviews)

Residuals:
    Min      1Q  Median      3Q     Max 
-77.104  -0.864  -0.864   0.136 104.136 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.08633    0.18144  -0.476    0.634    
stars        0.19005    0.04188   4.538 5.82e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.549 on 5202 degrees of freedom
Multiple R-squared:  0.003943,  Adjusted R-squared:  0.003751 
F-statistic: 20.59 on 1 and 5202 DF,  p-value: 5.817e-06

Linear Regression Analysis

The coefficient for stars is 0.19005, meaning that for each additional star, net helpfulness increases by about 0.19 votes, on average. The p-value for this coefficient is 5.82e-06 (0.00000582). Because the p-value is so low, it is highly unlikely that this result is due to random chance, suggesting a statistically significant relationship between star rating and net helpfulness. Higher star ratings appear to positively predict helpfulness. However, the R-squared value is only 0.003943, indicating that star rating explains less than 0.4% of the variance in net helpfulness. So, while the model is statistically significant, it’s weak. This suggests that other variables—such as review length, content, timing, tone, or total number of votes—likely have a much greater impact on net helpfulness than the star rating alone. One key limitation of this model is that it assumes a linear relationship between stars and helpfulness, which may overlook non-linear patterns.

Visualization #1: Diagnostic Plots

# Draw the diagnostic plots for the fitted model in a 2x2 grid.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(2, 2))
plot(model) # Plot the diagnostic plots to assess the fit of the linear model
par(old_par)

Visualization #2: Jitter Plot

# Box plot of net helpfulness per star rating with every review overlaid as a
# jittered point.
ggplot(reviews, aes(x = factor(stars), y = net_helpfulness)) +
  # outlier.shape = NA hides the boxplot's own outlier markers; the jitter
  # layer below already draws every review, so outliers were plotted twice.
  geom_boxplot(fill = "skyblue", color = "darkblue", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.4, color = "red") + # One point per review
  labs(
    title = "Net Helpfulness by Star Rating",
    x = "Star Rating",
    y = "Net Helpfulness",
    caption = "Sources: Ben & Jerry's, Häagen-Dazs, Breyers, and Talenti"
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Times New Roman"), # Apply Times New Roman font
    plot.background = element_rect(fill = "#FFF9DB", color = NA), # Background fill, no border
    panel.background = element_rect(fill = "#FFF9DB", color = NA) # Same fill behind the panel
  )

Analyzing Extremity Bias

These results are very interesting, as they display a particular phenomenon I expected to occur: extremity bias. The jitter plot shows that extreme reviews, either 1 star or 5 star, tend to garner the most engagement, implying that more extreme reviews tend to elicit stronger reactions or be viewed more frequently. 1 star reviews are the most polarizing, often being rated as either very helpful or very unhelpful. Despite this high variance, the 1-star group tends to be rated as one of the most unhelpful overall. The 5 star group is overwhelmingly rated as the most helpful group, with one outlier even extending beyond the 100 mark and a strong, unanimous upward distribution. Comparatively, 2, 3, and 4 star reviews tend to receive much less engagement.

Visualization #3: Line Plot

# Average net helpfulness for each star rating
reviews_allstars <- reviews %>%
  group_by(stars) %>% # Group the data by stars
  summarize(avg_net_helpfulness = mean(net_helpfulness, na.rm = TRUE)) # Mean per star, ignoring NA

# Line chart of the per-star averages; group = 1 joins the discrete x values
# into a single line
ggplot(reviews_allstars, aes(x = factor(stars), y = avg_net_helpfulness, group = 1)) +
  # `linewidth` replaces `size` for lines, which is deprecated since ggplot2 3.4.0
  geom_line(color = "#0073C2", linewidth = 1.2) + # Add a line to show trend
  geom_point(size = 3, color = "#FF6666") + # Add dots on the line for each star
  labs(
    title = "Average Net Helpfulness by Star Rating",
    x = "Star Rating",
    y = "Average Net Helpfulness",
    caption = "Sources: Ben & Jerry's, Häagen-Dazs, Breyers, and Talenti"
  ) +
  theme_minimal() + # Apply a theme
  theme(
    text = element_text(family = "Times New Roman") # Apply Times New Roman font
  )

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Understanding Helpfulness Rankings

The helpfulness rankings by star rating are as follows:

Most to Least Helpful

5 star reviews
4 star reviews
3 star reviews
1 star reviews
2 star reviews

I think the reason 2 star ratings are voted as most unhelpful is because they don’t share the same polarizing nature as 1 star reviews, meaning they’re less likely to receive especially high or frequent helpfulness votes that could raise their average. As a result, they tend to be viewed as the least helpful overall. I suspect the reason positive reviews are more positively received by reviewers in this context is that the product being reviewed is often considered universally loved: ice cream. I imagine it’s hard for most people to feel negatively about ice cream, thus leading to positive reviews often being rewarded by audiences more than negative ones.

Visualization #4: Bar Plot

# Tally how many reviews fall under each star rating
review_counts <- count(reviews, stars)

# Bar chart of those tallies, one colour per star rating
ggplot(
  review_counts,
  aes(x = factor(stars), y = n, fill = factor(stars))
) +
  geom_col() + # Bar heights come straight from the precomputed counts
  scale_fill_brewer(palette = "Spectral") + # Colour palette for the bars
  labs(
    title = "Count of Reviews by Star Rating",
    x = "Star Rating",
    y = "Number of Reviews",
    caption = "Sources: Ben & Jerry's, Häagen-Dazs, Breyers, and Talenti"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none", # The x-axis already names each star rating
    plot.title = element_text(size = 14), # Title size
    text = element_text(family = "Times New Roman") # Times New Roman throughout
  )

Assessing Reach

The plot shows that people tend to most often leave 5-star reviews on the chosen ice cream sites, followed by one star reviews. I suspect that people are more likely to be prompted to leave a review if they have a strong reaction to a product, which would explain why 5 and 1 star reviews are the two most common groups. Again, since it’s ice cream we’re dealing with, it’s no surprise the majority of these strong reactions are very positive! I find it interesting that the relationship between review count and review helpfulness in the moderate group is inverse, with more positive reviews in the moderate group receiving greater helpfulness ratings despite there being fewer positive reviews in the moderate group than negative ones. This suggests potential positivity bias and confirmation bias in reader reactions.

Visualization #5: Interactive Ice Cream Reviewer Leaderboard

Just for fun, I’m creating an interactive ice cream reviewer leaderboard!

Create Custom Ice Cream Color Palette

# Ten-colour ice cream palette, one hex code per leaderboard rank
ice_cream_colors <- c(
  "#FCCCD4", "#FFFDD0", "#6b3e26", "#BDE8D4", "#990F4B",
  "#FDBE02", "#92B35C", "#CBAACB", "#FF9AA2", "#F7EF79"
)

Create List of the Top 10 Authors by Unique Review Counts with Custom Colors

# Build the leaderboard table from the reviews data frame
top_authors <- reviews %>%
  count(author, sort = TRUE) %>% # Count number of reviews per author and sort descending
  distinct(n, .keep_all = TRUE) %>% # Keep only the first author for each distinct review count
  # slice_max() names the ranking column explicitly; the superseded top_n(10)
  # guessed it from the last column and printed "Selecting by n".
  slice_max(n, n = 10) %>% # Keep the ten highest review counts
  arrange(desc(n)) %>% # Ensure descending order for ranking
  mutate(reviews = n, # Copy n into `reviews` for use as the chart's y value
         rank = row_number(), # Create a rank column for display
         color = ice_cream_colors[seq_along(rank)]) # Assign one palette colour to each rank

Selecting by n

Plot the Leaderboard

# Shared text style for axis titles and axis labels: white Bubblegum Sans
axis_text_style <- list(color = "#FFFFFF", fontFamily = "Bubblegum Sans")

# Column chart of review counts per author; each bar takes its colour from the
# `color` column of top_authors
hchart(
  top_authors,
  "column",
  hcaes(x = author, y = reviews, color = color)
) %>%
  hc_chart(backgroundColor = "lightblue") %>% # Light blue chart background
  hc_title(
    text = "🍦 Ice Cream Reviewer Leaderboard 🍦",
    style = list(
      fontFamily = "Bubblegum Sans",
      fontSize = "20px",
      color = "#FFFFFF" # White title text
    )
  ) %>%
  hc_xAxis(
    title = list(text = "Author", style = axis_text_style),
    labels = list(
      rotation = -45, # Tilt the author names for readability
      style = axis_text_style
    )
  ) %>%
  hc_yAxis(
    title = list(text = "Number of Reviews", style = axis_text_style),
    labels = list(style = axis_text_style)
  ) %>%
  hc_tooltip(
    useHTML = TRUE, # Allow HTML markup inside the tooltip
    # Header line of the tooltip
    headerFormat = '<span style="font-size: 15px; font-family: Bubblegum Sans">{point.key}</span><br/>',
    # Body: coloured bullet, review count, and rank
    pointFormat = '<span style="color:{point.color}">\u25CF</span> Reviews: <b> {point.y}</b> ✍️ <br/>Rank: <b>#{point.rank}</b>🏆',
    style = list(color = "#000000", fontFamily = "Bubblegum Sans") # Black tooltip text for readability
  ) %>%
  hc_caption(
    text = "Sources: Ben & Jerry's, Häagen-Dazs, Breyers, and Talenti",
    style = list(
      fontFamily = "Bubblegum Sans",
      fontSize = "14px",
      color = "#FFFFFF" # White caption text
    )
  )

Top cone-tenders! Who takes the scoop? 🍨

User FuzzyGut boasts the most ice cream reviews, with 43 reviews! Maybe they’re a hobbyist? Reviewing ice cream flavors seems like an awesome hobby to pick up for the summer! It would be fun to keep track of each ice cream flavor you review and how many you’ve done–maybe that’s what FuzzyGut is doing.

Happy Summer! :)