1. Introduction

This report performs Exploratory Data Analysis (EDA) on the Amazon Fine Food Reviews dataset. The dataset contains customer reviews including ratings, text, helpfulness scores, and timestamps.


2. Load Libraries

library(tidyverse)
library(data.table)
library(lubridate)
library(ggplot2)
library(tm)
library(wordcloud)

3. Load Dataset

# Replace with your file path
data <- fread("Reviews.csv")

# fread() leaves CSV-escaped double quotes ("") inside quoted fields as two
# literal quote characters (e.g. ""Delight"" in Summary). Collapse each pair
# back to a single quote in the free-text columns so that displayed text,
# character counts, and tokenisation see the text as the reviewer wrote it.
text_cols <- c("ProfileName", "Summary", "Text")
data[
  ,
  (text_cols) := lapply(
    .SD,
    function(col) gsub("\"\"", "\"", col, fixed = TRUE)
  ),
  .SDcols = text_cols
]

# View structure
str(data)
## Classes 'data.table' and 'data.frame':   568454 obs. of  10 variables:
##  $ Id                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ProductId             : chr  "B001E4KFG0" "B00813GRG4" "B000LQOCH0" "B000UA0QIQ" ...
##  $ UserId                : chr  "A3SGXH7AUHU8GW" "A1D87F6ZCVE5NK" "ABXLMWJIXXAIN" "A395BORC6FGVXV" ...
##  $ ProfileName           : chr  "delmartian" "dll pa" "Natalia Corres \"\"Natalia Corres\"\"" "Karl" ...
##  $ HelpfulnessNumerator  : int  1 0 1 3 0 0 0 0 1 0 ...
##  $ HelpfulnessDenominator: int  1 0 1 3 0 0 0 0 1 0 ...
##  $ Score                 : int  5 1 4 2 5 4 5 5 5 5 ...
##  $ Time                  : int  1303862400 1346976000 1219017600 1307923200 1350777600 1342051200 1340150400 1336003200 1322006400 1351209600 ...
##  $ Summary               : chr  "Good Quality Dog Food" "Not as Advertised" "\"\"Delight\"\" says it all" "Cough Medicine" ...
##  $ Text                  : chr  "I have bought several of the Vitality canned dog food products and have found them all to be of good quality. T"| __truncated__ "Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if t"| __truncated__ "This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - i"| __truncated__ "If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition t"| __truncated__ ...
##  - attr(*, ".internal.selfref")=<externalptr>

4. Data Overview

# Number of rows (reviews) and columns (variables)
c(nrow(data), ncol(data))
## [1] 568454     10
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns
summary(data)
##        Id          ProductId            UserId          ProfileName       
##  Min.   :     1   Length:568454      Length:568454      Length:568454     
##  1st Qu.:142114   Class :character   Class :character   Class :character  
##  Median :284228   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :284228                                                           
##  3rd Qu.:426341                                                           
##  Max.   :568454                                                           
##  HelpfulnessNumerator HelpfulnessDenominator     Score      
##  Min.   :  0.000      Min.   :  0.000        Min.   :1.000  
##  1st Qu.:  0.000      1st Qu.:  0.000        1st Qu.:4.000  
##  Median :  0.000      Median :  1.000        Median :5.000  
##  Mean   :  1.744      Mean   :  2.229        Mean   :4.183  
##  3rd Qu.:  2.000      3rd Qu.:  2.000        3rd Qu.:5.000  
##  Max.   :866.000      Max.   :923.000        Max.   :5.000  
##       Time             Summary              Text          
##  Min.   :9.393e+08   Length:568454      Length:568454     
##  1st Qu.:1.271e+09   Class :character   Class :character  
##  Median :1.311e+09   Mode  :character   Mode  :character  
##  Mean   :1.296e+09                                        
##  3rd Qu.:1.333e+09                                        
##  Max.   :1.351e+09
# Count missing values in each column
vapply(data, function(column) sum(is.na(column)), numeric(1))
##                     Id              ProductId                 UserId 
##                      0                      0                      0 
##            ProfileName   HelpfulnessNumerator HelpfulnessDenominator 
##                      4                      0                      0 
##                  Score                   Time                Summary 
##                      0                      0                      0 
##                   Text 
##                      0

Observation:
- Dataset contains review text, ratings (1–5), and helpfulness metrics.
- Missing values are minimal: only 4 of the 568,454 ProfileName entries are missing, and no other column has missing values.


5. Data Cleaning & Feature Engineering

Convert Time to Date

# Time is stored as integer seconds since the Unix epoch; convert it to a
# POSIXct date-time, stating the time zone explicitly instead of relying on
# the default.
data$Time <- lubridate::as_datetime(data$Time, tz = "UTC")

# Both data.table and lubridate export year(), so an unqualified call resolves
# to whichever package was attached last. Qualify it so the result does not
# depend on library() order or on which packages tidyverse attaches.
data$Year <- lubridate::year(data$Time)

Create Helpfulness Ratio

# Share of helpfulness votes that were positive. Reviews that received no
# votes at all are assigned 0 rather than the NaN that 0 / 0 would produce.
data$HelpfulnessRatio <- with(
  data,
  if_else(
    HelpfulnessDenominator == 0,
    0,
    HelpfulnessNumerator / HelpfulnessDenominator
  )
)

Review Length

# Length of each review body, measured in characters
data$ReviewLength <- with(data, nchar(Text, type = "chars"))

6. Univariate Analysis

Distribution of Ratings

# Bar chart: number of reviews at each star rating (Score treated as discrete)
data %>%
  ggplot(mapping = aes(x = factor(Score))) +
  geom_bar(fill = "steelblue") +
  ggtitle("Distribution of Review Scores") +
  xlab("Rating") +
  ylab("Count")

Insight: Most reviews are 4 and 5 stars, indicating positive bias.


Reviews Over Time

# Tally the number of reviews written in each calendar year
reviews_year <- summarise(group_by(data, Year), Count = n())

# Line chart with point markers showing yearly review volume
reviews_year %>%
  ggplot(mapping = aes(x = Year, y = Count)) +
  geom_line(colour = "darkgreen") +
  geom_point() +
  ggtitle("Number of Reviews per Year")

Insight: Reviews increased significantly over time.


7. Bivariate Analysis

Helpfulness vs Rating

# Average helpfulness ratio per star rating.
# Reviews with no helpfulness votes are excluded first: their ratio is a
# placeholder 0, not a measured value, and at least a quarter of all reviews
# have a zero denominator. Including them would pull every group's mean
# towards 0 and make groups with fewer voted-on reviews look less helpful.
data %>%
  dplyr::filter(HelpfulnessDenominator > 0) %>%
  group_by(Score) %>%
  summarise(AvgHelpfulness = mean(HelpfulnessRatio, na.rm = TRUE)) %>%
  ggplot(aes(x = as.factor(Score), y = AvgHelpfulness)) +
  geom_col(fill = "orange") +
  labs(title = "Average Helpfulness Ratio by Rating",
       subtitle = "Reviews with at least one helpfulness vote",
       x = "Rating",
       y = "Average helpfulness ratio")

Insight: Lower ratings sometimes receive higher helpfulness.


Review Length vs Rating

# Box plot: distribution of review length (characters) within each star rating
data %>%
  ggplot(mapping = aes(x = factor(Score), y = ReviewLength)) +
  geom_boxplot(fill = "purple") +
  ggtitle("Review Length vs Rating") +
  xlab("as.factor(Score)")

Insight: Longer reviews tend to be more detailed.


8. Text Analysis (Word Cloud)

# Fix the RNG seed so both the sampled reviews and the word-cloud layout
# (which places words in random order) are reproducible between runs.
set.seed(42)

# Draw a random sample rather than the first rows of the file, which are not
# guaranteed to be representative. Cap the size at the number of rows so a
# smaller dataset never yields NA entries.
sample_size <- min(10000L, nrow(data))
sample_text <- data$Text[sample(nrow(data), sample_size)]

# NOTE(review): review bodies presumably contain HTML markup such as <br />;
# strip anything tag-shaped so tag names do not surface as frequent "words".
strip_html <- content_transformer(function(x) gsub("<[^>]+>", " ", x))

# VCorpus avoids the "transformation drops documents" warnings that tm_map()
# emits for the SimpleCorpus returned by Corpus().
corpus <- VCorpus(VectorSource(sample_text))
corpus <- tm_map(corpus, strip_html)
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words while apostrophes are still present; otherwise
# contractions such as "don't" become "dont" and escape the stop-word list.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

tdm <- TermDocumentMatrix(corpus)
# NOTE(review): as.matrix() builds a dense terms x documents matrix; if memory
# becomes a problem, sum the sparse representation instead.
term_matrix <- as.matrix(tdm)
word_freq <- sort(rowSums(term_matrix), decreasing = TRUE)

wordcloud(names(word_freq), word_freq, max.words = 100)

Insight: Frequently used words reflect common product experiences.


9. Sentiment Categorization

# Bucket star ratings into sentiment labels: 4-5 stars are "Positive",
# exactly 3 is "Neutral", and every other non-missing score is "Negative".
# A missing score stays missing.
data$Sentiment <- with(
  data,
  case_when(
    Score >= 4 ~ "Positive",
    Score == 3 ~ "Neutral",
    !is.na(Score) ~ "Negative"
  )
)

# Bar chart of review counts per sentiment label
data %>%
  ggplot(mapping = aes(x = Sentiment)) +
  geom_bar(fill = "red") +
  ggtitle("Sentiment Distribution")

Insight: Majority of reviews are positive.


10. Conclusion

The dataset shows strong positive customer satisfaction overall.