This report performs Exploratory Data Analysis (EDA) on the Amazon Fine Food Reviews dataset. The dataset contains customer reviews including ratings, text, helpfulness scores, and timestamps.
library(tidyverse)
library(data.table)
library(lubridate)
library(ggplot2)
library(tm)
library(wordcloud)
# Read the reviews CSV into a data.table; point this at your local copy
data <- fread(input = "Reviews.csv")
# Print column names, types, and the first few values of each column
str(object = data)
## Classes 'data.table' and 'data.frame': 568454 obs. of 10 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ProductId : chr "B001E4KFG0" "B00813GRG4" "B000LQOCH0" "B000UA0QIQ" ...
## $ UserId : chr "A3SGXH7AUHU8GW" "A1D87F6ZCVE5NK" "ABXLMWJIXXAIN" "A395BORC6FGVXV" ...
## $ ProfileName : chr "delmartian" "dll pa" "Natalia Corres \"\"Natalia Corres\"\"" "Karl" ...
## $ HelpfulnessNumerator : int 1 0 1 3 0 0 0 0 1 0 ...
## $ HelpfulnessDenominator: int 1 0 1 3 0 0 0 0 1 0 ...
## $ Score : int 5 1 4 2 5 4 5 5 5 5 ...
## $ Time : int 1303862400 1346976000 1219017600 1307923200 1350777600 1342051200 1340150400 1336003200 1322006400 1351209600 ...
## $ Summary : chr "Good Quality Dog Food" "Not as Advertised" "\"\"Delight\"\" says it all" "Cough Medicine" ...
## $ Text : chr "I have bought several of the Vitality canned dog food products and have found them all to be of good quality. T"| __truncated__ "Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if t"| __truncated__ "This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - i"| __truncated__ "If you are looking for the secret ingredient in Robitussin I believe I have found it. I got this in addition t"| __truncated__ ...
## - attr(*, ".internal.selfref")=<externalptr>
# Number of rows and columns in the loaded table
dim(data)
## [1] 568454 10
# Per-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns
summary(data)
## Id ProductId UserId ProfileName
## Min. : 1 Length:568454 Length:568454 Length:568454
## 1st Qu.:142114 Class :character Class :character Class :character
## Median :284228 Mode :character Mode :character Mode :character
## Mean :284228
## 3rd Qu.:426341
## Max. :568454
## HelpfulnessNumerator HelpfulnessDenominator Score
## Min. : 0.000 Min. : 0.000 Min. :1.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:4.000
## Median : 0.000 Median : 1.000 Median :5.000
## Mean : 1.744 Mean : 2.229 Mean :4.183
## 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.:5.000
## Max. :866.000 Max. :923.000 Max. :5.000
## Time Summary Text
## Min. :9.393e+08 Length:568454 Length:568454
## 1st Qu.:1.271e+09 Class :character Class :character
## Median :1.311e+09 Mode :character Mode :character
## Mean :1.296e+09
## 3rd Qu.:1.333e+09
## Max. :1.351e+09
# Count missing values column by column
vapply(data, function(column) sum(is.na(column)), numeric(1))
## Id ProductId UserId
## 0 0 0
## ProfileName HelpfulnessNumerator HelpfulnessDenominator
## 4 0 0
## Score Time Summary
## 0 0 0
## Text
## 0
Observation:
- Dataset contains review text, ratings (1–5), and helpfulness metrics.
- Missing values are minimal: only ProfileName has any (4 of 568,454 rows).
# Derived columns used by the plots that follow.
# Time is read as an integer; convert it to a date-time and keep the year.
data$Time <- as_datetime(data$Time)
data$Year <- year(data$Time)
# Share of helpfulness votes that were positive. Reviews with a zero
# denominator are assigned 0 so the division never yields NaN.
data$HelpfulnessRatio <- with(
  data,
  ifelse(HelpfulnessDenominator != 0,
         HelpfulnessNumerator / HelpfulnessDenominator,
         0)
)
# Number of characters in the review body
data$ReviewLength <- nchar(data$Text)
# Bar chart: number of reviews at each star rating
data %>%
  ggplot(mapping = aes(x = factor(Score))) +
  geom_bar(fill = "steelblue") +
  ggtitle("Distribution of Review Scores") +
  xlab("Rating") +
  ylab("Count")
Insight: Most reviews are 4 or 5 stars (the median score is 5 and the first quartile is 4), so the rating distribution is strongly skewed toward positive scores.
# Tally reviews per calendar year
reviews_year <- summarise(group_by(data, Year), Count = n())
# Line chart of yearly review volume, with a marker on each year
reviews_year %>%
  ggplot(mapping = aes(x = Year, y = Count)) +
  geom_line(colour = "darkgreen") +
  geom_point() +
  ggtitle("Number of Reviews per Year")
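Insight: The number of reviews increased significantly over time. Note that the latest timestamp in the data falls in late October 2012, so the final year is only partially covered.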
# Average helpfulness ratio for each star rating.
# Reviews that nobody voted on (HelpfulnessDenominator == 0) carry no
# helpfulness signal. They are excluded here instead of being averaged in
# with a ratio of 0, which would pull every rating's mean toward zero.
data %>%
  dplyr::filter(HelpfulnessDenominator > 0) %>%
  group_by(Score) %>%
  # na.rm guards against missing ratios so one NA cannot blank a whole bar
  summarise(AvgHelpfulness = mean(HelpfulnessRatio, na.rm = TRUE)) %>%
  ggplot(aes(x = as.factor(Score), y = AvgHelpfulness)) +
  geom_col(fill = "orange") +
  # Explicit axis labels; the defaults would show the raw R expressions
  labs(title = "Average Helpfulness Ratio by Rating",
       x = "Rating",
       y = "Average helpfulness ratio (reviews with at least one vote)")
Insight: Lower-rated reviews sometimes receive a higher average helpfulness ratio than higher-rated ones.
# Boxplot of review length (number of characters in Text) per star rating
ggplot(data, aes(x = as.factor(Score), y = ReviewLength)) +
  geom_boxplot(fill = "purple") +
  # Explicit axis labels; the defaults would show the raw R expressions
  labs(title = "Review Length vs Rating",
       x = "Rating",
       y = "Review length (characters)")
Insight: The boxplot compares review length (in characters) across ratings; longer reviews tend to be more detailed.
# Word cloud of the most frequent terms in a subset of the reviews.
# Only the first 10,000 reviews (in file order) are used to keep the
# term-document matrix small. head() avoids NA entries if the table has
# fewer rows than that.
sample_text <- head(data$Text, 10000)
# VCorpus() rather than Corpus(): Corpus() builds a SimpleCorpus from a
# VectorSource, and tm_map() on a SimpleCorpus raised a
# "transformation drops documents" warning at every step.
corpus <- VCorpus(VectorSource(sample_text))
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stopwords BEFORE stripping punctuation: the English stopword list
# contains contractions such as "don't", which would otherwise become
# "dont" and slip past the filter.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
tdm <- TermDocumentMatrix(corpus)
# Sum term counts directly on the sparse matrix. as.matrix(tdm) would
# allocate a dense terms-by-documents matrix first. slam is the sparse
# matrix package tm depends on, so it is installed wherever tm is.
word_freq <- sort(slam::row_sums(tdm), decreasing = TRUE)
wordcloud(names(word_freq), word_freq, max.words = 100)
Insight: Frequently used words reflect common product experiences.
# Bucket star ratings into a coarse sentiment label:
# 4 and above -> Positive, exactly 3 -> Neutral, everything else -> Negative.
# A missing score stays missing.
data$Sentiment <- case_when(
  is.na(data$Score) ~ NA_character_,
  data$Score >= 4 ~ "Positive",
  data$Score == 3 ~ "Neutral",
  TRUE ~ "Negative"
)
# Bar chart of how many reviews fall into each sentiment bucket
data %>%
  ggplot(mapping = aes(x = Sentiment)) +
  geom_bar(fill = "red") +
  ggtitle("Sentiment Distribution")
Insight: The majority of reviews are positive.
The dataset shows strong positive customer satisfaction overall.