library(ggplot2)
# Warning: package 'ggplot2' was built under R version 4.5.3
library(corrplot)
# Warning: package 'corrplot' was built under R version 4.5.3
# corrplot 0.95 loaded
# Course Name: Exploratory Data Analysis
# Course Code: 22ISE644
# Academic Year: 2025–26, 6th semester
# Team Name: The Outliers
# Team Number: 33
# Team member 1: Jaishree M (1NT23IS088)
# Team member 2: Chinmayi (1NT23IS063)
library(ggplot2)
# Warning: package 'ggplot2' was built under R version 4.5.3
library(corrplot)
# Warning: package 'corrplot' was built under R version 4.5.3
# corrplot 0.95 loaded
# ---- Load, clean and sample the review data ----
# Reads foodreview.csv, derives year / month / TextLength, drops identifier
# columns, filters invalid and outlying rows, and draws a reproducible random
# sample into `sample_data` for the plots below.
data <- read.csv("foodreview.csv", stringsAsFactors = FALSE)

# Coerce every numerically-used column BEFORE filtering. Malformed values
# become NA (R warns "NAs introduced by coercion") and are dropped below.
# Coercing first matters: comparing an un-coerced character column with
# `< 50` or `!= 0` would be a string comparison, not a numeric one.
data$Time <- as.numeric(data$Time)
data$Score <- as.numeric(data$Score)
data$HelpfulnessNumerator <- as.numeric(data$HelpfulnessNumerator)
data$HelpfulnessDenominator <- as.numeric(data$HelpfulnessDenominator)

# Fix Time
# Time is treated as seconds since the Unix epoch. The timezone is pinned to
# UTC so the derived year/month do not depend on the machine running the
# script (assumes the timestamps were recorded in UTC -- TODO confirm).
data <- data[is.finite(data$Time), ]
data$Time <- as.POSIXct(data$Time, origin = "1970-01-01", tz = "UTC")

# Create features
data$year <- format(data$Time, "%Y")
data$month <- format(data$Time, "%m")

# Create TextLength
data$TextLength <- nchar(data$Text)

# Remove unnecessary columns
data <- data[
  , !(names(data) %in% c("Id", "ProductId", "UserId", "ProfileName")),
  drop = FALSE
]

# Remove invalid rows and outliers in one pass.
# is.finite() is FALSE for NA/NaN/Inf, so the mask never contains NA and
# logical subsetting cannot introduce all-NA rows.
valid_rows <- is.finite(data$Score) &
  data$Score >= 1 & data$Score <= 5 &
  is.finite(data$TextLength) & data$TextLength < 1000 &
  is.finite(data$HelpfulnessNumerator) & data$HelpfulnessNumerator < 50 &
  is.finite(data$HelpfulnessDenominator) & data$HelpfulnessDenominator != 0
data <- data[valid_rows, ]

# Remove NA (rows with a missing value in any remaining column)
data <- na.omit(data)

# Reproducible random sample of at most 5000 rows. Capping the size avoids
# the "cannot take a sample larger than the population" error when fewer
# than 5000 rows survive the cleaning above.
set.seed(123)
sample_data <- data[sample.int(nrow(data), min(5000, nrow(data))), ]

# 1 Distribution of ratings (overall sentiment)
# ---- Rating overview plots ----
# Bar chart of how many sampled reviews received each rating.
ggplot(data = sample_data, mapping = aes(x = factor(Score))) +
  geom_bar(colour = "black", fill = "skyblue")

# 2 Rating counts again, one Set2 colour per rating
#   (bars show counts, not percentages)
ggplot(data = sample_data,
       mapping = aes(x = factor(Score), fill = factor(Score))) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2")

# 3 Kernel density of the numeric rating
ggplot(data = sample_data, mapping = aes(x = Score)) +
  geom_density(alpha = 0.5, fill = "lightgreen")

# 4 Review counts per month
ggplot(data = sample_data, mapping = aes(x = month, fill = month)) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired")

# 5 Review counts per year, x-axis labels rotated 90 degrees
ggplot(data = sample_data,
       mapping = aes(x = factor(year), fill = factor(year))) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# 6 Distribution of review length (short vs long)
# ---- Review length plots ----
# Histogram of review length in characters (30 bins).
ggplot(sample_data, aes(x = TextLength)) +
  geom_histogram(fill = "purple", bins = 30)

# 7 Review length vs rating (relationship)
ggplot(sample_data,
       aes(x = factor(Score), y = TextLength, fill = factor(Score))) +
  geom_boxplot()

# 8 Density of review length (spread of text size)
ggplot(sample_data, aes(x = TextLength)) +
  geom_density(fill = "orange", alpha = 0.5)

# 9 Review length vs helpfulness (scatter relation)
ggplot(sample_data, aes(x = TextLength, y = HelpfulnessNumerator)) +
  geom_point(color = "blue", alpha = 0.3)

# 10 Trend: length vs helpfulness (linear trend)
# The formula is stated explicitly so geom_smooth() does not print its
# "using formula = 'y ~ x'" message each time the plot is rendered.
ggplot(sample_data, aes(x = TextLength, y = HelpfulnessNumerator)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, color = "red")
# 11 Distribution of helpful votes (limited range for clarity)
# Only rows with fewer than 20 helpful votes are drawn.
ggplot(sample_data[sample_data$HelpfulnessNumerator < 20, ],
       aes(x = HelpfulnessNumerator)) +
  geom_histogram(fill = "cyan", bins = 20)

# 12 Helpfulness vs rating (usefulness by rating)
ggplot(sample_data,
       aes(x = factor(Score), y = HelpfulnessNumerator,
           fill = factor(Score))) +
  geom_boxplot()

# 13 Distribution of helpfulness denominator (limited range)
# Only rows with a denominator below 20 are drawn.
ggplot(sample_data[sample_data$HelpfulnessDenominator < 20, ],
       aes(x = HelpfulnessDenominator)) +
  geom_histogram(fill = "gold", bins = 20)

# 14 Numerator vs denominator (relationship)
ggplot(sample_data,
       aes(x = HelpfulnessDenominator, y = HelpfulnessNumerator)) +
  geom_point(color = "darkgreen", alpha = 0.3)

# 15 Smooth helpfulness relation (linear fit)
# The formula is stated explicitly so geom_smooth() does not print its
# "using formula = 'y ~ x'" message each time the plot is rendered.
ggplot(sample_data,
       aes(x = HelpfulnessDenominator, y = HelpfulnessNumerator)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, color = "blue")
# 16 Stacked bars per month, split by rating, to compare the rating mix
#    from month to month
ggplot(data = sample_data, mapping = aes(x = month, fill = factor(Score))) +
  geom_bar(position = "stack") +
  scale_fill_brewer(palette = "Set3") +
  ggtitle("Ratings Distribution by Month") +
  xlab("Month") +
  ylab("Count") +
  labs(fill = "Rating")

# 17.1 Create helpfulness ratio
# help_ratio is HelpfulnessNumerator divided by HelpfulnessDenominator,
# computed row by row.
sample_data[["help_ratio"]] <- with(
  sample_data,
  HelpfulnessNumerator / HelpfulnessDenominator
)

# 17.2 Histogram of the helpfulness ratio (30 bins)
ggplot(data = sample_data, mapping = aes(x = help_ratio)) +
  geom_histogram(bins = 30, fill = "pink")

# 18 Helpfulness ratio grouped by rating
ggplot(data = sample_data,
       mapping = aes(x = factor(Score), y = help_ratio,
                     fill = factor(Score))) +
  geom_boxplot()

# 19 Kernel density of the helpfulness ratio
ggplot(data = sample_data, mapping = aes(x = help_ratio)) +
  geom_density(alpha = 0.5, fill = "lightblue")

# 20 Helpfulness ratio against review length
ggplot(data = sample_data, mapping = aes(x = TextLength, y = help_ratio)) +
  geom_point(colour = "purple", alpha = 0.3)

# 21 Reviews per year (activity trend)
# Number of sampled reviews in each year, x-axis labels rotated 90 degrees.
ggplot(data = sample_data,
       mapping = aes(x = factor(year), fill = factor(year))) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# 22 Number of sampled reviews in each month
ggplot(data = sample_data, mapping = aes(x = month, fill = month)) +
  geom_bar()

# 23 Sample trend of ratings (line)
# Plots 23-25 draw up to the first 1000 rows of sample_data against their
# row position. head() never pads the result with all-NA rows when fewer
# than 1000 rows are available (indexing with 1:1000 would).
sample_small <- head(sample_data, 1000)

# Explicit position column for the x-axis. seq_len() stays correct for an
# empty data frame, where 1:nrow() would yield c(1, 0), and the axis is
# labelled "Index" instead of a raw R expression.
# NOTE(review): the x-axis is row position within the sample, not time --
# confirm that this ordering is the intended "trend".
sample_small$Index <- seq_len(nrow(sample_small))

ggplot(sample_small, aes(x = Index, y = Score)) +
  geom_line(color = "blue")

# 24 Trend of helpfulness (line)
ggplot(sample_small, aes(x = Index, y = HelpfulnessNumerator)) +
  geom_line(color = "red")

# 25 Area plot of ratings trend
ggplot(sample_small, aes(x = Index, y = Score)) +
  geom_area(fill = "lightgreen")

# 26 Review length distribution by rating (shows spread + density)
# Violin plot of review length for each rating.
ggplot(data = sample_data,
       mapping = aes(x = factor(Score), y = TextLength,
                     fill = factor(Score))) +
  geom_violin(alpha = 0.7) +
  ggtitle("Review Length Distribution by Rating") +
  xlab("Rating") +
  ylab("Text Length")

# 27.1 Correlation matrix preparation
# Keep only the four numeric columns and drop any row with a missing value.
num <- na.omit(
  sample_data[, c("Score", "HelpfulnessNumerator",
                  "HelpfulnessDenominator", "TextLength")]
)

# 27.2 Correlation matrix drawn as circles on a red-white-blue scale of
#      200 colours, with the coefficients printed in black
corrplot(
  corr = cor(num),
  method = "circle",
  col = colorRampPalette(c("red", "white", "blue"))(200),
  addCoef.col = "black"
)

# 28 Scatter with color gradient
# Rating against review length; point colour also encodes review length.
ggplot(data = sample_data,
       mapping = aes(x = TextLength, y = Score, colour = TextLength)) +
  geom_point(alpha = 0.5)

# 29 Bubble plot: helpful votes by rating, point size = review length,
#    point colour = rating
ggplot(data = sample_data,
       mapping = aes(x = Score, y = HelpfulnessNumerator,
                     size = TextLength, colour = Score)) +
  geom_point(alpha = 0.5)

# 30 Review-length histogram (30 bins), one panel per rating
ggplot(data = sample_data,
       mapping = aes(x = TextLength, fill = factor(Score))) +
  geom_histogram(bins = 30) +
  facet_wrap(facets = ~Score)