LA-2

Author

Jaishree (1NT23IS088) and Chinmayi (1NT23IS063)

Course Name: Exploratory Data Analysis

Course Code: 22ISE644

Academic Year: 2025–26, 6th semester

Team Name: The Outliers

Team Number: 33

Team member 1: Jaishree M (1NT23IS088)

Team member 2: Chinmayi (1NT23IS063)

library(ggplot2)

Warning: package 'ggplot2' was built under R version 4.5.3

library(corrplot)

Warning: package 'corrplot' was built under R version 4.5.3

corrplot 0.95 loaded

# Load the raw review table from disk; string columns stay character
# vectors instead of being converted to factors.
data <- read.csv(file = "foodreview.csv", stringsAsFactors = FALSE)

# Fix Time: coerce the column to numeric. Entries that cannot be parsed as
# numbers become NA (R reports this with a coercion warning).
data[["Time"]] <- as.numeric(data[["Time"]])

Warning: NAs introduced by coercion

# Keep only the rows whose Time value is not NA.
data <- data[!is.na(data$Time), ]

# Interpret Time as seconds since 1970-01-01. The time zone is pinned to UTC
# so that anything later derived by formatting this column (e.g. year and
# month) does not depend on the time zone of the machine running the script.
data$Time <- as.POSIXct(data$Time, origin = "1970-01-01", tz = "UTC")

# Fix Score: coerce to numeric; unparseable entries become NA.
data$Score <- as.numeric(data$Score)

Warning: NAs introduced by coercion

# Keep only the rows whose Score is not NA.
data <- data[!is.na(data$Score), ]

# Create features: year and month of each review, as character strings
data$year <- format(data$Time, "%Y")
data$month <- format(data$Time, "%m")

# Create TextLength: number of characters in the review text
data$TextLength <- nchar(data$Text)

# Remove unnecessary columns (drop = FALSE keeps the result a data frame
# even if only a single column were to remain)
drop_cols <- c("Id", "ProductId", "UserId", "ProfileName")
data <- data[, !(names(data) %in% drop_cols), drop = FALSE]

# Coerce the vote counts to numeric BEFORE filtering on them. If either
# column was read as character, `< 50` and `!= 0` would be evaluated as
# string comparisons (character ordering, not numeric ordering), so the
# outlier filter would keep and drop the wrong rows.
data$HelpfulnessNumerator <- as.numeric(data$HelpfulnessNumerator)
data$HelpfulnessDenominator <- as.numeric(data$HelpfulnessDenominator)

# Remove outliers. which() discards rows where a condition is NA instead of
# turning them into all-NA rows in the result.
keep <- data$TextLength < 1000 &
  data$HelpfulnessNumerator < 50 &
  data$HelpfulnessDenominator != 0
data <- data[which(keep), ]

# Remove NA
data <- na.omit(data)

# Draw a reproducible random sample of up to 5000 rows. The size is capped
# at nrow(data) because sampling without replacement errors when more rows
# are requested than exist.
set.seed(123)
sample_size <- min(5000L, nrow(data))
sample_data <- data[sample.int(nrow(data), sample_size), ]

# HelpfulnessNumerator / HelpfulnessDenominator are already numeric here;
# make sure the remaining analysis columns are plain numeric as well.
sample_data$Score <- as.numeric(sample_data$Score)
sample_data$TextLength <- as.numeric(sample_data$TextLength)

Warning: NAs introduced by coercion

# Keep only rows where every analysis column holds a finite value
# (is.finite() is FALSE for NA, NaN, Inf and -Inf).
numeric_cols <- c("Score", "TextLength",
                  "HelpfulnessNumerator", "HelpfulnessDenominator")
all_finite <- Reduce(`&`, lapply(sample_data[numeric_cols], is.finite))
sample_data <- sample_data[all_finite, ]

# Keep valid ratings only: Score must lie in the closed range [1, 5]
score_in_range <- sample_data$Score >= 1 & sample_data$Score <= 5
sample_data <- sample_data[score_in_range, ]

# 1 Distribution of ratings (overall sentiment)
# Bar chart of the number of sampled reviews at each Score value.
ggplot(sample_data, aes(x = factor(Score))) +
  geom_bar(fill = "skyblue", color = "black")

# 2 Percentage of ratings (color categories)
# Bar height is each rating's share of all sampled reviews: after_stat()
# divides every bar's count by the total count, and the y axis is labelled
# in percent so the chart matches its title.
ggplot(sample_data, aes(x = factor(Score),
                        y = after_stat(count / sum(count)),
                        fill = factor(Score))) +
  geom_bar() +
  scale_y_continuous(labels = function(p) paste0(round(100 * p), "%")) +
  scale_fill_brewer(palette = "Set2") +
  labs(y = "Percentage of reviews")

# 3 Density of ratings (smooth distribution)
# Kernel density estimate of Score drawn as a semi-transparent filled area.
ggplot(sample_data, aes(x = Score)) +
  geom_density(fill = "lightgreen", alpha = 0.5)

# 4 Ratings by month (seasonal trend)
# One bar per month value; height = number of sampled reviews in that month.
ggplot(data = sample_data, mapping = aes(x = month, fill = month)) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired")

# 5 Ratings by year (trend over time)
# One bar per year; x tick labels are rotated 90 degrees.
ggplot(data = sample_data,
       mapping = aes(x = factor(year), fill = factor(year))) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# 6 Distribution of review length (short vs long)
# Histogram of TextLength using 30 bins.
ggplot(data = sample_data, mapping = aes(x = TextLength)) +
  geom_histogram(fill = "purple", bins = 30)

# 7 Review length vs rating (relationship)
# Box plot of TextLength for each Score value.
ggplot(data = sample_data,
       mapping = aes(x = factor(Score), y = TextLength,
                     fill = factor(Score))) +
  geom_boxplot()

# 8 Density of review length (spread of text size)
ggplot(data = sample_data, mapping = aes(x = TextLength)) +
  geom_density(fill = "orange", alpha = 0.5)

# 9 Review length vs helpfulness (scatter relation)
# Semi-transparent points so that overlapping observations remain visible.
ggplot(data = sample_data,
       mapping = aes(x = TextLength, y = HelpfulnessNumerator)) +
  geom_point(color = "blue", alpha = 0.3)

# 10 Trend: length vs helpfulness (linear trend)
# Same scatter as plot 9, with a fitted linear-model line overlaid in red.
ggplot(data = sample_data,
       mapping = aes(x = TextLength, y = HelpfulnessNumerator)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", color = "red")

`geom_smooth()` using formula = 'y ~ x'

# 11 Distribution of helpful votes (limited range for clarity)
# Only rows with HelpfulnessNumerator below 20 are plotted; 20 bins.
ggplot(data = sample_data[sample_data$HelpfulnessNumerator < 20, ],
       mapping = aes(x = HelpfulnessNumerator)) +
  geom_histogram(fill = "cyan", bins = 20)

# 12 Helpfulness vs rating (usefulness by rating)
# Box plot of HelpfulnessNumerator for each Score value.
ggplot(data = sample_data,
       mapping = aes(x = factor(Score), y = HelpfulnessNumerator,
                     fill = factor(Score))) +
  geom_boxplot()

# 13 Distribution of helpfulness denominator (limited range)
# Only rows with HelpfulnessDenominator below 20 are plotted; 20 bins.
ggplot(data = sample_data[sample_data$HelpfulnessDenominator < 20, ],
       mapping = aes(x = HelpfulnessDenominator)) +
  geom_histogram(fill = "gold", bins = 20)

# 14 Numerator vs denominator (relationship)
ggplot(data = sample_data,
       mapping = aes(x = HelpfulnessDenominator, y = HelpfulnessNumerator)) +
  geom_point(color = "darkgreen", alpha = 0.3)

# 15 Smooth helpfulness relation (linear fit)
# Same scatter as plot 14, with a fitted linear-model line overlaid in blue.
ggplot(data = sample_data,
       mapping = aes(x = HelpfulnessDenominator, y = HelpfulnessNumerator)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", color = "blue")

`geom_smooth()` using formula = 'y ~ x'

# 16 Ratings distribution across months (shows seasonal sentiment pattern)
# Stacked bars: one bar per month, split by Score.
ggplot(data = sample_data, mapping = aes(x = month, fill = factor(Score))) +
  geom_bar(position = "stack") +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Ratings Distribution by Month",
    x = "Month",
    y = "Count",
    fill = "Rating"
  )

# 17.1 Create helpfulness ratio
# Ratio of HelpfulnessNumerator to HelpfulnessDenominator for every row.
sample_data$help_ratio <- with(
  sample_data,
  HelpfulnessNumerator / HelpfulnessDenominator
)

# 17.2 Distribution of helpfulness ratio
ggplot(data = sample_data, mapping = aes(x = help_ratio)) +
  geom_histogram(fill = "pink", bins = 30)

# 18 Helpfulness ratio vs rating
# Box plot of help_ratio for each Score value.
ggplot(data = sample_data,
       mapping = aes(x = factor(Score), y = help_ratio,
                     fill = factor(Score))) +
  geom_boxplot()

# 19 Density of helpfulness ratio
ggplot(data = sample_data, mapping = aes(x = help_ratio)) +
  geom_density(fill = "lightblue", alpha = 0.5)

# 20 Ratio vs review length
ggplot(data = sample_data, mapping = aes(x = TextLength, y = help_ratio)) +
  geom_point(alpha = 0.3, color = "purple")

# 21 Reviews per year (activity trend)
# Count of sampled reviews per year; x tick labels rotated 90 degrees.
ggplot(data = sample_data,
       mapping = aes(x = factor(year), fill = factor(year))) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# 22 Reviews per month (seasonal activity)
# Count of sampled reviews per month, using the default fill palette.
ggplot(data = sample_data, mapping = aes(x = month, fill = month)) +
  geom_bar()

# 23 Sample trend of ratings (line)
# Take at most the first 1000 sampled rows. head() never returns more rows
# than exist, whereas indexing with 1:1000 pads the result with all-NA rows
# when fewer than 1000 rows are available.
sample_small <- head(sample_data, 1000)
# Explicit row position for the x axis; seq_len() is safe for zero rows
# (1:nrow() would yield c(1, 0)).
sample_small$row_index <- seq_len(nrow(sample_small))

ggplot(sample_small, aes(x = row_index, y = Score)) +
  geom_line(color = "blue") +
  labs(x = "Row index")

# 24 Trend of helpfulness (line)
ggplot(sample_small, aes(x = row_index, y = HelpfulnessNumerator)) +
  geom_line(color = "red") +
  labs(x = "Row index")

# 25 Area plot of ratings trend
ggplot(sample_small, aes(x = row_index, y = Score)) +
  geom_area(fill = "lightgreen") +
  labs(x = "Row index")

# 26 Review length distribution by rating (shows spread + density)
# Violin plot of TextLength for each Score value.
ggplot(data = sample_data,
       mapping = aes(x = factor(Score), y = TextLength,
                     fill = factor(Score))) +
  geom_violin(alpha = 0.7) +
  labs(
    title = "Review Length Distribution by Rating",
    x = "Rating",
    y = "Text Length"
  )

# 27.1 Correlation matrix preparation
# Select the four numeric analysis columns and drop any row containing NA.
num <- na.omit(
  sample_data[, c("Score", "HelpfulnessNumerator",
                  "HelpfulnessDenominator", "TextLength")]
)

# 27.2 Correlation plot (colorful)
# Circles on a 200-step red-white-blue ramp, with coefficients in black.
corrplot(
  corr = cor(num),
  method = "circle",
  col = colorRampPalette(c("red", "white", "blue"))(200),
  addCoef.col = "black"
)

# 28 Scatter with color gradient
# Point color follows TextLength (continuous gradient).
ggplot(data = sample_data,
       mapping = aes(x = TextLength, y = Score, color = TextLength)) +
  geom_point(alpha = 0.5)

# 29 Bubble plot (size = review length)
# Point size follows TextLength; point color follows Score.
ggplot(data = sample_data,
       mapping = aes(x = Score, y = HelpfulnessNumerator,
                     size = TextLength, color = Score)) +
  geom_point(alpha = 0.5)

# 30 Facet histogram by rating
# One panel per Score value, each a 30-bin histogram of TextLength.
ggplot(data = sample_data,
       mapping = aes(x = TextLength, fill = factor(Score))) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(Score))