# Load the dataset from the local CSV export.
data <- read.csv(
  file = "C:\\Users\\SHREYA\\OneDrive\\Documents\\Gitstuff\\modified_dataset.csv"
)

# Append a fixed month and day ("-03-03") to each review_date value so that the
# result can be parsed by as.Date().
data[["review_date"]] <- as.Date(paste0(data[["review_date"]], "-03-03"))

# Preview the first 20 converted dates.
head(data[["review_date"]], n = 20)
## [1] "2019-03-03" "2019-03-03" "2019-03-03" "2021-03-03" "2021-03-03"
## [6] "2021-03-03" "2021-03-03" "2012-03-03" "2012-03-03" "2013-03-03"
## [11] "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03"
## [16] "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Add review_date_as_date, the review_date column passed through as.Date().
data <- mutate(data, review_date_as_date = as.Date(review_date))

# Preview the first 10 rows, including the new column.
head(data, n = 10)
## ref company_manufacturer company_location review_date
## 1 2454 5150 U.S.A. 2019-03-03
## 2 2458 5150 U.S.A. 2019-03-03
## 3 2454 5150 U.S.A. 2019-03-03
## 4 2542 5150 U.S.A. 2021-03-03
## 5 2546 5150 U.S.A. 2021-03-03
## 6 2546 5150 U.S.A. 2021-03-03
## 7 2542 5150 U.S.A. 2021-03-03
## 8 797 A. Morin France 2012-03-03
## 9 797 A. Morin France 2012-03-03
## 10 1011 A. Morin France 2013-03-03
## country_of_bean_origin specific_bean_origin_or_bar_name cocoa_percent
## 1 Tanzania Kokoa Kamili, batch 1 0.76
## 2 Dominican Republic Zorzal, batch 1 0.76
## 3 Madagascar Bejofo Estate, batch 1 0.76
## 4 Fiji Matasawalevu, batch 1 0.68
## 5 Venezuela Sur del Lago, batch 1 0.72
## 6 Uganda Semuliki Forest, batch 1 0.80
## 7 India Anamalai, batch 1 0.68
## 8 Bolivia Bolivia 0.70
## 9 Peru Peru 0.63
## 10 Panama Panama 0.70
## ingredients most_memorable_characteristics rating review_date_as_date
## 1 3- B,S,C rich cocoa, fatty, bready 3.25 2019-03-03
## 2 3- B,S,C cocoa, vegetal, savory 3.50 2019-03-03
## 3 3- B,S,C cocoa, blackberry, full body 3.75 2019-03-03
## 4 3- B,S,C chewy, off, rubbery 3.00 2021-03-03
## 5 3- B,S,C fatty, earthy, moss, nutty,chalky 3.00 2021-03-03
## 6 3- B,S,C mildly bitter, basic cocoa, fatty 3.25 2021-03-03
## 7 3- B,S,C milk brownie, macadamia,chewy 3.50 2021-03-03
## 8 4- B,S,C,L vegetal, nutty 3.50 2012-03-03
## 9 4- B,S,C,L fruity, melon, roasty 3.75 2012-03-03
## 10 4- B,S,C,L brief fruit note, earthy, nutty 2.75 2013-03-03
# Count the rows of `data` for every (review_date_as_date, cocoa_percent) pair.
# No `.groups` argument is given, so the result stays grouped by
# review_date_as_date.
grouped_data <- summarize(
  group_by(data, review_date_as_date, cocoa_percent),
  count = n()
)
## `summarise()` has grouped output by 'review_date_as_date'. You can override
## using the `.groups` argument.
# Preview the first 10 rows of the per-date, per-cocoa-percent counts.
head(grouped_data, n = 10)
## # A tibble: 10 × 3
## # Groups: review_date_as_date [1]
## review_date_as_date cocoa_percent count
## <date> <dbl> <int>
## 1 2006-03-03 0.53 1
## 2 2006-03-03 0.6 2
## 3 2006-03-03 0.61 2
## 4 2006-03-03 0.62 2
## 5 2006-03-03 0.64 5
## 6 2006-03-03 0.65 7
## 7 2006-03-03 0.66 2
## 8 2006-03-03 0.67 1
## 9 2006-03-03 0.68 2
## 10 2006-03-03 0.7 12
library(tsibble)
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
# Convert the per-date counts into a tsibble indexed by review date and keyed
# by cocoa_percent, keeping only those two columns (the count column is dropped).
tsib <- as_tsibble(
  grouped_data,
  index = "review_date_as_date",
  key = "cocoa_percent"
) %>%
  select(review_date_as_date, cocoa_percent)

# Plot the cocoa_percent values observed at each review date.
# NOTE(review): no group aesthetic is set, so geom_line() joins every point in
# x order, including the several cocoa_percent values that share one review
# date -- confirm this is the intended view.
ggplot(tsib, aes(x = review_date_as_date, y = cocoa_percent)) +
  geom_line() +
  labs(
    x = "Review Date",
    # Single-line label; the previous string literal contained a line break.
    y = "Cocoa Percent",
    title = "Cocoa Percent Over Time"
  )
A column that could be analyzed over time is the cocoa_percent column. This column represents the percentage of cocoa in each chocolate bar, which could be of particular interest for analysis over time to see if there are any trends or patterns in cocoa content.
filter the graph
library(dplyr)
library(ggplot2)
# Keep only the review dates from 2016-01-01 through 2021-12-31 (inclusive).
filtered_data <- grouped_data %>%
  filter(
    review_date_as_date >= as.Date("2016-01-01"),
    review_date_as_date <= as.Date("2021-12-31")
  )

# Rebuild the tsibble on the filtered rows: indexed by review date, keyed by
# cocoa_percent, with only those two columns kept.
tsib <- as_tsibble(
  filtered_data,
  index = "review_date_as_date",
  key = "cocoa_percent"
) %>%
  select(review_date_as_date, cocoa_percent)

# Plot the filtered series. The title states the same 2016-2021 window that the
# filter above applies.
ggplot(tsib, aes(x = review_date_as_date, y = cocoa_percent)) +
  geom_line() +
  labs(
    x = "Review Date",
    y = "Cocoa Percent",
    title = "Cocoa Percent Over Time (2016-2021)"
  )
Detect trends using linear regression
library(dplyr)
library(ggplot2)
library(broom)
# Fit a straight-line trend of cocoa_percent against the review date on the
# filtered rows.
# NOTE(review): filtered_data has one row per distinct
# (review_date_as_date, cocoa_percent) pair, so every pair counts once in this
# fit regardless of its `count` -- confirm that an unweighted fit is intended.
lm_model <- lm(
  formula = cocoa_percent ~ review_date_as_date,
  data = filtered_data
)

# Print the coefficient table and fit statistics.
summary(object = lm_model)
##
## Call:
## lm(formula = cocoa_percent ~ review_date_as_date, data = filtered_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.128354 -0.053798 -0.006076 0.039368 0.276202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.185e-01 2.015e-01 3.070 0.00276 **
## review_date_as_date 6.242e-06 1.138e-05 0.549 0.58449
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07555 on 99 degrees of freedom
## Multiple R-squared: 0.003031, Adjusted R-squared: -0.007039
## F-statistic: 0.301 on 1 and 99 DF, p-value: 0.5845
# Filtered cocoa_percent values with a fitted straight line ("lm") drawn on top,
# without a confidence band (se = FALSE).
ggplot(
  data = filtered_data,
  mapping = aes(x = review_date_as_date, y = cocoa_percent)
) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Review Date") +
  ylab("Cocoa Percent") +
  ggtitle("Linear Regression of Cocoa Percent Over Time (2016-2021)")
## `geom_smooth()` using formula = 'y ~ x'
The linear regression shows only a weak relationship between review_date_as_date and cocoa_percent. The fitted line suggests a slight increase in cocoa percent between 2016 and 2021, but the slope is not statistically significant (p = 0.58).
smoothing to detect at least one season
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Average cocoa percent per review date.
# Each row of grouped_data is one distinct (review date, cocoa percent) pair
# together with its row count, so the mean is weighted by `count`; an unweighted
# mean would average the distinct values only and ignore how often each occurs.
smoothed_data <- grouped_data %>%
  group_by(review_date_as_date) %>%
  summarise(smoothed_cocoa_percent = weighted.mean(cocoa_percent, count))

# Plot the per-date means and overlay a LOESS curve, so the plot actually shows
# the LOESS smoothing that its title announces. The formula is given explicitly
# to avoid the geom_smooth() default-formula message.
ggplot(
  smoothed_data,
  aes(x = review_date_as_date, y = smoothed_cocoa_percent)
) +
  geom_line() +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE) +
  labs(
    x = "Review Date",
    y = "Smoothed Cocoa Percent",
    title = "LOESS Smoothed Cocoa Percent Over Time"
  ) +
  theme_minimal()

# Calculate and plot the ACF and PACF of the per-date mean series.
acf_result <- acf(
  smoothed_data$smoothed_cocoa_percent,
  lag.max = 30,
  plot = TRUE
)
pacf_result <- pacf(
  smoothed_data$smoothed_cocoa_percent,
  lag.max = 30,
  plot = TRUE
)
To summarize the insights and significance of the linear regression analysis:
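Trend Detection: The linear regression analysis showed a very weak trend (coefficient estimate of 6.242e-06) in the cocoa_percent variable over time (review_date_as_date). Because dates enter the model as a number of days, this slope is per day; it corresponds to an increase of roughly 0.002 per year, i.e. about 0.2 percentage points of cocoa per year.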
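Residual Analysis: The residuals (the differences between the observed values and the predicted values from the linear regression model) are fairly small, with a median close to zero (-0.006) and a range of about -0.13 to 0.28. This summary alone does not show whether they are randomly distributed; a residual plot would be needed to confirm that the linear regression model is not missing any major patterns in the data.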
Statistical Significance: However, the p-value for the coefficient estimate of review_date_as_date is 0.58449, which is greater than the typical significance level of 0.05, so the estimated trend is not statistically significant.
Further Questions: Further investigation could explore other potential patterns or relationships in the data. For example, it might be interesting to examine seasonal patterns in cocoa percentages or to investigate whether certain manufacturers or countries have different trends over time.