# Load the dataset from its CSV file (absolute path on the author's machine).
data <- read.csv(
  file = "C:\\Users\\SHREYA\\OneDrive\\Documents\\Gitstuff\\modified_dataset.csv"
)

Convert the review_date column into a Date in R

# Append a fixed month and day ("-03-03") to each review_date value so that it
# can be parsed as a full calendar date.
data[["review_date"]] <- as.Date(
  paste(data[["review_date"]], "03-03", sep = "-")
)

# Preview the first 20 converted dates.
head(data[["review_date"]], n = 20)
##  [1] "2019-03-03" "2019-03-03" "2019-03-03" "2021-03-03" "2021-03-03"
##  [6] "2021-03-03" "2021-03-03" "2012-03-03" "2012-03-03" "2013-03-03"
## [11] "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03"
## [16] "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03" "2013-03-03"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Add a Date-typed copy of review_date under a separate column name; the
# later grouping, filtering and plotting steps use this column.
data <- mutate(data, review_date_as_date = as.Date(review_date))

# Show the first ten rows, including the new column.
head(data, n = 10)
##     ref company_manufacturer company_location review_date
## 1  2454                 5150           U.S.A.  2019-03-03
## 2  2458                 5150           U.S.A.  2019-03-03
## 3  2454                 5150           U.S.A.  2019-03-03
## 4  2542                 5150           U.S.A.  2021-03-03
## 5  2546                 5150           U.S.A.  2021-03-03
## 6  2546                 5150           U.S.A.  2021-03-03
## 7  2542                 5150           U.S.A.  2021-03-03
## 8   797             A. Morin           France  2012-03-03
## 9   797             A. Morin           France  2012-03-03
## 10 1011             A. Morin           France  2013-03-03
##    country_of_bean_origin specific_bean_origin_or_bar_name cocoa_percent
## 1                Tanzania            Kokoa Kamili, batch 1          0.76
## 2      Dominican Republic                  Zorzal, batch 1          0.76
## 3              Madagascar           Bejofo Estate, batch 1          0.76
## 4                    Fiji            Matasawalevu, batch 1          0.68
## 5               Venezuela            Sur del Lago, batch 1          0.72
## 6                  Uganda         Semuliki Forest, batch 1          0.80
## 7                   India                Anamalai, batch 1          0.68
## 8                 Bolivia                          Bolivia          0.70
## 9                    Peru                             Peru          0.63
## 10                 Panama                           Panama          0.70
##    ingredients    most_memorable_characteristics rating review_date_as_date
## 1     3- B,S,C         rich cocoa, fatty, bready   3.25          2019-03-03
## 2     3- B,S,C            cocoa, vegetal, savory   3.50          2019-03-03
## 3     3- B,S,C      cocoa, blackberry, full body   3.75          2019-03-03
## 4     3- B,S,C               chewy, off, rubbery   3.00          2021-03-03
## 5     3- B,S,C fatty, earthy, moss, nutty,chalky   3.00          2021-03-03
## 6     3- B,S,C mildly bitter, basic cocoa, fatty   3.25          2021-03-03
## 7     3- B,S,C     milk brownie, macadamia,chewy   3.50          2021-03-03
## 8   4- B,S,C,L                    vegetal, nutty   3.50          2012-03-03
## 9   4- B,S,C,L             fruity, melon, roasty   3.75          2012-03-03
## 10  4- B,S,C,L   brief fruit note, earthy, nutty   2.75          2013-03-03
# Count the rows for every (review date, cocoa percent) combination.
grouped_data <- summarise(
  group_by(data, review_date_as_date, cocoa_percent),
  count = n()
)
## `summarise()` has grouped output by 'review_date_as_date'. You can override
## using the `.groups` argument.
# Preview the first ten aggregated rows.
head(grouped_data, n = 10)
## # A tibble: 10 × 3
## # Groups:   review_date_as_date [1]
##    review_date_as_date cocoa_percent count
##    <date>                      <dbl> <int>
##  1 2006-03-03                   0.53     1
##  2 2006-03-03                   0.6      2
##  3 2006-03-03                   0.61     2
##  4 2006-03-03                   0.62     2
##  5 2006-03-03                   0.64     5
##  6 2006-03-03                   0.65     7
##  7 2006-03-03                   0.66     2
##  8 2006-03-03                   0.67     1
##  9 2006-03-03                   0.68     2
## 10 2006-03-03                   0.7     12

Create a tsibble object

library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(ggplot2)

# Convert the grouped counts to a tsibble indexed by review date and keyed by
# cocoa percent, then keep only those two columns (the count is dropped).
tsib <- grouped_data %>%
  as_tsibble(key = "cocoa_percent", index = "review_date_as_date") %>%
  select(review_date_as_date, cocoa_percent)

# Line plot of cocoa percent against review date for the full tsibble.
# The y-axis label is a clean single-line string with no embedded line break
# or trailing spaces, matching the labels used by the other plots.
ggplot(tsib, aes(x = review_date_as_date, y = cocoa_percent)) +
  geom_line() +
  labs(
    x = "Review Date",
    y = "Cocoa Percent",
    title = "Cocoa Percent Over Time"
  )

A column that could be analyzed over time is the cocoa_percent column. This column represents the percentage of cocoa in each chocolate bar, which could be of particular interest for analysis over time to see if there are any trends or patterns in cocoa content.

Filter the graph to review dates from 2016 through 2021

library(dplyr)
library(ggplot2)

# Keep only the grouped rows whose review date lies between 2016-01-01 and
# 2021-12-31, both ends inclusive.
filtered_data <- filter(
  grouped_data,
  review_date_as_date >= as.Date("2016-01-01"),
  review_date_as_date <= as.Date("2021-12-31")
)

# Rebuild the tsibble from the filtered rows, again keeping only the index
# (review date) and key (cocoa percent) columns.
tsib <- filtered_data %>%
  as_tsibble(key = "cocoa_percent", index = "review_date_as_date") %>%
  select(review_date_as_date, cocoa_percent)

# Line plot of cocoa percent against review date for the filtered tsibble.
# The title states 2016-2021 because the filter keeps dates up to 2021-12-31.
ggplot(tsib, aes(x = review_date_as_date, y = cocoa_percent)) +
  geom_line() +
  labs(
    x = "Review Date",
    y = "Cocoa Percent",
    title = "Cocoa Percent Over Time (2016-2021)"
  )

Detect trends using linear regression

library(dplyr)
library(ggplot2)
library(broom)

# Fit a simple linear regression of cocoa percent on review date using the
# filtered rows. The Date predictor is handled numerically (days), so the
# slope is a change in cocoa percent per day.
lm_model <- lm(
  formula = cocoa_percent ~ review_date_as_date,
  data = filtered_data
)

# Print the residual summary, coefficient table and fit statistics.
summary(lm_model)
## 
## Call:
## lm(formula = cocoa_percent ~ review_date_as_date, data = filtered_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.128354 -0.053798 -0.006076  0.039368  0.276202 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         6.185e-01  2.015e-01   3.070  0.00276 **
## review_date_as_date 6.242e-06  1.138e-05   0.549  0.58449   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07555 on 99 degrees of freedom
## Multiple R-squared:  0.003031,   Adjusted R-squared:  -0.007039 
## F-statistic: 0.301 on 1 and 99 DF,  p-value: 0.5845
# Plot the filtered series as a line and overlay a straight-line (lm) fit
# without a confidence band.
ggplot(filtered_data, aes(x = review_date_as_date, y = cocoa_percent)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Review Date") +
  ylab("Cocoa Percent") +
  ggtitle("Linear Regression of Cocoa Percent Over Time (2016-2021)")
## `geom_smooth()` using formula = 'y ~ x'

The linear regression relationship between review_date_as_date and cocoa_percent is weak. Still, the fitted line shows a slight increase in cocoa percent between 2016 and 2021.

Smoothing to detect at least one season

library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Smooth the data by averaging cocoa percent per review date. This is a
# per-date mean, not a LOESS fit.
# grouped_data holds one row per (review date, cocoa percent) pair together
# with a row count, so each cocoa value is weighted by its count; an
# unweighted mean would average the distinct values only and ignore how many
# rows share each one.
smoothed_data <- grouped_data %>%
  group_by(review_date_as_date) %>%
  summarise(
    smoothed_cocoa_percent = weighted.mean(cocoa_percent, w = count)
  )

# Plot the per-date mean cocoa percent held in smoothed_data.
# The series is a per-date average (no LOESS fit is computed), and the title
# says so.
ggplot(smoothed_data, aes(x = review_date_as_date, y = smoothed_cocoa_percent)) +
  geom_line() +
  labs(
    x = "Review Date",
    y = "Smoothed Cocoa Percent",
    title = "Mean Cocoa Percent per Review Date Over Time"
  ) +
  theme_minimal()

# Compute and draw the autocorrelation function of the per-date series,
# requesting up to 30 lags.
acf_result <- acf(
  x = smoothed_data$smoothed_cocoa_percent,
  lag.max = 30,
  plot = TRUE
)

# Same for the partial autocorrelation function.
pacf_result <- pacf(
  x = smoothed_data$smoothed_cocoa_percent,
  lag.max = 30,
  plot = TRUE
)

To summarize the insights and significance of the linear regression analysis:

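  1. Trend Detection: The linear regression analysis showed a very weak trend (coefficient estimate of 6.242e-06) in the cocoa_percent variable over time (review_date_as_date). Because the date is measured in days, this slope corresponds to an increase of only about 0.002 in cocoa_percent (roughly 0.2 percentage points) per year.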

  2. Residual Analysis: The residuals (the differences between the observed values and the predicted values from the linear regression model) appear to be fairly small and randomly distributed, indicating that the linear regression model is not missing any major patterns in the data.

  3. Statistical Significance: However, the p-value for the coefficient estimate of review_date_as_date is 0.58449, which is greater than the typical significance level of 0.05, so the estimated trend is not statistically significant.

  4. Further Questions: Further investigation could explore other potential patterns or relationships in the data. For example, it might be interesting to examine seasonal patterns in cocoa percentages or to investigate whether certain manufacturers or countries have different trends over time.