Loading Libraries

library(car)
library(mosaic)
library(pander)
library(tidyverse)
library(readxl)
library(janitor)
library(lubridate)

Filtering the data: restricting the date range to August onwards and removing three outliers

# Read the raw Wordle results and normalise the column names to snake_case.
wordle_raw <- read_csv("Problem_C_Data_Wordle.csv") %>%
  clean_names()

# Row filters are positional and applied in two passes, so the second set of
# indices refers to the table *after* the first trim:
#   1. drop rows 154-359
#      (presumably the pre-August observations -- NOTE(review): this relies
#      on the CSV's row order; confirm against the file);
#   2. drop rows 7, 32 and 109 of what remains (the rows treated as outliers).
# Finally, parse the month/day/year date strings into Date values.
wordle_data <- wordle_raw %>%
  slice(-(154:359)) %>%
  slice(-c(7, 32, 109)) %>%
  mutate(date = mdy(date))

Linear Regression Model

# Simple linear regression: reported result count as a function of date.
word.lm <- lm(
  number_of_reported_results ~ date,
  data = wordle_data
)

# Render the coefficient table and fit statistics as a formatted table.
word_lm_summary <- summary(word.lm)
pander(word_lm_summary)
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 2004661 61357 32.67 1.488e-69
date -102.5 3.182 -32.2 9.936e-69
Fitting linear model: number_of_reported_results ~ date
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
150 1715 0.8751 0.8742

Assessing Linear Regression Validity

# Lay the three diagnostic plots out side by side. par() returns the previous
# settings, which are kept so the layout can be restored once plotting is
# done instead of leaking into later base-graphics plots.
old_par <- par(mfrow = c(1, 3))

# Panels 1-2: residuals vs fitted values, and the normal Q-Q plot.
plot(word.lm, which = 1:2)

# Panel 3: residuals plotted against their row index, to look for patterns
# across the order of the observations.
plot(residuals(word.lm), ylab = "Residuals")
mtext("Residuals vs Order", side = 3)

# Restore the graphics settings that were in effect before this section.
par(old_par)

Prediction Interval

# Date to forecast for; the column name must match the predictor used when
# fitting word.lm.
forecast_day <- data.frame(date = as.Date("2023-03-01"))

# Point forecast plus lower/upper prediction-interval bounds (predict.lm's
# default level is 0.95), rendered as a formatted table.
forecast <- predict(word.lm, newdata = forecast_day, interval = "prediction")
pander(forecast)
fit lwr upr
15108 11600 18615

Linear Regression Graph

# Scatter plot of reported results over time with the fitted least-squares
# line overlaid (no confidence band).
trend_mapping <- aes(x = date, y = number_of_reported_results)

trend_plot <- ggplot(data = wordle_data, mapping = trend_mapping) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  theme_bw()

trend_plot