This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
# Load the required packages at the start of every session
library(dplyr)
library(ggplot2)
library(tidyr)
library(car)
library(corrplot)
library(arm)
library(MASS)
library(cowplot)
# Load the raw receiving statistics and keep only the columns used below.
# The file is read through its full path instead of calling setwd(), so the
# session's working directory is never changed as a side effect of loading.
data_dir <- "C:/Users/amern/OneDrive/Binghamton/DIda 380/Data"
footdata <- read.csv(file.path(data_dir, "Data_Proj_1.csv"))

# dplyr::select is namespaced explicitly because MASS (attached after dplyr)
# also exports a function called select().
datasmall <- footdata %>%
  dplyr::select(GP, REC, TGTS, YDS, AVG, TD, LNG, YDS.G, YAC)

# Preview the first rows of the reduced dataset.
head(datasmall)
# Yards (YDS) will be my dependent variable, and I will observe whether the other statistics affect it; i.e., the other variables will be my independent variables.
# Summary of the smaller dataset: for each number of games played (GP),
# the mean of every other statistic, ignoring missing values.
# na.rm is spelled TRUE rather than T because T is an ordinary variable that
# can be reassigned.
# NOTE: the result keeps the name `summary`. Calls such as summary(model) still
# reach the summary() function, because R skips non-function objects when it
# looks up a name in call position.
summary <- datasmall %>%
  group_by(GP) %>%
  summarise(
    avg_rec = mean(REC, na.rm = TRUE),
    avg_tgts = mean(TGTS, na.rm = TRUE),
    avg_yds.g = mean(YDS.G, na.rm = TRUE),
    avg_avg = mean(AVG, na.rm = TRUE),
    avg_td = mean(TD, na.rm = TRUE),
    avg_lng = mean(LNG, na.rm = TRUE),
    avg_yds = mean(YDS, na.rm = TRUE),
    avg_yac = mean(YAC, na.rm = TRUE)
  )
# Histograms of every statistic except GP, one facet per variable.
# GP is dropped by name rather than by column position (2:9), so the selection
# does not silently change if the column order of datasmall changes.
histdata <- datasmall %>% dplyr::select(-GP)

# pivot_longer() replaces the superseded gather(). names_to/values_to keep the
# `key` and `value` column names that the aesthetic and the facets refer to.
histdata %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 30, fill = "tomato") +
  facet_wrap(~key, scales = "free_x") +
  theme_dark()
# Pairwise correlations between the columns of histdata, computed only from
# rows that have no missing values.
corrdata <- histdata %>%
  cor(use = "complete.obs")

# Draw the correlation matrix with the corrplot function, using circles.
corrplot::corrplot(
  corrdata,
  method = "circle"
)
# ggplot scatter plot of yards (YDS) against receptions (REC), overlaid with
# a fitted linear-model smoother.
datasmall %>%
  ggplot(aes(REC, YDS)) +
  geom_point() +
  geom_smooth(method = "lm")
# Regression model; its diagnostic graphs follow further below.
# Fit the dependent variable (YDS) on all of the independent variables at
# once, then print the coefficient table and the overall fit statistics.
model <- lm(
  YDS ~ GP + REC + TGTS + AVG + TD + LNG + YDS.G + YAC,
  data = datasmall
)
summary(model)
Call:
lm(formula = YDS ~ GP + REC + TGTS + AVG + TD + LNG + YDS.G +
YAC, data = datasmall)
Residuals:
Min 1Q Median 3Q Max
-113.317 -14.484 -0.312 14.128 64.001
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.704e+02 3.470e+01 -25.083 < 2e-16 ***
GP 4.031e+01 3.282e+00 12.282 < 2e-16 ***
REC 3.239e+00 7.914e-01 4.093 9.2e-05 ***
TGTS 6.030e-01 3.566e-01 1.691 0.0943 .
AVG 1.626e+01 2.692e+00 6.041 3.3e-08 ***
TD 1.154e+00 1.314e+00 0.879 0.3820
LNG 1.441e-01 2.494e-01 0.578 0.5648
YDS.G 1.066e+01 7.263e-01 14.674 < 2e-16 ***
YAC 1.061e-03 3.237e-02 0.033 0.9739
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 29.45 on 91 degrees of freedom
Multiple R-squared: 0.9886, Adjusted R-squared: 0.9876
F-statistic: 988 on 8 and 91 DF, p-value: < 2.2e-16
# Standard diagnostic plots for the fitted linear model.
plot(model)

# Residuals plotted against each predictor, with a curvature test per term.
# The steps are right: look for predictors whose residuals veer off from a
# flat trend and see whether a transformation of them helps.
car::residualPlots(model)
Test stat Pr(>|Test stat|)
GP -3.9992 0.0001301 ***
REC -0.6510 0.5167104
TGTS -0.0169 0.9865870
AVG -3.6997 0.0003712 ***
TD 2.7448 0.0073086 **
LNG 1.9345 0.0561973 .
YDS.G -1.4337 0.1551258
YAC 0.4314 0.6672202
Tukey test 4.6013 4.198e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Variance inflation factor for every predictor in the model.
car::vif(model)
GP REC TGTS AVG TD LNG YDS.G YAC
5.419003 32.686844 12.995664 5.863733 1.768398 1.813364 18.206146 2.770292
# View histograms of candidate transformations of AVG and LNG together.

# Build one 30-bin histogram of datasmall for the given aesthetic mapping,
# filled with the given colour, on the dark theme.
transform_hist <- function(mapping, fill) {
  ggplot(datasmall, mapping) +
    geom_histogram(bins = 30, fill = fill) +
    theme_dark()
}

# The plot objects get descriptive names. The earlier single-letter names
# included `c`, which shadows base::c() whenever `c` is used as a value
# rather than called directly.
hist_log_avg <- transform_hist(aes(log(AVG)), "skyblue")
hist_log_lng <- transform_hist(aes(log(LNG)), "skyblue")
hist_inv_sqrt_avg <- transform_hist(aes(1 / sqrt(AVG)), "magenta")
hist_sqrt_lng <- transform_hist(aes(sqrt(LNG)), "magenta")

# Arrange the four histograms in a 2 x 2 grid labelled A-D.
plot_grid(
  hist_log_avg, hist_log_lng, hist_inv_sqrt_avg, hist_sqrt_lng,
  nrow = 2,
  labels = c("A", "B", "C", "D")
)
# Refit the same predictors against log(YDS) to see whether the log
# transformation improves the residuals-vs-fitted plot.
log_model <- lm(
  log(YDS) ~ GP + REC + TGTS + AVG + TD + LNG + YDS.G + YAC,
  data = datasmall
)

# which = 1 requests only the residuals-vs-fitted panel.
plot(log_model, which = 1)
``` Works Cited “NFL Conference Receiving Stat Leaders, 2024 Regular Season - ESPN.” ESPN, 2024,