packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
loading data
# Read the WNBA team Elo ratings CSV straight from the fivethirtyeight
# GitHub repository into a data frame.
url <- "https://raw.githubusercontent.com/fivethirtyeight/WNBA-stats/master/wnba-team-elo-ratings.csv"
data <- read.csv(file = url)
scatter plot
# Derive, for every row of `data`, the Elo gap (elo1_pre - elo2_pre) and the
# score gap (score1 - score2). transmute() is the row-wise verb that keeps
# only the new columns; summarize() is meant to collapse rows, and returning
# one row per input row from it is deprecated in dplyr >= 1.1.0.
elo_score <- data %>%
  transmute(
    elo_diff = elo1_pre - elo2_pre,
    score_diff = score1 - score2
  )

# Scatter plot of score gap against Elo gap with a fitted least-squares line.
elo_score %>%
  ggplot(aes(x = elo_diff, y = score_diff)) +
  geom_point() +
  labs(title = "Elo vs Score") +
  geom_smooth(method = "lm", formula = y ~ x)

There appears to be a weak positive linear trend between the Elo difference and the score difference.
# Correlation (cor() default: Pearson) between the Elo gap and the score gap,
# reported as a single string.
correlation <- cor(elo_score[["elo_diff"]], elo_score[["score_diff"]])
paste("Correlation is", correlation)
## [1] "Correlation is 0.350828195335071"
linear model
The model has an R-squared of 0.123, meaning the Elo difference explains only about 12% of the variance in the score difference, so it is a weak fit.
# Ordinary least-squares fit of the score gap on the Elo gap, followed by the
# standard coefficient / fit summary.
# NOTE(review): the residual quantiles in the output are exactly symmetric and
# the intercept is ~0, which suggests each game appears twice in the data
# (once from each team's perspective). If so, the sample size is doubled and
# the standard errors are understated -- confirm against the data source.
model1 <- lm(formula = score_diff ~ elo_diff, data = elo_score)
summary(model1)
##
## Call:
## lm(formula = score_diff ~ elo_diff, data = elo_score)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.315 -8.352 0.000 8.352 47.315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.641e-18 1.189e-01 0.00 1
## elo_diff 3.806e-02 9.921e-04 38.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.18 on 10486 degrees of freedom
## Multiple R-squared: 0.1231, Adjusted R-squared: 0.123
## F-statistic: 1472 on 1 and 10486 DF, p-value: < 2.2e-16
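Conclusion: Although the Elo difference is significantly and positively correlated with the score difference (r ≈ 0.35), a linear model on Elo difference alone is a weak predictor of the score margin: it explains only about 12% of the variance, and the residual standard error is about 12 points.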