Overview

The following data set contains predictions for every game of the 2021-2022 NBA season. For each game it lists both teams, the Elo-based probability of each team winning, and the teams' Elo ratings before and after the game. Elo is a measure of team strength that is updated on a game-by-game basis and takes into account the final score of a game, where it was played, and the result. A description of the Elo rating system can be found here: https://fivethirtyeight.com/features/how-we-calculate-nba-elo-ratings/.

1. Load libraries and import data

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readr)
library(curl)

## Using libcurl 7.64.1 with LibreSSL/2.8.3

## 
## Attaching package: 'curl'

## The following object is masked from 'package:readr':
## 
##     parse_date

library(ggplot2)
library(dplyr)

# Download the season-long Elo forecast file and parse it into a data frame.
nba_elo_url <- "https://raw.githubusercontent.com/brsingh7/DATA607/main/nba_elo_latest.csv"
nba_2022 <- read.csv(curl(nba_elo_url))

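2. Extract the subset of New York Knicks games for analysis and rename the columns. Convert the game date to a Date field. Note that the Elo_Home and Elo_Away columns hold the Elo-based win probabilities (elo_prob1 and elo_prob2) of the home and away teams, not the Elo ratings themselves.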

# Keep only games in which the Knicks are the home (team1) or away (team2)
# side. which() drops rows where the comparison is NA, so no all-NA rows are
# introduced, and the original row names are retained.
knicks_rows <- which(nba_2022$team1 == "NYK" | nba_2022$team2 == "NYK")
kept_columns <- c(
  "date", "team1", "team2", "elo_prob1", "elo_prob2", "score1", "score2"
)
nyk_2022 <- nba_2022[knicks_rows, kept_columns, drop = FALSE]

# Give the columns descriptive names (same order as kept_columns).
names(nyk_2022) <- c(
  "Date_of_Game", "Home_Team", "Away_Team", "Elo_Home", "Elo_Away",
  "Home_Team_Score", "Away_Team_Score"
)

# Parse the game date so it can be compared and plotted as a Date.
nyk_2022[["Date_of_Game"]] <- as.Date(nyk_2022[["Date_of_Game"]])

# Preview the first ten Knicks games.
head(nyk_2022, 10)

##     Date_of_Game Home_Team Away_Team  Elo_Home  Elo_Away Home_Team_Score
## 6     2021-10-20       NYK       BOS 0.7009303 0.2990697             138
## 18    2021-10-22       ORL       NYK 0.3248790 0.6751210              96
## 36    2021-10-24       NYK       ORL 0.8821465 0.1178535             104
## 50    2021-10-26       NYK       PHI 0.5371938 0.4628062             112
## 68    2021-10-28       CHI       NYK 0.5956166 0.4043834             103
## 79    2021-10-30       NOP       NYK 0.4897252 0.5102748             117
## 97    2021-11-01       NYK       TOR 0.7519696 0.2480304             104
## 112   2021-11-03       IND       NYK 0.5272967 0.4727033             111
## 127   2021-11-05       MIL       NYK 0.7286528 0.2713472              98
## 143   2021-11-07       NYK       CLE 0.7951675 0.2048325             109
##     Away_Team_Score
## 6               134
## 18              121
## 36              110
## 50               99
## 68              104
## 79              123
## 97              113
## 112              98
## 127             113
## 143             126

3. Add a column to determine the winning team.

# Append an empty character column that will hold each game's winner.
nyk_2022 <- add_column(nyk_2022, Winning_Team = "")

4. Populate the new column. For games played through 2/5/2022, record the actual winning team; for games not yet played, record the predicted winner, i.e. the team with the higher Elo win probability.

# Determine the winner of every Knicks game.
#
# Games dated before the cutoff that have a final score use the actual result;
# all other games use the Elo prediction (the side with the higher win
# probability). A tie in either comparison resolves to the away team.
#
# The branches of ifelse() are plain values: assigning to
# nyk_2022$Winning_Team inside them would overwrite the whole column as a side
# effect of argument evaluation.
prediction_cutoff <- as.Date("2022-02-06")

# A game counts as played only if both scores are present, so a pre-cutoff
# game with missing scores falls back to the prediction instead of yielding NA.
has_final_score <- !is.na(nyk_2022$Home_Team_Score) &
  !is.na(nyk_2022$Away_Team_Score)
already_played <- nyk_2022$Date_of_Game < prediction_cutoff & has_final_score

actual_winner <- ifelse(
  nyk_2022$Home_Team_Score > nyk_2022$Away_Team_Score,
  nyk_2022$Home_Team,
  nyk_2022$Away_Team
)
predicted_winner <- ifelse(
  nyk_2022$Elo_Home > nyk_2022$Elo_Away,
  nyk_2022$Home_Team,
  nyk_2022$Away_Team
)

nyk_2022$Winning_Team <- ifelse(already_played, actual_winner, predicted_winner)

5. Add a column to denote Knicks result (win/loss).

# Append an empty placeholder column for the Knicks' result in each game.
nyk_2022 <- add_column(nyk_2022, Win_Loss = "")

6. Populate win/loss column (1 for win, 0 for loss)

# Encode the Knicks' result numerically: 1 when they are the winning team,
# 0 otherwise. The ifelse() branches are plain constants; assigning to the
# column inside them would overwrite it as a side effect of evaluation.
nyk_2022$Win_Loss <- ifelse(nyk_2022$Winning_Team == "NYK", 1, 0)

7. Summarize Data

# Print per-column summary statistics for the Knicks data.
nyk_summary <- summary(nyk_2022)
print(nyk_summary)

##   Date_of_Game         Home_Team          Away_Team            Elo_Home     
##  Min.   :2021-10-20   Length:82          Length:82          Min.   :0.3249  
##  1st Qu.:2021-11-30   Class :character   Class :character   1st Qu.:0.5423  
##  Median :2022-01-11   Mode  :character   Mode  :character   Median :0.6421  
##  Mean   :2022-01-12                                         Mean   :0.6341  
##  3rd Qu.:2022-03-01                                         3rd Qu.:0.7465  
##  Max.   :2022-04-09                                         Max.   :0.8845  
##                                                                             
##     Elo_Away      Home_Team_Score Away_Team_Score Winning_Team      
##  Min.   :0.1155   Min.   : 85.0   Min.   : 75.0   Length:82         
##  1st Qu.:0.2535   1st Qu.: 97.0   1st Qu.: 96.0   Class :character  
##  Median :0.3579   Median :105.0   Median :104.0   Mode  :character  
##  Mean   :0.3659   Mean   :105.1   Mean   :104.5                     
##  3rd Qu.:0.4577   3rd Qu.:111.0   3rd Qu.:113.0                     
##  Max.   :0.6751   Max.   :138.0   Max.   :134.0                     
##                   NA's   :29      NA's   :29                        
##     Win_Loss    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.439  
##  3rd Qu.:1.000  
##  Max.   :1.000  
##

# Tally losses (0) and wins (1) across the season.
table(nyk_2022[["Win_Loss"]])

## 
##  0  1 
## 46 36

# Store the same win/loss tally as a data frame (one row per Win_Loss value).
wins <- nyk_2022 %>%
  count(Win_Loss)

8. Add visuals to depict Knicks wins/losses for the season.

# Draw the season record as a histogram of the 0/1 Win_Loss column and keep
# the returned histogram object so its bin counts can be used as labels.
record_hist <- hist(
  nyk_2022$Win_Loss,
  breaks = 2,
  main = "NYK 2022 Wins/Losses",
  xlab = "Wins(1), Losses(0)",
  xlim = c(0, 1),
  ylim = c(0, 50),
  xaxp = c(0, 1, 1)
)

# Print each bin's count just above its bar, centred on the bin midpoint.
with(
  record_hist,
  text(mids, counts, labels = counts, adj = c(0.5, -0.5))
)

# Plot a smoothed trend of the Knicks' win indicator over the season.
# The colour is a fixed setting, so it is passed outside aes(). Inside aes()
# the string "blue" is treated as a one-level data variable, which draws the
# line in the default palette colour and adds a legend labelled "blue".
ggplot(data = nyk_2022) +
  geom_smooth(mapping = aes(x = Date_of_Game, y = Win_Loss), color = "blue")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Brian_Singh_DATA607_Week1