knitr::opts_chunk$set(echo = TRUE)
#install.packages('tidyverse')
#install.packages('RCurl')
library(RCurl)
## Loading required package: bitops
library(tidyverse)
## -- Attaching packages --------------------- tidyverse 1.3.0 --
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## -- Conflicts ------------------------ tidyverse_conflicts() --
## x tidyr::complete() masks RCurl::complete()
## x dplyr::filter()   masks stats::filter()
## x dplyr::lag()      masks stats::lag()
url<-getURL("https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv")
rank<-read.csv(text=url)

Data Filtering

We restrict the Elo data to the 2014-2015 season and add the opponent's forecasted win probability (one minus the home team's forecast).

rank_2014_2015<-rank%>%
  mutate(forecast_opp=1-forecast)%>%  #opponent's forecasted win probability
  filter(year_id==2015)%>%            #keep the 2014-2015 season
  select(team_id,opp_id,date_game,forecast,forecast_opp)


tail(rank_2014_2015)
##      team_id opp_id date_game  forecast forecast_opp
## 2617     GSW    CLE 6/11/2015 0.4534277    0.5465723
## 2618     CLE    GSW 6/11/2015 0.5465723    0.4534277
## 2619     GSW    CLE 6/14/2015 0.7655654    0.2344346
## 2620     CLE    GSW 6/14/2015 0.2344346    0.7655654
## 2621     CLE    GSW 6/16/2015 0.4814501    0.5185499
## 2622     GSW    CLE 6/16/2015 0.5185499    0.4814501

Shot data was gathered, and the shot clock was broken up into four six-second intervals (a sketch of this interval coding appears after the column renames below).

getwd()
## [1] "\\\\datastor/students/cdimenst/Sports Analytics"
shot_data<-read.csv("shot.csv")
names(shot_data)[names(shot_data)=="game_id"]<-"date_game"
names(shot_data)[names(shot_data)=="HOME_TEAM"]<-"team_id"
names(shot_data)[names(shot_data)=="AWAY_TEAM"]<-"opp_id"
names(rank_2014_2015)[names(rank_2014_2015)=="forecast"]<-"forecast_home"
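
#The four six-second shot-clock interval flags (S0_6_SC, S6_12_SC, S12_18_SC, S18_24_SC)
#used below are already present in the shot data. A minimal sketch of how they could be
#derived from SHOT_CLOCK is shown here; the boundary handling and the shot_data_check
#name are assumptions for illustration, not the original preprocessing.
shot_data_check<-shot_data%>%
  mutate(S0_6_SC=as.integer(SHOT_CLOCK>=0 & SHOT_CLOCK<6),
         S6_12_SC=as.integer(SHOT_CLOCK>=6 & SHOT_CLOCK<12),
         S12_18_SC=as.integer(SHOT_CLOCK>=12 & SHOT_CLOCK<18),
         S18_24_SC=as.integer(SHOT_CLOCK>=18 & SHOT_CLOCK<=24))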



view(shot_data)
df<-merge(x=shot_data,y=rank_2014_2015,by=c("date_game","opp_id","team_id"),all.x=TRUE)
#view(df)
df<-df%>%
  select(date_game, opp_id, team_id, TEAMS, STADIUM, MATCHUP, LOCATION,
         opp_W_L, home_W_L, FINAL_MARGIN, PERIOD, GAME_CLOCK, SHOT_CLOCK,
         SHOT_DIST, PTS_TYPE, HOME_SHOT_TYPE, AWAY_SHOT_TYPE, CLOSE_DEF_DIST,
         PTS, forecast_home, forecast_opp, S0_6_SC, S6_12_SC, S12_18_SC, S18_24_SC)
#view(df)
#Remove rows with missing (NA) values
df<-df[complete.cases(df), ]
#each game probability for each team
#view(df)
df%>%
  group_by(TEAMS, STADIUM)%>%
  summarise(home_prob=mean(forecast_home),
            away_prob=mean(forecast_opp),
            home_Avg3_count=sum(HOME_SHOT_TYPE==3),  #number of threes taken by the home team during that game
            away_Avg_3_count=sum(AWAY_SHOT_TYPE==3)) #number of threes taken by the away team during that game
## # A tibble: 701 x 6
## # Groups:   TEAMS [701]
##    TEAMS       STADIUM home_prob away_prob home_Avg3_count away_Avg_3_count
##    <fct>       <fct>       <dbl>     <dbl>           <int>            <int>
##  1 ATL vs. BOS ATL         0.790     0.210              24               21
##  2 ATL vs. CHI ATL         0.598     0.402              20               25
##  3 ATL vs. CLE ATL         0.731     0.269              49               67
##  4 ATL vs. DAL ATL         0.697     0.303              34               30
##  5 ATL vs. DEN ATL         0.707     0.293              26               34
##  6 ATL vs. DET ATL         0.822     0.178              53               56
##  7 ATL vs. GSW ATL         0.587     0.413              27               30
##  8 ATL vs. HOU ATL         0.703     0.297              27               40
##  9 ATL vs. IND ATL         0.875     0.125              28               24
## 10 ATL vs. LAC ATL         0.607     0.393              23               26
## # … with 691 more rows

Linear Regressions

#Testing to see if forecasted probability affects the number of 3's taken
library(broom)
df1<-df%>%
  group_by(forecast_home)%>%
  summarise(home_Avg3_count=sum(HOME_SHOT_TYPE==3),
            away_Avg3_count=sum(AWAY_SHOT_TYPE==3),
            home_prob=mean(forecast_home),
            SHOT0_6=sum(S0_6_SC),
            SHOT6_12=sum(S6_12_SC),
            SHOT12_18=sum(S12_18_SC),
            SHOT18_24=sum(S18_24_SC),
            ave_time_sc=mean(SHOT_CLOCK))
library(splines)
hProb_3s<-lm(home_Avg3_count~home_prob,data=df1) #home win probability vs. home 3-point attempts
aProb_3s<-lm(away_Avg3_count~home_prob,data=df1) #home win probability vs. away 3-point attempts

 
ShotClockEffect3<-lm(home_Avg3_count~SHOT0_6+SHOT6_12+SHOT12_18+SHOT18_24+home_prob, data=df1)
S0_6Eff<-lm(SHOT0_6~home_prob,data=df1)
S6_12Eff<-lm(SHOT6_12~home_prob,data=df1)
S12_18Eff<-lm(SHOT12_18~home_prob,data=df1)
S18_24Eff<-lm(SHOT18_24~home_prob,data=df1)
fit4<-lm(ave_time_sc~ns(home_prob,3), data=df1)


tidy(hProb_3s)
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)    16.1      0.690     23.3  8.20e-96
## 2 home_prob       8.20     1.06       7.73 2.70e-14
tidy(aProb_3s)
## # A tibble: 2 x 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)    24.4      0.671     36.4  3.94e-184
## 2 home_prob      -5.38     1.03      -5.22 2.24e-  7
tidy(ShotClockEffect3)   
## # A tibble: 6 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) -0.102      2.14     -0.0478 9.62e- 1
## 2 SHOT0_6      0.108      0.0366    2.95   3.26e- 3
## 3 SHOT6_12     0.00624    0.0248    0.251  8.02e- 1
## 4 SHOT12_18    0.173      0.0216    8.00   3.43e-15
## 5 SHOT18_24    0.152      0.0275    5.53   4.13e- 8
## 6 home_prob    7.54       1.00      7.51   1.32e-13
tidy(fit4)
## # A tibble: 4 x 5
##   term              estimate std.error statistic p.value
##   <chr>                <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)         12.7      0.137      92.5  0      
## 2 ns(home_prob, 3)1   -0.272    0.0960     -2.83 0.00475
## 3 ns(home_prob, 3)2   -0.365    0.326      -1.12 0.263  
## 4 ns(home_prob, 3)3    0.127    0.105       1.21 0.228
plot2<-ggplot(df1, aes(home_prob, ave_time_sc)) + 
  geom_smooth()+ 
  geom_point()
plot2+ylab("time when shots were taken")+xlab("home team probability")+ggtitle("home probability effect on shot time")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Linear Models

hProb_3s -> As the home team's win probability increases by one unit (from 0 to 1), the home team takes 8.2 more 3-point shots during a game.
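
To make the "one unit" interpretation concrete (a one-unit change spans the whole probability scale, from 0 to 1), the fitted model can be evaluated at two win probabilities. This is an added illustrative check, not part of the original output; the probabilities 0.25 and 0.75 are arbitrary examples.

#predicted home 3-point attempts at two illustrative win probabilities
predict(hProb_3s, newdata=data.frame(home_prob=c(0.25,0.75)))
#a 0.5 increase in win probability implies roughly 0.5*8.2 = 4.1 more attempts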

aProb_3s -> As the home team's win probability increases by one unit, the away team takes about 5.4 fewer 3-point shots during a game.

ShotClockEffect3 -> One more shot taken with 0-6 seconds on the shot clock raises the number of home 3-point shots taken that game by 0.108. The most significant coefficient in this model is for the 12-18 second interval: each additional shot taken with 12-18 seconds on the shot clock is associated with 0.173 more home 3-point shots that game.

ShotClockEffect -> As one more shot is taken with 0-6 seconds on the shot clock, the probability of the home team winning increases by .002 (significant at the 5% level, but with a low t score). As one more shot is taken with 6-12 seconds on the shot clock, the probability of the home team winning increases by .0007.

fit2 -> As one more 3-point shot is taken, the probability of the home team winning increases by 0.011.
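
The ShotClockEffect and fit2 models interpreted above are not defined in this section. Below is a minimal sketch of how they may have been fit, assuming both use df1 and model the home team's win probability; the formulas are assumptions, not the original code.

#assumed specification: home win probability vs. shot-clock interval counts
ShotClockEffect<-lm(home_prob~SHOT0_6+SHOT6_12+SHOT12_18+SHOT18_24, data=df1)
#assumed specification: home win probability vs. home 3-point attempts
fit2<-lm(home_prob~home_Avg3_count, data=df1)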

The linear model with the highest R^2 is the hProb_3s model.
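
One quick way to check this claim is to compare the fits with broom::glance(); this comparison is an added sketch and was not part of the original output.

#compare R^2, adjusted R^2, and AIC across the fitted models
bind_rows(hProb_3s=glance(hProb_3s),
          aProb_3s=glance(aProb_3s),
          ShotClockEffect3=glance(ShotClockEffect3),
          fit4=glance(fit4),
          .id="model")%>%
  select(model, r.squared, adj.r.squared, AIC)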

fit4 -> Model depicting the relationship between the home team's win probability and the average shot-clock time remaining when its shots are taken (a proxy for how long the team holds onto the ball).

AIC scores of the above linear models

AIC(hProb_3s)
## [1] 6481.55
AIC(aProb_3s)
## [1] 6426.976
AIC(ShotClockEffect3)
## [1] 6360.73
AIC(fit4)
## [1] 2075.443

Team by team analysis

#Average forecasted probability of each team in the season
df_rank <- df%>%
  group_by(STADIUM)%>%
  summarise(home_id_AvgRank=mean(forecast_home))%>%
  arrange(-home_id_AvgRank)

df_rank
## # A tibble: 27 x 2
##    STADIUM home_id_AvgRank
##    <fct>             <dbl>
##  1 GSW               0.838
##  2 SAS               0.786
##  3 LAC               0.786
##  4 ATL               0.758
##  5 POR               0.741
##  6 HOU               0.729
##  7 MEM               0.726
##  8 TOR               0.723
##  9 CHI               0.720
## 10 DAL               0.709
## # … with 17 more rows
library(forcats)

df4<-df%>%

  group_by(STADIUM)%>%
  select(SHOT_CLOCK,HOME_SHOT_TYPE,forecast_home,STADIUM) %>% 
  inner_join(df_rank)
## Joining, by = "STADIUM"
plot<-ggplot(subset(df4,STADIUM %in% c("GSW","SAS","PHI","ORL")),aes(x=SHOT_CLOCK))+
  geom_histogram()+
  facet_wrap(~fct_reorder(STADIUM, home_id_AvgRank))
plot+labs(title="Number of shots taken during various shot clock times")+ ylab("Shot count")+xlab("Shot clock time")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

First Quarter Analysis

In the first quarter, as the home team's win probability rises by one unit the number of 3-point shots taken by the home team rises by 3.4, and as the away team's win probability rises by one unit the number of 3-point shots taken by the away team rises by 2.2.

df3<-df%>%
  group_by(TEAMS,PERIOD)%>%
  summarise(forecast_home1=(mean(forecast_home)),
            forecast_away1=(mean(forecast_opp)),
            home_3taken=sum(HOME_SHOT_TYPE==3), #3-point attempts by the home team in the period
            away_3taken=sum(AWAY_SHOT_TYPE==3), #3-point attempts by the away team in the period
            H_W_diff_shot=(sum(HOME_SHOT_TYPE==3)-sum(AWAY_SHOT_TYPE==3)))
           
head(df3)
## # A tibble: 6 x 7
## # Groups:   TEAMS [2]
##   TEAMS PERIOD forecast_home1 forecast_away1 home_3taken away_3taken
##   <fct>  <int>          <dbl>          <dbl>       <int>       <int>
## 1 ATL …      1          0.790          0.210           5           4
## 2 ATL …      2          0.790          0.210           8           6
## 3 ATL …      3          0.790          0.210           5           5
## 4 ATL …      4          0.790          0.210           6           6
## 5 ATL …      1          0.598          0.402           7           8
## 6 ATL …      2          0.598          0.402           6           5
## # … with 1 more variable: H_W_diff_shot <int>
#HOME 3's taken
df_Q1<-df3%>%
  filter(PERIOD==1)
#home threes taken in the first quarter
ggplot(data=df_Q1, aes(x=forecast_home1, y=home_3taken))+
  geom_point()+
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

fit_1<-lm(home_3taken~forecast_home1, data=df_Q1)
##away threes taken in the first quarter
ggplot(data=df_Q1, aes(x=forecast_away1, y=away_3taken))+
  geom_point()+
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

fit_2=lm(away_3taken~forecast_away1, data=df_Q1)

tidy(fit_1)
## # A tibble: 2 x 5
##   term           estimate std.error statistic  p.value
##   <chr>             <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)        4.89     0.500      9.77 3.32e-21
## 2 forecast_home1     3.44     0.768      4.47 8.96e- 6
tidy(fit_2)
## # A tibble: 2 x 5
##   term           estimate std.error statistic  p.value
##   <chr>             <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)        6.08     0.304     20.0  2.13e-70
## 2 forecast_away1     2.24     0.713      3.14 1.79e- 3
#view(df3)

Fourth Quarter Away Team 3 point shot tendencies

#AWAY 3's taken in the fourth quarter
df_Q4<-df3%>%
  filter(PERIOD==4)
#away fourth-quarter threes taken as a function of the home team's win probability

fit_6<-lm(away_3taken~forecast_home1, data=df_Q4)
tidy(fit_6)
## # A tibble: 2 x 5
##   term           estimate std.error statistic  p.value
##   <chr>             <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)        8.83     0.527     16.8  2.71e-53
## 2 forecast_home1    -1.47     0.811     -1.81 7.03e- 2

Spline terms

h_linear_spline<-ggplot()+
  geom_point(data=filter(df3,forecast_home1<.25),aes(forecast_home1,home_3taken))+
  geom_smooth(data=filter(df3,forecast_home1<.25),aes(forecast_home1,home_3taken),
              method="lm")+
  geom_point(data=filter(df3,forecast_home1<.5,forecast_home1>.25),aes(forecast_home1,home_3taken))+
  geom_smooth(data=filter(df3,forecast_home1<.5,forecast_home1>.25),aes(forecast_home1,home_3taken),
              method="lm")+
  geom_point(data=filter(df3,forecast_home1<.75,forecast_home1>.5),aes(forecast_home1,home_3taken))+
  geom_smooth(data=filter(df3,forecast_home1<.75,forecast_home1>.5),aes(forecast_home1,home_3taken),
              method="lm")+
  geom_point(data=filter(df3,forecast_home1>.75),aes(forecast_home1,home_3taken))+
  geom_smooth(data=filter(df3,forecast_home1>.75),aes(forecast_home1,home_3taken),
              method="lm")+            
  annotate("point",x=.25,y=4,colour="red")+
  annotate("point",x=.5,y=7,colour="red")+
  annotate("point",x=.75,y=7,colour="red")

h_linear_spline+ylab("3-point shots taken (home team)")+xlab("home team probability")
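
The plot above fits a separate line within each probability band. A continuous piecewise-linear spline with knots at the same points (.25, .5, .75) can also be fit directly; this is an added sketch under that assumption, not part of the original analysis.

#degree-1 B-spline basis gives a continuous piecewise-linear fit with the chosen knots
h_pw_spline<-lm(home_3taken~bs(forecast_home1,knots=c(.25,.5,.75),degree=1), data=df3)
tidy(h_pw_spline)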

QQ plot

A QQ plot is used because the relationship between a home team's forecasted win probability and the number of 3-point shots taken should be linear. Two QQ plots are made: one for how the home team's forecasted probability relates to the number of 3-point shots they take in a game, and one for how the home team's win probability affects the away team's 3-point shots taken.

The residual quantiles for fit_1 and fit_2 appear approximately normally distributed, with some slight skew.

fit_1 is the relationship between the forecasted home win probability and the number of 3 point shots taken by the home team.

fit_2 is the relationship between the forecasted home win probability and the number of 3 point shots taken by the away team.
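
The base-R qqnorm()/qqline() calls below produce the Q-Q plots. An alternative that stays in ggplot2 is to apply stat_qq() and stat_qq_line() to the residuals; this snippet is an added illustration, not part of the original code.

#ggplot2 Q-Q plot of the fit_1 residuals
ggplot(data.frame(resid=residuals(fit_1)), aes(sample=resid))+
  stat_qq()+
  stat_qq_line()+
  labs(title="Normal Q-Q plot of fit_1 residuals")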

#residuals of Q1
qqnorm(fit_1$resid)
qqline(fit_1$residuals)

Q1<-ggplot(data=fit_1,aes(x=fit_1$fitted.values,y=fit_1$residuals))+
  geom_point()+
  geom_smooth()
Q1+labs(title="Residuals vs. fitted values for fit_1")+xlab("fitted values")+ylab("residuals")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

qqnorm(fit_2$resid)
qqline(fit_2$residuals)

Q2<-ggplot(data=fit_2,aes(x=fit_2$fitted.values,y=fit_2$residuals))+
  geom_point()+
  geom_smooth()
Q2+labs(title="Residuals vs. fitted values for fit_2")+xlab("fitted values")+ylab("residuals")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'