knitr::opts_chunk$set(echo = TRUE)
#install.packages('tidyverse')
#install.packages('RCurl')
library(RCurl)
## Loading required package: bitops
library(tidyverse)
## -- Attaching packages --------------------- tidyverse 1.3.0 --
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## -- Conflicts ------------------------ tidyverse_conflicts() --
## x tidyr::complete() masks RCurl::complete()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
url<-getURL("https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv")
rank<-read.csv(text=url)
The Elo data was restricted to the 2014-2015 season.
rank_2014_2015<-rank%>%
mutate(forecast_opp=1-forecast)%>%
filter(year_id==2015)%>%
select(team_id,opp_id,date_game,forecast,forecast_opp)
tail(rank_2014_2015)
## team_id opp_id date_game forecast forecast_opp
## 2617 GSW CLE 6/11/2015 0.4534277 0.5465723
## 2618 CLE GSW 6/11/2015 0.5465723 0.4534277
## 2619 GSW CLE 6/14/2015 0.7655654 0.2344346
## 2620 CLE GSW 6/14/2015 0.2344346 0.7655654
## 2621 CLE GSW 6/16/2015 0.4814501 0.5185499
## 2622 GSW CLE 6/16/2015 0.5185499 0.4814501
Shot data was gathered, and the shot clock was broken up into four six-second intervals (the S0_6_SC, S6_12_SC, S12_18_SC, and S18_24_SC indicator columns; see the sketch below for how they could be derived).
getwd()
## [1] "\\\\datastor/students/cdimenst/Sports Analytics"
shot_data<-read.csv("shot.csv")
names(shot_data)[names(shot_data)=="game_id"]<-"date_game"
names(shot_data)[names(shot_data)=="HOME_TEAM"]<-"team_id"
names(shot_data)[names(shot_data)=="AWAY_TEAM"]<-"opp_id"
names(rank_2014_2015)[names(rank_2014_2015)=="forecast"]<-"forecast_home"
#view(shot_data)
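The interval indicator columns S0_6_SC, S6_12_SC, S12_18_SC, and S18_24_SC are assumed to come in shot.csv already. If they had to be rebuilt from the SHOT_CLOCK column, a minimal sketch would look like this (illustrative only; shot_intervals is a throwaway name and is not used later):
# Illustrative sketch: derive the four six-second shot-clock indicators from SHOT_CLOCK
shot_intervals<-shot_data%>%
mutate(S0_6_SC=as.integer(SHOT_CLOCK>=0 & SHOT_CLOCK<6),
S6_12_SC=as.integer(SHOT_CLOCK>=6 & SHOT_CLOCK<12),
S12_18_SC=as.integer(SHOT_CLOCK>=12 & SHOT_CLOCK<18),
S18_24_SC=as.integer(SHOT_CLOCK>=18 & SHOT_CLOCK<=24))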
df<-merge(x=shot_data,y=rank_2014_2015,by=c("date_game","opp_id","team_id"),all.x=TRUE)
#view(df)
df<-df%>%
select(date_game, opp_id, team_id, TEAMS, STADIUM, MATCHUP, LOCATION, opp_W_L,home_W_L, FINAL_MARGIN, PERIOD, GAME_CLOCK, SHOT_CLOCK, SHOT_DIST, PTS_TYPE,HOME_SHOT_TYPE, AWAY_SHOT_TYPE, CLOSE_DEF_DIST, PTS, forecast_home, forecast_opp,S0_6_SC, S6_12_SC, S12_18_SC, S18_24_SC)
#view(df)
#Taking out rows with missing (NA) values
df<-df[complete.cases(df), ]
#each game probability for each team
#view(df)
df%>%
group_by(TEAMS, STADIUM)%>%
summarise(home_prob=mean(forecast_home),
away_prob=mean(forecast_opp),
home_Avg3_count=sum(HOME_SHOT_TYPE==3), #number of threes taken by the home team during that game
away_Avg_3_count=sum(AWAY_SHOT_TYPE==3)) #number of threes taken by the away team during that game
## # A tibble: 701 x 6
## # Groups: TEAMS [701]
## TEAMS STADIUM home_prob away_prob home_Avg3_count away_Avg_3_count
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 ATL vs. BOS ATL 0.790 0.210 24 21
## 2 ATL vs. CHI ATL 0.598 0.402 20 25
## 3 ATL vs. CLE ATL 0.731 0.269 49 67
## 4 ATL vs. DAL ATL 0.697 0.303 34 30
## 5 ATL vs. DEN ATL 0.707 0.293 26 34
## 6 ATL vs. DET ATL 0.822 0.178 53 56
## 7 ATL vs. GSW ATL 0.587 0.413 27 30
## 8 ATL vs. HOU ATL 0.703 0.297 27 40
## 9 ATL vs. IND ATL 0.875 0.125 28 24
## 10 ATL vs. LAC ATL 0.607 0.393 23 26
## # … with 691 more rows
#Testing to see if forecasted probability affects the number of 3's taken
library(broom)
df1<-df%>%
group_by(forecast_home)%>%
summarise(home_Avg3_count=sum(HOME_SHOT_TYPE==3),
away_Avg3_count=sum(AWAY_SHOT_TYPE==3),
home_prob=mean(forecast_home),
SHOT0_6=sum(S0_6_SC),
SHOT6_12=sum(S6_12_SC),
SHOT12_18=sum(S12_18_SC),
SHOT18_24=sum(S18_24_SC),
ave_time_sc=mean(SHOT_CLOCK))
library(splines)
hProb_3s<-lm(home_Avg3_count~home_prob,data=df1) #home 3-point attempts vs. home win probability
aProb_3s<-lm(away_Avg3_count~home_prob,data=df1) #away 3-point attempts vs. home win probability
ShotClockEffect3<-lm(home_Avg3_count~SHOT0_6+SHOT6_12+SHOT12_18+SHOT18_24+home_prob, data=df1) #shot-clock intervals plus win probability
S0_6Eff<-lm(SHOT0_6~home_prob,data=df1)
S6_12Eff<-lm(SHOT6_12~home_prob,data=df1)
S12_18Eff<-lm(SHOT12_18~home_prob,data=df1)
S18_24Eff<-lm(SHOT18_24~home_prob,data=df1)
fit4<-lm(ave_time_sc~ns(home_prob,3), data=df1) #average shot-clock time as a natural spline in win probability
tidy(hProb_3s)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 16.1 0.690 23.3 8.20e-96
## 2 home_prob 8.20 1.06 7.73 2.70e-14
tidy(aProb_3s)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 24.4 0.671 36.4 3.94e-184
## 2 home_prob -5.38 1.03 -5.22 2.24e- 7
tidy(ShotClockEffect3)
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.102 2.14 -0.0478 9.62e- 1
## 2 SHOT0_6 0.108 0.0366 2.95 3.26e- 3
## 3 SHOT6_12 0.00624 0.0248 0.251 8.02e- 1
## 4 SHOT12_18 0.173 0.0216 8.00 3.43e-15
## 5 SHOT18_24 0.152 0.0275 5.53 4.13e- 8
## 6 home_prob 7.54 1.00 7.51 1.32e-13
tidy(fit4)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 12.7 0.137 92.5 0
## 2 ns(home_prob, 3)1 -0.272 0.0960 -2.83 0.00475
## 3 ns(home_prob, 3)2 -0.365 0.326 -1.12 0.263
## 4 ns(home_prob, 3)3 0.127 0.105 1.21 0.228
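As a quick check on the coefficients interpreted further below, confidence intervals can be pulled with broom's conf.int option (a sketch on the models already fit above):
# Coefficient estimates with 95% confidence intervals for the two headline models
tidy(hProb_3s, conf.int=TRUE)
tidy(aProb_3s, conf.int=TRUE)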
plot2<-ggplot(df1, aes(home_prob, ave_time_sc)) +
geom_smooth()+
geom_point()
plot2+ylab("time when shots were taken")+xlab("home team probability")+ggtitle("home probability effect on shot time")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
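The smooth in plot2 above is the default loess, not the natural-spline model fit4. A sketch overlaying fit4's own predictions on the points (fit4_pred is a throwaway column name added only for this plot):
# Sketch: overlay the ns() fit (fit4) on the raw points instead of the loess smoother
df1%>%
mutate(fit4_pred=predict(fit4))%>%
ggplot(aes(home_prob, ave_time_sc))+
geom_point()+
geom_line(aes(y=fit4_pred), colour="blue")+
ylab("time when shots were taken")+xlab("home team probability")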
hProb_3s -> As the probability of the home team winning increases by one unit, the home team takes 8.2 more 3-point shots during a game.
aProb_3s -> As the probability of the home team winning increases by one unit, the away team takes 5.4 fewer 3-point shots during a game.
ShotClockEffect3 -> One more shot taken between 0 and 6 seconds on the shot clock raises the number of 3-point shots taken that game by 0.108. The most significant finding from this linear model is that one more shot taken between 12 and 18 seconds on the shot clock increases the number of 3-point shots taken by 0.173.
ShotClockEffect -> As one more shot is taken between 0 and 6 seconds on the shot clock, the probability of the home team winning increases by .002 (significant at the 5% level but with a low t score). As one more shot is taken between 6 and 12 seconds on the shot clock, the probability of the home team winning increases by .0007.
fit2 -> As one more 3-point shot is taken, the probability of the home team winning increases by 0.011.
The linear model with the highest R^2 is the hProb_3s model (a glance() comparison sketch follows below).
fit4 -> Model that depicts the relationship between the home team's win probability and how long they hold onto the ball before shooting.
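A minimal sketch comparing R^2 and AIC across the models fit above with broom::glance() (the model column is added here only for labeling):
# Sketch: compare fit statistics across the candidate models with broom::glance()
bind_rows(
glance(hProb_3s)%>%mutate(model="hProb_3s"),
glance(aProb_3s)%>%mutate(model="aProb_3s"),
glance(ShotClockEffect3)%>%mutate(model="ShotClockEffect3"),
glance(fit4)%>%mutate(model="fit4"))%>%
select(model, r.squared, adj.r.squared, AIC)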
AIC(hProb_3s)
## [1] 6481.55
AIC(aProb_3s)
## [1] 6426.976
AIC(ShotClockEffect3)
## [1] 6360.73
AIC(fit4)
## [1] 2075.443
#Average forecasted probability of each team in the season
df_rank <- df%>%
group_by(STADIUM)%>%
summarise(home_id_AvgRank=mean(forecast_home))%>%
arrange(-home_id_AvgRank)
df_rank
## # A tibble: 27 x 2
## STADIUM home_id_AvgRank
## <fct> <dbl>
## 1 GSW 0.838
## 2 SAS 0.786
## 3 LAC 0.786
## 4 ATL 0.758
## 5 POR 0.741
## 6 HOU 0.729
## 7 MEM 0.726
## 8 TOR 0.723
## 9 CHI 0.720
## 10 DAL 0.709
## # … with 17 more rows
library(forcats)
df4<-df%>%
group_by(STADIUM)%>%
select(SHOT_CLOCK,HOME_SHOT_TYPE,forecast_home,STADIUM) %>%
inner_join(df_rank)
## Joining, by = "STADIUM"
plot<-ggplot(subset(df4,STADIUM %in% c("GSW","SAS","PHI","ORL")),aes(x=SHOT_CLOCK))+
geom_histogram()+
facet_wrap(~fct_reorder(STADIUM, home_id_AvgRank))
plot+labs(title="Number of shots taken during various shot clock times")+ ylab("Shot count")+xlab("Shot clock time")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In the first quarter, as the home team's forecasted win probability rises by one unit, the number of 3-point shots taken by the home team rises by 3.4; as the away team's forecasted probability rises by one unit, the number of 3-point shots taken by the away team rises by 2.2.
df3<-df%>%
group_by(TEAMS,PERIOD)%>%
summarise(forecast_home1=(mean(forecast_home)),
forecast_away1=(mean(forecast_opp)),
home_3taken=sum(HOME_SHOT_TYPE==3), #model for home/away team
away_3taken=sum(AWAY_SHOT_TYPE==3),
H_W_diff_shot=(sum(HOME_SHOT_TYPE==3)-sum(AWAY_SHOT_TYPE==3)))
head(df3)
## # A tibble: 6 x 7
## # Groups: TEAMS [2]
## TEAMS PERIOD forecast_home1 forecast_away1 home_3taken away_3taken
## <fct> <int> <dbl> <dbl> <int> <int>
## 1 ATL … 1 0.790 0.210 5 4
## 2 ATL … 2 0.790 0.210 8 6
## 3 ATL … 3 0.790 0.210 5 5
## 4 ATL … 4 0.790 0.210 6 6
## 5 ATL … 1 0.598 0.402 7 8
## 6 ATL … 2 0.598 0.402 6 5
## # … with 1 more variable: H_W_diff_shot <int>
#HOME 3's taken
df_Q1<-df3%>%
filter(PERIOD==1)
#home first-quarter threes taken vs. home win probability
ggplot(data=df_Q1, aes(x=forecast_home1, y=home_3taken))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
fit_1<-lm(home_3taken~forecast_home1, data=df_Q1)
#away first-quarter threes taken vs. away win probability
ggplot(data=df_Q1, aes(x=forecast_away1, y=away_3taken))+
geom_point()+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
fit_2<-lm(away_3taken~forecast_away1, data=df_Q1)
tidy(fit_1)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.89 0.500 9.77 3.32e-21
## 2 forecast_home1 3.44 0.768 4.47 8.96e- 6
tidy(fit_2)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.08 0.304 20.0 2.13e-70
## 2 forecast_away1 2.24 0.713 3.14 1.79e- 3
#view(df3)
#AWAY 3's taken in the fourth quarter
df_Q4<-df3%>%
filter(PERIOD==4)
#away fourth-quarter threes taken vs. home win probability
fit_6<-lm(away_3taken~forecast_home1, data=df_Q4)
tidy(fit_6)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 8.83 0.527 16.8 2.71e-53
## 2 forecast_home1 -1.47 0.811 -1.81 7.03e- 2
h_linear_spline<-ggplot()+
geom_point(data=filter(df3,forecast_home1<.25),aes(forecast_home1,home_3taken))+
geom_smooth(data=df3,aes(forecast_home1,home_3taken),
method="lm")+
geom_point(data=filter(df3,forecast_home1<.5,forecast_home1>.25),aes(forecast_home1,home_3taken))+
geom_smooth(data=filter(df3,forecast_home1<.5,forecast_home1>.25),aes(forecast_home1,home_3taken),
method="lm")+
geom_point(data=filter(df3,forecast_home1<.75,forecast_home1>.5),aes(forecast_home1,home_3taken))+
geom_smooth(data=filter(df3,forecast_home1<.75,forecast_home1>.5),aes(forecast_home1,home_3taken),
method="lm")+
geom_point(data=filter(df3,forecast_home1>.75),aes(forecast_home1,home_3taken))+
geom_smooth(data=filter(df3,forecast_home1>.75),aes(forecast_home1,home_3taken),
method="lm")+
annotate("point",x=.25,y=4,colour="red")+
annotate("point",x=.5,y=7,colour="red")+
annotate("point",x=.75,y=7,colour="red")
h_linear_spline+ylab("3 point shots taken-> home team")+xlab("home team probability")
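The plot above stitches together separate lm fits on four probability ranges. The same idea can be written as one piecewise-linear model using a degree-1 B-spline with knots at the same break points (a sketch; h_spline_fit is a new name, not one of the models reported above):
# Sketch: a single piecewise-linear fit with knots at .25, .5, and .75
h_spline_fit<-lm(home_3taken~bs(forecast_home1, knots=c(.25,.5,.75), degree=1), data=df3)
tidy(h_spline_fit)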
Q-Q plots are used because the relationship between a team's forecasted win probability and the number of 3-point shots taken is modeled as linear, and the residuals of such a linear model should be roughly normal. Two Q-Q plots are made: one for the home team's forecasted probability in relation to the number of 3-point shots they take in a game, and one for how the away team's forecasted probability (one minus the home probability) relates to the away team's 3-point shots taken.
The quantiles for fit_1 and fit_2 appear to be normally distributed with some slight skew.
fit_1 is the relationship between the forecasted home win probability and the number of 3-point shots taken by the home team.
fit_2 is the relationship between the forecasted away win probability and the number of 3-point shots taken by the away team.
#residuals of Q1
qqnorm(fit_1$resid)
qqline(fit_1$residuals)
Q1<-ggplot(data=fit_1,aes(x=fit_1$fitted.values,y= fit_1$residuals))+
geom_point()+
geom_smooth()
Q1+labs(title="Residuals vs. Fitted for fit_1")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
qqnorm(fit_2$resid)
qqline(fit_2$residuals)
Q2<-ggplot(data=fit_2,aes(x=fit_2$fitted.values,y= fit_2$residuals))+
geom_point()+
geom_smooth()
Q2+labs(title="Residuals vs. Fitted for fit_2")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
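For completeness, the same normal Q-Q plot can be drawn in ggplot2 with stat_qq()/stat_qq_line(), using the residuals of the models already fit (a sketch for fit_1; fit_2 would be analogous):
# Sketch: ggplot2 version of the base-R Q-Q plot for the fit_1 residuals
ggplot(data.frame(resid=residuals(fit_1)), aes(sample=resid))+
stat_qq()+
stat_qq_line()+
labs(title="Normal Q-Q plot of fit_1 residuals")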