About the data

This data comes from the Kaggle competition at https://www.kaggle.com/c/kobe-bryant-shot-selection. A Python version of this project is available at http://anaconda.org/dbsnail

Load the data

# Read the shot log. Cells containing the literal text "NA" become missing
# values, and text columns are loaded as factors.
dat <- read.csv(
  file = "./kobe/data.csv",
  na.strings = "NA",
  stringsAsFactors = TRUE
)
# Show the dimensions, column types and first few values of the data frame.
str(dat)
## 'data.frame':    30697 obs. of  25 variables:
##  $ action_type       : Factor w/ 57 levels "Alley Oop Dunk Shot",..: 27 27 27 27 6 27 28 27 27 42 ...
##  $ combined_shot_type: Factor w/ 6 levels "Bank Shot","Dunk",..: 4 4 4 4 2 4 5 4 4 4 ...
##  $ game_event_id     : int  10 12 35 43 155 244 251 254 265 294 ...
##  $ game_id           : int  20000012 20000012 20000012 20000012 20000012 20000012 20000012 20000012 20000012 20000012 ...
##  $ lat               : num  34 34 33.9 33.9 34 ...
##  $ loc_x             : int  167 -157 -101 138 0 -145 0 1 -65 -33 ...
##  $ loc_y             : int  72 0 135 175 0 -11 0 28 108 125 ...
##  $ lon               : num  -118 -118 -118 -118 -118 ...
##  $ minutes_remaining : int  10 10 7 6 6 9 8 8 6 3 ...
##  $ period            : int  1 1 1 1 2 3 3 3 3 3 ...
##  $ playoffs          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ season            : Factor w/ 20 levels "1996-97","1997-98",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ seconds_remaining : int  27 22 45 52 19 32 52 5 12 36 ...
##  $ shot_distance     : int  18 15 16 22 0 14 0 2 12 12 ...
##  $ shot_made_flag    : int  NA 0 1 0 1 0 1 NA 1 0 ...
##  $ shot_type         : Factor w/ 2 levels "2PT Field Goal",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ shot_zone_area    : Factor w/ 6 levels "Back Court(BC)",..: 6 4 3 5 2 4 2 2 4 2 ...
##  $ shot_zone_basic   : Factor w/ 7 levels "Above the Break 3",..: 5 5 5 5 6 5 6 6 3 3 ...
##  $ shot_zone_range   : Factor w/ 5 levels "16-24 ft.","24+ ft.",..: 1 3 1 1 5 3 5 5 3 3 ...
##  $ team_id           : int  1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 ...
##  $ team_name         : Factor w/ 1 level "Los Angeles Lakers": 1 1 1 1 1 1 1 1 1 1 ...
##  $ game_date         : Factor w/ 1559 levels "1996-11-03","1996-11-05",..: 311 311 311 311 311 311 311 311 311 311 ...
##  $ matchup           : Factor w/ 74 levels "LAL @ ATL","LAL @ BKN",..: 29 29 29 29 29 29 29 29 29 29 ...
##  $ opponent          : Factor w/ 33 levels "ATL","BKN","BOS",..: 26 26 26 26 26 26 26 26 26 26 ...
##  $ shot_id           : int  1 2 3 4 5 6 7 8 9 10 ...

Manipulate the data

# Exclude columns: game_event_id, team_id, team_name, shot_id.
# game_id is deliberately kept: it is used later to aggregate points per game.
drop_cols <- c("game_event_id", "team_id", "team_name", "shot_id")
# Use a logical mask rather than -which(): when none of the names match,
# -which() evaluates to integer(0) and would silently drop every column.
dat <- dat[, !(names(dat) %in% drop_cols), drop = FALSE]

# Convert the matchup column to "Away" when the matchup text contains "@",
# and to "Home" otherwise.
dat$matchup <- ifelse(grepl("@", dat$matchup, fixed = TRUE), "Away", "Home")

# Subset the data according to shot_made_flag: rows with a recorded outcome
# form the training set, rows where the flag is missing form the test set.
outcome_missing <- is.na(dat$shot_made_flag)
dat_train <- dat[!outcome_missing, ]
dat_test <- dat[outcome_missing, ]

Analyze the data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Total number of rows (shot attempts) in the full data set, including
# rows whose outcome is missing.
nrow(dat)
## [1] 30697
# Overall shooting percentage across the shots with a recorded outcome.
mean(dat_train[["shot_made_flag"]])
## [1] 0.446161
# Number of attempts and shooting percentage for each shot type.
dat_train %>%
  group_by(shot_type) %>%
  summarise(total_shot = n(), shot_perc = mean(shot_made_flag))
## Source: local data frame [2 x 3]
## 
##        shot_type total_shot shot_perc
##           (fctr)      (int)     (dbl)
## 1 2PT Field Goal      20285 0.4773478
## 2 3PT Field Goal       5412 0.3292683
# Number of attempts and shooting percentage for home vs. away games.
dat_train %>%
  group_by(matchup) %>%
  summarise(total_shot = n(), shot_perc = mean(shot_made_flag))
## Source: local data frame [2 x 3]
## 
##   matchup total_shot shot_perc
##     (chr)      (int)     (dbl)
## 1    Away      13212 0.4364214
## 2    Home      12485 0.4564678
# Number of attempts and shooting percentage per opponent, sorted from the
# lowest to the highest shooting percentage.
shot_opp <- dat_train %>%
  group_by(opponent) %>%
  summarise(total_shot = n(), shot_perc = mean(shot_made_flag)) %>%
  arrange(shot_perc)
# Three opponents with the lowest shooting percentage.
head(shot_opp, n = 3)
## Source: local data frame [3 x 3]
## 
##   opponent total_shot shot_perc
##     (fctr)      (int)     (dbl)
## 1      BKN         45 0.4000000
## 2      IND        626 0.4009585
## 3      NOP        287 0.4076655
# Three opponents with the highest shooting percentage (ascending order).
tail(shot_opp, n = 3)
## Source: local data frame [3 x 3]
## 
##   opponent total_shot shot_perc
##     (fctr)      (int)     (dbl)
## 1      SAC       1397 0.4652827
## 2      VAN        204 0.4705882
## 3      NYK        566 0.4770318
# Average score per game
# Helper: map a shot type to its point value. Returns 2 where `type` equals
# "2PT Field Goal" and 3 for every other non-missing value; works
# element-wise on vectors and factors.
shot_score <- function(type) {
  is_two_pointer <- type == "2PT Field Goal"
  ifelse(is_two_pointer, 2, 3)
}

# Add a score column to dat_train: the shot's point value multiplied by
# shot_made_flag, so a missed shot (flag 0) contributes 0 points.
dat_train[["score"]] <- with(dat_train, shot_score(shot_type) * shot_made_flag)
head(dat_train, n = 6)
##         action_type combined_shot_type  game_id     lat loc_x loc_y
## 2         Jump Shot          Jump Shot 20000012 34.0443  -157     0
## 3         Jump Shot          Jump Shot 20000012 33.9093  -101   135
## 4         Jump Shot          Jump Shot 20000012 33.8693   138   175
## 5 Driving Dunk Shot               Dunk 20000012 34.0443     0     0
## 6         Jump Shot          Jump Shot 20000012 34.0553  -145   -11
## 7        Layup Shot              Layup 20000012 34.0443     0     0
##         lon minutes_remaining period playoffs  season seconds_remaining
## 2 -118.4268                10      1        0 2000-01                22
## 3 -118.3708                 7      1        0 2000-01                45
## 4 -118.1318                 6      1        0 2000-01                52
## 5 -118.2698                 6      2        0 2000-01                19
## 6 -118.4148                 9      3        0 2000-01                32
## 7 -118.2698                 8      3        0 2000-01                52
##   shot_distance shot_made_flag      shot_type        shot_zone_area
## 2            15              0 2PT Field Goal          Left Side(L)
## 3            16              1 2PT Field Goal  Left Side Center(LC)
## 4            22              0 2PT Field Goal Right Side Center(RC)
## 5             0              1 2PT Field Goal             Center(C)
## 6            14              0 2PT Field Goal          Left Side(L)
## 7             0              1 2PT Field Goal             Center(C)
##   shot_zone_basic shot_zone_range  game_date matchup opponent score
## 2       Mid-Range        8-16 ft. 2000-10-31    Away      POR     0
## 3       Mid-Range       16-24 ft. 2000-10-31    Away      POR     2
## 4       Mid-Range       16-24 ft. 2000-10-31    Away      POR     0
## 5 Restricted Area Less Than 8 ft. 2000-10-31    Away      POR     2
## 6       Mid-Range        8-16 ft. 2000-10-31    Away      POR     0
## 7 Restricted Area Less Than 8 ft. 2000-10-31    Away      POR     2
# Sum the points per game over the shots in dat_train (those with a
# recorded outcome), then average the per-game totals.
score_game <- dat_train %>%
  group_by(game_id) %>%
  summarise(scores = sum(score))
mean(score_game[["scores"]])
## [1] 15.86136

Plotting the data

library(ggplot2)
# Distribution of points per game, as a histogram with 5-point bins.
p <- ggplot(score_game, aes(x = scores)) +
  geom_histogram(binwidth = 5)
p

# Scatter plot of shot locations coloured by outcome. The levels are given
# explicitly so 0 always maps to "Miss" and 1 to "Made", even if one of the
# two outcomes happens to be absent from the data.
dat_train$made_miss <- factor(
  dat_train$shot_made_flag,
  levels = c(0, 1),
  labels = c("Miss", "Made")
)
# ggplot() + geom_point() is used instead of qplot(), which is deprecated
# in ggplot2; this also matches the histogram above.
p <- ggplot(dat_train, aes(x = loc_x, y = loc_y, colour = made_miss)) +
  geom_point()
p

# Scatter plot of shot locations coloured by shot type.
p <- ggplot(dat_train, aes(x = loc_x, y = loc_y, colour = shot_type)) +
  geom_point()
p