This data comes from the Kaggle competition "Kobe Bryant Shot Selection" (https://www.kaggle.com/c/kobe-bryant-shot-selection). A Python version of this project is available at http://anaconda.org/dbsnail
# Load the raw shot log; string columns are read as factors
dat <- read.csv(
  file = "./kobe/data.csv",
  stringsAsFactors = TRUE,
  na.strings = "NA"
)
# Print the column types and a preview of each column
str(dat)
## 'data.frame': 30697 obs. of 25 variables:
## $ action_type : Factor w/ 57 levels "Alley Oop Dunk Shot",..: 27 27 27 27 6 27 28 27 27 42 ...
## $ combined_shot_type: Factor w/ 6 levels "Bank Shot","Dunk",..: 4 4 4 4 2 4 5 4 4 4 ...
## $ game_event_id : int 10 12 35 43 155 244 251 254 265 294 ...
## $ game_id : int 20000012 20000012 20000012 20000012 20000012 20000012 20000012 20000012 20000012 20000012 ...
## $ lat : num 34 34 33.9 33.9 34 ...
## $ loc_x : int 167 -157 -101 138 0 -145 0 1 -65 -33 ...
## $ loc_y : int 72 0 135 175 0 -11 0 28 108 125 ...
## $ lon : num -118 -118 -118 -118 -118 ...
## $ minutes_remaining : int 10 10 7 6 6 9 8 8 6 3 ...
## $ period : int 1 1 1 1 2 3 3 3 3 3 ...
## $ playoffs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ season : Factor w/ 20 levels "1996-97","1997-98",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ seconds_remaining : int 27 22 45 52 19 32 52 5 12 36 ...
## $ shot_distance : int 18 15 16 22 0 14 0 2 12 12 ...
## $ shot_made_flag : int NA 0 1 0 1 0 1 NA 1 0 ...
## $ shot_type : Factor w/ 2 levels "2PT Field Goal",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ shot_zone_area : Factor w/ 6 levels "Back Court(BC)",..: 6 4 3 5 2 4 2 2 4 2 ...
## $ shot_zone_basic : Factor w/ 7 levels "Above the Break 3",..: 5 5 5 5 6 5 6 6 3 3 ...
## $ shot_zone_range : Factor w/ 5 levels "16-24 ft.","24+ ft.",..: 1 3 1 1 5 3 5 5 3 3 ...
## $ team_id : int 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 1610612747 ...
## $ team_name : Factor w/ 1 level "Los Angeles Lakers": 1 1 1 1 1 1 1 1 1 1 ...
## $ game_date : Factor w/ 1559 levels "1996-11-03","1996-11-05",..: 311 311 311 311 311 311 311 311 311 311 ...
## $ matchup : Factor w/ 74 levels "LAL @ ATL","LAL @ BKN",..: 29 29 29 29 29 29 29 29 29 29 ...
## $ opponent : Factor w/ 33 levels "ATL","BKN","BOS",..: 26 26 26 26 26 26 26 26 26 26 ...
## $ shot_id : int 1 2 3 4 5 6 7 8 9 10 ...
# Exclude columns: game_event_id, team_id, team_name, shot_id.
# game_id is deliberately kept; it is the grouping key for per-game scores.
drop_cols <- c("game_event_id", "team_id", "team_name", "shot_id")
# Select by logical negation rather than -which(): when no name matches,
# -which() evaluates to integer(0) and would silently drop every column.
dat <- dat[, !(names(dat) %in% drop_cols), drop = FALSE]

# Recode matchup: values containing "@" become "Away", everything else "Home"
dat$matchup <- ifelse(grepl("@", dat$matchup), "Away", "Home")

# Split on shot_made_flag: rows with a known outcome form the training set,
# rows where the flag is NA form the test set
has_label <- !is.na(dat$shot_made_flag)
dat_train <- dat[has_label, ]
dat_test <- dat[!has_label, ]
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Number of shot attempts in the full data set (labelled and unlabelled rows)
nrow(dat)
## [1] 30697
# Overall shot percentage across the labelled shots
mean(dat_train[["shot_made_flag"]])
## [1] 0.446161
# Attempts and shot percentage for each shot type
dat_train %>%
  group_by(shot_type) %>%
  summarise(total_shot = n(), shot_perc = mean(shot_made_flag))
## Source: local data frame [2 x 3]
##
## shot_type total_shot shot_perc
## (fctr) (int) (dbl)
## 1 2PT Field Goal 20285 0.4773478
## 2 3PT Field Goal 5412 0.3292683
# Attempts and shot percentage for home versus away games
dat_train %>%
  group_by(matchup) %>%
  summarise(total_shot = n(), shot_perc = mean(shot_made_flag))
## Source: local data frame [2 x 3]
##
## matchup total_shot shot_perc
## (chr) (int) (dbl)
## 1 Away 13212 0.4364214
## 2 Home 12485 0.4564678
# Attempts and shot percentage per opponent, sorted ascending by percentage
shot_opp <- dat_train %>%
  group_by(opponent) %>%
  summarise(total_shot = n(), shot_perc = mean(shot_made_flag)) %>%
  arrange(shot_perc)
# Three opponents with the lowest shot percentage
shot_opp %>% head(n = 3)
## Source: local data frame [3 x 3]
##
## opponent total_shot shot_perc
## (fctr) (int) (dbl)
## 1 BKN 45 0.4000000
## 2 IND 626 0.4009585
## 3 NOP 287 0.4076655
# Three opponents with the highest shot percentage (last rows of the sorted table)
shot_opp %>% tail(n = 3)
## Source: local data frame [3 x 3]
##
## opponent total_shot shot_perc
## (fctr) (int) (dbl)
## 1 SAC 1397 0.4652827
## 2 VAN 204 0.4705882
## 3 NYK 566 0.4770318
#Average score per game
##create a function
#' Point value of a shot attempt
#'
#' @param type Vector of shot type labels (character or factor).
#' @return Numeric vector with 2 where `type` equals "2PT Field Goal", 3 for
#'   any other non-missing label, and NA where `type` is NA.
shot_score <- function(type) {
  is_two_pointer <- type == "2PT Field Goal"
  ifelse(is_two_pointer, 2, 3)
}
# Points scored on each attempt: the shot's point value if made, 0 if missed
dat_train[["score"]] <- with(dat_train, shot_score(shot_type) * shot_made_flag)
# Preview the first rows, including the new score column
head(dat_train)
## action_type combined_shot_type game_id lat loc_x loc_y
## 2 Jump Shot Jump Shot 20000012 34.0443 -157 0
## 3 Jump Shot Jump Shot 20000012 33.9093 -101 135
## 4 Jump Shot Jump Shot 20000012 33.8693 138 175
## 5 Driving Dunk Shot Dunk 20000012 34.0443 0 0
## 6 Jump Shot Jump Shot 20000012 34.0553 -145 -11
## 7 Layup Shot Layup 20000012 34.0443 0 0
## lon minutes_remaining period playoffs season seconds_remaining
## 2 -118.4268 10 1 0 2000-01 22
## 3 -118.3708 7 1 0 2000-01 45
## 4 -118.1318 6 1 0 2000-01 52
## 5 -118.2698 6 2 0 2000-01 19
## 6 -118.4148 9 3 0 2000-01 32
## 7 -118.2698 8 3 0 2000-01 52
## shot_distance shot_made_flag shot_type shot_zone_area
## 2 15 0 2PT Field Goal Left Side(L)
## 3 16 1 2PT Field Goal Left Side Center(LC)
## 4 22 0 2PT Field Goal Right Side Center(RC)
## 5 0 1 2PT Field Goal Center(C)
## 6 14 0 2PT Field Goal Left Side(L)
## 7 0 1 2PT Field Goal Center(C)
## shot_zone_basic shot_zone_range game_date matchup opponent score
## 2 Mid-Range 8-16 ft. 2000-10-31 Away POR 0
## 3 Mid-Range 16-24 ft. 2000-10-31 Away POR 2
## 4 Mid-Range 16-24 ft. 2000-10-31 Away POR 0
## 5 Restricted Area Less Than 8 ft. 2000-10-31 Away POR 2
## 6 Mid-Range 8-16 ft. 2000-10-31 Away POR 0
## 7 Restricted Area Less Than 8 ft. 2000-10-31 Away POR 2
# Points per game, summed over the shots in dat_train (shots whose outcome
# is known)
score_game <- dat_train %>%
  group_by(game_id) %>%
  summarise(scores = sum(score))
# Mean of the per-game point totals
mean(score_game[["scores"]])
## [1] 15.86136
library(ggplot2)
# Histogram of per-game point totals, using 5-point bins
p <- ggplot(data = score_game, mapping = aes(x = scores)) +
  geom_histogram(binwidth = 5)
p
# Shot chart coloured by outcome.
# Levels are spelled out so that 0 -> "Miss" and 1 -> "Made" holds regardless
# of which flag values happen to be present in the data.
dat_train$made_miss <- factor(
  dat_train$shot_made_flag,
  levels = c(0, 1),
  labels = c("Miss", "Made")
)
# ggplot() + geom_point() replaces qplot(), which is deprecated in ggplot2,
# and matches the style of the histogram built with ggplot()
p <- ggplot(dat_train, aes(x = loc_x, y = loc_y, color = made_miss)) +
  geom_point()
p
# Shot chart coloured by shot type (2PT vs 3PT field goal)
p <- ggplot(dat_train, aes(x = loc_x, y = loc_y, color = shot_type)) +
  geom_point()
p