library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(jsonlite)

資料來源:kaggle上的NBA shot logs資料集
https://www.kaggle.com/dansbecker/nba-shot-logs

讀入資料

# Load the shot log CSV from the working directory.
records <- readr::read_csv(file = "shot_logs.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  MATCHUP = col_character(),
  LOCATION = col_character(),
  W = col_character(),
  GAME_CLOCK = col_time(format = ""),
  SHOT_RESULT = col_character(),
  CLOSEST_DEFENDER = col_character(),
  player_name = col_character()
)
See spec(...) for full column specifications.
# Print the data that was just read to inspect its contents.
records

會發現 CLOSEST_DEFENDER 的名字與 player_name 的格式不一樣,因此我們先將格式統一。

# Lookup table of shooters: player_id -> name.
# Only one row is kept per player_id so the join below can never duplicate
# shot records, even if an id were ever listed with two spellings.
player_list <- records %>%
  select(player_name, player_id) %>%
  distinct(player_id, .keep_all = TRUE) %>%
  rename(name = player_name)
# Bring CLOSEST_DEFENDER into the same format as player_name.
# The two ", "-separated pieces are swapped and joined with a single space
# (presumably "Last, First" -> "First Last"; confirm against the data).
pro_records <- records %>%
  # Values without a ", " separator yield only one piece; the second piece is
  # filled with NA on the right. This is expected, so no warning is requested.
  separate(
    CLOSEST_DEFENDER,
    c("last_name", "first_name"),
    sep = ", ",
    fill = "right"
  ) %>%
  # na.rm = TRUE drops the missing piece. Without it unite() pastes the
  # literal text "NA" into the result (giving names such as "NA <name>").
  unite("defender_name", first_name, last_name, sep = " ", na.rm = TRUE) %>%
  left_join(player_list, by = c("CLOSEST_DEFENDER_PLAYER_ID" = "player_id")) %>%
  # Prefer the spelling from the shooter table when the defender's id is
  # known there; otherwise fall back to the lower-cased rebuilt name.
  mutate(defender_name = coalesce(name, tolower(defender_name))) %>%
  select(-name)
Expected 2 pieces. Missing pieces filled with `NA` in 389 rows [86, 658, 659, 671, 679, 682, 683, 1353, 1354, 1358, 2218, 2227, 2230, 3443, 3811, 5670, 5671, 5673, 6179, 6180, ...].
pro_records
# Keep the three columns used for the graph: defender_name (defender),
# player_name (shooter) and MATCHUP (game information).
# graph_from_data_frame() reads the first two columns as the edge endpoints
# and attaches every remaining column as an edge attribute.
link <- pro_records %>%
  select(defender_name, player_name, MATCHUP)
# Build a directed graph with one edge per shot record (defender -> shooter).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
def_Network <- graph_from_data_frame(d = link, directed = TRUE)
# Draw the network.
plot(
  def_Network,
  vertex.size = 2,
  edge.arrow.size = 0.5,
  vertex.label.cex = 0.8
)

挑出該年度冠軍金州勇士隊球員的出手紀錄

該年度總冠軍隊伍為金州勇士,我們將其球員的出手紀錄與對到的防守球員挑出來做分析。

# Per (defender, shooter) pair among the records whose MATCHUP contains "GSW ":
#   d_FG  = number of shot records for the pair
#   d_FGM = sum of FGM over those records
#   d_FGP = d_FGM / d_FG
# Only pairs with at least 10 records are kept.
# NOTE(review): presumably MATCHUP lists the shooter's team first, so the
# pattern "GSW " (with trailing space) selects Warriors shooters - confirm.
warriors <- pro_records %>%
  filter(str_detect(MATCHUP, "GSW ")) %>%
  group_by(defender_name, player_name) %>%
  summarise(d_FG = n(), d_FGM = sum(FGM)) %>%
  ungroup() %>%
  filter(d_FG >= 10) %>%
  mutate(d_FGP = d_FGM / d_FG)
warriors
# Build the directed graph (defender -> shooter); d_FG, d_FGM and d_FGP become
# edge attributes.
was_Network <- graph_from_data_frame(d = warriors, directed = TRUE)
# Draw the network.
# Edge width grows with the number of shots the defender contested (d_FG).
# Green edge: the pair's d_FGP is below the mean d_FGP over all edges
#             (defence better than average).
# Red edge:   the pair's d_FGP is at or above that mean
#             (defence worse than average).
set.seed(231)
mean_d_FGP <- mean(E(was_Network)$d_FGP)
E(was_Network)$color <- ifelse(
  E(was_Network)$d_FGP < mean_d_FGP,
  "lightgreen",
  "palevioletred"
)
E(was_Network)$width <- (E(was_Network)$d_FG / 10)^1.5
plot(
  was_Network,
  vertex.size = 2,
  edge.arrow.size = 0.1,
  vertex.label.cex = 0.7
)

勇士教練Kerr在設計戰術時,球員可以多挑防守效益不佳的球員做單打,若對上防守效益好的球員則可透過傳導化解。

篩選出總出手次數大於一定次數的紀錄

選出所有球員中總出手次數不少於 940 次的球員;出手次數多可表示他們相對其他球員上場時間更長,是隊上的主力輸出。

# Keep shooters with at least 940 shot records in total (p_count), then
# aggregate per (defender, shooter) pair:
#   d_FG  = number of shot records for the pair
#   d_FGM = sum of FGM over those records
#   d_FGP = d_FGM / d_FG
# Only pairs that met at least 15 times are kept.
count_records <- pro_records %>%
  add_count(player_name, name = "p_count") %>%
  filter(p_count >= 940) %>%
  group_by(defender_name, player_name) %>%
  summarise(d_FG = n(), d_FGM = sum(FGM)) %>%
  ungroup() %>%
  filter(d_FG >= 15) %>%
  mutate(d_FGP = d_FGM / d_FG)
count_records
# Build the directed graph (defender -> shooter); d_FG, d_FGM and d_FGP become
# edge attributes.
count_Network <- graph_from_data_frame(d = count_records, directed = TRUE)
# Draw the network.
# Edge width grows with the number of shots the defender contested (d_FG).
# Green edge: the pair's d_FGP is below the mean d_FGP over all edges
#             (defence better than average).
# Red edge:   the pair's d_FGP is at or above that mean
#             (defence worse than average).
set.seed(1213)
mean_d_FGP <- mean(E(count_Network)$d_FGP)
E(count_Network)$color <- ifelse(
  E(count_Network)$d_FGP < mean_d_FGP,
  "lightgreen",
  "palevioletred"
)
E(count_Network)$width <- E(count_Network)$d_FG / 10
plot(
  count_Network,
  vertex.size = 2,
  edge.arrow.size = 0.2,
  vertex.label.cex = 0.7
)

防守效益比較

透過計算出一名出手球員的總命中率,與個別球員防守時的命中率差值,
來單獨看每位防守球員在對上哪位出手球員時,有較大的防守效益。

# Overall field-goal figures of each shooter, attached to every shot row:
# total_FGM (sum of FGM), total_FG (row count) and total_FGP (their ratio).
field_goal_per <- pro_records %>%
  group_by(player_name) %>%
  mutate(
    total_FGM = sum(FGM),
    total_FG = n(),
    total_FGP = total_FGM / total_FG
  ) %>%
  ungroup()
field_goal_per
# Field-goal figures of each shooter against each individual defender.
# total_FGP is constant per shooter, so grouping by it simply carries the
# shooter's overall percentage through the summary.
pair_def_records <- field_goal_per %>%
  group_by(defender_name, player_name, total_FGP) %>%
  summarise(def_FGM = sum(FGM), def_FG = n()) %>%
  ungroup() %>%
  mutate(def_FGP = def_FGM / def_FG)
pair_def_records
# Defensive impact of each (defender, shooter) pair:
#   dif_FGP = shooter's FG% against this defender - shooter's overall FG%
# Keep pairs that met at least 8 times and whose absolute difference is at
# least 0.3 (30 percentage points).
s_pair_def_records <- pair_def_records %>%
  filter(def_FG >= 8) %>%
  mutate(dif_FGP = def_FGP - total_FGP) %>%
  filter(abs(dif_FGP) >= 0.3) %>%
  select(defender_name, player_name, dif_FGP)
s_pair_def_records
# Build the directed graph (defender -> shooter); dif_FGP becomes an edge
# attribute.
s_pair_def_Network <- graph_from_data_frame(
  d = s_pair_def_records,
  directed = TRUE
)
# Draw the network.
# Green edge: the shooter's FG% against this defender is below their overall
#             FG% (dif_FGP < 0).
# Red edge:   the shooter's FG% against this defender is at or above their
#             overall FG%.
set.seed(1234)
E(s_pair_def_Network)$color <- ifelse(
  E(s_pair_def_Network)$dif_FGP < 0,
  "lightgreen",
  "palevioletred"
)
plot(
  s_pair_def_Network,
  vertex.size = 2,
  edge.arrow.size = 0.3,
  vertex.label.cex = 0.7
)

年度防守隊伍

NBA每年都會投票選出防守第一隊與第二隊,從球場上的各個位置選最適合的人選。
這裡我們找出2014-15賽季,防守第一隊與第二隊的名單,
透過在名單上各球員的防守下,出手球員命中率的差別來驗證這份名單。

年度防守第一隊

# Defensive impact of the five defenders listed below (the first-team list):
#   dif_FGP = shooter's FG% against this defender - shooter's overall FG%
# Keep pairs that met at least 3 times and whose absolute difference is at
# least 0.25 (25 percentage points).
first_team_records <- pair_def_records %>%
  filter(
    defender_name %in% c(
      "kawhi leonard",
      "draymond green",
      "deandre jordan",
      "tony allen",
      "chris paul"
    )
  ) %>%
  filter(def_FG >= 3) %>%
  mutate(dif_FGP = def_FGP - total_FGP) %>%
  filter(abs(dif_FGP) >= 0.25) %>%
  select(defender_name, player_name, dif_FGP)
first_team_records
# Build the directed graph (defender -> shooter); dif_FGP becomes an edge
# attribute.
first_team_Network <- graph_from_data_frame(
  d = first_team_records,
  directed = TRUE
)
# Draw the network.
# Green edge: the shooter's FG% against this defender is below their overall
#             FG% (dif_FGP < 0).
# Red edge:   the shooter's FG% against this defender is at or above their
#             overall FG%.
set.seed(31)
E(first_team_Network)$color <- ifelse(
  E(first_team_Network)$dif_FGP < 0,
  "lightgreen",
  "palevioletred"
)
plot(
  first_team_Network,
  vertex.size = 2,
  edge.arrow.size = 0.3,
  vertex.label.cex = 0.7
)

年度防守第二隊

# Defensive impact of the five defenders listed below (the second-team list):
#   dif_FGP = shooter's FG% against this defender - shooter's overall FG%
# Keep pairs that met at least 3 times and whose absolute difference is at
# least 0.25 (25 percentage points).
second_team_records <- pair_def_records %>%
  filter(
    defender_name %in% c(
      "anthony davis",
      "tim duncan",
      "andrew bogut",
      "jimmy butler",
      "john wall"
    )
  ) %>%
  filter(def_FG >= 3) %>%
  mutate(dif_FGP = def_FGP - total_FGP) %>%
  filter(abs(dif_FGP) >= 0.25) %>%
  select(defender_name, player_name, dif_FGP)
second_team_records
# Build the directed graph (defender -> shooter); dif_FGP becomes an edge
# attribute.
second_team_Network <- graph_from_data_frame(
  d = second_team_records,
  directed = TRUE
)
# Draw the network.
# Green edge: the shooter's FG% against this defender is below their overall
#             FG% (dif_FGP < 0).
# Red edge:   the shooter's FG% against this defender is at or above their
#             overall FG%.
set.seed(1234)
E(second_team_Network)$color <- ifelse(
  E(second_team_Network)$dif_FGP < 0,
  "lightgreen",
  "palevioletred"
)
plot(
  second_team_Network,
  vertex.size = 2,
  edge.arrow.size = 0.3,
  vertex.label.cex = 0.7
)

去除Wide Open狀態下的資料

來源資料中是每一筆的出手紀錄,故其中的防守球員只是距離出手球員最近的球員,
但球場上可能透過卡位、空切、傳球等手段來跑出空檔,此時出手球員會與防守球員拉開一段距離,
此時防守球員是誰的意義便不會那麼大,因此我們透過數據中的最近防守球員距離為依據,
將距離超過 4.92feet(=1.5公尺) 的紀錄去除,再看剛剛的年度防守第一隊和第二隊的紀錄。

# Keep only the shots whose closest-defender distance (CLOSE_DEF_DIST) is at
# most 4.92, the cut-off the accompanying text gives as 4.92 feet.
no_wide_records <- pro_records %>%
  filter(CLOSE_DEF_DIST <= 4.92)
no_wide_records
# Overall field-goal figures of each shooter on the remaining shots:
# total_FGM (sum of FGM), total_FG (row count) and total_FGP (their ratio).
n_field_goal_per <- no_wide_records %>%
  group_by(player_name) %>%
  mutate(
    total_FGM = sum(FGM),
    total_FG = n(),
    total_FGP = total_FGM / total_FG
  ) %>%
  ungroup()
# Field-goal figures of each shooter against each individual defender.
# total_FGP is constant per shooter, so grouping by it simply carries the
# shooter's overall percentage through the summary.
n_pair_def_records <- n_field_goal_per %>%
  group_by(defender_name, player_name, total_FGP) %>%
  summarise(def_FGM = sum(FGM), def_FG = n()) %>%
  ungroup() %>%
  mutate(def_FGP = def_FGM / def_FG)
n_pair_def_records
# Defensive impact of the five defenders listed below (the first-team list),
# computed from n_pair_def_records:
#   dif_FGP = shooter's FG% against this defender - shooter's overall FG%
# Keep pairs that met at least 3 times and whose absolute difference is at
# least 0.25 (25 percentage points).
n_first_team_records <- n_pair_def_records %>%
  filter(
    defender_name %in% c(
      "kawhi leonard",
      "draymond green",
      "deandre jordan",
      "tony allen",
      "chris paul"
    )
  ) %>%
  filter(def_FG >= 3) %>%
  mutate(dif_FGP = def_FGP - total_FGP) %>%
  filter(abs(dif_FGP) >= 0.25) %>%
  select(defender_name, player_name, dif_FGP)
n_first_team_records
# Build the directed graph (defender -> shooter); dif_FGP becomes an edge
# attribute.
n_first_team_Network <- graph_from_data_frame(
  d = n_first_team_records,
  directed = TRUE
)
# Draw the network.
# Green edge: the shooter's FG% against this defender is below their overall
#             FG% (dif_FGP < 0).
# Red edge:   the shooter's FG% against this defender is at or above their
#             overall FG%.
set.seed(31)
E(n_first_team_Network)$color <- ifelse(
  E(n_first_team_Network)$dif_FGP < 0,
  "lightgreen",
  "palevioletred"
)
plot(
  n_first_team_Network,
  vertex.size = 2,
  edge.arrow.size = 0.3,
  vertex.label.cex = 0.7
)

# Defensive impact of the five defenders listed below (the second-team list),
# computed from n_pair_def_records:
#   dif_FGP = shooter's FG% against this defender - shooter's overall FG%
# Keep pairs that met at least 3 times and whose absolute difference is at
# least 0.25 (25 percentage points).
n_second_team_records <- n_pair_def_records %>%
  filter(
    defender_name %in% c(
      "anthony davis",
      "tim duncan",
      "andrew bogut",
      "jimmy butler",
      "john wall"
    )
  ) %>%
  filter(def_FG >= 3) %>%
  mutate(dif_FGP = def_FGP - total_FGP) %>%
  filter(abs(dif_FGP) >= 0.25) %>%
  select(defender_name, player_name, dif_FGP)
n_second_team_records
# Build the directed graph (defender -> shooter); dif_FGP becomes an edge
# attribute.
n_second_team_Network <- graph_from_data_frame(
  d = n_second_team_records,
  directed = TRUE
)
# Draw the network.
# Green edge: the shooter's FG% against this defender is below their overall
#             FG% (dif_FGP < 0).
# Red edge:   the shooter's FG% against this defender is at or above their
#             overall FG%.
set.seed(2019)
E(n_second_team_Network)$color <- ifelse(
  E(n_second_team_Network)$dif_FGP < 0,
  "lightgreen",
  "palevioletred"
)
plot(
  n_second_team_Network,
  vertex.size = 2,
  edge.arrow.size = 0.3,
  vertex.label.cex = 0.7
)

