################################################################################
## Using web scraping and visualization to analyze NBA players' performance
## by Dr. Jimmy (Zhenning) Xu,
## follow me on Twitter https://twitter.com/MKTJimmyxu
################################################################################
Playing basketball is my hobby. I enjoy playing it even though I am not good at it at all!
Watching NBA games is a luxury for me. However, from a marketing perspective, valuing players' performance is quite interesting, since every major sports brand wants to be an early sponsor of a future NBA star!
The goal of this case analysis is to present a way to scrape NBA data and analyze it using basic visualization techniques. Which player has the best performance? I just found something very counterintuitive! I will do some more analysis when I have some free time this summer!
#### Let's pull the new NBA player tracking data from online and merge it into a data set in R ####
# NOTE: this script used to begin with rm(list = ls()), which erases every
# object in the caller's workspace as soon as the file is sourced. That call
# has been dropped so that running the script no longer destroys unrelated
# work; start a fresh R session if a clean environment is wanted.

# Named list of addresses for the raw data, one entry per statistic family.
# Each element is passed to readIt() in turn.
addressList <- list(
  pullup_address     = "http://stats.nba.com/js/data/sportvu/pullUpShootData.js",
  drives_address     = "http://stats.nba.com/js/data/sportvu/drivesData.js",
  defense_address    = "http://stats.nba.com/js/data/sportvu/defenseData.js",
  passing_address    = "http://stats.nba.com/js/data/sportvu/passingData.js",
  touches_address    = "http://stats.nba.com/js/data/sportvu/touchesData.js",
  speed_address      = "http://stats.nba.com/js/data/sportvu/speedData.js",
  rebounding_address = "http://stats.nba.com/js/data/sportvu/reboundingData.js",
  catchshoot_address = "http://stats.nba.com/js/data/sportvu/catchShootData.js",
  shooting_address   = "http://stats.nba.com/js/data/sportvu/shootingData.js"
)
# Read one raw ".js" data file and convert it to an R data frame.
#
# Args:
#   address: anything readLines() accepts (a URL or a file path).
# Returns:
#   A data frame built from the header line and data rows of the payload,
#   with the last parsed column removed. Strings are kept as character.
readIt <- function(address) {
  web_page <- readLines(address)

  # Regex pipeline that strips the JavaScript bits and reshapes the raw text
  # into CSV form:
  #   1. delete every "{", "}" and "]"
  #   2. turn every "[" into a line break
  #   3. delete the '"rowSet":' marker together with its line break
  #   4. turn every ";" into ","
  x1 <- gsub("[\\{\\}\\]]", "", web_page, perl = TRUE)
  x2 <- gsub("[\\[]", "\n", x1, perl = TRUE)
  x3 <- gsub("\"rowSet\":\n", "", x2, perl = TRUE)
  x4 <- gsub(";", ",", x3, perl = TRUE)

  # Close the text connection on exit; read.table() does not close a
  # connection it did not open itself, so it would otherwise leak.
  con <- textConnection(x4)
  on.exit(close(con), add = TRUE)

  # The first two lines are skipped and the third is used as the header.
  nba <- read.table(con, header = TRUE, sep = ",", skip = 2,
                    stringsAsFactors = FALSE)

  # Strip the last column (presumably an empty artifact of the trailing comma
  # on each line -- confirm against the feed). seq_len() avoids the
  # precedence trap in `1:ncol(nba)-1`, which evaluates to 0:(n - 1);
  # drop = FALSE keeps a data frame even when one column remains.
  nba[, seq_len(ncol(nba) - 1), drop = FALSE]
}
# Merge a list of data sets into one, left to right.
#
# Args:
#   .list: list of data frames; a one-element list is returned as its element.
#   ...:   forwarded unchanged to every merge() call (e.g. `by`).
# Returns:
#   The result of merging element 1 with element 2, that result with
#   element 3, and so on.
merge.rec <- function(.list, ...) {
  merged <- .list[[1]]
  for (nxt in .list[-1]) {
    merged <- merge(merged, nxt, ...)
  }
  merged
}
# Read one data set per address; the result keeps the names of addressList.
df_list <- lapply(addressList, function(address) readIt(address))
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/pullUpShootData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/drivesData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/defenseData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/passingData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/touchesData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/speedData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/reboundingData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/catchShootData.js'
## Warning in readLines(address): incomplete final line found on 'http://
## stats.nba.com/js/data/sportvu/shootingData.js'
# Key columns used to merge the individual data sets.
mergevars <- c(
  "PLAYER_ID",
  "PLAYER",
  "FIRST_NAME",
  "LAST_NAME",
  "TEAM_ABBREVIATION",
  "GP",
  "MIN"
)
# Fold every data set in df_list into a single data frame, matching rows on
# the key columns above.
final.data <- merge.rec(.list = df_list, by = mergevars)
## Warning in merge.data.frame(.list[[1]], .list[[2]], ...): column names
## 'PTS.x', 'PTS.y' are duplicated in the result
# More informative column names for the merged data, listed in column order.
# The first seven entries are the merge keys; the remaining entries are
# grouped by the statistic family they come from.
varnames <- c(
  "PLAYER_ID",             # was misspelled "PlAYER_ID"; now matches the merge key
  "PLAYER",
  "FIRST_NAME",
  "LAST_NAME",
  "TEAM_ABBREVIATION",
  "GP",
  "MIN",
  "PU_PPG",
  "PU_FGM_PG",
  "PU_FGA_PG",
  "PU_FG_PCT",
  "PU_FG3M_PG",
  "PU_FG3A_PG",
  "PU_FG3_PCT",
  "PU_EFG_PCT",
  "PU_PTS_TOT",
  "DVS",                   # START DRIVES STATS
  "DVS_PPG",               # drives player PPG
  "DVS_TPPG",              # TEAM drives ppg
  "DVS_FG_PCT",
  "DVS_PP48",              # drives pts per 48 min driving
  "DVS_PTS_TOT",
  "DVS_TOT",
  "BLK_PG",                # START DEFENSIVE STATS
  "STL_PG",
  "OPP_FGM_RIM",
  "OPP_FGA_RIM",
  "OPP_FGP_RIM",
  "BLK_TOT",
  "PASS_PG",               # START PASSING STATS
  "AST_PG",
  "AST_FT",
  "AST_SEC",
  "AST_OPPS_PG",
  "AST_PTS_CRT_PG",
  "AST_PTS_CRT_48",
  "AST_TOT",
  "TCH_PG",                # BEGIN TOUCHES STATS
  "TCH_FC_PG",             # FC = front court
  "TOP",                   # time of possession
  "CL_TCH_PG",             # touches close to basket
  "EL_TCH_PG",             # touches in the "elbow" area
  "PPG",                   # raw ppg
  "PP_TCH",                # pts per touch
  "PP_HC_TCH",             # pts per half court touch
  "TCH_TOT",
  "DIST",                  # START SPEED AND DIST STATS
  "AVG_SPD",
  "DIST_PG",
  "DIST_48",
  "DIST_OFF",
  "DIST_DEF",
  "AVG_SPD_OFF",
  "AVG_SPD_DEF",
  "REB_PG",                # START REBOUNDING STATS
  "REB_CHANCE_PG",
  "REB_COL_PCT",
  "REB_CONTESTED",
  "REB_UNCONTESTED",
  "REB_UNCONTESTED_PCT",
  "REB_TOT",
  "OREB",
  "OREB_CHANCE",
  "OREB_COL_PCT",
  "OREB_CONTESTED",
  "OREB_UNCONTESTED",
  "OREB_UNCONTESTED_PCT",
  "DREB",
  "DREB_CHANCE",
  "DREB_COL_PCT",
  "DREB_CONTESTED",
  "DREB_UNCONTESTED",
  "DREB_UNCONTESTED_PCT",
  "CS_PPG",                # START CATCH AND SHOOT STATS
  "CS_FGM_PG",
  "CS_FGA_PG",
  "CS_FG_PCT",
  "CS_FG3M_PG",
  "CS_FG3A_PG",
  "CS_FG3_PCT",
  "CS_EFG_PCT",
  "CS_PTS_TOT",
  # NOTE(review): the next entry repeats the name "CS_PPG" used above, so any
  # later de-duplication by column name discards this column. It may really
  # be the first column of the shooting table -- confirm against the feed's
  # headers before renaming it.
  "CS_PPG",
  "PTS_DRIVE",             # START SHOOTING EFFICIENCY STATS
  "FGP_DRIVE",
  "PTS_CLOSE",
  "FGP_CLOSE",
  "PTS_CATCH_SHOOT",
  "FGP_CATCH_SHOOT",
  "PTS_PULL_UP",
  "FGP_PULL_UP",
  "FGA_DRIVE",
  "FGA_CLOSE",
  "FGA_CATCH_SHOOT",
  "FGA_PULL_UP",
  "EFG_PCT"
)
# Rename the columns of the merged data frame, position by position.
names(final.data) <- varnames
To reveal NBA players' performance, I used “minutes per game” and “points per game” in this analysis. Let’s create a simple visualization. Adding labels to a scatterplot does not seem to be easy. It took me almost an hour to figure it out. It still does not look perfect, though! Aha!!!
#https://www.r-graph-gallery.com/275-add-text-labels-with-ggplot2/
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 3.5.3
library(tidyverse)
## -- Attaching packages -------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Error: `data` must be uniquely named but has duplicate columns
# Keep only the first column for each column name; plotting the data with
# repeated column names failed with a "must be uniquely named" error.
data <- final.data[, !duplicated(names(final.data))]

# Requested plot size (repr.* options; presumably read by a notebook front
# end, if one is in use).
options(repr.plot.width = 6, repr.plot.height = 3)

# Scatterplot of PU_PPG against MIN with a linear-model smoother, each point
# labelled with the player's first name.
ggplot(data = data, mapping = aes(x = MIN, y = PU_PPG)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "MIN", y = "PU_PPG") +
  geom_text(mapping = aes(label = FIRST_NAME), size = 3.5)
What did you find? Share your insights with me (@MKTJimmyxu) on Twitter! stats.nba.com