Using only first round draft picks from 1980-2012, I’ve calculated the average seasons each team’s draft picks play with their original team to compare all 30 NBA teams.
Here I define a function created to convert previous team names to their current names when cleaning the data (ex. SEA -> OKC):
teamsub <- function(pattern, replacement, x, ...) {
if (length(pattern)!=length(replacement)) {
stop("pattern and replacement do not have the same length.")
}
result <- x
for (i in 1:length(pattern)) {
result <- gsub(pattern[i], replacement[i], result, ...)
}
result
}
The next function I define is to convert factor variables to numeric:
as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}
Step 1: Setting up our dataframes
I initially downloaded datasets from basketballreference.com of each draft class and their relevant stats. Here I load these csv files in my project and combine them into one main dataframe. I also clean up the df by: 1. Removing some columns 2. Giving 0 to any NA’s in the Seasons, Games, Win Shares or WS/48 stats 3. Converting the older team names to the current team names :
OGnames <- sprintf("%sclass.csv", 1980:2012)
for (i in OGnames){
name <- gsub("class.csv", "", i)
name <- paste("class", name, sep = "")
assign(name, read.csv(i, header = T))
}
dfnames <- sprintf("class%s", 1980:2012)
Alldrafted <- rbind.fill(list(class1980, class1981, class1982, class1983, class1984, class1985, class1986, class1987, class1988, class1989, class1990, class1991, class1992, class1993, class1994, class1995, class1996, class1997, class1998, class1999, class2000, class2001, class2002, class2003, class2004, class2005, class2006, class2007, class2008, class2009, class2010, class2011, class2012))
Alldrafted$Player <- gsub("\\\\.*", "", Alldrafted$Player)
Alldrafted <- subset(Alldrafted, select = -c(Rk))
Alldrafted$Tm <- as.character(Alldrafted$Tm)
Alldrafted$Yrs[is.na(Alldrafted$Yrs)] <- 0
Alldrafted$G[is.na(Alldrafted$G)] <- 0
Alldrafted$WS[is.na(Alldrafted$WS)] <- 0
Alldrafted$WS.48[is.na(Alldrafted$WS.48)] <- 0
Alldrafted$Tm <- teamsub(c("SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), Alldrafted$Tm)
head(Alldrafted)
## Pk Tm Player Yrs G WS WS.48
## 1 1 GSW Joe Barry Carroll 10 705 35.6 0.075
## 2 2 UTA Darrell Griffith 10 765 22.0 0.049
## 3 3 BOS Kevin McHale 13 971 113.0 0.180
## 4 4 CHI Kelvin Ransey 6 474 17.9 0.074
## 5 5 DEN James Ray 3 103 -0.3 -0.017
## 6 6 BRK Mike O'Koren 8 407 14.6 0.080
I also utilized data from Real GM that had more information on every first round draft selection which included their draft year, height, weight, Pre NBA Source. I add the data and clean up some of its columns by 1. Converting the height variable to inches 2. Converting the team names to the proper acronyms :
playerinfo <- read.csv("draftedplayerinfo.csv", header = T, fill = T, stringsAsFactors=FALSE, fileEncoding="latin1")
playerinfo$Height <- gsub("11-May", "5-11", playerinfo$Height)
playerinfo$Height <- gsub("11-Jun", "6-11", playerinfo$Height)
playerinfo$Height <- gsub("10-Jun", "6-10", playerinfo$Height)
playerinfo$Height <- gsub("09-Jun", "6-9", playerinfo$Height)
playerinfo$Height <- gsub("08-Jun", "6-8", playerinfo$Height)
playerinfo$Height <- gsub("07-Jun", "6-7", playerinfo$Height)
playerinfo$Height <- gsub("06-Jun", "6-6", playerinfo$Height)
playerinfo$Height <- gsub("05-Jun", "6-5", playerinfo$Height)
playerinfo$Height <- gsub("04-Jun", "6-4", playerinfo$Height)
playerinfo$Height <- gsub("03-Jun", "6-3", playerinfo$Height)
playerinfo$Height <- gsub("02-Jun", "6-2", playerinfo$Height)
playerinfo$Height <- gsub("01-Jun", "6-1", playerinfo$Height)
playerinfo$Height <- gsub("10-May", "5-10", playerinfo$Height)
playerinfo$Height <- gsub("09-May", "5-9", playerinfo$Height)
playerinfo$Height <- gsub("08-May", "5-8", playerinfo$Height)
playerinfo$Height <- gsub("07-May", "5-7", playerinfo$Height)
playerinfo$Height <- gsub("06-May", "5-6", playerinfo$Height)
playerinfo$Height <- gsub("05-May", "5-5", playerinfo$Height)
playerinfo$Height <- gsub("03-May", "5-3", playerinfo$Height)
playerinfo$Height <- gsub("01-Jul", "7-1", playerinfo$Height)
playerinfo$Height <- gsub("02-Jul", "7-2", playerinfo$Height)
playerinfo$Height <- gsub("03-Jul", "7-3", playerinfo$Height)
playerinfo$Height <- gsub("04-Jul", "7-4", playerinfo$Height)
playerinfo$Height <- gsub("05-Jul", "7-5", playerinfo$Height)
playerinfo$Height <- gsub("06-Jul", "7-6", playerinfo$Height)
playerinfo$Height <- gsub("07-Jul", "7-7", playerinfo$Height)
playerinfo$Team <- teamsub(c("Portland Trail Blazers", "Milwaukee Bucks", "Toronto Raptors", "Houston Rockets", "Washington Wizards", "Cleveland Cavaliers", "Golden State Warriors" , "Orlando Magic", "New Jersey Nets", "Chicago Bulls", "Los Angeles Clippers", "Philadelphia Sixers", "San Antonio Spurs", "New York Knicks", "Sacramento Kings", "Charlotte Hornets (1988)", "Los Angeles Lakers", "Dallas Mavericks", "New Orleans Pelicans", "Minnesota Timberwolves", "Seattle SuperSonics", "Atlanta Hawks", "Vancouver Grizzlies", "Detroit Pistons", "Charlotte Hornets", "Miami Heat", "Indiana Pacers", "Phoenix Suns", "San Diego Clippers", "Boston Celtics", "Memphis Grizzlies", "Utah Jazz", "Denver Nuggets", "Oklahoma City Thunder", "Kansas City Kings", "Brooklyn Nets" ), c("POR", "MIL", "TOR", "HOU", "WAS", "CLE", "GSW", "ORL", "BRK", "CHI", "LAC", "PHI", "SAS", "NYK", "SAC", "NOP", "LAL", "DAL", "NOP", "MIN", "OKC", "ATL", "MEM", "DET", "CHO", "MIA", "IND", "PHO", "LAC", "BOS", "MEM", "UTA", "DEN", "OKC", "SAC", "BRK"), playerinfo$Team)
playerinfo$Draft.Trades <- teamsub(c("UTH", "SAN", "PHL", "GOS", "SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("UTA", "SAS", "PHI", "GSW", "LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), playerinfo$Draft.Trades)
head(Alldrafted)
## Pk Tm Player Yrs G WS WS.48
## 1 1 GSW Joe Barry Carroll 10 705 35.6 0.075
## 2 2 UTA Darrell Griffith 10 765 22.0 0.049
## 3 3 BOS Kevin McHale 13 971 113.0 0.180
## 4 4 CHI Kelvin Ransey 6 474 17.9 0.074
## 5 5 DEN James Ray 3 103 -0.3 -0.017
## 6 6 BRK Mike O'Koren 8 407 14.6 0.080
More cleaning of the playerinfo df: 1.Taking away the draft picks post 2012 2. Checking to see if the names from the Real Gm data match the names from the Basketball Reference data (ex. Penny Hardaway vs. Afernee Hardaway) 3. For some reason, Tim Duncan’s stats were not included so I add them in :
playerinfo <- subset(playerinfo, playerinfo$Year != 2013)
playerinfo <- subset(playerinfo, playerinfo$Year != 2014)
playerinfo <- subset(playerinfo, playerinfo$Year != 2015)
playerinfo <- subset(playerinfo, playerinfo$Year != 2016)
namecheck <- merge(Alldrafted, playerinfo, all.x = T, by = "Player")
investigate <- namecheck[complete.cases(namecheck) == F,]
playerinfo$Player[playerinfo$Player == "Alek Radojevic"] <- "Aleksandar Radojevic"
playerinfo$Player[playerinfo$Player == "Penny Hardaway"] <- "Anfernee Hardaway"
playerinfo$Player[playerinfo$Player == "Charles Daniel Smith"] <- "Charles Smith"
playerinfo$Player[playerinfo$Player == "Dontae Jones"] <- "Dontae' Jones"
playerinfo$Player[playerinfo$Player == "Efthimios Rentzias"] <- "Efthimi Rentzias"
playerinfo$Player[playerinfo$Player == "Frankie Williams"] <- "Frank Williams"
playerinfo$Player[playerinfo$Player == "Freddie Banks"] <- "Fred Banks"
playerinfo$Player[playerinfo$Player == "George Muresan"] <- "Gheorghe Muresan"
playerinfo$Player[playerinfo$Player == "Cadillac Anderson"] <- "Greg Anderson"
playerinfo$Player[playerinfo$Player == "J.R. Rider"] <- "Isaiah Rider"
playerinfo$Player[playerinfo$Player == "Jose Rafael Ortiz-Rijos"] <- "Jose Ortiz"
playerinfo$Player[playerinfo$Player == "Marcus D. Williams"] <- "Marcus Williams"
playerinfo$Player[playerinfo$Player == "Mo Taylor"] <- "Maurice Taylor"
playerinfo$Player[playerinfo$Player == "Mel Turpin"] <- "Melvin Turpin"
playerinfo$Player[playerinfo$Player == "Babby Araujo"] <- "Rafael Araujo"
playerinfo$Player[playerinfo$Player == "Saer Sene"] <- "Mouhamed Sene"
playerinfo$Player[playerinfo$Player == "Maybyner Nene"] <- "Nene Hilario"
playerinfo$Player[playerinfo$Player == "Roy Rogers, Jr."] <- "Roy Rogers"
playerinfo$Player[playerinfo$Player == "Jianlian Yi"] <- "Yi Jianlian"
timmy <- c("1997", "1", "1", "Tim Duncan", "SAS", "", "C", "83", "250", "21", "19", "Wake Forest")
playerinfo <-rbind(playerinfo, timmy)
head(playerinfo)
## Year Rnd Pick Player Team Draft.Trades Pos Height Weight Age
## 1 2007 1 1 Greg Oden POR C 7-0 273 19
## 2 2005 1 1 Andrew Bogut MIL C 7-0 260 20
## 3 2006 1 1 Andrea Bargnani TOR PF/C 7-0 245 20
## 4 2002 1 1 Yao Ming HOU C 7-6 310 21
## 5 2001 1 1 Kwame Brown WAS C 6-11 290 19
## 6 2003 1 1 LeBron James CLE SF/PF 6-8 250 18
## YOS Pre.Draft.Team
## 1 6 Ohio State
## 2 11 Utah
## 3 10 Universo De'Longhi Treviso (Italy)
## 4 9 Shanghai Dongfang (China)
## 5 12 Glynn Academy (Georgia)
## 6 13 St. Vincent St. Mary High School (Ohio)
The final set of data I use is scraped from the Basketball Reference website. This data holds the season by season records (1980-2016) of every player and their stats. I will be using this dataset instead of a each player’s career totals because I am only interested in the stats with their drafted team. Once loaded, I will be cleaning the data by: 1. Converting team names 2.Removing records of the combined season stats for players who were traded midseason. If a player is traded midseason, they will have a record for their stats on the original team, stats on their new team and combined stats for the whole season. I remove the combined stats for the year
URLs <- sprintf("http://www.basketball-reference.com/leagues/NBA_%s_advanced.html", 1981:2017)
tabla <- list()
for(v in seq_along(URLs))
{
total <- readHTMLTable(URLs[v])
n.rows <- unlist(lapply(total, function(t) dim(t)[1]))
tabla[[v]] <- as.data.frame(total[[which.max(n.rows)]])
}
allplayers <- do.call(rbind.data.frame, tabla)
allplayers$Player <- gsub("\\*", "", allplayers$Player)
allplayers$Tm <- teamsub(c("SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), allplayers$Tm)
allplayers <- subset(allplayers, Tm != "Tm")
allplayers <- subset(allplayers, Tm != "TOT")
head(allplayers, n=3)
## Rk Player Pos Age Tm G MP PER TS% 3PAr FTr ORB%
## 1 1 Kareem Abdul-Jabbar C 33 LAL 80 2976 25.5 .616 .001 .379 7.6
## 3 2 Tom Abernethy SF 26 GSW 10 39 3.2 .463 .000 1.000 2.8
## 4 2 Tom Abernethy SF 26 IND 29 259 8.7 .458 .018 .339 7.8
## DRB% TRB% AST% STL% BLK% TOV% USG% OWS DWS WS WS/48 .1 OBPM DBPM
## 1 21.5 15.0 13.6 0.9 4.0 12.8 26.3 9.6 4.6 14.3 .230 3.9 1.4
## 3 20.2 11.4 2.9 1.2 0.0 31.6 6.4 0.0 0.0 0.0 -0.004 -6.0 -0.2
## 4 9.1 8.4 8.8 1.1 0.7 8.5 10.9 0.2 0.2 0.4 .072 -2.0 -0.8
## BPM VORP
## 1 5.3 5.4
## 3 -6.2 0.0
## 4 -2.8 -0.1
Step 2: Cleaning the dataframes.
With all three datasets in place, I continue cleaning the data. First off, I want to make sure I uniquely define any players with the same names (ex. Tim Hardaway and Tim Hardaway JR.).
I’ll first find duplicates in the Alldrafted and playerinfo df’s:
unique_test <- data.frame(table(Alldrafted$Player))
unique_test[unique_test$Freq > 1,]
## Var1 Freq
## 140 Charles Smith 2
Alldrafted$Player[Alldrafted$Player == "Charles Smith" & Alldrafted$Pk == 26] <- "Charles Cornelius Smith"
unique_test2 <- data.frame(table(playerinfo$Player))
unique_test2[unique_test2$Freq > 1,]
## [1] Var1 Freq
## <0 rows> (or 0-length row.names)
playerinfo$Player[playerinfo$Player == "Charles Daniel Smith" & playerinfo$Pick == 3] <- "Charles Smith"
Now for the allplayers dataframe. This one is a bit more difficult as there are records for each season played by a player, so there will for sure be duplicates of player names. I will take two approaches to combat this.
The allplayers dataset does not include the season played but does include the age of the player. When I compile the career stats, I want to make sure I am not adding the stats of two seperate players into one. Here I aggregate by player and age and sum games played stats. If there are players at an age with more than 84 games played, I investigate to see if there are duplicate names:
c1 <- allplayers
c1$G <- as.numeric(levels(c1$G))[c1$G]
c2 <- aggregate(c1$G, list(c1$Player, c1$Age), sum)
c2 <- subset(c2, x>84)
names(c2) <- c("Player", "Age", "Games")
c2test <- merge(c2, Alldrafted, all.x = T)
head(c2test)
## Player Age Games Pk Tm Yrs G WS WS.48
## 1 Charles Jones 23 107 NA <NA> NA NA NA NA
## 2 Charles Jones 24 99 NA <NA> NA NA NA NA
## 3 Charles Smith 26 109 3 PHI 9 564 40.9 0.12
## 4 Charles Smith 22 94 3 PHI 9 564 40.9 0.12
## 5 Charles Smith 30 95 3 PHI 9 564 40.9 0.12
## 6 Charles Smith 23 99 3 PHI 9 564 40.9 0.12
Now with the list of players with more than 84 games played in a season, I attempt to remove any players from allplayers who aren’t in the targeted draft group (Round 1, 1980-2012) or change the names of any duplicated players.
allplayers <- allplayers[!(allplayers$Player == "Charles Smith" & allplayers$Tm == "BOS"),]
allplayers <- allplayers[!(allplayers$Player == "Charles Smith" & allplayers$Tm == "MIN"),]
allplayers$Player[allplayers$Player == "Charles Smith" & allplayers$Pos == "SG"] <- "Charles Cornelius Smith"
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "UTA"),]
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "WAS"),]
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "PHO"),]
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "TOT"),]
allplayers <- allplayers[!(allplayers$Player == "Gerald Henderson" & allplayers$Pos == "PG"),]
allplayers <- allplayers[!(allplayers$Player == "Glen Rice" & allplayers$Pos == "SG"),]
allplayers <- allplayers[!(allplayers$Player == "Glenn Robinson" & allplayers$Tm == "IND"),]
allplayers <- allplayers[!(allplayers$Player == "Glenn Robinson" & allplayers$Pos == "SG"),]
allplayers <- allplayers[!(allplayers$Player == "Glenn Robinson" & allplayers$Tm == "PHI" & allplayers$Age == 21),]
allplayers <- allplayers[!(allplayers$Player == "Larry Drew" & allplayers$Tm == "PHI"),]
allplayers <- allplayers[!(allplayers$Player == "Marcus Williams" & allplayers$Tm == "LAC"),]
allplayers <- allplayers[!(allplayers$Player == "Marcus Williams" & allplayers$Tm == "SAS"),]
allplayers <- allplayers[!(allplayers$Player == "Michael Smith" & allplayers$Tm == "SAC"),]
allplayers <- allplayers[!(allplayers$Player == "Michael Smith" & allplayers$Tm == "MEM"),]
allplayers <- allplayers[!(allplayers$Player == "Michael Smith" & allplayers$Tm == "WAS"),]
allplayers <- allplayers[!(allplayers$Player == "Patrick Ewing" & allplayers$Tm == "NOP"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "GSW"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "NOP"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "CHO"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "OKC"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "SAS" & allplayers$G ==20),]
allplayers$Player[allplayers$Player == "Tim Hardaway" & allplayers$Tm == "NYK"] <- "Tim Hardaway Jr"
allplayers$Player[allplayers$Player == "Tim Hardaway" & allplayers$Tm == "ATL"] <- "Tim Hardaway Jr"
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "HOU"),]
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "SAS"),]
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "PHI"),]
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "MIL" & allplayers$Pos == "PG"),]
allplayers <- allplayers[!(allplayers$Player == "Eddie Johnson" & allplayers$Tm == "ATL"),]
allplayers <- allplayers[!(allplayers$Player == "Eddie Johnson" & allplayers$Tm == "CLE"),]
allplayers <- allplayers[!(allplayers$Player == "Eddie Johnson" & allplayers$Tm == "SEA" & allplayers$G == 24),]
Second Attempt at finding duplicate names. In the allplayers dataset, if a player was traded midseason, they would have three records for that season: one for the stats on their initial team, one for the stats on their second team. Knowing this, I find rows that have the same player and age. I then match these players against players who have been traded midseason. If they were traded midseason, there is the possibility that there were two players with the same name who played at the same age.
### I have saved the allplayers file to a csv and have loaded the original data
c3 <- read.csv("allplayers.csv")
c3$Player <- gsub("\\*", "", c3$Player)
c3$Tm <- teamsub(c("SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), c3$Tm)
c3$Player <- as.character(c3$Player)
c3$Age <- as.numeric.factor(c3$Age)
c3 <- c3[c3$Tm != "Tm",]
TradedPlayers <- subset(c3, Tm == "TOT")
TradedPlayers <- TradedPlayers[c(3,5,6)]
c3dupl <- c3[c(3,5,6)]
c3dupl <- c3dupl[c3dupl$Tm != "TOT",]
c3dupl$Freq <- 1
c3dupl <- aggregate(c3dupl$Freq, by = list(c3dupl$Player, c3dupl$Age), sum)
c3dupl <- subset(c3dupl, x >1)
c3check <- merge(c3dupl, TradedPlayers, by.x = c("Group.1", "Group.2"), by.y = c("Player", "Age"), all.x = T)
head(c3check[complete.cases(c3check) == F,])
## Group.1 Group.2 x Tm
## 241 Charles Jones 23 2 <NA>
## 242 Charles Jones 24 2 <NA>
## 243 Charles Jones 26 2 <NA>
## 246 Charles Smith 23 3 <NA>
## 247 Charles Smith 26 2 <NA>
## 248 Charles Smith 27 2 <NA>
After further investigation, I have ensured that the allplayers dataframe has unique player names for the targeted group.
We now have data frame with every draft selection, their draft team and their pick position. Some players were however traded on draft day or the subsequent days after to another team. To account for this, I will utilize some of the information provided in the playerinfo df on draft day trades. For players traded, I will change their Team variable in the Alldrafted df to the corresponding team they’ve been traded to:
draft_day_trades <- subset(playerinfo, select = c("Player", "Pick", "Draft.Trades", "Year"))
draft_day_trades$Draft.Trades[draft_day_trades$Draft.Trades == ""] <- NA
draft_day_trades <- draft_day_trades[complete.cases(draft_day_trades),]
draft_day_trades$Team <- substr(draft_day_trades$Draft.Trades, 8, 10)
draft_day_trades <- subset(draft_day_trades, select = -Draft.Trades)
names(draft_day_trades) <- c("Player", "Pk", "Year", "Team")
Alldrafted <- merge(Alldrafted, draft_day_trades, by = c("Player", "Pk"), all.x = T)
Alldrafted$Tm <- as.character(Alldrafted$Tm)
Alldrafted$Tm[is.na(Alldrafted$Team) == F] <- Alldrafted$Team[is.na(Alldrafted$Team) == F]
Alldrafted <- subset(Alldrafted, select = -c(8, 9))
I just want to check once more to see if there are any players that may have the wrong draft team. I check by seeing which players have no season stats with their drafted team and then investigate further to see if they were traded immediately after the draft.
drafttradetest <- merge(Alldrafted, allplayers, all.x = T, by.x = c("Player", "Tm"), by.y = c("Player", "Tm"))
drafttradetest2 <- drafttradetest[complete.cases(drafttradetest) == F,]
head(drafttradetest2, n=2)
## Player Tm Pk Yrs G.x WS.x WS.48 Rk Pos Age G.y MP PER
## 223 Ansu Sesay DAL 30 4 127 1.4 0.051 <NA> <NA> <NA> <NA> <NA> <NA>
## 229 Anthony Avent DEN 15 6 352 0.3 0.002 <NA> <NA> <NA> <NA> <NA> <NA>
## TS% 3PAr FTr ORB% DRB% TRB% AST% STL% BLK% TOV% USG% OWS DWS
## 223 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 229 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## WS.y WS/48 .1 OBPM DBPM BPM VORP
## 223 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 229 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
Alldrafted$Tm[Alldrafted$Player == "Brendan Haywood"] <-"WAS"
Alldrafted$Tm[Alldrafted$Player == "Byron Houston" ] <- "GSW"
Alldrafted$Tm[Alldrafted$Player == "Craig Brackins" ] <- "PHI"
Alldrafted$Tm[Alldrafted$Player == "Darrell Arthur" ] <- "MEM"
Alldrafted$Tm[Alldrafted$Player == "Anthony Avent"] <- "MIL"
Alldrafted$Tm[Alldrafted$Player == "Efthimi Rentzias"] <- "DEN"
Alldrafted$Tm[Alldrafted$Player == "Fred Roberts" ] <- "MIL"
Alldrafted$Tm[Alldrafted$Player == "Jamaal Tinsley" ] <- "IND"
Alldrafted$Tm[Alldrafted$Player == "Larry Krystkowiak" ] <- "SAS"
Alldrafted$Tm[Alldrafted$Player == "Leandro Barbosa" ] <- "PHO"
Alldrafted$Tm[Alldrafted$Player == "Kenny Battle" ] <- "PHO"
Alldrafted$Tm[Alldrafted$Player == "Mirsad Turkcan" ] <- "HOU"
Alldrafted$Tm[Alldrafted$Player == "Nikola Mirotic" ] <- "CHI"
Alldrafted$Tm[Alldrafted$Player == "Norris Cole" ] <- "MIA"
Alldrafted$Tm[Alldrafted$Player == "Pat Garrity" ] <- "PHO"
Alldrafted$Tm[Alldrafted$Player == "Randy Foye" ] <- "MIN"
Alldrafted$Tm[Alldrafted$Player == "Tracy Murray" ] <- "POR"
Next I want to prepare the playerinfo dataframe to be merged with our Alldrafted df. I am going to remove some of the unnecessary columns first.
playerinfo <- subset(playerinfo, select = -c(Rnd, Team, Draft.Trades))
names(playerinfo) <- c("Draft Year", "Pk", "Player", "POS", "Height", "Weight", "Age", "YOS", "PreNBA")
I’m adding a new column that details whether or not the draftee came from College (c), Overseas (o) or High school (h). Players from High School or Overseas have the region of their school or team in brackets (ex. Glynn Academy (Georgia)). I start by finding only the players from Overseas or High School as they have a the region of their former team in brackets. I use regex to extract only players with brackets in their PreNba variable:
IorH <- playerinfo[which(grepl("\\(", playerinfo$PreNBA) == T),]
There are however some draftees who have brackets in their PreNBA column:
playerinfo$PreNBA[playerinfo$Player == "Alfredrick Hughes"] <- "Loyola"
playerinfo$PreNBA[playerinfo$Player == "John Salmons"] <- "University of Miami"
playerinfo$PreNBA[playerinfo$Player == "Ron Harper"] <- "Miami University"
playerinfo$PreNBA[playerinfo$Player == "Tim James"] <- "University of Miami"
playerinfo$PreNBA[playerinfo$Player == "Wally Szczerbiak"] <- "Miami University"
Let’s try again:
IorH <- playerinfo[which(grepl("\\(", playerinfo$PreNBA) == T),]
head(IorH)
## Draft Year Pk Player POS Height Weight Age YOS
## 3 2006 1 Andrea Bargnani PF/C 7-0 245 20 10
## 4 2002 1 Yao Ming C 7-6 310 21 9
## 5 2001 1 Kwame Brown C 6-11 290 19 12
## 6 2003 1 LeBron James SF/PF 6-8 250 18 13
## 8 2004 1 Dwight Howard C/PF 6-11 265 18 12
## 41 2003 2 Darko Milicic C/PF 7-0 275 18 10
## PreNBA
## 3 Universo De'Longhi Treviso (Italy)
## 4 Shanghai Dongfang (China)
## 5 Glynn Academy (Georgia)
## 6 St. Vincent St. Mary High School (Ohio)
## 8 Southwest Atlanta Christian Academy (Georgia)
## 41 KK Vrsac Swisslion (Serbia)
To figure out whether or not they came from High School or Overseas, I will match what’s in the brackets to a list of American states. I have checked and ensured there were no players who came from playing in Georgia (country not state). I create a new column in the IorH df that gives a TRue or False of whether or not it is a US state. I then add this column to the playerinfo df and give an “O” for False, an “H” for True and a “C” for NA.
states <- read.csv("states.csv", header = T)
head(states)
## State Abbreviation
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
states <- subset(states, select = State)
IorH$a <- str_extract(IorH$PreNBA, "\\(.*\\)")
IorH$a <- gsub("\\(", "", IorH$a)
IorH$a <- gsub("\\)", "", IorH$a)
IorH$b <- IorH$a %in% states$State
IorH <- subset(IorH, select = c(Player, Pk, b))
playerinfo <- merge(playerinfo, IorH, by = c("Player", "Pk"), all.x = T)
UHI <- function (x) {
ifelse(is.na(x) == T, "C",
ifelse(x == "TRUE", "H", "O"))
}
playerinfo$b <- UHI(playerinfo$b)
names(playerinfo) <- c("Player", "Pk", "Draft.Year", "POS", "Height", "Weight", "Age", "YOS", "PreNBA", "Source")
head(playerinfo[complete.cases(playerinfo) == F,])
## [1] Player Pk Draft.Year POS Height Weight
## [7] Age YOS PreNBA Source
## <0 rows> (or 0-length row.names)
head(playerinfo)
## Player Pk Draft.Year POS Height Weight Age YOS PreNBA Source
## 1 A.C. Green 23 1985 PF 6-9 220 21 16 Oregon State C
## 2 Aaron Brooks 26 2007 PG 6-0 161 22 8 Oregon C
## 3 Aaron McKie 17 1994 G 6-5 209 21 14 Temple C
## 4 Acie Earl 19 1993 PF 6-10 240 23 4 Iowa C
## 5 Acie Law 11 2007 SG 6-3 202 22 4 Texas A&M C
## 6 Adam Keefe 10 1992 F 6-9 230 22 9 Stanford C
Now to assign one position for each of the players. I want to break it down into the classic 5 positions used. I will use the playerinfo df but for any players whose position isn’t one of the classic 5, I will assign the position of their rookie year (from the allplayers df).
allplayers$Age <- as.numeric(levels(allplayers$Age))[allplayers$Age]
POS_rookie <- allplayers %>%
group_by(Player) %>%
slice(which.min(Age))
POS_check <- merge(playerinfo, POS_rookie, all.x = T, by.x = "Player", by.y = "Player")
POS_check$Pos <- as.character(POS_check$Pos)
POS_check$Pos[is.na(POS_check$Pos) ==T] <- POS_check$POS
head(POS_check[complete.cases(POS_check) == F,], n=2)
## Player Pk Draft.Year POS Height Weight Age.x YOS PreNBA
## 108 Brian Jackson 26 1981 PF 6-8 200 22 0 Utah State
## 126 Calvin Duncan 30 1985 G 6-4 195 24 1 VCU
## Source Rk Pos Age.y Tm G MP PER TS% 3PAr FTr ORB% DRB%
## 108 C <NA> PF NA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 126 C <NA> PG NA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## TRB% AST% STL% BLK% TOV% USG% OWS DWS WS WS/48 .1 OBPM DBPM
## 108 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 126 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## BPM VORP
## 108 <NA> <NA>
## 126 <NA> <NA>
POS_check$POS[POS_check$POS == "C"] <- 5
POS_check$POS[POS_check$POS == "PF"] <- 4
POS_check$POS[POS_check$POS == "SF"] <- 3
POS_check$POS[POS_check$POS == "SG"] <- 2
POS_check$POS[POS_check$POS == "PG"] <- 1
POS_check$Pos[POS_check$Pos == "C"] <- 5
POS_check$Pos[POS_check$Pos == "PF"] <- 4
POS_check$Pos[POS_check$Pos == "SF"] <- 3
POS_check$Pos[POS_check$Pos == "SG"] <- 2
POS_check$Pos[POS_check$Pos == "PG"] <- 1
head(POS_check, n=2)
## Player Pk Draft.Year POS Height Weight Age.x YOS PreNBA
## 1 A.C. Green 23 1985 4 6-9 220 21 16 Oregon State
## 2 Aaron Brooks 26 2007 1 6-0 161 22 8 Oregon
## Source Rk Pos Age.y Tm G MP PER TS% 3PAr FTr ORB% DRB% TRB% AST%
## 1 C 98 4 22 LAL 82 1542 11.8 .564 .015 .430 12.4 15.5 14.0 4.2
## 2 C 60 1 23 HOU 51 608 13.1 .535 .484 .218 2.5 7.9 5.2 23.5
## STL% BLK% TOV% USG% OWS DWS WS WS/48 .1 OBPM DBPM BPM VORP
## 1 1.5 1.7 17.7 14.7 1.4 2.0 3.3 .103 -0.5 0.8 0.3 0.9
## 2 1.1 0.6 15.1 21.8 0.6 0.8 1.4 .112 0.0 -1.8 -1.8 0.0
POS_Func <- function (x) {
ifelse(POS_check$POS != 1|2|3|4|5, POS_check$Pos, POS_check$POS)
}
POS_check$POS <- POS_Func(POS_check$POS)
POS_check <- POS_check[c(1,4)]
POS_check$POS[POS_check$POS == 5] <- "C"
POS_check$POS[POS_check$POS == 4] <- "PF"
POS_check$POS[POS_check$POS == 3] <- "SF"
POS_check$POS[POS_check$POS == 2] <- "SG"
POS_check$POS[POS_check$POS == 1] <-"PG"
playerinfo <- merge(playerinfo, POS_check, by.x = "Player", by.y = "Player")
playerinfo <- playerinfo[,c(1,2,3,11,5,6,7,8,9,10)]
names(playerinfo) <- c("Player", "Pk", "Draft.Year", "POS", "Height", "Weight", "Age", "YOS", "PreNBA", "Source")
head(playerinfo)
## Player Pk Draft.Year POS Height Weight Age YOS PreNBA Source
## 1 A.C. Green 23 1985 PF 6-9 220 21 16 Oregon State C
## 2 Aaron Brooks 26 2007 PG 6-0 161 22 8 Oregon C
## 3 Aaron McKie 17 1994 SG 6-5 209 21 14 Temple C
## 4 Acie Earl 19 1993 C 6-10 240 23 4 Iowa C
## 5 Acie Law 11 2007 PG 6-3 202 22 4 Texas A&M C
## 6 Adam Keefe 10 1992 PF 6-9 230 22 9 Stanford C
Step 3: Aggregating Stats
I can now begin the process of aggregating season stats to each player to find their stats with their drafted team. I begin by creating a new df careerstats and converting names and classes of the variables:
careerstats <- subset(allplayers, select = c(Player, Age, Tm, G, MP, WS, `WS/48`, DWS, OWS, PER, BPM, OBPM, DBPM, VORP, `TS%`, `3PAr`, FTr, `TRB%`, `AST%`, `STL%`, `BLK%`, `TOV%`, `USG%`))
names(careerstats) <- c("Player", "Age", "Team", "Games", "Minutes", "WS", "WS.48", "DWS", "OWS", "PER", "BPM", "OBPM", "DBPM", "VORP", "TSp", "TPAr", "FTr", "TRBp", "ASTp", "STLp", "BLKp", "TOVp", "USGp")
careerstats[,4:23] <- lapply(careerstats[,4:23], as.numeric.factor)
head(careerstats)
## Player Age Team Games Minutes WS WS.48 DWS OWS PER
## 1 Kareem Abdul-Jabbar 33 LAL 80 2976 14.3 0.230 4.6 9.6 25.5
## 3 Tom Abernethy 26 GSW 10 39 0.0 -0.004 0.0 0.0 3.2
## 4 Tom Abernethy 26 IND 29 259 0.4 0.072 0.2 0.2 8.7
## 5 Alvan Adams 26 PHO 75 2054 7.7 0.180 4.5 3.3 20.3
## 6 Darrell Allums 22 DAL 22 276 -0.3 -0.054 0.2 -0.5 5.3
## 7 Tiny Archibald 32 BOS 80 2820 6.9 0.118 2.3 4.6 14.3
## BPM OBPM DBPM VORP TSp TPAr FTr TRBp ASTp STLp BLKp TOVp USGp
## 1 5.3 3.9 1.4 5.4 0.616 0.001 0.379 15.0 13.6 0.9 4.0 12.8 26.3
## 3 -6.2 -6.0 -0.2 0.0 0.463 0.000 1.000 11.4 2.9 1.2 0.0 31.6 6.4
## 4 -2.8 -2.0 -0.8 -0.1 0.458 0.018 0.339 8.4 8.8 1.1 0.7 8.5 10.9
## 5 5.3 2.0 3.3 3.8 0.567 0.000 0.298 14.7 24.5 2.4 1.9 18.7 23.0
## 6 -5.6 -5.9 0.4 -0.2 0.385 0.015 0.328 13.4 12.5 0.9 1.7 23.1 15.1
## 7 -1.3 0.4 -1.6 0.5 0.582 0.012 0.547 3.6 28.5 1.3 0.4 21.8 17.5
Check for any NA’s. We run into one of the most challenging decisions of the project. After this check, I realize there are NA’s value for players who did not play a minute in the NBA. For some values like Seasons or Win Shares, I can easily assign a 0. The problem was that for a stat like PER and VORP, it is relevant to the other players in the league - and these both represent an impact you had(whether negative or positive). If a player never plays a game, is that as bad as playing the worst game possible? Or should they find the sweet spot where they had no impact on the game at all. I decided to assign players with NA’s the average of players who have only played one game:
head(careerstats[complete.cases(careerstats) == F,])
## Player Age Team Games Minutes WS WS.48 DWS OWS PER BPM
## 1360 Kevin Loder 24 LAC 1 4 0 -0.018 0 0 -3.9 -5.3
## 2088 Yvon Joseph 28 BRK 1 5 0 0.309 0 0 12.8 -0.9
## 2383 Darren Daye 26 CHI 1 7 0 -0.124 0 0 -5.2 -11.4
## 2904 Jeff Lamp 28 LAL 3 7 0 0.227 0 0 9.2 -2.0
## 3108 Tony White 22 CHI 2 2 0 0.012 0 0 0.0 -6.2
## 3509 John Stroeder 30 SAS 1 2 0 -0.652 0 0 -38.8 -25.6
## OBPM DBPM VORP TSp TPAr FTr TRBp ASTp STLp BLKp TOVp USGp
## 1360 -5.3 -0.1 0 NA NA NA 0.0 0.0 0 0 NA 0.0
## 2088 2.7 -3.6 0 1.136 NA NA 0.0 0.0 0 0 0 7.0
## 2383 -11.9 0.5 0 NA NA NA 8.1 16.8 0 0 100 6.0
## 2904 1.2 -3.2 0 1.136 NA NA 0.0 0.0 0 0 0 5.2
## 3108 -6.6 0.4 0 NA NA NA 0.0 0.0 0 0 NA 0.0
## 3509 -25.4 -0.2 0 NA NA NA 0.0 0.0 0 0 100 19.4
careerstats$WS[is.na(careerstats$WS) == T] <- 0
careerstats$WS.48[is.na(careerstats$WS.48) == T] <- 0
careerstats$DWS[is.na(careerstats$DWS) == T] <- 0
careerstats$OWS[is.na(careerstats$OWS) == T] <- 0
careerstats$PER[is.na(careerstats$PER) == T] <- mean(careerstats$PER[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$BPM[is.na(careerstats$BPM) == T] <- mean(careerstats$BPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$OBPM[is.na(careerstats$OBPM) == T] <- mean(careerstats$OBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$DBPM[is.na(careerstats$DBPM) == T] <- mean(careerstats$DBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$VORP[is.na(careerstats$VORP) == T] <- mean(careerstats$VORP[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$TSp[is.na(careerstats$TSp) == T] <- 0
careerstats$FTr[is.na(careerstats$FTr) == T] <- 0
careerstats$TPAr[is.na(careerstats$TPAr) == T] <- 0
careerstats$TRBp[is.na(careerstats$TRBp) == T] <- 0
careerstats$ASTp[is.na(careerstats$ASTp) == T] <- 0
careerstats$STLp[is.na(careerstats$STLp) == T] <- 0
careerstats$BLKp[is.na(careerstats$BLKp) == T] <- 0
careerstats$USGp[is.na(careerstats$USGp) == T] <- 0
careerstats$TOVp[is.na(careerstats$TOVp) == T] <- 0
Calculate the number of seasons played by each player:
seasonsplayed <- count(careerstats, c('Player', 'Age', "Team"))
names(seasonsplayed) <- c('Player', 'Age', 'Team', 'Seasons')
seasonsplayed <- aggregate(seasonsplayed$Seasons, list(seasonsplayed$Player, seasonsplayed$Team), sum)
names(seasonsplayed) <- c('Player', 'Team', 'Seasons')
head(seasonsplayed)
## Player Team Seasons
## 1 Acie Law ATL 2
## 2 Adam Keefe ATL 2
## 3 Adreian Payne ATL 1
## 4 Al Harrington ATL 2
## 5 Al Horford ATL 9
## 6 Al Wood ATL 1
Aggregating each player’s stats into one record (their total career stats):
stats_with_Teams <- merge(aggregate(careerstats[c(4,5,6,8,9)], list(careerstats$Player, careerstats$Team), sum), aggregate(careerstats[c(7, 10:23)], list(careerstats$Player, careerstats$Team), mean))
names(stats_with_Teams)[1] <- "Player"
names(stats_with_Teams)[2] <- "Tm"
head(stats_with_Teams, n=2)
## Player Tm Games Minutes WS DWS OWS WS.48 PER
## 1 A.C. Green DAL 188 5517 10.6 4.9 5.7 0.08933333 12.63333
## 2 A.C. Green LAL 735 21451 64.4 27.2 37.2 0.14188889 14.72222
## BPM OBPM DBPM VORP TSp TPAr FTr
## 1 -1.333333 -1.6666667 0.3333333 0.4666667 0.4880000 0.02866667 0.3083333
## 2 1.044444 0.3333333 0.7000000 1.8888889 0.5647778 0.03911111 0.4818889
## TRBp ASTp STLp BLKp TOVp USGp
## 1 14.46667 5.366667 1.533333 0.6333333 9.30000 12.53333
## 2 14.74444 5.077778 1.511111 1.1111111 12.32222 15.17778
Now I append the career stats to the drafted players from 1980-2012, as well as their total seasons played with their initial team:
drafted_stats <- merge(Alldrafted, stats_with_Teams, by.x = c("Player", "Tm"), by.y = c("Player", "Tm"), all.x = T)
drafted_stats <- merge(drafted_stats,seasonsplayed,by.x = c("Player", "Tm"), by.y = c("Player", "Team"), all.x = T)
I again need to adjust for players who did not play a minute (NA across the board) with the team that drafted them:
drafted_stats$Seasons[is.na(drafted_stats$Seasons) == T] <- 0
drafted_stats$Games[is.na(drafted_stats$Games) == T] <- 0
drafted_stats$Minutes[is.na(drafted_stats$Minutes) == T] <- 0
drafted_stats$WS.y[is.na(drafted_stats$WS.y) == T] <- 0
drafted_stats$WS.48.y[is.na(drafted_stats$WS.48.y) == T] <- 0
drafted_stats$DWS[is.na(drafted_stats$DWS) == T] <- 0
drafted_stats$OWS[is.na(drafted_stats$OWS) == T] <- 0
drafted_stats$PER[is.na(drafted_stats$PER) == T] <- mean(careerstats$PER[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$BPM[is.na(drafted_stats$BPM) == T] <- mean(careerstats$BPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$OBPM[is.na(drafted_stats$OBPM) == T] <- mean(careerstats$OBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$DBPM[is.na(drafted_stats$DBPM) == T] <- mean(careerstats$DBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$VORP[is.na(drafted_stats$VORP) == T] <- mean(careerstats$VORP[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$TSp[is.na(drafted_stats$TSp) == T] <- 0
drafted_stats$FTr[is.na(drafted_stats$FTr) == T] <- 0
drafted_stats$TPAr[is.na(drafted_stats$TPAr) == T] <- 0
drafted_stats$TRBp[is.na(drafted_stats$TRBp) == T] <- 0
drafted_stats$ASTp[is.na(drafted_stats$ASTp) == T] <- 0
drafted_stats$STLp[is.na(drafted_stats$STLp) == T] <- 0
drafted_stats$BLKp[is.na(drafted_stats$BLKp) == T] <- 0
drafted_stats$USGp[is.na(drafted_stats$USGp) == T] <- 0
drafted_stats$TOVp[is.na(drafted_stats$TOVp) == T] <- 0
drafted_stats <- merge(drafted_stats, playerinfo, by.x = c("Player", "Pk"), by.y = c("Player", "Pk"))
drafted_stats <- drafted_stats[c(1,2,3,29:33,35,36,28,8:27)]
names(drafted_stats)[14] <- "WS"
names(drafted_stats)[17] <- "WS.48"
head(drafted_stats, n=3)
## Player Pk Tm Draft.Year POS Height Weight Age PreNBA Source
## 1 A.C. Green 23 LAL 1985 PF 6-9 220 21 Oregon State C
## 2 Aaron Brooks 26 HOU 2007 PG 6-0 161 22 Oregon C
## 3 Aaron McKie 17 POR 1994 SG 6-5 209 21 Temple C
## Seasons Games Minutes WS DWS OWS WS.48 PER BPM
## 1 9 735 21451 64.4 27.2 37.2 0.14188889 14.72222 1.044444e+00
## 2 6 297 7090 12.0 5.4 6.5 0.05416667 11.86667 -2.500000e+00
## 3 3 167 3861 8.4 5.5 2.8 0.09800000 11.93333 -2.775558e-17
## OBPM DBPM VORP TSp TPAr FTr TRBp
## 1 0.3333333 0.70 1.8888889 0.5647778 0.03911111 0.4818889 14.744444
## 2 -0.4500000 -2.05 0.2666667 0.4973333 0.45833333 0.1738333 4.233333
## 3 -1.5333333 1.50 0.7666667 0.5043333 0.20733333 0.2900000 7.866667
## ASTp STLp BLKp TOVp USGp
## 1 5.077778 1.511111 1.111111 12.32222 15.17778
## 2 22.616667 1.300000 1.333333 15.30000 22.75000
## 3 15.433333 2.200000 1.200000 15.76667 16.13333
Step 4: Visualizing the Data
Plot 1:
The first plot I will be creating is the most “loyal” draft classes. Find the classes with the greatest average for seasons played with their original drafted team.
loyaltycheck <- aggregate(drafted_stats$Seasons, list(drafted_stats$Draft.Year), mean)
names(loyaltycheck) <- c("Draft.Class", "Average.Seasons")
I also want to show the player who has played the most seasons in each draft class. I find this by selecting the player with the most seasons for each class(if there is a tie for seasons, I use games to determine who is the most loyal):
loyaltycheck_top <- aggregate(drafted_stats$Seasons, list(drafted_stats$Draft.Year), max, na.rm = TRUE)
names(loyaltycheck_top) <- c("Draft.Year", "Seasons")
loyaltycheck_top <- merge(loyaltycheck_top, drafted_stats, by = c("Draft.Year", "Seasons"), all.x = T)
loyaltycheck_top <- aggregate(loyaltycheck_top$Games, list(loyaltycheck_top$Draft.Year), max, na.rm = TRUE)
names(loyaltycheck_top) <- c("Draft.Year", "Games")
loyaltycheck_top <- merge(loyaltycheck_top, drafted_stats, by = c("Draft.Year", "Games"), all.x = T)
loyaltycheck_top <- subset(loyaltycheck_top, select = c(Player, Draft.Year, Seasons, Tm))
Using plotly to create the graph. I need to split the data records of top players into different sets to allow for my annotations to lay clean on the plot with no overlapping:
loyaltycheck_top$Draft.Year <- as.numeric(loyaltycheck_top$Draft.Year)
loyaltycheck_topH <- subset(loyaltycheck_top, Draft.Year %in% c(1981,1987,1988,1998))
loyaltycheck_topL <- subset(loyaltycheck_top, Draft.Year %in% c(1983,2002,2006))
loyaltycheck_topR <- subset(loyaltycheck_top, Draft.Year %in% c(1982,1994,2004,2008,2009,2010,2011,2012))
loyaltycheck_topM <- subset(loyaltycheck_top, Draft.Year %in% c(1980,1984,1985,1986,1989,1990,1991,1992,1993,1995,1996,1997,1999,2000,2001,2003,2005,2007))
Loyal.Class <- plot_ly(loyaltycheck, x = loyaltycheck$Draft.Class, y = loyaltycheck$Average.Seasons,
type = "scatter", mode = "lines", name = "Average Seasons") %>%
add_trace(y = ~loyaltycheck_top$Seasons, name = 'Most Loyal', mode = 'markers') %>%
add_annotations(x = loyaltycheck_topM$Draft.Year,
y = loyaltycheck_topM$Seasons,
text = paste(loyaltycheck_topM$Player, loyaltycheck_topM$Tm),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = 20,
ay = -40,
font = list(color = '#264E86',
family = 'sans serif',
size = 12))%>%
add_annotations(x = loyaltycheck_topH$Draft.Year,
y = loyaltycheck_topH$Seasons,
text = paste(loyaltycheck_topH$Player, loyaltycheck_topH$Tm),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = 20,
ay = -60,
font = list(color = '#264E86',
family = 'sans serif',
size = 12))%>%
add_annotations(x = loyaltycheck_topR$Draft.Year,
y = loyaltycheck_topR$Seasons,
text = paste(loyaltycheck_topR$Player, loyaltycheck_topR$Tm),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = 40,
ay = -40,
font = list(color = '#264E86',
family = 'sans serif',
size = 12))%>%
add_annotations(x = loyaltycheck_topL$Draft.Year,
y = loyaltycheck_topL$Seasons,
text = paste(loyaltycheck_topL$Player, loyaltycheck_topL$Tm),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = 20,
ay = 40,
font = list(color = '#264E86',
family = 'sans serif',
size = 12))%>%
layout(title = "Most Loyal Draft Classes",
xaxis = list(title = "Draft Class"),
yaxis = list(title = "Total Seasons With Drafted Team"))
I will create the same plot but for teams instead of draft classes:
Tloyaltycheck <- aggregate(drafted_stats$Seasons, list(drafted_stats$Tm), mean)
names(Tloyaltycheck) <- c("Tm", "Average.Seasons")
Tloyaltycheck_top <- aggregate(drafted_stats$Seasons, list(drafted_stats$Tm), max, na.rm = TRUE)
names(Tloyaltycheck_top) <- c("Tm", "Seasons")
Tloyaltycheck_top <- merge(Tloyaltycheck_top, drafted_stats, by = c("Tm", "Seasons"), all.x = T)
Tloyaltycheck_top <- aggregate(Tloyaltycheck_top$Games, list(Tloyaltycheck_top$Tm), max, na.rm = TRUE)
names(Tloyaltycheck_top) <- c("Tm", "Games")
Tloyaltycheck_top <- merge(Tloyaltycheck_top, drafted_stats, by = c("Tm", "Games"), all.x = T)
Tloyaltycheck_top <- subset(Tloyaltycheck_top, select = c(Player, Tm, Seasons, Draft.Year))
Tloyaltycheck_topLeft <- subset(Tloyaltycheck_top, Tm %in% c("HOU", "MIA", "LAC", "MIL", "MIN", "SAS", "ORL", "SAC"))
Tloyaltycheck_topLow <- subset(Tloyaltycheck_top, Tm %in% c("TOR"))
Tloyaltycheck_topHR <- subset(Tloyaltycheck_top, Tm %in% c("POR"))
Tloyaltycheck_topM <- subset(Tloyaltycheck_top, Tm %in% c("ATL", "BOS", "BRK", "CHI", "CHO", "CLE", "DAL", "DEN", "DET", "GSW", "IND", "LAL", "MEM", "NOP", "NYK", "OKC", "PHI", "PHO", "UTA", "WAS"))
Loyal.Team <- plot_ly(Tloyaltycheck, x = Tloyaltycheck$Tm, y = Tloyaltycheck$Average.Seasons,
type = "scatter", mode = "lines", name = "Average Seasons") %>%
add_trace(y = ~Tloyaltycheck_top$Seasons, name = 'Most Loyal', mode = 'markers') %>%
add_annotations(x = Tloyaltycheck_topM$Tm,
y = Tloyaltycheck_topM$Seasons,
text = paste(Tloyaltycheck_topM$Player, Tloyaltycheck_topM$Draft.Year),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = 20,
ay = -40, font = list(color = '#264E86',
family = 'sans serif',
size = 12)) %>%
add_annotations(x = Tloyaltycheck_topLeft$Tm,
y = Tloyaltycheck_topLeft$Seasons,
text = paste(Tloyaltycheck_topLeft$Player, Tloyaltycheck_topLeft$Draft.Year),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = -20,
ay = -40, font = list(color = '#264E86',
family = 'sans serif',
size = 12)) %>%
add_annotations(x = Tloyaltycheck_topLow$Tm,
y = Tloyaltycheck_topLow$Seasons,
text = paste(Tloyaltycheck_topLow$Player, Tloyaltycheck_topLow$Draft.Year),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = 20,
ay = 40, font = list(color = '#264E86',
family = 'sans serif',
size = 12)) %>%
add_annotations(x = Tloyaltycheck_topHR$Tm,
y = Tloyaltycheck_topHR$Seasons,
text = paste(Tloyaltycheck_topHR$Player, Tloyaltycheck_topHR$Draft.Year),
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
ax = 40,
ay = -60,font = list(color = '#264E86',
family = 'sans serif',
size = 12)) %>%
layout(title = "Average Seasons Played with Drafted Team",
xaxis = list(title = "Team"),
yaxis = list(title = "Total Seasons With Drafted Team"))
Loyal.Team