Using only first round draft picks from 1980-2012, I’ve calculated the average seasons each team’s draft picks play with their original team to compare all 30 NBA teams.

Here I define a function created to convert previous team names to their current names when cleaning the data (ex. SEA -> OKC):

teamsub <- function(pattern, replacement, x, ...) {
  if (length(pattern)!=length(replacement)) {
    stop("pattern and replacement do not have the same length.")
  }
  result <- x
  for (i in 1:length(pattern)) {
    result <- gsub(pattern[i], replacement[i], result, ...)
  }
  result
}

The next function I define is to convert factor variables to numeric:

as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}

Step 1: Setting up our dataframes

I initially downloaded datasets from basketballreference.com of each draft class and their relevant stats. Here I load these csv files in my project and combine them into one main dataframe. I also clean up the df by: 1. Removing some columns 2. Giving 0 to any NA’s in the Seasons, Games, Win Shares or WS/48 stats 3. Converting the older team names to the current team names :

OGnames <- sprintf("%sclass.csv", 1980:2012)
for (i in OGnames){
  name <- gsub("class.csv", "", i)
name <- paste("class", name, sep = "")
assign(name, read.csv(i, header = T))
}

dfnames <- sprintf("class%s", 1980:2012)

Alldrafted <- rbind.fill(list(class1980, class1981, class1982, class1983, class1984, class1985, class1986, class1987, class1988, class1989, class1990, class1991, class1992, class1993, class1994, class1995, class1996, class1997, class1998, class1999, class2000, class2001, class2002, class2003, class2004, class2005, class2006, class2007, class2008, class2009, class2010, class2011, class2012))

Alldrafted$Player <- gsub("\\\\.*", "", Alldrafted$Player)
Alldrafted <- subset(Alldrafted, select = -c(Rk))
Alldrafted$Tm <- as.character(Alldrafted$Tm)
Alldrafted$Yrs[is.na(Alldrafted$Yrs)] <- 0
Alldrafted$G[is.na(Alldrafted$G)] <- 0
Alldrafted$WS[is.na(Alldrafted$WS)] <- 0
Alldrafted$WS.48[is.na(Alldrafted$WS.48)] <- 0

Alldrafted$Tm <- teamsub(c("SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), Alldrafted$Tm)

head(Alldrafted)
##   Pk  Tm            Player Yrs   G    WS  WS.48
## 1  1 GSW Joe Barry Carroll  10 705  35.6  0.075
## 2  2 UTA  Darrell Griffith  10 765  22.0  0.049
## 3  3 BOS      Kevin McHale  13 971 113.0  0.180
## 4  4 CHI     Kelvin Ransey   6 474  17.9  0.074
## 5  5 DEN         James Ray   3 103  -0.3 -0.017
## 6  6 BRK      Mike O'Koren   8 407  14.6  0.080

I also utilized data from Real GM that had more information on every first round draft selection which included their draft year, height, weight, Pre NBA Source. I add the data and clean up some of its columns by 1. Converting the height variable to inches 2. Converting the team names to the proper acronyms :

playerinfo <- read.csv("draftedplayerinfo.csv", header = T, fill = T, stringsAsFactors=FALSE, fileEncoding="latin1")
playerinfo$Height <- gsub("11-May", "5-11", playerinfo$Height)
playerinfo$Height <- gsub("11-Jun", "6-11", playerinfo$Height)
playerinfo$Height <- gsub("10-Jun", "6-10", playerinfo$Height)
playerinfo$Height <- gsub("09-Jun", "6-9", playerinfo$Height)
playerinfo$Height <- gsub("08-Jun", "6-8", playerinfo$Height)
playerinfo$Height <- gsub("07-Jun", "6-7", playerinfo$Height)
playerinfo$Height <- gsub("06-Jun", "6-6", playerinfo$Height)
playerinfo$Height <- gsub("05-Jun", "6-5", playerinfo$Height)
playerinfo$Height <- gsub("04-Jun", "6-4", playerinfo$Height)
playerinfo$Height <- gsub("03-Jun", "6-3", playerinfo$Height)
playerinfo$Height <- gsub("02-Jun", "6-2", playerinfo$Height)
playerinfo$Height <- gsub("01-Jun", "6-1", playerinfo$Height)
playerinfo$Height <- gsub("10-May", "5-10", playerinfo$Height)
playerinfo$Height <- gsub("09-May", "5-9", playerinfo$Height)
playerinfo$Height <- gsub("08-May", "5-8", playerinfo$Height)
playerinfo$Height <- gsub("07-May", "5-7", playerinfo$Height)
playerinfo$Height <- gsub("06-May", "5-6", playerinfo$Height)
playerinfo$Height <- gsub("05-May", "5-5", playerinfo$Height)
playerinfo$Height <- gsub("03-May", "5-3", playerinfo$Height)
playerinfo$Height <- gsub("01-Jul", "7-1", playerinfo$Height)
playerinfo$Height <- gsub("02-Jul", "7-2", playerinfo$Height)
playerinfo$Height <- gsub("03-Jul", "7-3", playerinfo$Height)
playerinfo$Height <- gsub("04-Jul", "7-4", playerinfo$Height)
playerinfo$Height <- gsub("05-Jul", "7-5", playerinfo$Height)
playerinfo$Height <- gsub("06-Jul", "7-6", playerinfo$Height)
playerinfo$Height <- gsub("07-Jul", "7-7", playerinfo$Height)

playerinfo$Team <- teamsub(c("Portland Trail Blazers", "Milwaukee Bucks", "Toronto Raptors", "Houston Rockets", "Washington Wizards", "Cleveland Cavaliers", "Golden State Warriors" , "Orlando Magic", "New Jersey Nets", "Chicago Bulls", "Los Angeles Clippers", "Philadelphia Sixers", "San Antonio Spurs", "New York Knicks", "Sacramento Kings", "Charlotte Hornets (1988)", "Los Angeles Lakers", "Dallas Mavericks", "New Orleans Pelicans", "Minnesota Timberwolves", "Seattle SuperSonics", "Atlanta Hawks", "Vancouver Grizzlies", "Detroit Pistons", "Charlotte Hornets", "Miami Heat", "Indiana Pacers", "Phoenix Suns", "San Diego Clippers", "Boston Celtics", "Memphis Grizzlies", "Utah Jazz", "Denver Nuggets", "Oklahoma City Thunder", "Kansas City Kings", "Brooklyn Nets" ), c("POR", "MIL", "TOR", "HOU", "WAS", "CLE", "GSW", "ORL", "BRK", "CHI", "LAC", "PHI", "SAS", "NYK", "SAC", "NOP", "LAL", "DAL", "NOP", "MIN", "OKC", "ATL", "MEM", "DET", "CHO", "MIA", "IND", "PHO", "LAC", "BOS", "MEM", "UTA", "DEN", "OKC", "SAC", "BRK"), playerinfo$Team)

playerinfo$Draft.Trades <- teamsub(c("UTH", "SAN", "PHL", "GOS", "SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("UTA", "SAS", "PHI", "GSW", "LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), playerinfo$Draft.Trades)

head(Alldrafted)
##   Pk  Tm            Player Yrs   G    WS  WS.48
## 1  1 GSW Joe Barry Carroll  10 705  35.6  0.075
## 2  2 UTA  Darrell Griffith  10 765  22.0  0.049
## 3  3 BOS      Kevin McHale  13 971 113.0  0.180
## 4  4 CHI     Kelvin Ransey   6 474  17.9  0.074
## 5  5 DEN         James Ray   3 103  -0.3 -0.017
## 6  6 BRK      Mike O'Koren   8 407  14.6  0.080

More cleaning of the playerinfo df: 1.Taking away the draft picks post 2012 2. Checking to see if the names from the Real Gm data match the names from the Basketball Reference data (ex. Penny Hardaway vs. Afernee Hardaway) 3. For some reason, Tim Duncan’s stats were not included so I add them in :

playerinfo <- subset(playerinfo, playerinfo$Year != 2013)
playerinfo <- subset(playerinfo, playerinfo$Year != 2014)
playerinfo <- subset(playerinfo, playerinfo$Year != 2015)
playerinfo <- subset(playerinfo, playerinfo$Year != 2016)

namecheck <- merge(Alldrafted, playerinfo, all.x = T, by = "Player")
investigate <- namecheck[complete.cases(namecheck) == F,]

playerinfo$Player[playerinfo$Player == "Alek Radojevic"] <- "Aleksandar Radojevic"
playerinfo$Player[playerinfo$Player == "Penny Hardaway"] <- "Anfernee Hardaway"
playerinfo$Player[playerinfo$Player == "Charles Daniel Smith"] <- "Charles Smith"
playerinfo$Player[playerinfo$Player == "Dontae Jones"] <- "Dontae' Jones"
playerinfo$Player[playerinfo$Player == "Efthimios Rentzias"] <- "Efthimi Rentzias"
playerinfo$Player[playerinfo$Player == "Frankie Williams"] <- "Frank Williams"
playerinfo$Player[playerinfo$Player == "Freddie Banks"] <- "Fred Banks"
playerinfo$Player[playerinfo$Player == "George Muresan"] <- "Gheorghe Muresan"
playerinfo$Player[playerinfo$Player == "Cadillac Anderson"] <- "Greg Anderson"
playerinfo$Player[playerinfo$Player == "J.R. Rider"] <- "Isaiah Rider"
playerinfo$Player[playerinfo$Player == "Jose Rafael Ortiz-Rijos"] <- "Jose Ortiz"
playerinfo$Player[playerinfo$Player == "Marcus D. Williams"] <- "Marcus Williams"
playerinfo$Player[playerinfo$Player == "Mo Taylor"] <- "Maurice Taylor"
playerinfo$Player[playerinfo$Player == "Mel Turpin"] <- "Melvin Turpin"
playerinfo$Player[playerinfo$Player == "Babby Araujo"] <- "Rafael Araujo"
playerinfo$Player[playerinfo$Player == "Saer Sene"] <- "Mouhamed Sene"
playerinfo$Player[playerinfo$Player == "Maybyner Nene"] <- "Nene Hilario"
playerinfo$Player[playerinfo$Player == "Roy Rogers, Jr."] <- "Roy Rogers"
playerinfo$Player[playerinfo$Player == "Jianlian Yi"] <- "Yi Jianlian"

timmy <- c("1997", "1", "1", "Tim Duncan", "SAS", "", "C", "83", "250", "21", "19", "Wake Forest")
playerinfo <-rbind(playerinfo, timmy)

head(playerinfo)
##   Year Rnd Pick          Player Team Draft.Trades   Pos Height Weight Age
## 1 2007   1    1       Greg Oden  POR                  C    7-0    273  19
## 2 2005   1    1    Andrew Bogut  MIL                  C    7-0    260  20
## 3 2006   1    1 Andrea Bargnani  TOR               PF/C    7-0    245  20
## 4 2002   1    1        Yao Ming  HOU                  C    7-6    310  21
## 5 2001   1    1     Kwame Brown  WAS                  C   6-11    290  19
## 6 2003   1    1    LeBron James  CLE              SF/PF    6-8    250  18
##   YOS                          Pre.Draft.Team
## 1   6                              Ohio State
## 2  11                                    Utah
## 3  10      Universo De'Longhi Treviso (Italy)
## 4   9               Shanghai Dongfang (China)
## 5  12                 Glynn Academy (Georgia)
## 6  13 St. Vincent St. Mary High School (Ohio)

The final set of data I use is scraped from the Basketball Reference website. This data holds the season by season records (1980-2016) of every player and their stats. I will be using this dataset instead of a each player’s career totals because I am only interested in the stats with their drafted team. Once loaded, I will be cleaning the data by: 1. Converting team names 2.Removing records of the combined season stats for players who were traded midseason. If a player is traded midseason, they will have a record for their stats on the original team, stats on their new team and combined stats for the whole season. I remove the combined stats for the year

URLs <- sprintf("http://www.basketball-reference.com/leagues/NBA_%s_advanced.html", 1981:2017)
tabla <- list()
for(v in seq_along(URLs))
{
  total <- readHTMLTable(URLs[v])
  n.rows <- unlist(lapply(total, function(t) dim(t)[1]))
  tabla[[v]] <- as.data.frame(total[[which.max(n.rows)]])
}

allplayers <- do.call(rbind.data.frame, tabla)

allplayers$Player <- gsub("\\*", "", allplayers$Player)
allplayers$Tm <- teamsub(c("SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), allplayers$Tm)

allplayers <- subset(allplayers, Tm != "Tm")
allplayers <- subset(allplayers, Tm != "TOT")

head(allplayers, n=3)
##   Rk              Player Pos Age  Tm  G   MP  PER  TS% 3PAr   FTr ORB%
## 1  1 Kareem Abdul-Jabbar   C  33 LAL 80 2976 25.5 .616 .001  .379  7.6
## 3  2       Tom Abernethy  SF  26 GSW 10   39  3.2 .463 .000 1.000  2.8
## 4  2       Tom Abernethy  SF  26 IND 29  259  8.7 .458 .018  .339  7.8
##   DRB% TRB% AST% STL% BLK% TOV% USG%   OWS DWS   WS  WS/48  .1 OBPM DBPM
## 1 21.5 15.0 13.6  0.9  4.0 12.8 26.3   9.6 4.6 14.3   .230      3.9  1.4
## 3 20.2 11.4  2.9  1.2  0.0 31.6  6.4   0.0 0.0  0.0 -0.004     -6.0 -0.2
## 4  9.1  8.4  8.8  1.1  0.7  8.5 10.9   0.2 0.2  0.4   .072     -2.0 -0.8
##    BPM VORP
## 1  5.3  5.4
## 3 -6.2  0.0
## 4 -2.8 -0.1

Step 2: Cleaning the dataframes.

With all three datasets in place, I continue cleaning the data. First off, I want to make sure I uniquely define any players with the same names (ex. Tim Hardaway and Tim Hardaway JR.).

I’ll first find duplicates in the Alldrafted and playerinfo df’s:

unique_test <- data.frame(table(Alldrafted$Player))
unique_test[unique_test$Freq > 1,]
##              Var1 Freq
## 140 Charles Smith    2
Alldrafted$Player[Alldrafted$Player == "Charles Smith" & Alldrafted$Pk == 26] <- "Charles Cornelius Smith"

unique_test2 <- data.frame(table(playerinfo$Player))
unique_test2[unique_test2$Freq > 1,]
## [1] Var1 Freq
## <0 rows> (or 0-length row.names)
playerinfo$Player[playerinfo$Player == "Charles Daniel Smith" & playerinfo$Pick == 3] <- "Charles Smith"

Now for the allplayers dataframe. This one is a bit more difficult as there are records for each season played by a player, so there will for sure be duplicates of player names. I will take two approaches to combat this.

The allplayers dataset does not include the season played but does include the age of the player. When I compile the career stats, I want to make sure I am not adding the stats of two seperate players into one. Here I aggregate by player and age and sum games played stats. If there are players at an age with more than 84 games played, I investigate to see if there are duplicate names:

c1 <- allplayers
c1$G <- as.numeric(levels(c1$G))[c1$G]
c2 <- aggregate(c1$G, list(c1$Player, c1$Age), sum)
c2 <- subset(c2, x>84)
names(c2) <- c("Player", "Age", "Games")
c2test <- merge(c2, Alldrafted, all.x = T)
head(c2test)
##          Player Age Games Pk   Tm Yrs   G   WS WS.48
## 1 Charles Jones  23   107 NA <NA>  NA  NA   NA    NA
## 2 Charles Jones  24    99 NA <NA>  NA  NA   NA    NA
## 3 Charles Smith  26   109  3  PHI   9 564 40.9  0.12
## 4 Charles Smith  22    94  3  PHI   9 564 40.9  0.12
## 5 Charles Smith  30    95  3  PHI   9 564 40.9  0.12
## 6 Charles Smith  23    99  3  PHI   9 564 40.9  0.12

Now with the list of players with more than 84 games played in a season, I attempt to remove any players from allplayers who aren’t in the targeted draft group (Round 1, 1980-2012) or change the names of any duplicated players.

allplayers <- allplayers[!(allplayers$Player == "Charles Smith" & allplayers$Tm == "BOS"),]
allplayers <- allplayers[!(allplayers$Player == "Charles Smith" & allplayers$Tm == "MIN"),]
allplayers$Player[allplayers$Player == "Charles Smith" & allplayers$Pos == "SG"] <- "Charles Cornelius Smith"
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "UTA"),]
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "WAS"),]
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "PHO"),]
allplayers <- allplayers[!(allplayers$Player == "Dee Brown" & allplayers$Pos == "PG" & allplayers$Tm == "TOT"),]
allplayers <- allplayers[!(allplayers$Player == "Gerald Henderson" & allplayers$Pos == "PG"),]
allplayers <- allplayers[!(allplayers$Player == "Glen Rice" & allplayers$Pos == "SG"),]
allplayers <- allplayers[!(allplayers$Player == "Glenn Robinson" & allplayers$Tm == "IND"),]
allplayers <- allplayers[!(allplayers$Player == "Glenn Robinson" & allplayers$Pos == "SG"),]
allplayers <- allplayers[!(allplayers$Player == "Glenn Robinson" & allplayers$Tm == "PHI" & allplayers$Age == 21),]
allplayers <- allplayers[!(allplayers$Player == "Larry Drew" & allplayers$Tm == "PHI"),]
allplayers <- allplayers[!(allplayers$Player == "Marcus Williams" & allplayers$Tm == "LAC"),]
allplayers <- allplayers[!(allplayers$Player == "Marcus Williams" & allplayers$Tm == "SAS"),]
allplayers <- allplayers[!(allplayers$Player == "Michael Smith" & allplayers$Tm == "SAC"),]
allplayers <- allplayers[!(allplayers$Player == "Michael Smith" & allplayers$Tm == "MEM"),]
allplayers <- allplayers[!(allplayers$Player == "Michael Smith" & allplayers$Tm == "WAS"),]
allplayers <- allplayers[!(allplayers$Player == "Patrick Ewing" & allplayers$Tm == "NOP"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "GSW"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "NOP"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "CHO"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "OKC"),]
allplayers <- allplayers[!(allplayers$Player == "Reggie Williams" & allplayers$Tm == "SAS" & allplayers$G ==20),]
allplayers$Player[allplayers$Player == "Tim Hardaway" & allplayers$Tm == "NYK"] <- "Tim Hardaway Jr"
allplayers$Player[allplayers$Player == "Tim Hardaway" & allplayers$Tm == "ATL"] <- "Tim Hardaway Jr"
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "HOU"),]
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "SAS"),]
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "PHI"),]
allplayers <- allplayers[!(allplayers$Player == "Mike Dunleavy" & allplayers$Tm == "MIL" & allplayers$Pos == "PG"),]
allplayers <- allplayers[!(allplayers$Player == "Eddie Johnson" & allplayers$Tm == "ATL"),]
allplayers <- allplayers[!(allplayers$Player == "Eddie Johnson" & allplayers$Tm == "CLE"),]
allplayers <- allplayers[!(allplayers$Player == "Eddie Johnson" & allplayers$Tm == "SEA" & allplayers$G == 24),]

Second Attempt at finding duplicate names. In the allplayers dataset, if a player was traded midseason, they would have three records for that season: one for the stats on their initial team, one for the stats on their second team. Knowing this, I find rows that have the same player and age. I then match these players against players who have been traded midseason. If they were traded midseason, there is the possibility that there were two players with the same name who played at the same age.

### I have saved the allplayers file to a csv and have loaded the original data
c3 <- read.csv("allplayers.csv")
c3$Player <- gsub("\\*", "", c3$Player)
c3$Tm <- teamsub(c("SDC", "VAN", "NJN", "CHH", "NOH", "NOK", "KCK", "WSB", "SEA", "CHA", "PHX"), c("LAC", "MEM", "BRK", "NOP", "NOP", "NOP", "SAC", "WAS", "OKC", "CHO", "PHO"), c3$Tm)
c3$Player <- as.character(c3$Player)
c3$Age <- as.numeric.factor(c3$Age)
c3 <- c3[c3$Tm != "Tm",]

TradedPlayers <- subset(c3, Tm == "TOT")
TradedPlayers <- TradedPlayers[c(3,5,6)]

c3dupl <- c3[c(3,5,6)]
c3dupl <- c3dupl[c3dupl$Tm != "TOT",]
c3dupl$Freq <- 1
c3dupl <- aggregate(c3dupl$Freq, by = list(c3dupl$Player, c3dupl$Age), sum)
c3dupl <- subset(c3dupl, x >1)
c3check <- merge(c3dupl, TradedPlayers, by.x = c("Group.1", "Group.2"), by.y = c("Player", "Age"), all.x = T)
head(c3check[complete.cases(c3check) == F,])
##           Group.1 Group.2 x   Tm
## 241 Charles Jones      23 2 <NA>
## 242 Charles Jones      24 2 <NA>
## 243 Charles Jones      26 2 <NA>
## 246 Charles Smith      23 3 <NA>
## 247 Charles Smith      26 2 <NA>
## 248 Charles Smith      27 2 <NA>

After further investigation, I have ensured that the allplayers dataframe has unique player names for the targeted group.

We now have data frame with every draft selection, their draft team and their pick position. Some players were however traded on draft day or the subsequent days after to another team. To account for this, I will utilize some of the information provided in the playerinfo df on draft day trades. For players traded, I will change their Team variable in the Alldrafted df to the corresponding team they’ve been traded to:

draft_day_trades <- subset(playerinfo, select = c("Player", "Pick", "Draft.Trades", "Year"))
draft_day_trades$Draft.Trades[draft_day_trades$Draft.Trades == ""] <- NA
draft_day_trades <- draft_day_trades[complete.cases(draft_day_trades),]
draft_day_trades$Team <- substr(draft_day_trades$Draft.Trades, 8, 10)
draft_day_trades <- subset(draft_day_trades, select = -Draft.Trades)
names(draft_day_trades) <- c("Player", "Pk", "Year", "Team")
Alldrafted <- merge(Alldrafted, draft_day_trades, by = c("Player", "Pk"), all.x = T)
Alldrafted$Tm <- as.character(Alldrafted$Tm)
Alldrafted$Tm[is.na(Alldrafted$Team) == F] <- Alldrafted$Team[is.na(Alldrafted$Team) == F]
Alldrafted <- subset(Alldrafted, select = -c(8, 9))

I just want to check once more to see if there are any players that may have the wrong draft team. I check by seeing which players have no season stats with their drafted team and then investigate further to see if they were traded immediately after the draft.

drafttradetest <- merge(Alldrafted, allplayers, all.x = T, by.x = c("Player", "Tm"), by.y = c("Player", "Tm"))
drafttradetest2 <- drafttradetest[complete.cases(drafttradetest) == F,]
head(drafttradetest2, n=2)
##            Player  Tm Pk Yrs G.x WS.x WS.48   Rk  Pos  Age  G.y   MP  PER
## 223    Ansu Sesay DAL 30   4 127  1.4 0.051 <NA> <NA> <NA> <NA> <NA> <NA>
## 229 Anthony Avent DEN 15   6 352  0.3 0.002 <NA> <NA> <NA> <NA> <NA> <NA>
##      TS% 3PAr  FTr ORB% DRB% TRB% AST% STL% BLK% TOV% USG%       OWS  DWS
## 223 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 229 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
##     WS.y WS/48   .1 OBPM DBPM  BPM VORP
## 223 <NA>  <NA> <NA> <NA> <NA> <NA> <NA>
## 229 <NA>  <NA> <NA> <NA> <NA> <NA> <NA>
Alldrafted$Tm[Alldrafted$Player == "Brendan Haywood"] <-"WAS" 
Alldrafted$Tm[Alldrafted$Player == "Byron Houston" ] <- "GSW" 
Alldrafted$Tm[Alldrafted$Player == "Craig Brackins" ] <- "PHI"
Alldrafted$Tm[Alldrafted$Player == "Darrell Arthur" ] <- "MEM" 
Alldrafted$Tm[Alldrafted$Player == "Anthony Avent"] <- "MIL"
Alldrafted$Tm[Alldrafted$Player == "Efthimi Rentzias"] <- "DEN"   
Alldrafted$Tm[Alldrafted$Player == "Fred Roberts" ] <- "MIL"
Alldrafted$Tm[Alldrafted$Player == "Jamaal Tinsley" ] <- "IND" 
Alldrafted$Tm[Alldrafted$Player == "Larry Krystkowiak" ] <- "SAS"
Alldrafted$Tm[Alldrafted$Player == "Leandro Barbosa" ] <- "PHO"
Alldrafted$Tm[Alldrafted$Player == "Kenny Battle" ] <- "PHO"
Alldrafted$Tm[Alldrafted$Player == "Mirsad Turkcan" ] <- "HOU" 
Alldrafted$Tm[Alldrafted$Player == "Nikola Mirotic" ] <- "CHI"
Alldrafted$Tm[Alldrafted$Player == "Norris Cole" ] <- "MIA"
Alldrafted$Tm[Alldrafted$Player == "Pat Garrity" ] <- "PHO"
Alldrafted$Tm[Alldrafted$Player == "Randy Foye" ] <- "MIN"
Alldrafted$Tm[Alldrafted$Player == "Tracy Murray" ] <- "POR"

Next I want to prepare the playerinfo dataframe to be merged with our Alldrafted df. I am going to remove some of the unnecessary columns first.

playerinfo <- subset(playerinfo, select =  -c(Rnd, Team, Draft.Trades))
names(playerinfo) <- c("Draft Year", "Pk", "Player", "POS", "Height", "Weight", "Age", "YOS", "PreNBA")

I’m adding a new column that details whether or not the draftee came from College (c), Overseas (o) or High school (h). Players from High School or Overseas have the region of their school or team in brackets (ex. Glynn Academy (Georgia)). I start by finding only the players from Overseas or High School as they have a the region of their former team in brackets. I use regex to extract only players with brackets in their PreNba variable:

IorH <- playerinfo[which(grepl("\\(", playerinfo$PreNBA) == T),]

There are however some draftees who have brackets in their PreNBA column:

playerinfo$PreNBA[playerinfo$Player == "Alfredrick Hughes"] <- "Loyola"
playerinfo$PreNBA[playerinfo$Player == "John Salmons"] <- "University of Miami"
playerinfo$PreNBA[playerinfo$Player == "Ron Harper"] <- "Miami University"
playerinfo$PreNBA[playerinfo$Player == "Tim James"] <- "University of Miami"
playerinfo$PreNBA[playerinfo$Player == "Wally Szczerbiak"] <- "Miami University"

Let’s try again:

IorH <- playerinfo[which(grepl("\\(", playerinfo$PreNBA) == T),]
head(IorH)
##    Draft Year Pk          Player   POS Height Weight Age YOS
## 3        2006  1 Andrea Bargnani  PF/C    7-0    245  20  10
## 4        2002  1        Yao Ming     C    7-6    310  21   9
## 5        2001  1     Kwame Brown     C   6-11    290  19  12
## 6        2003  1    LeBron James SF/PF    6-8    250  18  13
## 8        2004  1   Dwight Howard  C/PF   6-11    265  18  12
## 41       2003  2   Darko Milicic  C/PF    7-0    275  18  10
##                                           PreNBA
## 3             Universo De'Longhi Treviso (Italy)
## 4                      Shanghai Dongfang (China)
## 5                        Glynn Academy (Georgia)
## 6        St. Vincent St. Mary High School (Ohio)
## 8  Southwest Atlanta Christian Academy (Georgia)
## 41                   KK Vrsac Swisslion (Serbia)

To figure out whether or not they came from High School or Overseas, I will match what’s in the brackets to a list of American states. I have checked and ensured there were no players who came from playing in Georgia (country not state). I create a new column in the IorH df that gives a TRue or False of whether or not it is a US state. I then add this column to the playerinfo df and give an “O” for False, an “H” for True and a “C” for NA.

states <- read.csv("states.csv", header = T)
head(states)
##        State Abbreviation
## 1    Alabama           AL
## 2     Alaska           AK
## 3    Arizona           AZ
## 4   Arkansas           AR
## 5 California           CA
## 6   Colorado           CO
states <- subset(states, select = State)

IorH$a <- str_extract(IorH$PreNBA, "\\(.*\\)")
IorH$a <- gsub("\\(", "", IorH$a)
IorH$a <- gsub("\\)", "", IorH$a)
IorH$b <- IorH$a %in% states$State
IorH <- subset(IorH, select = c(Player, Pk, b))

playerinfo <- merge(playerinfo, IorH, by = c("Player", "Pk"), all.x = T)


UHI <- function (x) {
      ifelse(is.na(x) == T, "C",
             ifelse(x == "TRUE", "H", "O"))
}
playerinfo$b <- UHI(playerinfo$b)
names(playerinfo) <- c("Player", "Pk", "Draft.Year", "POS", "Height", "Weight", "Age", "YOS", "PreNBA", "Source")

head(playerinfo[complete.cases(playerinfo) == F,])
##  [1] Player     Pk         Draft.Year POS        Height     Weight    
##  [7] Age        YOS        PreNBA     Source    
## <0 rows> (or 0-length row.names)
head(playerinfo)
##         Player Pk Draft.Year POS Height Weight Age YOS       PreNBA Source
## 1   A.C. Green 23       1985  PF    6-9    220  21  16 Oregon State      C
## 2 Aaron Brooks 26       2007  PG    6-0    161  22   8       Oregon      C
## 3  Aaron McKie 17       1994   G    6-5    209  21  14       Temple      C
## 4    Acie Earl 19       1993  PF   6-10    240  23   4         Iowa      C
## 5     Acie Law 11       2007  SG    6-3    202  22   4    Texas A&M      C
## 6   Adam Keefe 10       1992   F    6-9    230  22   9     Stanford      C

Now to assign one position for each of the players. I want to break it down into the classic 5 positions used. I will use the playerinfo df but for any players whose position isn’t one of the classic 5, I will assign the position of their rookie year (from the allplayers df).

allplayers$Age <- as.numeric(levels(allplayers$Age))[allplayers$Age]
POS_rookie <- allplayers %>% 
    group_by(Player) %>% 
    slice(which.min(Age))

POS_check <- merge(playerinfo, POS_rookie, all.x = T, by.x = "Player", by.y = "Player")
POS_check$Pos <- as.character(POS_check$Pos)
POS_check$Pos[is.na(POS_check$Pos) ==T] <- POS_check$POS
head(POS_check[complete.cases(POS_check) == F,], n=2)
##            Player Pk Draft.Year POS Height Weight Age.x YOS     PreNBA
## 108 Brian Jackson 26       1981  PF    6-8    200    22   0 Utah State
## 126 Calvin Duncan 30       1985   G    6-4    195    24   1        VCU
##     Source   Rk Pos Age.y   Tm    G   MP  PER  TS% 3PAr  FTr ORB% DRB%
## 108      C <NA>  PF    NA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 126      C <NA>  PG    NA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
##     TRB% AST% STL% BLK% TOV% USG%       OWS  DWS   WS WS/48   .1 OBPM DBPM
## 108 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>  <NA> <NA> <NA> <NA>
## 126 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>  <NA> <NA> <NA> <NA>
##      BPM VORP
## 108 <NA> <NA>
## 126 <NA> <NA>
POS_check$POS[POS_check$POS == "C"] <- 5
POS_check$POS[POS_check$POS == "PF"] <- 4
POS_check$POS[POS_check$POS == "SF"] <- 3
POS_check$POS[POS_check$POS == "SG"] <- 2
POS_check$POS[POS_check$POS == "PG"] <- 1


POS_check$Pos[POS_check$Pos == "C"] <- 5
POS_check$Pos[POS_check$Pos == "PF"] <- 4
POS_check$Pos[POS_check$Pos == "SF"] <- 3
POS_check$Pos[POS_check$Pos == "SG"] <- 2
POS_check$Pos[POS_check$Pos == "PG"] <- 1

head(POS_check, n=2)
##         Player Pk Draft.Year POS Height Weight Age.x YOS       PreNBA
## 1   A.C. Green 23       1985   4    6-9    220    21  16 Oregon State
## 2 Aaron Brooks 26       2007   1    6-0    161    22   8       Oregon
##   Source Rk Pos Age.y  Tm  G   MP  PER  TS% 3PAr  FTr ORB% DRB% TRB% AST%
## 1      C 98   4    22 LAL 82 1542 11.8 .564 .015 .430 12.4 15.5 14.0  4.2
## 2      C 60   1    23 HOU 51  608 13.1 .535 .484 .218  2.5  7.9  5.2 23.5
##   STL% BLK% TOV% USG%   OWS DWS  WS WS/48  .1 OBPM DBPM  BPM VORP
## 1  1.5  1.7 17.7 14.7   1.4 2.0 3.3  .103     -0.5  0.8  0.3  0.9
## 2  1.1  0.6 15.1 21.8   0.6 0.8 1.4  .112      0.0 -1.8 -1.8  0.0
POS_Func <- function (x) {
  ifelse(POS_check$POS != 1|2|3|4|5, POS_check$Pos, POS_check$POS)
}
POS_check$POS <- POS_Func(POS_check$POS)

POS_check <- POS_check[c(1,4)]

POS_check$POS[POS_check$POS == 5] <- "C"
POS_check$POS[POS_check$POS == 4] <- "PF"
POS_check$POS[POS_check$POS == 3] <- "SF"
POS_check$POS[POS_check$POS == 2] <- "SG"
POS_check$POS[POS_check$POS == 1] <-"PG"

playerinfo <- merge(playerinfo, POS_check, by.x = "Player", by.y = "Player") 
playerinfo <- playerinfo[,c(1,2,3,11,5,6,7,8,9,10)]
names(playerinfo) <- c("Player", "Pk", "Draft.Year", "POS", "Height", "Weight", "Age", "YOS", "PreNBA", "Source")
head(playerinfo)
##         Player Pk Draft.Year POS Height Weight Age YOS       PreNBA Source
## 1   A.C. Green 23       1985  PF    6-9    220  21  16 Oregon State      C
## 2 Aaron Brooks 26       2007  PG    6-0    161  22   8       Oregon      C
## 3  Aaron McKie 17       1994  SG    6-5    209  21  14       Temple      C
## 4    Acie Earl 19       1993   C   6-10    240  23   4         Iowa      C
## 5     Acie Law 11       2007  PG    6-3    202  22   4    Texas A&M      C
## 6   Adam Keefe 10       1992  PF    6-9    230  22   9     Stanford      C

Step 3: Aggregating Stats

I can now begin the process of aggregating season stats to each player to find their stats with their drafted team. I begin by creating a new df careerstats and converting names and classes of the variables:

careerstats <- subset(allplayers, select = c(Player, Age, Tm, G, MP, WS, `WS/48`, DWS, OWS,  PER, BPM, OBPM, DBPM, VORP, `TS%`, `3PAr`, FTr, `TRB%`, `AST%`, `STL%`, `BLK%`, `TOV%`, `USG%`))
names(careerstats) <- c("Player", "Age", "Team", "Games", "Minutes", "WS", "WS.48", "DWS", "OWS", "PER", "BPM", "OBPM", "DBPM", "VORP", "TSp", "TPAr", "FTr", "TRBp", "ASTp", "STLp", "BLKp", "TOVp", "USGp")

careerstats[,4:23] <- lapply(careerstats[,4:23], as.numeric.factor)

head(careerstats)
##                Player Age Team Games Minutes   WS  WS.48 DWS  OWS  PER
## 1 Kareem Abdul-Jabbar  33  LAL    80    2976 14.3  0.230 4.6  9.6 25.5
## 3       Tom Abernethy  26  GSW    10      39  0.0 -0.004 0.0  0.0  3.2
## 4       Tom Abernethy  26  IND    29     259  0.4  0.072 0.2  0.2  8.7
## 5         Alvan Adams  26  PHO    75    2054  7.7  0.180 4.5  3.3 20.3
## 6      Darrell Allums  22  DAL    22     276 -0.3 -0.054 0.2 -0.5  5.3
## 7      Tiny Archibald  32  BOS    80    2820  6.9  0.118 2.3  4.6 14.3
##    BPM OBPM DBPM VORP   TSp  TPAr   FTr TRBp ASTp STLp BLKp TOVp USGp
## 1  5.3  3.9  1.4  5.4 0.616 0.001 0.379 15.0 13.6  0.9  4.0 12.8 26.3
## 3 -6.2 -6.0 -0.2  0.0 0.463 0.000 1.000 11.4  2.9  1.2  0.0 31.6  6.4
## 4 -2.8 -2.0 -0.8 -0.1 0.458 0.018 0.339  8.4  8.8  1.1  0.7  8.5 10.9
## 5  5.3  2.0  3.3  3.8 0.567 0.000 0.298 14.7 24.5  2.4  1.9 18.7 23.0
## 6 -5.6 -5.9  0.4 -0.2 0.385 0.015 0.328 13.4 12.5  0.9  1.7 23.1 15.1
## 7 -1.3  0.4 -1.6  0.5 0.582 0.012 0.547  3.6 28.5  1.3  0.4 21.8 17.5

Check for any NA’s. We run into one of the most challenging decisions of the project. After this check, I realize there are NA’s value for players who did not play a minute in the NBA. For some values like Seasons or Win Shares, I can easily assign a 0. The problem was that for a stat like PER and VORP, it is relevant to the other players in the league - and these both represent an impact you had(whether negative or positive). If a player never plays a game, is that as bad as playing the worst game possible? Or should they find the sweet spot where they had no impact on the game at all. I decided to assign players with NA’s the average of players who have only played one game:

head(careerstats[complete.cases(careerstats) == F,])
##             Player Age Team Games Minutes WS  WS.48 DWS OWS   PER   BPM
## 1360   Kevin Loder  24  LAC     1       4  0 -0.018   0   0  -3.9  -5.3
## 2088   Yvon Joseph  28  BRK     1       5  0  0.309   0   0  12.8  -0.9
## 2383   Darren Daye  26  CHI     1       7  0 -0.124   0   0  -5.2 -11.4
## 2904     Jeff Lamp  28  LAL     3       7  0  0.227   0   0   9.2  -2.0
## 3108    Tony White  22  CHI     2       2  0  0.012   0   0   0.0  -6.2
## 3509 John Stroeder  30  SAS     1       2  0 -0.652   0   0 -38.8 -25.6
##       OBPM DBPM VORP   TSp TPAr FTr TRBp ASTp STLp BLKp TOVp USGp
## 1360  -5.3 -0.1    0    NA   NA  NA  0.0  0.0    0    0   NA  0.0
## 2088   2.7 -3.6    0 1.136   NA  NA  0.0  0.0    0    0    0  7.0
## 2383 -11.9  0.5    0    NA   NA  NA  8.1 16.8    0    0  100  6.0
## 2904   1.2 -3.2    0 1.136   NA  NA  0.0  0.0    0    0    0  5.2
## 3108  -6.6  0.4    0    NA   NA  NA  0.0  0.0    0    0   NA  0.0
## 3509 -25.4 -0.2    0    NA   NA  NA  0.0  0.0    0    0  100 19.4
careerstats$WS[is.na(careerstats$WS) == T] <- 0
careerstats$WS.48[is.na(careerstats$WS.48) == T] <- 0
careerstats$DWS[is.na(careerstats$DWS) == T] <- 0
careerstats$OWS[is.na(careerstats$OWS) == T] <- 0
careerstats$PER[is.na(careerstats$PER) == T] <- mean(careerstats$PER[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$BPM[is.na(careerstats$BPM) == T] <- mean(careerstats$BPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$OBPM[is.na(careerstats$OBPM) == T] <- mean(careerstats$OBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$DBPM[is.na(careerstats$DBPM) == T] <- mean(careerstats$DBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$VORP[is.na(careerstats$VORP) == T] <- mean(careerstats$VORP[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
careerstats$TSp[is.na(careerstats$TSp) == T] <- 0
careerstats$FTr[is.na(careerstats$FTr) == T] <- 0
careerstats$TPAr[is.na(careerstats$TPAr) == T] <- 0
careerstats$TRBp[is.na(careerstats$TRBp) == T] <- 0
careerstats$ASTp[is.na(careerstats$ASTp) == T] <- 0
careerstats$STLp[is.na(careerstats$STLp) == T] <- 0
careerstats$BLKp[is.na(careerstats$BLKp) == T] <- 0
careerstats$USGp[is.na(careerstats$USGp) == T] <- 0
careerstats$TOVp[is.na(careerstats$TOVp) == T] <- 0

Calculate the number of seasons played by each player:

seasonsplayed <- count(careerstats, c('Player', 'Age', "Team"))
names(seasonsplayed) <-  c('Player', 'Age', 'Team', 'Seasons')
seasonsplayed <- aggregate(seasonsplayed$Seasons, list(seasonsplayed$Player, seasonsplayed$Team), sum)
names(seasonsplayed) <-  c('Player', 'Team', 'Seasons')
head(seasonsplayed)
##          Player Team Seasons
## 1      Acie Law  ATL       2
## 2    Adam Keefe  ATL       2
## 3 Adreian Payne  ATL       1
## 4 Al Harrington  ATL       2
## 5    Al Horford  ATL       9
## 6       Al Wood  ATL       1

Aggregating each player’s stats into one record (their total career stats):

stats_with_Teams <- merge(aggregate(careerstats[c(4,5,6,8,9)], list(careerstats$Player, careerstats$Team), sum), aggregate(careerstats[c(7, 10:23)], list(careerstats$Player, careerstats$Team), mean))

names(stats_with_Teams)[1] <- "Player"
names(stats_with_Teams)[2] <- "Tm"
head(stats_with_Teams, n=2)
##       Player  Tm Games Minutes   WS  DWS  OWS      WS.48      PER
## 1 A.C. Green DAL   188    5517 10.6  4.9  5.7 0.08933333 12.63333
## 2 A.C. Green LAL   735   21451 64.4 27.2 37.2 0.14188889 14.72222
##         BPM       OBPM      DBPM      VORP       TSp       TPAr       FTr
## 1 -1.333333 -1.6666667 0.3333333 0.4666667 0.4880000 0.02866667 0.3083333
## 2  1.044444  0.3333333 0.7000000 1.8888889 0.5647778 0.03911111 0.4818889
##       TRBp     ASTp     STLp      BLKp     TOVp     USGp
## 1 14.46667 5.366667 1.533333 0.6333333  9.30000 12.53333
## 2 14.74444 5.077778 1.511111 1.1111111 12.32222 15.17778

Now I append the career stats to the drafted players from 1980-2012, as well as their total seasons played with their initial team:

drafted_stats <- merge(Alldrafted, stats_with_Teams, by.x = c("Player", "Tm"), by.y = c("Player", "Tm"), all.x = T)
drafted_stats <- merge(drafted_stats,seasonsplayed,by.x = c("Player", "Tm"), by.y = c("Player", "Team"), all.x = T)

I again need to adjust for players who did not play a minute (NA across the board) with the team that drafted them:

drafted_stats$Seasons[is.na(drafted_stats$Seasons) == T] <- 0
drafted_stats$Games[is.na(drafted_stats$Games) == T] <- 0
drafted_stats$Minutes[is.na(drafted_stats$Minutes) == T] <- 0
drafted_stats$WS.y[is.na(drafted_stats$WS.y) == T] <- 0
drafted_stats$WS.48.y[is.na(drafted_stats$WS.48.y) == T] <- 0
drafted_stats$DWS[is.na(drafted_stats$DWS) == T] <- 0
drafted_stats$OWS[is.na(drafted_stats$OWS) == T] <- 0
drafted_stats$PER[is.na(drafted_stats$PER) == T] <- mean(careerstats$PER[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$BPM[is.na(drafted_stats$BPM) == T] <- mean(careerstats$BPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$OBPM[is.na(drafted_stats$OBPM) == T] <- mean(careerstats$OBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$DBPM[is.na(drafted_stats$DBPM) == T] <- mean(careerstats$DBPM[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$VORP[is.na(drafted_stats$VORP) == T] <- mean(careerstats$VORP[careerstats$Games == 1 & careerstats$Minutes < 5], na.rm = T)
drafted_stats$TSp[is.na(drafted_stats$TSp) == T] <- 0
drafted_stats$FTr[is.na(drafted_stats$FTr) == T] <- 0
drafted_stats$TPAr[is.na(drafted_stats$TPAr) == T] <- 0
drafted_stats$TRBp[is.na(drafted_stats$TRBp) == T] <- 0
drafted_stats$ASTp[is.na(drafted_stats$ASTp) == T] <- 0
drafted_stats$STLp[is.na(drafted_stats$STLp) == T] <- 0
drafted_stats$BLKp[is.na(drafted_stats$BLKp) == T] <- 0
drafted_stats$USGp[is.na(drafted_stats$USGp) == T] <- 0
drafted_stats$TOVp[is.na(drafted_stats$TOVp) == T] <- 0

drafted_stats <- merge(drafted_stats, playerinfo, by.x = c("Player", "Pk"), by.y = c("Player", "Pk"))
drafted_stats <- drafted_stats[c(1,2,3,29:33,35,36,28,8:27)]
names(drafted_stats)[14] <- "WS"
names(drafted_stats)[17] <- "WS.48"
head(drafted_stats, n=3)
##         Player Pk  Tm Draft.Year POS Height Weight Age       PreNBA Source
## 1   A.C. Green 23 LAL       1985  PF    6-9    220  21 Oregon State      C
## 2 Aaron Brooks 26 HOU       2007  PG    6-0    161  22       Oregon      C
## 3  Aaron McKie 17 POR       1994  SG    6-5    209  21       Temple      C
##   Seasons Games Minutes   WS  DWS  OWS      WS.48      PER           BPM
## 1       9   735   21451 64.4 27.2 37.2 0.14188889 14.72222  1.044444e+00
## 2       6   297    7090 12.0  5.4  6.5 0.05416667 11.86667 -2.500000e+00
## 3       3   167    3861  8.4  5.5  2.8 0.09800000 11.93333 -2.775558e-17
##         OBPM  DBPM      VORP       TSp       TPAr       FTr      TRBp
## 1  0.3333333  0.70 1.8888889 0.5647778 0.03911111 0.4818889 14.744444
## 2 -0.4500000 -2.05 0.2666667 0.4973333 0.45833333 0.1738333  4.233333
## 3 -1.5333333  1.50 0.7666667 0.5043333 0.20733333 0.2900000  7.866667
##        ASTp     STLp     BLKp     TOVp     USGp
## 1  5.077778 1.511111 1.111111 12.32222 15.17778
## 2 22.616667 1.300000 1.333333 15.30000 22.75000
## 3 15.433333 2.200000 1.200000 15.76667 16.13333

Step 4: Visualizing the Data

Plot 1:

The first plot I will be creating is the most “loyal” draft classes. Find the classes with the greatest average for seasons played with their original drafted team.

loyaltycheck <- aggregate(drafted_stats$Seasons, list(drafted_stats$Draft.Year), mean) 
names(loyaltycheck) <- c("Draft.Class", "Average.Seasons")

I also want to show the player who has played the most seasons in each draft class. I find this by selecting the player with the most seasons for each class(if there is a tie for seasons, I use games to determine who is the most loyal):

loyaltycheck_top <- aggregate(drafted_stats$Seasons, list(drafted_stats$Draft.Year), max, na.rm = TRUE)
names(loyaltycheck_top) <- c("Draft.Year", "Seasons")
loyaltycheck_top <- merge(loyaltycheck_top, drafted_stats, by = c("Draft.Year", "Seasons"), all.x = T)
loyaltycheck_top <- aggregate(loyaltycheck_top$Games, list(loyaltycheck_top$Draft.Year), max, na.rm = TRUE)
names(loyaltycheck_top) <- c("Draft.Year", "Games")
loyaltycheck_top <- merge(loyaltycheck_top, drafted_stats, by = c("Draft.Year", "Games"), all.x = T)
loyaltycheck_top <- subset(loyaltycheck_top, select = c(Player, Draft.Year, Seasons, Tm))

Using plotly to create the graph. I need to split the data records of top players into different sets to allow for my annotations to lay clean on the plot with no overlapping:

loyaltycheck_top$Draft.Year <- as.numeric(loyaltycheck_top$Draft.Year)
loyaltycheck_topH <- subset(loyaltycheck_top, Draft.Year %in% c(1981,1987,1988,1998))
loyaltycheck_topL <- subset(loyaltycheck_top, Draft.Year %in% c(1983,2002,2006))
loyaltycheck_topR <- subset(loyaltycheck_top, Draft.Year %in% c(1982,1994,2004,2008,2009,2010,2011,2012))
loyaltycheck_topM <- subset(loyaltycheck_top, Draft.Year %in% c(1980,1984,1985,1986,1989,1990,1991,1992,1993,1995,1996,1997,1999,2000,2001,2003,2005,2007))

Loyal.Class <- plot_ly(loyaltycheck, x = loyaltycheck$Draft.Class, y = loyaltycheck$Average.Seasons, 
        type = "scatter", mode = "lines", name = "Average Seasons") %>%
  add_trace(y = ~loyaltycheck_top$Seasons, name = 'Most Loyal', mode = 'markers') %>%
  add_annotations(x = loyaltycheck_topM$Draft.Year,
                  y = loyaltycheck_topM$Seasons,
                  text = paste(loyaltycheck_topM$Player, loyaltycheck_topM$Tm),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = 20,
                  ay = -40,
                  font = list(color = '#264E86',
                              family = 'sans serif',
                              size = 12))%>%
  add_annotations(x = loyaltycheck_topH$Draft.Year,
                  y = loyaltycheck_topH$Seasons,
                  text = paste(loyaltycheck_topH$Player, loyaltycheck_topH$Tm),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = 20,
                  ay = -60,
                  font = list(color = '#264E86',
                              family = 'sans serif',
                              size = 12))%>%
  add_annotations(x = loyaltycheck_topR$Draft.Year,
                  y = loyaltycheck_topR$Seasons,
                  text = paste(loyaltycheck_topR$Player, loyaltycheck_topR$Tm),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = 40,
                  ay = -40,
                  font = list(color = '#264E86',
                              family = 'sans serif',
                              size = 12))%>%
  add_annotations(x = loyaltycheck_topL$Draft.Year,
                  y = loyaltycheck_topL$Seasons,
                  text = paste(loyaltycheck_topL$Player, loyaltycheck_topL$Tm),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = 20,
                  ay = 40,
                  font = list(color = '#264E86',
                              family = 'sans serif',
                              size = 12))%>%
  layout(title = "Most Loyal Draft Classes",
         xaxis = list(title = "Draft Class"), 
         yaxis = list(title = "Total Seasons With Drafted Team"))

I will create the same plot but for teams instead of draft classes:

Tloyaltycheck <- aggregate(drafted_stats$Seasons, list(drafted_stats$Tm), mean) 
names(Tloyaltycheck) <- c("Tm", "Average.Seasons")

Tloyaltycheck_top <- aggregate(drafted_stats$Seasons, list(drafted_stats$Tm), max, na.rm = TRUE)
names(Tloyaltycheck_top) <- c("Tm", "Seasons")
Tloyaltycheck_top <- merge(Tloyaltycheck_top, drafted_stats, by = c("Tm", "Seasons"), all.x = T)
Tloyaltycheck_top <- aggregate(Tloyaltycheck_top$Games, list(Tloyaltycheck_top$Tm), max, na.rm = TRUE)
names(Tloyaltycheck_top) <- c("Tm", "Games")
Tloyaltycheck_top <- merge(Tloyaltycheck_top, drafted_stats, by = c("Tm", "Games"), all.x = T)
Tloyaltycheck_top <- subset(Tloyaltycheck_top, select = c(Player, Tm, Seasons, Draft.Year))

Tloyaltycheck_topLeft <- subset(Tloyaltycheck_top, Tm %in% c("HOU", "MIA", "LAC", "MIL", "MIN", "SAS", "ORL", "SAC"))
Tloyaltycheck_topLow <- subset(Tloyaltycheck_top, Tm %in% c("TOR"))
Tloyaltycheck_topHR <- subset(Tloyaltycheck_top, Tm %in% c("POR"))
Tloyaltycheck_topM <- subset(Tloyaltycheck_top, Tm %in% c("ATL", "BOS", "BRK", "CHI", "CHO", "CLE", "DAL", "DEN", "DET", "GSW", "IND", "LAL", "MEM", "NOP", "NYK", "OKC", "PHI", "PHO", "UTA", "WAS"))


Loyal.Team <- plot_ly(Tloyaltycheck, x = Tloyaltycheck$Tm, y = Tloyaltycheck$Average.Seasons, 
        type = "scatter", mode = "lines", name = "Average Seasons") %>%
  add_trace(y = ~Tloyaltycheck_top$Seasons, name = 'Most Loyal', mode = 'markers') %>%
  add_annotations(x = Tloyaltycheck_topM$Tm,
                  y = Tloyaltycheck_topM$Seasons,
                  text = paste(Tloyaltycheck_topM$Player, Tloyaltycheck_topM$Draft.Year),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = 20,
                  ay = -40, font = list(color = '#264E86',
                                         family = 'sans serif',
                                         size = 12)) %>%
  add_annotations(x = Tloyaltycheck_topLeft$Tm,
                  y = Tloyaltycheck_topLeft$Seasons,
                  text = paste(Tloyaltycheck_topLeft$Player, Tloyaltycheck_topLeft$Draft.Year),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = -20,
                  ay = -40, font = list(color = '#264E86',
                                         family = 'sans serif',
                                         size = 12)) %>%
  add_annotations(x = Tloyaltycheck_topLow$Tm,
                  y = Tloyaltycheck_topLow$Seasons,
                  text = paste(Tloyaltycheck_topLow$Player, Tloyaltycheck_topLow$Draft.Year),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = 20,
                  ay = 40, font = list(color = '#264E86',
                                         family = 'sans serif',
                                         size = 12)) %>%
  add_annotations(x = Tloyaltycheck_topHR$Tm,
                  y = Tloyaltycheck_topHR$Seasons,
                  text = paste(Tloyaltycheck_topHR$Player, Tloyaltycheck_topHR$Draft.Year),
                  xref = "x",
                  yref = "y",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  ax = 40,
                  ay = -60,font = list(color = '#264E86',
                                        family = 'sans serif',
                                        size = 12)) %>%
  layout(title = "Average Seasons Played with Drafted Team",
         xaxis = list(title = "Team"), 
         yaxis = list(title = "Total Seasons With Drafted Team"))
Loyal.Team