I manually copied the NBA Players stats page for the 492 players in the league as of 03/09/2021 and saved it as a text file. When saved, the file contents were tab delimited and included some extraneous fields and data. Below you will find a sample.
# Location of the raw, tab-delimited copy of the NBA player stats page.
filename <- "https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA607%20Spring%202021/Week6/Project2/Data/NBA_Player_Stats.txt"

# Read the file as a character vector with one element per line.
nba_player_stats <- readLines(con = filename)

# Preview the first 19 lines: page header text followed by the first players.
nba_player_stats[seq_len(19)]
## [1] "Season Type"
## [2] "Per Mode"
## [3] "Stat Category"
## [4] "Advanced Filters"
## [5] "Totals"
## [6] "Recent Filters Glossary Share"
## [7] "492 Rows | Page"
## [8] "of 10"
## [9] "# \t"
## [10] "Player \tGP \tMIN \tPTS \tFGM \tFGA \tFG% \t3PM \t3PA \t3P% \tFTM \tFTA \tFT% \tOREB \tDREB \tREB \tAST \tSTL \tBLK \tTOV \tPF \tEFF \tAST/TOV \tSTL/TOV"
## [11] "1 \t"
## [12] "Bradley Beal"
## [13] "\t33 \t1178 \t1074 \t374 \t786 \t47.6 \t75 \t229 \t32.8 \t251 \t277 \t90.6 \t43 \t134 \t177 \t156 \t47 \t15 \t105 \t83 \t926 \t1.49 \t0.45"
## [14] "2 \t"
## [15] "Stephen Curry"
## [16] "\t35 \t1194 \t1039 \t343 \t718 \t47.8 \t169 \t411 \t41.1 \t184 \t197 \t93.4 \t17 \t174 \t191 \t221 \t46 \t2 \t111 \t64 \t1000 \t1.99 \t0.41"
## [17] "3 \t"
## [18] "Giannis Antetokounmpo"
## [19] "\t35 \t1189 \t1015 \t368 \t661 \t55.7 \t39 \t137 \t28.5 \t240 \t363 \t66.1 \t64 \t344 \t408 \t205 \t44 \t49 \t128 \t106 \t1177 \t1.60 \t0.34"
Initialize nba_player_rows vector to hold the rows of player data.
# Start with an empty (NULL) accumulator; parsed player rows are bound onto it.
nba_player_rows <- NULL
The first 11 rows in the dataset are extraneous and will not be used for our purposes here, so I will start at row 12 (the first player's name) and process through row 1486, advancing each read by 3 rows.
I use the following functions to process and tidy up the nba_player_stats:
1. unlist() produces a vector which contains all the atomic components which occur in list (row).
2. strsplit() will split the elements of a character vector into substrings according to the matching substring.
In this case the tab delimiter.
3. gsub() will search for white space and replace it with nothing (i.e. remove it).
4. str_to_title() will convert every first letter in each word to a capital letter.
5. rbind() will combine the rows created into the vector initialized above.
6. seq() is used to cycle through all rows in the dataset starting at row 12, ending with the last row at 1486, in steps of
3 rows.
# Parse the raw lines into one character row per player.
#
# Each player occupies three consecutive lines: a rank line, a name line and a
# tab-delimited stats line. Name lines start at line 12 and repeat every 3
# lines; 1486 is the end bound used for this snapshot of the data.
name_lines <- seq(12, 1486, 3)

# Build every row first and bind once at the end; calling rbind() inside the
# loop re-copies the whole matrix on each iteration.
player_rows <- lapply(name_lines, function(i) {
  # The split pattern is a literal tab character.
  name_fields <- unlist(strsplit(nba_player_stats[i], "\t", fixed = TRUE))
  stat_fields <- unlist(strsplit(nba_player_stats[i + 1], "\t", fixed = TRUE))

  c(
    # str_trim()/str_to_title() are presumably stringr functions loaded
    # outside this excerpt.
    str_to_title(str_trim(name_fields[1])),
    # The stats line begins with a tab, so field 1 is empty. Fields 2-13 are
    # GP, MIN, PTS, FGM, FGA, FG%, 3PM, 3PA, 3P%, FTM, FTA and FT% (see the
    # header line in the sample above). Strip the padding spaces from each.
    gsub(" ", "", stat_fields[2:13])
  )
})

# Append the parsed rows to the accumulator initialized earlier; the result is
# a character matrix with one row per player and 13 columns.
nba_player_rows <- rbind(nba_player_rows, do.call(rbind, player_rows))
Next I created a data.frame from the combined rows of NBA player data, keeping only the fields that I believe are the most important statistics in basketball, including 3 Point Percentage, Field Goal Percentage, and Free Throw Percentage.
# Column labels, in the same order the fields were bound into each player row.
stat_columns <- c(
  "Player", "games_played", "min_played", "pts_scored",
  "fg_made", "fg_attemps", "fg_pct",
  "three_pts_made", "three_pts_attempts", "three_pts_pct",
  "ft_made", "ft_attemps", "ft_pct"
)

# Convert the character matrix of player rows into a labelled data frame.
nba_player_stats_df <- setNames(data.frame(nba_player_rows), stat_columns)

# Show the first 50 players.
head(nba_player_stats_df, n = 50)
I am shortening the name of Giannis Antetokounmpo to G. Antetokounmpo for display purposes, as I noticed that his name would overrun into the next player's label in any graph where his name was present.
# Flag the row(s) holding the full name, then overwrite them with the short form.
is_giannis <- nba_player_stats_df$Player == "Giannis Antetokounmpo"
nba_player_stats_df$Player[is_giannis] <- "G. Antetokounmpo"
Transformation of character fields to integers and doubles.
# Every stat was parsed as text; group the columns by their target type.
integer_columns <- c(
  "games_played", "min_played", "pts_scored",
  "fg_made", "fg_attemps",
  "three_pts_made", "three_pts_attempts",
  "ft_made", "ft_attemps"
)
double_columns <- c("fg_pct", "three_pts_pct", "ft_pct")

# Convert each group in place; column order in the data frame is unchanged.
nba_player_stats_df[integer_columns] <- lapply(
  nba_player_stats_df[integer_columns],
  as.integer
)
nba_player_stats_df[double_columns] <- lapply(
  nba_player_stats_df[double_columns],
  as.double
)

# Inspect the converted data.
head(nba_player_stats_df, n = 10)
summary(nba_player_stats_df)
## Player games_played min_played pts_scored
## Length:492 Min. : 1.00 Min. : 4.0 Min. : 0.0
## Class :character 1st Qu.:15.00 1st Qu.: 153.2 1st Qu.: 51.0
## Mode :character Median :25.00 Median : 524.5 Median : 186.5
## Mean :23.07 Mean : 525.3 Mean : 243.8
## 3rd Qu.:33.00 3rd Qu.: 836.2 3rd Qu.: 364.5
## Max. :38.00 Max. :1358.0 Max. :1074.0
## fg_made fg_attemps fg_pct three_pts_made
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 19.00 1st Qu.: 44.75 1st Qu.: 39.10 1st Qu.: 3.00
## Median : 69.00 Median :148.00 Median : 44.20 Median : 17.00
## Mean : 89.35 Mean :192.17 Mean : 43.68 Mean : 27.91
## 3rd Qu.:135.25 3rd Qu.:283.75 3rd Qu.: 50.00 3rd Qu.: 46.00
## Max. :379.00 Max. :786.00 Max. :100.00 Max. :169.00
## three_pts_attempts three_pts_pct ft_made ft_attemps
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 10.00 1st Qu.: 25.00 1st Qu.: 6.00 1st Qu.: 8.0
## Median : 53.00 Median : 34.35 Median : 22.00 Median : 29.5
## Mean : 75.78 Mean : 29.85 Mean : 37.22 Mean : 47.8
## 3rd Qu.:123.00 3rd Qu.: 39.02 3rd Qu.: 49.00 3rd Qu.: 64.5
## Max. :411.00 Max. :100.00 Max. :298.00 Max. :363.0
## ft_pct
## Min. : 0.00
## 1st Qu.: 63.20
## Median : 76.05
## Mean : 69.76
## 3rd Qu.: 84.62
## Max. :100.00
I have selected the 3 point shooting leaders based on percentage among those players who have taken at least 100 3 point attempts, so that players who have played only a game or two and have made 8 out of their 10 3 point attempts don’t show up as the leaders. You will notice that I also do this for field goal and free throw attempts.
# Keep players with at least 100 three-point attempts and only the
# three-point related columns.
nba_3p_stats <- select(
  filter(nba_player_stats_df, three_pts_attempts >= 100),
  "Player", "games_played", "three_pts_made", "three_pts_attempts", "three_pts_pct"
)

# Top 5 by three-point percentage, highest first; display the table.
nba_3p_leaders <- head(nba_3p_stats[order(-nba_3p_stats$three_pts_pct), ], 5)
nba_3p_leaders

# Bar height is the number of attempts, bars are ordered by attempts, and each
# bar is labelled with the player's three-point percentage.
ggplot(
  nba_3p_leaders,
  aes(x = reorder(Player, three_pts_attempts), y = three_pts_attempts, fill = Player)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = three_pts_pct),
    vjust = 1.6, color = "black", position = position_dodge(0.9), size = 3.5
  ) +
  labs(
    y = "3 Point Attempts",
    x = "Player",
    title = "Percentage of 3 Pointers Made by 3 Point Attempts Taken by Player"
  ) +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal()
Next are the field goal shooting leaders based on percentage among those players who have taken at least 175 field goal attempts.
# Keep players with at least 175 field goal attempts and only the
# field goal related columns.
nba_fg_stats <- select(
  filter(nba_player_stats_df, fg_attemps >= 175),
  "Player", "games_played", "fg_made", "fg_attemps", "fg_pct"
)

# Top 5 by field goal percentage, highest first; display the table.
nba_fg_leaders <- head(nba_fg_stats[order(-nba_fg_stats$fg_pct), ], 5)
nba_fg_leaders

# Bar height is the number of attempts, bars are ordered by attempts, and each
# bar is labelled with the player's field goal percentage.
ggplot(
  nba_fg_leaders,
  aes(x = reorder(Player, fg_attemps), y = fg_attemps, fill = Player)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = fg_pct),
    vjust = 1.6, color = "black", position = position_dodge(0.9), size = 3.5
  ) +
  labs(
    y = "Field Goal Attempts",
    x = "Player",
    title = "Percentage of Field Goals Made by Field Goal Attempts Taken by Player"
  ) +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal()
Lastly, the free throw leaders are based on percentage among those players who have taken at least 200 free throw attempts.
# Keep players with at least 200 free throw attempts and only the
# free throw related columns.
nba_ft_stats <- select(
  filter(nba_player_stats_df, ft_attemps >= 200),
  "Player", "games_played", "ft_made", "ft_attemps", "ft_pct"
)

# Top 5 by free throw percentage, highest first; display the table.
nba_ft_leaders <- head(nba_ft_stats[order(-nba_ft_stats$ft_pct), ], 5)
nba_ft_leaders

# Bar height is the number of attempts, bars are ordered by attempts, and each
# bar is labelled with the player's free throw percentage.
ggplot(
  nba_ft_leaders,
  aes(x = reorder(Player, ft_attemps), y = ft_attemps, fill = Player)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = ft_pct),
    vjust = 1.6, color = "black", position = position_dodge(0.9), size = 3.5
  ) +
  labs(
    y = "Free Throw Attempts",
    x = "Player Name",
    title = "Percentage of Free Throws Made by Free Throw Attempts Taken by Player"
  ) +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal()