The NBA (National Basketball Association) has gained a lot of popularity amid the heated atmosphere of the 2019 NBA Finals between the Toronto Raptors and the Golden State Warriors. nba.com provides a lot of detailed data about teams and players, from basic stats to unconventional hustle stats. The data comes from https://stats.nba.com/teams/scoring/?sort=W&dir=-1. This is a practice session with the dplyr package, in which I take a quick look at this dataset.
I scraped the essential information from the website and saved it in a CSV file named “nbaplayoff.csv”.
It’s a data frame with 16 rows (observations) and 21 columns (variables).
Team name / Conference (East/West) / games played / games won / games lost / minutes / 2-point attempts % / 3-point attempts % / points from 2-point shots out of total points / points from mid-range 2-pointers out of total points / points from 3-point shots out of total points / percentage of fastbreak points / percentage of free-throw points / percentage of points off turnovers / percentage of points in the paint / assisted 2pt percentage / unassisted 2pt percentage / assisted 3pt percentage / unassisted 3pt percentage / total assisted point percentage / total unassisted point percentage
If you need more explanation of these stats, you can go to https://stats.nba.com/teams/scoring/?sort=W&dir=-1 and search for “glossary”.
I am using data updated right after Game 1 of the NBA Finals. Anyone who looks at the site after Game 2 will see different data from mine.
Playoff data has more variance because the sample is small. I’m aware of this, and an analysis of the regular season may give different results from mine.
# Load the scraped playoff team stats. The "UTF-8-BOM" encoding makes R strip a
# leading byte-order mark, if present, so it does not end up in the first
# column name.
nbaplayoff2019 <- read.csv(file = "nbaplayoff.csv", fileEncoding = "UTF-8-BOM")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Convert to a tibble so that printing shows the type of every column.
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
nbaplayoff2019 <- as_tibble(nbaplayoff2019)
# glimpse() lists every column with its type and first values; it is better
# structured than head() for a wide table.
nbaplayoff2019 %>%
  glimpse()
## Observations: 16
## Variables: 21
## $ Team <fct> Toronto Raptors, Golden State Warriors, Milwaukee B...
## $ conference <fct> East, West, East, West, West, East, West, East, Wes...
## $ game <int> 19, 17, 15, 16, 14, 12, 11, 9, 7, 6, 5, 5, 5, 5, 4, 4
## $ win <int> 13, 12, 10, 8, 7, 7, 6, 5, 3, 2, 1, 1, 1, 1, 0, 0
## $ lose <int> 6, 5, 5, 8, 7, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4
## $ min <dbl> 48.5, 48.6, 48.7, 49.6, 49.4, 48.0, 48.5, 48.0, 48....
## $ att2 <dbl> 59.3, 62.9, 56.9, 63.7, 69.2, 66.9, 49.0, 60.8, 76....
## $ att3 <dbl> 40.7, 37.1, 43.1, 36.3, 30.8, 33.1, 51.0, 39.2, 23....
## $ pt2 <dbl> 48.4, 51.8, 47.9, 50.4, 55.6, 55.2, 39.7, 47.7, 63....
## $ midrange2 <dbl> 10.6, 14.0, 6.9, 10.7, 9.3, 11.5, 2.7, 14.4, 18.8, ...
## $ pt3 <dbl> 33.9, 30.8, 34.0, 32.3, 26.3, 26.0, 43.2, 33.3, 19....
## $ fastbreak <dbl> 15.4, 14.1, 18.5, 9.4, 10.1, 10.5, 10.5, 15.3, 9.3,...
## $ freethrow <dbl> 17.6, 17.4, 18.1, 17.3, 18.1, 18.8, 17.1, 19.0, 17....
## $ offto <dbl> 16.9, 16.5, 12.3, 12.4, 10.5, 14.0, 15.9, 12.7, 9.1...
## $ pitp <dbl> 37.9, 37.8, 41.0, 39.7, 46.3, 43.7, 37.0, 33.3, 44....
## $ assist2 <dbl> 50.2, 64.6, 51.9, 43.4, 53.2, 52.5, 41.4, 46.6, 46....
## $ unassist2 <dbl> 49.8, 35.4, 48.1, 56.6, 46.8, 47.5, 58.6, 53.4, 53....
## $ assist3 <dbl> 81.1, 75.0, 85.9, 65.6, 82.1, 92.0, 59.9, 89.2, 89....
## $ unassist3 <dbl> 18.9, 25.0, 14.1, 34.4, 17.9, 8.0, 40.1, 10.8, 10.6...
## $ assist <dbl> 60.0, 67.5, 62.8, 50.1, 60.1, 61.9, 49.1, 60.1, 53....
## $ unassist <dbl> 40.0, 32.5, 37.2, 49.9, 39.9, 38.1, 50.9, 39.9, 46....
# Minutes per game are not needed: the value is 48 for every team that did not
# go to overtime, so the column is dropped.
nbaplayoff2019[["min"]] <- NULL
mutate() is good for adding new columns to a data frame:
# Add the win ratio (wins per game played, in percent) as a new column.
nbaplayoff2019 <- mutate(nbaplayoff2019, ratio = win / game * 100)
# For adding rows, however, the base function rbind() is the better tool.
# Show the structure (column names, types, first values) of the built-in
# mtcars data set used for the rbind() example.
str(object = mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Print the first six rows of mtcars.
head(mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Append a row named "mean" that holds the average of every column, then print
# the last six rows so the new row is visible.
tail(rbind(mtcars, mean = colMeans(mtcars)))
## mpg cyl disp hp drat wt qsec
## Lotus Europa 30.40000 4.0000 95.1000 113.0000 3.770000 1.51300 16.90000
## Ford Pantera L 15.80000 8.0000 351.0000 264.0000 4.220000 3.17000 14.50000
## Ferrari Dino 19.70000 6.0000 145.0000 175.0000 3.620000 2.77000 15.50000
## Maserati Bora 15.00000 8.0000 301.0000 335.0000 3.540000 3.57000 14.60000
## Volvo 142E 21.40000 4.0000 121.0000 109.0000 4.110000 2.78000 18.60000
## mean 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875
## vs am gear carb
## Lotus Europa 1.0000 1.00000 5.0000 2.0000
## Ford Pantera L 0.0000 1.00000 5.0000 4.0000
## Ferrari Dino 0.0000 1.00000 5.0000 6.0000
## Maserati Bora 0.0000 1.00000 5.0000 8.0000
## Volvo 142E 1.0000 1.00000 4.0000 2.0000
## mean 0.4375 0.40625 3.6875 2.8125
Picking out specific rows or columns that meet certain conditions
# Keep only the teams that advanced to the 2nd round (more than 7 games played).
filter(nbaplayoff2019, game > 7)
## # A tibble: 8 x 21
## Team conference game win lose att2 att3 pt2 midrange2 pt3
## <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Toro~ East 19 13 6 59.3 40.7 48.4 10.6 33.9
## 2 Gold~ West 17 12 5 62.9 37.1 51.8 14 30.8
## 3 Milw~ East 15 10 5 56.9 43.1 47.9 6.9 34
## 4 Port~ West 16 8 8 63.7 36.3 50.4 10.7 32.3
## 5 Denv~ West 14 7 7 69.2 30.8 55.6 9.3 26.3
## 6 Phil~ East 12 7 5 66.9 33.1 55.2 11.5 26
## 7 Hous~ West 11 6 5 49 51 39.7 2.7 43.2
## 8 Bost~ East 9 5 4 60.8 39.2 47.7 14.4 33.3
## # ... with 11 more variables: fastbreak <dbl>, freethrow <dbl>,
## # offto <dbl>, pitp <dbl>, assist2 <dbl>, unassist2 <dbl>,
## # assist3 <dbl>, unassist3 <dbl>, assist <dbl>, unassist <dbl>,
## # ratio <dbl>
# Keep the teams for which a large share of the points scored is unassisted
# (assisted points below 50%).
filter(nbaplayoff2019, assist < 50)
## # A tibble: 4 x 21
## Team conference game win lose att2 att3 pt2 midrange2 pt3
## <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Hous~ West 11 6 5 49 51 39.7 2.7 43.2
## 2 Utah~ West 5 1 4 58.6 41.4 51.5 3.3 28.8
## 3 Detr~ East 4 0 4 65.6 34.4 54.1 13.3 33.7
## 4 Indi~ East 4 0 4 64.4 35.6 51.2 10.9 32.7
## # ... with 11 more variables: fastbreak <dbl>, freethrow <dbl>,
## # offto <dbl>, pitp <dbl>, assist2 <dbl>, unassist2 <dbl>,
## # assist3 <dbl>, unassist3 <dbl>, assist <dbl>, unassist <dbl>,
## # ratio <dbl>
# Unassisted and successful: assisted points below 50% AND a win ratio above
# 50%. Comma-separated conditions in filter() are combined with a logical AND.
filter(nbaplayoff2019, assist < 50, ratio > 50)
## # A tibble: 1 x 21
## Team conference game win lose att2 att3 pt2 midrange2 pt3
## <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Hous~ West 11 6 5 49 51 39.7 2.7 43.2
## # ... with 11 more variables: fastbreak <dbl>, freethrow <dbl>,
## # offto <dbl>, pitp <dbl>, assist2 <dbl>, unassist2 <dbl>,
## # assist3 <dbl>, unassist3 <dbl>, assist <dbl>, unassist <dbl>,
## # ratio <dbl>
# Keep teams with more than 60% 2-point attempts OR more than 20% of their
# points from free throws.
filter(nbaplayoff2019, att2 > 60 | freethrow > 20)
## # A tibble: 11 x 21
## Team conference game win lose att2 att3 pt2 midrange2 pt3
## <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Gold~ West 17 12 5 62.9 37.1 51.8 14 30.8
## 2 Port~ West 16 8 8 63.7 36.3 50.4 10.7 32.3
## 3 Denv~ West 14 7 7 69.2 30.8 55.6 9.3 26.3
## 4 Phil~ East 12 7 5 66.9 33.1 55.2 11.5 26
## 5 Bost~ East 9 5 4 60.8 39.2 47.7 14.4 33.3
## 6 San ~ West 7 3 4 76.7 23.3 63.3 18.8 19.5
## 7 LA C~ West 6 2 4 64.5 35.5 49.4 9.6 30.1
## 8 Broo~ East 5 1 4 62.8 37.2 49.2 6.1 29.6
## 9 Okla~ West 5 1 4 63.5 36.5 52.1 11.8 29.7
## 10 Detr~ East 4 0 4 65.6 34.4 54.1 13.3 33.7
## 11 Indi~ East 4 0 4 64.4 35.6 51.2 10.9 32.7
## # ... with 11 more variables: fastbreak <dbl>, freethrow <dbl>,
## # offto <dbl>, pitp <dbl>, assist2 <dbl>, unassist2 <dbl>,
## # assist3 <dbl>, unassist3 <dbl>, assist <dbl>, unassist <dbl>,
## # ratio <dbl>
# Keep teams that are in the top 6 for both 3-point attempt share and win ratio.
# rank() gives rank 1 to the smallest value; by convention rank 1 should be the
# best/largest, so the values are negated before ranking.
filter(nbaplayoff2019, rank(-att3) <= 6 & rank(-ratio) <= 6)
## # A tibble: 4 x 21
## Team conference game win lose att2 att3 pt2 midrange2 pt3
## <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Toro~ East 19 13 6 59.3 40.7 48.4 10.6 33.9
## 2 Milw~ East 15 10 5 56.9 43.1 47.9 6.9 34
## 3 Hous~ West 11 6 5 49 51 39.7 2.7 43.2
## 4 Bost~ East 9 5 4 60.8 39.2 47.7 14.4 33.3
## # ... with 11 more variables: fastbreak <dbl>, freethrow <dbl>,
## # offto <dbl>, pitp <dbl>, assist2 <dbl>, unassist2 <dbl>,
## # assist3 <dbl>, unassist3 <dbl>, assist <dbl>, unassist <dbl>,
## # ratio <dbl>
# Filter teams that shoot a lot of 3-pointers and win
Selecting columns by name
# Select all columns whose name contains "2" or "3"; the "2" columns come first.
nbaplayoff2019 %>%
  select(contains("2"), contains("3"))
## # A tibble: 16 x 9
## att2 pt2 midrange2 assist2 unassist2 att3 pt3 assist3 unassist3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 59.3 48.4 10.6 50.2 49.8 40.7 33.9 81.1 18.9
## 2 62.9 51.8 14 64.6 35.4 37.1 30.8 75 25
## 3 56.9 47.9 6.9 51.9 48.1 43.1 34 85.9 14.1
## 4 63.7 50.4 10.7 43.4 56.6 36.3 32.3 65.6 34.4
## 5 69.2 55.6 9.3 53.2 46.8 30.8 26.3 82.1 17.9
## 6 66.9 55.2 11.5 52.5 47.5 33.1 26 92 8
## 7 49 39.7 2.7 41.4 58.6 51 43.2 59.9 40.1
## 8 60.8 47.7 14.4 46.6 53.4 39.2 33.3 89.2 10.8
## 9 76.7 63.3 18.8 46.3 53.7 23.3 19.5 89.4 10.6
## 10 64.5 49.4 9.6 53.5 46.5 35.5 30.1 88.4 11.6
## 11 62.8 49.2 6.1 50.4 49.6 37.2 29.6 49.1 50.9
## 12 63.5 52.1 11.8 47.4 52.6 36.5 29.7 92.3 7.7
## 13 58.1 47.4 10.4 49.5 50.5 41.9 33.9 78.8 21.2
## 14 58.6 51.5 3.3 57.1 10.1 41.4 28.8 21.2 15.6
## 15 65.6 54.1 13.3 50 14.3 34.4 33.7 18.3 13.9
## 16 64.4 51.2 10.9 57.4 13 35.6 32.7 18.7 16.7
# Select all columns whose name starts with "att".
select(nbaplayoff2019, starts_with("att"))
## # A tibble: 16 x 2
## att2 att3
## <dbl> <dbl>
## 1 59.3 40.7
## 2 62.9 37.1
## 3 56.9 43.1
## 4 63.7 36.3
## 5 69.2 30.8
## 6 66.9 33.1
## 7 49 51
## 8 60.8 39.2
## 9 76.7 23.3
## 10 64.5 35.5
## 11 62.8 37.2
## 12 63.5 36.5
## 13 58.1 41.9
## 14 58.6 41.4
## 15 65.6 34.4
## 16 64.4 35.6
# Win ratio against the share of assisted points, one labelled point per team.
# Breaks and limits of the x axis are set in a single scale_x_continuous():
# adding xlim() afterwards would replace the whole x scale and silently drop
# the custom breaks (ggplot2 reports "Scale for 'x' is already present").
# Quite strong relationship between assisted points percentage and win ratio.
# Correlation Coefficient of 0.71
nbaplayoff2019 %>%
  ggplot(aes(x = assist, y = ratio, col = conference, size = ratio)) +
  geom_point() +
  geom_text(
    aes(label = Team),
    cex = 3, hjust = -0.1, vjust = 0, col = "gray47"
  ) +
  scale_x_continuous(breaks = seq(35, 75, 5), limits = c(35, 75)) +
  scale_y_continuous(breaks = seq(0, 70, 10))
# Win ratio against the share of points from free throws.
# Free throws are one of the 4 factors which are crucial for a team's win.
# Limits and breaks of the x axis are set in a single scale_x_continuous():
# combining xlim() with a later scale_x_continuous() replaces the first scale,
# so the requested limits would be lost.
nbaplayoff2019 %>%
  ggplot(aes(x = freethrow, y = ratio, col = conference, size = ratio)) +
  geom_point() +
  geom_text(
    aes(label = Team),
    cex = 3, hjust = 0.35, vjust = -0.65, col = "gray47"
  ) +
  scale_x_continuous(breaks = seq(12, 22, by = 2), limits = c(11, 23))
# Correlation very low (0.22)
# Win ratio against the share of points off turnovers, one labelled point per
# team. No apparent correlation between points off turnovers and win ratio.
ggplot(
  nbaplayoff2019,
  aes(x = offto, y = ratio, col = conference, size = ratio)
) +
  geom_point() +
  geom_text(
    aes(label = Team),
    cex = 3, hjust = 0.35, vjust = -0.65, col = "gray47"
  )