nba <- nba %>%
distinct(Year, Player, Tm, .keep_all = T)
Create new points per game column (PPG)
nba <- nba %>%
mutate(PPG = PTS/G) %>%
relocate(PPG, .after = PTS)
df_1 <- sample_n(nba,size = 10000, replace = TRUE)
df_2 <- sample_n(nba,size = 10000, replace = TRUE)
df_3 <- sample_n(nba,size = 10000, replace = TRUE)
df_4 <- sample_n(nba,size = 10000, replace = TRUE)
df_5 <- sample_n(nba,size = 10000, replace = TRUE)
df_6 <- sample_n(nba,size = 10000, replace = TRUE)
df_7 <- sample_n(nba,size = 10000, replace = TRUE)
df_PPG <- data.frame(df_1$PPG, df_2$PPG, df_3$PPG, df_4$PPG, df_5$PPG, df_6$PPG, df_7$PPG)
colnames(df_PPG) <- c("samp_1", "samp_2", "samp_3", "samp_4", "samp_5", "samp_6", "samp_7")
summary(df_PPG)
## samp_1 samp_2 samp_3 samp_4
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.478 1st Qu.: 3.372 1st Qu.: 3.421 1st Qu.: 3.462
## Median : 6.634 Median : 6.500 Median : 6.568 Median : 6.692
## Mean : 8.172 Mean : 8.057 Mean : 8.109 Mean : 8.141
## 3rd Qu.:11.608 3rd Qu.:11.418 3rd Qu.:11.425 3rd Qu.:11.624
## Max. :37.085 Max. :37.085 Max. :37.085 Max. :35.400
## samp_5 samp_6 samp_7
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.500 1st Qu.: 3.400 1st Qu.: 3.400
## Median : 6.576 Median : 6.509 Median : 6.631
## Mean : 8.056 Mean : 8.160 Mean : 8.093
## 3rd Qu.:11.354 3rd Qu.:11.627 3rd Qu.:11.500
## Max. :36.128 Max. :34.976 Max. :37.085
ggplot(gather(df_PPG), aes(x = value)) +
geom_boxplot(mapping = aes(color = key)) +
xlab("PPG")
ggplot(gather(df_PPG), aes(x = value)) +
geom_density(mapping = aes(fill = key)) +
xlab("PPG")
After taking 7 random samples of size 10,000, ~46.5% of full data set, for the PPG column we can see by looking as the summary stats for each random sample that they are all really similar. All samples have very similar means and medians. This is reflected in the box plot where the deviation in the medians is almost indistinguishable. Because of this the 7 samples have a lot of outliers in common. The only real difference that can be seen in the box plots is the start of the outliers. For example, sample 7 considers anything larger than 23.224 PPG and sample 4 considers anything larger than 23.962 PPG to be an outlier. All in all these samples are really similar as you can see when you plot the density plots on top of each other.
df_Tm <- data.frame(df_1$Tm, df_2$Tm, df_3$Tm, df_4$Tm, df_5$Tm, df_6$Tm, df_7$Tm)
colnames(df_Tm) <- c("samp_1", "samp_2", "samp_3", "samp_4", "samp_5", "samp_6", "samp_7")
df_Tm %>%
gather() %>%
group_by(key) %>%
ggplot(mapping = aes(x = value, fill = factor(value))) +
geom_bar(position = 'dodge') +
facet_wrap(~key)
df_Tm %>%
gather() %>%
filter(value == 'TOT') %>%
group_by(key) %>%
summarise(count_TOT = n(),
percent_TOT = n()/100)
## # A tibble: 7 × 3
## key count_TOT percent_TOT
## <chr> <int> <dbl>
## 1 samp_1 883 8.83
## 2 samp_2 853 8.53
## 3 samp_3 835 8.35
## 4 samp_4 960 9.6
## 5 samp_5 909 9.09
## 6 samp_6 892 8.92
## 7 samp_7 875 8.75
nba %>%
filter(Tm == "TOT") %>%
summarise(count_TOT = n(),
percent_TOT = (n()/length(nba$Tm))*100)
## # A tibble: 1 × 2
## count_TOT percent_TOT
## <int> <dbl>
## 1 1932 8.98
Here we are looking at the 7 random samples of the categorical team column. In all of the bar charts we can see that one “team” is accounting for a larger proportion than all the other teams. This is the team “TOT”. This is the record in the data that holds a players stats for the full season if they played for multiple teams within the same season. When compared across all samples the percentage of TOT in each is pretty uniform. Furthermore, comparing these to the percentage of this category in the full data set we similar percentages, all within 0.6%. You can also see dips for the same teams across all bar charts. These correspond with teams that are either new, and haven’t been around as long as teams like Boston (BOS), or old teams that have relocated, such as Seattle (SEA).
df_3PA <- data.frame(df_1$`3PA`, df_2$`3PA`, df_3$`3PA`, df_4$`3PA`, df_5$`3PA`, df_6$`3PA`, df_7$`3PA`)
colnames(df_3PA) <- c("samp_1", "samp_2", "samp_3", "samp_4", "samp_5", "samp_6", "samp_7")
summary(df_3PA)
## samp_1 samp_2 samp_3 samp_4
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.00 1st Qu.: 1.00 1st Qu.: 2.00 1st Qu.: 2.00
## Median : 15.00 Median : 16.00 Median : 17.00 Median : 16.00
## Mean : 71.93 Mean : 72.73 Mean : 74.18 Mean : 71.26
## 3rd Qu.: 102.00 3rd Qu.: 100.00 3rd Qu.: 106.00 3rd Qu.:103.00
## Max. :1028.00 Max. :1028.00 Max. :1028.00 Max. :789.00
## samp_5 samp_6 samp_7
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 2.00 1st Qu.: 2.00 1st Qu.: 2.00
## Median : 16.00 Median : 17.00 Median : 16.00
## Mean : 72.11 Mean : 73.32 Mean : 71.92
## 3rd Qu.: 102.00 3rd Qu.:103.00 3rd Qu.:101.25
## Max. :1028.00 Max. :843.00 Max. :810.00
ggplot(gather(df_3PA), aes(x = value)) +
geom_boxplot(mapping = aes(color = key)) +
xlab("3PA")
ggplot(gather(df_3PA), aes(x = value)) +
geom_density(mapping = aes(fill = key)) +
xlab("3PA")
Above we can once again see that all of these samples have very similar summary statistics across all samples. The one difference being the max 3PA seen in each sample. Samples 1,3, and 4 have a maximum value of 1028 whereas all the other samples max value is in the mid to upper 800s. This is a pretty big difference, but due to the fact that it is only one point and this variable is heavily skewed it doesn’t play a big role in vastly changing the distribution one sample from the others.
df_college <- data.frame(df_1$Colleges, df_2$Colleges, df_3$Colleges, df_4$Colleges, df_5$Colleges, df_6$Colleges, df_7$Colleges)
colnames(df_college) <- c("samp_1", "samp_2", "samp_3", "samp_4", "samp_5", "samp_6", "samp_7")
df_college %>%
gather() %>%
filter(is.na(value)) %>%
group_by(key) %>%
summarise(count_no_college = n(),
percent_no_college = n()/100)
## # A tibble: 7 × 3
## key count_no_college percent_no_college
## <chr> <int> <dbl>
## 1 samp_1 883 8.83
## 2 samp_2 891 8.91
## 3 samp_3 914 9.14
## 4 samp_4 956 9.56
## 5 samp_5 929 9.29
## 6 samp_6 957 9.57
## 7 samp_7 884 8.84
nba %>%
filter(is.na(Colleges)) %>%
summarise(count_no_college = n(),
percent_co_college = (n()/length(nba$Colleges))*100)
## # A tibble: 1 × 2
## count_no_college percent_co_college
## <int> <dbl>
## 1 2005 9.32
df_college %>%
group_by(samp_1) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 515 × 2
## # Groups: samp_1 [515]
## samp_1 n
## <chr> <int>
## 1 <NA> 883
## 2 UNC 277
## 3 Kentucky 265
## 4 UCLA 264
## 5 Duke 224
## 6 Kansas 177
## 7 Michigan 162
## 8 Arizona 152
## 9 Georgetown 136
## 10 Michigan State 136
## # ℹ 505 more rows
df_college %>%
group_by(samp_2) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 519 × 2
## # Groups: samp_2 [519]
## samp_2 n
## <chr> <int>
## 1 <NA> 891
## 2 Kentucky 285
## 3 UNC 281
## 4 UCLA 257
## 5 Duke 233
## 6 Kansas 161
## 7 Michigan 161
## 8 Arizona 154
## 9 UConn 153
## 10 Michigan State 149
## # ℹ 509 more rows
df_college %>%
group_by(samp_3) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 514 × 2
## # Groups: samp_3 [514]
## samp_3 n
## <chr> <int>
## 1 <NA> 914
## 2 UNC 272
## 3 Kentucky 254
## 4 UCLA 250
## 5 Duke 211
## 6 Kansas 201
## 7 Michigan 160
## 8 Arizona 154
## 9 Georgia Tech 132
## 10 Indiana 131
## # ℹ 504 more rows
df_college %>%
group_by(samp_4) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 514 × 2
## # Groups: samp_4 [514]
## samp_4 n
## <chr> <int>
## 1 <NA> 956
## 2 Kentucky 284
## 3 UCLA 267
## 4 UNC 265
## 5 Duke 228
## 6 Kansas 188
## 7 Arizona 156
## 8 Georgetown 144
## 9 UConn 144
## 10 Michigan 140
## # ℹ 504 more rows
df_college %>%
group_by(samp_5) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 502 × 2
## # Groups: samp_5 [502]
## samp_5 n
## <chr> <int>
## 1 <NA> 929
## 2 UNC 263
## 3 UCLA 256
## 4 Kentucky 243
## 5 Duke 204
## 6 Kansas 183
## 7 Georgetown 157
## 8 Michigan State 156
## 9 Michigan 152
## 10 Arizona 146
## # ℹ 492 more rows
df_college %>%
group_by(samp_6) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 512 × 2
## # Groups: samp_6 [512]
## samp_6 n
## <chr> <int>
## 1 <NA> 957
## 2 UNC 276
## 3 Kentucky 262
## 4 Duke 246
## 5 UCLA 243
## 6 Kansas 198
## 7 Arizona 161
## 8 Michigan 140
## 9 UConn 136
## 10 Georgetown 134
## # ℹ 502 more rows
df_college %>%
group_by(samp_7) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 518 × 2
## # Groups: samp_7 [518]
## samp_7 n
## <chr> <int>
## 1 <NA> 884
## 2 Kentucky 285
## 3 UNC 273
## 4 UCLA 264
## 5 Duke 235
## 6 Kansas 188
## 7 Michigan 144
## 8 Michigan State 139
## 9 UConn 135
## 10 Arizona 129
## # ℹ 508 more rows
nba %>%
group_by(Colleges) %>%
count() %>%
arrange(desc(n)) %>%
top_n(10)
## Selecting by n
## # A tibble: 628 × 2
## # Groups: Colleges [628]
## Colleges n
## <chr> <int>
## 1 <NA> 2005
## 2 UNC 586
## 3 Kentucky 566
## 4 UCLA 524
## 5 Duke 477
## 6 Kansas 393
## 7 Arizona 337
## 8 Michigan 331
## 9 UConn 312
## 10 Michigan State 288
## # ℹ 618 more rows
When looking at the random samples of colleges I wanted to see if proportion of players with no college experience was different than the overall proportion as well as see if the top 10 colleges changed. First off the proportion of players with no college in every sample was very close to the overall proportion of players with no college. The most any one sample deviated from the proportion in the full data set was 0.64%. When looking at the top 10 most common colleges across all samples 6 of the 7 contain the same schools just in slightly different orders depending on the sample. 1 sample includes Maryland as a top 10 school, where all the others have Michigan State in its place. In the overall data set Michigan State sits at 10th and Maryland sits at 14th most popular. So Maryland jumped Georgetown, Indiana, and Syracuse to make its top 10 appearance in sample 3.
df_age <- data.frame(df_1$Age, df_2$Age, df_3$Age, df_4$Age, df_5$Age, df_6$Age, df_7$Age)
colnames(df_college) <- c("samp_1", "samp_2", "samp_3", "samp_4", "samp_5", "samp_6", "samp_7")
summary(df_age)
## df_1.Age df_2.Age df_3.Age df_4.Age
## Min. :18.00 Min. :18.00 Min. :18.00 Min. :18.00
## 1st Qu.:24.00 1st Qu.:24.00 1st Qu.:24.00 1st Qu.:24.00
## Median :26.00 Median :26.00 Median :26.00 Median :26.00
## Mean :26.68 Mean :26.77 Mean :26.69 Mean :26.78
## 3rd Qu.:29.00 3rd Qu.:29.00 3rd Qu.:29.00 3rd Qu.:29.00
## Max. :42.00 Max. :44.00 Max. :42.00 Max. :43.00
## df_5.Age df_6.Age df_7.Age
## Min. :18.00 Min. :18.0 Min. :18.0
## 1st Qu.:24.00 1st Qu.:24.0 1st Qu.:24.0
## Median :26.00 Median :26.0 Median :26.0
## Mean :26.81 Mean :26.7 Mean :26.7
## 3rd Qu.:30.00 3rd Qu.:29.0 3rd Qu.:29.0
## Max. :44.00 Max. :41.0 Max. :42.0
ggplot(gather(df_age), aes(x = value)) +
geom_boxplot(mapping = aes(color = key)) +
xlab("Age")
Looking at the age variable the samples are more identical than any of the other variables I I have looked at so far. The medians are all identical, means vary less than 0.18 years cross all samples. The largest visible difference between samples in the boxplot is between sample 3,4,6, and 7 and sample 1, 2, and 5. In sample the group of 4 an you would consider a player as young as 37 to be an outlier, however in the latter 3 a player would have to be 40 or older to fall into this category.
df_1 <- df_1 %>%
mutate(height = (12 * Feet) + Inches)
df_2 <- df_2 %>%
mutate(height = (12 * Feet) + Inches)
df_3 <- df_3 %>%
mutate(height = (12 * Feet) + Inches)
df_4 <- df_4 %>%
mutate(height = (12 * Feet) + Inches)
df_5 <- df_5 %>%
mutate(height = (12 * Feet) + Inches)
df_6 <- df_6 %>%
mutate(height = (12 * Feet) + Inches)
df_7 <- df_7 %>%
mutate(height = (12 * Feet) + Inches)
df_height <- data.frame(df_1$height, df_2$height, df_3$height, df_4$height, df_5$height, df_6$height, df_7$height)
colnames(df_college) <- c("samp_1", "samp_2", "samp_3", "samp_4", "samp_5", "samp_6", "samp_7")
summary(df_height)
## df_1.height df_2.height df_3.height df_4.height
## Min. :63.00 Min. :63.00 Min. :63.00 Min. :63.00
## 1st Qu.:76.00 1st Qu.:76.00 1st Qu.:76.00 1st Qu.:76.00
## Median :79.00 Median :79.00 Median :79.00 Median :79.00
## Mean :78.81 Mean :78.76 Mean :78.72 Mean :78.83
## 3rd Qu.:82.00 3rd Qu.:81.00 3rd Qu.:81.00 3rd Qu.:82.00
## Max. :91.00 Max. :91.00 Max. :91.00 Max. :91.00
## df_5.height df_6.height df_7.height
## Min. :63.00 Min. :63.0 Min. :63.0
## 1st Qu.:76.00 1st Qu.:76.0 1st Qu.:76.0
## Median :79.00 Median :79.0 Median :79.0
## Mean :78.87 Mean :78.8 Mean :78.8
## 3rd Qu.:82.00 3rd Qu.:82.0 3rd Qu.:81.0
## Max. :91.00 Max. :91.0 Max. :91.0
df_height %>%
gather() %>%
ggplot(mapping = aes(x = value)) +
geom_histogram(binwidth = 1) +
geom_vline(xintercept = c(75,81), alpha = 0.3) +
geom_vline(xintercept = 78.80675) +
facet_wrap(~key)
For the last variable, height, we once again have very similar samples. The dark line in the middle of the histograms represents the overall population mean. The other lines are just visual guides showing that all 7 samples have peaks at the same 2 spots, 81 and 75. I believe the reason for this is because players of these height are diverse in the positions that they can play. Another similarity in all samples is the quick drop of to the right of the last peak and a slower decline to the left.