#Student participation:
# Keydy Sanchez
# Vanessa Wasveiler
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(purrr)
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)

# Load the NBA player statistics. Column specification reported by readr:
# Name, Birth_Place, Birthdate, Collage, Experience, Pos and Team are parsed as
# character; every other column is parsed as double.
nba <- read_csv("datasets_1206_2171_players_stats (1).csv")

# Open the raw data in the data viewer for a quick visual check.
view(nba)

# Keep only the players whose position (Pos) is recorded.
nba_new <- drop_na(nba, Pos)
# 1. Using the nba dataset, calculate the sampling error for average player weight using a sample size of 50. Be sure to use drop_na() so that you do not have missing values.
# Fix the seed so the random sample of 50 players is reproducible.
set.seed(1)

# Population parameter: mean weight across every player in nba_new.
parameter <- summarize(nba_new, average_weight = mean(Weight))

# Sample statistic: mean weight in one simple random sample of 50 players.
sample_1 <- nba_new %>%
  sample_n(size = 50)
sample_statistic <- summarize(sample_1, average_weight = mean(Weight))

# Sampling error, computed here as parameter minus statistic. Both operands
# are one-row data frames, so the result is a one-row data frame as well.
sampling_error <- parameter - sample_statistic
print(sampling_error)
## average_weight
## 1 0.4961943
# 2. Sample nba_new$Games Played 100 times with the following sample sizes: 20, 60, 200 and plot them using ggplot2. Describe what you see in 1-2 sentences max (maybe you don’t see anything noteable? - that’s okay too!). Pay attention to the y-axis so that you can see variability. As you work on this, think about what it is that you are doing random samples for.
# Population mean of games played; drawn as the reference line on every plot.
population_mean_games <- mean(nba_new$`Games Played`)

# Draw `n_samples` random samples of `sample_size` values from
# nba_new$`Games Played` and return the mean of each sample as a numeric vector.
draw_games_played_means <- function(sample_size, n_samples = 100) {
  replicate(
    n = n_samples,
    mean(sample(nba_new$`Games Played`, size = sample_size))
  )
}

# Scatter plot of the sample means against their sample index, with the
# population mean as a horizontal line. The title states the sample size and
# the number of samples (the original titles labelled the sample size as
# "times", although every plot is based on 100 repetitions).
plot_games_played_means <- function(sample_means, sample_size) {
  tibble(
    sample_index = seq_along(sample_means),
    sample_mean = sample_means
  ) %>%
    ggplot(aes(x = sample_index, y = sample_mean)) +
    geom_point() +
    labs(
      title = paste0(
        "Sample size ", sample_size, " (", length(sample_means), " samples)"
      ),
      x = "Sample index",
      y = "Mean games played"
    ) +
    geom_hline(yintercept = population_mean_games)
}

# The seed is reset before each batch, exactly as before, so each batch of
# sample means is reproducible on its own.
set.seed(1)
games_played_sample1 <- draw_games_played_means(20)
plot_games_played_means(games_played_sample1, 20)

set.seed(1)
games_played_sample2 <- draw_games_played_means(60)
plot_games_played_means(games_played_sample2, 60)

set.seed(1)
games_played_sample3 <- draw_games_played_means(200)
plot_games_played_means(games_played_sample3, 200)
# After using the 3 different sample sizes above, the most noticeable thing is that the range of the y-axis becomes narrower as the sample size gets bigger, i.e., the sample means cluster more tightly around the population mean.
# 3. Use stratified sampling to obtain the average mean value of points per player 100 times for a sample size of 98 (17 + 21 + 19 + 18 + 23 for your player Pos breakdown).
# Number and percentage of players at each position, largest share first
# (presumably the basis for the per-position quotas of the stratified sample).
# count() replaces group_by() + summarize(n()) and returns an ungrouped result,
# so the summarise() "ungrouping output" message is no longer emitted.
# view() matches the viewer call used earlier in this script.
nba_new %>%
  count(Pos, name = "freq") %>%
  mutate(percentage = freq / sum(freq) * 100) %>%
  arrange(desc(percentage)) %>%
  view()
# Seed the generator so the stratified draws that follow are reproducible.
set.seed(1)

# Draw one stratified sample of 98 players from nba_new and return its mean
# PTS value.
#
# x: ignored; it exists only so the function can be mapped over a vector of
#    sample indices.
# Returns: a single numeric value, the mean of PTS over the 98 sampled rows.
sample_strat_nba <- function(x) {
  # Players drawn per position (17 + 21 + 19 + 18 + 23 = 98).
  strata_sizes <- c(C = 17, PF = 21, PG = 19, SF = 18, SG = 23)

  # Sample every stratum in the order listed above, then stack the draws.
  strata_draws <- imap(strata_sizes, function(stratum_n, stratum_pos) {
    sample_n(filter(nba_new, Pos == stratum_pos), stratum_n)
  })
  combo_nba <- bind_rows(strata_draws)

  mean(combo_nba$PTS)
}
# Repeat the stratified sampling 100 times, keeping the mean PTS of each draw.
sample_num1 <- 1:100
sample_mean_points_value <- map_dbl(sample_num1, sample_strat_nba)

# Pair each sample index with its own sample mean. The previous version bound
# the indices to a fresh sample_strat_nba() call, which stored one recycled
# value in every row and consumed an extra random draw.
average_points_matrix <- cbind(sample_num1, sample_mean_points_value)
average_points_matrix_df <- as.data.frame(average_points_matrix)

# Plot the 100 stratified sample means against the population mean of PTS
# (horizontal line); both aesthetics now come from the data frame's columns.
average_points_matrix_df %>%
  ggplot(aes(x = sample_num1, y = sample_mean_points_value)) +
  geom_point() +
  geom_hline(yintercept = mean(nba_new$PTS), color = "#002D62")