Activity #4 Group 2

#Student participation: 
# Keydy Sanchez 
# Vanessa Wasveiler

library(tidyverse)

## ── Attaching packages ────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.3
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(purrr)
# Strongly prefer fixed over scientific notation when numbers are printed.
options(scipen = 999)
# Load the player statistics CSV (path is relative to the working directory).
nba <- read_csv(file = "datasets_1206_2171_players_stats (1).csv")

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Name = col_character(),
##   Birth_Place = col_character(),
##   Birthdate = col_character(),
##   Collage = col_character(),
##   Experience = col_character(),
##   Pos = col_character(),
##   Team = col_character()
## )

## See spec(...) for full column specifications.

# Open the raw data in the interactive viewer for a quick look.
view(nba)

# Keep only the rows whose Pos value is present; later questions use nba_new.
nba_new <- drop_na(nba, Pos)

# 1. Using the nba dataset, calculate the sampling error for average player weight using a sample size of 50. Be sure to use drop_na() so that you do not have missing values.

# Fix the RNG seed so the sample drawn below is reproducible.
set.seed(1)

# Population parameter: mean Weight over every row of nba_new.
parameter <- summarize(nba_new, average_weight = mean(Weight))

# One simple random sample of 50 rows.
sample_1 <- nba_new %>% sample_n(size = 50)

# Sample statistic: mean Weight within that sample.
sample_statistic <- summarize(sample_1, average_weight = mean(Weight))

# Sampling error = parameter - statistic (a one-row, one-column data frame).
sampling_error <- parameter - sample_statistic
print(sampling_error)

##   average_weight
## 1      0.4961943

# 2. Sample nba_new$Games Played 100 times with the following sample sizes: 20, 60, 200 and plot them using ggplot2. Describe what you see in 1-2 sentences max (maybe you don’t see anything notable? - that’s okay too!). Pay attention to the y-axis so that you can see variability. As you work on this, think about what it is that you are doing random samples for.

# Sampling distribution of the mean of `Games Played` for sample size 20:
# draw 100 random samples of 20 values each and record every sample's mean.
set.seed(1)

games_played_sample1 <- replicate(
  n = 100,
  mean(sample(nba_new$`Games Played`, size = 20))
)
# One index per replicate, used as the x-axis of the plot.
sample_index_1 <- seq_along(games_played_sample1)
games_played_mat <- cbind(sample_index_1, games_played_sample1)
games_played_df <- as.data.frame(games_played_mat)

# Each point is one sample mean; the horizontal line marks the mean of the
# full nba_new column. The title names the sample size (20), not the number of
# repetitions (100).
games_played_df %>%
  ggplot(aes(x = sample_index_1, y = games_played_sample1)) +
  geom_point() +
  labs(title = "Sample size = 20 (100 samples)") +
  geom_hline(yintercept = mean(nba_new$`Games Played`))

# Sampling distribution of the mean of `Games Played` for sample size 60:
# draw 100 random samples of 60 values each and record every sample's mean.
set.seed(1)

games_played_sample2 <- replicate(
  n = 100,
  mean(sample(nba_new$`Games Played`, size = 60))
)
# One index per replicate, used as the x-axis of the plot.
sample_index_2 <- seq_along(games_played_sample2)
games_played_mat <- cbind(sample_index_2, games_played_sample2)
games_played_df <- as.data.frame(games_played_mat)

# Each point is one sample mean; the horizontal line marks the mean of the
# full nba_new column. The title names the sample size (60), not the number of
# repetitions (100).
games_played_df %>%
  ggplot(aes(x = sample_index_2, y = games_played_sample2)) +
  geom_point() +
  labs(title = "Sample size = 60 (100 samples)") +
  geom_hline(yintercept = mean(nba_new$`Games Played`))

# Sampling distribution of the mean of `Games Played` for sample size 200:
# draw 100 random samples of 200 values each and record every sample's mean.
set.seed(1)

games_played_sample3 <- replicate(
  n = 100,
  mean(sample(nba_new$`Games Played`, size = 200))
)
# One index per replicate, used as the x-axis of the plot.
sample_index_3 <- seq_along(games_played_sample3)
games_played_mat <- cbind(sample_index_3, games_played_sample3)
games_played_df <- as.data.frame(games_played_mat)

# Each point is one sample mean; the horizontal line marks the mean of the
# full nba_new column. The title names the sample size (200), not the number
# of repetitions (100).
games_played_df %>%
  ggplot(aes(x = sample_index_3, y = games_played_sample3)) +
  geom_point() +
  labs(title = "Sample size = 200 (100 samples)") +
  geom_hline(yintercept = mean(nba_new$`Games Played`))

After using the 3 different sample sizes above, the most noticeable thing is how the range of the y-axis (the spread of the sample means around the population mean) becomes smaller as the sample size gets bigger and bigger.

# 3. Use stratified sampling to obtain the average mean value of points per player 100 times for a sample size of 98 (17 + 21 + 19 + 18 + 23 for your player Pos breakdown).

# Count the rows for each Pos value, express each count as a percentage of all
# rows, sort from the largest share to the smallest, and open the result in
# the data viewer.
nba_new %>%
  group_by(Pos) %>%
  summarise(freq = n()) %>%
  mutate(percentage = freq / sum(freq) * 100) %>%
  arrange(dplyr::desc(percentage)) %>%
  View()

## `summarise()` ungrouping output (override with `.groups` argument)

# Fix the RNG seed so the stratified draws below are reproducible.
set.seed(1)

# Draw one stratified sample of 98 rows from nba_new (17 C, 21 PF, 19 PG,
# 18 SF, 23 SG) and return the mean of PTS over the combined sample.
#
# `x` is accepted only so the function can be handed to map_dbl() as a
# per-iteration callback; its value is never read.
sample_strat_nba <- function(x) {
  # Rows to draw per position; the order fixes the sequence of random draws.
  strata_sizes <- c(C = 17, PF = 21, PG = 19, SF = 18, SG = 23)

  strata_samples <- imap(strata_sizes, function(stratum_n, stratum_pos) {
    sample_n(filter(nba_new, Pos == stratum_pos), stratum_n)
  })

  combo_nba <- bind_rows(strata_samples)
  mean(combo_nba$PTS)
}

# Repeat the stratified sampling 100 times and plot the resulting sample means.
sample_num1 <- 1:100

# One stratified-sample mean of PTS per replicate.
sample_mean_points_value <- map_dbl(sample_num1, sample_strat_nba)

# Bind the replicate index to the 100 sample means computed above. Binding a
# fresh sample_strat_nba() call here would draw one extra sample and recycle
# that single value across all rows, leaving the plotted column out of the
# data frame.
average_points_matrix <- cbind(sample_num1, sample_mean_points_value)
average_points_matrix_df <- as.data.frame(average_points_matrix)

# Each point is one stratified-sample mean; the horizontal line marks the mean
# of PTS over all of nba_new.
average_points_matrix_df %>%
  ggplot(aes(x = sample_num1, y = sample_mean_points_value)) +
  geom_point() +
  geom_hline(yintercept = mean(nba_new$PTS), color = "#002D62")