DS4 Song Simulation 4.4

# Install tidyverse only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the class playlist from the CSV file into `df`.
df <- read_csv(
  file = "Class Playlist Clean DS4 - Sheet1.csv"
)
## Rows: 175 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Genre, Artist, Song
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Columns show which artist was voted the most in each genre

# Preview the first rows of the playlist.
df %>%
  head()
## # A tibble: 6 × 3
##   Genre                    Artist         Song                          
##   <chr>                    <chr>          <chr>                         
## 1 Genre 1 - hiphop/rap     8k             When you call                 
## 2 Genre 2 - pop/kpop/Latin Yonaguni       bad bunny                     
## 3 Genre 1 - hiphop/rap     $uicideboy$    Diemonds                      
## 4 Genre 1 - hiphop/rap     $uicideboy$    Not Even Ghosts Are This Empty
## 5 Genre 1 - hiphop/rap     03 Greedo      Trap House                    
## 6 Genre 5 - alt/indie/folk 10,000 Maniacs Trouble Me

Showing which genres were voted the most and least

# Treat Genre as a categorical variable, then tally the songs in each genre.
df$Genre <- as.factor(df$Genre)
# Count by the bare column name so the result column is called `Genre` and can
# be referenced as genre_counts$Genre; count(df$Genre) would instead create a
# column literally named `df$Genre`.
genre_counts <- df %>%
  count(Genre)
genre_counts
## # A tibble: 6 × 2
##   Genre                        n
##   <fct>                    <int>
## 1 Genre 1 - hiphop/rap        58
## 2 Genre 2 - pop/kpop/Latin    25
## 3 Genre 3 -  house/ska         5
## 4 Genre 4 -  rnb/soul         14
## 5 Genre 5 - alt/indie/folk    45
## 6 Genre 6 - country/rock      28

Added a column with the probability of each genre

# Add `tprob`: each genre's count divided by the total number of songs.
genre_counts <- mutate(
  genre_counts,
  tprob = n / sum(n)
)

Hip hop/rap makes up 33% of the playlist

# Round the genre shares to two decimal places, then show the table.
genre_counts <- genre_counts %>%
  mutate(tprob = round(tprob, digits = 2))
genre_counts
## # A tibble: 6 × 3
##   `df$Genre`                   n tprob
##   <fct>                    <int> <dbl>
## 1 Genre 1 - hiphop/rap        58  0.33
## 2 Genre 2 - pop/kpop/Latin    25  0.14
## 3 Genre 3 -  house/ska         5  0.03
## 4 Genre 4 -  rnb/soul         14  0.08
## 5 Genre 5 - alt/indie/folk    45  0.26
## 6 Genre 6 - country/rock      28  0.16

Calculated the experimental probabilities

# Draw 10 songs at random, without replacement, from the playlist.
# slice_sample() is the current dplyr replacement for the superseded sample_n().
# NOTE(review): the name `sample` shadows base::sample(); kept because the
# following code refers to it.
sample <- slice_sample(df, n = 10)
sample
## # A tibble: 10 × 3
##    Genre                    Artist           Song                       
##    <fct>                    <chr>            <chr>                      
##  1 Genre 4 -  rnb/soul      Trevor Duncan    East side story            
##  2 Genre 5 - alt/indie/folk Paramore         Decode                     
##  3 Genre 6 - country/rock   Three Days Grace I Hate Everything About You
##  4 Genre 1 - hiphop/rap     Stone ii         O.T.W                      
##  5 Genre 2 - pop/kpop/Latin Casper.True      Too Much                   
##  6 Genre 1 - hiphop/rap     NLE choppa       Shotta Flow 3              
##  7 Genre 1 - hiphop/rap     Lil uzi vert     Ps & Qs                    
##  8 Genre 6 - country/rock   Hank Williams    I'm So Lonesome I Could Cry
##  9 Genre 5 - alt/indie/folk Surf Curse       Freaks                     
## 10 Genre 2 - pop/kpop/Latin MASHLE           Bling-bang-bang-born
# Tabulate the sampled songs by genre and convert counts to proportions.
# .drop = FALSE keeps factor levels that were not drawn as rows with n = 0, so
# the table always lists every genre.
Sample_counts <- sample %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = n / sum(n))
Sample_counts
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap         3         0.3
## 2 Genre 2 - pop/kpop/Latin     2         0.2
## 3 Genre 3 -  house/ska         0         0  
## 4 Genre 4 -  rnb/soul          1         0.1
## 5 Genre 5 - alt/indie/folk     2         0.2
## 6 Genre 6 - country/rock       2         0.2

Changed the table to show only probabilities and genres

# Build a table of genre, count, and proportion for a sample of 10 songs.
# The genre factor is taken from the first column of genre_counts by position,
# because that column is not reliably named `Genre`; genre_counts$Genre returned
# NULL and the Genre column was silently dropped from the result.
# NOTE(review): the counts are hard-coded; confirm they correspond to the
# intended sample and are listed in the same order as the genre levels.
combined <- tibble(
  Genre = genre_counts[[1]],
  n = c(2, 3, 0, 1, 2, 2),
  probability = n / sum(n)
)
combined
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <dbl>       <dbl>
## 1 Genre 1 - hiphop/rap         2         0.2
## 2 Genre 2 - pop/kpop/Latin     3         0.3
## 3 Genre 3 -  house/ska         0         0  
## 4 Genre 4 -  rnb/soul          1         0.1
## 5 Genre 5 - alt/indie/folk     2         0.2
## 6 Genre 6 - country/rock       2         0.2

All of our samples of 10 were combined

# Draw 20 songs at random (without replacement) and tabulate genre proportions.
# .drop = FALSE keeps genres that were not drawn as rows with n = 0, so every
# sample size yields a comparable six-row table.
slice_sample(df, n = 20) %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap         6        0.3 
## 2 Genre 2 - pop/kpop/Latin     4        0.2 
## 3 Genre 3 -  house/ska         0        0   
## 4 Genre 4 -  rnb/soul          1        0.05
## 5 Genre 5 - alt/indie/folk     5        0.25
## 6 Genre 6 - country/rock       4        0.2

Run the simulation to see how the probabilities change

Sampling 20

# Draw 30 songs at random (without replacement) and tabulate genre proportions.
# .drop = FALSE keeps genres that were not drawn as rows with n = 0, so every
# sample size yields a comparable six-row table.
slice_sample(df, n = 30) %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        11        0.37
## 2 Genre 2 - pop/kpop/Latin     0        0   
## 3 Genre 3 -  house/ska         1        0.03
## 4 Genre 4 -  rnb/soul          3        0.1 
## 5 Genre 5 - alt/indie/folk     7        0.23
## 6 Genre 6 - country/rock       8        0.27

Sampling 30

# Draw 40 songs at random (without replacement) and tabulate genre proportions.
# .drop = FALSE keeps genres that were not drawn as rows with n = 0.
slice_sample(df, n = 40) %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        14        0.35
## 2 Genre 2 - pop/kpop/Latin     7        0.17
## 3 Genre 3 -  house/ska         1        0.03
## 4 Genre 4 -  rnb/soul          3        0.07
## 5 Genre 5 - alt/indie/folk     8        0.2 
## 6 Genre 6 - country/rock       7        0.17

Sampling 40

# Draw 50 songs at random (without replacement) and tabulate genre proportions.
# .drop = FALSE keeps genres that were not drawn as rows with n = 0.
# Proportions are rounded to two decimals, matching the other sample sizes.
slice_sample(df, n = 50) %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        13        0.26
## 2 Genre 2 - pop/kpop/Latin    10        0.2 
## 3 Genre 3 -  house/ska         1        0.02
## 4 Genre 4 -  rnb/soul          3        0.06
## 5 Genre 5 - alt/indie/folk    16        0.32
## 6 Genre 6 - country/rock       7        0.14

Sampling 50

# Draw 60 songs at random (without replacement) and tabulate genre proportions.
# .drop = FALSE keeps genres that were not drawn as rows with n = 0.
slice_sample(df, n = 60) %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        20        0.33
## 2 Genre 2 - pop/kpop/Latin     9        0.15
## 3 Genre 3 -  house/ska         3        0.05
## 4 Genre 4 -  rnb/soul          4        0.07
## 5 Genre 5 - alt/indie/folk    14        0.23
## 6 Genre 6 - country/rock      10        0.17

Sampling 60

# Draw 70 songs at random (without replacement) and tabulate genre proportions.
# .drop = FALSE keeps genres that were not drawn as rows with n = 0.
slice_sample(df, n = 70) %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        25        0.36
## 2 Genre 2 - pop/kpop/Latin    11        0.16
## 3 Genre 3 -  house/ska         1        0.01
## 4 Genre 4 -  rnb/soul          5        0.07
## 5 Genre 5 - alt/indie/folk    16        0.23
## 6 Genre 6 - country/rock      12        0.17

Sampling 70

# Draw 80 songs at random (without replacement) and tabulate genre proportions.
# .drop = FALSE keeps genres that were not drawn as rows with n = 0.
slice_sample(df, n = 80) %>%
  count(Genre, .drop = FALSE) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        23        0.29
## 2 Genre 2 - pop/kpop/Latin    12        0.15
## 3 Genre 3 -  house/ska         3        0.04
## 4 Genre 4 -  rnb/soul          5        0.06
## 5 Genre 5 - alt/indie/folk    23        0.29
## 6 Genre 6 - country/rock      14        0.17