DS4 Song Simulation 4.3

# Install tidyverse only when it is missing, so re-running this script does
# not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the class playlist CSV into a tibble.
df <- read_csv(file = "Class Playlist Clean DS4 - Sheet1.csv")

## Rows: 175 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Genre, Artist, Song
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The columns show the genre, artist, and song for each entry in the class playlist

# Preview the first six rows of the playlist.
df %>% head(n = 6)

## # A tibble: 6 × 3
##   Genre                    Artist         Song                          
##   <chr>                    <chr>          <chr>                         
## 1 Genre 1 - hiphop/rap     8k             When you call                 
## 2 Genre 2 - pop/kpop/Latin Yonaguni       bad bunny                     
## 3 Genre 1 - hiphop/rap     $uicideboy$    Diemonds                      
## 4 Genre 1 - hiphop/rap     $uicideboy$    Not Even Ghosts Are This Empty
## 5 Genre 1 - hiphop/rap     03 Greedo      Trap House                    
## 6 Genre 5 - alt/indie/folk 10,000 Maniacs Trouble Me

Showing which genres were voted the most and least

# Treat Genre as a categorical variable.
df$Genre <- as.factor(df$Genre)

# Count songs per genre. Refer to the column by its bare name: writing
# count(df$Genre) names the result column `df$Genre`, so a later
# genre_counts$Genre lookup finds nothing.
genre_counts <- df %>% count(Genre)
genre_counts

## # A tibble: 6 × 2
##   Genre                        n
##   <fct>                    <int>
## 1 Genre 1 - hiphop/rap        58
## 2 Genre 2 - pop/kpop/Latin    25
## 3 Genre 3 -  house/ska         5
## 4 Genre 4 -  rnb/soul         14
## 5 Genre 5 - alt/indie/folk    45
## 6 Genre 6 - country/rock      28

Changed the column of each probability

# Theoretical probability of each genre: its share of all songs.
genre_counts <- genre_counts %>% mutate(tprob = n / sum(n))

33% hip hop/rap

# Round the probabilities to two decimal places for display.
genre_counts$tprob <- round(genre_counts$tprob, 2)
genre_counts

## # A tibble: 6 × 3
##   Genre                        n tprob
##   <fct>                    <int> <dbl>
## 1 Genre 1 - hiphop/rap        58  0.33
## 2 Genre 2 - pop/kpop/Latin    25  0.14
## 3 Genre 3 -  house/ska         5  0.03
## 4 Genre 4 -  rnb/soul         14  0.08
## 5 Genre 5 - alt/indie/folk    45  0.26
## 6 Genre 6 - country/rock      28  0.16

Calculated the experimental probabilities

# Draw 10 random rows (songs) from the playlist.
sample <- df %>% sample_n(size = 10)
sample

## # A tibble: 10 × 3
##    Genre                    Artist            Song              
##    <fct>                    <chr>             <chr>             
##  1 Genre 6 - country/rock   Pantera           Floods            
##  2 Genre 1 - hiphop/rap     Tyler the Creator She               
##  3 Genre 5 - alt/indie/folk Elliott Smith     Christian Brothers
##  4 Genre 5 - alt/indie/folk Yot club          Ykwim?            
##  5 Genre 1 - hiphop/rap     Noname            Part of me        
##  6 Genre 1 - hiphop/rap     Lucki             Randomly          
##  7 Genre 1 - hiphop/rap     Lil Uzi Vert      Erase your social 
##  8 Genre 2 - pop/kpop/Latin lady gaga         just dance        
##  9 Genre 1 - hiphop/rap     dj khaled         wild thoughts     
## 10 Genre 2 - pop/kpop/Latin Rawul alejandro   dime quien?

# Tally the sampled songs by genre and turn the counts into proportions.
Sample_counts <- sample %>%
  count(Genre) %>%
  mutate(probability = n / sum(n))
Sample_counts

## # A tibble: 4 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap         5         0.5
## 2 Genre 2 - pop/kpop/Latin     2         0.2
## 3 Genre 5 - alt/indie/folk     2         0.2
## 4 Genre 6 - country/rock       1         0.1

Changed the table to show only probabilities and genres

# Combined genre counts from the samples of 10, with their proportions.
# The genre labels are taken from the first column of genre_counts by
# position: genre_counts$Genre returned NULL (the count column was not named
# `Genre`), which silently dropped the Genre column from this table.
combined <- tibble(
  Genre = genre_counts[[1]],
  n = c(2, 3, 0, 1, 2, 2),
  probability = n / sum(n)
)

combined

## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <dbl>       <dbl>
## 1 Genre 1 - hiphop/rap         2         0.2
## 2 Genre 2 - pop/kpop/Latin     3         0.3
## 3 Genre 3 -  house/ska         0         0  
## 4 Genre 4 -  rnb/soul          1         0.1
## 5 Genre 5 - alt/indie/folk     2         0.2
## 6 Genre 6 - country/rock       2         0.2

All of our samples of 10 were combined

# Sample 20 songs and tabulate the proportion of each genre.
df %>%
  sample_n(size = 20) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), digits = 2))

## # A tibble: 5 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap         6        0.3 
## 2 Genre 2 - pop/kpop/Latin     1        0.05
## 3 Genre 3 -  house/ska         1        0.05
## 4 Genre 5 - alt/indie/folk     7        0.35
## 5 Genre 6 - country/rock       5        0.25

Run the simulation with larger samples to see how the probabilities change

Sampling 20

# Sample 30 songs and tabulate the proportion of each genre.
df %>%
  sample_n(size = 30) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), digits = 2))

## # A tibble: 5 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        14        0.47
## 2 Genre 2 - pop/kpop/Latin     3        0.1 
## 3 Genre 4 -  rnb/soul          3        0.1 
## 4 Genre 5 - alt/indie/folk     6        0.2 
## 5 Genre 6 - country/rock       4        0.13

Sampling 30

# Sample 40 songs and tabulate the proportion of each genre.
df %>%
  sample_n(size = 40) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), digits = 2))

## # A tibble: 5 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap         8        0.2 
## 2 Genre 2 - pop/kpop/Latin     7        0.17
## 3 Genre 4 -  rnb/soul          3        0.07
## 4 Genre 5 - alt/indie/folk    14        0.35
## 5 Genre 6 - country/rock       8        0.2

Sampling 40

# Sample 50 songs and tabulate the proportion of each genre, rounded to two
# decimals to match the runs at the other sample sizes.
df %>%
  sample_n(size = 50) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))

## # A tibble: 5 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        18        0.36
## 2 Genre 2 - pop/kpop/Latin     7        0.14
## 3 Genre 4 -  rnb/soul          4        0.08
## 4 Genre 5 - alt/indie/folk    13        0.26
## 5 Genre 6 - country/rock       8        0.16

Sampling 50

# Sample 60 songs and tabulate the proportion of each genre.
df %>%
  sample_n(size = 60) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), digits = 2))

## # A tibble: 5 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        17        0.28
## 2 Genre 2 - pop/kpop/Latin     8        0.13
## 3 Genre 4 -  rnb/soul          6        0.1 
## 4 Genre 5 - alt/indie/folk    18        0.3 
## 5 Genre 6 - country/rock      11        0.18

Sampling 60

# Sample 70 songs and tabulate the proportion of each genre.
df %>%
  sample_n(size = 70) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), digits = 2))

## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        22        0.31
## 2 Genre 2 - pop/kpop/Latin     7        0.1 
## 3 Genre 3 -  house/ska         2        0.03
## 4 Genre 4 -  rnb/soul          3        0.04
## 5 Genre 5 - alt/indie/folk    24        0.34
## 6 Genre 6 - country/rock      12        0.17

Sampling 70

# Sample 80 songs and tabulate the proportion of each genre.
df %>%
  sample_n(size = 80) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), digits = 2))

## # A tibble: 6 × 3
##   Genre                        n probability
##   <fct>                    <int>       <dbl>
## 1 Genre 1 - hiphop/rap        29        0.36
## 2 Genre 2 - pop/kpop/Latin     8        0.1 
## 3 Genre 3 -  house/ska         2        0.03
## 4 Genre 4 -  rnb/soul         10        0.12
## 5 Genre 5 - alt/indie/folk    20        0.25
## 6 Genre 6 - country/rock      11        0.14