# Install tidyverse only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the class playlist from CSV. Per the column specification printed
# below, all three columns (Genre, Artist, Song) are read as character.
df <- read_csv(file = "Class Playlist Clean DS4 - Sheet1.csv")
## Rows: 175 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Genre, Artist, Song
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the playlist.
head(df, n = 6)
## # A tibble: 6 × 3
## Genre Artist Song
## <chr> <chr> <chr>
## 1 Genre 1 - hiphop/rap 8k When you call
## 2 Genre 2 - pop/kpop/Latin Yonaguni bad bunny
## 3 Genre 1 - hiphop/rap $uicideboy$ Diemonds
## 4 Genre 1 - hiphop/rap $uicideboy$ Not Even Ghosts Are This Empty
## 5 Genre 1 - hiphop/rap 03 Greedo Trap House
## 6 Genre 5 - alt/indie/folk 10,000 Maniacs Trouble Me
# Treat Genre as a categorical variable (factor) for the tabulations below.
df$Genre <- as.factor(df$Genre)

# Count songs per genre. The column is referenced by its bare name: writing
# `count(df$Genre)` would create a column literally named `df$Genre`, which
# makes `genre_counts$Genre` unavailable to later steps.
genre_counts <- df %>%
  count(Genre)
genre_counts
## # A tibble: 6 × 2
## Genre n
## <fct> <int>
## 1 Genre 1 - hiphop/rap 58
## 2 Genre 2 - pop/kpop/Latin 25
## 3 Genre 3 - house/ska 5
## 4 Genre 4 - rnb/soul 14
## 5 Genre 5 - alt/indie/folk 45
## 6 Genre 6 - country/rock 28

# tprob: each genre's share of the whole playlist, rounded to 2 decimals.
genre_counts <- genre_counts %>%
  mutate(tprob = round(n / sum(n), 2))
genre_counts
## # A tibble: 6 × 3
## Genre n tprob
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 58 0.33
## 2 Genre 2 - pop/kpop/Latin 25 0.14
## 3 Genre 3 - house/ska 5 0.03
## 4 Genre 4 - rnb/soul 14 0.08
## 5 Genre 5 - alt/indie/folk 45 0.26
## 6 Genre 6 - country/rock 28 0.16
# Draw 10 songs at random (without replacement) from the playlist.
# slice_sample() is the current dplyr verb; sample_n() is superseded.
# NOTE(review): no set.seed() call is visible above, so the rows drawn (and
# every table derived from them) change from run to run.
sample <- slice_sample(df, n = 10)
sample
## # A tibble: 10 × 3
## Genre Artist Song
## <fct> <chr> <chr>
## 1 Genre 6 - country/rock Pantera Floods
## 2 Genre 1 - hiphop/rap Tyler the Creator She
## 3 Genre 5 - alt/indie/folk Elliott Smith Christian Brothers
## 4 Genre 5 - alt/indie/folk Yot club Ykwim?
## 5 Genre 1 - hiphop/rap Noname Part of me
## 6 Genre 1 - hiphop/rap Lucki Randomly
## 7 Genre 1 - hiphop/rap Lil Uzi Vert Erase your social
## 8 Genre 2 - pop/kpop/Latin lady gaga just dance
## 9 Genre 1 - hiphop/rap dj khaled wild thoughts
## 10 Genre 2 - pop/kpop/Latin Rawul alejandro dime quien?
# Tabulate the 10-song sample by genre and attach each genre's share of the
# sample (count divided by the total number of sampled songs).
Sample_counts <- sample %>%
  count(Genre) %>%
  mutate(probability = n / sum(n))
Sample_counts
## # A tibble: 4 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 5 0.5
## 2 Genre 2 - pop/kpop/Latin 2 0.2
## 3 Genre 5 - alt/indie/folk 2 0.2
## 4 Genre 6 - country/rock 1 0.1
# Per-genre counts and proportions for a 10-song sample, with one row for
# every genre (including genres that received zero songs).
# The Genre labels are taken from the factor levels of df$Genre so that this
# step does not depend on how the count column in genre_counts is named.
# NOTE(review): the counts are hard-coded and do not match the `sample`
# printed above (5, 2, 0, 0, 2, 1) — presumably copied from an earlier random
# draw; consider deriving them with `sample %>% count(Genre, .drop = FALSE)`.
combined <- tibble(
  Genre = factor(levels(df$Genre)),
  n = c(2, 3, 0, 1, 2, 2),
  probability = n / sum(n)
)
combined
## # A tibble: 6 × 3
## Genre n probability
## <fct> <dbl> <dbl>
## 1 Genre 1 - hiphop/rap 2 0.2
## 2 Genre 2 - pop/kpop/Latin 3 0.3
## 3 Genre 3 - house/ska 0 0
## 4 Genre 4 - rnb/soul 1 0.1
## 5 Genre 5 - alt/indie/folk 2 0.2
## 6 Genre 6 - country/rock 2 0.2
# Draw 20 random songs, then report each genre's count and its share of the
# sample rounded to 2 decimals. Genres absent from the draw get no row.
# slice_sample() replaces the superseded sample_n().
slice_sample(df, n = 20) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 5 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 6 0.3
## 2 Genre 2 - pop/kpop/Latin 1 0.05
## 3 Genre 3 - house/ska 1 0.05
## 4 Genre 5 - alt/indie/folk 7 0.35
## 5 Genre 6 - country/rock 5 0.25
Sampling 20: the table above shows genre counts and proportions for a random sample of 20 songs.
# Draw 30 random songs, then report each genre's count and its share of the
# sample rounded to 2 decimals. Genres absent from the draw get no row.
# slice_sample() replaces the superseded sample_n().
slice_sample(df, n = 30) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 5 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 14 0.47
## 2 Genre 2 - pop/kpop/Latin 3 0.1
## 3 Genre 4 - rnb/soul 3 0.1
## 4 Genre 5 - alt/indie/folk 6 0.2
## 5 Genre 6 - country/rock 4 0.13
Sampling 30: the table above shows genre counts and proportions for a random sample of 30 songs.
# Draw 40 random songs, then report each genre's count and its share of the
# sample rounded to 2 decimals. Genres absent from the draw get no row.
# slice_sample() replaces the superseded sample_n().
slice_sample(df, n = 40) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 5 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 8 0.2
## 2 Genre 2 - pop/kpop/Latin 7 0.17
## 3 Genre 4 - rnb/soul 3 0.07
## 4 Genre 5 - alt/indie/folk 14 0.35
## 5 Genre 6 - country/rock 8 0.2
Sampling 40: the table above shows genre counts and proportions for a random sample of 40 songs.
# Draw 50 random songs, then report each genre's count and its share of the
# sample. Genres absent from the draw get no row.
# The share is rounded to 2 decimals to match the other sample-size runs in
# this script (with 50 songs every share is already a multiple of 0.02, so
# the values are unchanged). slice_sample() replaces the superseded
# sample_n().
slice_sample(df, n = 50) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 5 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 18 0.36
## 2 Genre 2 - pop/kpop/Latin 7 0.14
## 3 Genre 4 - rnb/soul 4 0.08
## 4 Genre 5 - alt/indie/folk 13 0.26
## 5 Genre 6 - country/rock 8 0.16
Sampling 50: the table above shows genre counts and proportions for a random sample of 50 songs.
# Draw 60 random songs, then report each genre's count and its share of the
# sample rounded to 2 decimals. Genres absent from the draw get no row.
# slice_sample() replaces the superseded sample_n().
slice_sample(df, n = 60) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 5 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 17 0.28
## 2 Genre 2 - pop/kpop/Latin 8 0.13
## 3 Genre 4 - rnb/soul 6 0.1
## 4 Genre 5 - alt/indie/folk 18 0.3
## 5 Genre 6 - country/rock 11 0.18
Sampling 60: the table above shows genre counts and proportions for a random sample of 60 songs.
# Draw 70 random songs, then report each genre's count and its share of the
# sample rounded to 2 decimals. Genres absent from the draw get no row.
# slice_sample() replaces the superseded sample_n().
slice_sample(df, n = 70) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 22 0.31
## 2 Genre 2 - pop/kpop/Latin 7 0.1
## 3 Genre 3 - house/ska 2 0.03
## 4 Genre 4 - rnb/soul 3 0.04
## 5 Genre 5 - alt/indie/folk 24 0.34
## 6 Genre 6 - country/rock 12 0.17
Sampling 70: the table above shows genre counts and proportions for a random sample of 70 songs.
# Draw 80 random songs, then report each genre's count and its share of the
# sample rounded to 2 decimals. Genres absent from the draw get no row.
# slice_sample() replaces the superseded sample_n().
slice_sample(df, n = 80) %>%
  count(Genre) %>%
  mutate(probability = round(n / sum(n), 2))
## # A tibble: 6 × 3
## Genre n probability
## <fct> <int> <dbl>
## 1 Genre 1 - hiphop/rap 29 0.36
## 2 Genre 2 - pop/kpop/Latin 8 0.1
## 3 Genre 3 - house/ska 2 0.03
## 4 Genre 4 - rnb/soul 10 0.12
## 5 Genre 5 - alt/indie/folk 20 0.25
## 6 Genre 6 - country/rock 11 0.14