# ===== 0. Libraries ==========================================================
library (tidyverse) # dplyr, ggplot2, readr, forcats...
library (janitor) # clean_names()
library (lubridate) # ymd(), year()
library (tidymodels) # recipes + workflows + parsnip + yardstick
library (yardstick) # (tidymodels loads this, explicit for clarity)
library (pROC) # threshold optimisation & extra ROC tools
# ===== 1. Load & initial inspect ============================================
# ※ データファイルは同じ作業ディレクトリに置いてください
df <- read_csv ("dat_with_country_all.csv" ) %>%
clean_names () %>%
mutate (
# 正解ラベルを factor に。levels の順序は "low" -> "high" にしておく
popularity = factor (popularity, levels = c ("low" , "high" )),
# 日付を Date 型 → 年だけ取り出し
track_album_release_date = ymd (track_album_release_date),
release_year = year (track_album_release_date)
) %>%
select (- track_album_release_date) # もとの日付列は削除
glimpse (df)
Rows: 4,786
Columns: 32
$ energy <dbl> 0.74600, 0.83500, 0.80400, 0.10400, 0.47200, 0.36500…
$ tempo <dbl> 132.310, 129.981, 111.457, 76.474, 80.487, 119.347, …
$ danceability <dbl> 0.6360, 0.5720, 0.5910, 0.4430, 0.6850, 0.6700, 0.66…
$ playlist_genre <chr> "rock", "rock", "rock", "jazz", "jazz", "jazz", "jaz…
$ loudness <dbl> -3.785, -6.219, -7.299, -17.042, -9.691, -10.158, -1…
$ liveness <dbl> 0.1730, 0.0702, 0.0818, 0.1910, 0.2240, 0.0575, 0.10…
$ valence <dbl> 0.4320, 0.7950, 0.6580, 0.3940, 0.4750, 0.4500, 0.65…
$ track_artist <chr> "Creedence Clearwater Revival", "Van Halen", "Stevie…
$ time_signature <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 3…
$ speechiness <dbl> 0.0393, 0.0317, 0.0454, 0.1010, 0.0298, 0.0566, 0.09…
$ track_popularity <dbl> 23, 53, 55, 64, 62, 61, 60, 55, 54, 53, 53, 53, 52, …
$ track_href <chr> "https://api.spotify.com/v1/tracks/5e6x5YRnMJIKvYpZx…
$ uri <chr> "spotify:track:5e6x5YRnMJIKvYpZxLqdpH", "spotify:tra…
$ track_album_name <chr> "The Long Road Home - The Ultimate John Fogerty / Cr…
$ playlist_name <chr> "Rock Classics", "Rock Classics", "Rock Classics", "…
$ analysis_url <chr> "https://api.spotify.com/v1/audio-analysis/5e6x5YRnM…
$ track_id <chr> "5e6x5YRnMJIKvYpZxLqdpH", "5FqYA8KfiwsQvyBI4IamnY", …
$ track_name <chr> "Fortunate Son", "Jump - 2015 Remaster", "Edge of Se…
$ instrumentalness <dbl> 2.90e-01, 3.77e-04, 5.98e-06, 0.00e+00, 2.84e-01, 0.…
$ track_album_id <chr> "4A8gFwqd9jTtnsNwUu3OQx", "2c965LEDRNrXXCeBOAAwns", …
$ mode <dbl> 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0…
$ key <dbl> 0, 0, 0, 0, 9, 0, 0, 10, 5, 5, 1, 10, 7, 0, 8, 0, 10…
$ duration_ms <dbl> 138053, 241600, 329413, 185160, 205720, 147147, 3545…
$ acousticness <dbl> 0.0648, 0.1710, 0.3270, 0.9130, 0.7850, 0.5250, 0.70…
$ id <chr> "5e6x5YRnMJIKvYpZxLqdpH", "5FqYA8KfiwsQvyBI4IamnY", …
$ playlist_subgenre <chr> "classic", "classic", "classic", "classic", "classic…
$ type <chr> "audio_features", "audio_features", "audio_features"…
$ playlist_id <chr> "37i9dQZF1DWXRqgorJj26U", "37i9dQZF1DWXRqgorJj26U", …
$ popularity <fct> low, low, low, low, low, low, low, low, low, low, lo…
$ genre6 <chr> "rock", "rock", "rock", "others", "others", "others"…
$ country <chr> "US", "US", "US", "US", "CA", "US", "US", "US", "US"…
$ release_year <dbl> 2005, 2015, 2016, 2007, 2000, 2015, 1997, 1990, 1992…
# ===== 2. Remove leakage / identifier columns ===============================
leak_cols <- c (
"track_popularity" , "id" , "uri" , "track_name" , "track_artist" ,
"playlist_id" , "track_href" , "track_album_name" , "playlist_name" ,
"analysis_url" , "type" , "track_id" , "track_album_id" , "genre6"
)
df <- df %>% select (- any_of (leak_cols))
# ===== 3. Collapse infrequent categories ====================================
# カテゴリ数が多すぎる列 → 出現数 < collapse_min を "Other" に統合
collapse_min <- 10
df <- df %>%
mutate (
playlist_genre = fct_lump_min (playlist_genre, min = collapse_min, other_level = "Other" ),
playlist_subgenre = fct_lump_min (playlist_subgenre, min = collapse_min, other_level = "Other" ),
country = fct_lump_min (country, min = collapse_min, other_level = "Other" )
)
glimpse (df)
Rows: 4,786
Columns: 18
$ energy <dbl> 0.74600, 0.83500, 0.80400, 0.10400, 0.47200, 0.36500…
$ tempo <dbl> 132.310, 129.981, 111.457, 76.474, 80.487, 119.347, …
$ danceability <dbl> 0.6360, 0.5720, 0.5910, 0.4430, 0.6850, 0.6700, 0.66…
$ playlist_genre <fct> rock, rock, rock, jazz, jazz, jazz, jazz, jazz, jazz…
$ loudness <dbl> -3.785, -6.219, -7.299, -17.042, -9.691, -10.158, -1…
$ liveness <dbl> 0.1730, 0.0702, 0.0818, 0.1910, 0.2240, 0.0575, 0.10…
$ valence <dbl> 0.4320, 0.7950, 0.6580, 0.3940, 0.4750, 0.4500, 0.65…
$ time_signature <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 3…
$ speechiness <dbl> 0.0393, 0.0317, 0.0454, 0.1010, 0.0298, 0.0566, 0.09…
$ instrumentalness <dbl> 2.90e-01, 3.77e-04, 5.98e-06, 0.00e+00, 2.84e-01, 0.…
$ mode <dbl> 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0…
$ key <dbl> 0, 0, 0, 0, 9, 0, 0, 10, 5, 5, 1, 10, 7, 0, 8, 0, 10…
$ duration_ms <dbl> 138053, 241600, 329413, 185160, 205720, 147147, 3545…
$ acousticness <dbl> 0.0648, 0.1710, 0.3270, 0.9130, 0.7850, 0.5250, 0.70…
$ playlist_subgenre <fct> classic, classic, classic, classic, classic, classic…
$ popularity <fct> low, low, low, low, low, low, low, low, low, low, lo…
$ country <fct> US, US, US, US, CA, US, US, US, US, US, US, US, US, …
$ release_year <dbl> 2005, 2015, 2016, 2007, 2000, 2015, 1997, 1990, 1992…
# ===== 4. Train-test split ===================================================
set.seed (42 )
splits <- initial_split (df, strata = popularity, prop = 0.80 )
train <- training (splits)
test <- testing (splits)
# ===== 5. Recipe (前処理) ====================================================
rec <- recipe (popularity ~ ., data = train) %>%
step_unknown (all_nominal_predictors ()) %>% # 欠損カテゴリを "unknown"
step_other (all_nominal_predictors (), threshold = 0.01 ) %>% # 超レアをまとめる
step_normalize (all_numeric_predictors ()) %>% # 数値を平均0・分散1
step_dummy (all_nominal_predictors (), one_hot = TRUE ) # one-hot エンコーディング
# ===== 6. Model spec =========================================================
logreg_spec <- logistic_reg () %>%
set_engine ("glm" ) %>%
set_mode ("classification" )
# ===== 7. Workflow ===========================================================
wf <- workflow () %>%
add_recipe (rec) %>%
add_model (logreg_spec)
# ===== 8. Fit model ==========================================================
fit <- wf %>% fit (data = train)
# ===== 9. Predict probabilities on test set ==================================
# 予測確率 p(high)
pred <- predict (fit, test, type = "prob" ) %>%
bind_cols (test %>% select (popularity))
# ===== 10. Performance metrics ==============================================
## 10-1. AUC ("high" を陽性クラスにする)
auc_res <- roc_auc (pred, truth = popularity, .pred_high, event_level = "second" )
print (auc_res)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.920
## 10-2. ROC curve
roc_curve (pred, truth = popularity, .pred_high, event_level = "second" ) %>%
autoplot () +
ggtitle ("ROC Curve – Logistic Regression" )
## 10-3. Class prediction (threshold = 0.5)
pred <- pred %>%
mutate (pred_class = factor (if_else (.pred_high > 0.5 , "high" , "low" ),
levels = c ("low" , "high" )))