# Start from an empty workspace. `all.names = TRUE` is spelled out in full
# (no partial argument matching, no reassignable `T`) so that objects whose
# names begin with a dot are removed as well.
rm(list = ls(all.names = TRUE))
# Print numbers with 4 significant digits and penalise scientific notation
# so that small p-values are shown in fixed notation.
options(digits = 4, scipen = 12)
library(dplyr); library(ggplot2)

Introduction

議題:使用歌曲的屬性,預測它會不會進入流行歌曲排行榜的前10名

學習重點:



1 基本的資料處理 Understanding the Data

1.1】How many observations (songs) are from the year 2010?

# Load the song data set; the path is relative to the working directory.
songs <- read.csv("Unit3/songs.csv")
library(dplyr)
# Q1.1: number of songs from the year 2010. The filter is an exact match on
# 2010 (not `>= 2010`) so that it answers the question as asked and agrees
# with the test-set definition used later (`year == 2010`).
songs %>%
  filter(year == 2010) %>%
  nrow()
## [1] 373

1.2】How many songs does the dataset include for which the artist name is “Michael Jackson”?

# Q1.2: count the rows whose artist is exactly "Michael Jackson".
# Rows with a missing artist name are ignored.
sum(songs$artistname == "Michael Jackson", na.rm = TRUE)
## [1] 18

1.3】Which of these songs by Michael Jackson made it to the Top 10? Select all that apply.

# Q1.3: Michael Jackson songs that reached the Top 10.
# Top10 is a 0/1 indicator, so it is compared with the number 1 instead of
# the string "1" (which only worked through implicit type coercion).
top_M <- subset(songs, artistname == "Michael Jackson" & Top10 == 1)
# Show only the title and the Top10 flag of the matching songs.
top_M[c("songtitle", "Top10")]

1.4】(a) What are the values of timesignature that occur in our dataset? (b) Which timesignature value is the most frequent among songs in our dataset?

# Q1.4: frequency of each time-signature value in the data set.
table(songs[["timesignature"]])
## 
##    0    1    3    4    5    7 
##   10  143  503 6787  112   19
#0,1,3,4,5,7
#4

1.5】 Which of the following songs has the highest tempo?

# Q1.5: locate the song with the highest tempo. The row index is kept in a
# variable and reused, instead of retyping the printed number by hand, so the
# lookup stays correct if the data set changes.
fastest_row <- which.max(songs$tempo)
fastest_row
## [1] 6206
songs$songtitle[fastest_row]
## [1] Wanna Be Startin' Somethin'
## 7141 Levels: \x91u_ Creias? _\x84\x8d '03 Bonnie & Clyde ... Zumbi



2 建立模型 Creating Our Prediction Model

2.1 依時間分割資料】How many observations (songs) are in the training set?

# Q2.1: split by release year. Songs up to and including 2009 form the
# training set; songs from 2010 form the test set. `which()` drops rows with
# a missing year, as `subset()` does.
songtr <- songs[which(songs$year <= 2009), ]
songts <- songs[which(songs$year == 2010), ]
#7201

2.2 建立模型、模型摘要】What is the value of the Akaike Information Criterion (AIC)?

# Columns that are excluded before modelling.
nonvars <- c("year", "songtitle", "artistname", "songID", "artistID")
# Keep every column whose name is not listed in `nonvars`, in both the
# training and the test set (setdiff() returns the names of the first
# argument that do not appear in the second).
songtr <- songtr[, setdiff(names(songtr), nonvars)]
songts <- songts[, setdiff(names(songts), nonvars)]
# Model 1: logistic regression of Top10 on all remaining columns.
model1 <- glm(
  Top10 ~ .,
  family = binomial,
  data = songtr
)
summary(model1)
## 
## Call:
## glm(formula = Top10 ~ ., family = binomial, data = songtr)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.922  -0.540  -0.346  -0.184   3.077  
## 
## Coefficients:
##                             Estimate  Std. Error z value
## (Intercept)               14.6999882   1.8063875    8.14
## timesignature              0.1263948   0.0867357    1.46
## timesignature_confidence   0.7449923   0.1953053    3.81
## loudness                   0.2998794   0.0291654   10.28
## tempo                      0.0003634   0.0016915    0.21
## tempo_confidence           0.4732270   0.1421740    3.33
## key                        0.0158820   0.0103895    1.53
## key_confidence             0.3086751   0.1411562    2.19
## energy                    -1.5021445   0.3099240   -4.85
## pitch                    -44.9077399   6.8348831   -6.57
## timbre_0_min               0.0231589   0.0042562    5.44
## timbre_0_max              -0.3309820   0.0256926  -12.88
## timbre_1_min               0.0058810   0.0007798    7.54
## timbre_1_max              -0.0002449   0.0007152   -0.34
## timbre_2_min              -0.0021274   0.0011260   -1.89
## timbre_2_max               0.0006586   0.0009066    0.73
## timbre_3_min               0.0006920   0.0005985    1.16
## timbre_3_max              -0.0029673   0.0005815   -5.10
## timbre_4_min               0.0103956   0.0019850    5.24
## timbre_4_max               0.0061105   0.0015503    3.94
## timbre_5_min              -0.0055980   0.0012767   -4.38
## timbre_5_max               0.0000774   0.0007935    0.10
## timbre_6_min              -0.0168562   0.0022640   -7.45
## timbre_6_max               0.0036681   0.0021895    1.68
## timbre_7_min              -0.0045492   0.0017815   -2.55
## timbre_7_max              -0.0037737   0.0018320   -2.06
## timbre_8_min               0.0039110   0.0028510    1.37
## timbre_8_max               0.0040113   0.0030030    1.34
## timbre_9_min               0.0013673   0.0029981    0.46
## timbre_9_max               0.0016027   0.0024336    0.66
## timbre_10_min              0.0041263   0.0018391    2.24
## timbre_10_max              0.0058250   0.0017694    3.29
## timbre_11_min             -0.0262523   0.0036933   -7.11
## timbre_11_max              0.0196734   0.0033855    5.81
##                                    Pr(>|z|)    
## (Intercept)              0.0000000000000004 ***
## timesignature                       0.14505    
## timesignature_confidence            0.00014 ***
## loudness                            < 2e-16 ***
## tempo                               0.82989    
## tempo_confidence                    0.00087 ***
## key                                 0.12635    
## key_confidence                      0.02876 *  
## energy                   0.0000012545913310 ***
## pitch                    0.0000000000501890 ***
## timbre_0_min             0.0000000529331342 ***
## timbre_0_max                        < 2e-16 ***
## timbre_1_min             0.0000000000000464 ***
## timbre_1_max                        0.73209    
## timbre_2_min                        0.05884 .  
## timbre_2_max                        0.46757    
## timbre_3_min                        0.24758    
## timbre_3_max             0.0000003344570390 ***
## timbre_4_min             0.0000001632385067 ***
## timbre_4_max             0.0000809670432888 ***
## timbre_5_min             0.0000116146773897 ***
## timbre_5_max                        0.92234    
## timbre_6_min             0.0000000000000966 ***
## timbre_6_max                        0.09388 .  
## timbre_7_min                        0.01066 *  
## timbre_7_max                        0.03941 *  
## timbre_8_min                        0.17012    
## timbre_8_max                        0.18162    
## timbre_9_min                        0.64836    
## timbre_9_max                        0.51019    
## timbre_10_min                       0.02485 *  
## timbre_10_max                       0.00099 ***
## timbre_11_min            0.0000000000011760 ***
## timbre_11_max            0.0000000062068661 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6017.5  on 7200  degrees of freedom
## Residual deviance: 4759.2  on 7167  degrees of freedom
## AIC: 4827
## 
## Number of Fisher Scoring iterations: 6
#4827.2

2.3 模型係數判讀】The LOWER or HIGHER our confidence about time signature, key and tempo, the more likely the song is to be in the Top 10

#The higher our confidence about time signature, key and tempo, the more likely the song is to be in the Top 10. 

2.4 進行推論】What does Model 1 suggest in terms of complexity?

#Mainstream listeners tend to prefer less complex songs. 

2.5 檢查異常係數】 (a) By inspecting the coefficient of the variable “loudness”, what does Model 1 suggest? (b) By inspecting the coefficient of the variable “energy”, do we draw the same conclusions as above?

#a)Mainstream listeners prefer songs with heavy instrumentation. 因為loudness的係數為正
#b)no.因為energy的係數為負.



3 處理共線性 Beware of Multicollinearity Issues!

3.1 檢查相關係數】What is the correlation between loudness and energy in the training set?

# Q3.1: correlation between loudness and energy in the training set.
with(songtr, cor(loudness, energy))
## [1] 0.7399

3.2 重新建立模型、檢查係數】Look at the summary of SongsLog2, and inspect the coefficient of the variable “energy”. What do you observe?

# Model 2: same specification as Model 1 with `loudness` removed from the
# predictors.
model2 <- glm(
  Top10 ~ . - loudness,
  family = binomial,
  data = songtr
)
summary(model2)
## 
## Call:
## glm(formula = Top10 ~ . - loudness, family = binomial, data = songtr)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.098  -0.561  -0.360  -0.190   3.311  
## 
## Coefficients:
##                            Estimate Std. Error z value           Pr(>|z|)
## (Intercept)               -2.240612   0.746484   -3.00            0.00269
## timesignature              0.162461   0.087341    1.86            0.06287
## timesignature_confidence   0.688471   0.192419    3.58            0.00035
## tempo                      0.000552   0.001665    0.33            0.74023
## tempo_confidence           0.549657   0.140736    3.91 0.0000940005473689
## key                        0.017403   0.010256    1.70            0.08974
## key_confidence             0.295367   0.139446    2.12            0.03416
## energy                     0.181260   0.260768    0.70            0.48699
## pitch                    -51.498579   6.856544   -7.51 0.0000000000000587
## timbre_0_min               0.024789   0.004240    5.85 0.0000000050055433
## timbre_0_max              -0.100697   0.011776   -8.55            < 2e-16
## timbre_1_min               0.007143   0.000771    9.27            < 2e-16
## timbre_1_max              -0.000783   0.000706   -1.11            0.26765
## timbre_2_min              -0.001579   0.001109   -1.42            0.15453
## timbre_2_max               0.000389   0.000896    0.43            0.66443
## timbre_3_min               0.000650   0.000595    1.09            0.27452
## timbre_3_max              -0.002462   0.000567   -4.34 0.0000143015554481
## timbre_4_min               0.009115   0.001952    4.67 0.0000030176578261
## timbre_4_max               0.006306   0.001532    4.12 0.0000387139806484
## timbre_5_min              -0.005641   0.001255   -4.50 0.0000069522013076
## timbre_5_max               0.000694   0.000781    0.89            0.37426
## timbre_6_min              -0.016122   0.002235   -7.21 0.0000000000005452
## timbre_6_max               0.003814   0.002157    1.77            0.07698
## timbre_7_min              -0.005102   0.001755   -2.91            0.00364
## timbre_7_max              -0.003158   0.001811   -1.74            0.08109
## timbre_8_min               0.004488   0.002810    1.60            0.11025
## timbre_8_max               0.006422   0.002950    2.18            0.02950
## timbre_9_min              -0.000428   0.002955   -0.14            0.88479
## timbre_9_max               0.003525   0.002377    1.48            0.13802
## timbre_10_min              0.002993   0.001804    1.66            0.09700
## timbre_10_max              0.007367   0.001731    4.25 0.0000209292079939
## timbre_11_min             -0.028370   0.003630   -7.82 0.0000000000000055
## timbre_11_max              0.018294   0.003341    5.48 0.0000000434235974
##                             
## (Intercept)              ** 
## timesignature            .  
## timesignature_confidence ***
## tempo                       
## tempo_confidence         ***
## key                      .  
## key_confidence           *  
## energy                      
## pitch                    ***
## timbre_0_min             ***
## timbre_0_max             ***
## timbre_1_min             ***
## timbre_1_max                
## timbre_2_min                
## timbre_2_max                
## timbre_3_min                
## timbre_3_max             ***
## timbre_4_min             ***
## timbre_4_max             ***
## timbre_5_min             ***
## timbre_5_max                
## timbre_6_min             ***
## timbre_6_max             .  
## timbre_7_min             ** 
## timbre_7_max             .  
## timbre_8_min                
## timbre_8_max             *  
## timbre_9_min                
## timbre_9_max                
## timbre_10_min            .  
## timbre_10_max            ***
## timbre_11_min            ***
## timbre_11_max            ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6017.5  on 7200  degrees of freedom
## Residual deviance: 4871.8  on 7168  degrees of freedom
## AIC: 4938
## 
## Number of Fisher Scoring iterations: 6
#Model 2 suggests that songs with high energy levels tend to be more popular. This contradicts our observation in Model 1. 但energy的係數在Model 2中明明就不顯著了(p = 0.487)...

3.3 選擇模型】 Do we make the same observation about the popularity of heavy instrumentation as we did with Model 2?

# Model 3: same specification as Model 1 with `energy` removed from the
# predictors (loudness is kept).
model3 <- glm(
  Top10 ~ . - energy,
  family = binomial,
  data = songtr
)
summary(model3)
## 
## Call:
## glm(formula = Top10 ~ . - energy, family = binomial, data = songtr)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.918  -0.542  -0.348  -0.187   3.417  
## 
## Coefficients:
##                             Estimate  Std. Error z value
## (Intercept)               11.9605621   1.7141947    6.98
## timesignature              0.1150942   0.0872615    1.32
## timesignature_confidence   0.7142698   0.1946175    3.67
## loudness                   0.2305565   0.0252798    9.12
## tempo                     -0.0006460   0.0016655   -0.39
## tempo_confidence           0.3840930   0.1398350    2.75
## key                        0.0164946   0.0103514    1.59
## key_confidence             0.3394064   0.1408744    2.41
## pitch                    -53.2840575   6.7328544   -7.91
## timbre_0_min               0.0220452   0.0042394    5.20
## timbre_0_max              -0.3104800   0.0253654  -12.24
## timbre_1_min               0.0054160   0.0007643    7.09
## timbre_1_max              -0.0005115   0.0007110   -0.72
## timbre_2_min              -0.0022544   0.0011203   -2.01
## timbre_2_max               0.0004119   0.0009020    0.46
## timbre_3_min               0.0003179   0.0005869    0.54
## timbre_3_max              -0.0029637   0.0005758   -5.15
## timbre_4_min               0.0110465   0.0019779    5.58
## timbre_4_max               0.0064668   0.0015413    4.20
## timbre_5_min              -0.0051345   0.0012690   -4.05
## timbre_5_max               0.0002979   0.0007856    0.38
## timbre_6_min              -0.0178447   0.0022460   -7.94
## timbre_6_max               0.0034469   0.0021821    1.58
## timbre_7_min              -0.0051284   0.0017685   -2.90
## timbre_7_max              -0.0033935   0.0018198   -1.86
## timbre_8_min               0.0036861   0.0028331    1.30
## timbre_8_max               0.0046578   0.0029879    1.56
## timbre_9_min              -0.0000932   0.0029569   -0.03
## timbre_9_max               0.0013417   0.0024239    0.55
## timbre_10_min              0.0040500   0.0018270    2.22
## timbre_10_max              0.0057925   0.0017586    3.29
## timbre_11_min             -0.0263767   0.0036829   -7.16
## timbre_11_max              0.0198361   0.0033646    5.90
##                                    Pr(>|z|)    
## (Intercept)              0.0000000000030077 ***
## timesignature                       0.18718    
## timesignature_confidence            0.00024 ***
## loudness                            < 2e-16 ***
## tempo                               0.69811    
## tempo_confidence                    0.00602 ** 
## key                                 0.11106    
## key_confidence                      0.01598 *  
## pitch                    0.0000000000000025 ***
## timbre_0_min             0.0000001992236315 ***
## timbre_0_max                        < 2e-16 ***
## timbre_1_min             0.0000000000013757 ***
## timbre_1_max                        0.47193    
## timbre_2_min                        0.04419 *  
## timbre_2_max                        0.64791    
## timbre_3_min                        0.58808    
## timbre_3_max             0.0000002640646649 ***
## timbre_4_min             0.0000000233875661 ***
## timbre_4_max             0.0000272139788370 ***
## timbre_5_min             0.0000520513667576 ***
## timbre_5_max                        0.70453    
## timbre_6_min             0.0000000000000019 ***
## timbre_6_max                        0.11420    
## timbre_7_min                        0.00373 ** 
## timbre_7_max                        0.06221 .  
## timbre_8_min                        0.19323    
## timbre_8_max                        0.11902    
## timbre_9_min                        0.97486    
## timbre_9_max                        0.57990    
## timbre_10_min                       0.02664 *  
## timbre_10_max                       0.00099 ***
## timbre_11_min            0.0000000000007958 ***
## timbre_11_max            0.0000000037350899 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6017.5  on 7200  degrees of freedom
## Residual deviance: 4782.7  on 7168  degrees of freedom
## AIC: 4849
## 
## Number of Fisher Scoring iterations: 6
#Yes. 所以說大家現在喜歡夠大聲、power夠強的歌...



4 驗證模型 Validating Our Model

4.1 正確性】What is the accuracy of Model 3 on the test set, using a threshold of 0.45?

# Predicted Top 10 probabilities from Model 3 for the test set.
pred1 <- predict(model3, newdata = songts, type = "response")
# Confusion matrix at a threshold of 0.45 (rows: actual, columns: predicted).
table(songts$Top10, pred1 >= 0.45)
##    
##     FALSE TRUE
##   0   309    5
##   1    40   19
# Accuracy = share of test songs whose predicted class equals the actual
# class. It is computed from the data rather than from counts retyped from
# the printed table, so it stays correct if the model or threshold changes.
mean((pred1 >= 0.45) == (songts$Top10 == 1))
## [1] 0.8794

4.2 底線正確率】What would the accuracy of the baseline model be on the test set?

# The baseline model uses no predictors: it predicts that no song makes the
# Top 10.
table(songts$Top10)
## 
##   0   1 
## 314  59
# Baseline accuracy = share of test songs that are not Top 10 hits, computed
# from the data rather than from counts retyped from the table above.
mean(songts$Top10 == 0)
## [1] 0.8418
#model3(沒有留energy)的ACC比起baseline的ACC要高一些

4.3 正確性 vs. 辨識率】How many songs does Model 3 correctly predict as Top 10 hits in 2010? How many non-hit songs does Model 3 predict will be Top 10 hits?

#19首正確預測是top 10
#5首不是top10,但被預測為是

Q】不能大幅度增加正確性的模型也會有用嗎?為甚麼? 有用。雖然正確率只比底線正確率高一點,但它能正確預測哪些“不是”Top 10,一樣很有用。

4.4 敏感性 & 明確性】What is the sensitivity and specificity of Model 3 on the test set, using a threshold of 0.45?

# Actual and predicted classes on the test set (threshold 0.45), derived from
# the data instead of counts retyped from the confusion matrix.
is_hit <- songts$Top10 == 1
flagged <- pred1 >= 0.45
# Sensitivity: share of actual Top 10 songs that the model flags as hits.
sen <- sum(flagged & is_hit) / sum(is_hit); sen # 0.32
## [1] 0.322
# Specificity: share of non-hit songs that the model leaves unflagged.
spe <- sum(!flagged & !is_hit) / sum(!is_hit); spe # 0.98
## [1] 0.9841

4.5 結論】What conclusions can you make about our model?

# Model 3 favors specificity over sensitivity.
# Model 3 provides conservative predictions, and predicts that a song will make it to the Top 10 very rarely. So while it detects less than half of the Top 10 songs, we can be very confident in the songs that it does predict to be Top 10 hits.


Q】從這個結論我們學到什麼? 這個模型的ACC與specificity都很高,但sensitivity很低,因此不能單以ACC或specificity來判斷模型是不是有用。例如:當positive的比例很低時,只要全部預測為negative也能得到很高的ACC與specificity,因此不能只看其中一個指標。