# Start from a clean workspace: remove every object, including hidden
# (dot-prefixed) ones, from the current environment.
# `all.names` is spelled out and `TRUE` is used instead of the reassignable `T`.
# NOTE(review): wiping the workspace inside a script surprises anyone who
# sources it into a live session; prefer running in a fresh R session.
rm(list = ls(all.names = TRUE))
Warning messages:
1: In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
2: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
3: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
4: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
5: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
# Session setup: print 4 significant digits and bias number formatting
# towards fixed (non-scientific) notation.
options(scipen = 12, digits = 4)
# Attach the packages loaded by this analysis.
library(dplyr)
library(ggplot2)
# Switch every locale category to the "C" locale.
Sys.setlocale("LC_ALL", "C")
[1] "C"
議題:使用歌曲的屬性,預測它會不會進入流行歌曲排行榜的前10名
學習重點:
【1.1】How many observations (songs) are from the year 2010?
# Load the song data set from the project's data directory.
Song <- read.csv(file = "data/songs.csv")
# Count the rows whose `year` equals 2010.
sum(Song[["year"]] == 2010)
[1] 373
【1.2】How many songs does the dataset include for which the artist name is “Michael Jackson”?
# Count the rows whose artist name is exactly "Michael Jackson".
sum(Song[["artistname"]] == "Michael Jackson")
[1] 18
【1.3】Which of these songs by Michael Jackson made it to the Top 10? Select all that apply.
# Show the first three columns of the Michael Jackson rows flagged Top10 == 1.
Song[
  with(Song, Top10 == 1 & artistname == "Michael Jackson"),
  seq_len(3)
]
【1.4】(a) What are the values of timesignature that occur in our dataset? (b) Which timesignature value is the most frequent among songs in our dataset?
# Tabulate how many rows carry each timesignature value.
table(Song$timesignature)
0 1 3 4 5 7
10 143 503 6787 112 19
"the value=4 is most frequent "
[1] "the value=4 is most frequent "
【1.5】 Which of the following songs has the highest tempo?
# Show the first three columns of the row with the largest tempo
# (which.max returns the first such row when there are ties).
Song[which.max(Song[["tempo"]]), seq_len(3)]
【2.1 依時間分割資料】How many observations (songs) are in the training set?
# Time-based split: rows with year <= 2009 form the training set,
# rows with year >= 2010 form the test set.
SongsTrain <- subset(Song, subset = year <= 2009)
SongsTest <- subset(Song, subset = year >= 2010)
# Number of training observations.
nrow(SongsTrain)
[1] 7201
【2.2 建立模型、模型摘要】What is the value of the Akaike Information Criterion (AIC)?
# Columns excluded from modelling (the original note describes them as
# variables with little influence on the prediction result).
nonvars <- c("year", "songtitle", "artistname", "songID", "artistID")
# Keep only the columns whose names are not listed in `nonvars`.
SongsTrain <- SongsTrain[, is.na(match(names(SongsTrain), nonvars))]
SongsTest <- SongsTest[, is.na(match(names(SongsTest), nonvars))]
# Model 1: logistic regression of Top10 on every remaining column.
SongsLog1 <- glm(
  formula = Top10 ~ .,
  family = binomial,
  data = SongsTrain
)
summary(SongsLog1)
Call:
glm(formula = Top10 ~ ., family = binomial, data = SongsTrain)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.922 -0.540 -0.346 -0.184 3.077
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 14.6999882 1.8063875 8.14 0.0000000000000004 ***
timesignature 0.1263948 0.0867357 1.46 0.14505
timesignature_confidence 0.7449923 0.1953053 3.81 0.00014 ***
loudness 0.2998794 0.0291654 10.28 < 2e-16 ***
tempo 0.0003634 0.0016915 0.21 0.82989
tempo_confidence 0.4732270 0.1421740 3.33 0.00087 ***
key 0.0158820 0.0103895 1.53 0.12635
key_confidence 0.3086751 0.1411562 2.19 0.02876 *
energy -1.5021445 0.3099240 -4.85 0.0000012545913310 ***
pitch -44.9077399 6.8348831 -6.57 0.0000000000501890 ***
timbre_0_min 0.0231589 0.0042562 5.44 0.0000000529331342 ***
timbre_0_max -0.3309820 0.0256926 -12.88 < 2e-16 ***
timbre_1_min 0.0058810 0.0007798 7.54 0.0000000000000464 ***
timbre_1_max -0.0002449 0.0007152 -0.34 0.73209
timbre_2_min -0.0021274 0.0011260 -1.89 0.05884 .
timbre_2_max 0.0006586 0.0009066 0.73 0.46757
timbre_3_min 0.0006920 0.0005985 1.16 0.24758
timbre_3_max -0.0029673 0.0005815 -5.10 0.0000003344570390 ***
timbre_4_min 0.0103956 0.0019850 5.24 0.0000001632385067 ***
timbre_4_max 0.0061105 0.0015503 3.94 0.0000809670432888 ***
timbre_5_min -0.0055980 0.0012767 -4.38 0.0000116146773897 ***
timbre_5_max 0.0000774 0.0007935 0.10 0.92234
timbre_6_min -0.0168562 0.0022640 -7.45 0.0000000000000966 ***
timbre_6_max 0.0036681 0.0021895 1.68 0.09388 .
timbre_7_min -0.0045492 0.0017815 -2.55 0.01066 *
timbre_7_max -0.0037737 0.0018320 -2.06 0.03941 *
timbre_8_min 0.0039110 0.0028510 1.37 0.17012
timbre_8_max 0.0040113 0.0030030 1.34 0.18162
timbre_9_min 0.0013673 0.0029981 0.46 0.64836
timbre_9_max 0.0016027 0.0024336 0.66 0.51019
timbre_10_min 0.0041263 0.0018391 2.24 0.02485 *
timbre_10_max 0.0058250 0.0017694 3.29 0.00099 ***
timbre_11_min -0.0262523 0.0036933 -7.11 0.0000000000011760 ***
timbre_11_max 0.0196734 0.0033855 5.81 0.0000000062068661 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6017.5 on 7200 degrees of freedom
Residual deviance: 4759.2 on 7167 degrees of freedom
AIC: 4827
Number of Fisher Scoring iterations: 6
# Read the AIC from the fitted Model 1 instead of retyping the number shown
# in the summary output, so the answer cannot drift from the model.
AIC(SongsLog1)
[1] 4827
【2.3 模型係數判讀】The LOWER or HIGHER our confidence about time signature, key and tempo, the more likely the song is to be in the Top 10
"higher 用係數正負判斷,若是為正且越高的話成為TOP10的機會會越高"
[1] "higher 用係數正負判斷,若是為正且越高的話成為TOP10的機會會越高"
【2.4 進行推論】What does Model 1 suggest in terms of complexity?
"題目告知如果係數值越低,複雜度會越高,由Model 1可知人們傾向聽比較不複雜的"
[1] "題目告知如果係數值越低,複雜度會越高,由Model 1可知人們傾向聽比較不複雜的"
【2.5 檢查異常係數】 (a) By inspecting the coefficient of the variable “loudness”, what does Model 1 suggest? (b) By inspecting the coefficient of the variable “energy”, do we draw the same conclusions as above?
#a
"係數為正且偏高,人們傾向重金屬音樂"
[1] "係數為正且偏高,人們傾向重金屬音樂"
#b
"係數為負且偏高,人們不喜歡有能量的音樂,我們認為loudness與energy是有正相關的,結論與上述不同"
[1] "係數為負且偏高,人們不喜歡有能量的音樂,我們認為loudness與energy是有正相關的,結論與上述不同"
【3.1 檢查相關係數】What is the correlation between loudness and energy in the training set?
# Correlation between loudness and energy in the training set.
with(SongsTrain, cor(loudness, energy))
[1] 0.7399
【3.2 重新建立模型、檢查係數】Look at the summary of SongsLog2, and inspect the coefficient of the variable “energy”. What do you observe?
# Model 2: logistic regression on all columns of SongsTrain except loudness.
SongsLog2 <- glm(
  formula = Top10 ~ . - loudness,
  family = binomial,
  data = SongsTrain
)
summary(SongsLog2)
Call:
glm(formula = Top10 ~ . - loudness, family = binomial, data = SongsTrain)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.098 -0.561 -0.360 -0.190 3.311
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.240612 0.746484 -3.00 0.00269 **
timesignature 0.162461 0.087341 1.86 0.06287 .
timesignature_confidence 0.688471 0.192419 3.58 0.00035 ***
tempo 0.000552 0.001665 0.33 0.74023
tempo_confidence 0.549657 0.140736 3.91 0.0000940005473689 ***
key 0.017403 0.010256 1.70 0.08974 .
key_confidence 0.295367 0.139446 2.12 0.03416 *
energy 0.181260 0.260768 0.70 0.48699
pitch -51.498579 6.856544 -7.51 0.0000000000000587 ***
timbre_0_min 0.024789 0.004240 5.85 0.0000000050055433 ***
timbre_0_max -0.100697 0.011776 -8.55 < 2e-16 ***
timbre_1_min 0.007143 0.000771 9.27 < 2e-16 ***
timbre_1_max -0.000783 0.000706 -1.11 0.26765
timbre_2_min -0.001579 0.001109 -1.42 0.15453
timbre_2_max 0.000389 0.000896 0.43 0.66443
timbre_3_min 0.000650 0.000595 1.09 0.27452
timbre_3_max -0.002462 0.000567 -4.34 0.0000143015554481 ***
timbre_4_min 0.009115 0.001952 4.67 0.0000030176578261 ***
timbre_4_max 0.006306 0.001532 4.12 0.0000387139806484 ***
timbre_5_min -0.005641 0.001255 -4.50 0.0000069522013076 ***
timbre_5_max 0.000694 0.000781 0.89 0.37426
timbre_6_min -0.016122 0.002235 -7.21 0.0000000000005452 ***
timbre_6_max 0.003814 0.002157 1.77 0.07698 .
timbre_7_min -0.005102 0.001755 -2.91 0.00364 **
timbre_7_max -0.003158 0.001811 -1.74 0.08109 .
timbre_8_min 0.004488 0.002810 1.60 0.11025
timbre_8_max 0.006422 0.002950 2.18 0.02950 *
timbre_9_min -0.000428 0.002955 -0.14 0.88479
timbre_9_max 0.003525 0.002377 1.48 0.13802
timbre_10_min 0.002993 0.001804 1.66 0.09700 .
timbre_10_max 0.007367 0.001731 4.25 0.0000209292079939 ***
timbre_11_min -0.028370 0.003630 -7.82 0.0000000000000055 ***
timbre_11_max 0.018294 0.003341 5.48 0.0000000434235974 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6017.5 on 7200 degrees of freedom
Residual deviance: 4871.8 on 7168 degrees of freedom
AIC: 4938
Number of Fisher Scoring iterations: 6
"energy的值由負值轉為正值,有高能量的歌曲更受歡迎"
[1] "energy的值由負值轉為正值,有高能量的歌曲更受歡迎"
【3.3 選擇模型】 Do we make the same observation about the popularity of heavy instrumentation as we did with Model 2?
# Model 3: logistic regression on all columns of SongsTrain except energy.
SongsLog3 <- glm(
  formula = Top10 ~ . - energy,
  family = binomial,
  data = SongsTrain
)
summary(SongsLog3)
Call:
glm(formula = Top10 ~ . - energy, family = binomial, data = SongsTrain)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.918 -0.542 -0.348 -0.187 3.417
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 11.9605621 1.7141947 6.98 0.0000000000030077 ***
timesignature 0.1150942 0.0872615 1.32 0.18718
timesignature_confidence 0.7142698 0.1946175 3.67 0.00024 ***
loudness 0.2305565 0.0252798 9.12 < 2e-16 ***
tempo -0.0006460 0.0016655 -0.39 0.69811
tempo_confidence 0.3840930 0.1398350 2.75 0.00602 **
key 0.0164946 0.0103514 1.59 0.11106
key_confidence 0.3394064 0.1408744 2.41 0.01598 *
pitch -53.2840575 6.7328544 -7.91 0.0000000000000025 ***
timbre_0_min 0.0220452 0.0042394 5.20 0.0000001992236315 ***
timbre_0_max -0.3104800 0.0253654 -12.24 < 2e-16 ***
timbre_1_min 0.0054160 0.0007643 7.09 0.0000000000013757 ***
timbre_1_max -0.0005115 0.0007110 -0.72 0.47193
timbre_2_min -0.0022544 0.0011203 -2.01 0.04419 *
timbre_2_max 0.0004119 0.0009020 0.46 0.64791
timbre_3_min 0.0003179 0.0005869 0.54 0.58808
timbre_3_max -0.0029637 0.0005758 -5.15 0.0000002640646649 ***
timbre_4_min 0.0110465 0.0019779 5.58 0.0000000233875661 ***
timbre_4_max 0.0064668 0.0015413 4.20 0.0000272139788370 ***
timbre_5_min -0.0051345 0.0012690 -4.05 0.0000520513667576 ***
timbre_5_max 0.0002979 0.0007856 0.38 0.70453
timbre_6_min -0.0178447 0.0022460 -7.94 0.0000000000000019 ***
timbre_6_max 0.0034469 0.0021821 1.58 0.11420
timbre_7_min -0.0051284 0.0017685 -2.90 0.00373 **
timbre_7_max -0.0033935 0.0018198 -1.86 0.06221 .
timbre_8_min 0.0036861 0.0028331 1.30 0.19323
timbre_8_max 0.0046578 0.0029879 1.56 0.11902
timbre_9_min -0.0000932 0.0029569 -0.03 0.97486
timbre_9_max 0.0013417 0.0024239 0.55 0.57990
timbre_10_min 0.0040500 0.0018270 2.22 0.02664 *
timbre_10_max 0.0057925 0.0017586 3.29 0.00099 ***
timbre_11_min -0.0263767 0.0036829 -7.16 0.0000000000007958 ***
timbre_11_max 0.0198361 0.0033646 5.90 0.0000000037350899 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6017.5 on 7200 degrees of freedom
Residual deviance: 4782.7 on 7168 degrees of freedom
AIC: 4849
Number of Fisher Scoring iterations: 6
"扣掉energy後,loudness的係數一樣為正,故可以知道energy與loudness有共線性所以導致原先的模型不準"
[1] "扣掉energy後,loudness的係數一樣為正,故可以知道energy與loudness有共線性所以導致原先的模型不準"
【4.1 正確性】What is the accuracy of Model 3 on the test set, using a threshold of 0.45?
# Predicted Top 10 probabilities for the test set under Model 3.
predict3 <- predict(SongsLog3, newdata = SongsTest, type = "response")
# Confusion matrix at the 0.45 threshold:
# rows are the actual label, columns are the thresholded prediction.
x <- table(
  actual = SongsTest$Top10,
  predict = predict3 >= 0.45
)
x
predict
actual FALSE TRUE
0 309 5
1 40 19
# Test-set accuracy of Model 3 at the 0.45 threshold: the share of test rows
# whose thresholded prediction agrees with the actual Top10 label (coded 0/1).
# Computed from the predictions rather than from counts copied by hand out of
# the confusion matrix, so it stays correct if the data or threshold change.
mean((predict3 >= 0.45) == (SongsTest$Top10 == 1))
[1] 0.8794
【4.2 底線正確率】What would the accuracy of the baseline model be on the test set?
# Baseline accuracy: one minus the mean of the 0/1 Top10 label in the test
# set, i.e. the accuracy of always predicting "not Top 10".
with(SongsTest, 1 - mean(Top10))
[1] 0.8418
【4.3 正確性 vs. 辨識率】How many songs does Model 3 correctly predict as Top 10 hits in 2010? How many non-hit songs does Model 3 predict will be Top 10 hits?
# (a) True positives: actual = 1 and predicted = 1, read directly from the
# confusion matrix `x` instead of being retyped by hand.
x["1", "TRUE"]
[1] 19
# (b) False positives: actual = 0 but predicted = 1, read directly from the
# confusion matrix `x` instead of being retyped by hand.
x["0", "TRUE"]
[1] 5
【Q】不能大幅度增加正確性的模型也會有用嗎?為甚麼? 有用,因為評估模型時除了正確性之外,還可以參考敏感性、明確性等其他指標
【4.4 敏感性 & 明確性】What is the sensitivity and specificity of Model 3 on the test set, using a threshold of 0.45?
# Sensitivity (true-positive rate): TP / (TP + FN)
sensitivity <- 19 / sum(19, 40)
# Specificity (true-negative rate): TN / (FP + TN)
specificity <- 309 / sum(5, 309)
sensitivity
[1] 0.322
# Print the specificity value computed earlier.
specificity
[1] 0.9841
【4.5 結論】What conclusions can you make about our model?
"specificity很高 sensitivity很低,但整體預測的正確率很高"
[1] "specificity很高 sensitivity很低,但整體預測的正確率很高"
"由模型可知預測較為保守,很少預測這首歌會不會進入TOP10,但只要預測為TOP10,正確率會很高"
[1] "由模型可知預測較為保守,很少預測這首歌會不會進入TOP10,但只要預測為TOP10,正確率會很高"
【Q】從這個結論我們學到什麼?
模型做出來後會有很多參考值,應該綜合觀察,才能得出適當的結論