#############################################################
# Data prep
library("rio")
x <- import("https://docs.google.com/spreadsheets/d/1h7XhLd2Byp4OcXSdtxHly9iS7RA673RJ/export?format=csv&gid=1083477596")
x$commentCount <- as.integer(x$commentCount)
x$viewsCount <- as.numeric(x$viewsCount)
# Replace decimal commas with points, then cast the audio features to numeric
audio_cols <- c("acousticness", "danceability", "energy", "instrumentalness",
                "liveness", "loudness", "speechiness", "tempo", "valence")
x[audio_cols] <- lapply(x[audio_cols], function(v) as.numeric(sub(",", ".", v, fixed = TRUE)))
x$key <- as.factor(x$key)
x$time_signature <- as.factor(x$time_signature)
# Rescale to the range [10, 1000]; this keeps every value strictly positive,
# which the Box-Cox transformation used below requires
min_max_normalize <- function(x) {
  (1000 - 10) * ((x - min(x)) / (max(x) - min(x))) + 10
}
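# Quick sanity check (illustrative input, not from the data set): the
# endpoints map to 10 and 1000, the midpoint to 505.
min_max_normalize(c(0, 0.5, 1))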
x[audio_cols] <- lapply(x[audio_cols], min_max_normalize)
x$Genre <- as.factor(x$Genre)
x$mode <- as.factor(x$mode)
x$Charts <- as.factor(x$Charts)
library("dplyr")
x <- dplyr::select(x, -one_of('Streams'))
x <- x[complete.cases(x), ]
var_list <- c('Artist_Albums_Number', 'Artist_Albums_Tracks_Number', 'Artist_Appearances_Number', 'Artist_Appearances_Tracks_Number', 'Artist_Compilations_Number', 'Artist_Compilations_Tracks_Number', 'Artist_Follower', 'Artist_Popularity', 'Artist_Singles_Number', 'Artist_Singles_Tracks_Number', 'Track_Duration_ms', 'Track_Popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'commentCount', 'dislikeCount', 'likeCount', 'viewsCount')
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for (i in 1:length(var_list)){
hist(x[, var_list[i]], probability = TRUE, col = "gray", main = var_list[i], xlab = "")
lines(density(x[, var_list[i]]), col = "red")
}
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for(i in 1:length(var_list)){
df <- data.frame(x[,var_list[i]])
names(df)[1] <- var_list[i]
#ggplot(df, aes(x="",y=Artist_Albums_Number)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.colour="black", outlier.shape=16, #outlier.size=2, notch=FALSE)
boxplot(x = df[,1], main = var_list[i], notch = FALSE)
}
#############################################################
# Normality testing
#strictly_positive_variables <- c('Artist_Follower', 'Artist_Popularity', 'Track_Duration_ms', 'Track_Popularity', 'viewsCount', #'acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence')
for (i in 1:length(var_list)){
  column_name <- var_list[i]
  sub_df <- as.numeric(x[, column_name])
  test_statistic <- ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df))$statistic
  # Asymptotic 5% critical value of the one-sample KS statistic: 1.3581/sqrt(n)
  critical_value <- 1.3581 / sqrt(length(sub_df))
  if (test_statistic > critical_value) {
    message(paste(" ", column_name, " is not approximately normally distributed.", test_statistic, critical_value))
  } else {
    message(paste(" ", column_name, " is approximately normally distributed!", test_statistic, critical_value))
  }
}
(Output condensed; for nearly every variable ks.test warned: "ties should not
be present for the Kolmogorov-Smirnov test". Only Artist_Popularity passes:
  Artist_Popularity  is approximately normally distributed! 0.0579112990955349 0.0970071428571429
All other variables in var_list fail, i.e. D exceeds the critical value.)
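# The decision rule "D > 1.3581/sqrt(n)" is the asymptotic 5%-level KS test for
# a fully specified null; with mean and sd estimated from the sample it is
# conservative (the Lilliefors critical value would be smaller). A minimal
# sketch on simulated, truly normal data (illustrative only):
set.seed(1)
z_demo <- rnorm(nrow(x))
ks.test(z_demo, "pnorm", mean = mean(z_demo), sd = sd(z_demo))$statistic
1.3581 / sqrt(nrow(x)) # truly normal data should stay below this value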
x_scaled <- as.data.frame(scale(x[, var_list]))
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for (i in 1:length(var_list)){
hist(x_scaled[, var_list[i]], probability = TRUE, col = "gray", main = var_list[i], xlab = "")
lines(density(x_scaled[, var_list[i]]), col = "red")
}
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for(i in 1:length(var_list)){
df <- data.frame(x_scaled[,var_list[i]])
names(df)[1] <- var_list[i]
#ggplot(df, aes(x="",y=Artist_Albums_Number)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.colour="black", outlier.shape=16, #outlier.size=2, notch=FALSE)
boxplot(x = df[,1], main = var_list[i], notch = FALSE)
}
for (i in 1:length(var_list)){
column_name <- var_list[i]
message(column_name)
sub_df <- as.numeric(x_scaled[,column_name])
# sub_df <- sub_df[complete.cases(sub_df), ]
#sub_df <- as.numeric(as.character(unlist(sub_df[[1]])))
test_statistic <- ks.test(sub_df, "pnorm", mean=mean(sub_df), sd=sd(sub_df))$statistic
critical_value <- 1.3581 / sqrt (length(sub_df))
if (test_statistic > critical_value) {
message(column_name)
} else {
message(paste(" ", column_name , " is approximately normally distributed!", test_statistic, critical_value))
}}
(Output condensed: identical to the untransformed results above; again only
Artist_Popularity is approximately normally distributed, D = 0.0579 < 0.0970.)
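# z-scaling is a linear transformation, and the KS statistic with mean and sd
# re-estimated from the sample is invariant under linear rescaling, so the
# results above are necessarily identical to the untransformed ones. A
# nonlinear (Box-Cox) transformation of the strictly positive variables is
# tried next.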
strictly_positive_variables <- c('Artist_Follower', 'Artist_Popularity', 'Artist_Singles_Number', 'Artist_Singles_Tracks_Number', 'Track_Duration_ms', 'Track_Popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'likeCount', 'viewsCount')
library("psych")
library("car")
# Objective for the Box-Cox search: the KS distance between the transformed
# variable and a fitted normal, as a function of the Box-Cox exponent p
ksD <- function(p, x) {
  y <- bcPower(x, p)
  ks.test(y, "pnorm", mean = mean(y), sd = sd(y))$statistic
}
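# car::bcPower implements the Box-Cox family: (x^p - 1)/p for p != 0 and
# log(x) for p = 0. A minimal sketch of the search on made-up, strictly
# positive, right-skewed data (illustrative values only):
set.seed(42)
skewed_demo <- rexp(200) + 0.1
optimize(ksD, c(-5, 5), x = skewed_demo)$minimum # exponent that brings it closest to normal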
oldw <- getOption("warn")
options(warn = -1) # silence the repeated ks.test ties warnings
min_values <- c()
for (column_index in 1:length(strictly_positive_variables)){
  column_name <- strictly_positive_variables[column_index]
  x_sub <- x[[column_name]]
  # Search the interval [-5, 5] for the exponent with the smallest KS distance
  result <- optimize(ksD, c(-5, 5), x = x_sub)
  min_values[column_index] <- result$minimum
  message(paste(column_index, ', minimum value is: ', result$minimum))
}
(Output condensed; the repeated ks.test ties warnings are omitted.)
1 , minimum value is: -0.0049316395173003
2 , minimum value is: 1.40215894049501
3 , minimum value is: 0.240525077020065
4 , minimum value is: 0.262671909258923
5 , minimum value is: -0.699387491055179
6 , minimum value is: 0.949365177476973
7 , minimum value is: 0.130600084313768
8 , minimum value is: 3.51730755217668
9 , minimum value is: 3.07785781966221
10 , minimum value is: -0.795345146822112
11 , minimum value is: -0.303366878680729
12 , minimum value is: 4.64767393430379
13 , minimum value is: -0.403363137393642
14 , minimum value is: 1.16379372652619
15 , minimum value is: 0.453314295028104
16 , minimum value is: 0.05466614868127
17 , minimum value is: 0.0345455532264414
options(warn = oldw)
# Apply the optimal Box-Cox exponent found above to every strictly positive
# variable; assign() creates the individual *_trans vectors used below
# (Artist_Follower_trans, ..., viewsCount_trans).
for (column_index in seq_along(strictly_positive_variables)) {
  column_name <- strictly_positive_variables[column_index]
  x_sub <- x[[column_name]]
  assign(paste0(column_name, "_trans"), bcPower(x_sub, min_values[column_index]))
}
hist_trans_list <- list(Artist_Follower_trans,
Artist_Popularity_trans,
Artist_Singles_Number_trans,
Artist_Singles_Tracks_Number_trans,
Track_Duration_ms_trans,
Track_Popularity_trans,
acousticness_trans,
danceability_trans,
energy_trans,
instrumentalness_trans,
liveness_trans,
loudness_trans,
speechiness_trans,
tempo_trans,
valence_trans,
likeCount_trans,
viewsCount_trans)
par(mar = rep(2, 4))
par(mfrow=c(4,5))
for (trans_index in 1:length(hist_trans_list)){
column_name <- strictly_positive_variables[trans_index]
selected_trans <- hist_trans_list[[trans_index]]
hist(selected_trans, col = "gray", probability = TRUE, main = column_name, xlab = "")
points(seq(min(selected_trans), max(selected_trans), length.out = 500),
dnorm(seq(min(selected_trans), max(selected_trans), length.out = 500),
mean(selected_trans),sd(selected_trans)), type = "l", col = "red")
test_statistic <- ks.test(selected_trans, "pnorm", mean=mean(selected_trans), sd=sd(selected_trans))$statistic
critical_value <- 1.3581 / sqrt (length(selected_trans))
if (test_statistic > critical_value) {
message(paste("Transformed ", column_name , " is not approximately normally distributed.", test_statistic, critical_value))
} else {
message(paste("Transformed ", column_name , " is approximately normally distributed!", test_statistic, critical_value))
}}
par(mar = rep(2, 4))
par(mfrow=c(4,5))
for(trans_index in 1:length(strictly_positive_variables)){
column_name <- strictly_positive_variables[trans_index]
selected_trans <- hist_trans_list[[trans_index]]
df <- data.frame(selected_trans)
names(df)[1] <- strictly_positive_variables[trans_index]
boxplot(x = df[,1], main = strictly_positive_variables[trans_index], notch = FALSE)
}
#############################################################
# Correlation
library(corrplot)
method <- "pearson"
clean_cor <- cor(x[,var_list], method = method)
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ..., method = method)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# matrix of the p-values of the correlations; cor.test needs the raw
# observations, so pass the data columns rather than the correlation matrix
p.mat <- cor.mtest(x[, var_list])
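# Spot check on one (arbitrarily chosen) pair of columns: the helper's entry
# should equal the p-value of a direct cor.test on the raw data.
stopifnot(isTRUE(all.equal(p.mat["energy", "loudness"],
                           cor.test(x$energy, x$loudness, method = method)$p.value)))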
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
significance_level <- 0.05
corrplot(clean_cor, method="color", col=col(200),
type="upper", order="alphabet",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=90, #Text label color and rotation
# Combine with significance
p.mat = p.mat, sig.level = significance_level, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=FALSE)
#############################################################
# Steiger's Z test
lm1 <- lm(x$Artist_Popularity ~ x$Artist_Follower)
summary(lm1)
Call:
lm(formula = x$Artist_Popularity ~ x$Artist_Follower)
Residuals:
Min 1Q Median 3Q Max
-39.796 -9.761 2.344 10.590 27.413
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.268e+01 1.029e+00 60.900 < 2e-16 ***
x$Artist_Follower 1.216e-06 1.491e-07 8.154 4.3e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 13.72 on 194 degrees of freedom
Multiple R-squared: 0.2552, Adjusted R-squared: 0.2514
F-statistic: 66.49 on 1 and 194 DF, p-value: 4.298e-14
lm2 <- lm(Artist_Popularity_trans ~ Artist_Follower_trans)
summary(lm2)
Call:
lm(formula = Artist_Popularity_trans ~ Artist_Follower_trans)
Residuals:
Min 1Q Median 3Q Max
-92.767 -23.106 -7.232 23.326 121.213
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -108.514 12.039 -9.014 <2e-16 ***
Artist_Follower_trans 31.226 1.017 30.713 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 34.89 on 194 degrees of freedom
Multiple R-squared: 0.8294, Adjusted R-squared: 0.8285
F-statistic: 943.3 on 1 and 194 DF, p-value: < 2.2e-16
par(mfrow=c(1,2))
plot(x$Artist_Follower, x$Artist_Popularity, main = paste("Untransformed, R^2=", round(summary(lm1)$r.squared, digits =2)), xlab = "Artist_Follower", ylab = "Artist_Popularity")
plot(Artist_Follower_trans, Artist_Popularity_trans, main = paste("Box-Cox transformed, R^2=", round(summary(lm2)$r.squared, digits =2)), xlab = "Artist_Follower", ylab = "Artist_Popularity")
bivariate_df_base <- data.frame(x$Artist_Follower, x$Artist_Popularity)
bivariate_df_bc <- data.frame(Artist_Follower_trans, Artist_Popularity_trans)
library("MVN")
MVN::mvn(bivariate_df_base, mvnTest = "mardia")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_base, mvnTest = "hz")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_base, mvnTest = "royston")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_base, mvnTest = "energy")$multivariateNormality # Jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "mardia")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "hz")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "royston")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "energy")$multivariateNormality # Jointly normal
# Collect the Box-Cox transformed variables in one data frame
boxcox_df <- data.frame(
  Artist_Follower = Artist_Follower_trans,
  Artist_Popularity = Artist_Popularity_trans,
  Artist_Singles_Number = Artist_Singles_Number_trans,
  Artist_Singles_Tracks_Number = Artist_Singles_Tracks_Number_trans,
  Track_Duration_ms = Track_Duration_ms_trans,
  Track_Popularity = Track_Popularity_trans,
  acousticness = acousticness_trans,
  danceability = danceability_trans,
  energy = energy_trans,
  instrumentalness = instrumentalness_trans,
  liveness = liveness_trans,
  loudness = loudness_trans,
  speechiness = speechiness_trans,
  tempo = tempo_trans,
  valence = valence_trans,
  likeCount = likeCount_trans,
  viewsCount = viewsCount_trans
)
par(mfrow=c(1,1))
palette <- c('red', 'blue', 'black', 'purple')
# palette <- rainbow(length(levels(as.factor(x$Genre))))
my_colors <- palette[as.numeric(x$Genre)]
plot(Artist_Follower_trans, Artist_Popularity_trans, main = paste("Box-Cox transformed, R^2=", round(summary(lm2)$r.squared, digits =2)), xlab = "Artist_Follower", ylab = "Artist_Popularity", col = my_colors, pch = 16)
legend("bottomright", legend = levels(as.factor(x$Genre)), col = palette, pch = 16)
library("lattice")
xyplot(Artist_Popularity_trans~Artist_Follower_trans|x$Genre, pch=19, xlab = 'Artist_Follower (transformed)', ylab = 'Artist_Popularity (transformed)', main = "Scatterplots by genre")
# pairs(boxcox_df)
# Round to the nearest even integer so nearby points stack and the sunflower
# plot can show their multiplicities
sunflower_Artist_pop <- 2*round(boxcox_df$Artist_Popularity/2)
sunflower_viewsCount <- 2*round(boxcox_df$viewsCount/2)
sunflowerplot(sunflower_Artist_pop~sunflower_viewsCount, xlab = 'viewsCount (transformed)', ylab = 'Artist_Popularity (transformed)', main = 'Sunflower plot')
require("MASS")
parcoord(boxcox_df, var.label = FALSE)
#library("lattice")
#parallelplot(boxcox_df, horizontal.axis=T)
#library("ggplot2")
#library("GGally")
#ggparcoord(boxcox_df) + geom_line()
library("andrews")
andrews(boxcox_df, ymax=7)
audio_features <- c('acousticness' , 'danceability' ,'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence')
selected_columns <- dplyr::select(boxcox_df, dplyr::all_of(audio_features))
# Set the graphics parameters up front instead of passing them via '...'
par(las = 2, mar = c(8, 1, 3, 1), xpd = TRUE)
parcoord(selected_columns, col = my_colors, var.label = FALSE)
# title("Parallel coordinates", line = 0.6, adj = 0.1)
legend("topright", inset = c(0.035, -0.08), legend = levels(as.factor(x$Genre)),
       col = palette, bty = 'o', horiz = TRUE, pch = 15)
ax <- selected_columns
ax$clr <- x$Genre
andrews(ax, ymax=4, clr=10)
legend("topright",legend=levels(ax$clr), fill=rainbow(length(levels(as.factor(ax$clr)))))
library("scagnostics")
sub_df <- as.data.frame(x[, var_list])
s <- scagnostics(sub_df)
o <- scagnosticsOutliers(s)
g <- scagnosticsGrid(s)
go <- g[o,]
# Outliers: strongly different from the other plots
par(mfrow=c(1,1))
for (i in 1:nrow(go)){
plot(sub_df[[go[i,1]]], sub_df[[go[i,2]]], pch=19, xlab=names(sub_df)[go[i,1]], ylab=names(sub_df)[go[i,2]])
print(paste("Outlying pair: ", "y=", names(sub_df)[go[i,2]], "x=", names(sub_df)[go[i,1]]))}
[1] "Outlying pair: y= Artist_Compilations_Number x= Artist_Albums_Number"
[1] "Outlying pair: y= Artist_Compilations_Number x= Artist_Albums_Tracks_Number"
[1] "Outlying pair: y= Artist_Compilations_Tracks_Number x= Artist_Albums_Number"
[1] "Outlying pair: y= Artist_Compilations_Tracks_Number x= Artist_Compilations_Number"
[1] "Outlying pair: y= Artist_Popularity x= Artist_Follower"
[1] "Outlying pair: y= Artist_Singles_Number x= Artist_Albums_Number"
[1] "Outlying pair: y= Artist_Singles_Number x= Artist_Compilations_Tracks_Number"
[1] "Outlying pair: y= Artist_Singles_Tracks_Number x= Artist_Compilations_Tracks_Number"
[1] "Outlying pair: y= Artist_Singles_Tracks_Number x= Artist_Singles_Number"
[1] "Outlying pair: y= instrumentalness x= Track_Duration_ms"
[1] "Outlying pair: y= loudness x= Artist_Albums_Tracks_Number"
[1] "Outlying pair: y= valence x= instrumentalness"
[1] "Outlying pair: y= dislikeCount x= Artist_Follower"
[1] "Outlying pair: y= likeCount x= Artist_Follower"
[1] "Outlying pair: y= likeCount x= commentCount"
# Highlights:
par(mfrow=c(1,2))
plot(x$Track_Duration_ms, x$instrumentalness, col = my_colors, pch = 16, xlab = 'Track_Duration_ms', ylab = 'instrumentalness')
plot(x$instrumentalness, x$valence, col = my_colors, pch = 16, xlab = 'instrumentalness', ylab = 'valence')
legend(x=-1300, y=1250, legend = levels(as.factor(x$Genre)), col = palette, pch = 16, horiz = TRUE, xpd = NA)
# Exemplars: group of similar plots
e <- scagnosticsExemplars(s)
ge <- g[e,]
par(mfrow = c(2,2))
for (i in 1:dim(ge)[1]){
plot(sub_df[[ge$x[i]]], sub_df[[ge$y[i]]], pch=19,
xlab=names(sub_df)[ge$x[i]], ylab=names(sub_df)[ge$y[i]])
print(paste("Exemplary pair: ", "y=", names(sub_df)[ge$y[i]], "x=", names(sub_df)[ge$x[i]]))
}
[1] "Exemplary pair: y= Track_Popularity x= Artist_Appearances_Tracks_Number"
[1] "Exemplary pair: y= energy x= Artist_Albums_Tracks_Number"
[1] "Exemplary pair: y= likeCount x= Artist_Appearances_Number"
[1] "Exemplary pair: y= viewsCount x= tempo"
table(x$Genre)
Classic Hip Hop Pop Techno
49 50 50 47
# Bin artists into popularity quartiles: 1 = below Q1, 2 = [Q1, median),
# 3 = [median, Q3), 4 = at or above Q3
Artist_Popularity_quantiles <- quantile(x$Artist_Popularity, probs = c(0.25, 0.5, 0.75))
x$Artist_Popularity_quantile <- as.factor(findInterval(x$Artist_Popularity, Artist_Popularity_quantiles) + 1)
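# Because of ties at the cut points the four bins are not exactly equal-sized;
# the column totals of the table just below work out to 47 / 47 / 48 / 54.
table(x$Artist_Popularity_quantile)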
tab<-table(x$Genre, x$Artist_Popularity_quantile)
tab
1 2 3 4
Classic 9 21 13 6
Hip Hop 0 8 25 17
Pop 1 8 10 31
Techno 37 10 0 0
# r = 4, c = 4
# df = (r-1)*(c-1) = 3*3 = 9
# critical value at alpha = 0.05: 16.919
chisq.test(tab)
Pearson's Chi-squared test
data: tab
X-squared = 156.23, df = 9, p-value < 2.2e-16
chisq.test(tab, simulate.p.value = TRUE)
Pearson's Chi-squared test with simulated p-value (based on 2000 replicates)
data: tab
X-squared = 156.23, df = NA, p-value = 0.0004998
library("vcd")
assocstats(tab)
X^2 df P(> X^2)
Likelihood Ratio 168.67 9 0
Pearson 156.23 9 0
Phi-Coefficient : NA
Contingency Coeff.: 0.666
Cramer's V : 0.515
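# Cramer's V recomputed by hand from the chi-squared statistic:
# V = sqrt(X^2 / (n * (min(r, c) - 1))) = sqrt(156.23 / (196 * 3)) ~ 0.515
sqrt(156.23 / (196 * 3))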
#library("openintro")
#contTable(tab)
# Same quartile binning for viewsCount
viewsCount_quantiles <- quantile(x$viewsCount, probs = c(0.25, 0.5, 0.75))
x$viewsCount_quantile <- as.factor(findInterval(x$viewsCount, viewsCount_quantiles) + 1)
tab<-table(x$viewsCount_quantile, x$Artist_Popularity_quantile)
tab
1 2 3 4
1 36 6 4 3
2 10 19 12 8
3 1 18 19 11
4 0 4 13 32
chisq.test(tab)
Pearson's Chi-squared test
data: tab
X-squared = 133.34, df = 9, p-value < 2.2e-16
chisq.test(tab, simulate.p.value = TRUE)
Pearson's Chi-squared test with simulated p-value (based on 2000 replicates)
data: tab
X-squared = 133.34, df = NA, p-value = 0.0004998
library("vcd")
assocstats(tab)
X^2 df P(> X^2)
Likelihood Ratio 133.48 9 0
Pearson 133.34 9 0
Phi-Coefficient : NA
Contingency Coeff.: 0.636
Cramer's V : 0.476
#contTable(tab)
ab2 <- na.omit(cbind(x$Artist_Popularity_quantile, x$viewsCount_quantile))
nrow(ab2)*(nrow(ab2)-1)/2
[1] 19110
# Sort by the first, then the second variable so concordant and discordant
# pairs can be counted in one pass
ind <- order(ab2[,1], ab2[,2])
ab2 <- ab2[ind,]
# C = concordant pairs, D = discordant, Tx/Ty = ties on x/y only, Txy = ties on both
C <- D <- Tx <- Ty <- Txy <- 0
for (i in 1:(nrow(ab2)-1)) {
if (i%%100==0) cat(i, "\n")
for(j in (i+1):nrow(ab2)) {
if (ab2[i,1]==ab2[j,1]) {
if (ab2[i,2]==ab2[j,2]) {
Txy <- Txy+1
} else {
Tx <- Tx+(ab2[i,2]<ab2[j,2])
}
} else {
if (ab2[i,2]==ab2[j,2]) Ty <- Ty+1
if (ab2[i,2]<ab2[j,2]) C <- C+1
if (ab2[i,2]>ab2[j,2]) D <- D+1
}
}
}
100
c(C=C, D=D, Tx=Tx, Ty=Ty, Txy=Txy)
C D Tx Ty Txy
10048 1560 2798 2781 1923
k_t <- (C - D)/(nrow(ab2)*(nrow(ab2)-1)/2)
k_t # Kendall's tau-a: no correction for ties
[1] 0.4441654
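# Tau-b corrects for ties: tau_b = (C - D) / sqrt((C + D + Tx) * (C + D + Ty)).
# Recomputed from the pair counts above, it reproduces the ~0.59 reported by
# ord.tau() and cor(..., method = "kendall") below:
(C - D) / sqrt((C + D + Tx) * (C + D + Ty))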
library("ryouready")
ord.tau(table(ab2[,1], ab2[,2]))
Kendall's (and Stuart's) Tau statistics
Tau-b: 0.590
Tau-c: 0.589
cor(as.numeric(x$Artist_Popularity_quantile), as.numeric(x$viewsCount_quantile), method = "kendall") # identical to Tau-b above
[1] 0.5895469
cor.test(as.numeric(x$Artist_Popularity_quantile), as.numeric(x$viewsCount_quantile), method="kendall")
Kendall's rank correlation tau
data: as.numeric(x$Artist_Popularity_quantile) and as.numeric(x$viewsCount_quantile)
z = 9.8748, p-value < 2.2e-16
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.5895469
# Frequency of the modal category: per margin if margin > 0, otherwise overall
modetab <- function(x, margin=0) {
  if (margin>0) apply(x, margin, max) else max(x)
}
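# Goodman and Kruskal's lambda measures the proportional reduction in
# prediction error from knowing a second variable. A toy illustration with a
# hypothetical 2x2 table (values made up):
toy <- matrix(c(30, 10,
                5, 55), nrow = 2, byrow = TRUE)
e1_toy <- sum(toy) - max(rowSums(toy))        # errors predicting rows alone
e2_toy <- sum(toy) - sum(apply(toy, 2, max))  # errors when columns are known
(e1_toy - e2_toy) / e1_toy                    # lambda = 0.625 here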
tab2 <- table(x$Artist_Popularity_quantile,
x$Genre, dnn = c("Artist Popularity quantile", "Genre"))
tab2
Genre
Artist Popularity quantile Classic Hip Hop Pop Techno
1 9 0 1 37
2 21 8 8 10
3 13 25 10 0
4 6 17 31 0
library("openintro") # provides contTable(), referenced (commented out) above
contTable(tab2)
\begin{table}
\begin{center}
\begin{tabular}{lrrrrr}
\hline
& Classic & Hip Hop & Pop & Techno & Total \\
\hline
1 & 9 & 0 & 1 & 37 & 47 \\
2 & 21 & 8 & 8 & 10 & 47 \\
3 & 13 & 25 & 10 & 0 & 48 \\
4 & 6 & 17 & 31 & 0 & 54 \\
\hline
Total & 49 & 50 & 50 & 47 & 196 \\
\hline
\end{tabular}
\end{center}
\caption{Contingency table for }
\label{}
\end{table}
tab <- rowSums(tab2)
message(paste("The mode across all four quantiles of stream distribution is", modetab(tab), " and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution."))
The mode across all four quantiles of stream distribution is 54 and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution.
e1_streams <- sum(tab) - modetab(tab)
message(paste("Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: ", e1_streams, " or ", e1_streams/sum(tab)*100, "%."))
Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: 142 or 72.4489795918367 %.
e2_streams <- sum(tab2) - sum(modetab(tab2, 2))
message(paste("Now having knowledge about an association between artist popularity and its genre and using class-internal modes for the additional feature, the number of falsely predicted cases would be: ", e2_streams, " or ", e2_streams/sum(tab)*100, "%, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, ", sum(tab2)-e2_streams, " or ", (sum(tab2)-e2_streams)/sum(tab2)*100, "% would have been predicted correctly (compared to ", (1-e1_streams/sum(tab))*100, "% if there was no knowledge about genre.)"))
Now having knowledge about an association between artist popularity and its genre and using class-internal modes for the additional feature, the number of falsely predicted cases would be: 82 or 41.8367346938776 %, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, 114 or 58.1632653061224 % would have been predicted correctly (compared to 27.5510204081633 % if there was no knowledge about genre.)
lambda <- (e1_streams - e2_streams)/e1_streams
message(paste("Therefore, Goodmann and Kruskals Lambda is: ", lambda))
Therefore, Goodmann and Kruskals Lambda is: 0.422535211267606
library("ryouready")
nom.lambda(tab2)
Lambda:
Columns dependent: 0.438
Rows dependent: 0.423
Symmetric: 0.431
tab2 <- table(x$Artist_Popularity_quantile,
x$viewsCount_quantile, dnn = c("Artist Popularity quantile", "viewsCount quantile"))
tab2
viewsCount quantile
Artist Popularity quantile 1 2 3 4
1 36 10 1 0
2 6 19 18 4
3 4 12 19 13
4 3 8 11 32
tab <- rowSums(tab2)
message(paste("The mode across all four quantiles of stream distribution is", modetab(tab), " and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution."))
The mode across all four quantiles of stream distribution is 54 and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution.
e1_streams <- sum(tab) - modetab(tab)
message(paste("Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: ", e1_streams, " or ", e1_streams/sum(tab)*100, "%."))
Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: 142 or 72.4489795918367 %.
e2_streams <- sum(tab2) - sum(modetab(tab2, 2))
message(paste("Now having knowledge about an association between artist popularity and its viewsCount quantile and using class-internal modes for the additional feature, the number of falsely predicted cases would be: ", e2_streams, " or ", e2_streams/sum(tab)*100, "%, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, ", sum(tab2)-e2_streams, " or ", (sum(tab2)-e2_streams)/sum(tab2)*100, "% would have been predicted correctly (compared to ", (1-e1_streams/sum(tab))*100, "% if there was no knowledge about viewsCount quantile.)"))
Now having knowledge about an association between artist popularity and its viewsCount quantile and using class-internal modes for the additional feature, the number of falsely predicted cases would be: 90 or 45.9183673469388 %, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, 106 or 54.0816326530612 % would have been predicted correctly (compared to 27.5510204081633 % if there was no knowledge about viewsCount quantile.)
lambda <- (e1_streams - e2_streams)/e1_streams
message(paste("Therefore, Goodmann and Kruskals Lambda is: ", lambda))
Therefore, Goodmann and Kruskals Lambda is: 0.366197183098592
library("ryouready")
nom.lambda(tab2)
Lambda:
Columns dependent: 0.388
Rows dependent: 0.366
Symmetric: 0.377
ANOVA
fit <- aov(Artist_Popularity~Genre , data =x)
summary(fit)
Df Sum Sq Mean Sq F value Pr(>F)
Genre 3 31032 10344 110.3 <2e-16 ***
Residuals 192 18006 94
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
ks.test(x$Artist_Popularity[x$Genre == 'Classic'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Classic']) , sd=sd(x$Artist_Popularity[x$Genre == 'Classic']))$statistic
ties should not be present for the Kolmogorov-Smirnov test
D
0.06209392
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Classic']))
[1] 0.1940143
ks.test(x$Artist_Popularity[x$Genre == 'Techno'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Techno']) , sd=sd(x$Artist_Popularity[x$Genre == 'Techno']))$statistic
ties should not be present for the Kolmogorov-Smirnov test
D
0.1108916
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Techno']))
[1] 0.1980992
ks.test(x$Artist_Popularity[x$Genre == 'Hip Hop'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Hip Hop']) , sd=sd(x$Artist_Popularity[x$Genre == 'Hip Hop']))$statistic
ties should not be present for the Kolmogorov-Smirnov test
D
0.09990045
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Hip Hop']))
[1] 0.1920643
ks.test(x$Artist_Popularity[x$Genre == 'Pop'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Pop']) , sd=sd(x$Artist_Popularity[x$Genre == 'Pop']))$statistic
ties should not be present for the Kolmogorov-Smirnov test
D
0.1191381
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Pop']))
[1] 0.1920643
library("car")
leveneTest(Artist_Popularity~Genre, data = x)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 3 4.2258 0.006399 **
192
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
varq <- tapply(x$Artist_Popularity, x$Genre, var, na.rm= TRUE)
varq/min(varq)
Classic Hip Hop Pop Techno
1.522737 1.000000 2.293584 1.117978
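# Since Levene's test rejects variance homogeneity, a Welch one-way ANOVA would be a
# more robust follow-up. This is a sketch using base R's oneway.test(), not part of
# the original analysis:
oneway.test(Artist_Popularity ~ Genre, data = x, var.equal = FALSE)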
PCA
audio_features <- c('acousticness' , 'danceability' ,'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence')
x_features <- dplyr::select(x, audio_features)
z <- scale(x_features)
eigen_cor <- eigen(cov(z))
E <- eigen_cor$vectors
pc_cor <- prcomp(z, center = F, scale = F)
plot(pc_cor, main="Scree plot as bar chart (cor)")
plot(pc_cor$sdev^2, type="b", main="Scree plot (cor)")
pc_index <- c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9")
eigen_values <- round(eigen_cor$values, digits = 2)
variance_explained <- round(eigen_cor$values/length(audio_features), digits = 2)
cum_variance <- round(cumsum(eigen_cor$values)/length(audio_features), digits = 2)
eigen_df <- data.frame(pc_index, eigen_values, variance_explained, cum_variance)
colnames(eigen_df) <- c("PC", "Eigen value", "Variance explained", "Cumulative variance explained")
as.data.frame(E)
# Eigenvector signs are arbitrary; flip selected columns so the orientation matches prcomp()
E[,1] <- E[,1]*-1
E[,3] <- E[,3]*-1
E[,4] <- E[,4]*-1
E[,5] <- E[,5]*-1
E[,6] <- E[,6]*-1
as.data.frame(E)
scores <- z %*% E
library(corrplot)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
#par(mfrow=c(1,1))
#plot((pc_cor$sdev^2)/round(sum(pc_cor$sdev^2))*100, type="b", xlab = "Number of components", ylab = "Percentage of variance explained")
corrplot(pc_cor$rotation, method="color", col=col(200),
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=90, #Text label color and rotation
diag=TRUE, mar = c(0,0,5,5), title = 'PCA - loadings matrix')
plot(scores[,1], scores[,2])
plot(pc_cor$x[,1], pc_cor$x[,2]) # both plots are identical
library("psych")
psych_pca <- psych::principal(z, nfactors = 9, rotate = "none")
scree(z)
# psych_pca$scores[,1]*sqrt(psych_pca$values[1]) # == scores[,1]
# transform loadings of psych::principal to unit-length eigenvectors according to 1/sqrt(a^2 + b^2) * (a,b)
# 1/sqrt(sum(psych_pca$loadings[,1]^2))*psych_pca$loadings[,1] # is identical to pc_cor$rotation[,1] and E[,1]
# transforming unit-length eigenvectors to loadings like in psych::principal:
manual_loadings <- matrix(nrow = dim(z)[2], ncol = dim(z)[2])
for (i in 1:dim(z)[2]){
manual_loadings[,i] <- E[,i] * sqrt(eigen_cor$values[i])
}
# Still signs are interchanged, affected components are 3 and 9, change signs manually
manual_loadings[,3] <- manual_loadings[,3]*-1
manual_loadings[,9] <- manual_loadings[,9]*-1
# Scores in psych are computed according to
(z %*% E[,1]) / sqrt(eigen_cor$values[1]) # == psych_pca$scores[,1]
[,1]
1 -1.59390372
2 -1.89244014
3 -1.56727583
...
199 -1.32071392
200 -1.71405630
(output truncated: the remaining standardized PC1 scores, 196 in total, are omitted here)
# i.e. using the unit-length eigenvectors and dividing them by the square root of the corresponding eigenvalue
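# Quick equivalence check (a sketch; agreement may hold only up to sign, because
# eigenvector orientation is arbitrary):
all.equal(abs(as.numeric((z %*% E[,1]) / sqrt(eigen_cor$values[1]))),
abs(as.numeric(psych_pca$scores[,1])))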
par(mfrow=c(1,2))
biplot(pc_cor) # loadings plot with loadings scaled to fit in same graph with scores
plot(pc_cor$rotation[, 1:2]) # loadings in original measures
text(pc_cor$rotation[, 1:2], as.character(colnames(z)), pos = 1, cex = 0.7, offset = 0.1)
#palette <- rainbow(length(levels(as.factor(x$Genre))))
#my_colors <- palette[as.numeric(x$Genre)]
palette <- c('red', 'blue', 'black', 'purple')
# palette <- rainbow(length(levels(as.factor(x$Genre))))
my_colors <- palette[as.numeric(x$Genre)]
par(mfrow=c(1,1))
plot(scores[, 1:2], col = my_colors, pch = 19, xlab = "Scores PC1 (47.84% expl. variance)", ylab = "Scores PC2 (16.25% expl. variance)")
legend("topleft", legend = levels(as.factor(x$Genre)), col = palette, pch = 16, horiz = T)
#plot(pc_cor$rotation[, 1:2])
par(mfrow=c(1,1))
plot(E[, 1:2])
text(E[, 1:2], as.character(colnames(z)), pos = 1, cex = 0.7, offset = 0.1)
cumsum(eigen_cor$values)/length(audio_features)
[1] 0.4783993 0.6408610 0.7496500 0.8399854 0.9171536 0.9507038 0.9754113 0.9910233 1.0000000
# According to the 90% criterion there would be 5 components needed to appropriately retain the variance in the data by reducing the feature space from nine to five.
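# The same criterion expressed in code (sketch):
min(which(cumsum(eigen_cor$values)/length(audio_features) >= 0.9)) # 5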
# root-mean-squared error between reconstruction m and original o
RMSE <- function(m, o){
sqrt(mean((m - o)^2))
}
# All expressions for the scores are identical
scores <- z %*% E
scores_from_psych <- (z %*% (1/sqrt(sum(psych_pca$loadings[,1]^2))*psych_pca$loadings[,1])) / sqrt(psych_pca$values[1]) # == psych_pca$scores[,1]
scores_from_princomp <- (z %*% pc_cor$rotation[,1]) / sqrt(eigen_cor$values[1])
fits <- matrix(nrow = dim(z)[2], ncol = 3)
for (i in 1:dim(z)[2]){
num_comp <- i
fit <- scores[, 1:num_comp] %*% t(E[, 1:num_comp])
residuals <- z - fit
rmse <- RMSE(z, fit)
r2 <- 1-(sum(diag(var(residuals)))/ sum(diag(var(z))))
fits[i,1] <- num_comp
fits[i,2] <- rmse
fits[i,3] <- r2
}
par(mfrow=c(1,1))
plot(fits[,1], fits[,2], type = "l", ylab = 'RMSE', xlab = 'Number of components')
par(new = TRUE)
plot(fits[,1], fits[,3], type = "l", axes = FALSE, bty = "n", xlab = "", ylab = "")
axis(side=4, at = pretty(range(fits[,3])))
mtext("R^2", side = 4, line = -1)
abline(v = 2, lty = 2, col = 'green')
mtext("Kaiser criterion", side = 1, line = 2, adj = 0.1)
abline(v=min(which(fits[,3]>=0.9)), lty = 2, col = 'red')
mtext("90% rule", side = 1, line = 1, adj = 0.5)
# Squared prediction errors / Q-residuals
par(mfrow=c(1,1))
a <- 1
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
[1] 0.7203745
res <- z - Xhat
E2 <- res * res
SPE_1 <- apply(E2, 1, sum) # not using sqrt(apply(E2, 1, sum))!
# Box (1954) method: weighted Chi-squared
v <- var(SPE_1)
m <- mean(SPE_1)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k1_95 <- g*qchisq(.95, df=h)
plot(SPE_1, ylab = paste('SPE after using', a, 'component'), xlab = 'index')
abline(h = lim_k1_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_1)), digits = 4)), side=1, line=3.5, at=9)
sqrt(mean(SPE_1))
[1] 2.161124
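# The Box (1954) moment matching recurs below for every choice of components, so a
# small helper (a sketch, not in the original code) can wrap it:
spe_limit <- function(spe, alpha = 0.95) {
m <- mean(spe) # first moment of the squared prediction errors
v <- var(spe) # variance of the squared prediction errors
g <- v / (2*m) # scale of the weighted chi-squared
h <- (2*m^2)/v # effective degrees of freedom
g * qchisq(alpha, df = h)
}
# spe_limit(SPE_1) reproduces lim_k1_95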
a <- 2
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
[1] 0.5977514
res <- z - Xhat
E2 <- res * res
#SPE_2 <- sqrt(apply(E2, 1, sum))
SPE_2 <- apply(E2, 1, sum)
# pca(z, ncomp = a, center = FALSE, scale = FALSE, alpha = 0.05)$Qlim[1]
v <- var(SPE_2)
m <- mean(SPE_2)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k2_95 <- g*qchisq(.95, df=h)
plot(SPE_2, ylab = paste('SPE after using', a, 'components'), xlab = 'index')
abline(h = lim_k2_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_2)), digits = 4)), side=1, line=3.5, at=9)
sqrt(mean(SPE_2))
[1] 1.793254
a <- 9
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
[1] 9.521473e-16
res <- z - Xhat
E2 <- res * res
# SPE_9 <- sqrt(apply(E2, 1, sum))
SPE_9 <- apply(E2, 1, sum)
# pca(z, ncomp = a, center = FALSE, scale = FALSE, alpha = 0.05)$Qlim[1]
v <- var(SPE_9)
m <- mean(SPE_9)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k9_95 <- g*qchisq(.95, df=h)
plot(SPE_9, ylab = paste('SPE after using', a, 'components'), xlab = 'index')
abline(h = lim_k9_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_9)), digits = 4)), side=1, line=3.5, at=9)
sqrt(mean(SPE_9))
[1] 2.856442e-15
par(mfrow=c(3,1))
plot(SPE_1, type = 'l', col = 'darkgreen', ylab = "SPE for PC1")
abline(h = lim_k1_95, lty = 2, col = 'darkgreen')
plot(SPE_2, type = 'l', col = 'red', ylab = "SPE for PC1 + PC2")
abline(h = lim_k2_95, lty = 2, col = 'darkgreen')
plot(SPE_9, type = 'l', col = 'blue', ylab = "SPE for all PCs")
abline(h = lim_k9_95, lty = 2, col = 'darkgreen')
sum(SPE_1 > lim_k1_95)
[1] 6
sum(SPE_1 > lim_k1_95)/length(SPE_1)
[1] 0.03061224
sum(SPE_2 > lim_k2_95)
[1] 7
sum(SPE_2 > lim_k2_95)/length(SPE_2)
[1] 0.03571429
sum(SPE_9 > lim_k9_95)
[1] 10
sum(SPE_9 > lim_k9_95)/length(SPE_9)
[1] 0.05102041
Hotelling’s T^2
num_comp <- 2
inverse_cov <- diag(1/eigen_cor$values[1:num_comp], nrow = num_comp, ncol = num_comp)
tsquared <- diag(z %*% E[,1:num_comp] %*% inverse_cov %*% t(z %*% E[,1:num_comp]))
# which is equivalent to the Mahalanobis distance: (for k > 1)
tsquared_mahal <- mahalanobis(scores[,1:num_comp], center = FALSE, cov(scores[,1:num_comp]), inverted = FALSE)
tsquared_lim95 <- (((dim(z)[1] - 1) * (dim(z)[1] + 1) * num_comp) / (dim(z)[1] * (dim(z)[1] - num_comp))) * qf(p =.95, df1 = num_comp, df2 = dim(z)[1] - num_comp)
tsquared_lim99 <- (((dim(z)[1] - 1) * (dim(z)[1] + 1) * num_comp) / (dim(z)[1] * (dim(z)[1] - num_comp))) * qf(p =.99, df1 = num_comp, df2 = dim(z)[1] - num_comp)
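# The F-based limit is used twice above; wrapped as a function (sketch):
t2_limit <- function(n, k, p = 0.95) {
(((n - 1) * (n + 1) * k) / (n * (n - k))) * qf(p, df1 = k, df2 = n - k)
}
# t2_limit(nrow(z), num_comp) == tsquared_lim95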
par(mfrow=c(1,1))
plot(tsquared, type = 'l', ylim = c(0,tsquared_lim99*1.1), ylab = "Hotelling's T2")
abline(h=tsquared_lim99, col = 'red', lty = 2)
text(190,10, "99% limit", col = "red")
abline(h=tsquared_lim95, col = 'darkgreen', lty = 2)
text(190,6.6, "95% limit", col = "darkgreen")
# source: https://github.com/hredestig/pcaMethods/blob/master/R/pca.R
simpleEllipse <- function(x, y, alfa=0.95, len=200) {
N <- length(x)
A <- 2
mypi <- seq(0, 2 * pi, length=len)
r1 <- sqrt(var(x) * qf(alfa, 2, N - 2) * (2*(N^2 - 1)/(N * (N - 2))))
r2 <- sqrt(var(y) * qf(alfa, 2, N - 2) * (2*(N^2 - 1)/(N * (N - 2))))
cbind(r1 * cos(mypi) + mean(x), r2 * sin(mypi) + mean(y))
}
confidence_ellipse95 <- simpleEllipse(scores[,1], scores[,2], alfa = 0.95, len=500)
confidence_ellipse99 <- simpleEllipse(scores[,1], scores[,2], alfa = 0.99, len=500)
plot(scores[,1], scores[,2], xlim = c(min(confidence_ellipse99[,1]), max(confidence_ellipse99[,1])),
ylim = c(min(confidence_ellipse99[,2]), max(confidence_ellipse99[,2])), ylab = 'Scores PC2', xlab = 'Scores PC1')
abline(h = 0, v = 0)
points(confidence_ellipse95, type = 'l', lty = 2, col = 'darkgreen')
points(confidence_ellipse99, type = 'l', lty = 2, col = 'red')
x[as.numeric(names(tsquared[tsquared > tsquared_lim95])),c('Track_Title', 'Track_Artist', 'Genre')]
outlier_tracks <- x[as.numeric(names(tsquared[tsquared > tsquared_lim95])),c('Track_Title', 'Track_Artist', 'Genre')]
round(z[as.numeric(names(tsquared[tsquared > tsquared_lim95])),], digits = 2)
acousticness danceability energy instrumentalness liveness loudness speechiness tempo valence
21 0.88 0.21 -0.51 -0.90 -0.23 -0.04 4.30 -0.95 1.08
146 0.35 0.49 -0.79 1.26 2.13 -0.20 -0.61 -0.41 -0.68
audio_z <- as.data.frame(z[as.numeric(names(tsquared[tsquared > tsquared_lim95])),])
outlier_tracks$acousticness <- audio_z$acousticness
outlier_tracks$danceability <- audio_z$danceability
outlier_tracks$energy <- audio_z$energy
outlier_tracks$instrumentalness <- audio_z$instrumentalness
outlier_tracks$liveness <- audio_z$liveness
outlier_tracks$loudness <- audio_z$loudness
outlier_tracks$speechiness <- audio_z$speechiness
outlier_tracks$tempo <- audio_z$tempo
outlier_tracks$valence <- audio_z$valence
# Finally, combining residuals and scores (i.e. Hotelling's T^2)
# cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp]
plot(tsquared, SPE_2,
xlab = paste("Hotelling's T2 (", round(cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp]*100, digits = 2), "%)"),
ylab = paste("Q-residuals (", round((1-cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp])*100, digits = 2), "%)"))
abline(h = lim_k2_95, lty = 2, col = 'darkgreen') # seven residual outliers
abline(v = tsquared_lim95, lty = 2, col = 'darkgreen') # two score outliers
library("robustbase")
out <- adjOutlyingness(z, ndir=5000, clower=0, cupper=0)
hist(out$adjout)
rug(out$adjout)
z[!out$nonOut,] # the outlying cases differ strongly from their respective centers; liveness and energy in particular are inconsistent with the model
acousticness danceability energy instrumentalness liveness loudness speechiness tempo valence
15 -0.094467802 0.305088118 0.4781728 -0.9049362 4.3284375 0.5324131 -0.3911434 1.11569104 1.3435281
21 0.882322256 0.212927329 -0.5076300 -0.9049323 -0.2300724 -0.0352499 4.3016000 -0.94770231 1.0815371
51 -0.725356105 0.292520737 1.1679094 -0.9049362 7.1823764 0.8712256 -0.5411446 -0.42960905 1.6680842
66 -0.067632910 -0.172472333 -0.1725221 -0.9049362 -0.4141660 0.5886878 3.4161767 -0.96530254 -0.5998981
68 0.436863054 -0.440576446 0.5399888 -0.9049362 -0.3985814 0.7324146 3.4578437 2.23014470 0.3190258
88 -0.008596148 0.439140174 -0.1725221 -0.9047249 -0.3469572 0.2811821 3.4995106 0.75786761 0.7335191
96 -0.561394917 1.507367499 0.8197876 -0.9042543 4.8251982 0.5040817 1.4994957 -0.06945090 0.1156894
105 -0.448688372 -0.000718136 0.3968359 -0.9049362 -0.1813703 0.7114572 3.5515944 0.02013067 0.2290886
115 1.427070557 -2.111619112 -1.8571713 -0.1679945 -0.7385215 -2.8377300 -0.5098943 -1.52344531 -1.1692398
140 1.588079907 -1.818799151 -1.6889666 0.5378426 3.1693292 -1.7704512 -0.5848949 -1.98956402 -1.0077438
145 1.419020090 -1.709881855 -1.8773428 -0.9048587 -0.3469572 -4.0775846 -0.6432287 -1.63411123 -1.1700219
177 -0.633849124 0.845485470 0.2634435 1.2819623 2.5069816 -0.2098955 0.2494863 0.64759679 -0.9600380
x[rownames(z[!out$nonOut,]),c("Track_Title", "Track_Artist", "Genre", "Track_Popularity")]
library("paran")
paran(z, centile=95, all=T, graph=T)
Using eigendecomposition of correlation matrix.
Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
Results of Horn's Parallel Analysis for component retention
270 iterations, using the 95 centile estimate
--------------------------------------------------
Component Adjusted Unadjusted Estimated
Eigenvalue Eigenvalue Bias
--------------------------------------------------
1 3.869575 4.305594 0.436018
2 1.167162 1.462154 0.294991
3 0.784392 0.979100 0.194708
4 0.696123 0.813018 0.116894
5 0.656627 0.694514 0.037886
6 0.333946 0.301951 -0.03199
7 0.320340 0.222367 -0.09797
8 0.303919 0.140507 -0.16341
9 0.319357 0.080790 -0.23856
--------------------------------------------------
Adjusted eigenvalues > 1 indicate dimensions to retain.
(2 components retained)
x$pc_1 <- pc_cor$x[,1]
x$pc_2 <- pc_cor$x[,2]
x$Release_Date <- as.Date(paste(x$Release_Date, 1, 1, sep = "-"))
x$days_release_orig <- as.integer(round(difftime('2020-03-01', x$Release_Date, units = "days"), digits = 0))
x$Release_Date <- NULL
x$days_release <- as.numeric(scale(x$days_release_orig))
drop.cols <- c('Artist_ID', 'Genre', 'Track_Artist','Track_ID', 'Track_Title', 'key',
'mode', 'time_signature', 'video_ID', 'Charts', 'acousticness', 'danceability', 'energy', 'instrumentalness',
'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'Artist_Popularity', 'Track_Popularity', 'Artist_Popularity_quantile', 'viewsCount_quantile', 'days_release_orig', 'pc_1', 'pc_2')
x_sel <- dplyr::select(x, -one_of(drop.cols))
complete_index <- as.numeric(rownames(x_sel))
z <- scale(x_sel)
# inverse and partial correlations
p <- solve(cor(z, use="complete.obs"))
# if the model holds, then the off-diagonal elements of R^-1
# must be close to zero (relative to the diagonal elements)
library("lattice")
levelplot(p, scales=list(x=list(rot=90)))
# Partial correlations
pr <- -p/sqrt(outer(diag(p), diag(p)))
levelplot(pr, scales=list(x=list(rot=90)))
KMO(z)
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = z)
Overall MSA = 0.74
MSA for each item =
Artist_Albums_Number Artist_Albums_Tracks_Number Artist_Appearances_Number Artist_Appearances_Tracks_Number
0.81 0.70 0.56 0.53
Artist_Compilations_Number Artist_Compilations_Tracks_Number Artist_Follower Artist_Singles_Number
0.71 0.69 0.71 0.78
Artist_Singles_Tracks_Number Track_Duration_ms commentCount dislikeCount
0.60 0.70 0.85 0.84
likeCount viewsCount days_release
0.80 0.85 0.65
# result: overall MSA of 0.74 is "middling"; every individual MSA value is at least 0.53
cortest.bartlett(z) # result: correlation matrix is NOT an identity matrix, so proceed with factor analysis
R was not square, finding R from data
$chisq
[1] 3167.088
$p.value
[1] 0
$df
[1] 105
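# The reported df matches the p*(p-1)/2 correlations being tested for p = 15 variables:
choose(15, 2) # 105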
scree(z)
library("paran")
paran(z, centile=95, all=T, graph=T) # four factors are appropriate
Using eigendecomposition of correlation matrix.
Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
Results of Horn's Parallel Analysis for component retention
450 iterations, using the 95 centile estimate
--------------------------------------------------
Component Adjusted Unadjusted Estimated
Eigenvalue Eigenvalue Bias
--------------------------------------------------
1 3.974762 4.590215 0.615452
2 3.295978 3.761880 0.465901
3 1.658319 2.026943 0.368624
4 1.057991 1.333118 0.275126
5 0.747975 0.955038 0.207062
6 0.506347 0.649312 0.142965
7 0.520244 0.598370 0.078126
8 0.357187 0.377677 0.020490
9 0.271482 0.245019 -0.02646
10 0.261008 0.177595 -0.08341
11 0.238168 0.106222 -0.13194
12 0.285147 0.096959 -0.18818
13 0.277766 0.036315 -0.24145
14 0.321604 0.025889 -0.29571
15 0.382354 0.019441 -0.36291
--------------------------------------------------
Adjusted eigenvalues > 1 indicate dimensions to retain.
(4 components retained)
Loadings
pca <- principal(z, nfactors=4, rotate="none") # h2 are "communalities"
pa <- fa(z, nfactors=4, rotate="none", fm="pa") # principal axis extraction
uls <- fa(z, nfactors=4, rotate="none")
ml <- fa(z, nfactors=4, rotate="none", fm="ml")
library("xtable")
# xtable(unclass(ml$loadings))
print(ml$loadings, sort = T)
Loadings:
ML2 ML3 ML1 ML4
commentCount 0.965 -0.144
dislikeCount 0.968 -0.161
likeCount 0.980 -0.145
viewsCount 0.943 -0.108
Artist_Albums_Number 0.750 0.503
Artist_Albums_Tracks_Number 0.638 0.232 0.587
Artist_Compilations_Number 0.876 0.365
Artist_Compilations_Tracks_Number 0.896 0.408
Artist_Singles_Number 0.358 0.685 -0.206
Artist_Singles_Tracks_Number -0.105 0.988
Artist_Appearances_Number 0.212 0.870
Artist_Appearances_Tracks_Number 0.231 0.551
Artist_Follower 0.335
Track_Duration_ms 0.268 0.375
days_release 0.384 0.106 0.392
ML2 ML3 ML1 ML4
SS loadings 3.879 2.849 2.315 1.756
Proportion Var 0.259 0.190 0.154 0.117
Cumulative Var 0.259 0.449 0.603 0.720
print(ml$loadings, cutoff=.4, sort = T)
Loadings:
ML2 ML3 ML1 ML4
commentCount 0.965
dislikeCount 0.968
likeCount 0.980
viewsCount 0.943
Artist_Albums_Number 0.750 0.503
Artist_Albums_Tracks_Number 0.638 0.587
Artist_Compilations_Number 0.876
Artist_Compilations_Tracks_Number 0.896 0.408
Artist_Singles_Number 0.685
Artist_Singles_Tracks_Number 0.988
Artist_Appearances_Number 0.870
Artist_Appearances_Tracks_Number 0.551
Artist_Follower
Track_Duration_ms
days_release
ML2 ML3 ML1 ML4
SS loadings 3.879 2.849 2.315 1.756
Proportion Var 0.259 0.190 0.154 0.117
Cumulative Var 0.259 0.449 0.603 0.720
set.seed(42)
ml.varimax <- fa(z, nfactors=4, rotate="varimax", fm="ml")
print(ml.varimax$loadings, cutoff=.4, sort = T)
Loadings:
ML2 ML3 ML4 ML1
commentCount 0.974
dislikeCount 0.979
likeCount 0.989
viewsCount 0.950
Artist_Albums_Number 0.857
Artist_Compilations_Number 0.948
Artist_Compilations_Tracks_Number 0.976
Artist_Albums_Tracks_Number 0.619 0.643
Artist_Appearances_Number 0.898
Artist_Appearances_Tracks_Number 0.587
Artist_Singles_Number 0.555 0.579
Artist_Singles_Tracks_Number 0.955
Artist_Follower
Track_Duration_ms 0.430
days_release 0.417
ML2 ML3 ML4 ML1
SS loadings 3.922 3.457 2.015 1.404
Proportion Var 0.261 0.230 0.134 0.094
Cumulative Var 0.261 0.492 0.626 0.720
# xtable(unclass(ml.varimax$loadings))
fa.congruence(ml, ml.varimax)
ML2 ML3 ML4 ML1
ML2 1.00 -0.07 -0.09 -0.01
ML3 -0.01 0.98 0.33 0.17
ML1 -0.20 0.69 0.45 0.89
ML4 -0.03 0.14 0.96 -0.09
par(mfrow=c(1,1))
threshold <- 0.4
plot(ml.varimax$loadings[,1], ml.varimax$loadings[,2], xlim = c(-1, 1), ylim = c(-1,1.1), xlab = 'ML2', ylab = 'ML3')
palette <- c('red', 'blue')
col_index <- ifelse((abs(ml.varimax$loadings[,1]) > threshold) | (abs(ml.varimax$loadings[,2]) > threshold), 1, 0)
cols <- palette[as.numeric(as.factor(col_index))]
points(ml.varimax$loadings[,1], ml.varimax$loadings[,2], pch=19, col=cols)
abline(h = 0, v= 0)
text(ml.varimax$loadings[,1:2], as.character(rownames(ml.varimax$loadings)), pos = 1, cex = 0.6, offset = 0.5)
rect(xleft = -threshold, ybottom = -threshold, xright = threshold, ytop = threshold)
# fa.diagram(ml.varimax, simple=TRUE, cut=.2, digits=2)
set.seed(42)
ml.promax <- fa(z, nfactors=4, rotate="promax", fm="ml")
fa.congruence(ml.promax, ml.varimax)
ML2 ML3 ML4 ML1
ML2 1.00 -0.01 -0.03 -0.08
ML3 -0.01 0.99 0.19 0.29
ML4 0.00 0.09 0.97 0.10
ML1 -0.09 0.24 0.12 0.99
ml.promax$Phi
ML2 ML3 ML4 ML1
ML2 1.00000000 -0.1234826 -0.1397866 0.07121945
ML3 -0.12348257 1.0000000 0.3651684 0.16587388
ML4 -0.13978664 0.3651684 1.0000000 -0.00887250
ML1 0.07121945 0.1658739 -0.0088725 1.00000000
print(ml.promax$loadings, cutoff=.4, sort = T)
Loadings:
ML2 ML3 ML4 ML1
commentCount 0.984
dislikeCount 0.990
likeCount 0.999
viewsCount 0.962
Artist_Albums_Number 0.860
Artist_Albums_Tracks_Number 0.552 0.545
Artist_Compilations_Number 0.999
Artist_Compilations_Tracks_Number 1.018
Artist_Appearances_Number 0.940
Artist_Appearances_Tracks_Number 0.643
Artist_Singles_Number 0.535 0.541
Artist_Singles_Tracks_Number 0.956
Artist_Follower
Track_Duration_ms 0.436
days_release
ML2 ML3 ML4 ML1
SS loadings 3.989 3.519 2.033 1.346
Proportion Var 0.266 0.235 0.136 0.090
Cumulative Var 0.266 0.501 0.636 0.726
#xtable(unclass(ml.promax$loadings))
print(ml.promax$Structure, cutoff = 0.4)
Loadings:
ML2 ML3 ML4 ML1
Artist_Albums_Number 0.889
Artist_Albums_Tracks_Number 0.721 0.744
Artist_Appearances_Number 0.891
Artist_Appearances_Tracks_Number 0.565
Artist_Compilations_Number 0.941
Artist_Compilations_Tracks_Number 0.980
Artist_Follower
Artist_Singles_Number 0.582 0.632
Artist_Singles_Tracks_Number 0.958
Track_Duration_ms 0.449
commentCount 0.974
dislikeCount 0.979
likeCount 0.989
viewsCount 0.947
days_release 0.426 0.477
ML2 ML3 ML4 ML1
SS loadings 3.969 3.890 2.427 1.548
Proportion Var 0.265 0.259 0.162 0.103
Cumulative Var 0.265 0.524 0.686 0.789
# For orthogonal rotations: cor(item, factor) = loadings ("pattern") matrix = structure matrix,
# since structure matrix = loadings matrix %*% factor intercorrelation matrix (which is the identity for orthogonal rotations)
# example: ml.promax$loadings %*% ml.promax$Phi = structure matrix
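# Verifying that identity numerically (sketch):
all.equal(unclass(ml.promax$loadings %*% ml.promax$Phi),
unclass(ml.promax$Structure), check.attributes = FALSE)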
par(mfrow=c(1,1))
threshold <- 0.5
plot(ml.promax$Structure[,1], ml.promax$Structure[,2], xlim = c(-1, 1), ylim = c(-1,1.1), xlab = 'ML2', ylab = 'ML3')
palette <- c('red', 'blue')
col_index <- ifelse((abs(ml.promax$Structure[,1]) > threshold) | (abs(ml.promax$Structure[,2]) > threshold), 1, 0)
cols <- palette[as.numeric(as.factor(col_index))]
points(ml.promax$Structure[,1], ml.promax$Structure[,2], pch=19, col=cols)
abline(h = 0, v= 0)
text(ml.promax$Structure[,1:2], as.character(rownames(ml.promax$Structure)), pos = 1, cex = 0.6, offset = 0.5)
rect(xleft = -threshold, ybottom = -threshold, xright = threshold, ytop = threshold)
fa1 <-factanal(z, factors=4, scores = 'regression', lower = 0.1) # ML with Kaiser normalization
head(fa1$scores)
Factor1 Factor2 Factor3 Factor4
1 -0.09032514 4.7229399 -1.1982826 1.1913023
2 -0.06532926 1.9592530 0.6796187 1.9294387
3 -0.17155114 1.2161697 2.5043243 -0.6176557
4 -0.14220859 4.8213406 -1.0227099 0.2499034
5 -0.15666866 6.0158376 -1.7644434 -0.1556914
6 -0.16905143 0.1190256 2.0571621 -0.1523273
fa2 <- fa(z, nfactors=4) # oblimin rotation without Kaiser normalization
The estimated weights for the factor scores are probably incorrect. Try a different factor score estimation method.
head(fa2$scores)
MR2 MR1 MR3 MR4
1 -0.1257348 4.9076402 -1.4064326 3.2185555
2 -0.2186062 2.3024182 0.7123318 2.1407078
3 -0.2078290 1.3247000 2.3948606 -0.3356255
4 -0.1849004 4.2448612 0.0405285 1.8486009
5 -0.2345250 5.4651211 -0.7047564 1.0788027
6 -0.2682063 0.3496226 1.8536951 0.1795223
cor(fa1$scores, fa2$scores)
MR2 MR1 MR3 MR4
Factor1 0.99066727 -0.04263156 -0.03183455 0.02879820
Factor2 -0.02300522 0.98601395 0.12472840 0.27124108
Factor3 -0.03546517 0.11316250 0.97914604 0.09737382
Factor4 -0.05458080 0.13298949 -0.03618965 0.93878256
cor(fa1$scores, ml.promax$scores)
ML2 ML3 ML4 ML1
Factor1 0.995994444 -0.06746742 -0.08311111 0.060236052
Factor2 -0.051288886 0.97618626 0.22252324 0.086347208
Factor3 -0.056169205 0.17796182 0.97556026 0.002189822
Factor4 0.007625069 0.11489683 0.02445173 0.989366259
cor(ml.promax$scores, ml.varimax$scores)
ML2 ML3 ML4 ML1
ML2 0.99714527 -0.05043569 -0.058887859 0.01154380
ML3 -0.06617328 0.97785070 0.193263920 0.08735674
ML4 -0.08094263 0.19810346 0.979273972 0.03916156
ML1 0.06253243 0.10010835 -0.004175889 0.99271103
fa_bartlett_scores <- fa(z, nfactors=4, rotate="varimax", fm="ml", scores = 'Bartlett')
cor(ml.varimax$scores, fa_bartlett_scores$scores)
ML2 ML3 ML4 ML1
ML2 9.999945e-01 -1.574145e-18 8.383016e-17 -1.545713e-17
ML3 4.392591e-17 9.999300e-01 7.699237e-16 -1.960363e-16
ML4 3.900769e-18 7.222804e-19 9.995550e-01 3.371781e-17
ML1 -5.920793e-18 -3.169430e-16 5.198842e-16 9.996269e-01
cor(fa_bartlett_scores$scores)
ML2 ML3 ML4 ML1
ML2 1.0000000000 0.0003036624 0.003257071 -0.0006236104
ML3 0.0003036624 1.0000000000 -0.011669369 -0.0016210978
ML4 0.0032570710 -0.0116693685 1.000000000 -0.0272409055
ML1 -0.0006236104 -0.0016210978 -0.027240905 1.0000000000
# sum scores
threshold <- 0.5
vars_1 <- abs(ml.promax$Structure[,1])>threshold
vars_2 <- abs(ml.promax$Structure[,2])>threshold
vars_3 <- abs(ml.promax$Structure[,3])>threshold
vars_4 <- abs(ml.promax$Structure[,4])>threshold
key.list <- list(one = as.numeric(which(vars_1==1)),
two = as.numeric(which(vars_2==1)),
three = as.numeric(which(vars_3==1)),
four = as.numeric(which(vars_4==1)))
sign.mat <- cbind(sign(ml.promax$Structure[,1]),
sign(ml.promax$Structure[,2]),
sign(ml.promax$Structure[,3]),
sign(ml.promax$Structure[,4]))
keys <- make.keys(z, key.list,item.labels = colnames(z))
si <- scoreItems(keys * sign.mat, z) # these are the scores I finally use
pairs(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]))
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]))
[,1] [,2] [,3] [,4]
[1,] 1.00000000 -0.07190494 -0.04542303 -0.04038714
[2,] -0.07190494 1.00000000 0.42361028 0.62898318
[3,] -0.04542303 0.42361028 1.00000000 0.19564730
[4,] -0.04038714 0.62898318 0.19564730 1.00000000
print(ml.promax$loadings, cutoff = 0.5, sort = T)
Loadings:
ML2 ML3 ML4 ML1
commentCount 0.984
dislikeCount 0.990
likeCount 0.999
viewsCount 0.962
Artist_Albums_Number 0.860
Artist_Albums_Tracks_Number 0.552 0.545
Artist_Compilations_Number 0.999
Artist_Compilations_Tracks_Number 1.018
Artist_Appearances_Number 0.940
Artist_Appearances_Tracks_Number 0.643
Artist_Singles_Number 0.535 0.541
Artist_Singles_Tracks_Number 0.956
Artist_Follower
Track_Duration_ms
days_release
ML2 ML3 ML4 ML1
SS loadings 3.989 3.519 2.033 1.346
Proportion Var 0.266 0.235 0.136 0.090
Cumulative Var 0.266 0.501 0.636 0.726
# I apply a threshold of 0.5 for manual computation of sum scores
f2 <- (z[,11] + z[,12] + z[,13] + z[,14])/4 # commentCount, dislikeCount, likeCount, viewsCount
f3 <- (z[,1] + z[,2] + z[,5])/3 # Artist_Albums_Number, Artist_Albums_Tracks_Number, Artist_Compilations_Number
f4 <- (z[,2] + z[,3] + z[,4])/3 # Artist_Albums_Tracks_Number, Artist_Appearances_Number, Artist_Appearances_Tracks_Number
f1 <- (z[,8] + z[,9])/2 # Artist_Singles_Number, Artist_Singles_Tracks_Number
pairs(cbind(f2, f3, f4, f1))
cor(cbind(f2, f3, f4, f1))
f2 f3 f4 f1
f2 1.00000000 -0.08246012 -0.04542303 -0.04038714
f3 -0.08246012 1.00000000 0.51683413 0.47961885
f4 -0.04542303 0.51683413 1.00000000 0.19564730
f1 -0.04038714 0.47961885 0.19564730 1.00000000
psych::alpha(cor(z[,vars_1]), check.keys = T) # commentCount, dislikeCount, likeCount, viewsCount
Reliability analysis
Call: psych::alpha(x = cor(z[, vars_1]), check.keys = T)
Reliability if an item is dropped:
Item statistics
psych::alpha(cor(z[,vars_2]), check.keys = T) # Artist_Albums_Number, Artist_Albums_Tracks_Number, Artist_Compilations_Number, Artist_Compilations_Tracks_Number, Artist_Singles_Number
Reliability analysis
Call: psych::alpha(x = cor(z[, vars_2]), check.keys = T)
Reliability if an item is dropped:
Item statistics
psych::alpha(cor(z[,vars_3]), check.keys = T) # Artist_Albums_Tracks_Number, Artist_Appearances_Number, Artist_Appearances_Tracks_Number, Track_Duration_ms
Reliability analysis
Call: psych::alpha(x = cor(z[, vars_3]), check.keys = T)
Reliability if an item is dropped:
Item statistics
psych::alpha(cor(z[,vars_4]), check.keys = T) # Artist_Singles_Number, Artist_Singles_Tracks_Number
data length [16] is not a sub-multiple or multiple of the number of columns [10]
Reliability analysis
Call: psych::alpha(x = cor(z[, vars_4]), check.keys = T)
Reliability if an item is dropped:
Item statistics
items_1 <- reverse.code((keys[,1] * sign.mat[,1])[(keys[,1] * sign.mat[,1]) != 0], z[, vars_1])
items_2 <- reverse.code((keys[,2] * sign.mat[,2])[(keys[,2] * sign.mat[,2]) != 0], z[, vars_2])
items_3 <- reverse.code((keys[,3] * sign.mat[,3])[(keys[,3] * sign.mat[,3]) != 0], z[, vars_3])
items_4 <- reverse.code((keys[,4] * sign.mat[,4])[(keys[,4] * sign.mat[,4]) != 0], z[, vars_4])
library("additivityTests")
tukey.test(items_1) # result: H0 rejected, i.e. sum score is insufficient to represent the variables (factor 2)
Tukey test on 5% alpha-level:
Test statistic: 16.19
Critical value: 3.857
The additivity hypothesis was rejected.
tukey.test(items_2) # result: H0 rejected (factor 3)
Tukey test on 5% alpha-level:
Test statistic: 9.276
Critical value: 3.853
The additivity hypothesis was rejected.
tukey.test(items_3) # result: H0 cannot be rejected (factor 4)
Tukey test on 5% alpha-level:
Test statistic: 0.3105
Critical value: 3.865
The additivity hypothesis cannot be rejected.
tukey.test(items_4) # result: H0 cannot be rejected (factor 1)
Tukey test on 5% alpha-level:
Test statistic: 0.004627
Critical value: 3.89
The additivity hypothesis cannot be rejected.
cor(ml.promax$scores, si$scores)
one two three four
ML2 0.99592621 -0.09140514 -0.079332072 -0.002099964
ML3 -0.09594147 0.97336006 0.416099225 0.487503635
ML4 -0.11258045 0.39562748 0.960537575 0.200024816
ML1 0.01023158 0.29765356 -0.008730584 0.882141227
fa_bartlett_scores <- fa(z, nfactors=4, rotate="promax", fm="ml", scores = 'Bartlett')
cor(ml.promax$scores, fa_bartlett_scores$scores)
ML2 ML3 ML4 ML1
ML2 0.99999331 -0.1228703 -0.132348818 0.071013967
ML3 -0.12409086 0.9999497 0.347443877 0.166211182
ML4 -0.14750946 0.3834347 0.999105855 -0.009335731
ML1 0.07138876 0.1654449 -0.008420436 0.999492034
cor(si$scores, fa_bartlett_scores$scores)
ML2 ML3 ML4 ML1
one 0.995760702 -0.09422492 -0.09953451 0.01023589
two -0.091504358 0.97260263 0.36373011 0.29006661
three -0.082455588 0.42220226 0.95965169 -0.03857534
four -0.001308527 0.48171167 0.16518758 0.87773348
cor(cbind(f2, f3, f4, f1), si$scores) # are almost identical now
one two three four
f2 1.00000000 -0.07190494 -0.04542303 -0.04038714
f3 -0.08246012 0.97220279 0.51683413 0.47961885
f4 -0.04542303 0.42361028 1.00000000 0.19564730
f1 -0.04038714 0.62898318 0.19564730 1.00000000
cor(cbind(f2, f3, f4, f1), fa_bartlett_scores$scores)
ML2 ML3 ML4 ML1
f2 0.995760702 -0.09422492 -0.09953451 0.01023589
f3 -0.113694191 0.96801245 0.47604146 0.14747026
f4 -0.082455588 0.42220226 0.95965169 -0.03857534
f1 -0.001308527 0.48171167 0.16518758 0.87773348
cor(si$scores, fa_bartlett_scores$scores)
ML2 ML3 ML4 ML1
one 0.995760702 -0.09422492 -0.09953451 0.01023589
two -0.091504358 0.97260263 0.36373011 0.29006661
three -0.082455588 0.42220226 0.95965169 -0.03857534
four -0.001308527 0.48171167 0.16518758 0.87773348
# I choose the manually computed scoreItems and sum scores, omitting structure loadings <= 0.5 and, in case of ambiguity, assigning an item only to the factor with the larger loading
par(mfrow=c(1,2))
hist(ml.promax$scores[,1], main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 1], col="blue")
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 1], col="red")
hist(ml.promax$scores[,2], main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 2], col="blue")
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 2], col="red")
par(mfrow=c(1,2))
hist(si$scores[,1], main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(si$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 1], col="blue")
rug(si$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 1], col="red")
hist(si$scores[,2], main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(si$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 2], col="blue")
rug(si$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 2], col="red")
par(mfrow=c(1,2))
hist(f2, main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(f2[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE)], col="blue")
rug(f2[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE)], col="red")
hist(f3, main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(f3[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE)], col="blue")
rug(f3[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE)], col="red")
x$factor_2 <- si$scores[,1]
x$factor_3 <- si$scores[,2]
x$factor_4 <- si$scores[,3]
x$factor_1 <- si$scores[,4]
x$factor_2man <- f2
x$factor_3man <- f3
x$factor_4man <- f4
x$factor_1man <- f1
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), cbind(f2, f3, f4, f1))
f2 f3 f4 f1
[1,] 1.00000000 -0.08246012 -0.04542303 -0.04038714
[2,] -0.07190494 0.97220279 0.42361028 0.62898318
[3,] -0.04542303 0.51683413 1.00000000 0.19564730
[4,] -0.04038714 0.47961885 0.19564730 1.00000000
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), fa_bartlett_scores$scores)
ML2 ML3 ML4 ML1
[1,] 0.995760702 -0.09422492 -0.09953451 0.01023589
[2,] -0.091504358 0.97260263 0.36373011 0.29006661
[3,] -0.082455588 0.42220226 0.95965169 -0.03857534
[4,] -0.001308527 0.48171167 0.16518758 0.87773348
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), ml.promax$scores)
ML2 ML3 ML4 ML1
[1,] 0.995926214 -0.09594147 -0.1125804 0.010231584
[2,] -0.091405140 0.97336006 0.3956275 0.297653556
[3,] -0.079332072 0.41609922 0.9605376 -0.008730584
[4,] -0.002099964 0.48750364 0.2000248 0.882141227
x$factor_2bartlett <- fa_bartlett_scores$scores[,1]
x$factor_3bartlett <- fa_bartlett_scores$scores[,2]
x$factor_4bartlett <- fa_bartlett_scores$scores[,3]
x$factor_1bartlett <- fa_bartlett_scores$scores[,4]
z <- scale(x_features)
library(factoextra)
# Elbow method
fviz_nbclust(z, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) + labs(title = "")
set.seed(42)
kz <- kmeans(z, centers = 4)
k_techno <- names(kz$cluster[kz$cluster == 1])
k_hiphop <- names(kz$cluster[kz$cluster == 2])
k_pop <- names(kz$cluster[kz$cluster == 3])
k_classic <- names(kz$cluster[kz$cluster == 4])
kz$cluster[k_techno] <- 4
kz$cluster[k_pop] <- 3
kz$cluster[k_hiphop] <- 2
kz$cluster[k_classic] <- 1
par(mfcol=c(1,1))
plot(pc_cor$x[,1], pc_cor$x[,2], col=kz$cluster, pch=19, xlab = 'PC1', ylab = 'PC2')
# project cluster centers to PCA feature space:
cluster_centers_pca <- kz$centers %*% pc_cor$rotation[, 1:2]
points(cluster_centers_pca[,1], cluster_centers_pca[,2], col = 'magenta', pch = 10, cex = 3)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19, xlab = 'PC1', ylab = 'PC2')
library("cluster")
sk_4 <- silhouette(kz$cluster, dist(z))
plot(sk_4, col=1:4, border=NA, ann = T, main = "")
library("caret")
confusionMatrix(as.factor(as.numeric(kz$cluster)), as.factor(as.numeric(x$Genre)))
Confusion Matrix and Statistics
Reference
Prediction 1 2 3 4
1 48 0 3 0
2 1 1 0 47
3 0 18 8 0
4 0 31 39 0
Overall Statistics
Accuracy : 0.2908
95% CI : (0.2283, 0.3598)
No Information Rate : 0.2551
P-Value [Acc > NIR] : 0.1437
Kappa : 0.0566
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3 Class: 4
Sensitivity 0.9796 0.020000 0.16000 0.0000
Specificity 0.9796 0.671233 0.87671 0.5302
Pos Pred Value 0.9412 0.020408 0.30769 0.0000
Neg Pred Value 0.9931 0.666667 0.75294 0.6270
Prevalence 0.2500 0.255102 0.25510 0.2398
Detection Rate 0.2449 0.005102 0.04082 0.0000
Detection Prevalence 0.2602 0.250000 0.13265 0.3571
Balanced Accuracy 0.9796 0.345616 0.51836 0.2651
# Although k-means is not a classification algorithm, we can compute the proportions of genres per cluster to assign
# an interpretation for each cluster.
tab_kmeans <- table(kz$cluster, x$Genre)
#xtable(round(prop.table(tab_kmeans, margin = 2), digits = 2)*100)
round(prop.table(tab_kmeans, margin = 2), digits = 2)*100 # column percentages per genre
Classic Hip Hop Pop Techno
1 98 0 6 0
2 2 2 0 100
3 0 36 16 0
4 0 62 78 0
# It seems that Classic and Techno tracks can be distinguished accurately, but Hip Hop and Pop get mixed up:
# 62% of the Hip Hop tracks end up in the cluster that also captures 78% of the Pop tracks, underlining the
# ambiguity between those genres in terms of music-theoretical characteristics nowadays.
# Vice versa, 16% of the Pop tracks end up in the more Hip-Hop-leaning cluster.
set.seed(42)
cl2 <- pam(z, 4)
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$clustering)
k_classic <- names(cl2$clustering[cl2$clustering == 1])
k_techno <- names(cl2$clustering[cl2$clustering == 2])
k_pop <- names(cl2$clustering[cl2$clustering == 3])
k_hiphop <- names(cl2$clustering[cl2$clustering == 4])
cl2$clustering[k_techno] <- 4
cl2$clustering[k_pop] <- 3
cl2$clustering[k_hiphop] <- 2
cl2$clustering[k_classic] <- 1
par(mfcol=c(1,1))
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$clustering, pch=19, xlab = 'PC1', ylab = 'PC2')
points(pc_cor$x[rownames(cl2$medoids),1], pc_cor$x[rownames(cl2$medoids),2], col = 'magenta', pch = 10, cex = 3)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19, xlab = 'PC1', ylab = 'PC2')
# xtable(x[rownames(cl2$medoids), c('Track_Title', 'Track_Artist', 'Genre')])
x[rownames(cl2$medoids), c('Track_Title', 'Track_Artist', 'Genre')]
sm_4 <- silhouette(cl2$clustering, dist(z))
plot(sm_4, col=1:4, border=NA, main = "")
confusionMatrix(as.factor(as.numeric(cl2$clustering)), as.factor(as.numeric(x$Genre)))
Confusion Matrix and Statistics
Reference
Prediction 1 2 3 4
1 48 0 2 0
2 0 28 13 0
3 0 21 35 0
4 1 1 0 47
Overall Statistics
Accuracy : 0.8061
95% CI : (0.7437, 0.859)
No Information Rate : 0.2551
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7415
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3 Class: 4
Sensitivity 0.9796 0.5600 0.7000 1.0000
Specificity 0.9864 0.9110 0.8562 0.9866
Pos Pred Value 0.9600 0.6829 0.6250 0.9592
Neg Pred Value 0.9932 0.8581 0.8929 1.0000
Prevalence 0.2500 0.2551 0.2551 0.2398
Detection Rate 0.2449 0.1429 0.1786 0.2398
Detection Prevalence 0.2551 0.2092 0.2857 0.2500
Balanced Accuracy 0.9830 0.7355 0.7781 0.9933
tab_kmedoids <- table(cl2$clustering, x$Genre)
# xtable(round(prop.table(tab_kmedoids, margin = 2), digits = 2)*100)
round(prop.table(tab_kmedoids, margin = 2), digits = 2)*100 # column percentages per genre
Classic Hip Hop Pop Techno
1 98 0 4 0
2 0 56 26 0
3 0 42 70 0
4 2 2 0 100
# Factor analysis + k-means on scores
library("psych")
KMO(z)
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = z)
Overall MSA = 0.83
MSA for each item =
acousticness danceability energy instrumentalness liveness loudness speechiness tempo
0.83 0.92 0.80 0.67 0.81 0.87 0.84 0.82
valence
0.80
scree(z)
library("paran")
paran(z, centile=95, all=T, graph=T) # two factors are appropriate
Using eigendecomposition of correlation matrix.
Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
Results of Horn's Parallel Analysis for component retention
270 iterations, using the 95 centile estimate
--------------------------------------------------
Component Adjusted Unadjusted Estimated
Eigenvalue Eigenvalue Bias
--------------------------------------------------
1 3.857353 4.305594 0.448240
2 1.162793 1.462154 0.299361
3 0.783622 0.979100 0.195478
4 0.709253 0.813018 0.103765
5 0.659769 0.694514 0.034744
6 0.336983 0.301951 -0.03503
7 0.316533 0.222367 -0.09416
8 0.298139 0.140507 -0.15763
9 0.313580 0.080790 -0.23278
--------------------------------------------------
Adjusted eigenvalues > 1 indicate dimensions to retain.
(2 components retained)
fa_varimax <- fa(z, nfactors=2, rotate="varimax") # avoid shadowing psych::fa
library("plot.matrix")
Registered S3 method overwritten by 'plot.matrix':
method from
plot.loadings pls
plot(loadings(fa_varimax), las=1)
al1 <- psych::alpha(z[,c("acousticness", "danceability", "energy", "loudness")], check.keys = TRUE)
Some items were negatively correlated with total scale and were automatically reversed.
This is indicated by a negative sign for the variable name.
al1
Reliability analysis
Call: psych::alpha(x = z[, c("acousticness", "danceability",
"energy", "loudness")], check.keys = TRUE)
lower alpha upper 95% confidence boundaries
0.94 0.95 0.96
Reliability if an item is dropped:
Item statistics
al2 <- psych::alpha(z[,c("instrumentalness", "valence")], check.keys = TRUE)
Some items were negatively correlated with total scale and were automatically reversed.
This is indicated by a negative sign for the variable name. Data length [16] is not a sub-multiple or multiple of the number of columns [10]
al2
Reliability analysis
Call: psych::alpha(x = z[, c("instrumentalness", "valence")],
check.keys = TRUE)
lower alpha upper 95% confidence boundaries
0.73 0.79 0.85
Reliability if an item is dropped:
Item statistics
al_scores <- cbind(al1$scores, al2$scores) # avoid shadowing base::scale
fviz_nbclust(al_scores, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
# reorder the rows and columns of a table by hierarchical clustering
reorder <- function(x, ...) {
dr <- dist(x)
hr <- hclust(dr)
dc <- dist(t(x))
hc <- hclust(dc)
x[hr$order, hc$order]
}
set.seed(42)
cl1 <- kmeans(z, 4)
set.seed(42)
cl2 <- kmeans(al_scores, 4)
print(reorder(table(cl1$cluster,cl2$cluster)))
4 2 1 3
4 51 0 0 0
2 0 49 0 0
1 0 0 39 31
3 0 0 17 9
par(mfcol=c(1,3))
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl1$cluster, pch=19)
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$cluster, pch=19)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19)
par(mfcol=c(1,2))
library("e1071")
set.seed(42)
cl1 <- cmeans(z, 4)
mcolor <- colorRamp(c("black", "green", "blue", "red"))
col <- rgb(mcolor(cl1$membership[,1]), max=255)
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col, main = 'Soft-clustering (fuzzy c-means, k = 4)')
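# The hard assignment implied by the memberships matches the reported clusters (sketch):
all(apply(cl1$membership, 1, which.max) == cl1$cluster)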
cl <- diana(z)
hcl <- cutree(as.hclust(cl), k = 4)
par(mfrow=c(1,1))
plot(pc_cor$x[,1], pc_cor$x[,2], col=hcl,
pch=19, cex=0.5)
library("proxy")
Attaching package: 'proxy'
The following objects are masked from 'package:stats':
as.dist, dist
The following object is masked from 'package:base':
as.matrix
d <- as.matrix(dist(z))
heatmap(d)
s <- pr_dist2simil(d) # the darker, the more similar
heatmap(s)
par(mfcol=c(1,1))
d <- dist(z)
# hclust
cl1 <- hclust(d, method = 'ward.D2')
memb <- cutree(cl1, 4)
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb)
plot(cl1, ann = F)
title(xlab = "Ward.D2", ylab = "Height")
library("ggplot2")
library("tibble")
ggplot(cl1$height %>%
as_tibble() %>%
add_column(groups = length(cl1$height):1) %>%
rename(height=value),
aes(x=groups, y=height)) +
geom_point() +
geom_line()
Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
This warning is displayed once per session.
library("mclust")
__ ___________ __ _____________
/ |/ / ____/ / / / / / ___/_ __/
/ /|_/ / / / / / / / /\__ \ / /
/ / / / /___/ /___/ /_/ /___/ // /
/_/ /_/\____/_____/\____//____//_/ version 5.4.5
Type 'citation("mclust")' for citing this R package in publications.
Attaching package: 'mclust'
The following object is masked from 'package:MVN':
mvn
The following object is masked from 'package:psych':
sim
set.seed(42)
cl <- Mclust(z)
fitting ...
|
|======================================================================================================================================| 100%
summary(cl)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VVI (diagonal, varying volume and shape) model with 9 components:
Clustering table:
1 2 3 4 5 6 7 8 9
9 23 7 35 7 65 24 8 18
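# A quick way to rank the candidate mixture models behind this fit (a sketch,
# assuming cl is the Mclust object summarized above): summary() on the stored
# BIC object lists the top model/component combinations, and cl$z holds the
# posterior class-membership probabilities behind the hard clustering table.
summary(cl$BIC)       # best models by BIC
head(round(cl$z, 3))  # posterior membership probabilities, first observations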
par(mfrow=c(2,2))
plot(cl, "BIC")
plot(pc_cor$x[,1], pc_cor$x[,2], col = cl$classification)
EM_density_plot <- plot(cl, "density")
par(mfcol=c(2,2))
library("fpc")
Attaching package: 'fpc'
The following object is masked from 'package:xtable':
xtable
set.seed(42)
cl <- dbscan(z, 0.5, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
set.seed(42)
cl <- dbscan(z, 1, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
set.seed(42)
cl <- dbscan(z, 1.5, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
set.seed(42)
cl <- dbscan(z, 2, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
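# The eps values above (0.5, 1, 1.5, 2) were tried ad hoc. A common heuristic
# (a base-R sketch, independent of the fpc API) is to plot the sorted distance
# to the MinPts-th nearest neighbour and place eps near the knee of the curve.
dmat <- as.matrix(dist(z))
knn5 <- apply(dmat, 1, function(d) sort(d)[6]) # 5th neighbour; sort(d)[1] is the self-distance 0
plot(sort(knn5), type = "l", xlab = "points sorted by distance",
     ylab = "distance to 5th nearest neighbour", main = "k-NN distance plot (k = MinPts = 5)")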
# Mixed clustering
# hclust
d <- dist(z)
set.seed(42)
cl1 <- hclust(d, method="ward.D2")
memb <- cutree(cl1, 4)
# kmeans
groupm <- aggregate(z, list(memb), mean)
centers <- cbind(groupm$acousticness,
groupm$danceability,
groupm$energy,
groupm$instrumentalness,
groupm$liveness,
groupm$loudness,
groupm$speechiness,
groupm$tempo,
groupm$valence)
set.seed(42)
cl2 <- kmeans(z, centers = centers)
# compare results (confusion matrix)
table(memb, cl2$cluster) # result: k-means has classified 4 observations differently than hclust
memb 1 2 3 4
1 51 0 0 1
2 0 49 0 0
3 0 0 7 0
4 0 0 3 85
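# Beyond eyeballing the confusion matrix, the agreement between the two
# partitions can be quantified with the adjusted Rand index (a sketch, assuming
# the mclust package is available; 1 = identical partitions, 0 = chance level).
mclust::adjustedRandIndex(memb, cl2$cluster)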
par(mfcol=c(1,2))
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$cluster,
main="Cluster predictions, mixed clustering", pch=19, xlab = 'PC1', ylab = 'PC2')
# project cluster centers to PCA feature space:
cluster_centers_pca <- cl2$centers %*% pc_cor$rotation[, 1:2]
points(cluster_centers_pca[,1], cluster_centers_pca[,2], col = 'magenta', pch = 10, cex = 3)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre),
main="Truth", pch=19, xlab = 'PC1', ylab = 'PC2')
library("NbClust")
# Total variance explained
tve <- rep(NA, 15)
for (k in 2:15) {
clk <- kmeans(z, k)
tve[k] <- 1-clk$tot.withinss/clk$totss
}
plot(tve, type="b")
set.seed(42)
NbClust(z, method="ward.D2", index="ch") # result: k*=2
$All.index
2 3 4 5 6 7 8 9 10 11 12 13 14 15
139.7710 120.4259 98.3300 88.7335 82.5706 74.8123 69.3249 65.2113 61.9272 59.6528 58.1524 57.0914 56.0043 55.3023
$Best.nc
Number_clusters Value_Index
2.000 139.771
$Best.partition
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 145
2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
146 147 148 149 150 152 153 154 155 156 157 158 159 160 161 162 163 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 183 184
2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
set.seed(42)
NbClust(z, method="ward.D2") # result: majority vote (12) for k*=3.
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 8 proposed 2 as the best number of clusters
* 12 proposed 3 as the best number of clusters
* 1 proposed 4 as the best number of clusters
* 1 proposed 14 as the best number of clusters
* 2 proposed 15 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
$All.index
KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale
2 2.9225 139.7710 59.4773 3.0748 463.1206 4.072864e+17 21596.043 1020.0706 21.7400 1.7205 0.3120 0.9783 0.4265 0.7137 56.9638 2.3921
3 3.5131 120.4259 24.7670 6.5366 826.6680 1.433964e+17 11111.650 780.7158 28.4878 2.2479 0.2692 1.1655 0.3482 0.8329 18.6544 1.1917
4 0.8659 98.3300 24.3660 7.3358 952.5966 1.340864e+17 8423.499 691.9237 29.5447 2.5364 0.3313 1.1514 0.3628 0.8040 20.9664 1.4472
5 1.1693 88.7335 21.0237 9.1153 1104.3665 9.658613e+16 6515.744 614.0028 31.1993 2.8583 0.2878 1.4486 0.3458 0.6822 16.3044 2.7196
6 2.4339 82.5706 12.1013 11.0072 1213.6275 7.964860e+16 4756.855 553.1201 32.3856 3.1729 0.3217 1.2612 0.3517 0.8201 10.9717 1.2919
7 0.9542 74.8123 11.5500 10.5538 1366.8443 4.961074e+16 4347.806 520.0008 37.4071 3.3750 0.3024 1.4710 0.2876 0.7661 14.9600 1.7967
8 1.0040 69.3249 10.9477 10.3915 1447.6622 4.290265e+16 3883.982 490.0530 39.0800 3.5812 0.2885 1.5297 0.2588 0.6165 8.0877 3.4690
9 1.0561 65.2113 10.1985 10.3909 1533.7264 3.500170e+16 3428.487 463.0863 40.7503 3.7898 0.2840 1.4543 0.2668 0.6327 27.2817 3.4130
10 0.8937 61.9272 10.6113 10.4543 1598.4194 3.106406e+16 2948.095 439.1369 42.0453 3.9965 0.2731 1.4046 0.2679 0.6075 12.9223 3.6951
11 0.9214 59.6528 11.0369 10.7463 1689.2379 2.364879e+16 2610.542 415.4363 44.1287 4.2245 0.2665 1.3700 0.2704 0.8028 9.5795 1.4381
12 0.9994 58.1524 10.9827 11.8133 1812.3952 1.501397e+16 2480.136 392.0472 48.7617 4.4765 0.2489 1.3929 0.2545 0.7169 11.0589 2.2899
13 1.1530 57.0914 9.8993 12.5013 1911.2783 1.063937e+16 2136.940 369.9646 52.5702 4.7437 0.2390 1.3446 0.2621 0.6343 12.6834 3.3114
14 0.9733 56.0043 10.0901 13.0538 1965.2543 9.368841e+15 1826.549 350.9785 53.4624 5.0003 0.2303 1.2754 0.2686 1.7043 -2.0662 -2.0679
15 1.3913 55.3023 7.9632 13.7026 2032.3638 7.636806e+15 1567.078 332.5424 54.9323 5.2775 0.2808 1.2230 0.2738 0.6088 6.4256 3.5078
Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
2 0.4113 510.0353 0.6596 1.5894 0.3791 0.1672 0.0014 1.3874 2.1143 0.6318
3 0.4042 260.2386 0.5824 -0.0150 1.0162 0.1346 0.0014 1.4240 1.7562 0.5419
4 0.3785 172.9809 0.6279 0.6392 1.0868 0.1782 0.0015 1.6964 1.6896 0.6632
5 0.3548 122.8006 0.5917 0.1784 1.6348 0.1280 0.0016 1.9833 1.5926 0.7320
6 0.3361 92.1867 0.5988 0.3878 1.7244 0.1504 0.0017 1.7666 1.5282 0.5752
7 0.3153 74.2858 0.5900 0.9796 1.9096 0.1504 0.0018 1.8074 1.4831 0.5428
8 0.2988 61.2566 0.5491 0.0245 2.3354 0.1264 0.0018 2.0264 1.4334 0.4935
9 0.2849 51.4540 0.5521 0.2609 2.3378 0.1264 0.0018 1.9033 1.4024 0.4559
10 0.2731 43.9137 0.5485 0.1750 2.4413 0.1264 0.0018 1.8289 1.3667 0.4188
11 0.2628 37.7669 0.5484 0.3502 2.4821 0.1264 0.0019 1.7465 1.3323 0.3863
12 0.2538 32.6706 0.5333 0.2567 2.7392 0.1264 0.0020 1.7621 1.2967 0.3768
13 0.2459 28.4588 0.5279 0.1186 2.8500 0.1264 0.0021 1.6812 1.2591 0.3519
14 0.2387 25.0699 0.5285 -0.0290 2.8823 0.1264 0.0021 1.6455 1.2326 0.3438
15 0.2322 22.1695 0.5298 0.1067 2.8742 0.1535 0.0022 1.8933 1.2092 0.2842
$All.CriticalValues
CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
2 0.8094 33.4492 0.0110
3 0.7816 25.9821 0.2967
4 0.7759 24.8432 0.1638
5 0.6927 15.5269 0.0046
6 0.7297 18.5198 0.2388
7 0.7278 18.3290 0.0668
8 0.5577 10.3089 0.0008
9 0.7237 17.9442 0.0005
10 0.6225 12.1297 0.0003
11 0.7045 16.3556 0.1702
12 0.6665 14.0075 0.0174
13 0.6355 12.6164 0.0009
14 0.3854 7.9739 1.0000
15 0.5139 9.4601 0.0009
$Best.nc
KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB Silhouette Duda PseudoT2
Number_clusters 3.0000 2.000 3.0000 15.0000 3.0000 3.0000e+00 3.00 3.0000 3.0000 3.000 14.0000 2.0000 2.0000 3.0000 3.0000
Value_Index 3.5131 139.771 34.7103 13.7026 363.5475 2.5458e+17 10484.39 150.5628 6.7478 -0.239 0.2303 0.9783 0.4265 0.8329 18.6544
Beale Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
Number_clusters 3.0000 2.0000 3.0000 2.0000 2.0000 2.0000 4.0000 0 2.0000 0 15.0000
Value_Index 1.1917 0.4113 249.7967 0.6596 1.5894 0.3791 0.1782 0 1.3874 0 0.2842
$Best.partition
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 145
3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
146 147 148 149 150 152 153 154 155 156 157 158 159 160 161 162 163 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 183 184
2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
# Silhouette method
set.seed(42)
fviz_nbclust(z, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
d <- dist(z)
cl1 <- hclust(d, method="ward.D2")
memb2 <- cutree(cl1, 2)
memb3 <- cutree(cl1, 3)
memb4 <- cutree(cl1, 4)
library("cluster")
par(mfcol=c(2,2))
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb2)
s2 <- silhouette(memb2, d)
plot(s2, col=1:2, border=NA)
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb3)
s3 <- silhouette(memb3, d)
plot(s3, col=1:3, border=NA)
par(mfcol=c(1,3))
clusplot(z, memb2, col.p=memb2)
clusplot(z, memb3, col.p=memb3)
clusplot(z, memb4, col.p=memb4)
library("cluster")
set.seed(42)
cl2 <- pam(z, 4)
k_classic <- names(cl2$clustering[cl2$clustering == 1])
k_techno <- names(cl2$clustering[cl2$clustering == 2])
k_pop <- names(cl2$clustering[cl2$clustering == 3])
k_hiphop <- names(cl2$clustering[cl2$clustering == 4])
cl2$clustering[k_techno] <- 4
cl2$clustering[k_pop] <- 3
cl2$clustering[k_hiphop] <- 2
cl2$clustering[k_classic] <- 1
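# Sanity check for the manual relabelling above (a sketch): cross-tabulate the
# recoded clusters against the Genre factor; each cluster row should be
# dominated by the genre it was named after.
table(cl2$clustering, x$Genre)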
x$cluster <- as.factor(cl2$clustering)
x$Artist_Follower <- x$Artist_Follower/1000000 # rescale followers to millions
x$fa_1 <- x$factor_1
x$fa_2 <- x$factor_2
x$fa_3 <- x$factor_3
x$fa_4 <- x$factor_4
par(mfrow=c(3,3))
nx <- c("fa_1", "fa_2", "fa_3", "fa_4", "pc_1", "pc_2", "Artist_Follower", "days_release_orig", "Track_Popularity")
for (i in 1:length(nx)) {
lmi <- lm(as.formula(paste('Artist_Popularity', nx[i], sep="~")), data=x)
summary(lmi)
plot(x[,nx[i]], x[,'Artist_Popularity'], xlab=nx[i], ylab='Artist Popularity', main=sprintf("R^2=%.2f", summary(lmi)$r.squared))
abline(lmi, col="red")
}
model1 <- lm(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
summary(model1)
Call:
lm(formula = Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 +
pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
Residuals:
Min 1Q Median 3Q Max
-28.4051 -6.6722 0.3102 7.0904 21.0544
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 64.5143111 0.8690218 74.238 < 2e-16 ***
fa_1 -0.0816610 1.1386538 -0.072 0.942904
fa_2 1.8480927 0.7571880 2.441 0.015589 *
fa_3 5.4094057 1.3919701 3.886 0.000141 ***
fa_4 -1.2986520 1.2829029 -1.012 0.312714
pc_1 1.4023625 0.5912068 2.372 0.018706 *
pc_2 7.4542956 0.6570917 11.344 < 2e-16 ***
Artist_Follower 0.8090864 0.1199206 6.747 1.84e-10 ***
days_release_orig -0.0004646 0.0002093 -2.220 0.027631 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9.765 on 187 degrees of freedom
Multiple R-squared: 0.6364, Adjusted R-squared: 0.6208
F-statistic: 40.91 on 8 and 187 DF, p-value: < 2.2e-16
# fa_2: "Youtube"
# fa_3: "Artist content supply"
# fa_4: "Artist long-term activity"
# fa_1: "One hit wonder"
# pc_1 (between classic and Pop, Hip Hop, Techno): strong negative loadings: acousticness and instrumentalness, strong positive loadings: danceability, energy and loudness
# pc_2 (between Techno and Pop, Hip Hop): strong negative loadings: instrumentalness and tempo, strong positive loadings: speechiness and valence
vif(model1) # fa_1, fa_3, fa_4, pc_1 mild collinearity
fa_1 fa_2 fa_3 fa_4 pc_1 pc_2 Artist_Follower days_release_orig
2.181823 1.127092 2.881682 2.202112 3.077405 1.290976 1.276809 1.607455
library("perturb") # condition index
colldiag(model1)
Condition
Index Variance Decomposition Proportions
intercept fa_1 fa_2 fa_3 fa_4 pc_1 pc_2 Artist_Follower days_release_orig
1 1.000 0.002 0.017 0.003 0.030 0.035 0.031 0.000 0.000 0.029
2 1.314 0.106 0.011 0.066 0.002 0.000 0.002 0.071 0.129 0.036
3 1.462 0.003 0.135 0.054 0.013 0.014 0.008 0.093 0.103 0.023
4 1.562 0.203 0.000 0.220 0.006 0.015 0.004 0.141 0.006 0.030
5 1.914 0.002 0.036 0.381 0.041 0.049 0.017 0.373 0.025 0.005
6 2.154 0.035 0.007 0.265 0.055 0.194 0.002 0.031 0.405 0.048
7 2.926 0.635 0.006 0.006 0.000 0.069 0.042 0.099 0.121 0.739
8 3.154 0.000 0.412 0.007 0.169 0.323 0.301 0.085 0.204 0.069
9 3.912 0.015 0.376 0.000 0.683 0.301 0.593 0.107 0.007 0.022
# Drop the variables that were insignificant in model1 (fa_1, fa_4)
model2 <- lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
# summary(model2)
vif(model2)
fa_2 fa_3 pc_1 pc_2 Artist_Follower days_release_orig
1.126103 1.674037 2.021619 1.082232 1.177174 1.510313
select.cols <- c("Artist_Popularity", "fa_2", "fa_3", "pc_1", "pc_2", "Artist_Follower", "days_release_orig")
z <- as.data.frame(scale(dplyr::select(x, select.cols)))
Note: Using an external vector in selections is ambiguous.
i Use `all_of(select.cols)` instead of `select.cols` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
model2_z <- lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = z)
# summary(model2_z)
vif(model2_z)
fa_2 fa_3 pc_1 pc_2 Artist_Follower days_release_orig
1.126103 1.674037 2.021619 1.082232 1.177174 1.510313
model2_z$coefficients^2
(Intercept) fa_2 fa_3 pc_1 pc_2 Artist_Follower days_release_orig
8.251688e-33 1.318360e-02 8.530193e-02 5.212851e-02 3.226773e-01 1.063958e-01 1.819168e-02
# With standardized data, the squared coefficients would sum exactly to the share of
# Var(y) explained by X (here ~60%, leaving ~40% to the residual variance) if and only
# if there were zero multicollinearity. In terms of relevance: pc_2 explains ~32% of
# the variance in y, Artist_Follower ~11%, fa_3 ~9%.
sum(model2_z$coefficients^2)
[1] 0.5978788
sort(round((model2_z$coefficients)^2, digits = 4), decreasing = T)
pc_2 Artist_Follower fa_3 pc_1 days_release_orig fa_2 (Intercept)
0.3227 0.1064 0.0853 0.0521 0.0182 0.0132 0.0000
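# Cross-check (a sketch): with standardized data the squared coefficients add up
# to R^2 exactly only under zero multicollinearity, so the gap between the two
# quantities measures how much the predictors overlap.
summary(model2_z)$r.squared                                 # actual R^2
summary(model2_z)$r.squared - sum(model2_z$coefficients^2)  # discrepancy due to collinearity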
# Quadratic specification seems appropriate:
# variables without and with quadratic terms (fa_2, pc_2, Artist_Follower)
par(mfcol=c(2,3))
lm_fa2 <- lm(Artist_Popularity ~ fa_2, data=x)
plot(x[,'fa_2'], x[,'Artist_Popularity'], xlab='fa_2', main=sprintf("R^2=%.2f", summary(lm_fa2)$r.squared),
ylim = c(min(min(fitted(lm_fa2)), min(x$Artist_Popularity)), max(max(fitted(lm_fa2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
abline(lm_fa2, col="red")
lmq_fa2 <- lm(Artist_Popularity ~ poly(fa_2, 2), data=x)
o <- order(x$fa_2)
plot(x[,'fa_2'], x[,'Artist_Popularity'], xlab= 'fa_2 + fa_2^2', main=sprintf("R^2=%.2f", summary(lmq_fa2)$r.squared),
ylim = c(min(min(fitted(lmq_fa2)), min(x$Artist_Popularity)), max(max(fitted(lmq_fa2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
lines(x$fa_2[o], fitted(lmq_fa2)[o], col="red")
lm_follower <- lm(Artist_Popularity ~ Artist_Follower, data=x)
plot(x[,'Artist_Follower'], x[,'Artist_Popularity'], xlab='Artist_Follower', main=sprintf("R^2=%.2f", summary(lm_follower)$r.squared),
ylim = c(min(min(fitted(lm_follower)), min(x$Artist_Popularity)), max(max(fitted(lm_follower)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
abline(lm_follower, col="red")
lmq_follower <- lm(Artist_Popularity ~ poly(Artist_Follower, 2), data=x)
o <- order(x$Artist_Follower)
plot(x[,'Artist_Follower'], x[,'Artist_Popularity'], xlab= 'Artist_Follower + Artist_Follower^2', main=sprintf("R^2=%.2f", summary(lmq_follower)$r.squared),
ylim = c(min(min(fitted(lmq_follower)), min(x$Artist_Popularity)), max(max(fitted(lmq_follower)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
lines(x$Artist_Follower[o], fitted(lmq_follower)[o], col="red")
lm_pc2 <- lm(Artist_Popularity ~ pc_2, data=x)
plot(x[,'pc_2'], x[,'Artist_Popularity'], xlab='pc_2', main=sprintf("R^2=%.2f", summary(lm_pc2)$r.squared),
ylim = c(min(min(fitted(lm_pc2)), min(x$Artist_Popularity)), max(max(fitted(lm_pc2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
abline(lm_pc2, col="red")
lmq_pc2 <- lm(Artist_Popularity ~ poly(pc_2, 2), data=x)
o <- order(x$pc_2)
plot(x[,'pc_2'], x[,'Artist_Popularity'], xlab= 'pc_2 + pc_2^2', main=sprintf("R^2=%.2f", summary(lmq_pc2)$r.squared),
ylim = c(min(min(fitted(lmq_pc2)), min(x$Artist_Popularity)), max(max(fitted(lmq_pc2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
lines(x$pc_2[o], fitted(lmq_pc2)[o], col="red")
# Residuals normality
ks.test(model1$residuals, "pnorm", mean = mean(model1$residuals), sd=sd(model1$residuals))$statistic
D
0.04757442
1.3581 / sqrt(length(model1$residuals))
[1] 0.09700714
ks.test(model2$residuals, "pnorm", mean = mean(model2$residuals), sd=sd(model2$residuals))$statistic
D
0.05823339
1.3581 / sqrt(length(model2$residuals))
[1] 0.09700714
par(mfcol=c(2,2))
plot(model2)
par(mfcol=c(2,3))
nx <- names(model2$model)
for (i in 2:length(model2$model)) {
plot(model2$model[,i], residuals(model2), xlab=nx[i])
lines(lowess(model2$model[,i], residuals(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)
}
par(mfcol=c(1,1))
plot(model2$fitted.values, residuals(model2))
lines(lowess(model2$fitted.values, residuals(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)
# residuals vs. predictors and vs. fitted values suggest nonlinearity of the regression function
par(mfcol=c(2,3))
nx <- names(model2$model)
for (i in 2:length(model2$model)) {
plot(model2$model[,i], rstandard(model2), xlab=nx[i])
lines(lowess(model2$model[,i], rstandard(model2)), col = "red")
# if the lowess line deviates from 0, the residuals are biased, i.e. their mean is not 0
abline(h = 0, col = "black", lty = 2)
}
# standardized residuals vs. fitted values
par(mfcol=c(1,1))
plot(model2$fitted.values, rstandard(model2), main = 'Standardized residuals vs. fitted values')
lines(lowess(model2$fitted.values, rstandard(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)
# Breusch-Pagan Test + Durbin-Watson Test / spherical errors (i.e. iid)
# model2 uses unscaled data
auxiliary_reg <- lm(model2$residuals^2 ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
summary(auxiliary_reg)
Call:
lm(formula = model2$residuals^2 ~ fa_2 + fa_3 + pc_1 + pc_2 +
Artist_Follower + days_release_orig, data = x)
Residuals:
Min 1Q Median 3Q Max
-209.99 -69.77 -26.75 39.15 712.88
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 92.093388 9.620134 9.573 < 2e-16 ***
fa_2 0.973183 8.480588 0.115 0.90876
fa_3 -29.748317 11.887822 -2.502 0.01318 *
pc_1 -8.915701 5.369201 -1.661 0.09847 .
pc_2 4.904291 6.741245 0.728 0.46782
Artist_Follower 3.868671 1.290221 2.998 0.00308 **
days_release_orig -0.004154 0.002273 -1.828 0.06918 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 109.4 on 189 degrees of freedom
Multiple R-squared: 0.1067, Adjusted R-squared: 0.0783
F-statistic: 3.761 on 6 and 189 DF, p-value: 0.001467
# R^2: 0.1067
# N = nrow(x) = 196
# test stat:
summary(auxiliary_reg)$r.square * nrow(x)
[1] 20.90497
# test stat: 20.90497
p <- ncol(model2$model)-1
qchisq(.95, df = p) # critical value; df = number of regressors p (excluding the constant). Klinke's q = p*(p+3)/2 refers to the White test, whose auxiliary regression adds squares and cross products
[1] 12.59159
# p-value = P(chi^2_6 > 20.90497) = 1 - pchisq(20.90497, df = 6) = 0.001908146 < 0.05: reject H0 of homoskedasticity and conclude there is heteroskedasticity
1-pchisq(20.90497, p)
[1] 0.001908146
library("lmtest")
Loading required package: zoo
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
bptest(model2)
studentized Breusch-Pagan test
data: model2
BP = 20.905, df = 6, p-value = 0.001908
# test statistic n*R^2_aux > X^2_crit = qchisq(.95, df = 6), hence reject H0 and conclude that there is
# heteroskedasticity; constancy of the error variance does not hold --> inference invalidated
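# A related check (a sketch): bptest() also accepts an auxiliary formula, so a
# White-style test can regress the squared residuals on the fitted values and
# their square, picking up heteroskedasticity of unknown functional form.
bptest(model2, ~ fitted(model2) + I(fitted(model2)^2))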
dwtest(model2)
Durbin-Watson test
data: model2
DW = 1.9149, p-value = 0.2323
alternative hypothesis: true autocorrelation is greater than 0
# The Durbin-Watson test is consistent with a diagonal error variance-covariance matrix, i.e. no autocorrelation:
# fail to reject H0: rho = 0 (the alternative hypothesis is positive autocorrelation in the errors)
# outlier detection / leverage
par(mfcol=c(1,2))
plot(hatvalues(model2), pch=19, main="Leverage", cex=0.5)
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=(1:3)*(p+1)/n, col=c("black", "darkred", "red"))
plot(cooks.distance(model2), pch=19, main="Cook's distances",
cex=0.5)
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=4/n, col="red")
outlier_index_leverage <- as.numeric(rownames(x[hatvalues(model2) > 3*(p+1)/n, ]))
outlier_index_final <- outlier_index_leverage
length(outlier_index_leverage)
[1] 9
# There are 9 high-leverage observations that might be influential on the regression
x[outlier_index_leverage, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# Rule of thumb for Cook's distance:
# measures the joint shift of all regression coefficients when the ith observation is excluded
outlier_index_cooksd <- as.numeric(rownames(x[cooks.distance(model2) > 3*(p+1)/n, ]))
length(outlier_index_cooksd)
[1] 2
x[outlier_index_cooksd, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# differences
# SDBETA
SDBETA <- dfbetas(model2)
n <- nrow(model2$model)
par(mfcol=c(2,3))
plot(SDBETA[, 'fa_2'], main="fa_2", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'fa_3'], main="fa_3", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_1'], main="pc_1", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_2'], main="pc_2", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'Artist_Follower'], main="Artist_Follower", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'days_release_orig'], main="days_release_orig", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
# SDFITS
par(mfcol=c(1,1))
SDFITS <- dffits(model2)
plot(SDFITS, pch=19)
abline(h=c(-1,1), col="red")
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=c(-2,2)*sqrt(p/n), col="darkred")
# As this is a small data set, it is sufficient to flag |DFFITS_i| > 1; two observations
# exceed this limit. They are:
outlier_index_sdfits <- as.numeric(names(which(abs(SDFITS) > 1)))
x[outlier_index_sdfits, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# Add quadratic terms for fa_2, Artist_Follower and pc_2 first before excluding outliers!
x$fa_2sq <- x$fa_2^2
x$pc_2sq <- x$pc_2^2
x$Artist_Followersq <- x$Artist_Follower^2
# model2: lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
model3 <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x)
summary(model3)
Call:
lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 +
pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig,
data = x)
Residuals:
Min 1Q Median 3Q Max
-28.3742 -5.6832 0.3771 4.9769 19.6356
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 67.5611323 0.9877844 68.397 < 2e-16 ***
fa_2 6.0858820 1.7276487 3.523 0.000538 ***
fa_2sq -0.5504304 0.1615332 -3.408 0.000803 ***
fa_3 4.4969138 0.9010981 4.990 1.38e-06 ***
pc_1 1.8133635 0.4088899 4.435 1.57e-05 ***
pc_2 5.9511646 0.5374732 11.072 < 2e-16 ***
pc_2sq -2.1818759 0.3886134 -5.615 7.08e-08 ***
Artist_Follower 1.6608731 0.2291803 7.247 1.10e-11 ***
Artist_Followersq -0.0233799 0.0049095 -4.762 3.84e-06 ***
days_release_orig -0.0004777 0.0001710 -2.794 0.005745 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8.206 on 186 degrees of freedom
Multiple R-squared: 0.7446, Adjusted R-squared: 0.7322
F-statistic: 60.25 on 9 and 186 DF, p-value: < 2.2e-16
vif(model3)
fa_2 fa_2sq fa_3 pc_1 pc_2 pc_2sq Artist_Follower Artist_Followersq
8.309381 7.584394 1.710158 2.084608 1.223166 1.286026 6.603869 5.824194
days_release_orig
1.519144
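# The high VIFs for fa_2/fa_2sq and Artist_Follower/Artist_Followersq stem from
# the mechanical correlation between a raw variable and its square. A sketch of
# an equivalent fit with orthogonal polynomials via poly(): the fitted values
# are unchanged, but the decorrelated basis brings the VIFs down.
model3_poly <- lm(Artist_Popularity ~ poly(fa_2, 2) + fa_3 + pc_1 + poly(pc_2, 2) +
                    poly(Artist_Follower, 2) + days_release_orig, data = x)
vif(model3_poly)                               # generalized VIFs per term
all.equal(fitted(model3), fitted(model3_poly)) # same fit, different parameterization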
ks.test(model3$residuals, "pnorm", mean = mean(model3$residuals), sd=sd(model3$residuals))$statistic
D
0.0367905
1.3581 / sqrt(length(model3$residuals))
[1] 0.09700714
par(mfcol=c(2,2))
plot(model3)
par(mfcol=c(3,3))
nx <- names(model3$model)
for (i in 2:length(model3$model)) {
plot(model3$model[,i], residuals(model3), xlab=nx[i])
lines(lowess(model3$model[,i], residuals(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)
}
par(mfcol=c(1,1))
plot(model3$fitted.values, residuals(model3))
lines(lowess(model3$fitted.values, residuals(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)
# residuals vs. predictors and vs. fitted values suggest that accounting for the nonlinearity of the regression function has improved the specification
par(mfcol=c(3,3))
nx <- names(model3$model)
for (i in 2:length(model3$model)) {
plot(model3$model[,i], rstandard(model3), xlab=nx[i])
lines(lowess(model3$model[,i], rstandard(model3)), col = "red")
# if the lowess line deviates from 0, the residuals are biased, i.e. their mean is not 0
abline(h = 0, col = "black", lty = 2)
}
# standardized residuals vs. fitted values
par(mfcol=c(1,1))
plot(model3$fitted.values, rstandard(model3), main = 'Standardized residuals vs. fitted values')
lines(lowess(model3$fitted.values, rstandard(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)
# Breusch-Pagan Test + Durbin-Watson Test / spherical errors (i.e. iid)
# model3 uses unscaled data
auxiliary_reg <- lm(model3$residuals^2 ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x)
summary(auxiliary_reg)
Call:
lm(formula = model3$residuals^2 ~ fa_2 + fa_2sq + fa_3 + pc_1 +
pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig,
data = x)
Residuals:
Min 1Q Median 3Q Max
-98.32 -52.32 -23.00 18.34 719.43
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 78.727326 11.223118 7.015 4.15e-11 ***
fa_2 2.535328 19.629390 0.129 0.8974
fa_2sq -0.474656 1.835326 -0.259 0.7962
fa_3 -24.009503 10.238196 -2.345 0.0201 *
pc_1 -6.105726 4.645771 -1.314 0.1904
pc_2 -9.200191 6.106722 -1.507 0.1336
pc_2sq -4.239332 4.415390 -0.960 0.3382
Artist_Follower -1.901275 2.603926 -0.730 0.4662
Artist_Followersq 0.021242 0.055781 0.381 0.7038
days_release_orig -0.002471 0.001942 -1.272 0.2050
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 93.24 on 186 degrees of freedom
Multiple R-squared: 0.06758, Adjusted R-squared: 0.02246
F-statistic: 1.498 on 9 and 186 DF, p-value: 0.1513
# R^2: 0.06758
# N = nrow(x) = 196
# test stat:
summary(auxiliary_reg)$r.square * nrow(x)
[1] 13.24499
# test stat: 13.24499
p <- ncol(model3$model)-1
qchisq(.95, df = p) # critical value; df = number of regressors p (excluding the constant)
[1] 16.91898
# p-value = P(chi^2_9 > 13.24499) = 1 - pchisq(13.24499, df = 9) = 0.1518304 > 0.05: fail to reject H0 of homoskedasticity, i.e. constant variance of the error term
1-pchisq(13.24499, p)
[1] 0.1518304
library("lmtest")
bptest(model3)
studentized Breusch-Pagan test
data: model3
BP = 13.245, df = 9, p-value = 0.1518
# test statistic n*R^2_aux < X^2_crit = qchisq(.95, df = 9), hence fail to reject H0 and conclude that there is
# no evidence of heteroskedasticity --> inference remains valid
dwtest(model3)
Durbin-Watson test
data: model3
DW = 2.0528, p-value = 0.5856
alternative hypothesis: true autocorrelation is greater than 0
# The Durbin-Watson test is consistent with a diagonal error variance-covariance matrix, i.e. no autocorrelation:
# fail to reject H0: rho = 0 (the alternative hypothesis is positive autocorrelation in the errors)
# outlier detection / leverage
par(mfcol=c(1,1))
plot(hatvalues(model3), pch=19, cex=0.5)
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=(1:3)*(p+1)/n, col=c("black", "darkred", "red"))
plot(cooks.distance(model3), pch=19,
cex=0.5)
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=4/n, col="red")
outlier_index_leverage <- as.numeric(rownames(x[hatvalues(model3) > 3*(p+1)/n, ]))
outlier_index_final <- outlier_index_leverage
length(outlier_index_leverage)
[1] 10
# There are 10 outliers that might be influential on the regression
x[outlier_index_leverage, c("Track_Title","Track_Artist", "Genre", "Artist_Popularity", "Artist_Follower", "viewsCount")]
# Rule of thumb for Cook's distance:
# measures the joint shift of all regression coefficients when the ith observation is excluded
outlier_index_cooksd <- as.numeric(rownames(x[cooks.distance(model3) > 3*(p+1)/n, ]))
length(outlier_index_cooksd)
[1] 2
x[outlier_index_cooksd, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# same outliers detected as in model2 even after modification of the model
# differences
# SDBETA
SDBETA <- dfbetas(model3)
n <- nrow(model3$model)
par(mfcol=c(3,3))
plot(SDBETA[, 'fa_2'], main="fa_2", pch=19, ylab = "fa_2")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'fa_2sq'], main="fa_2sq", pch=19, ylab = "fa_2sq")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'fa_3'], main="fa_3", pch=19, ylab = "fa_3")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_1'], main="pc_1", pch=19, ylab = "pc_1")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_2'], main="pc_2", pch=19, ylab = "pc_2")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_2sq'], main="pc_2sq", pch=19, ylab = "pc_2sq")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'Artist_Follower'], main="Artist_Follower", pch=19, ylab = "Artist_Follower")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'Artist_Followersq'], main="Artist_Followersq", pch=19, ylab = 'Artist_Followersq')
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'days_release_orig'], main="days_release_orig", pch=19, ylab = 'days_release')
abline(h=c(-2,2)/sqrt(n), col="red")
# SDFITS
par(mfcol=c(1,1))
SDFITS <- dffits(model3)
plot(SDFITS, pch=19)
abline(h=c(-1,1), col="red")
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=c(-2,2)*sqrt(p/n), col="darkred")
# As this is a small data set, it is sufficient to flag |DFFITS_i| > 1; two observations
# exceed this limit. They are:
outlier_index_sdfits <- as.numeric(names(which(abs(SDFITS) > 1)))
x[outlier_index_sdfits, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# Now exclude the outliers from leverage analysis
x_noout <- x[-outlier_index_final, ]
lm_final <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x_noout)
summary(lm_final) # result: R^2 is essentially unchanged, adjusted R^2 decreases slightly
Call:
lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 +
pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig,
data = x_noout)
Residuals:
Min 1Q Median 3Q Max
-27.4824 -5.2041 -0.3971 4.9615 19.4089
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 67.5778523 1.1556264 58.477 < 2e-16 ***
fa_2 14.0294272 3.7773779 3.714 0.000274 ***
fa_2sq -7.7026797 2.3813018 -3.235 0.001455 **
fa_3 4.1812469 1.1144664 3.752 0.000238 ***
pc_1 1.5664575 0.4198679 3.731 0.000257 ***
pc_2 5.2986744 0.5665857 9.352 < 2e-16 ***
pc_2sq -1.9828773 0.4235979 -4.681 5.68e-06 ***
Artist_Follower 4.0311406 0.6231803 6.469 9.45e-10 ***
Artist_Followersq -0.1626515 0.0347541 -4.680 5.70e-06 ***
days_release_orig -0.0004396 0.0001751 -2.510 0.012974 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.855 on 176 degrees of freedom
Multiple R-squared: 0.7448, Adjusted R-squared: 0.7317
F-statistic: 57.06 on 9 and 176 DF, p-value: < 2.2e-16
vif(lm_final)
fa_2 fa_2sq fa_3 pc_1 pc_2 pc_2sq Artist_Follower Artist_Followersq
3.847406 3.207554 1.750377 2.249691 1.401325 1.436766 13.792858 12.676491
days_release_orig
1.497164
# library("stargazer")
# stargazer::stargazer(model1, model2, model3, lm_final)
library("boot")
Attaching package: 'boot'
The following object is masked from 'package:robustbase':
salinity
The following object is masked from 'package:lattice':
melanoma
The following object is masked from 'package:car':
logit
The following object is masked from 'package:psych':
logit
ci <- confint(lm_final, level = 0.95)
plot.confint <- function (b, b.boot, ci, main) {
hist(b.boot, main=main); rug(b.boot)
abline(v=b, col="red"); abline(v=ci, col="blue")
abline(v=quantile(b.boot, c(0.025, 0.975)), col="green")
}
# nonparametric bootstrap of the coefficient vector: each replicate refits the
# model on a resampled set of rows selected via subset=index
betahat <- boot(lm_final$model, R=999,
function(x, index) { lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x_noout, subset=index)$coefficients })
par(mfcol=c(2,3))
plot.confint(lm_final$coefficients[1], betahat$t[,1], ci[1,], "Intercept")
plot.confint(lm_final$coefficients[2], betahat$t[,2], ci[2,], "fa_2")
plot.confint(lm_final$coefficients[3], betahat$t[,3], ci[3,], "fa_2sq")
plot.confint(lm_final$coefficients[4], betahat$t[,4], ci[4,], "fa_3")
plot.confint(lm_final$coefficients[5], betahat$t[,5], ci[5,], "pc_1")
plot.confint(lm_final$coefficients[6], betahat$t[,6], ci[6,], "pc_2")
par(mfcol=c(2,2))
plot.confint(lm_final$coefficients[7], betahat$t[,7], ci[7,], "pc_2sq")
plot.confint(lm_final$coefficients[8], betahat$t[,8], ci[8,], "Artist_Follower")
plot.confint(lm_final$coefficients[9], betahat$t[,9], ci[9,], "Artist_Followersq")
plot.confint(lm_final$coefficients[10], betahat$t[,10], ci[10,], "days_release")
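# boot.ci() reports the percentile intervals directly rather than reading them
# off the histograms (a sketch; index selects the coefficient, here the 2nd
# element of the bootstrapped vector, i.e. fa_2).
boot.ci(betahat, type = "perc", index = 2)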
model5 <- lm(Artist_Popularity ~ fa_2 + fa_2sq, data = x)
summary(model5)
Call:
lm(formula = Artist_Popularity ~ fa_2 + fa_2sq, data = x)
Residuals:
Min 1Q Median 3Q Max
-37.506 -10.505 1.289 10.461 36.224
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 66.5658 1.0458 63.649 < 2e-16 ***
fa_2 18.0348 2.7106 6.654 2.88e-10 ***
fa_2sq -1.3864 0.2653 -5.226 4.47e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 14.2 on 193 degrees of freedom
Multiple R-squared: 0.2059, Adjusted R-squared: 0.1977
F-statistic: 25.03 on 2 and 193 DF, p-value: 2.167e-10
vertex <- -model5$coefficients[2] / (2*model5$coefficients[3])
vertex
fa_2
6.504143
(vertex <= max(x$fa_2)) & (vertex >= min(x$fa_2)) # the vertex lies inside the range of fa_2, so there is a genuine quadratic component
fa_2
TRUE
# https://stats.idre.ucla.edu/r/faq/how-can-i-estimate-the-standard-error-of-transformed-regression-parameters-in-r-using-the-delta-method/
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1376936-testing-whether-to-include-a-squared-term
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1344089-how-to-find-the-turning-point-for-a-quadratic-function
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1408413-significance-level-of-quadratic-term
# https://psych.unl.edu/psycrs/statpage/nonlin_eg.pdf ("true" quadratic component vs. skewed predictor)
library("msm") # equivalent to the nlcom function in Stata
# z-value according to the Delta method
vertex / msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5)) # = 12.8421
fa_2
12.8421
# 95% CI as Stata's nlcom would report it (qt with a huge df is effectively qnorm)
vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5))
fa_2
5.511479
vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5))
fa_2
7.496808
n <- nrow(model5$model)
d <- length(model5$coefficients)
df <- n - d
# http://sia.webpopix.org/nonlinearRegression.html
# p-value = P(t_df > z) = 1 - pt(z, df = df) = 3.270659e-10 < 0.05 <-- reject H0 that the vertex equals zero
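# What msm::deltamethod() does under the hood (a sketch, for the vertex
# g(b) = -b2/(2*b3) of model5): linearize g in the coefficients and propagate
# their covariance matrix through the gradient.
b <- coef(model5); V <- vcov(model5)
grad <- c(0, -1/(2*b[3]), b[2]/(2*b[3]^2)) # dg/db for (intercept, fa_2, fa_2sq)
sqrt(t(grad) %*% V %*% grad)               # standard error; matches deltamethod() above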
# vertex for fa_2
vertex <- -model3$coefficients[2] / (2*model3$coefficients[3])
vertex
fa_2
5.528294
(vertex <= max(x$fa_2)) & (vertex >= min(x$fa_2))
fa_2
TRUE
z <- vertex / msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
z
fa_2
8.814632
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
fa_2
4.440892e-16
# p-value = P(t_df > z) = 1 - pt(z, df = df) = 4.440892e-16 < 0.05 <-- reject H0 that the vertex equals zero
# vertex for pc_2
vertex <- -model3$coefficients[6] / (2*model3$coefficients[7])
vertex
pc_2
1.363773
(vertex <= max(x$pc_2)) & (vertex >= min(x$pc_2))
pc_2
TRUE
z <- vertex / msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
z
pc_2
4.536158
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
pc_2
5.125243e-06
# p-value = P(t_df > z) = 1 - pt(z, df = df) = 5.125243e-06 < 0.05 <-- reject H0 that the vertex equals zero
# vertex for Artist_Follower
vertex <- -model3$coefficients[8] / (2*model3$coefficients[9])
vertex
Artist_Follower
35.51927
(vertex <= max(x$Artist_Follower)) & (vertex >= min(x$Artist_Follower))
Artist_Follower
TRUE
z <- vertex / msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
z
Artist_Follower
9.652667
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
Artist_Follower
0
#plot(Artist_Follower_trans, x$Artist_Popularity)
#min_max_normalize <- function(x)
#{
# return( (10-1)*((x- min(x)) /(max(x)-min(x))) + 1)
#}
#min_fa_2 <- optimize(ksD, c(-10,10), x=min_max_normalize(x$pc_2))$minimum
#fa_2bc <- bcPower(min_max_normalize(x$pc_2), min_fa_2)
#fa_2bcsqrr <- fa_2bc ** 0.5
#fa_2bcsqrrcenter <- fa_2bcsqrr - mean(fa_2bcsqrr)
#fa_2bcsqrrcentersq <- fa_2bcsqrrcenter ** 2
#cor(cbind(x$Artist_Popularity, fa_2bcsqrrcenter, fa_2bcsqrrcentersq))
#plot(x$fa_2, x$Artist_Popularity)
#plot(x$pc_2, x$Artist_Popularity)
# Stepwise variable selection
# forward, starting from the intercept-only baseline model:
lmi <- lm(Artist_Popularity~1, data=lm_final$model)
lmf <- MASS::stepAIC(lmi, scope=~
fa_2
+ fa_2sq
+ fa_3
+ x_noout[,'fa_4']
+ pc_1
+ pc_2
+ pc_2sq
+ Artist_Follower
+ Artist_Followersq
+ days_release_orig
+ x_noout[, 'fa_1']
+ x_noout[, 'Track_Duration_ms']
+ x_noout[, 'Track_Popularity']
,direction = "forward")
Start: AIC=1012.44
Artist_Popularity ~ 1
Df Sum of Sq RSS AIC
+ pc_2 1 17048.4 25493 919.20
+ x_noout[, "Track_Duration_ms"] 1 12472.7 30069 949.90
+ Artist_Follower 1 11518.9 31023 955.71
+ pc_2sq 1 10312.7 32229 962.81
+ fa_2 1 8694.3 33848 971.92
+ x_noout[, "Track_Popularity"] 1 7421.3 35120 978.79
+ Artist_Followersq 1 6625.6 35916 982.95
+ pc_1 1 1481.0 41061 1007.85
+ fa_2sq 1 1418.9 41123 1008.13
<none> 42542 1012.44
+ days_release_orig 1 414.5 42127 1012.62
+ x_noout[, "fa_4"] 1 398.2 42144 1012.69
+ x_noout[, "fa_1"] 1 176.7 42365 1013.67
+ fa_3 1 133.5 42408 1013.86
Step: AIC=919.2
Artist_Popularity ~ pc_2
Df Sum of Sq RSS AIC
+ Artist_Follower 1 7669.0 17824 854.64
+ Artist_Followersq 1 4882.1 20611 881.66
+ x_noout[, "Track_Duration_ms"] 1 4480.6 21013 885.25
+ fa_2 1 4209.3 21284 887.63
+ x_noout[, "Track_Popularity"] 1 3579.8 21914 893.06
+ pc_2sq 1 2528.4 22965 901.77
+ days_release_orig 1 2251.5 23242 904.00
+ pc_1 1 1731.7 23762 908.12
+ x_noout[, "fa_4"] 1 1088.3 24405 913.09
+ fa_2sq 1 856.0 24637 914.85
+ x_noout[, "fa_1"] 1 736.0 24757 915.75
<none> 25493 919.20
+ fa_3 1 25.1 25468 921.02
Step: AIC=854.64
Artist_Popularity ~ pc_2 + Artist_Follower
Df Sum of Sq RSS AIC
+ x_noout[, "Track_Popularity"] 1 3198.4 14626 819.85
+ x_noout[, "Track_Duration_ms"] 1 2853.7 14971 824.19
+ Artist_Followersq 1 2136.9 15688 832.89
+ pc_2sq 1 1360.1 16464 841.88
+ days_release_orig 1 1225.0 16599 843.40
+ fa_2 1 1157.3 16667 844.15
+ x_noout[, "fa_4"] 1 1050.6 16774 845.34
+ pc_1 1 843.0 16981 847.63
<none> 17824 854.64
+ fa_2sq 1 96.1 17728 855.63
+ fa_3 1 71.3 17753 855.89
+ x_noout[, "fa_1"] 1 19.1 17805 856.44
Step: AIC=819.85
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"]
Df Sum of Sq RSS AIC
+ x_noout[, "Track_Duration_ms"] 1 1841.18 12785 796.83
+ Artist_Followersq 1 1806.26 12820 797.34
+ pc_2sq 1 1150.74 13475 806.61
+ days_release_orig 1 501.06 14125 815.37
+ pc_1 1 417.59 14208 816.47
+ x_noout[, "fa_4"] 1 282.55 14343 818.23
+ fa_3 1 198.73 14427 819.31
<none> 14626 819.85
+ fa_2 1 142.21 14484 820.04
+ x_noout[, "fa_1"] 1 23.05 14603 821.56
+ fa_2sq 1 7.91 14618 821.75
Step: AIC=796.83
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"]
Df Sum of Sq RSS AIC
+ Artist_Followersq 1 1508.10 11277 775.48
+ pc_2sq 1 1241.24 11544 779.83
+ fa_3 1 664.00 12121 788.91
+ x_noout[, "fa_1"] 1 147.88 12637 796.67
<none> 12785 796.83
+ fa_2 1 83.00 12702 797.62
+ days_release_orig 1 71.00 12714 797.79
+ pc_1 1 6.16 12779 798.74
+ x_noout[, "fa_4"] 1 5.55 12779 798.75
+ fa_2sq 1 4.04 12781 798.77
Step: AIC=775.48
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"] + Artist_Followersq
Df Sum of Sq RSS AIC
+ pc_2sq 1 1076.83 10200 758.82
+ fa_3 1 598.01 10679 767.35
+ fa_2sq 1 185.08 11092 774.40
<none> 11277 775.48
+ x_noout[, "fa_1"] 1 84.76 11192 776.08
+ days_release_orig 1 36.02 11241 776.89
+ x_noout[, "fa_4"] 1 5.24 11272 777.40
+ pc_1 1 3.80 11273 777.42
+ fa_2 1 0.27 11276 777.48
Step: AIC=758.82
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq
Df Sum of Sq RSS AIC
+ fa_3 1 250.781 9949.1 756.19
+ fa_2sq 1 177.173 10022.7 757.56
+ days_release_orig 1 167.813 10032.1 757.73
<none> 10199.9 758.82
+ pc_1 1 98.887 10101.0 759.00
+ x_noout[, "fa_1"] 1 81.693 10118.2 759.32
+ x_noout[, "fa_4"] 1 28.307 10171.6 760.30
+ fa_2 1 0.969 10198.9 760.80
Step: AIC=756.19
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
fa_3
Df Sum of Sq RSS AIC
+ pc_1 1 546.47 9402.6 747.68
+ days_release_orig 1 302.11 9647.0 752.45
+ x_noout[, "fa_4"] 1 190.84 9758.3 754.58
+ fa_2sq 1 159.29 9789.8 755.18
<none> 9949.1 756.19
+ x_noout[, "fa_1"] 1 1.79 9947.3 758.15
+ fa_2 1 0.74 9948.4 758.17
Step: AIC=747.68
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
fa_3 + pc_1
Df Sum of Sq RSS AIC
+ fa_2sq 1 156.732 9245.9 746.55
+ days_release_orig 1 117.363 9285.3 747.34
<none> 9402.6 747.68
+ x_noout[, "fa_1"] 1 25.394 9377.2 749.17
+ x_noout[, "fa_4"] 1 4.049 9398.6 749.60
+ fa_2 1 2.462 9400.2 749.63
Step: AIC=746.55
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
fa_3 + pc_1 + fa_2sq
Df Sum of Sq RSS AIC
+ fa_2 1 191.308 9054.6 744.66
+ days_release_orig 1 105.159 9140.7 746.42
<none> 9245.9 746.55
+ x_noout[, "fa_1"] 1 37.651 9208.2 747.79
+ x_noout[, "fa_4"] 1 0.048 9245.8 748.55
Step: AIC=744.66
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
fa_3 + pc_1 + fa_2sq + fa_2
Df Sum of Sq RSS AIC
+ days_release_orig 1 109.025 8945.6 744.41
<none> 9054.6 744.66
+ x_noout[, "fa_1"] 1 36.249 9018.3 745.92
+ x_noout[, "fa_4"] 1 0.008 9054.6 746.66
Step: AIC=744.41
Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
fa_3 + pc_1 + fa_2sq + fa_2 + days_release_orig
Df Sum of Sq RSS AIC
<none> 8945.6 744.41
+ x_noout[, "fa_1"] 1 49.184 8896.4 745.38
+ x_noout[, "fa_4"] 1 6.072 8939.5 746.28
summary(lmf)
Call:
lm(formula = Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[,
"Track_Popularity"] + x_noout[, "Track_Duration_ms"] + Artist_Followersq +
pc_2sq + fa_3 + pc_1 + fa_2sq + fa_2 + days_release_orig,
data = lm_final$model)
Residuals:
Min 1Q Median 3Q Max
-23.0380 -5.2954 0.3417 4.5413 17.3011
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.987e+01 3.045e+00 19.664 < 2e-16 ***
pc_2 4.357e+00 5.531e-01 7.877 3.48e-13 ***
Artist_Follower 4.143e+00 5.731e-01 7.229 1.48e-11 ***
x_noout[, "Track_Popularity"] 1.936e-01 4.086e-02 4.737 4.48e-06 ***
x_noout[, "Track_Duration_ms"] -1.528e-05 4.591e-06 -3.329 0.001065 **
Artist_Followersq -1.614e-01 3.182e-02 -5.071 1.01e-06 ***
pc_2sq -1.667e+00 3.918e-01 -4.255 3.41e-05 ***
fa_3 3.769e+00 1.024e+00 3.682 0.000308 ***
pc_1 9.810e-01 4.271e-01 2.297 0.022806 *
fa_2sq -5.659e+00 2.205e+00 -2.566 0.011127 *
fa_2 7.210e+00 3.700e+00 1.948 0.052974 .
days_release_orig -2.378e-04 1.633e-04 -1.456 0.147127
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.17 on 174 degrees of freedom
Multiple R-squared: 0.7897, Adjusted R-squared: 0.7764
F-statistic: 59.41 on 11 and 174 DF, p-value: < 2.2e-16
# backward:
lma <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 + pc_2 + pc_2sq + Artist_Follower
+ Artist_Followersq + fa_1 + Track_Duration_ms
+ days_release_orig + Track_Popularity,
data= x_noout)
lmb <- MASS::stepAIC(lma)
Start: AIC=747.24
Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 + pc_2 +
pc_2sq + Artist_Follower + Artist_Followersq + fa_1 + Track_Duration_ms +
days_release_orig + Track_Popularity
Df Sum of Sq RSS AIC
- fa_4 1 6.65 8896.4 745.38
- fa_1 1 49.76 8939.5 746.28
<none> 8889.7 747.24
- days_release_orig 1 128.61 9018.3 747.92
- fa_2 1 195.49 9085.2 749.29
- pc_1 1 271.47 9161.2 750.84
- fa_2sq 1 355.61 9245.3 752.54
- Track_Duration_ms 1 536.66 9426.4 756.15
- fa_3 1 615.81 9505.5 757.70
- pc_2sq 1 830.66 9720.4 761.86
- Track_Popularity 1 1166.70 10056.4 768.18
- Artist_Followersq 1 1363.69 10253.4 771.79
- pc_2 1 2572.29 11462.0 792.51
- Artist_Follower 1 2719.78 11609.5 794.89
Step: AIC=745.38
Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq +
Artist_Follower + Artist_Followersq + fa_1 + Track_Duration_ms +
days_release_orig + Track_Popularity
Df Sum of Sq RSS AIC
- fa_1 1 49.18 8945.6 744.41
<none> 8896.4 745.38
- days_release_orig 1 121.96 9018.3 745.92
- fa_2 1 193.77 9090.1 747.39
- pc_1 1 295.87 9192.2 749.47
- fa_2sq 1 349.34 9245.7 750.55
- Track_Duration_ms 1 530.56 9426.9 754.16
- fa_3 1 632.49 9528.9 756.16
- pc_2sq 1 828.02 9724.4 759.94
- Track_Popularity 1 1183.07 10079.4 766.61
- Artist_Followersq 1 1358.68 10255.0 769.82
- pc_2 1 2647.56 11543.9 791.84
- Artist_Follower 1 2723.31 11619.7 793.06
Step: AIC=744.41
Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq +
Artist_Follower + Artist_Followersq + Track_Duration_ms +
days_release_orig + Track_Popularity
Df Sum of Sq RSS AIC
<none> 8945.6 744.41
- days_release_orig 1 109.0 9054.6 744.66
- fa_2 1 195.2 9140.7 746.42
- pc_1 1 271.3 9216.8 747.97
- fa_2sq 1 338.5 9284.1 749.32
- Track_Duration_ms 1 569.7 9515.2 753.89
- fa_3 1 697.0 9642.6 756.37
- pc_2sq 1 930.7 9876.3 760.82
- Track_Popularity 1 1153.8 10099.4 764.97
- Artist_Followersq 1 1321.8 10267.4 768.04
- Artist_Follower 1 2686.6 11632.1 791.26
- pc_2 1 3190.0 12135.5 799.13
summary(lmb)
Call:
lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 +
pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + Track_Duration_ms +
days_release_orig + Track_Popularity, data = x_noout)
Residuals:
Min 1Q Median 3Q Max
-23.0380 -5.2954 0.3417 4.5413 17.3011
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.987e+01 3.045e+00 19.664 < 2e-16 ***
fa_2 7.210e+00 3.700e+00 1.948 0.052974 .
fa_2sq -5.659e+00 2.205e+00 -2.566 0.011127 *
fa_3 3.769e+00 1.024e+00 3.682 0.000308 ***
pc_1 9.810e-01 4.271e-01 2.297 0.022806 *
pc_2 4.357e+00 5.531e-01 7.877 3.48e-13 ***
pc_2sq -1.667e+00 3.918e-01 -4.255 3.41e-05 ***
Artist_Follower 4.143e+00 5.731e-01 7.229 1.48e-11 ***
Artist_Followersq -1.614e-01 3.182e-02 -5.071 1.01e-06 ***
Track_Duration_ms -1.528e-05 4.591e-06 -3.329 0.001065 **
days_release_orig -2.378e-04 1.633e-04 -1.456 0.147127
Track_Popularity 1.936e-01 4.086e-02 4.737 4.48e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.17 on 174 degrees of freedom
Multiple R-squared: 0.7897, Adjusted R-squared: 0.7764
F-statistic: 59.41 on 11 and 174 DF, p-value: < 2.2e-16
# Regularization
sel.var <- c("fa_1", "fa_2", "fa_2sq", "fa_3", "fa_4", "pc_1", "pc_2", "pc_2sq", "Artist_Follower", "Artist_Followersq", "Artist_Popularity", "days_release_orig", "Track_Popularity", "Track_Duration_ms")
df <- dplyr::select(x_noout, sel.var)
Note: Using an external vector in selections is ambiguous.
i Use `all_of(sel.var)` instead of `sel.var` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
lmfull <- lm(Artist_Popularity~., data=df)
lmback <- step(lmfull)
Start: AIC=747.24
Artist_Popularity ~ fa_1 + fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 +
pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig +
Track_Popularity + Track_Duration_ms
Df Sum of Sq RSS AIC
- fa_4 1 6.65 8896.4 745.38
- fa_1 1 49.76 8939.5 746.28
<none> 8889.7 747.24
- days_release_orig 1 128.61 9018.3 747.92
- fa_2 1 195.49 9085.2 749.29
- pc_1 1 271.47 9161.2 750.84
- fa_2sq 1 355.61 9245.3 752.54
- Track_Duration_ms 1 536.66 9426.4 756.15
- fa_3 1 615.81 9505.5 757.70
- pc_2sq 1 830.66 9720.4 761.86
- Track_Popularity 1 1166.70 10056.4 768.18
- Artist_Followersq 1 1363.69 10253.4 771.79
- pc_2 1 2572.29 11462.0 792.51
- Artist_Follower 1 2719.78 11609.5 794.89
Step: AIC=745.38
Artist_Popularity ~ fa_1 + fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 +
pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig +
Track_Popularity + Track_Duration_ms
Df Sum of Sq RSS AIC
- fa_1 1 49.18 8945.6 744.41
<none> 8896.4 745.38
- days_release_orig 1 121.96 9018.3 745.92
- fa_2 1 193.77 9090.1 747.39
- pc_1 1 295.87 9192.2 749.47
- fa_2sq 1 349.34 9245.7 750.55
- Track_Duration_ms 1 530.56 9426.9 754.16
- fa_3 1 632.49 9528.9 756.16
- pc_2sq 1 828.02 9724.4 759.94
- Track_Popularity 1 1183.07 10079.4 766.61
- Artist_Followersq 1 1358.68 10255.0 769.82
- pc_2 1 2647.56 11543.9 791.84
- Artist_Follower 1 2723.31 11619.7 793.06
Step: AIC=744.41
Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq +
Artist_Follower + Artist_Followersq + days_release_orig +
Track_Popularity + Track_Duration_ms
Df Sum of Sq RSS AIC
<none> 8945.6 744.41
- days_release_orig 1 109.0 9054.6 744.66
- fa_2 1 195.2 9140.7 746.42
- pc_1 1 271.3 9216.8 747.97
- fa_2sq 1 338.5 9284.1 749.32
- Track_Duration_ms 1 569.7 9515.2 753.89
- fa_3 1 697.0 9642.6 756.37
- pc_2sq 1 930.7 9876.3 760.82
- Track_Popularity 1 1153.8 10099.4 764.97
- Artist_Followersq 1 1321.8 10267.4 768.04
- Artist_Follower 1 2686.6 11632.1 791.26
- pc_2 1 3190.0 12135.5 799.13
library("glmnet")
Loading required package: Matrix
Loaded glmnet 3.0-2
xl <- as.matrix(df[,-11]) # predictors (column 11 is Artist_Popularity)
yl <- df[,11]             # response
set.seed(42)
lmlasso <- cv.glmnet(xl, yl, alpha =1) # cross-validation finds the value of lambda which minimizes MSE, alpha = 1 for LASSO
plot(lmlasso) # displays the bias-variance trade-off (MSE = variance + bias^2)
best.lambda <- lmlasso$lambda.min
lmlasso$lambda.1se
[1] 0.3361553
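# lambda.1se is the largest lambda whose CV error lies within one standard
# error of the minimum; both choices appear as the dashed vertical lines in
# plot(lmlasso) above. Sketch:
# log(c(min = best.lambda, one_se = lmlasso$lambda.1se))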
# Collect the coefficients of the LASSO fits (lambda.min, lambda.1se), the
# backward-selected model, and the full model into one comparison table.
cl <- list(coef(lmlasso, s="lambda.min")[,1], coef(lmlasso, s="lambda.1se")[,1], coef(lmback), coef(lmfull))
cf <- matrix(NA, nrow=dim(xl)[2]+1, ncol=length(cl))
row.names(cf) <- names(lmfull$coefficients)
i <- 1
for (ci in cl) {
cf[names(ci),i] <- ci[names(ci)]
i <- i+1
}
colnames(cf) <- c("min", "1se", "back", "full")
cf[cf==0] <- NA
cf
min 1se back full
(Intercept) 6.068034e+01 6.114886e+01 5.987096e+01 5.933345e+01
fa_1 NA NA NA -8.893245e-01
fa_2 4.332808e+00 2.626698e+00 7.209531e+00 7.217918e+00
fa_2sq -2.462341e+00 -5.763024e-01 -5.658801e+00 -5.837164e+00
fa_3 3.059559e+00 2.638256e+00 3.769373e+00 4.505508e+00
fa_4 NA NA NA 3.733388e-01
pc_1 7.606562e-01 6.300647e-01 9.809889e-01 1.111732e+00
pc_2 4.498899e+00 4.581303e+00 4.356624e+00 4.151896e+00
pc_2sq -1.670583e+00 -1.671870e+00 -1.667241e+00 -1.600879e+00
Artist_Follower 2.622179e+00 1.731344e+00 4.143098e+00 4.255461e+00
Artist_Followersq -7.241574e-02 -2.026752e-02 -1.613611e-01 -1.647535e-01
days_release_orig -2.304611e-04 -2.257810e-04 -2.377630e-04 -2.659810e-04
Track_Popularity 1.893293e-01 1.868568e-01 1.935870e-01 1.997832e-01
Track_Duration_ms -1.707089e-05 -1.812532e-05 -1.528159e-05 -1.495624e-05
# Variable importance via absolute coefficient size. Caution: glmnet reports
# coefficients on the original scale, so this ranking mixes units (e.g.
# Track_Duration_ms is in ms) and is only indicative.
glmmod <- glmnet(xl, yl, lambda = best.lambda)
coefs <- coef(glmmod)[,1]
coefs <- sort(abs(coefs), decreasing = TRUE)
coefs
(Intercept) pc_2 fa_2 fa_3 Artist_Follower fa_2sq pc_2sq pc_1
6.068265e+01 4.498154e+00 4.340450e+00 3.059708e+00 2.623252e+00 2.466486e+00 1.670685e+00 7.606794e-01
Track_Popularity Artist_Followersq days_release_orig Track_Duration_ms fa_1 fa_4
1.892949e-01 7.248197e-02 2.303984e-04 1.707017e-05 0.000000e+00 0.000000e+00
coef(lmlasso, s="lambda.1se")
14 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 6.114886e+01
fa_1 .
fa_2 2.626698e+00
fa_2sq -5.763024e-01
fa_3 2.638256e+00
fa_4 .
pc_1 6.300647e-01
pc_2 4.581303e+00
pc_2sq -1.671870e+00
Artist_Follower 1.731344e+00
Artist_Followersq -2.026752e-02
days_release_orig -2.257810e-04
Track_Popularity 1.868568e-01
Track_Duration_ms -1.812532e-05
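# Note how both LASSO fits zero out exactly fa_1 and fa_4, the same two terms
# that backward elimination removed above.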
######################
# Performance metrics - full data
performace_metrics <- matrix(nrow = 10, ncol = 6)
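# RMSE() and R2() are caret helpers (caret is attached further below; load it
# here if running the script top to bottom). Equivalent by hand, as a sketch:
# sqrt(mean((predict(lmfull, df) - df$Artist_Popularity)^2))  # RMSE
# cor(predict(lmfull, df), df$Artist_Popularity)^2            # R2 (caret default: squared correlation)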
RMSE(predict(lmfull, newdata = df), df$Artist_Popularity)
[1] 6.913336
R2(predict(lmfull, newdata = df), df$Artist_Popularity)
[1] 0.7910356
performace_metrics[1,1] <- RMSE(predict(lmfull, newdata = df), df$Artist_Popularity)
performace_metrics[1,2] <- R2(predict(lmfull, newdata = df), df$Artist_Popularity)
# x_model <- model.matrix(Artist_Popularity~., x_noout)
library("glmnet")
"Ridge regression"
[1] "Ridge regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0)$lambda.min
ridge.cv <- glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0, lambda = best.lambda)
# as.numeric(coef(ridge.cv)[,1]) %*% as.numeric(x_model[1,]) equivalent
# predict(ridge.cv, newx = as.matrix(x_noout[1,-8])) equivalent
RMSE(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
[1] 7.262922
R2(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
s0
[1,] 0.7710931
performace_metrics[2,1] <- RMSE(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
performace_metrics[2,2] <- R2(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
"LASSO regression"
[1] "LASSO regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 1)$lambda.min
lasso.cv <- glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 1, lambda = best.lambda)
RMSE(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
[1] 7.135634
R2(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
s0
[1,] 0.77876
performace_metrics[3,1] <- RMSE(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
performace_metrics[3,2] <- R2(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
"Elasticnet regression"
[1] "Elasticnet regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0.5)$lambda.min
elasticnet.cv <- glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0.5, lambda = best.lambda)
RMSE(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
[1] 7.17704
R2(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
s0
[1,] 0.7762593
performace_metrics[4,1] <- RMSE(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
performace_metrics[4,2] <- R2(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
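# Elastic net with alpha = 0.5 mixes both penalties:
# lambda * ((1 - alpha)/2 * ||beta||_2^2 + alpha * ||beta||_1).
# Sketch: count how many predictors each penalty keeps.
# sum(as.numeric(coef(ridge.cv))[-1] != 0)       # ridge shrinks but keeps all
# sum(as.numeric(coef(lasso.cv))[-1] != 0)       # LASSO sets some to zero
# sum(as.numeric(coef(elasticnet.cv))[-1] != 0)  # elastic net typically in between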
x_full <- model.matrix(Artist_Popularity~., df)[,-1] # drop the intercept column
ridge_res2 <- as.numeric(df$Artist_Popularity - predict(ridge.cv, x_full))^2
lasso_res2 <- as.numeric(df$Artist_Popularity - predict(lasso.cv, x_full))^2
elasticnet_res2 <- as.numeric(df$Artist_Popularity - predict(elasticnet.cv, x_full))^2
par(mfcol=c(2,2))
barplot((df$Artist_Popularity - predict(lmfull, df))^2)
Axis(side=2)
title(main = paste("LM (full sample): RMSE =",round(RMSE(predict(lmfull, df), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lmfull, df), df$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (full sample): RMSE =",round(RMSE(predict(ridge.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_full), df$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (full sample): RMSE =",round(RMSE(predict(lasso.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_full), df$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (full sample): RMSE =",round(RMSE(predict(elasticnet.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_full), df$Artist_Popularity), digits = 2)))
library("caret")
# Splitting in train and test data
set.seed(42)
idx.train <- createDataPartition(y = df$Artist_Popularity, p = 0.75, list = FALSE)
train <- df[idx.train, ]
test <- df[-idx.train,]
lm_train <- lm(Artist_Popularity~., data=train)
RMSE(predict(lm_train, newdata = train), train$Artist_Popularity)
[1] 6.914704
R2(predict(lm_train, newdata = train), train$Artist_Popularity)
[1] 0.799549
performace_metrics[1,3] <- RMSE(predict(lm_train, newdata = train), train$Artist_Popularity)
performace_metrics[1,4] <- R2(predict(lm_train, newdata = train), train$Artist_Popularity)
RMSE(predict(lm_train, newdata = test), test$Artist_Popularity)
[1] 7.138894
R2(predict(lm_train, newdata = test), test$Artist_Popularity)
[1] 0.7464997
performace_metrics[1,5] <- RMSE(predict(lm_train, newdata = test), test$Artist_Popularity)
performace_metrics[1,6] <- R2(predict(lm_train, newdata = test), test$Artist_Popularity)
"Ridge regression"
[1] "Ridge regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0)$lambda.min
ridge.cv <- glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0, lambda = best.lambda)
# as.numeric(coef(ridge.cv)[,1]) %*% as.numeric(x_model[1,]) equivalent
# predict(ridge.cv, newx = as.matrix(x_noout[1,-8])) equivalent
RMSE(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
[1] 7.387105
R2(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
s0
[1,] 0.7743463
performace_metrics[2,3] <- RMSE(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
performace_metrics[2,4] <- R2(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
RMSE(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
[1] 8.151663
R2(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
s0
[1,] 0.6838742
performace_metrics[2,5] <- RMSE(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
performace_metrics[2,6] <- R2(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
"LASSO regression"
[1] "LASSO regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 1)$lambda.min
lasso.cv <- glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 1, lambda = best.lambda)
RMSE(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
[1] 7.055445
R2(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
s0
[1,] 0.7920884
performace_metrics[3,3] <- RMSE(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
performace_metrics[3,4] <- R2(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
RMSE(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
[1] 7.569551
R2(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
s0
[1,] 0.7215951
performace_metrics[3,5] <- RMSE(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
performace_metrics[3,6] <- R2(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
"Elasticnet regression"
[1] "Elasticnet regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0.5)$lambda.min
elasticnet.cv <- glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0.5, lambda = best.lambda)
RMSE(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
[1] 7.093421
R2(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
s0
[1,] 0.7898949
performace_metrics[4,3] <- RMSE(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
performace_metrics[4,4] <- R2(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
RMSE(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
[1] 7.629058
R2(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
s0
[1,] 0.7179827
performace_metrics[4,5] <- RMSE(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
performace_metrics[4,6] <- R2(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
# Graphics
# Training data
x_train <- model.matrix(Artist_Popularity~., train)[,-1]
ridge_res2 <- as.numeric(train$Artist_Popularity - predict(ridge.cv, x_train))^2
lasso_res2 <- as.numeric(train$Artist_Popularity - predict(lasso.cv, x_train))^2
elasticnet_res2 <- as.numeric(train$Artist_Popularity - predict(elasticnet.cv, x_train))^2
par(mfcol=c(2,2))
barplot((train$Artist_Popularity - predict(lm_train, train))^2, main = paste("LM (train): RMSE =",round(RMSE(predict(lm_train, train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lm_train, train), train$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (train): RMSE =",round(RMSE(predict(ridge.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_train), train$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (train): RMSE =",round(RMSE(predict(lasso.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_train), train$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (train): RMSE =",round(RMSE(predict(elasticnet.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_train), train$Artist_Popularity), digits = 2)))
# Test data
x_test <- model.matrix(Artist_Popularity~., test)[,-1]
ridge_res2 <- as.numeric(test$Artist_Popularity - predict(ridge.cv, x_test))^2
lasso_res2 <- as.numeric(test$Artist_Popularity - predict(lasso.cv, x_test))^2
elasticnet_res2 <- as.numeric(test$Artist_Popularity - predict(elasticnet.cv, x_test))^2
par(mfcol=c(2,2))
barplot((test$Artist_Popularity - predict(lm_train, test))^2, main = paste("LM (test): RMSE =",round(RMSE(predict(lm_train, test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lm_train, test), test$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (test): RMSE =",round(RMSE(predict(ridge.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_test), test$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (test): RMSE =",round(RMSE(predict(lasso.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_test), test$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (test): RMSE =",round(RMSE(predict(elasticnet.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_test), test$Artist_Popularity), digits = 2)))
# Non-parametric and semiparametric regression
# Kernel density estimator
library("np")
bw <- npudensbw(~Track_Duration_ms, data=df) # Track_Duration_ms bi-modal?
fhat <- npudens(bw)
fhat
Density Data: 186 training points, in 1 variable(s)
Track_Duration_ms
Bandwidth(s): 25427.78
Bandwidth Type: Fixed
Log Likelihood: -2404.132
Continuous Kernel Type: Second-Order Gaussian
No. Continuous Vars.: 1
plot(fhat, main=sprintf("%s with h=%.2f", fhat$pckertype, fhat$bw))
rug(df$Track_Duration_ms)
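# Sketch: compare the cross-validated bandwidth above with Silverman's rule
# of thumb from base R; a clearly smaller CV bandwidth hints that the density
# has more structure (e.g. a second mode) than a normal shape would.
# bw.nrd0(df$Track_Duration_ms)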
bw <- npudensbw(~Artist_Popularity+Track_Duration_ms, data=df, bwmethod="normal-reference")
fhat <- npudens(bw)
# plot(fhat) # bi-modal
par(mfcol=c(1,1))
# 1) mode at Track_Duration_ms ~ 1 and Artist_Popularity ~ -1 (i.e. longer tracks tend to come from less popular artists)
plot(fhat, view="fixed", phi=10, theta=320, main = "")
par(mfcol=c(1,1))
# 2) mode at Track_Duration_ms ~ -0.5 and Artist_Popularity ~ 1
plot(fhat, view="fixed", phi=10, theta=155, main = "")
par(mfcol=c(1,1))
nw2 <- npreg(Artist_Popularity~pc_2+Track_Duration_ms, data=df)
summary(nw2)
Regression Data: 186 training points, in 2 variable(s)
pc_2 Track_Duration_ms
Bandwidth(s): 0.6558287 64051.76
Kernel Regression Estimator: Local-Constant
Bandwidth Type: Fixed
Residual standard error: 9.136543
R-squared: 0.6351856
Continuous Kernel Type: Second-Order Gaussian
No. Continuous Explanatory Vars.: 2
# plot(nw2)
# Nadaraya-Watson estimator
# bw <- npregbw(Artist_Popularity~Track_Duration_ms, data=df)
# mhat <- npreg(bw)
# main <- sprintf("%s with h=%.2f", mhat$pckertype, mhat$bw)
# plot(df$Artist_Popularity, df$Track_Duration_ms, pch=19, cex=0.3, main=main)
# ind <- order(df$Track_Duration_ms)
# xs <- cbind(df$Track_Duration_ms, fitted(mhat))[ind,]
# lines(xs[,1], xs[,2], col="red", lwd=2)
# rug(df$Track_Duration_ms)
# bw <- npregbw(Artist_Popularity~Track_Duration_ms+Track_Popularity, data=df)
# mhat <- npreg(bw)
# plot(mhat)
plotContour <- function (model, data, n=30) {
  # Evaluate the model on a grid spanning its two predictors and draw the
  # prediction surface as contours, with observations shaded by the response.
  mf <- model.frame(model$terms, data)
  mc <- lapply(as.list(mf[,-1]), pretty, n=n)   # grid values per predictor
  zc <- predict(model, expand.grid(mc))         # predictions on the full grid
  dim(zc) <- sapply(mc, length)
  r2 <- 1-var(residuals(model))/var(mf[,1])     # in-sample R^2 for the title
  contour(mc[[1]], mc[[2]], zc, xlab=names(mf)[2], ylab=names(mf)[3],
          main=sprintf("R^2=%.3f", r2))
  cc <- gray(0.75-0.75*(mf[,1]-min(mf[,1]))/(max(mf[,1])-min(mf[,1])))  # darker = larger response
  points(mf[,2], mf[,3], pch=19, cex=0.5, col=cc)
}
model <- lm(Artist_Popularity~pc_1 + pc_2, data=df)
par(mfrow=c(1,1))
plotContour(model, df)
# Partial linear model (mixing linear and spline terms)
library("mgcv")
model <- gam(Artist_Popularity~s(Track_Popularity)+Track_Duration_ms, data=df)
# plotContour(model, df$Track_Popularity, df$Track_Duration_ms, df$Artist_Popularity)
# plot(model)
# Additive model
library("gam")
Loading required package: splines
Loading required package: foreach
Loaded gam 1.16.1
Attaching package: 'gam'
The following objects are masked from 'package:mgcv':
    gam, gam.control, gam.fit, s
am1 <- gam(Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) + s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) + s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) + s(pc_2sq) + s(Artist_Followersq), data=df)
non-list contrasts argument ignored
print(am1)
Call:
gam(formula = Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) +
s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) +
s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) +
s(pc_2sq) + s(Artist_Followersq), data = df)
Degrees of Freedom: 185 total; 76.48856 Residual
Residual Deviance: 2110.514
par(mfrow=c(3,4))
plot(am1)
plot(fitted(am1), residuals(am1))
lines(lowess(fitted(am1), residuals(am1)), col="red", lwd=2)
n <- nrow(df)
1-sum(residuals(am1)^2)/((n-1)*var(df$Artist_Popularity))
[1] 0.9503897
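# This is R^2 = 1 - RSS/TSS: (n-1)*var(y) equals the total sum of squares
# sum((y - mean(y))^2), so the expression matches the usual definition.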
performace_metrics[5,1] <- RMSE(predict(am1, newdata = df), df$Artist_Popularity)
performace_metrics[5,2] <- R2(predict(am1, newdata = df), df$Artist_Popularity)
am1_train <- gam(Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) + s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) + s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) + s(pc_2sq) + s(Artist_Followersq), data=train)
non-list contrasts argument ignored
performace_metrics[5,3] <- RMSE(predict(am1_train, newdata = train), train$Artist_Popularity)
performace_metrics[5,4] <- R2(predict(am1_train, newdata = train), train$Artist_Popularity)
performace_metrics[5,5] <- RMSE(predict(am1_train, newdata = test), test$Artist_Popularity)
performace_metrics[5,6] <- R2(predict(am1_train, newdata = test), test$Artist_Popularity)
# Single index model
sim <- npindex(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=df)
summary(sim)
Single Index Model
Regression Data: 186 training points, in 13 variable(s)
fa_1 fa_2 fa_3 fa_4 pc_1 pc_2 Artist_Follower days_release_orig Track_Popularity Track_Duration_ms fa_2sq pc_2sq
Beta: 1 7.140808 -0.9481839 1.320797 0.3963661 1.316466 10.62225 -4.818576e-05 0.08091355 -7.782318e-06 -7.679957 -0.8994251
Artist_Followersq
Beta: -0.1862698
Bandwidth: 0.8377536
Kernel Regression Estimator: Local-Constant
Residual standard error: 5.506267
R-squared: 0.8675706
Continuous Kernel Type: Second-Order Gaussian
No. Continuous Explanatory Vars.: 1
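# npindex() fits the single index model Y = g(X'beta) + e; the first
# coefficient is fixed at 1 for identification (the scale of beta and g is
# not separately identified), which is why Beta starts with 1 above.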
performace_metrics[6,1] <- RMSE(predict(sim, newdata = df), df$Artist_Popularity)
performace_metrics[6,2] <- R2(predict(sim, newdata = df), df$Artist_Popularity)
sim_train <- npindex(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=train)
performace_metrics[6,3] <- RMSE(predict(sim_train, newdata = train), train$Artist_Popularity)
performace_metrics[6,4] <- R2(predict(sim_train, newdata = train), train$Artist_Popularity)
performace_metrics[6,5] <- RMSE(predict(sim_train, newdata = test), test$Artist_Popularity)
performace_metrics[6,6] <- R2(predict(sim_train, newdata = test), test$Artist_Popularity)
par(mfrow=c(3,4))
plot(sim)
plot(fitted(sim), residuals(sim))
lines(lowess(fitted(sim), residuals(sim)), col="red", lwd=2)
n <- nrow(df)
1-sum(residuals(sim)^2)/((n-1)*var(df$Artist_Popularity))
[1] 0.8674403
# Projection Pursuit Regression
ppr1 <- ppr(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=df, nterm=5) # named ppr1 so stats::ppr() is not masked
performace_metrics[7,1] <- RMSE(predict(ppr1, newdata = df), df$Artist_Popularity)
performace_metrics[7,2] <- R2(predict(ppr1, newdata = df), df$Artist_Popularity)
ppr_train <- ppr(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=train, nterm=5)
performace_metrics[7,3] <- RMSE(predict(ppr_train, newdata = train), train$Artist_Popularity)
performace_metrics[7,4] <- R2(predict(ppr_train, newdata = train), train$Artist_Popularity)
performace_metrics[7,5] <- RMSE(predict(ppr_train, newdata = test), test$Artist_Popularity)
performace_metrics[7,6] <- R2(predict(ppr_train, newdata = test), test$Artist_Popularity)
summary(ppr1)
Call:
ppr(formula = Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 +
pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity +
Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq,
data = df, nterm = 5)
Goodness of fit:
5 terms
1741.435
Projection direction vectors ('alpha'):
term 1 term 2 term 3 term 4 term 5
fa_1 -6.325427e-02 2.081791e-01 -8.112105e-02 4.462995e-01 -6.211143e-01
fa_2 1.999522e-01 1.988317e-01 3.820633e-01 -1.225150e-01 -1.261187e-01
fa_3 1.776410e-01 -4.813375e-01 4.625303e-01 -2.615833e-01 -2.121784e-01
fa_4 9.095299e-02 -7.325872e-01 5.161008e-01 -2.094942e-01 4.149667e-01
pc_1 5.912856e-02 -1.634530e-01 -1.398765e-01 -2.846767e-02 -2.597611e-01
pc_2 7.261656e-02 1.218474e-01 -3.230672e-01 -5.558153e-01 4.152905e-01
Artist_Follower 9.472078e-01 -2.009118e-01 1.651405e-01 5.085105e-01 -1.891676e-01
days_release_orig 5.284337e-07 -2.107916e-05 5.900920e-05 2.729395e-05 -7.811224e-05
Track_Popularity 6.925853e-03 1.765052e-02 -6.223949e-02 2.066660e-03 -2.572436e-03
Track_Duration_ms -3.916723e-07 4.233016e-07 2.333658e-06 -2.253623e-06 -2.907082e-06
fa_2sq -9.559306e-02 -2.453111e-01 5.941416e-02 -1.868207e-01 -1.176104e-01
pc_2sq -7.315117e-03 -7.878193e-02 4.566070e-01 -2.623794e-01 3.024768e-01
Artist_Followersq -3.126859e-02 1.152058e-02 -7.044398e-03 -3.768289e-02 6.051800e-03
Coefficients of ridge terms ('beta'):
term 1 term 2 term 3 term 4 term 5
15.436585 3.228392 3.418258 2.647310 2.190618
par(mfrow=c(1,1))
plot(ppr1)
plot(fitted(ppr1), residuals(ppr1))
lines(lowess(fitted(ppr1), residuals(ppr1)), col="red", lwd=2)
n <- nrow(df)
1-sum(residuals(ppr1)^2)/((n-1)*var(df$Artist_Popularity))
[1] 0.9590653
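# PPR fits y = mean + sum_m beta_m * g_m(alpha_m' x): 'alpha' above are the
# projection directions and 'beta' the weights of the smooth ridge functions.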
library("rpart")
set.seed(42)
tr1 <- rpart(Artist_Popularity~fa_1+fa_2+fa_3+fa_4+pc_1+pc_2+Artist_Follower+days_release_orig
+Track_Popularity+Track_Duration_ms+fa_2sq+pc_2sq+Artist_Followersq, data=df, cp = 0)
tr1
n= 186
node), split, n, deviance, yval
* denotes terminal node
1) root 186 42541.81000 64.03226
2) Artist_Follower< 0.069933 69 6966.63800 49.07246
4) fa_2sq>=0.04873442 44 1927.88600 43.65909
8) Artist_Follower< 0.0063065 24 493.33330 39.33333
16) Track_Popularity< 42.5 7 246.85710 36.14286 *
17) Track_Popularity>=42.5 17 145.88240 40.64706 *
9) Artist_Follower>=0.0063065 20 446.55000 48.85000
18) Artist_Follower< 0.0181555 7 27.42857 45.71429 *
19) Artist_Follower>=0.0181555 13 313.23080 50.53846 *
5) fa_2sq< 0.04873442 25 1480.00000 58.60000
10) Track_Duration_ms>=322237 11 247.63640 53.81818 *
11) Track_Duration_ms< 322237 14 783.21430 62.35714 *
3) Artist_Follower>=0.069933 117 11026.53000 72.85470
6) Artist_Follower< 0.538149 59 2850.67800 66.23729
12) Track_Duration_ms>=327871 16 283.00000 59.25000 *
13) Track_Duration_ms< 327871 43 1495.86000 68.83721
26) Track_Popularity< 66 33 762.90910 66.81818
52) fa_1< -0.7142649 12 117.66670 64.16667 *
53) fa_1>=-0.7142649 21 512.66670 68.33333
106) Track_Duration_ms>=208616 7 77.71429 65.42857 *
107) Track_Duration_ms< 208616 14 346.35710 69.78571 *
27) Track_Popularity>=66 10 154.50000 75.50000 *
7) Artist_Follower>=0.538149 58 2964.06900 79.58621
14) Artist_Follower< 2.122275 41 1084.04900 76.26829
28) Track_Popularity< 59 25 470.96000 73.96000
56) Artist_Follower< 0.779791 11 144.54550 71.36364 *
57) Artist_Follower>=0.779791 14 194.00000 76.00000 *
29) Track_Popularity>=59 16 271.75000 79.87500 *
15) Artist_Follower>=2.122275 17 340.11760 87.58824 *
par(mfrow=c(1,1))
plot(tr1)
text(tr1)
# summary(tr1)
library("rpart.plot")
rpart.plot(tr1, cex = 0.7)
n <- nrow(df)
1-sum(residuals(tr1)^2)/((n-1)*var(df$Artist_Popularity))
[1] 0.9131701
RMSE(predict(tr1, newdata = df), df$Artist_Popularity) # rpart predict takes newdata, not newx
[1] 4.45642
R2(predict(tr1, newdata = df), df$Artist_Popularity)
[1] 0.9131701
performace_metrics[8,1] <- RMSE(predict(tr1, newdata = df), df$Artist_Popularity)
performace_metrics[8,2] <- R2(predict(tr1, newdata = df), df$Artist_Popularity)
par(mfrow=c(1,1))
printcp(tr1)
Regression tree:
rpart(formula = Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 +
pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity +
Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq,
data = df, cp = 0)
Variables actually used in tree construction:
[1] Artist_Follower fa_1 fa_2sq Track_Duration_ms Track_Popularity
Root node error: 42542/186 = 228.72
n= 186
CP nsplit rel error xerror xstd
1 0.5770474 0 1.000000 1.01051 0.083930
2 0.1225097 1 0.422953 0.52027 0.047831
3 0.0836530 2 0.300443 0.39602 0.043732
4 0.0361974 3 0.216790 0.34182 0.037375
5 0.0251945 4 0.180592 0.24520 0.032810
6 0.0232243 5 0.155398 0.24470 0.032927
7 0.0135972 6 0.132174 0.23754 0.032621
8 0.0105578 7 0.118577 0.22022 0.032283
9 0.0080236 8 0.108019 0.19763 0.026040
10 0.0031164 9 0.099995 0.17719 0.024187
11 0.0031126 10 0.096879 0.17391 0.023256
12 0.0024891 11 0.093766 0.17227 0.023182
13 0.0023646 12 0.091277 0.17570 0.023223
14 0.0020825 13 0.088912 0.17667 0.023198
15 0.0000000 14 0.086830 0.17584 0.022941
plotcp(tr1)
tr1$cptable
CP nsplit rel error xerror xstd
1 0.577047401 0 1.00000000 1.0105061 0.08393019
2 0.122509677 1 0.42295260 0.5202683 0.04783113
3 0.083653037 2 0.30044292 0.3960213 0.04373196
4 0.036197394 3 0.21678988 0.3418154 0.03737539
5 0.025194452 4 0.18059249 0.2451952 0.03281039
6 0.023224285 5 0.15539804 0.2447027 0.03292670
7 0.013597245 6 0.13217375 0.2375437 0.03262056
8 0.010557834 7 0.11857651 0.2202174 0.03228308
9 0.008023608 8 0.10801867 0.1976297 0.02603967
10 0.003116364 9 0.09999507 0.1771938 0.02418689
11 0.003112575 10 0.09687870 0.1739115 0.02325628
12 0.002489096 11 0.09376613 0.1722744 0.02318153
13 0.002364588 12 0.09127703 0.1757018 0.02322286
14 0.002082545 13 0.08891244 0.1766732 0.02319815
15 0.000000000 14 0.08682990 0.1758450 0.02294140
best.cp <- tr1$cptable[which.min(tr1$cptable[,"xerror"]),"CP"]
best.cp
[1] 0.002489096
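# Alternative pruning sketch (assumption: the common 1-SE rule rather than the
# plain minimum): take the largest cp whose xerror is within one standard
# error of the best one, giving a smaller, more conservative tree.
cptab <- tr1$cptable
xerr_thresh <- min(cptab[, "xerror"]) + cptab[which.min(cptab[, "xerror"]), "xstd"]
cp_1se <- max(cptab[cptab[, "xerror"] <= xerr_thresh, "CP"])
# dt_pruned_1se <- prune(tr1, cp = cp_1se)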
dt_pruned <- prune(tr1, cp=best.cp)
1-sum(residuals(dt_pruned)^2)/((n-1)*var(df$Artist_Popularity))
[1] 0.9062339
RMSE(predict(dt_pruned, newdata = df), df$Artist_Popularity)
[1] 4.630997
R2(predict(dt_pruned, newdata = df), df$Artist_Popularity) # lower R^2 after pruning
[1] 0.9062339
library("caret")
# DT CV on full data
caret.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Artist_Popularity ~ .,
data = df,
method = "rpart",
trControl = caret.control,
tuneLength = 15)
There were missing values in resampled performance measures.
rpart.cv
CART
186 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 168, 167, 169, 167, 168, 166, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.000000000 6.202692 0.8458133 4.909969
0.002082545 6.162816 0.8483583 4.865791
0.002364588 6.155920 0.8485051 4.852674
0.002489096 6.156964 0.8487421 4.841797
0.003112575 6.130607 0.8500124 4.814403
0.003116364 6.130607 0.8500124 4.814403
0.008023608 6.355019 0.8355349 5.050998
0.010557834 6.551789 0.8232113 5.215937
0.013597245 6.815278 0.8076098 5.468092
0.023224285 7.373800 0.7741045 5.936121
0.025194452 7.564302 0.7615957 6.066125
0.036197394 7.905309 0.7373849 6.393810
0.083653037 8.645345 0.6856792 7.103210
0.122509677 9.809266 0.5978668 8.102628
0.577047401 12.815514 0.4840569 10.658897
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.003116364.
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
n= 186
node), split, n, deviance, yval
* denotes terminal node
1) root 186 42541.8100 64.03226
2) Artist_Follower< 0.069933 69 6966.6380 49.07246
4) fa_2sq>=0.04873442 44 1927.8860 43.65909
8) Artist_Follower< 0.0063065 24 493.3333 39.33333 *
9) Artist_Follower>=0.0063065 20 446.5500 48.85000 *
5) fa_2sq< 0.04873442 25 1480.0000 58.60000
10) Track_Duration_ms>=322237 11 247.6364 53.81818 *
11) Track_Duration_ms< 322237 14 783.2143 62.35714 *
3) Artist_Follower>=0.069933 117 11026.5300 72.85470
6) Artist_Follower< 0.538149 59 2850.6780 66.23729
12) Track_Duration_ms>=327871 16 283.0000 59.25000 *
13) Track_Duration_ms< 327871 43 1495.8600 68.83721
26) Track_Popularity< 66 33 762.9091 66.81818 *
27) Track_Popularity>=66 10 154.5000 75.50000 *
7) Artist_Follower>=0.538149 58 2964.0690 79.58621
14) Artist_Follower< 2.122275 41 1084.0490 76.26829
28) Track_Popularity< 59 25 470.9600 73.96000 *
29) Track_Popularity>=59 16 271.7500 79.87500 *
15) Artist_Follower>=2.122275 17 340.1176 87.58824 *
rpart.plot(rpart.best.cv)
RMSE(predict(rpart.best.cv, newdata = df), df$Artist_Popularity) # close to, but not identical with, the pruned tree (cp = 0.0031 vs. 0.0025)
[1] 4.782344
R2(predict(rpart.best.cv, newdata = df), df$Artist_Popularity)
[1] 0.9000049
# DT CV on split data
set.seed(42)
idx.train <- createDataPartition(y = df$Artist_Popularity, p = 0.75, list = FALSE)
train.data <- df[idx.train, ]
test.data <- df[-idx.train,]
caret.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Artist_Popularity ~ .,
data = train.data,
method = "rpart",
trControl = caret.control,
tuneLength = 15)
There were missing values in resampled performance measures.
rpart.cv
CART
142 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 129, 127, 128, 128, 127, 128, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.00000000 6.684278 0.8239615 5.436018
0.04189278 7.224630 0.7877985 5.728523
0.08378556 8.309005 0.7178983 6.710720
0.12567835 10.113664 0.5860693 8.319214
0.16757113 10.113664 0.5860693 8.319214
0.20946391 10.215698 0.5771918 8.394308
0.25135669 10.215698 0.5771918 8.394308
0.29324948 10.215698 0.5771918 8.394308
0.33514226 10.215698 0.5771918 8.394308
0.37703504 10.215698 0.5771918 8.394308
0.41892782 10.215698 0.5771918 8.394308
0.46082061 10.215698 0.5771918 8.394308
0.50271339 10.215698 0.5771918 8.394308
0.54460617 10.215698 0.5771918 8.394308
0.58649895 13.543120 0.4765201 11.241111
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
n= 142
node), split, n, deviance, yval
* denotes terminal node
1) root 142 33870.94000 63.97887
2) Artist_Follower< 0.069933 56 6172.21400 49.32143
4) Artist_Follower< 0.0085735 22 563.86360 39.77273
8) Artist_Follower< 0.003799 15 304.93330 37.73333 *
9) Artist_Follower>=0.003799 7 62.85714 44.14286 *
5) Artist_Follower>=0.0085735 34 2304.50000 55.50000
10) Track_Popularity< 53.5 23 719.30430 51.82609
20) fa_2< -0.2208195 14 276.00000 49.00000 *
21) fa_2>=-0.2208195 9 157.55560 56.22222 *
11) Track_Popularity>=53.5 11 625.63640 63.18182 *
3) Artist_Follower>=0.069933 86 7833.45300 73.52326
6) Artist_Follower< 1.461101 68 3699.11800 70.20588
12) Artist_Follower< 0.2739165 27 821.18520 64.25926
24) fa_2sq>=0.02105125 20 286.55000 61.85000
48) Track_Duration_ms>=321893 10 75.60000 59.20000 *
49) Track_Duration_ms< 321893 10 70.50000 64.50000 *
25) fa_2sq< 0.02105125 7 86.85714 71.14286 *
13) Artist_Follower>=0.2739165 41 1294.39000 74.12195
26) Track_Popularity< 68.5 33 816.54550 72.72727
52) days_release_orig>=160 26 538.50000 71.50000
104) fa_2< -0.08397494 19 290.00000 70.00000 *
105) fa_2>=-0.08397494 7 89.71429 75.57143 *
53) days_release_orig< 160 7 93.42857 77.28571 *
27) Track_Popularity>=68.5 8 148.87500 79.87500 *
7) Artist_Follower>=1.461101 18 558.94440 86.05556 *
rpart.plot(rpart.best.cv)
RMSE(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
[1] 4.472846
R2(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
[1] 0.9161257
performace_metrics[8,3] <- RMSE(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
performace_metrics[8,4] <- R2(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
RMSE(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
[1] 7.21592
R2(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
[1] 0.7520699
performace_metrics[8,5] <- RMSE(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
performace_metrics[8,6] <- R2(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
#########################################################################
library("randomForest")
set.seed(42)
rf <- randomForest(Artist_Popularity~fa_1+fa_2+fa_3+fa_4+pc_1+pc_2+Artist_Follower+days_release_orig
                   +Track_Popularity+Track_Duration_ms+fa_2sq+pc_2sq+Artist_Followersq, data=df) # cp and method are rpart arguments; randomForest() ignores them
summary(rf)
Length Class Mode
call 5 -none- call
type 1 -none- character
predicted 186 -none- numeric
mse 500 -none- numeric
rsq 500 -none- numeric
oob.times 186 -none- numeric
importance 13 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 11 -none- list
coefs 0 -none- NULL
y 186 -none- numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
imp <- importance(rf)
ind <- order(imp, decreasing=T)
imp[ind,]
Artist_Follower Artist_Followersq Track_Duration_ms fa_2 pc_2 Track_Popularity fa_2sq days_release_orig
12033.2942 10915.4479 4639.3532 3775.8012 3435.4751 1707.2339 1180.2063 850.0604
pc_2sq fa_3 pc_1 fa_4 fa_1
819.4110 763.9136 761.8407 700.4277 444.7691
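# With the default importance = FALSE, importance() reports IncNodePurity for
# regression: the total decrease in node sum of squares from splits on each
# variable, summed over all trees.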
rf_pred <- predict(rf, df) # regression forest: predictions are numeric responses
n <- nrow(df)
1-sum(residuals(rf)^2)/((n-1)*var(df$Artist_Popularity))
[1] 1
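# Caution: residuals() has no method for randomForest objects and returns
# NULL here, so the expression above degenerates to 1 - 0 = 1. The honest
# in-sample summary is the out-of-bag fit, e.g.:
# tail(rf$rsq, 1)  # OOB R^2 after the last tree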
RMSE(predict(rf, newdata = df), df$Artist_Popularity)
[1] 2.114007
R2(predict(rf, newdata = df), df$Artist_Popularity)
[1] 0.9826053
performace_metrics[9,1] <- RMSE(predict(rf, newdata = df), df$Artist_Popularity)
performace_metrics[9,2] <- R2(predict(rf, newdata = df), df$Artist_Popularity)
# RMSE(predict(rf, newdata = train.data), train.data$Artist_Popularity)
# R2(predict(rf, newdata = train.data), train.data$Artist_Popularity)
# RMSE(predict(rf, newdata = test.data), test.data$Artist_Popularity)
# R2(predict(rf, newdata = test.data), test.data$Artist_Popularity)
#########################################################################
library("data.table")
library("mlr")
library("parallel")
rf_train <- as.data.table(train)
rf_test <- as.data.table(test)
task <- makeRegrTask(data = train, target = "Artist_Popularity")
rf_learner <- makeLearner("regr.randomForest",
predict.type = "response")
rf.parms <- makeParamSet(
# Breiman's recommendation for mtry is the square root of the number of columns
makeDiscreteParam("mtry", values = c(2,3,4,5,6)), # number of features tried at each split, smaller -> faster
makeDiscreteParam("sampsize", values = c(30, 50, 70)), # bootstrap sample size, smaller -> faster
makeDiscreteParam("ntree", values = c(10,30,50,100, 500, 1000)) # number of trees, smaller -> faster
)
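# For reference, Breiman's rule of thumb mentioned above:
# floor(sqrt(ncol(train) - 1))  # = 3 with the 13 predictors here; the grid brackets this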
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = FALSE)
library("parallelMap")
parallelStartSocket(4, level = "mlr.tuneParams")
Parallelization was not stopped, doing it now. Stopped parallelization. All cleaned up.
Starting parallelization in mode=socket with cpus=4.
#tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
# par.set = rf.parms, control = tuneControl, measures = mlr::rmse)
# https://stackoverflow.com/questions/51333410/mlr-why-does-reproducibility-of-hyperparameter-tuning-fail-using-parallelizatio
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
par.set = rf.parms, control = tuneControl, measures = mlr::rmse)
})
parallelStop()
Stopped parallelization. All cleaned up.
tuning$x
$mtry
[1] 5
$sampsize
[1] 70
$ntree
[1] 50
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = TRUE)
tuning_results$data
tapply(tuning_results$data$rmse.test.rmse, INDEX = c(tuning_results$data$mtry), mean)
2 3 4 5 6
6.170433 5.843650 5.704894 5.651328 5.639517
rf_tuned <- setHyperPars(rf_learner, par.vals = tuning$x)
rf_model <- mlr::train(rf_tuned, task = task)
rf_pred_train <- predict(rf_model, newdata = train, type = "response")
rf_pred_test <- predict(rf_model, newdata = test, type = "response")
# rf_pred_train$data$response
# rf_pred_test$data$response
RMSE(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
[1] 3.511922
R2(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
[1] 0.954947
performace_metrics[9,3] <- RMSE(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
performace_metrics[9,4] <- R2(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
RMSE(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
[1] 4.908244
R2(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
[1] 0.8911804
performace_metrics[9,5] <- RMSE(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
performace_metrics[9,6] <- R2(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
library("MASS")
library("nnet")
df_z <- scale(as.matrix(df))
train_z <- as.data.frame(df_z[idx.train, ])
test_z <- as.data.frame(df_z[-idx.train,])
mean_mat <- matrix(, nrow = dim(df)[1], ncol = dim(df)[2])
mean_mat[1, ] <- as.numeric(sapply(df, mean, na.rm = TRUE)) # column means in row 1
library("zoo")
mean_mat <- na.locf(mean_mat) # carry the means down so every row holds them
diag_sd <- diag(sapply(df, sd, na.rm = TRUE)) # diagonal matrix of column SDs
# Back-transformation of the scaled data to the original scale:
# as.matrix(df_z) %*% diag_sd + mean_mat
task <- makeRegrTask(data = train_z, target = "Artist_Popularity")
nnet <- makeLearner("regr.nnet", predict.type = "response", par.vals = list("trace" = FALSE, "maxit" = 1000,
"MaxNWts" = 80000))
nn.parms <- makeParamSet(
makeDiscreteParam("decay", values = c(0.0001,0.001, 0.01, 0.1)),
makeDiscreteParam("size", values = c(2,4,6,8,10,12,14)))
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = FALSE)
parallelStartSocket(4, level = "mlr.tuneParams")
Parallelization was not stopped, doing it now. Stopped parallelization. All cleaned up.
Starting parallelization in mode=socket with cpus=4.
# tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
# measures = mlr::rmse)
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
measures = mlr::rmse)
})
parallelStop()
Stopped parallelization. All cleaned up.
tuning$x # result with full data set was decay = 1e-04, size = 2, rmse.test.rmse = 0.4336486
$decay
[1] 0.001
$size
[1] 2
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = FALSE)
plotHyperParsEffect(tuning_results, x = "size", y = "rmse.test.rmse")
plotHyperParsEffect(tuning_results, x = "decay", y = "rmse.test.rmse")
nnet.tuned <- setHyperPars(nnet, par.vals = tuning$x)
nn_model <- mlr::train(nnet.tuned, task = task)
nn_pred_train <- predict(nn_model, newdata = train_z, type = "response")
nn_pred_test <- predict(nn_model, newdata = test_z, type = "response")
RMSE(predict(nn_model, newdata = train_z, type = "response")$data$response, train_z$Artist_Popularity)
[1] 0.2779096
R2(predict(nn_model, newdata = train_z, type = "response")$data$response, train_z$Artist_Popularity)
[1] 0.9255425
RMSE(predict(nn_model, newdata = test_z, type = "response")$data$response, test_z$Artist_Popularity)
[1] 0.4380883
R2(predict(nn_model, newdata = test_z, type = "response")$data$response, test_z$Artist_Popularity)
[1] 0.7980027
# Scale predictions back to original:
# as.matrix(train_z[, 11]) %*% diag_sd[11,11] + mean_mat[1,11] == train[,11]
RMSE(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
[1] 4.214302
R2(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
[,1]
[1,] 0.9255425
performace_metrics[10,3] <- RMSE(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
performace_metrics[10,4] <- R2(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
RMSE(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
[1] 6.643298
R2(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
[,1]
[1,] 0.7980027
performace_metrics[10,5] <- RMSE(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
performace_metrics[10,6] <- R2(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
round(performace_metrics, digits = 2)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 6.91 0.79 6.91 0.80 7.14 0.75
[2,] 7.26 0.77 7.39 0.77 8.15 0.68
[3,] 7.14 0.78 7.06 0.79 7.57 0.72
[4,] 7.18 0.78 7.09 0.79 7.63 0.72
[5,] 3.37 0.95 2.92 0.96 6.94 0.80
[6,] 5.51 0.87 5.91 0.86 5.87 0.84
[7,] 3.06 0.96 3.46 0.95 7.17 0.76
[8,] 4.46 0.91 4.47 0.92 7.22 0.75
[9,] 2.11 0.98 3.51 0.95 4.91 0.89
[10,] NA NA 4.21 0.93 6.64 0.80
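# Readability sketch: label the metric matrix (labels are descriptive
# additions; rows follow the fill order above, columns are full/train/test):
# rownames(performace_metrics) <- c("LM", "Ridge", "LASSO", "Elasticnet",
#                                   "AM", "SIM", "PPR", "Tree", "RF", "NNet")
# colnames(performace_metrics) <- c("RMSE_full", "R2_full", "RMSE_train",
#                                   "R2_train", "RMSE_test", "R2_test")
# The NA pair in the last row: no full-data fit was computed for the net.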
# Classification
df$Charts <- as.factor(x_noout$Charts)
set.seed(42)
tr2 <- rpart(data = df, Charts~., method = 'class') # named tr2 so rpart() is not masked
rpart.plot(tr2)
tr2_pred <- predict(tr2, newdata = df, type = "class")
confMat <- table(df$Charts, tr2_pred)
confMat # 0: not in charts, 1: in charts
tr2_pred
0 1
0 93 0
1 7 86
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 0.9623656
# 0.9623656
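# Sketch: class-wise rates from the confusion matrix above (rows = truth,
# columns = prediction):
# confMat[2, 2] / sum(confMat[2, ])  # sensitivity: charted tracks recovered
# confMat[1, 1] / sum(confMat[1, ])  # specificity: non-charted recovered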
par(mfrow=c(1,1))
printcp(tr2)
Classification tree:
rpart(formula = Charts ~ ., data = df, method = "class")
Variables actually used in tree construction:
[1] fa_2 pc_1 Track_Duration_ms
Root node error: 93/186 = 0.5
n= 186
CP nsplit rel error xerror xstd
1 0.795699 0 1.000000 1.19355 0.071937
2 0.118280 1 0.204301 0.23656 0.047358
3 0.010753 2 0.086022 0.12903 0.036027
4 0.010000 3 0.075269 0.13978 0.037390
plotcp(tr2)
tr2$cptable
CP nsplit rel error xerror xstd
1 0.79569892 0 1.00000000 1.1935484 0.07193706
2 0.11827957 1 0.20430108 0.2365591 0.04735805
3 0.01075269 2 0.08602151 0.1290323 0.03602681
4 0.01000000 3 0.07526882 0.1397849 0.03738999
best.cp <- tr2$cptable[which.min(tr2$cptable[,"xerror"]),"CP"]
best.cp
[1] 0.01075269
set.seed(42)
rpart_pruned <- prune(tr2, cp=best.cp)
rpart.plot(rpart_pruned)
tr2_pred <- predict(rpart_pruned, newdata = df, type = "class")
confMat <- table(df$Charts, tr2_pred)
confMat # 0: not in charts, 1: in charts
tr2_pred
0 1
0 89 4
1 4 89
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 0.9569892
# 0.9569892
library("caret")
train.data <- df[idx.train, ]
test.data <- df[-idx.train,]
caret.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Charts ~ .,
data = train.data,
method = "rpart",
trControl = caret.control,
tuneLength = 15)
rpart.cv
CART
142 samples
14 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 128, 127, 128, 128, 127, 127, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000000 0.9124908 0.8241111
0.05462185 0.9073626 0.8135420
0.10924370 0.8863614 0.7724387
0.16386555 0.8579487 0.7174184
0.21848739 0.8579487 0.7174184
0.27310924 0.8579487 0.7174184
0.32773109 0.8579487 0.7174184
0.38235294 0.8579487 0.7174184
0.43697479 0.8579487 0.7174184
0.49159664 0.8579487 0.7174184
0.54621849 0.8579487 0.7174184
0.60084034 0.8579487 0.7174184
0.65546218 0.8579487 0.7174184
0.71008403 0.8579487 0.7174184
0.76470588 0.6828083 0.3512446
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.
set.seed(42)
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
n= 142
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 142 68 0 (0.52112676 0.47887324)
2) Track_Duration_ms>=247464.5 62 2 0 (0.96774194 0.03225806) *
3) Track_Duration_ms< 247464.5 80 14 1 (0.17500000 0.82500000)
6) pc_1< -0.8616754 13 2 0 (0.84615385 0.15384615) *
7) pc_1>=-0.8616754 67 3 1 (0.04477612 0.95522388) *
rpart.plot(rpart.best.cv)
rpart_pred <- predict(rpart.best.cv, newdata = train.data, type = "class")
confMat <- table(train.data$Charts,rpart_pred)
confMat # 0: not in charts, 1: in charts
rpart_pred
0 1
0 71 3
1 4 64
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 0.9507042
# 0.9507042
rpart_pred <- predict(rpart.best.cv, newdata = test.data, type = "class")
confMat <- table(test.data$Charts,rpart_pred)
confMat # 0: not in charts, 1: in charts
rpart_pred
0 1
0 19 0
1 3 22
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 0.9318182
# 0.9318182
#################################################
# Random forest
rf_train <- train.data
rf_test <- test.data
task <- makeClassifTask(data = rf_train, target = "Charts", positive = "1")
rf_learner <- makeLearner("classif.randomForest",
predict.type = "prob", # prediction type needs to be specified for the learner
par.vals = list("replace" = TRUE, "importance" = FALSE))
rf.parms <- makeParamSet(
# Breiman's recommendation for mtry is the square root of the number of columns
makeDiscreteParam("mtry", values = c(2,3,4,5,6)), # number of features tried at each split, smaller -> faster
makeDiscreteParam("sampsize", values = c(30, 50, 70)), # bootstrap sample size, smaller -> faster
makeDiscreteParam("ntree", values = c(10,30,50,100, 500, 1000)) # number of trees, smaller -> faster
)
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = TRUE)
parallelStartSocket(4, level = "mlr.tuneParams")
Starting parallelization in mode=socket with cpus=4.
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
par.set = rf.parms, control = tuneControl, measures = mlr::auc)
})
parallelStop()
Stopped parallelization. All cleaned up.
tuning$x
$mtry
[1] 4
$sampsize
[1] 70
$ntree
[1] 50
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = TRUE)
tuning_results$data
tapply(tuning_results$data$auc.test.mean, INDEX = c(tuning_results$data$mtry), mean)
2 3 4 5 6
0.9953519 0.9952401 0.9955916 0.9944808 0.9951856
rf_tuned <- setHyperPars(rf_learner, par.vals = tuning$x)
rf_model <- mlr::train(rf_tuned, task = task)
rf_pred <- predict(rf_model, newdata = train.data, type = "class")
# rf_pred$data$response
confMat <- table(train.data$Charts,rf_pred$data$response)
confMat # 0: not in charts, 1: in charts
0 1
0 74 0
1 1 67
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 0.9929577
# 1
rf_pred <- predict(rf_model, newdata = test.data, type = "class")
# rf_pred$data$response
confMat <- table(test.data$Charts,rf_pred$data$response)
confMat # 0: not in charts, 1: in charts
0 1
0 19 0
1 0 25
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 1
# 1
# Neural net classification
df$Charts <- x_noout$Charts
train_z$Charts <- df[idx.train, "Charts"]
test_z$Charts <- df[-idx.train, "Charts"]
task <- makeClassifTask(data = train_z, target = "Charts", positive = "1")
nnet <- makeLearner("classif.nnet", predict.type = "prob", par.vals = list("trace" = FALSE, "maxit" = 1000,
"MaxNWts" = 80000))
nn.parms <- makeParamSet(
makeDiscreteParam("decay", values = c(0.0001,0.001, 0.01, 0.1)),
makeDiscreteParam("size", values = c(2,4,6,8,10,12,14)))
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = TRUE)
parallelStartSocket(4, level = "mlr.tuneParams")
Starting parallelization in mode=socket with cpus=4.
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
measures = mlr::auc)
})
parallelStop()
Stopped parallelization. All cleaned up.
tuning$x # tuned on the training split with AUC as the measure
$decay
[1] 0.1
$size
[1] 2
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = FALSE)
plotHyperParsEffect(tuning_results, x = "size", y = "auc.test.mean")
plotHyperParsEffect(tuning_results, x = "decay", y = "auc.test.mean")
nnet.tuned <- setHyperPars(nnet, par.vals = tuning$x)
nn_model <- mlr::train(nnet.tuned, task = task)
nn_pred_train <- predict(nn_model, newdata = train_z)
nn_pred_test <- predict(nn_model, newdata = test_z)
confMat <- table(train_z$Charts,nn_pred_train$data$response)
confMat # 0: not in charts, 1: in charts
0 1
0 74 0
1 0 68
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 1
# 1
confMat <- table(test_z$Charts,nn_pred_test$data$response)
confMat # 0: not in charts, 1: in charts
0 1
0 19 0
1 0 25
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
[1] 1
# 1
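# Closing note: the tuned forest and the tuned net separate the two classes
# (almost) perfectly here, but the test split contains only 44 tracks and a
# single Track_Duration_ms split already explains most of the class label
# (see the pruned tree above), so these accuracies warrant some caution.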