library(tidyverse)
library(data.table)
library(DT)
library(MASS)
# Load the raw Spotify data set from its local CSV export.
df <- read.csv(file = "C:/Users/PC/Documents/R_4DS/Spotify/data.csv/data.csv")

# Make the classic ggplot2 theme with size-20 text the default for all plots.
theme_set(
  theme_classic() +
    theme(text = element_text(size = 20))
)
# opts_chunk$set(comment = "",
# fig.show = "hold")
## Check for missing values
# Count the NA entries in every column of `df`, then display the counts as a
# single labelled row.
null_vars <- vapply(df, function(column) sum(is.na(column)), integer(1))
t(data.frame(null_vars))
acousticness artists danceability duration_ms
null_vars 0 0 0 0
energy explicit id instrumentalness key liveness
null_vars 0 0 0 0 0 0
loudness mode name popularity release_date
null_vars 0 0 0 0 0
speechiness tempo valence year
null_vars 0 0 0 0
# Count the empty-string entries in every column of `df`.
# `na.rm = TRUE` keeps a single NA cell from turning the whole column's count
# into NA, which would hide how many blanks the column really has.
blank_vars <- vapply(
  df,
  function(column) sum(column == "", na.rm = TRUE),
  integer(1)
)
t(data.frame(blank_vars))
acousticness artists danceability duration_ms
blank_vars 0 0 0 0
energy explicit id instrumentalness key
blank_vars 0 0 0 0 0
liveness loudness mode name popularity
blank_vars 0 0 0 0 0
release_date speechiness tempo valence year
blank_vars 0 0 0 0 0
# Clean the raw table and derive the modelling features:
#   * artists   - the bracket/quote wrapping is stripped by the regex
#   * key, explicit, year - stored as factors
#   * acoustics - factor flag: 0 when acousticness + instrumentalness >= 0.4,
#                 1 otherwise
#   * decades   - decade label derived from the release year
# `decades` is computed while `year` is still numeric. Comparing a factor with
# `<` only yields NA (plus a warning), which would send every row to the
# fall-through "2000s" label and leave `decades` constant.
spotify_df <- df %>%
  mutate(
    artists = str_extract(artists, '(\\w.*)(.*\\w)'),
    key = as.factor(key),
    name = as.character(name),
    explicit = as.factor(explicit),
    acoustics = as.factor(
      ifelse(acousticness + instrumentalness >= 0.4, 0, 1)
    ),
    decades = case_when(
      year < 1930 ~ "Y1920_Y1929",
      year < 1940 ~ "Y1930_Y1939",
      year < 1950 ~ "Y1940_Y1949",
      year < 1960 ~ "Y1950_Y1959",
      year < 1970 ~ "Y1960_Y1969",
      year < 1980 ~ "Y1970_Y1979",
      year < 1990 ~ "Y1980_Y1989",
      year < 2000 ~ "Y1990_Y1999",
      TRUE ~ "2000s"
    ),
    # Convert to factor only after the numeric comparisons above are done.
    year = as.factor(year)
  ) %>%
  rename(song_name = name) %>%
  dplyr::select(-c(release_date, id, mode, acousticness, instrumentalness))
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Problem with `mutate()` input `decades`.
i ‘<’ not meaningful for factors
i Input `decades` is `case_when(...)`.
‘<’ not meaningful for factors
Acousticness and Instrumentalness are heaped at the extremes (0’s and 1’s), so they are merged into one factor variable.
Popularity of music has a negative relationship with the count of songs.
# Numeric columns of the cleaned data, used for the plots and correlations.
df_num <- select_if(spotify_df, is.numeric)

# Draw one histogram per numeric column on a 3 x 3 grid.
par(mfrow = c(3, 3))
walk(
  names(df_num),
  function(col_name) {
    truehist(
      df_num[[col_name]],
      main = paste("Histogram of ", col_name),
      xlab = NA
    )
  }
)

# Correlation of each numeric column with popularity (pairwise-complete
# observations), listed from the most positive to the most negative.
cor_df <- cor(df_num, use = "pairwise.complete.obs")[, "popularity"]
arrange(data.frame(cor_df), desc(cor_df))
library(superml)
# Build the all-numeric modelling frame: drop the identifier-like columns,
# move the response `popularity` to the first column, and turn the factor
# features back into numbers.
# Each factor goes through as.character() first, because as.numeric() applied
# directly to a factor returns the internal level codes (1, 2, ...) rather
# than the values the levels were built from.
df <- spotify_df %>%
  dplyr::select(-c("artists", "song_name", "year")) %>%
  dplyr::select(popularity, everything()) %>%
  mutate(
    acoustics = as.numeric(as.character(acoustics)),
    explicit = as.numeric(as.character(explicit)),
    key = as.numeric(as.character(key))
  )

# Encode the decade labels as integers with superml's LabelEncoder.
lbl <- LabelEncoder$new()
df$decades <- lbl$fit_transform(df$decades)
## Replace NaN & Inf with NA
# is.finite() is FALSE for NA, NaN, Inf and -Inf, so a single assignment
# covers all of them. `df[] <-` keeps `df` a data frame, and non-numeric
# columns pass through unchanged. The assignment must use `<-`: a comparison
# with `==` would only compute a logical result and leave `df` untouched.
df[] <- lapply(df, function(column) {
  if (is.numeric(column)) {
    column[!is.finite(column)] <- NA
  }
  column
})
logical(0)
## Train-Test
# Fix the RNG seed so the 80/20 split, and every metric computed from it, is
# reproducible from run to run.
set.seed(42)
n_split <- round(0.8 * nrow(df))
# seq_len() stays empty for a zero-row frame, whereas 1:nrow(df) would give
# c(1, 0).
train_indices <- sample(seq_len(nrow(df)), n_split)
train_set <- df[train_indices, ]
test_set <- df[-train_indices, ]
# tt_split <- function(df, x){
# n_split <- round(x * nrow(df))
#
# indices <- sample(1:nrow(df), n_split)
#
# df_train <- df[indices, ]
# df_test <- df[-indices, ]
# return(list(df_train, df_test))
# }
## Feature scale (Preserving Outcome Variable)
###---|| NB: We do not scale the response variable; the data has to be numeric.
# train_set[-1] = scale(train_set[-1])
# test_set[-1] = scale(test_set[-1])
##Linear Regression
# Fit a least-squares model of popularity on every other column of train_set.
# The `- popularity` term is redundant, because `.` already leaves out the
# response. It is kept as written because summary() echoes the call verbatim.
# na.exclude drops incomplete rows for fitting but pads residuals and fitted
# values back to the original row count.
lin_reg <- lm(popularity ~ .-popularity, data = train_set, na.action=na.exclude)
# Print the coefficient table and the overall fit statistics.
summary(lin_reg)
Call:
lm(formula = popularity ~ . - popularity, data = train_set, na.action = na.exclude)
Residuals:
Min 1Q Median 3Q Max
-57.593 -15.410 -0.657 13.567 77.310
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.331e-01 4.968e-01 -1.878 0.0603 .
danceability 1.059e+01 3.743e-01 28.300 < 2e-16 ***
duration_ms -1.583e-06 3.521e-07 -4.496 6.92e-06 ***
energy 5.460e+00 3.340e-01 16.347 < 2e-16 ***
explicit 1.209e+01 2.267e-01 53.341 < 2e-16 ***
key -3.137e-02 1.447e-02 -2.168 0.0302 *
liveness -7.999e+00 2.935e-01 -27.257 < 2e-16 ***
loudness 4.028e-01 1.499e-02 26.870 < 2e-16 ***
speechiness -2.797e+01 3.289e-01 -85.036 < 2e-16 ***
tempo 1.149e-02 1.765e-03 6.509 7.60e-11 ***
valence -6.373e+00 2.461e-01 -25.898 < 2e-16 ***
acoustics 1.269e+01 1.296e-01 97.872 < 2e-16 ***
decades NA NA NA NA
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 19 on 139499 degrees of freedom
Multiple R-squared: 0.2457, Adjusted R-squared: 0.2456
F-statistic: 4130 on 11 and 139499 DF, p-value: < 2.2e-16
library(forecast)
# Score the held-out rows with the fitted linear model.
pred1 <- predict(object = lin_reg, newdata = test_set, type = "response")
prediction from a rank-deficient fit may be misleading
# Hold-out errors: observed popularity minus the model's prediction.
residuals <- test_set$popularity - pred1

# Side-by-side table of predictions, observations and their differences.
linreg_pred <- data.frame(
  Predicted = pred1,
  Actual = test_set$popularity,
  Residual = residuals
)

# Error summary (ME, RMSE, MAE, MPE, MAPE) of predictions against observations.
accuracy(pred1, test_set$popularity)
ME RMSE MAE MPE MAPE
Test set 0.01589392 18.98866 15.71578 NaN Inf
## Regression Tree (rpart)
library(rpart)
library(rpart.plot)
# Grow a tree for popularity from all other columns of train_set. The response
# is numeric, so rpart reports this as a regression tree. cp = 0.01 is the
# complexity parameter passed through rpart.control().
class.tree <- rpart(
  formula = popularity ~ .,
  data = train_set,
  control = rpart.control(cp = 0.01)
)

# Plot, then print, the complexity-parameter table with cross-validated error.
plotcp(class.tree)
printcp(class.tree)
Regression tree:
rpart(formula = popularity ~ ., data = train_set, control = rpart.control(cp = 0.01))
Variables actually used in tree construction:
[1] acoustics energy explicit speechiness
Root node error: 66776723/139511 = 478.65
n= 139511
CP nsplit rel error xerror xstd
1 0.152000 0 1.00000 1.00001 0.0027492
2 0.036606 1 0.84800 0.84802 0.0027585
3 0.027777 2 0.81139 0.81148 0.0027249
4 0.017082 3 0.78362 0.78372 0.0027633
5 0.012992 4 0.76653 0.76720 0.0027215
6 0.010000 5 0.75354 0.75421 0.0027294
# Draw the fitted tree: green-blue box palette, dotted branches (lty 3), gray
# box shadows, and node numbers shown (nn = TRUE).
rpart.plot(
  class.tree,
  box.palette = "GnBu",
  branch.lty = 3,
  shadow.col = "gray",
  nn = TRUE
)
## Random Forest
#Random Forest
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:gridExtra’:
combine
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
margin
# Random forest for popularity on all other columns of train_set: 500 trees,
# nodesize of 7, and variable-importance measures requested. Missing values
# are handled by na.roughfix.
RF <- randomForest(
  popularity ~ . - popularity,
  data = train_set,
  importance = TRUE,
  ntree = 500,
  nodesize = 7,
  na.action = na.roughfix
)