# Load the Spotify track data set from the local CSV export.
csv_path <- "C:/Users/Acer/Downloads/spotify (1).csv"
spotify <- read.csv(file = csv_path)

Problem 1

1.(1)

# Draw the empirical cumulative distribution function of the duration column.
plot.ecdf(spotify$duration)

1.(2) 離散,因為有階梯狀出現。

1.(3) 約0.75

# Build the empirical CDF of duration as a function, then evaluate it at 240
# to get the share of tracks whose duration is at most 240.
# NOTE(review): 240 presumably means 4 minutes in seconds (duration is later
# divided by 60 to get minutes) -- confirm the unit.
cdf.duration <- ecdf(spotify$duration)
threshold <- 240
cdf.duration(threshold)
## [1] 0.7701987

2.(1)

# Histogram of duration divided by 60. A single number for `breaks` is only a
# suggestion: hist() picks "pretty" break points near that count.
hist(spotify$duration/60, breaks = 4)

直方圖中 2–4 分鐘這個區間的次數最高,代表資料集中長度落在 2–4 分鐘的歌曲最多(直方圖計算的是歌曲數,而非收聽人數)。

2.(2)

bar chart 用於比較不同類別(或不同樣本在相同類別下)的數量,例如身高 160 的男女人數比較;histogram 則是顯示單一數值變數的分配。

3.(1)

# Kernel density estimate of duration with the Epanechnikov kernel; the
# bandwidth is left at density()'s default.
plot(density(spotify$duration, kernel = "epanechnikov"))

Histogram 是統計落在各區間內的次數來表示分配,可以透過改變區間的寬度調整呈現分配的精細程度。KDE 曲線上每一點的值,則是以 kernel 對該點附近的資料加權平均後得到的結果。兩種方法的分析功能差不多,但 KDE 是連續函數,且其結果取決於 bandwidth,選擇不當時可能有較大的誤差。

3.(2)

當bandwidth越大,取出平均的數據越多,會導致下一個點的數據(改變一個數據後的平均)的變化較小,圖線條看起來也會比較平滑。

Problem 2

1.

# Split popularity by the mode column: rows with mode == 1 form the "major"
# group and rows with mode == 0 form the "minor" group.
popularity_major <- with(spotify, popularity[mode == 1])
popularity_minor <- with(spotify, popularity[mode == 0])

# Five-number summary plus mean for each group.
summary(popularity_major)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.00   57.00   69.00   67.82   82.00  100.00
summary(popularity_minor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   57.00   72.00   68.78   83.00  100.00

2.(1)

# Coefficient of variation (sd / mean) of popularity for each group.
# The minor row is computed from popularity_minor; it previously reused
# popularity_major, which made both rows identical.
data.frame(
  popularity = c("major", "minor"),
  coefficient_of_variation = c(
    sd(popularity_major) / mean(popularity_major),
    sd(popularity_minor) / mean(popularity_minor)
  )
)
##   popularity coefficient_of_variation
## 1      major                0.2559223
## 2      minor                (stale: re-knit; the recorded 0.2559223 was the major value)

2.(2)

CV與單位無關,能幫助我們比較不同單位或是平均差很多的資料。

3.(1)

# Q-Q plot of the two popularity samples after each is min-max scaled to
# [0, 1] with its own minimum and maximum; the 45-degree line marks equality.
major_range <- range(popularity_major)
minor_range <- range(popularity_minor)
qqplot(
  x = (popularity_major - major_range[1]) / diff(major_range),
  y = (popularity_minor - minor_range[1]) / diff(minor_range),
  xlab = "major累積人數比例",
  ylab = "minor累積人數比例"
)
abline(a = 0, b = 1)

從上圖可看出minor與major的分配不相同,major資料的分配較minor分散,但兩者在人數比例低時的人數分配均較少。

# Epanechnikov kernel density estimate of popularity_major after min-max
# scaling to [0, 1].
plot(density((popularity_major-min(popularity_major))/(max(popularity_major)-min(popularity_major)), kernel = "epanechnikov"))

# Same estimate for popularity_minor, scaled with its own minimum and maximum.
plot(density((popularity_minor-min(popularity_minor))/(max(popularity_minor)-min(popularity_minor)), kernel = "epanechnikov"))

3.(2)

# Normal Q-Q plot of popularity with the reference line through the first and
# third quartiles (qqline's defaults: normal distribution, probs 0.25/0.75,
# quantile type 7, data on the y axis).
qqnorm(spotify$popularity)
qqline(spotify$popularity)

Problem 3

1.(1)

# Bucket tempo into three labels: below 90 is "slow", 90 to 120 inclusive is
# "medium", above 120 is "fast".
spotify$bpm <- with(
  spotify,
  ifelse(tempo >= 90, ifelse(tempo > 120, "fast", "medium"), "slow")
)

# Contingency table of tempo bucket (rows) by is_explicit (columns).
ct <- table(spotify[["bpm"]], spotify[["is_explicit"]])
print(ct)
##         
##          FALSE TRUE
##   fast     596  227
##   medium   378  122
##   slow     133   54

1.(2)

# Pearson chi-squared test on the tempo-bucket by is_explicit table.
chisq.test(x = ct)
## 
##  Pearson's Chi-squared test
## 
## data:  ct
## X-squared = 2.1319, df = 2, p-value = 0.3444

p-value > 0.05,無法拒絕兩變數獨立的虛無假設,故 bpm 與 is_explicit 之間沒有顯著關聯。

2.

# Cramer's V for the contingency table:
# sqrt(chi-squared / (n * (min(rows, cols) - 1))).
cts <- chisq.test(ct)$statistic
n <- sum(ct)
r <- nrow(ct)
k <- ncol(ct)
cmsv <- sqrt(cts / (n * (min(r, k) - 1)))
cat(cmsv)
## 0.03757501

Problem 4

1.(1)

# Scatter plot of energy (y) against danceability (x).
with(
  spotify,
  plot(x = danceability, y = energy, xlab = "danceability", ylab = "energy")
)

1.(2)

# Correlation between danceability and energy (cor()'s default method).
with(spotify, cor(x = danceability, y = energy))
## [1] 0.1105008

兩個變數呈正相關,但相關性很弱(r ≈ 0.11)。

2.(1)

library(corrplot)
## Warning: 套件 'corrplot' 是用 R 版本 4.4.2 來建造的
## corrplot 0.95 loaded
# Keep only the numeric columns. vapply() with a logical(1) template always
# returns a logical vector, whereas sapply()'s return type depends on the input.
spotify_numeric <- spotify[vapply(spotify, is.numeric, logical(1))]

# Pairwise correlation matrix of the numeric columns, drawn as a correlogram.
s <- cor(spotify_numeric)
corrplot(s)

相關性最高為loudness與energy,最低為acousticness與energy。

Problem 5

# Z-score of a single track on every numeric feature: how many standard
# deviations that track's value lies from the column mean.
# NOTE(review): row 540 is presumably the track the variable name calls
# "supersonic" -- confirm the index against the data.
target_row <- 540

# Per-column mean and standard deviation. Iterating over the columns that are
# actually present replaces the hard-coded 1:14. USE.NAMES = FALSE keeps the
# vectors unnamed so the data frame below gets default 1..n row names.
r_mean <- vapply(spotify_numeric, mean, numeric(1), USE.NAMES = FALSE)
r_sd <- vapply(spotify_numeric, sd, numeric(1), USE.NAMES = FALSE)
col_name <- colnames(spotify_numeric)

# Values of the target row as a plain numeric vector, then standardise them
# column by column in one vectorised step.
target_values <- unlist(spotify_numeric[target_row, ], use.names = FALSE)
z_score_of_supersonic <- (target_values - r_mean) / r_sd

data.frame(
  col_name,
  z_score_of_supersonic
)
##            col_name z_score_of_supersonic
## 1        popularity           -0.12229550
## 2          duration            1.41553122
## 3      danceability           -1.80100539
## 4            energy            1.35920035
## 5               key            1.02387524
## 6          loudness            1.24526594
## 7              mode            0.70546868
## 8       speechiness           -0.55497093
## 9      acousticness           -0.94942288
## 10 instrumentalness           -0.04136819
## 11         liveness           -0.66056016
## 12          valence            0.38454328
## 13            tempo           -0.69624849
## 14   time_signature            0.18782872