library(readxl)
library(dplyr)
library(skimr)
# Show the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

# Path of the Excel workbook, stored once so both calls below use the same file.
data_file <- "eurovision_1998 to 2012.xlsx"

# List the sheets in the workbook before choosing one to import.
excel_sheets(data_file)
## [1] "eurovision_meta" "Voting Final" "Sheet2"

# Import the metadata sheet; the unnamed first column is auto-named `...1`.
euro <- read_excel(path = data_file, sheet = "eurovision_meta")
## New names:
## • `` -> `...1`

# Preview the first rows.
euro %>%
  head()
## # A tibble: 6 × 30
## ...1 Year Country Region Artist Song Artist.gender Group.Solo Place Points
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 1 2009 Lithuan… Forme… Sasha… Love Male Solo 23 23
## 2 2 2009 Israel Indep… Noa a… Ther… Female Group 16 53
## 3 3 2009 France Weste… Patri… Et s… Female Solo 8 107
## 4 4 2009 Sweden Scand… Malen… La v… Female Solo 21 33
## 5 5 2009 Croatia Forme… Igor … Lije… Both Group 18 45
## 6 6 2009 Portugal Weste… Flor-… Toda… Both Group 15 57
## # ℹ 20 more variables: Home.Away.Country <chr>, Home.Away.Region <chr>,
## # Is.Final <dbl>, Semi.Final.Number <chr>, Song.In.English <dbl>,
## # Song.Quality <dbl>, Normalized.Points <dbl>, energy <chr>, duration <chr>,
## # acousticness <chr>, danceability <chr>, tempo <chr>, speechiness <chr>,
## # key <chr>, liveness <chr>, time_signature <chr>, mode <chr>,
## # loudness <chr>, valence <chr>, Happiness <chr>
# Keep only the contests from 1998 to 2010 (both bounds inclusive).
# Comma-separated conditions in filter() are combined with AND.
euro <- filter(euro, Year >= 1998, Year <= 2010)

# Number of rows and columns after the year filter.
dim(euro)
## [1] 523 30

# Inspect the column types.
str(euro)
## tibble [523 × 30] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:523] 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : num [1:523] 2009 2009 2009 2009 2009 ...
## $ Country : chr [1:523] "Lithuania" "Israel" "France" "Sweden" ...
## $ Region : chr [1:523] "Former Socialist Bloc" "Independent" "Western Europe" "Scandinavia" ...
## $ Artist : chr [1:523] "Sasha Son" "Noa and Mira Awad" "Patricia Kaas" "Malena Ernman" ...
## $ Song : chr [1:523] "Love" "There Must Be Another Way" "Et s'il fallait le faire" "La voix" ...
## $ Artist.gender : chr [1:523] "Male" "Female" "Female" "Female" ...
## $ Group.Solo : chr [1:523] "Solo" "Group" "Solo" "Solo" ...
## $ Place : num [1:523] 23 16 8 21 18 15 2 7 10 11 ...
## $ Points : num [1:523] 23 53 107 33 45 57 218 120 92 91 ...
## $ Home.Away.Country: chr [1:523] "Away" "Away" "Away" "Away" ...
## $ Home.Away.Region : chr [1:523] "Home" "Away" "Away" "Away" ...
## $ Is.Final : num [1:523] 1 1 1 1 1 1 1 1 1 1 ...
## $ Semi.Final.Number: chr [1:523] "NA" "NA" "NA" "NA" ...
## $ Song.In.English : num [1:523] 1 1 0 1 0 0 1 1 1 0 ...
## $ Song.Quality : num [1:523] 0.651 1.324 2.586 0.865 0.931 ...
## $ Normalized.Points: num [1:523] 0.00944 0.02176 0.04392 0.01355 0.01847 ...
## $ energy : chr [1:523] "0.67825829026120399" "0.794093269784657" "0.57462761597565803" "0.67274598733461499" ...
## $ duration : chr [1:523] "183.18476000000001" "179.87872999999999" "227.97333" "178.45660000000001" ...
## $ acousticness : chr [1:523] "0.56317010134575396" "6.8711032467636393E-2" "0.65407792614395999" "0.119194331013219" ...
## $ danceability : chr [1:523] "0.59836042620323504" "0.60454201016248799" "0.36734422733474298" "0.51320168397843902" ...
## $ tempo : chr [1:523] "102.98399999999999" "105.97199999999999" "124.005" "128.02099999999999" ...
## $ speechiness : chr [1:523] "2.7817063934301701E-2" "2.4996099290270999E-2" "4.3750964396931401E-2" "3.4854566287375703E-2" ...
## $ key : chr [1:523] "9" "8" "1" "10" ...
## $ liveness : chr [1:523] "7.8643134114834701E-2" "8.4805072088255995E-2" "0.92390504979230703" "0.14370767891949601" ...
## $ time_signature : chr [1:523] "3" "4" "4" "4" ...
## $ mode : chr [1:523] "0" "1" "1" "1" ...
## $ loudness : chr [1:523] "-9.0820000000000007" "-6.0590000000000002" "-8.6549999999999994" "-4.5359999999999996" ...
## $ valence : chr [1:523] "0.48205864439961499" "0.42620882292588003" "0.292869794406171" "0.20649278592322801" ...
## $ Happiness : chr [1:523] "6.1482323232300002" "5.1418867924500002" "5.06976190476" "5.5026923076900003" ...
# Size of the data set: number of observations and number of columns.
n_objects <- nrow(euro)
n_features <- ncol(euro)

# Count the numeric columns. vapply() guarantees a logical vector with one
# value per column, whereas sapply() changes its return type with the input
# (for a data frame without columns it returns an empty list and sum() fails).
numeric_count <- sum(vapply(euro, is.numeric, logical(1)))
non_numeric_count <- n_features - numeric_count

n_objects
## [1] 523
n_features
## [1] 30
numeric_count
## [1] 8
non_numeric_count
## [1] 22

# Per-column summary statistics.
skim(euro)
| Name | euro |
| Number of rows | 523 |
| Number of columns | 30 |
| _______________________ | |
| Column type frequency: | |
| character | 22 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1 | 5 | 22 | 0 | 47 | 0 |
| Region | 0 | 1 | 11 | 21 | 0 | 5 | 0 |
| Artist | 0 | 1 | 2 | 44 | 0 | 428 | 0 |
| Song | 0 | 1 | 3 | 36 | 0 | 427 | 0 |
| Artist.gender | 0 | 1 | 2 | 6 | 0 | 4 | 0 |
| Group.Solo | 0 | 1 | 2 | 5 | 0 | 3 | 0 |
| Home.Away.Country | 0 | 1 | 4 | 4 | 0 | 2 | 0 |
| Home.Away.Region | 0 | 1 | 4 | 4 | 0 | 2 | 0 |
| Semi.Final.Number | 0 | 1 | 1 | 2 | 0 | 3 | 0 |
| energy | 0 | 1 | 2 | 21 | 0 | 290 | 0 |
| duration | 0 | 1 | 2 | 18 | 0 | 282 | 0 |
| acousticness | 0 | 1 | 2 | 21 | 0 | 289 | 0 |
| danceability | 0 | 1 | 2 | 19 | 0 | 290 | 0 |
| tempo | 0 | 1 | 2 | 18 | 0 | 289 | 0 |
| speechiness | 0 | 1 | 2 | 21 | 0 | 290 | 0 |
| key | 0 | 1 | 1 | 2 | 0 | 13 | 0 |
| liveness | 0 | 1 | 2 | 21 | 0 | 290 | 0 |
| time_signature | 0 | 1 | 1 | 2 | 0 | 6 | 0 |
| mode | 0 | 1 | 1 | 2 | 0 | 3 | 0 |
| loudness | 0 | 1 | 2 | 19 | 0 | 285 | 0 |
| valence | 0 | 1 | 2 | 21 | 0 | 289 | 0 |
| Happiness | 0 | 1 | 2 | 18 | 0 | 181 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ...1 | 0 | 1 | 289.43 | 181.77 | 1.00 | 131.50 | 262.00 | 443.50 | 648.00 | ▇▇▅▇▃ |
| Year | 0 | 1 | 2005.36 | 3.56 | 1998.00 | 2003.00 | 2006.00 | 2008.00 | 2010.00 | ▃▂▅▅▇ |
| Place | 0 | 1 | 12.02 | 6.86 | 1.00 | 6.00 | 12.00 | 18.00 | 28.00 | ▇▇▇▆▂ |
| Points | 0 | 1 | 77.52 | 65.21 | 0.00 | 27.00 | 58.00 | 111.50 | 387.00 | ▇▃▂▁▁ |
| Is.Final | 0 | 1 | 0.60 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| Song.In.English | 0 | 1 | 0.71 | 0.45 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| Song.Quality | 0 | 1 | 2.60 | 2.14 | -0.42 | 0.93 | 1.97 | 3.92 | 9.80 | ▇▅▃▂▁ |
| Normalized.Points | 0 | 1 | 0.04 | 0.04 | 0.00 | 0.01 | 0.03 | 0.07 | 0.16 | ▇▃▃▁▁ |
# Count rows that are exact copies of an earlier row.
euro %>%
  duplicated() %>%
  sum()
## [1] 0
Duplikaatide kontroll näitas, et andmestikus täielikke duplikaate ei esinenud.
# No real NA values are present yet: missing entries were imported as the
# text "NA".
sum(is.na(euro))
## [1] 0

# Replace the literal string "NA" with a real missing value. Only character
# columns can contain that string, so only those columns are rewritten.
euro <- euro %>%
  mutate(across(where(is.character), ~ na_if(.x, "NA")))
sum(is.na(euro))
## [1] 3074

# Missing values per column.
colSums(is.na(euro))
## ...1 Year Country Region
## 0 0 0 0
## Artist Song Artist.gender Group.Solo
## 0 0 297 297
## Place Points Home.Away.Country Home.Away.Region
## 0 0 0 0
## Is.Final Semi.Final.Number Song.In.English Song.Quality
## 0 316 0 0
## Normalized.Points energy duration acousticness
## 0 156 156 157
## danceability tempo speechiness key
## 156 156 156 156
## liveness time_signature mode loudness
## 156 156 156 156
## valence Happiness
## 157 290

# Number of rows with at least one missing value.
sum(!complete.cases(euro))
## [1] 490
Puuduvate väärtuste analüüs näitas, et mitmes tunnuses oli
märkimisväärne hulk puuduvaid väärtusi.
Kõige probleemsemad tunnused olid Artist.gender,
Group.Solo, Semi.Final.Number ja
Happiness.
Samuti esines puudusi mitmes helitunnuses, näiteks energy,
duration, tempo ja valence.
# Drop the auto-named first column `...1`; it is a technical row index that
# is not needed for the analysis.
euro <- select(euro, -`...1`)

# Check the dimensions after removing the column.
dim(euro)
## [1] 523 29
# The audio features and the Happiness score were imported as text.
# Convert all of them to numeric with a single across() call.
euro <- euro %>%
  mutate(
    across(
      c(
        energy, duration, acousticness, danceability, tempo, speechiness,
        key, liveness, time_signature, mode, loudness, valence, Happiness
      ),
      as.numeric
    )
  )

# Verify the new column types.
str(euro)
## tibble [523 × 29] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:523] 2009 2009 2009 2009 2009 ...
## $ Country : chr [1:523] "Lithuania" "Israel" "France" "Sweden" ...
## $ Region : chr [1:523] "Former Socialist Bloc" "Independent" "Western Europe" "Scandinavia" ...
## $ Artist : chr [1:523] "Sasha Son" "Noa and Mira Awad" "Patricia Kaas" "Malena Ernman" ...
## $ Song : chr [1:523] "Love" "There Must Be Another Way" "Et s'il fallait le faire" "La voix" ...
## $ Artist.gender : chr [1:523] "Male" "Female" "Female" "Female" ...
## $ Group.Solo : chr [1:523] "Solo" "Group" "Solo" "Solo" ...
## $ Place : num [1:523] 23 16 8 21 18 15 2 7 10 11 ...
## $ Points : num [1:523] 23 53 107 33 45 57 218 120 92 91 ...
## $ Home.Away.Country: chr [1:523] "Away" "Away" "Away" "Away" ...
## $ Home.Away.Region : chr [1:523] "Home" "Away" "Away" "Away" ...
## $ Is.Final : num [1:523] 1 1 1 1 1 1 1 1 1 1 ...
## $ Semi.Final.Number: chr [1:523] NA NA NA NA ...
## $ Song.In.English : num [1:523] 1 1 0 1 0 0 1 1 1 0 ...
## $ Song.Quality : num [1:523] 0.651 1.324 2.586 0.865 0.931 ...
## $ Normalized.Points: num [1:523] 0.00944 0.02176 0.04392 0.01355 0.01847 ...
## $ energy : num [1:523] 0.678 0.794 0.575 0.673 0.734 ...
## $ duration : num [1:523] 183 180 228 178 183 ...
## $ acousticness : num [1:523] 0.5632 0.0687 0.6541 0.1192 0.3236 ...
## $ danceability : num [1:523] 0.598 0.605 0.367 0.513 0.591 ...
## $ tempo : num [1:523] 103 106 124 128 116 ...
## $ speechiness : num [1:523] 0.0278 0.025 0.0438 0.0349 0.0324 ...
## $ key : num [1:523] 9 8 1 10 1 7 0 6 4 2 ...
## $ liveness : num [1:523] 0.0786 0.0848 0.9239 0.1437 0.0793 ...
## $ time_signature : num [1:523] 3 4 4 4 4 4 4 4 3 4 ...
## $ mode : num [1:523] 0 1 1 1 0 1 0 0 0 0 ...
## $ loudness : num [1:523] -9.08 -6.06 -8.65 -4.54 -4.43 ...
## $ valence : num [1:523] 0.482 0.426 0.293 0.206 0.345 ...
## $ Happiness : num [1:523] 6.15 5.14 5.07 5.5 NA ...
# Recompute the size and type summary after the type conversion.
n_objects <- nrow(euro)
n_features <- ncol(euro)

# Count the numeric columns. vapply() guarantees a logical vector with one
# value per column, whereas sapply() changes its return type with the input
# (for a data frame without columns it returns an empty list and sum() fails).
numeric_count <- sum(vapply(euro, is.numeric, logical(1)))
non_numeric_count <- n_features - numeric_count

n_objects
## [1] 523
n_features
## [1] 29
numeric_count
## [1] 20
non_numeric_count
## [1] 9

# Per-column summary statistics, now including missing-value counts.
skim(euro)
| Name | euro |
| Number of rows | 523 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 20 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1.00 | 5 | 22 | 0 | 47 | 0 |
| Region | 0 | 1.00 | 11 | 21 | 0 | 5 | 0 |
| Artist | 0 | 1.00 | 2 | 44 | 0 | 428 | 0 |
| Song | 0 | 1.00 | 3 | 36 | 0 | 427 | 0 |
| Artist.gender | 297 | 0.43 | 4 | 6 | 0 | 3 | 0 |
| Group.Solo | 297 | 0.43 | 4 | 5 | 0 | 2 | 0 |
| Home.Away.Country | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| Home.Away.Region | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| Semi.Final.Number | 316 | 0.40 | 1 | 1 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Year | 0 | 1.00 | 2005.36 | 3.56 | 1998.00 | 2003.00 | 2006.00 | 2008.00 | 2010.00 | ▃▂▅▅▇ |
| Place | 0 | 1.00 | 12.02 | 6.86 | 1.00 | 6.00 | 12.00 | 18.00 | 28.00 | ▇▇▇▆▂ |
| Points | 0 | 1.00 | 77.52 | 65.21 | 0.00 | 27.00 | 58.00 | 111.50 | 387.00 | ▇▃▂▁▁ |
| Is.Final | 0 | 1.00 | 0.60 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▅▁▁▁▇ |
| Song.In.English | 0 | 1.00 | 0.71 | 0.45 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| Song.Quality | 0 | 1.00 | 2.60 | 2.14 | -0.42 | 0.93 | 1.97 | 3.92 | 9.80 | ▇▅▃▂▁ |
| Normalized.Points | 0 | 1.00 | 0.04 | 0.04 | 0.00 | 0.01 | 0.03 | 0.07 | 0.16 | ▇▃▃▁▁ |
| energy | 156 | 0.70 | 0.73 | 0.18 | 0.06 | 0.61 | 0.76 | 0.89 | 1.00 | ▁▂▃▇▇ |
| duration | 156 | 0.70 | 192.66 | 37.06 | 30.02 | 179.83 | 182.38 | 186.28 | 445.75 | ▁▇▁▁▁ |
| acousticness | 157 | 0.70 | 0.16 | 0.23 | 0.00 | 0.01 | 0.05 | 0.21 | 0.99 | ▇▁▁▁▁ |
| danceability | 156 | 0.70 | 0.57 | 0.14 | 0.18 | 0.49 | 0.58 | 0.67 | 0.89 | ▂▃▇▇▁ |
| tempo | 156 | 0.70 | 112.71 | 25.73 | 64.97 | 89.68 | 119.95 | 130.99 | 211.99 | ▆▅▇▁▁ |
| speechiness | 156 | 0.70 | 0.06 | 0.05 | 0.02 | 0.03 | 0.04 | 0.06 | 0.51 | ▇▁▁▁▁ |
| key | 156 | 0.70 | 5.76 | 3.49 | 0.00 | 2.00 | 6.00 | 9.00 | 11.00 | ▇▃▅▅▇ |
| liveness | 156 | 0.70 | 0.20 | 0.18 | 0.03 | 0.09 | 0.14 | 0.28 | 0.98 | ▇▃▁▁▁ |
| time_signature | 156 | 0.70 | 3.90 | 0.53 | 1.00 | 4.00 | 4.00 | 4.00 | 7.00 | ▁▁▇▁▁ |
| mode | 156 | 0.70 | 0.57 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| loudness | 156 | 0.70 | -6.50 | 2.24 | -21.88 | -7.53 | -6.18 | -4.98 | -2.35 | ▁▁▁▇▇ |
| valence | 157 | 0.70 | 0.52 | 0.24 | 0.02 | 0.33 | 0.52 | 0.71 | 0.97 | ▃▆▇▇▅ |
| Happiness | 290 | 0.45 | 5.40 | 0.31 | 4.38 | 5.22 | 5.47 | 5.59 | 6.15 | ▁▂▅▇▁ |
# Remove the four columns with the most missing values
# (between 290 and 316 of the 523 rows are missing in each).
euro <- select(
  euro,
  -c(Artist.gender, Group.Solo, Semi.Final.Number, Happiness)
)

# Check the dimensions after removing the columns.
dim(euro)
## [1] 523 25
# Missing values that remain, per column.
euro %>%
  is.na() %>%
  colSums()
## Year Country Region Artist
## 0 0 0 0
## Song Place Points Home.Away.Country
## 0 0 0 0
## Home.Away.Region Is.Final Song.In.English Song.Quality
## 0 0 0 0
## Normalized.Points energy duration acousticness
## 0 156 156 157
## danceability tempo speechiness key
## 156 156 156 156
## liveness time_signature mode loudness
## 156 156 156 156
## valence
## 157

# Total number of missing cells.
euro %>%
  is.na() %>%
  sum()
## [1] 1874

# Number of rows with at least one missing value.
sum(!complete.cases(euro))
## [1] 157

# Keep only the fully observed rows.
euro_clean <- euro %>%
  na.omit()
dim(euro_clean)
## [1] 366 25
# Convert the text columns to factors and give the 0/1 coded columns
# readable labels, all in one mutate() call.
euro_clean <- euro_clean %>%
  mutate(
    across(
      c(Country, Region, Artist, Song, Home.Away.Country, Home.Away.Region),
      as.factor
    ),
    # 0/1 indicators: 0 is labelled "Ei" (no), 1 is labelled "Jah" (yes).
    Is.Final = factor(Is.Final, levels = c(0, 1), labels = c("Ei", "Jah")),
    Song.In.English = factor(
      Song.In.English,
      levels = c(0, 1),
      labels = c("Ei", "Jah")
    ),
    # 0 is labelled "Minor", 1 is labelled "Major".
    mode = factor(mode, levels = c(0, 1), labels = c("Minor", "Major"))
  )

# Check the resulting structure.
str(euro_clean)
## tibble [366 × 25] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:366] 2009 2009 2009 2009 2009 ...
## $ Country : Factor w/ 46 levels "Albania","Andorra",..: 25 23 16 42 10 34 21 19 3 36 ...
## $ Region : Factor w/ 5 levels "Former Socialist Bloc",..: 1 3 5 4 2 5 4 5 1 1 ...
## $ Artist : Factor w/ 289 levels "4Fun","Afro-dite",..: 228 191 202 156 109 80 287 224 113 8 ...
## $ Song : Factor w/ 292 levels "'Ajde, kro?i",..: 162 247 65 139 154 255 121 251 126 167 ...
## $ Place : num [1:366] 23 16 8 21 18 15 2 7 10 11 ...
## $ Points : num [1:366] 23 53 107 33 45 57 218 120 92 91 ...
## $ Home.Away.Country: Factor w/ 2 levels "Away","Home": 1 1 1 1 1 1 1 1 1 2 ...
## $ Home.Away.Region : Factor w/ 2 levels "Away","Home": 2 1 1 1 1 1 1 1 2 2 ...
## $ Is.Final : Factor w/ 2 levels "Ei","Jah": 2 2 2 2 2 2 2 2 2 2 ...
## $ Song.In.English : Factor w/ 2 levels "Ei","Jah": 2 2 1 2 1 1 2 2 2 1 ...
## $ Song.Quality : num [1:366] 0.651 1.324 2.586 0.865 0.931 ...
## $ Normalized.Points: num [1:366] 0.00944 0.02176 0.04392 0.01355 0.01847 ...
## $ energy : num [1:366] 0.678 0.794 0.575 0.673 0.734 ...
## $ duration : num [1:366] 183 180 228 178 183 ...
## $ acousticness : num [1:366] 0.5632 0.0687 0.6541 0.1192 0.3236 ...
## $ danceability : num [1:366] 0.598 0.605 0.367 0.513 0.591 ...
## $ tempo : num [1:366] 103 106 124 128 116 ...
## $ speechiness : num [1:366] 0.0278 0.025 0.0438 0.0349 0.0324 ...
## $ key : num [1:366] 9 8 1 10 1 7 0 6 4 2 ...
## $ liveness : num [1:366] 0.0786 0.0848 0.9239 0.1437 0.0793 ...
## $ time_signature : num [1:366] 3 4 4 4 4 4 4 4 3 4 ...
## $ mode : Factor w/ 2 levels "Minor","Major": 1 2 2 2 1 2 1 1 1 1 ...
## $ loudness : num [1:366] -9.08 -6.06 -8.65 -4.54 -4.43 ...
## $ valence : num [1:366] 0.482 0.426 0.293 0.206 0.345 ...
## - attr(*, "na.action")= 'omit' Named int [1:157] 15 25 31 33 46 48 53 56 71 86 ...
## ..- attr(*, "names")= chr [1:157] "15" "25" "31" "33" ...
# Size and type summary of the cleaned data set.
n_objects_clean <- nrow(euro_clean)
n_features_clean <- ncol(euro_clean)

# Count the numeric columns. vapply() guarantees a logical vector with one
# value per column, whereas sapply() changes its return type with the input
# (for a data frame without columns it returns an empty list and sum() fails).
numeric_count_clean <- sum(vapply(euro_clean, is.numeric, logical(1)))
non_numeric_count_clean <- n_features_clean - numeric_count_clean

n_objects_clean
## [1] 366
n_features_clean
## [1] 25
numeric_count_clean
## [1] 16
non_numeric_count_clean
## [1] 9

# Per-column summary statistics of the cleaned data.
skim(euro_clean)
| Name | euro_clean |
| Number of rows | 366 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| factor | 9 |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Country | 0 | 1 | FALSE | 46 | Cro: 14, Ice: 14, Nor: 14, Lat: 13 |
| Region | 0 | 1 | FALSE | 5 | For: 118, Wes: 94, Sca: 60, Ind: 52 |
| Artist | 0 | 1 | FALSE | 289 | Chi: 4, Dim: 4, Sak: 4, Ale: 2 |
| Song | 0 | 1 | FALSE | 292 | Shi: 3, All: 2, Alw: 2, Ang: 2 |
| Home.Away.Country | 0 | 1 | FALSE | 2 | Awa: 356, Hom: 10 |
| Home.Away.Region | 0 | 1 | FALSE | 2 | Awa: 297, Hom: 69 |
| Is.Final | 0 | 1 | FALSE | 2 | Jah: 208, Ei: 158 |
| Song.In.English | 0 | 1 | FALSE | 2 | Jah: 273, Ei: 93 |
| mode | 0 | 1 | FALSE | 2 | Maj: 209, Min: 157 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Year | 0 | 1 | 2006.14 | 3.26 | 1998.00 | 2004.00 | 2007.00 | 2009.00 | 2010.00 | ▂▂▃▅▇ |
| Place | 0 | 1 | 11.42 | 6.82 | 1.00 | 5.00 | 11.00 | 17.00 | 27.00 | ▇▆▆▅▃ |
| Points | 0 | 1 | 83.72 | 67.17 | 0.00 | 33.00 | 68.00 | 120.00 | 387.00 | ▇▃▂▁▁ |
| Song.Quality | 0 | 1 | 2.80 | 2.21 | -0.36 | 1.00 | 2.16 | 4.25 | 9.80 | ▇▅▃▂▁ |
| Normalized.Points | 0 | 1 | 0.05 | 0.04 | 0.00 | 0.02 | 0.04 | 0.07 | 0.16 | ▇▅▃▂▁ |
| energy | 0 | 1 | 0.73 | 0.18 | 0.06 | 0.61 | 0.76 | 0.89 | 1.00 | ▁▂▃▇▇ |
| duration | 0 | 1 | 192.68 | 37.11 | 30.02 | 179.81 | 182.35 | 186.13 | 445.75 | ▁▇▁▁▁ |
| acousticness | 0 | 1 | 0.16 | 0.23 | 0.00 | 0.01 | 0.05 | 0.21 | 0.99 | ▇▁▁▁▁ |
| danceability | 0 | 1 | 0.57 | 0.14 | 0.18 | 0.49 | 0.58 | 0.67 | 0.89 | ▂▃▇▇▁ |
| tempo | 0 | 1 | 112.63 | 25.72 | 64.97 | 89.51 | 119.95 | 130.77 | 211.99 | ▆▅▇▁▁ |
| speechiness | 0 | 1 | 0.06 | 0.05 | 0.02 | 0.03 | 0.04 | 0.06 | 0.51 | ▇▁▁▁▁ |
| key | 0 | 1 | 5.77 | 3.49 | 0.00 | 2.25 | 6.00 | 9.00 | 11.00 | ▇▃▅▅▇ |
| liveness | 0 | 1 | 0.20 | 0.18 | 0.03 | 0.09 | 0.14 | 0.28 | 0.98 | ▇▃▁▁▁ |
| time_signature | 0 | 1 | 3.90 | 0.53 | 1.00 | 4.00 | 4.00 | 4.00 | 7.00 | ▁▁▇▁▁ |
| loudness | 0 | 1 | -6.49 | 2.24 | -21.88 | -7.52 | -6.18 | -4.97 | -2.35 | ▁▁▁▇▇ |
| valence | 0 | 1 | 0.52 | 0.24 | 0.02 | 0.33 | 0.52 | 0.71 | 0.97 | ▃▆▇▇▅ |
Pärast andmete puhastamist jäi andmestikku 366 objekti ja 25
tunnust.
Nendest 16 on arvulised tunnused ja 9 faktorid.
Andmestikust eemaldati tehniline tunnus ...1, kuna see
ei sisaldanud analüüsi jaoks sisulist infot.
Lisaks eemaldati tunnused Artist.gender,
Group.Solo, Semi.Final.Number ja
Happiness, sest nendes oli väga palju puuduvaid
väärtusi.
Tekstina esitatud väärtus "NA" asendati
standardsümboliga NA.
Seejärel teisendati mitmed algselt tekstina loetud arvulised tunnused
arvulisele kujule.
Pärast seda eemaldati ülejäänud puuduvate väärtustega objektid
funktsiooni na.omit() abil.
Tulemuseks saadi analüüsimiseks sobivam ja ühtlasema struktuuriga andmestik.
Käesolevas töös valisin sihttunnuseks Normalized.Points,
sest see on arvuline pidev tunnus ja väljendab laulu edukust võrreldaval
kujul.
Arvuliseks seletavaks tunnuseks valisin Song.Quality, kuna
on loogiline eeldada, et kõrgem kvaliteedihinnang võib olla seotud
parema lõpptulemusega.
Kategoriaalseks tunnuseks valisin Song.In.English, et
uurida, kas ingliskeelsed laulud erinevad tulemuse poolest
mitteingliskeelsetest lauludest.
# Scatter plot of the target (Normalized.Points) against Song.Quality.
with(
  euro_clean,
  plot(
    x = Song.Quality,
    y = Normalized.Points,
    xlab = "Song.Quality",
    ylab = "Normalized.Points",
    pch = 19
  )
)
Seos tunnuste Song.Quality ja Normalized.Points vahel
Hajuvusdiagrammi põhjal on näha, et tunnuste
Song.Quality ja Normalized.Points vahel esineb
pigem positiivne seos.
Üldiselt on kõrgema kvaliteedihinnanguga lauludel kalduvus saada rohkem
normaliseeritud punkte.
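Seos on ligilähedaselt lineaarne ja hajuvus sirge ümber on väike, mida
kinnitab ka allpool arvutatud korrelatsioonikordaja (ligikaudu 0,99);
sama kvaliteedihinnangu korral erinevad tulemused vaid vähesel määral.
Seega võib öelda, et Song.Quality on sihttunnusega väga tugevalt seotud,
kuid see ei pruugi olla ainus tulemust selgitav tegur ega tähenda
tingimata põhjuslikku mõju.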
# Pearson correlation between Song.Quality and the target.
with(euro_clean, cor(Song.Quality, Normalized.Points))
## [1] 0.9937847
Korrelatsioonikordaja (r ≈ 0,99) näitab, et tunnuste Song.Quality ja Normalized.Points vahel esineb väga tugev positiivne lineaarne seos. See tähendab, et suurema kvaliteedihinnanguga laulud saavad peaaegu alati ka rohkem normaliseeritud punkte.
# Distribution of the target for non-English ("Ei") and English ("Jah") songs.
boxplot(
  Normalized.Points ~ Song.In.English,
  data = euro_clean,
  ylab = "Normalized.Points",
  xlab = "Song.In.English"
)
Normalized.Points jaotus laulukeelte rühmades
Karpdiagramm võimaldab võrrelda sihttunnuse
Normalized.Points jaotust ingliskeelsete ja
mitteingliskeelsete laulude rühmades.
Andmete põhjal moodustavad ingliskeelsed laulud suurema osa vaadeldud
andmestikust.
Diagrammi põhjal saab hinnata, kas rühmade mediaanid ja hajuvused
erinevad.
Kui ingliskeelsete laulude mediaan on kõrgem, siis võib järeldada, et
inglise keeles esitatud laulud saavad tüüpiliselt rohkem punkte. Samas
ainult diagrammi põhjal ei saa väita väga tugevat põhjuslikku seost,
vaid pigem võimalikku erinevust rühmade vahel.
# Absolute frequencies of non-English ("Ei") and English ("Jah") songs.
english_counts <- table(euro_clean$Song.In.English)
english_counts
##
## Ei Jah
## 93 273

# Relative frequencies, computed once and reused for the bar chart.
english_share <- prop.table(english_counts)
english_share
##
## Ei Jah
## 0.2540984 0.7459016

# Bar chart of the relative frequencies.
barplot(
  english_share,
  main = "Song.In.English suhtelised sagedused",
  xlab = "Song.In.English",
  ylab = "Suhteline sagedus"
)
Tunnuse
Song.In.English sagedustabel näitab, et
ingliskeelsed laulud esinevad andmestikus oluliselt sagedamini kui
mitteingliskeelsed laulud.
Inglise keeles esitatud laulude osakaal on ligikaudu 74,6%,
mitteingliskeelsete laulude osakaal aga 25,4%.
Seega võib öelda, et tüüpilisem väärtus on Jah, mis
tähistab ingliskeelset laulu.
Väärtus Ei esineb märgatavalt harvemini.
Jaotus ei ole ühtlane, sest üks kategooria domineerib selgelt teise
üle.
# Histogram of the target. No `main` is given, so the default title is built
# from the expression passed as `x`; the expression is kept unchanged.
hist(
  x = euro_clean$Normalized.Points,
  xlab = "Normalized.Points"
)
Sihttunnuse Normalized.Points histogramm
Tunnuse Normalized.Points histogrammi põhjal on jaotus
paremale viltune.
Suurem osa väärtustest paikneb väiksemate tulemuste juures ning suuremad
väärtused esinevad harvemini.
Keskmine on ligikaudu 0,0474 ja mediaan 0,0366. Need väärtused arvutati funktsioonidega mean() ja median(). Kuna keskmine on mediaanist suurem, toetab see järeldust, et jaotus on paremale viltune.
Minimaalne väärtus on 0 ja maksimaalne väärtus 0,1589, mis näitab
üsna suurt ulatust.
Standardhälve on umbes 0,0366 ning kvartiilhaare 0,0529, mis viitab
mõõdukale hajuvusele.
# Box plot of the target on its own, used to look for outliers.
boxplot(
  x = euro_clean$Normalized.Points,
  ylab = "Normalized.Points",
  main = "Normalized.Points karpdiagramm"
)
Karpdiagrammi põhjal võib andmestikus esineda erindeid, eriti suuremate
väärtuste piirkonnas.
See on loogiline, sest mõned laulud said võrreldes enamusega oluliselt
rohkem punkte.
Seega võib öelda, et sihttunnuse jaotus ei ole sümmeetriline ning
sisaldab võimalikke erindväärtusi.
# The target variable, extracted once for the summaries below.
norm_points <- euro_clean$Normalized.Points

# Frequency table over 10 equal-width intervals.
table(cut(norm_points, breaks = 10))
##
## (-0.000159,0.0159] (0.0159,0.0318] (0.0318,0.0477] (0.0477,0.0635]
## 80 86 45 44
## (0.0635,0.0794] (0.0794,0.0953] (0.0953,0.111] (0.111,0.127]
## 37 28 22 12
## (0.127,0.143] (0.143,0.159]
## 7 5

# Five-number summary plus the mean.
summary(norm_points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.01806 0.03657 0.04745 0.07100 0.15887

# Measures of location.
mean(norm_points)
## [1] 0.04744768
median(norm_points)
## [1] 0.03656655
min(norm_points)
## [1] 0
max(norm_points)
## [1] 0.158867

# Measures of spread: standard deviation and interquartile range.
sd(norm_points)
## [1] 0.03663326
IQR(norm_points)
## [1] 0.05293988
# Scatter plot of the target (Normalized.Points) against the energy feature.
with(
  euro_clean,
  plot(
    x = energy,
    y = Normalized.Points,
    main = "Normalized.Points ja energy seos",
    xlab = "energy",
    ylab = "Normalized.Points",
    pch = 19
  )
)
Tunnuste
energy ja Normalized.Points vaheline
seos ei paista olevat eriti tugev.
Hajuvusdiagrammil puudub selgelt väljendunud lineaarne muster ning sama
energiasisalduse juures võivad tulemused olla üsna erinevad.
Seetõttu võib järeldada, et energy mõju sihttunnusele on
nõrgem kui tunnusel Song.Quality.
Selles seminaritöös analüüsisin Eurovisiooni võistluslaule aastatel
1998–2010.
Töö käigus lugesin andmestiku R-i, uurisin selle struktuuri, puhastasin
andmed, käsitlesin puuduvaid väärtusi ning teisendasin tunnuste
tüüpe.
Analüüsis kasutati kirjeldava statistika meetodeid ning andmete
visualiseerimist, et hinnata tunnuste vahelisi võimalikke seoseid.
Pärast puhastamist jäi andmestikku 366 objekti ja 25 tunnust, millest
16 olid arvulised ja 9 faktorid.
Valitud sihttunnuseks oli Normalized.Points.
Analüüsi põhjal selgus, et tunnusel Song.Quality on
sihttunnusega nähtav positiivne seos.
Kategoriaalse tunnuse Song.In.English puhul ilmnes, et
ingliskeelsed laulud moodustavad andmestikus enamuse.
Sihttunnuse jaotus osutus paremale viltuseks ning sisaldas võimalikke
erindväärtusi.
Kokkuvõttes võib öelda, et Eurovisiooni tulemusi mõjutavad mitmed
tegurid, kuid käesolevas analüüsis osutus vaadeldud tunnustest kõige
sisukamaks seletajaks Song.Quality. Lisaks näitas
korrelatsioonianalüüs, et tunnuse Song.Quality ja
sihttunnuse Normalized.Points vahel esineb väga tugev
positiivne seos (r ≈ 0,99). Tuleb arvestada, et tegemist on kirjeldava analüüsiga
ning saadud seosed ei pruugi viidata otsesele põhjuslikule seosele.