Dataset yang kami gunakan adalah dataset Happiness. Dataset ini memberikan peringkat kebahagiaan dan skor kebahagiaan dari 158 negara di seluruh dunia berdasarkan tujuh faktor, yaitu keluarga, harapan hidup, ekonomi, kemurahan hati, kepercayaan pada pemerintah, kebebasan, dan residual distopia (dystopia residual). Jumlah nilai dari tujuh faktor ini memberi kita skor kebahagiaan, dan semakin tinggi skor kebahagiaan, semakin kecil angka peringkat kebahagiaannya. Jadi, jelas bahwa nilai yang lebih tinggi dari masing-masing tujuh faktor ini berarti tingkat kebahagiaan yang lebih tinggi. Kita dapat mendefinisikan arti dari faktor-faktor ini sebagai sejauh mana faktor-faktor ini mengarah pada kebahagiaan.
Kasus ini menerapkan beberapa algoritme pembelajaran mesin untuk memprediksi skor kebahagiaan dan membandingkan hasilnya untuk menemukan algoritme mana yang bekerja lebih baik untuk kumpulan data spesifik ini.
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ tidyr 1.1.4 ✓ forcats 0.5.1
## ✓ readr 2.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(caTools)
library(ggplot2)
library(ggthemes)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(tidyr)
library(corrplot)
## corrplot 0.92 loaded
library(formattable)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
##
## get_legend
## The following object is masked from 'package:plyr':
##
## mutate
# Load the 2015 World Happiness Report data and preview its first ten rows
Happiness <- read.csv(file = "Data/Happiness.csv")
head(x = Happiness, n = 10)
## Country Region Happiness.Rank Happiness.Score
## 1 Switzerland Western Europe 1 7.587
## 2 Iceland Western Europe 2 7.561
## 3 Denmark Western Europe 3 7.527
## 4 Norway Western Europe 4 7.522
## 5 Canada North America 5 7.427
## 6 Finland Western Europe 6 7.406
## 7 Netherlands Western Europe 7 7.378
## 8 Sweden Western Europe 8 7.364
## 9 New Zealand Australia and New Zealand 9 7.286
## 10 Australia Australia and New Zealand 10 7.284
## Standard.Error Economy..GDP.per.Capita. Family Health..Life.Expectancy.
## 1 0.03411 1.39651 1.34951 0.94143
## 2 0.04884 1.30232 1.40223 0.94784
## 3 0.03328 1.32548 1.36058 0.87464
## 4 0.03880 1.45900 1.33095 0.88521
## 5 0.03553 1.32629 1.32261 0.90563
## 6 0.03140 1.29025 1.31826 0.88911
## 7 0.02799 1.32944 1.28017 0.89284
## 8 0.03157 1.33171 1.28907 0.91087
## 9 0.03371 1.25018 1.31967 0.90837
## 10 0.04083 1.33358 1.30923 0.93156
## Freedom Trust..Government.Corruption. Generosity Dystopia.Residual
## 1 0.66557 0.41978 0.29678 2.51738
## 2 0.62877 0.14145 0.43630 2.70201
## 3 0.64938 0.48357 0.34139 2.49204
## 4 0.66973 0.36503 0.34699 2.46531
## 5 0.63297 0.32957 0.45811 2.45176
## 6 0.64169 0.41372 0.23351 2.61955
## 7 0.61576 0.31814 0.47610 2.46570
## 8 0.65980 0.43844 0.36262 2.37119
## 9 0.63938 0.42922 0.47501 2.26425
## 10 0.65124 0.35637 0.43562 2.26646
# Compact overview of every column: its type and the first few values
Happiness %>% glimpse()
## Rows: 158
## Columns: 12
## $ Country <chr> "Switzerland", "Iceland", "Denmark", "No…
## $ Region <chr> "Western Europe", "Western Europe", "Wes…
## $ Happiness.Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
## $ Happiness.Score <dbl> 7.587, 7.561, 7.527, 7.522, 7.427, 7.406…
## $ Standard.Error <dbl> 0.03411, 0.04884, 0.03328, 0.03880, 0.03…
## $ Economy..GDP.per.Capita. <dbl> 1.39651, 1.30232, 1.32548, 1.45900, 1.32…
## $ Family <dbl> 1.34951, 1.40223, 1.36058, 1.33095, 1.32…
## $ Health..Life.Expectancy. <dbl> 0.94143, 0.94784, 0.87464, 0.88521, 0.90…
## $ Freedom <dbl> 0.66557, 0.62877, 0.64938, 0.66973, 0.63…
## $ Trust..Government.Corruption. <dbl> 0.41978, 0.14145, 0.48357, 0.36503, 0.32…
## $ Generosity <dbl> 0.29678, 0.43630, 0.34139, 0.34699, 0.45…
## $ Dystopia.Residual <dbl> 2.51738, 2.70201, 2.49204, 2.46531, 2.45…
# Number of missing values in each column
colSums(is.na(Happiness))
## Country Region
## 0 0
## Happiness.Rank Happiness.Score
## 0 0
## Standard.Error Economy..GDP.per.Capita.
## 0 0
## Family Health..Life.Expectancy.
## 0 0
## Freedom Trust..Government.Corruption.
## 0 0
## Generosity Dystopia.Residual
## 0 0
# Box plots of all columns except the two text columns
boxplot(select(Happiness, -c(Country, Region)))
# Per-column summary statistics for the whole data frame
summary(object = Happiness)
## Country Region Happiness.Rank Happiness.Score
## Length:158 Length:158 Min. : 1.00 Min. :2.839
## Class :character Class :character 1st Qu.: 40.25 1st Qu.:4.526
## Mode :character Mode :character Median : 79.50 Median :5.232
## Mean : 79.49 Mean :5.376
## 3rd Qu.:118.75 3rd Qu.:6.244
## Max. :158.00 Max. :7.587
## Standard.Error Economy..GDP.per.Capita. Family
## Min. :0.01848 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.03727 1st Qu.:0.5458 1st Qu.:0.8568
## Median :0.04394 Median :0.9102 Median :1.0295
## Mean :0.04788 Mean :0.8461 Mean :0.9910
## 3rd Qu.:0.05230 3rd Qu.:1.1584 3rd Qu.:1.2144
## Max. :0.13693 Max. :1.6904 Max. :1.4022
## Health..Life.Expectancy. Freedom Trust..Government.Corruption.
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.4392 1st Qu.:0.3283 1st Qu.:0.06168
## Median :0.6967 Median :0.4355 Median :0.10722
## Mean :0.6303 Mean :0.4286 Mean :0.14342
## 3rd Qu.:0.8110 3rd Qu.:0.5491 3rd Qu.:0.18025
## Max. :1.0252 Max. :0.6697 Max. :0.55191
## Generosity Dystopia.Residual
## Min. :0.0000 Min. :0.3286
## 1st Qu.:0.1506 1st Qu.:1.7594
## Median :0.2161 Median :2.0954
## Mean :0.2373 Mean :2.0990
## 3rd Qu.:0.3099 3rd Qu.:2.4624
## Max. :0.7959 Max. :3.6021
Sekarang kita dapat memuat dataset dan melihat struktur variabel kebahagiaan. Datasetnya cukup bersih, tetapi kami akan menerapkan beberapa penyesuaian (cleansing) agar terlihat lebih baik.
Kami telah mengamati variabel di dalam dataset kami di atas, kelasnya, dan beberapa pengamatan pertama masing-masing. Faktanya, dataset memiliki 158 observasi dan 12 variabel. Beberapa nama variabel tidak cukup jelas, sehingga kami memutuskan untuk sedikit mengubah beberapa di antaranya. Selain itu, kami akan menghapus variabel Region dan Standard Error dari dataset karena tidak diperlukan untuk visualisasi dan prediksi.
# Give the columns shorter names; the order must match the column order of
# the loaded data frame
names(Happiness) <- c(
  "Country", "Region", "Happiness.Rank", "Happiness.Score",
  "Standard.Error", "Economy", "Family", "Life.Expectancy",
  "Freedom", "Trust", "Generosity", "Dystopia.Residual"
)
# Drop the columns that are not needed (Standard.Error and Region)
Happiness <- Happiness[, !(names(Happiness) %in% c("Standard.Error", "Region"))]
# Column glossary
# Country: name of the country
# Happiness.Rank: rank of the country based on its happiness score
# Happiness.Score: happiness measured on a scale from 0 to 10
# Economy: value of all final goods and services produced in a country in a given year
# Family: importance of having a family
# Life.Expectancy: importance of health and the expected length of life
# Freedom: importance of freedom in each country
# Generosity: the quality of being kind and generous
# Trust: perception of corruption in government
# Dystopia.Residual: used as a reference
head(x = Happiness, n = 10)
## Country Happiness.Rank Happiness.Score Economy Family Life.Expectancy
## 1 Switzerland 1 7.587 1.39651 1.34951 0.94143
## 2 Iceland 2 7.561 1.30232 1.40223 0.94784
## 3 Denmark 3 7.527 1.32548 1.36058 0.87464
## 4 Norway 4 7.522 1.45900 1.33095 0.88521
## 5 Canada 5 7.427 1.32629 1.32261 0.90563
## 6 Finland 6 7.406 1.29025 1.31826 0.88911
## 7 Netherlands 7 7.378 1.32944 1.28017 0.89284
## 8 Sweden 8 7.364 1.33171 1.28907 0.91087
## 9 New Zealand 9 7.286 1.25018 1.31967 0.90837
## 10 Australia 10 7.284 1.33358 1.30923 0.93156
## Freedom Trust Generosity Dystopia.Residual
## 1 0.66557 0.41978 0.29678 2.51738
## 2 0.62877 0.14145 0.43630 2.70201
## 3 0.64938 0.48357 0.34139 2.49204
## 4 0.66973 0.36503 0.34699 2.46531
## 5 0.63297 0.32957 0.45811 2.45176
## 6 0.64169 0.41372 0.23351 2.61955
## 7 0.61576 0.31814 0.47610 2.46570
## 8 0.65980 0.43844 0.36262 2.37119
## 9 0.63938 0.42922 0.47501 2.26425
## 10 0.65124 0.35637 0.43562 2.26646
Langkah selanjutnya adalah menambahkan kolom lain ke dataset, yaitu Continent. Kami ingin mengetahui apakah di benua yang berbeda terdapat tren yang berbeda mengenai faktor mana yang memainkan peran penting dalam mendapatkan skor kebahagiaan yang lebih tinggi. Asia, Afrika, Amerika, Eropa, dan Australia adalah lima benua kami dalam kumpulan data ini. Kemudian kami memindahkan posisi kolom Continent ke kolom kedua karena menurut kami dengan pengaturan posisi ini dataset terlihat lebih baik. Lalu, kami mengubah jenis variabel benua menjadi faktor agar mudah digunakan untuk visualisasi. Sekarang kami dapat melihat struktur akhir dari dataset kami yang terdiri dari 158 observasi dan 11 variabel. Negara adalah variabel karakter, benua adalah variabel faktor, peringkat kebahagiaan adalah bilangan bulat, dan variabel yang tersisa bertipe numerik.
# Create a new column for continents.
# Countries are matched by exact name. Anything that matches none of the lists
# below falls through to "Africa", so a country whose spelling in the CSV
# differs from the list would be silently mislabelled.
# NOTE(review): "Hong Kong", "Oman", "Laos" and "Suriname" were added because
# the 2015 report is believed to contain these rows/spellings; the additions
# have no effect if the names are absent - confirm against Data/Happiness.csv.
Happiness$Continent <- NA_character_
Happiness$Continent[Happiness$Country %in% c(
  "Israel", "United Arab Emirates", "Singapore", "Thailand", "Taiwan",
  "Qatar", "Saudi Arabia", "Kuwait", "Bahrain", "Malaysia", "Uzbekistan",
  "Japan", "South Korea", "Turkmenistan", "Kazakhstan", "Turkey",
  "Hong Kong, China", "Hong Kong", "Philippines", "Jordan", "China",
  "Pakistan", "Indonesia", "Azerbaijan", "Lebanon", "Vietnam", "Tajikistan",
  "Bhutan", "Kyrgyzstan", "Nepal", "Mongolia", "Palestinian Territories",
  "Iran", "Bangladesh", "Myanmar", "Iraq", "Sri Lanka", "Armenia", "India",
  "Georgia", "Cambodia", "Afghanistan", "Yemen", "Syria", "Oman", "Laos"
)] <- "Asia"
Happiness$Continent[Happiness$Country %in% c(
  "Norway", "Denmark", "Iceland", "Switzerland", "Finland", "Netherlands",
  "Sweden", "Austria", "Ireland", "Germany", "Belgium", "Luxembourg",
  "United Kingdom", "Czech Republic", "Malta", "France", "Spain", "Slovakia",
  "Poland", "Italy", "Russia", "Lithuania", "Latvia", "Moldova", "Romania",
  "Slovenia", "North Cyprus", "Cyprus", "Estonia", "Belarus", "Serbia",
  "Hungary", "Croatia", "Kosovo", "Montenegro", "Greece", "Portugal",
  "Bosnia and Herzegovina", "Macedonia", "Bulgaria", "Albania", "Ukraine"
)] <- "Europe"
Happiness$Continent[Happiness$Country %in% c(
  "Canada", "Costa Rica", "United States", "Mexico", "Panama",
  "Trinidad and Tobago", "El Salvador", "Belize", "Guatemala", "Jamaica",
  "Nicaragua", "Dominican Republic", "Honduras", "Haiti", "Chile", "Brazil",
  "Argentina", "Uruguay", "Colombia", "Ecuador", "Bolivia", "Peru",
  "Paraguay", "Venezuela", "Suriname"
)] <- "America"
Happiness$Continent[Happiness$Country %in% c("New Zealand", "Australia")] <- "Australia"
# Every country not matched above is treated as African
Happiness$Continent[is.na(Happiness$Continent)] <- "Africa"
# Move the position of the continent column in the dataset to the second column
Happiness <- Happiness %>% select(Country, Continent, everything())
# Changing Continent column to factor
Happiness$Continent <- as.factor(Happiness$Continent)
str(Happiness)
## 'data.frame': 158 obs. of 11 variables:
## $ Country : chr "Switzerland" "Iceland" "Denmark" "Norway" ...
## $ Continent : Factor w/ 5 levels "Africa","America",..: 5 5 5 5 2 5 5 5 4 4 ...
## $ Happiness.Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Happiness.Score : num 7.59 7.56 7.53 7.52 7.43 ...
## $ Economy : num 1.4 1.3 1.33 1.46 1.33 ...
## $ Family : num 1.35 1.4 1.36 1.33 1.32 ...
## $ Life.Expectancy : num 0.941 0.948 0.875 0.885 0.906 ...
## $ Freedom : num 0.666 0.629 0.649 0.67 0.633 ...
## $ Trust : num 0.42 0.141 0.484 0.365 0.33 ...
## $ Generosity : num 0.297 0.436 0.341 0.347 0.458 ...
## $ Dystopia.Residual: num 2.52 2.7 2.49 2.47 2.45 ...
Di bagian ini, kita akan menggunakan variabel yang berbeda untuk mengetahui bagaimana mereka saling berkorelasi.
kita lihat korelasi antara variabel numerik dalam dataset kami
# Correlation between variables
# Flag the numeric columns, then compute and draw their correlation matrix
Num.cols <- vapply(Happiness, is.numeric, logical(1))
Cor.data <- Happiness[Num.cols] %>% cor()
corrplot(Cor.data, method = "color")
Jelas, ada korelasi terbalik antara “Happiness Rank” dan yang lainnya variabel numerik. Dengan kata lain, semakin rendah happiness rank, semakin tinggi happiness score, dan semakin tinggi tujuh faktor lain yang berkontribusi terhadap kebahagiaan. Mari kita hilangkan happiness rank, dan lihat kembali korelasinya.
# Correlation matrix of columns 4 to 11 (happiness score and the seven
# factors, leaving out the rank), drawn with the coefficients as numbers
newdatacor <- Happiness[, 4:11] %>% cor()
corrplot(newdatacor, method = "number")
sesuai dengan cor plot, Economy, life expectancy, dan family mendapatkan hasil yang paling signifikan dalam berkontribusi pada data happiness. Trust dan generosity memiliki dampak terendah pada happiness score.
kita lihat korelasi antara skor kebahagiaan dan tujuh faktor lainnya dalam berbagai benua dengan membuat plot pencar.
# Happiness score against life expectancy: one panel per continent, each with
# a linear fit; rows for the "Australia" continent are excluded
Happiness %>%
  dplyr::filter(Continent != "Australia") %>%
  ggplot(mapping = aes(x = Life.Expectancy, y = Happiness.Score)) +
  geom_point(mapping = aes(colour = Continent), size = 3, alpha = 0.8) +
  geom_smooth(
    mapping = aes(colour = Continent, fill = Continent),
    method = "lm", fullrange = TRUE
  ) +
  facet_wrap(vars(Continent)) +
  theme_bw() +
  ggtitle("Scatter plot with regression line")
Korelasi antara “Life.Expectancy” dan “Happiness.Score” di Eropa, Amerika, dan Asia lebih signifikan dibandingkan dengan benua lain. Kami tidak memperhitungkan Australia karena hanya ada dua negara di Australia, sehingga membuat plot sebar dengan garis regresi untuk benua tersebut tidak akan memberikan hasil yang bagus.
# Happiness score against the economy factor: one panel per continent, each
# with a linear fit; rows for the "Australia" continent are excluded
Happiness %>%
  dplyr::filter(Continent != "Australia") %>%
  ggplot(mapping = aes(x = Economy, y = Happiness.Score)) +
  geom_point(mapping = aes(colour = Continent), size = 3, alpha = 0.8) +
  geom_smooth(
    mapping = aes(colour = Continent, fill = Continent),
    method = "lm", fullrange = TRUE
  ) +
  facet_wrap(vars(Continent)) +
  theme_bw() +
  ggtitle("Scatter plot with regression line")
Kita dapat melihat hasil yang hampir sama di sini untuk korelasi antara Happiness.Score dan Economy. Afrika memiliki hubungan terendah.
# Happiness score against the freedom factor: one panel per continent, each
# with a linear fit; rows for the "Australia" continent are excluded
Happiness %>%
  dplyr::filter(Continent != "Australia") %>%
  ggplot(mapping = aes(x = Freedom, y = Happiness.Score)) +
  geom_point(mapping = aes(colour = Continent), size = 3, alpha = 0.8) +
  geom_smooth(
    mapping = aes(colour = Continent, fill = Continent),
    method = "lm", fullrange = TRUE
  ) +
  facet_wrap(vars(Continent)) +
  theme_bw() +
  ggtitle("Scatter plot with regression line")
Freedom di Eropa dan Amerika lebih berkorelasi dengan Happiness.Score daripada di benua lain.
# Happiness score against the trust factor: one panel per continent, each
# with a linear fit; rows for the "Australia" continent are excluded
Happiness %>%
  dplyr::filter(Continent != "Australia") %>%
  ggplot(mapping = aes(x = Trust, y = Happiness.Score)) +
  geom_point(mapping = aes(colour = Continent), size = 3, alpha = 0.8) +
  geom_smooth(
    mapping = aes(colour = Continent, fill = Continent),
    method = "lm", fullrange = TRUE
  ) +
  facet_wrap(vars(Continent)) +
  theme_bw() +
  ggtitle("Scatter plot with regression line")
Tidak ada korelasi antara skor kepercayaan dan kebahagiaan di Afrika.
# Happiness score against the generosity factor: one panel per continent, each
# with a linear fit; rows for the "Australia" continent are excluded
Happiness %>%
  dplyr::filter(Continent != "Australia") %>%
  ggplot(mapping = aes(x = Generosity, y = Happiness.Score)) +
  geom_point(mapping = aes(colour = Continent), size = 3, alpha = 0.8) +
  geom_smooth(
    mapping = aes(colour = Continent, fill = Continent),
    method = "lm", fullrange = TRUE
  ) +
  facet_wrap(vars(Continent)) +
  theme_bw() +
  ggtitle("Scatter plot with regression line")
Garis regresi memiliki kemiringan positif hanya untuk Eropa. Untuk Afrika, Asia, dan Amerika garisnya hampir horizontal.
# Happiness score against the family factor: one panel per continent, each
# with a linear fit; rows for the "Australia" continent are excluded
Happiness %>%
  dplyr::filter(Continent != "Australia") %>%
  ggplot(mapping = aes(x = Family, y = Happiness.Score)) +
  geom_point(mapping = aes(colour = Continent), size = 3, alpha = 0.8) +
  geom_smooth(
    mapping = aes(colour = Continent, fill = Continent),
    method = "lm", fullrange = TRUE
  ) +
  facet_wrap(vars(Continent)) +
  theme_bw() +
  ggtitle("Scatter plot with regression line")
korelasi keluarga di Eropa dan Amerika dengan skor kebahagiaan lebih signifikan dari pada benua lain.
# Happiness score against the dystopia residual: one panel per continent, each
# with a linear fit; rows for the "Australia" continent are excluded
Happiness %>%
  dplyr::filter(Continent != "Australia") %>%
  ggplot(mapping = aes(x = Dystopia.Residual, y = Happiness.Score)) +
  geom_point(mapping = aes(colour = Continent), size = 3, alpha = 0.8) +
  geom_smooth(
    mapping = aes(colour = Continent, fill = Continent),
    method = "lm", fullrange = TRUE
  ) +
  facet_wrap(vars(Continent)) +
  theme_bw() +
  ggtitle("Scatter plot with regression line")
Semua benua hampir sama
Di bagian Prediksi, kami menerapkan beberapa modeling untuk memprediksi Happiness.Score. Pertama, kita harus membagi dataset kita menjadi training dan test set.
Variabel terikat kami adalah Happiness.Score, dan variabel bebasnya adalah family, economy, life expectancy, trust, freedom, generosity, and dystopia residual.
# Split the modelling columns into a training set and a test set using a
# split ratio of 0.8; the fixed seed makes the split reproducible
set.seed(123)
dataset <- Happiness[, 4:11]
split <- sample.split(dataset$Happiness.Score, SplitRatio = 0.8)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
# Multiple linear regression of the happiness score on all other columns,
# fitted on the training set only
regressor_lm <- lm(
  formula = Happiness.Score ~ .,
  data = training_set
)
summary(regressor_lm)
##
## Call:
## lm(formula = Happiness.Score ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.616e-04 -2.271e-04 -4.950e-06 2.265e-04 5.395e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.847e-05 1.365e-04 0.502 0.617
## Economy 1.000e+00 1.158e-04 8632.575 <2e-16 ***
## Family 9.999e-01 1.209e-04 8271.246 <2e-16 ***
## Life.Expectancy 9.998e-01 1.689e-04 5921.067 <2e-16 ***
## Freedom 9.998e-01 2.169e-04 4608.742 <2e-16 ***
## Trust 1.000e+00 2.482e-04 4029.080 <2e-16 ***
## Generosity 1.000e+00 2.139e-04 4674.402 <2e-16 ***
## Dystopia.Residual 1.000e+00 4.496e-05 22243.855 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002716 on 118 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.116e+08 on 7 and 118 DF, p-value: < 2.2e-16
Hasil di atas menunjukkan bahwa semua variabel independen memiliki pengaruh yang signifikan dan adjusted R-squared bernilai 1. Hal ini terjadi karena terdapat hubungan linier yang tepat antara variabel dependen dan variabel independen: penjumlahan seluruh variabel independen sama dengan variabel dependen, yaitu skor kebahagiaan. Akibatnya, Regresi Linier Berganda akan memprediksi skor kebahagiaan dengan akurasi mendekati 100%.
Model Evaluation
# Predict the test set with the linear model and plot predicted against
# actual scores; geom_abline() adds the y = x reference line
y_pred_lm <- predict(regressor_lm, newdata = test_set)
Pred_Actual_lm <- data.frame(
  Prediction = y_pred_lm,
  Actual = test_set$Happiness.Score
)
gg.lm <- ggplot(data = Pred_Actual_lm, mapping = aes(x = Actual, y = Prediction)) +
  geom_point() +
  geom_abline() +
  labs(
    title = "Multiple Linear Regression",
    x = "Actual happiness score",
    y = "Predicted happiness score"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(family = "Helvetica", face = "bold", size = 15),
    axis.title = element_text(family = "Helvetica", size = 10)
  )
gg.lm
Seperti yang terlihat, plot skor kebahagiaan aktual terhadap skor kebahagiaan prediksi menunjukkan keakuratan model Multiple Linear Regression.
# Fitting SVR (epsilon-regression, radial kernel) to the training set.
# The model is fitted on training_set only: fitting on the full dataset would
# let it see the rows of test_set, and the test-set evaluation would then
# overstate its accuracy.
library(e1071)
regressor_svr <- svm(
  formula = Happiness.Score ~ .,
  data = training_set,
  type = "eps-regression",
  kernel = "radial"
)
Model Evaluation
# Predict the test set with the SVR model and pair predictions with actuals
y_pred_svr <- predict(regressor_svr, newdata = test_set)
Pred_Actual_svr <- data.frame(
  Prediction = y_pred_svr,
  Actual = test_set$Happiness.Score
)
# Side-by-side matrix of the linear-model and SVR predictions with the actuals
Pred_Actual_lm.versus.svr <- cbind(
  Prediction.lm = y_pred_lm,
  Prediction.svr = y_pred_svr,
  Actual = test_set$Happiness.Score
)
# Predicted against actual scores; geom_abline() adds the y = x reference line
gg.svr <- ggplot(data = Pred_Actual_svr, mapping = aes(x = Actual, y = Prediction)) +
  geom_point() +
  geom_abline() +
  labs(
    title = "SVR",
    x = "Actual happiness score",
    y = "Predicted happiness score"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(family = "Helvetica", face = "bold", size = 15),
    axis.title = element_text(family = "Helvetica", size = 10)
  )
gg.svr
Berdasarkan plot aktual terhadap prediksi, Support Vector Regression memprediksi skor kebahagiaan dengan akurasi yang cukup tinggi.
# Fitting a decision tree regression to the training set.
# The tree is fitted on training_set only: fitting on the full dataset would
# let it see the rows of test_set, and the test-set evaluation would then
# overstate its accuracy.
library(rpart)
regressor_dt <- rpart(
  formula = Happiness.Score ~ .,
  data = training_set,
  control = rpart.control(minsplit = 10)
)
Model Evaluation
# Predict the test set with the decision tree and plot predicted against
# actual scores; geom_abline() adds the y = x reference line
y_pred_dt <- predict(regressor_dt, newdata = test_set)
Pred_Actual_dt <- data.frame(
  Prediction = y_pred_dt,
  Actual = test_set$Happiness.Score
)
gg.dt <- ggplot(data = Pred_Actual_dt, mapping = aes(x = Actual, y = Prediction)) +
  geom_point() +
  geom_abline() +
  labs(
    title = "Decision Tree Regression",
    x = "Actual happiness score",
    y = "Predicted happiness score"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(family = "Helvetica", face = "bold", size = 15),
    axis.title = element_text(family = "Helvetica", size = 10)
  )
gg.dt
Berdasarkan plot aktual terhadap prediksi, Regresi Pohon Keputusan (Decision Tree) bukanlah pilihan yang tepat untuk data Happiness.
# Draw the fitted decision tree
library(rpart.plot)
prp(x = regressor_dt)
# Fitting a random forest regression (500 trees) to the training set.
# The forest is fitted on training_set only: fitting on the full dataset would
# let it see the rows of test_set, and the test-set evaluation would then
# overstate its accuracy. The seed makes the fitted forest reproducible.
library(randomForest)
set.seed(1234)
regressor_rf <- randomForest(
  # predictors: every column except the response
  x = training_set[, names(training_set) != "Happiness.Score"],
  y = training_set$Happiness.Score,
  ntree = 500
)
Model Evaluation
# Predict the test set with the random forest and plot predicted against
# actual scores; geom_abline() adds the y = x reference line
y_pred_rf <- predict(regressor_rf, newdata = test_set)
Pred_Actual_rf <- data.frame(
  Prediction = y_pred_rf,
  Actual = test_set$Happiness.Score
)
gg.rf <- ggplot(data = Pred_Actual_rf, mapping = aes(x = Actual, y = Prediction)) +
  geom_point() +
  geom_abline() +
  labs(
    title = "Random Forest Regression",
    x = "Actual happiness score",
    y = "Predicted happiness score"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(family = "Helvetica", face = "bold", size = 15),
    axis.title = element_text(family = "Helvetica", size = 10)
  )
gg.rf
Berdasarkan plot aktual terhadap prediksi, hasil Regresi Random Forest tidak sebagus SVR (Support Vector Regression) dalam memprediksi skor kebahagiaan, tetapi lebih baik daripada Decision Tree.
# Show the four prediction plots together in a 2 x 2 grid (four plots fill
# two rows; a third row would only add empty space)
ggarrange(gg.lm, gg.svr, gg.dt, gg.rf, ncol = 2, nrow = 2)
Berdasarkan hasil percobaan kami, kami dapatkan bahwa Multiple Linear Regression merupakan algoritme terbaik untuk memprediksi data Happiness. SVR dan Random Forest menempati posisi kedua dalam hal akurasi prediksi. Terakhir, Decision Tree adalah algoritme terburuk untuk memprediksi skor kebahagiaan.