1 Introduction

1.1 objective

Our main goal is to do an exploratory analysis of the factors that make people happy.

2 Data preprocessing

2.1 Dataset

Abalone Physical Measurements Source: https://www.kaggle.com/rodolfomendes/abalone-dataset

2.2 Load Libraries

#install.packages('tidyverse')
#install.packages('skimr')
#install.packages('ggpubr')
#install.packages("caret")
#install.packages("Metrics")
#install.packages("e1071")

library(Metrics)
library(caret)
library(readr)
library(readxl)
library(dplyr)
library(ggplot2)
library(skimr)
library(tidyr)
library(reshape2)
library(ggpubr)
library(stringr)
library(e1071)
library(pROC)

2.3 Load the data

# df = read_excel("World Hapiness Data (2015-2019).xlsx", sheet = "Sheet1")
happy15_df = read.csv ("data/2015.csv")
happy16_df = read.csv ("data/2016.csv")
happy17_df = read.csv ("data/2017.csv")
happy18_df = read.csv ("data/2018.csv")
happy19_df = read.csv ("data/2019.csv")
head(happy19_df)
##   Overall.rank Country.or.region Score GDP.per.capita Social.support
## 1            1           Finland 7.769          1.340          1.587
## 2            2           Denmark 7.600          1.383          1.573
## 3            3            Norway 7.554          1.488          1.582
## 4            4           Iceland 7.494          1.380          1.624
## 5            5       Netherlands 7.488          1.396          1.522
## 6            6       Switzerland 7.480          1.452          1.526
##   Healthy.life.expectancy Freedom.to.make.life.choices Generosity
## 1                   0.986                        0.596      0.153
## 2                   0.996                        0.592      0.252
## 3                   1.028                        0.603      0.271
## 4                   1.026                        0.591      0.354
## 5                   0.999                        0.557      0.322
## 6                   1.052                        0.572      0.263
##   Perceptions.of.corruption
## 1                     0.393
## 2                     0.410
## 3                     0.341
## 4                     0.118
## 5                     0.298
## 6                     0.343

We have a look at the last one (head is by default with the first 5 rows)

2.4 Explore Dataset

2.4.0.1 Change year 2018 datasets column names base year 2017 datasets

happy18_df=plyr::rename(happy18_df, replace = c( "Country.or.region"="Country", 
                                  "Overall.rank"="Happiness.Rank" ,
                                  "GDP.per.capita"="Economy..GDP.per.Capita.",
                                  "Healthy.life.expectancy"="Health..Life.Expectancy.",
                                  "Freedom.to.make.life.choices"="Freedom",
                                  "Perceptions.of.corruption"="Trust..Government.Corruption.",
                                  "Social.support"="Family",
                                  "Score"="Happiness.Score"))


colnames(happy18_df)
## [1] "Happiness.Rank"                "Country"                      
## [3] "Happiness.Score"               "Economy..GDP.per.Capita."     
## [5] "Family"                        "Health..Life.Expectancy."     
## [7] "Freedom"                       "Generosity"                   
## [9] "Trust..Government.Corruption."

2.4.0.2 Change year 2019 datasets column names base year 2017 datasets

happy19_df=plyr::rename(happy19_df, replace = c( "Country.or.region"="Country", 
                                  "Overall.rank"="Happiness.Rank" ,
                                  "GDP.per.capita"="Economy..GDP.per.Capita.",
                                  "Healthy.life.expectancy"="Health..Life.Expectancy.",
                                  "Freedom.to.make.life.choices"="Freedom",
                                  "Perceptions.of.corruption"="Trust..Government.Corruption.",
                                  "Social.support"="Family",
                                  "Score"="Happiness.Score"))

colnames(happy19_df)
## [1] "Happiness.Rank"                "Country"                      
## [3] "Happiness.Score"               "Economy..GDP.per.Capita."     
## [5] "Family"                        "Health..Life.Expectancy."     
## [7] "Freedom"                       "Generosity"                   
## [9] "Trust..Government.Corruption."

2.4.0.3 Change year 2015 datasets column names base year 2017 datasets

happy15_df=plyr::rename(happy15_df, replace = c( "Happiness Rank" = "Happiness.Rank", 
                                  "Happiness Score" = "Happiness.Score",
                                  "Economy (GDP per Capita)" = "Economy..GDP.per.Capita.",
                                  "Health (Life Expectancy)" = "Health..Life.Expectancy.",
                                  "Trust (Government Corruption)"  = "Trust..Government.Corruption.",
                                  "Dystopia Residual"="Dystopia.Residual"
                                  ))

colnames(happy15_df)
##  [1] "Country"                       "Region"                       
##  [3] "Happiness.Rank"                "Happiness.Score"              
##  [5] "Standard.Error"                "Economy..GDP.per.Capita."     
##  [7] "Family"                        "Health..Life.Expectancy."     
##  [9] "Freedom"                       "Trust..Government.Corruption."
## [11] "Generosity"                    "Dystopia.Residual"

2.4.0.4 Change year 2016 datasets column names base year 2017 datasets

happy16_df=plyr::rename(happy16_df, replace = c( "Happiness Rank" = "Happiness.Rank", 
                                  "Happiness Score" = "Happiness.Score",
                                  "Economy (GDP per Capita)" = "Economy..GDP.per.Capita.",
                                  "Health (Life Expectancy)" = "Health..Life.Expectancy.",
                                  "Trust (Government Corruption)"  = "Trust..Government.Corruption.",
                                  "Dystopia Residual"="Dystopia.Residual"
                                  ))

colnames(happy16_df)
##  [1] "Country"                       "Region"                       
##  [3] "Happiness.Rank"                "Happiness.Score"              
##  [5] "Lower.Confidence.Interval"     "Upper.Confidence.Interval"    
##  [7] "Economy..GDP.per.Capita."      "Family"                       
##  [9] "Health..Life.Expectancy."      "Freedom"                      
## [11] "Trust..Government.Corruption." "Generosity"                   
## [13] "Dystopia.Residual"

2.4.0.5 insert year column at first position (index 0)

#2015

happy15_df<-cbind(Year=2015,happy15_df)

happy16_df<-cbind(Year=2016,happy16_df)

happy17_df<-cbind(Year=2017,happy17_df)

happy18_df<-cbind(Year=2018,happy18_df)

happy19_df<-cbind(Year=2019,happy19_df)

2.5 change column type

happy18_df$Trust..Government.Corruption. = as.numeric(happy18_df$Trust..Government.Corruption.)

str(happy18_df)
## 'data.frame':    156 obs. of  10 variables:
##  $ Year                         : num  2018 2018 2018 2018 2018 ...
##  $ Happiness.Rank               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country                      : chr  "Finland" "Norway" "Denmark" "Iceland" ...
##  $ Happiness.Score              : num  7.63 7.59 7.55 7.5 7.49 ...
##  $ Economy..GDP.per.Capita.     : num  1.3 1.46 1.35 1.34 1.42 ...
##  $ Family                       : num  1.59 1.58 1.59 1.64 1.55 ...
##  $ Health..Life.Expectancy.     : num  0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
##  $ Freedom                      : num  0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
##  $ Generosity                   : num  0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
##  $ Trust..Government.Corruption.: num  0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...

2.6 merge data from 2015-2019

happy15_16<-dplyr::bind_rows(happy15_df,happy16_df)

happy15_16_17<-dplyr::bind_rows(happy15_16,happy17_df)


happy18_19<-dplyr::bind_rows(happy18_df,happy19_df)


df<-dplyr::bind_rows(happy18_19,happy15_16_17)

head(df)
##   Year Happiness.Rank     Country Happiness.Score Economy..GDP.per.Capita.
## 1 2018              1     Finland           7.632                    1.305
## 2 2018              2      Norway           7.594                    1.456
## 3 2018              3     Denmark           7.555                    1.351
## 4 2018              4     Iceland           7.495                    1.343
## 5 2018              5 Switzerland           7.487                    1.420
## 6 2018              6 Netherlands           7.441                    1.361
##   Family Health..Life.Expectancy. Freedom Generosity
## 1  1.592                    0.874   0.681      0.202
## 2  1.582                    0.861   0.686      0.286
## 3  1.590                    0.868   0.683      0.284
## 4  1.644                    0.914   0.677      0.353
## 5  1.549                    0.927   0.660      0.256
## 6  1.488                    0.878   0.638      0.333
##   Trust..Government.Corruption. Region Standard.Error Dystopia.Residual
## 1                         0.393   <NA>             NA                NA
## 2                         0.340   <NA>             NA                NA
## 3                         0.408   <NA>             NA                NA
## 4                         0.138   <NA>             NA                NA
## 5                         0.357   <NA>             NA                NA
## 6                         0.295   <NA>             NA                NA
##   Lower.Confidence.Interval Upper.Confidence.Interval Whisker.high Whisker.low
## 1                        NA                        NA           NA          NA
## 2                        NA                        NA           NA          NA
## 3                        NA                        NA           NA          NA
## 4                        NA                        NA           NA          NA
## 5                        NA                        NA           NA          NA
## 6                        NA                        NA           NA          NA

2.7 change column data type

df$Happiness.Rank  = as.numeric(df$Happiness.Rank )


str(df)
## 'data.frame':    782 obs. of  17 variables:
##  $ Year                         : num  2018 2018 2018 2018 2018 ...
##  $ Happiness.Rank               : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country                      : chr  "Finland" "Norway" "Denmark" "Iceland" ...
##  $ Happiness.Score              : num  7.63 7.59 7.55 7.5 7.49 ...
##  $ Economy..GDP.per.Capita.     : num  1.3 1.46 1.35 1.34 1.42 ...
##  $ Family                       : num  1.59 1.58 1.59 1.64 1.55 ...
##  $ Health..Life.Expectancy.     : num  0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
##  $ Freedom                      : num  0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
##  $ Generosity                   : num  0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
##  $ Trust..Government.Corruption.: num  0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
##  $ Region                       : chr  NA NA NA NA ...
##  $ Standard.Error               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Dystopia.Residual            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Lower.Confidence.Interval    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Upper.Confidence.Interval    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Whisker.high                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Whisker.low                  : num  NA NA NA NA NA NA NA NA NA NA ...

2.8 Remove unnessesary columns

colSums(is.na(df))
##                          Year                Happiness.Rank 
##                             0                             0 
##                       Country               Happiness.Score 
##                             0                             0 
##      Economy..GDP.per.Capita.                        Family 
##                             0                             0 
##      Health..Life.Expectancy.                       Freedom 
##                             0                             0 
##                    Generosity Trust..Government.Corruption. 
##                             0                             1 
##                        Region                Standard.Error 
##                           467                           624 
##             Dystopia.Residual     Lower.Confidence.Interval 
##                           312                           625 
##     Upper.Confidence.Interval                  Whisker.high 
##                           625                           627 
##                   Whisker.low 
##                           627
df = subset(df, select = -c(Lower.Confidence.Interval,Upper.Confidence.Interval,Dystopia.Residual,Standard.Error,Whisker.high,Whisker.low))

head(df)
##   Year Happiness.Rank     Country Happiness.Score Economy..GDP.per.Capita.
## 1 2018              1     Finland           7.632                    1.305
## 2 2018              2      Norway           7.594                    1.456
## 3 2018              3     Denmark           7.555                    1.351
## 4 2018              4     Iceland           7.495                    1.343
## 5 2018              5 Switzerland           7.487                    1.420
## 6 2018              6 Netherlands           7.441                    1.361
##   Family Health..Life.Expectancy. Freedom Generosity
## 1  1.592                    0.874   0.681      0.202
## 2  1.582                    0.861   0.686      0.286
## 3  1.590                    0.868   0.683      0.284
## 4  1.644                    0.914   0.677      0.353
## 5  1.549                    0.927   0.660      0.256
## 6  1.488                    0.878   0.638      0.333
##   Trust..Government.Corruption. Region
## 1                         0.393   <NA>
## 2                         0.340   <NA>
## 3                         0.408   <NA>
## 4                         0.138   <NA>
## 5                         0.357   <NA>
## 6                         0.295   <NA>

2.9 Impute with mean or median values for numerical columns.

df$Trust..Government.Corruption.[is.na(df$Trust..Government.Corruption.)] <- median(df$Trust..Government.Corruption., na.rm = T)

colSums(is.na(df))
##                          Year                Happiness.Rank 
##                             0                             0 
##                       Country               Happiness.Score 
##                             0                             0 
##      Economy..GDP.per.Capita.                        Family 
##                             0                             0 
##      Health..Life.Expectancy.                       Freedom 
##                             0                             0 
##                    Generosity Trust..Government.Corruption. 
##                             0                             0 
##                        Region 
##                           467

2.10 Filter uncommon data in Country Column

Due to the data is describing the happiness score and relative factors for countries accross different years. So, it is important to view the uniformity of the data in Year column of the data.

Country and Region counts groupby Year

aggregate(df$Country, by=list(df$Year), FUN=length)
##   Group.1   x
## 1    2015 158
## 2    2016 157
## 3    2017 155
## 4    2018 156
## 5    2019 156

From the table shown as above, the number of countries involved in this dataset for different year is differnt. Therefore, it is neccessary to make an intersection of them to get the most common country list.

Country_2015 = subset(df, Year == 2015)$Country
Country_2016 = subset(df, Year == 2016)$Country
Country_2017 = subset(df, Year == 2017)$Country
Country_2018 = subset(df, Year == 2018)$Country
Country_2019 = subset(df, Year == 2019)$Country
common_country =intersect(intersect(intersect(intersect(Country_2015, Country_2016),Country_2017),Country_2018),Country_2019)
length(common_country)
## [1] 141

Therefore, there are 142 countries’ data existing accross from 2015-2019 in this dataset.Then we need to filter the orginial dataset by this common_country list.

df1 = subset(df,Country %in% common_country)
print(paste("The amount of rows in the dataset is: ",dim(df1)[1]))
print(paste("The amount of columns in the dataset is: ",dim(df1)[2]))
## [1] "The amount of rows in the dataset is:  705"
## [1] "The amount of columns in the dataset is:  11"
str(df1)
## 'data.frame':    705 obs. of  11 variables:
##  $ Year                         : num  2018 2018 2018 2018 2018 ...
##  $ Happiness.Rank               : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country                      : chr  "Finland" "Norway" "Denmark" "Iceland" ...
##  $ Happiness.Score              : num  7.63 7.59 7.55 7.5 7.49 ...
##  $ Economy..GDP.per.Capita.     : num  1.3 1.46 1.35 1.34 1.42 ...
##  $ Family                       : num  1.59 1.58 1.59 1.64 1.55 ...
##  $ Health..Life.Expectancy.     : num  0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
##  $ Freedom                      : num  0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
##  $ Generosity                   : num  0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
##  $ Trust..Government.Corruption.: num  0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
##  $ Region                       : chr  NA NA NA NA ...

2.11 Fill value for categorical columns

Create a new dataset for storing common region and country

common_region <- unique(subset(df1, Region!="NA", c(Country, Region)))

head(common_country)
## [1] "Switzerland" "Iceland"     "Denmark"     "Norway"      "Canada"     
## [6] "Finland"

Fill relate region to missing value of region column

assign_region <- function(x){
  
  Region <- common_region$Region[common_region$Country == x]
  
}


for(country in common_country)

      df1$Region[df1$Country == country] <- assign_region(country)

2.12 Save cleaned dataset

# write_csv(df1, path = "World Hapiness Data (2015-2019)_cleaned.csv")

2.12.1 briefly statistic view the data

skimr::skim_without_charts(df1)
Data summary
Name df1
Number of rows 705
Number of columns 11
_______________________
Column type frequency:
character 2
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Country 0 1 4 23 0 141 0
Region 0 1 12 31 0 10 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Year 0 1 2017.00 1.42 2015.00 2016.00 2017.00 2018.00 2019.00
Happiness.Rank 0 1 76.85 45.28 1.00 37.00 77.00 116.00 158.00
Happiness.Score 0 1 5.43 1.13 2.84 4.52 5.39 6.29 7.77
Economy..GDP.per.Capita. 0 1 0.93 0.40 0.00 0.64 1.00 1.24 2.10
Family 0 1 1.09 0.32 0.00 0.88 1.14 1.35 1.64
Health..Life.Expectancy. 0 1 0.63 0.23 0.00 0.49 0.66 0.81 1.14
Freedom 0 1 0.41 0.15 0.00 0.31 0.43 0.53 0.72
Generosity 0 1 0.22 0.13 0.00 0.13 0.20 0.28 0.84
Trust..Government.Corruption. 0 1 0.12 0.11 0.00 0.05 0.09 0.15 0.55
print(paste("The amount of rows in the dataset is: ",dim(df)[1]))
print(paste("The amount of columns in the dataset is: ",dim(df)[2]))
print(paste("the column names in this dataset are:", paste(shQuote(colnames(df)), collapse=", ")))
## [1] "The amount of rows in the dataset is:  782"
## [1] "The amount of columns in the dataset is:  11"
## [1] "the column names in this dataset are: \"Year\", \"Happiness.Rank\", \"Country\", \"Happiness.Score\", \"Economy..GDP.per.Capita.\", \"Family\", \"Health..Life.Expectancy.\", \"Freedom\", \"Generosity\", \"Trust..Government.Corruption.\", \"Region\""

3 Exploratary Data Analysis

The 10 happiest countries in 2015

df1 %>%
  filter(Year == 2015) %>%
  arrange(-Happiness.Score) %>%
  slice_head(n=10) %>%
  ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
  geom_point(colour = "red", size = 3) +
  theme(text=element_text(size=10)) + 
  coord_flip() +
  labs(title = "The 10 happiest countries in 2015", x = "")

The 10 happiest countries in 2016

df1 %>%
  filter(Year == 2016) %>%
  arrange(-Happiness.Score) %>%
  slice_head(n=10) %>%
  ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
  geom_point(colour = "red", size = 3) +
  theme(text=element_text(size=10)) + 
  coord_flip() +
  labs(title = "The 10 happiest countries in 2016", x = "")

The 10 happiest countries in 2017

df1 %>%
  filter(Year == 2017) %>%
  arrange(-Happiness.Score) %>%
  slice_head(n=10) %>%
  ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
  geom_point(colour = "red", size = 3) +
  theme(text=element_text(size=10)) + 
  coord_flip() +
  labs(title = "The 10 happiest countries in 2017", x = "")

The 10 happiest countries in 2018

df1 %>%
  filter(Year == 2018) %>%
  arrange(-Happiness.Score) %>%
  slice_head(n=10) %>%
  ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
  geom_point(colour = "red", size = 3) +
  theme(text=element_text(size=10)) + 
  coord_flip() +
  labs(title = "The 10 happiest countries in 2018", x = "")

The 10 happiest countries in 2019

df1 %>%
  filter(Year == 2019) %>%
  arrange(-Happiness.Score) %>%
  slice_head(n=10) %>%
  ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
  geom_point(colour = "red", size = 3) +
  theme(text=element_text(size=10)) + 
  coord_flip() +
  labs(title = "The 10 happiest countries in 2019", x = "")

Region_Yearavg_HappyScore = data.frame(aggregate(df$Happiness.Score, by=list(df$Year,df$Region), FUN=mean))
colnames(Region_Yearavg_HappyScore)<-c("Year","Region","Mean_HappyScore")
Region_Yearavg_HappyScore
##    Year                          Region Mean_HappyScore
## 1  2015       Australia and New Zealand        7.285000
## 2  2016       Australia and New Zealand        7.323500
## 3  2015      Central and Eastern Europe        5.332931
## 4  2016      Central and Eastern Europe        5.370690
## 5  2015                    Eastern Asia        5.626167
## 6  2016                    Eastern Asia        5.624167
## 7  2015     Latin America and Caribbean        6.144682
## 8  2016     Latin America and Caribbean        6.101750
## 9  2015 Middle East and Northern Africa        5.406900
## 10 2016 Middle East and Northern Africa        5.386053
## 11 2015                   North America        7.273000
## 12 2016                   North America        7.254000
## 13 2015               Southeastern Asia        5.317444
## 14 2016               Southeastern Asia        5.338889
## 15 2015                   Southern Asia        4.580857
## 16 2016                   Southern Asia        4.563286
## 17 2015              Sub-Saharan Africa        4.202800
## 18 2016              Sub-Saharan Africa        4.136421
## 19 2015                  Western Europe        6.689619
## 20 2016                  Western Europe        6.685667
ggplot(Region_Yearavg_HappyScore,aes(x = Year,y = Mean_HappyScore,fill = Region))+
  geom_bar(stat = "identity",position = "dodge")

The top 3 happiness region are: Australia and New Zealand, North America and Western Europe

3.1 from 2015 to 2019, Mean Happiness score by regions:

df1 %>%
  group_by(Region) %>%
  summarise(mscore = mean(Happiness.Score)) %>%
  ggplot(aes(reorder(Region, mscore), mscore)) +
  geom_point() +
  theme_bw() +
  
  coord_flip() +
  labs(title = "Happiness Score by Regions",
       x = "", y = "Average happiness score")

3.2 from 2015 to 2019, Mean Happiness score by counntries:

df1 %>%
  group_by(Country) %>%
  summarise(mscore = mean(Happiness.Score)) %>%
  arrange(-mscore) %>%
  slice_head(n=10) %>%
  
  ggplot(aes(reorder(Country, mscore), mscore)) +
  geom_point() +
  theme_bw() +
  
  coord_flip() +
  labs(title = "Happiness Score by Country",
       x = "", y = "Average happiness score")

Top 10 Mean Happiness score by counntries trends by years

Top10_happy_country_DF = df1 %>%
  group_by(Country) %>%
  summarise(mscore = mean(Happiness.Score)) %>%
  arrange(-mscore) %>%
  slice_head(n=10)

Top10_happy_country_DF_list = c(Top10_happy_country_DF$Country)

df1_Top10_happy_country = subset(df1,Country %in% Top10_happy_country_DF_list)

ggplot(df1_Top10_happy_country,  aes(x = Year,y = Happiness.Score,color = Country))+  geom_line()

## The happiness socre of Finland is increasing dramatically from 2015-2019

3.2.1 The 10 most progressive countries from 2015 - 2017:

df1 %>%
  mutate(y = as.character(Year)) %>%
  select(y, Country, Region, Happiness.Score) %>%
  pivot_wider(names_from = y, values_from = Happiness.Score,
              names_prefix = "y_") %>%
  mutate(p = (y_2019 - y_2015)/y_2015 * 100) %>%
  arrange(-p) %>%
  slice_head(n = 10) %>%
  ggplot(aes(reorder(Country, p), p)) +
  geom_point() +
  theme_bw() +
  coord_flip() +
  labs(title = "The 10 most progressive countries from 2015 - 2019",
       y = "Percentage Increase of Happiness Score", x = "")

Top10_Progress_country_df = df1 %>%
  mutate(y = as.character(Year)) %>%
  select(y, Country, Region, Happiness.Score) %>%
  pivot_wider(names_from = y, values_from = Happiness.Score,
              names_prefix = "y_") %>%
  mutate(p = (y_2019 - y_2015)/y_2015 * 100) %>%
  arrange(-p) %>%
  slice_head(n = 10)

Top10_Progress_country_df_list = c(Top10_Progress_country_df$Country)

df1_Top10_Progress_country = subset(df1,Country %in% Top10_Progress_country_df_list)

ggplot(df1_Top10_Progress_country,  aes(x = Year,y = Happiness.Score,color = Country))+  geom_line()

head(df1)
##   Year Happiness.Rank     Country Happiness.Score Economy..GDP.per.Capita.
## 1 2018              1     Finland           7.632                    1.305
## 2 2018              2      Norway           7.594                    1.456
## 3 2018              3     Denmark           7.555                    1.351
## 4 2018              4     Iceland           7.495                    1.343
## 5 2018              5 Switzerland           7.487                    1.420
## 6 2018              6 Netherlands           7.441                    1.361
##   Family Health..Life.Expectancy. Freedom Generosity
## 1  1.592                    0.874   0.681      0.202
## 2  1.582                    0.861   0.686      0.286
## 3  1.590                    0.868   0.683      0.284
## 4  1.644                    0.914   0.677      0.353
## 5  1.549                    0.927   0.660      0.256
## 6  1.488                    0.878   0.638      0.333
##   Trust..Government.Corruption.         Region
## 1                         0.393 Western Europe
## 2                         0.340 Western Europe
## 3                         0.408 Western Europe
## 4                         0.138 Western Europe
## 5                         0.357 Western Europe
## 6                         0.295 Western Europe
df1 %>%
  summarise(gdp = mean(Economy..GDP.per.Capita.),
            family = mean(Family),
            life.expectancy = mean(Health..Life.Expectancy.),
            freedom = mean(Freedom),
            generosity = mean(Generosity),
            corruption = mean(Trust..Government.Corruption.)) %>%
  pivot_longer(c(gdp, family, life.expectancy,freedom,generosity, corruption),
               names_to = "f", values_to = "value") %>%
  ggplot(aes(reorder(f, value), value)) +
  geom_bar(stat = "identity", fill = "darkgreen", width = 0.55, alpha = 0.7) +
  geom_text(aes(label = paste0(round(value, 2)), vjust = -0.5)) +
  theme_bw() +
  labs(title = "The mean value of the factors" , y = "", x = "")

Happiness.Continent <- df1 %>%
                          select(-c(Year,Happiness.Rank))%>%
                          group_by(Region) %>%
                          summarise_at(vars(-Country), funs(mean(., na.rm=TRUE)))


Happiness.Continent.melt <- melt(Happiness.Continent)


# Faceting
ggplot(Happiness.Continent.melt, aes(y=value, x=Region, color=Region, fill=Region)) + 
  geom_bar( stat="identity") +    
  facet_wrap(~variable) + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Average value of happiness variables for different continents", 
       y = "Average value") 

####### Happiness score for each continent

gg1 <- ggplot(df1,
              aes(x=Region,
                  y=Happiness.Score,
                  color=Region))+
  geom_point() + theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

gg1

gg2 <- ggplot(df1 , aes(x = Region, y = Happiness.Score)) +
  geom_boxplot(aes(fill=Region)) + theme_bw() +
  theme(axis.text.x = element_text (angle = 90))

gg2

colnames(df1)
##  [1] "Year"                          "Happiness.Rank"               
##  [3] "Country"                       "Happiness.Score"              
##  [5] "Economy..GDP.per.Capita."      "Family"                       
##  [7] "Health..Life.Expectancy."      "Freedom"                      
##  [9] "Generosity"                    "Trust..Government.Corruption."
## [11] "Region"
ggline1 = ggplot(df1, aes(x = Economy..GDP.per.Capita., y = Happiness.Score)) + 
  geom_point(size = .5, alpha = 0.8) +  
  geom_smooth(method = "lm", fullrange = TRUE) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline1a = ggplot(df1, aes(x = Economy..GDP.per.Capita., y = Happiness.Score)) + 
  geom_point(aes(color=Region), size = .5, alpha = 0.8) +  
  geom_smooth(aes(color = Region, fill = Region), 
              method = "lm", fullrange = TRUE) +
  facet_wrap(~Region) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline1
ggline1a

ggline2 = ggplot(df1, aes(x = Family, y = Happiness.Score)) + 
  geom_point(size = .5, alpha = 0.8) +  
  geom_smooth(method = "lm", fullrange = TRUE) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline2a = ggplot(df1, aes(x = Family, y = Happiness.Score)) + 
  geom_point(aes(color=Region), size = .5, alpha = 0.8) +  
  geom_smooth(aes(color = Region, fill = Region), 
              method = "lm", fullrange = TRUE) +
  facet_wrap(~Region) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline2
ggline2a

ggline3 = ggplot(df1, aes(x = Health..Life.Expectancy., y = Happiness.Score)) + 
  geom_point(size = .5, alpha = 0.8) +  
  geom_smooth(method = "lm", fullrange = TRUE) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline3a = ggplot(df1, aes(x = Health..Life.Expectancy., y = Happiness.Score)) + 
  geom_point(aes(color=Region), size = .5, alpha = 0.8) +  
  geom_smooth(aes(color = Region, fill = Region), 
              method = "lm", fullrange = TRUE) +
  facet_wrap(~Region) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline3
ggline3a

ggline4 = ggplot(df1, aes(x =Freedom, y = Happiness.Score)) + 
  geom_point(size = .5, alpha = 0.8) +  
  geom_smooth(method = "lm", fullrange = TRUE) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline4a = ggplot(df1, aes(x =Freedom, y = Happiness.Score)) + 
  geom_point(aes(color=Region), size = .5, alpha = 0.8) +  
  geom_smooth(aes(color = Region, fill = Region), 
              method = "lm", fullrange = TRUE) +
  facet_wrap(~Region) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline4
ggline4a

ggline5 = ggplot(df1, aes(x = Trust..Government.Corruption., y = Happiness.Score)) + 
  geom_point(size = .5, alpha = 0.8) +  
  geom_smooth(method = "lm", fullrange = TRUE) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline5a = ggplot(df1, aes(x = Trust..Government.Corruption., y = Happiness.Score)) + 
  geom_point(aes(color=Region), size = .5, alpha = 0.8) +  
  geom_smooth(aes(color = Region, fill = Region), 
              method = "lm", fullrange = TRUE) +
  facet_wrap(~Region) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline5
ggline5a

ggline6 = ggplot(df1, aes(x = Generosity, y = Happiness.Score)) + 
  geom_point(size = .5, alpha = 0.8) +  
  geom_smooth(method = "lm", fullrange = TRUE) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline6a = ggplot(df1, aes(x = Generosity, y = Happiness.Score)) + 
  geom_point(aes(color=Region), size = .5, alpha = 0.8) +  
  geom_smooth(aes(color = Region, fill = Region), 
              method = "lm", fullrange = TRUE) +
  facet_wrap(~Region) +
  theme_bw() + labs(title = "Scatter plot with regression line")

ggline6
ggline6a

3.2.2 Correlation Matrix Heatmap

 library(corrplot)
 Num.cols <- sapply(df1, is.numeric)
 Cor.data <- cor(df1[, Num.cols])

 corrplot(Cor.data, method = 'color') 

library(GGally)

ggcorr(df1, label = TRUE, label_round = 2, label_size = 3.5, size = 2, hjust = .85) +
  ggtitle("Correlation Heatmap") +
  theme(plot.title = element_text(hjust = 0.5))

3.3 Drop columns based on heatmap Correlation

Based on the heatmap, we should drop Year,Country,Happiness.Rank,Region column

dataset= select(df1,-c("Year","Country","Happiness.Rank","Region"))
head(dataset)
##   Happiness.Score Economy..GDP.per.Capita. Family Health..Life.Expectancy.
## 1           7.632                    1.305  1.592                    0.874
## 2           7.594                    1.456  1.582                    0.861
## 3           7.555                    1.351  1.590                    0.868
## 4           7.495                    1.343  1.644                    0.914
## 5           7.487                    1.420  1.549                    0.927
## 6           7.441                    1.361  1.488                    0.878
##   Freedom Generosity Trust..Government.Corruption.
## 1   0.681      0.202                         0.393
## 2   0.686      0.286                         0.340
## 3   0.683      0.284                         0.408
## 4   0.677      0.353                         0.138
## 5   0.660      0.256                         0.357
## 6   0.638      0.333                         0.295

3.4 Categorize Happiness score into 3 level:

  • High, Mid, low
  • Add new column Happy.Level into dataset
rge_dif=round((max(dataset$Happiness.Score)-min(dataset$Happiness.Score))/3,3)

low=min(dataset$Happiness.Score)+rge_dif
mid=low+rge_dif


print(paste("range difference in happiness score: ",rge_dif))
print(paste('upper bound of Low grp',low))
print(paste('upper bound of Mid grp',mid))
print(paste('upper bound of High grp','max:',max(dataset$Happiness.Score)))
## [1] "range difference in happiness score:  1.643"
## [1] "upper bound of Low grp 4.482"
## [1] "upper bound of Mid grp 6.125"
## [1] "upper bound of High grp max: 7.769"

“Happy.Level” columns ⟶ transformed from “”Happiness.Score column.

dataset_level <- dataset %>%
  mutate(Happy.Level=case_when(
    Happiness.Score <=low  ~ "Low",
    Happiness.Score>low & Happiness.Score <=mid ~ "Mid",
    Happiness.Score >mid ~ "High"
  ))  %>%
  mutate(Happy.Level=factor(Happy.Level, levels=c("High", "Mid", "Low"))) %>%
  select(-Happiness.Score)

4 Data Modelling by Regression Algorithm

4.1 Split into train set (80%) and test set (20%)

# Splitting the dataset into the Training set and Test set
set.seed(123) 
split=0.80
trainIndex <- createDataPartition(dataset$Happiness.Score, p=split, list=FALSE) 
data_train <- dataset[ trainIndex,] 
data_test <- dataset[-trainIndex,]

4.2 Multiple Linear Regression for Happyniess Score Prediction

4.2.1 Train Multiple Linear Regression model with data_train

# Fitting Multiple Linear Regression to the Training set
lm_model = lm(formula = Happiness.Score ~ .,
               data = data_train)

summary(lm_model)
## 
## Call:
## lm(formula = Happiness.Score ~ ., data = data_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.87619 -0.33193  0.00798  0.34507  1.43298 
## 
## Coefficients:
##                               Estimate Std. Error t value             Pr(>|t|)
## (Intercept)                    2.09629    0.09560  21.927 < 0.0000000000000002
## Economy..GDP.per.Capita.       1.14157    0.09825  11.619 < 0.0000000000000002
## Family                         0.64275    0.09459   6.795   0.0000000000277987
## Health..Life.Expectancy.       1.26063    0.16461   7.658   0.0000000000000837
## Freedom                        1.21029    0.20527   5.896   0.0000000064505937
## Generosity                     0.71311    0.19481   3.661             0.000276
## Trust..Government.Corruption.  0.96843    0.26401   3.668             0.000268
##                                  
## (Intercept)                   ***
## Economy..GDP.per.Capita.      ***
## Family                        ***
## Health..Life.Expectancy.      ***
## Freedom                       ***
## Generosity                    ***
## Trust..Government.Corruption. ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5422 on 558 degrees of freedom
## Multiple R-squared:  0.7722, Adjusted R-squared:  0.7697 
## F-statistic: 315.2 on 6 and 558 DF,  p-value: < 0.00000000000000022

An (adjusted) R2 that is close to 1 indicates that a large proportion of the variability in the outcome has been explained by the regression model.

A number near 0 indicates that the regression model did not explain much of the variability in the outcome.

Our adjusted R2 is 0.7697, which is good.

4.2.2 Predict datat_test happiness score by Multiple Linear Regression model

y_pred_lm = predict(lm_model, newdata = data_test)
Actual_lm = data_test$Happiness.Score

Pred_Actual_lm <- as.data.frame(cbind(Prediction = y_pred_lm, Actual = Actual_lm))



gg.lm <- ggplot(Pred_Actual_lm, aes(Actual, Prediction )) +
  geom_point() + theme_bw() + geom_abline() +
  labs(title = "Multiple Linear Regression", x = "Actual happiness score",
       y = "Predicted happiness score") +
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)), 
        axis.title = element_text(family = "Helvetica", size = (10)))
gg.lm

data.frame(
  R2 = R2(y_pred_lm, data_test$Happiness.Score),
  RMSE = RMSE(y_pred_lm, data_test$Happiness.Score),
  MAE = MAE(y_pred_lm, data_test$Happiness.Score)
)
##          R2      RMSE       MAE
## 1 0.7643535 0.5478055 0.4256454

4.3 Support Vector Regression for Happyniess Score Prediction

4.3.1 Train SVR model data_train

library(e1071)

regressor_svr = svm(formula = Happiness.Score ~ .,
                data = data_train,
                type = 'eps-regression',
                kernel = 'radial')

4.3.2 Predict datat_test happiness score by SVR model

# Predicting a new result
y_pred_svr = predict(regressor_svr,  newdata = data_test)

Pred_Actual_svr <- as.data.frame(cbind(Prediction = y_pred_svr, Actual = data_test$Happiness.Score))


Pred_Actual_lm.versus.svr <- cbind(Prediction.lm = y_pred_lm, Prediction.svr = y_pred_svr, Actual = data_test$Happiness.Score)


gg.svr <- ggplot(Pred_Actual_svr, aes(Actual, Prediction )) +
  geom_point() + theme_bw() + geom_abline() +
  labs(title = "SVR", x = "Actual happiness score",
       y = "Predicted happiness score") +
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)), 
        axis.title = element_text(family = "Helvetica", size = (10)))
gg.svr

data.frame(
  R2 = R2(y_pred_svr, data_test$Happiness.Score),
  RMSE = RMSE(y_pred_svr, data_test$Happiness.Score),
  MAE = MAE(y_pred_svr, data_test$Happiness.Score)
)
##          R2      RMSE       MAE
## 1 0.8246303 0.4740831 0.3504708

4.4 Decision Tree Regression for Happyniess Score Prediction

4.4.1 rain Decision Tree Regressio model with data_train

library(rpart)
regressor_dt = rpart(formula = Happiness.Score ~ .,
                  data = data_train,
                  control = rpart.control(minsplit = 10))

4.4.2 Predict datat_test happiness score with Decision Tree Regression

# Predicting a new result with Decision Tree Regression
y_pred_dt = predict(regressor_dt, newdata = data_test)

Pred_Actual_dt <- as.data.frame(cbind(Prediction = y_pred_dt, Actual = data_test$Happiness.Score))


gg.dt <- ggplot(Pred_Actual_dt, aes(Actual, Prediction )) +
  geom_point() + theme_bw() + geom_abline() +
  labs(title = "Decision Tree Regression", x = "Actual happiness score",
       y = "Predicted happiness score") +
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)), 
        axis.title = element_text(family = "Helvetica", size = (10)))
gg.dt

#install.packages("rpart.plot")
library(rpart.plot)
prp(regressor_dt)

data.frame(
  R2 = R2(y_pred_dt, data_test$Happiness.Score),
  RMSE = RMSE(y_pred_dt, data_test$Happiness.Score),
  MAE = MAE(y_pred_dt, data_test$Happiness.Score)
)
##         R2      RMSE       MAE
## 1 0.682486 0.6362329 0.5223723

4.5 Random Forest Regression for Happyniess Score Prediction

4.5.1 Train Random Forest Regression model with data_train

library(randomForest)

x_train_rf<-select(dataset,-c("Happiness.Score"))

          
set.seed(1234)
regressor_rf = randomForest(x = x_train_rf,
                         y = dataset$Happiness.Score,
                         ntree = 500)

4.5.2 Predict datat_test happiness score with Random Forest Regression

# Predicting a new result with Random Forest Regression
y_pred_rf = predict(regressor_rf, newdata = data_test)

Pred_Actual_rf <- as.data.frame(cbind(Prediction = y_pred_rf, Actual = data_test$Happiness.Score))


gg.rf <- ggplot(Pred_Actual_rf, aes(Actual, Prediction )) +
  geom_point() + theme_bw() + geom_abline() +
  labs(title = "Random Forest Regression", x = "Actual happiness score",
       y = "Predicted happiness score") +
  theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)), 
        axis.title = element_text(family = "Helvetica", size = (10)))
gg.rf

data.frame(
  R2 = R2(y_pred_rf, data_test$Happiness.Score),
  RMSE = RMSE(y_pred_rf, data_test$Happiness.Score),
  MAE = MAE(y_pred_rf, data_test$Happiness.Score)
)
##          R2      RMSE       MAE
## 1 0.9692887 0.2104387 0.1561681

4.6 Model Evaluation

ggarrange(gg.lm, gg.svr, gg.dt, gg.rf, ncol = 2, nrow = 3)

5 Data Modelling Classification Algorithm

5.1 Feature Scaling

#defined a preprocess variable, which includes the two operations: center and scale

preProcess <- c("center","scale")

5.2 Split into train set (70%) and test set (30%)

# Splitting the dataset into the Training set and Test set
set.seed(123) 

split=0.70
trainIndex <- createDataPartition(dataset_level$Happy.Level, p=split, list=FALSE) 
data_train <- dataset_level[ trainIndex,] 
data_test <- dataset_level[-trainIndex,]

5.3 Cross validation

tc <- trainControl(method = "repeatedcv", number=10,#10-fold cross validation 
                   repeats = 10,classProbs = TRUE)

5.4 K-Neighbors Classifier for predicting Happy Level

5.4.1 Train K-Nearest Neighbours model with data_train

set.seed(123)
model_knn <- train(
  Happy.Level~., 
  data=data_train, 
  trControl=tc, 
  preProcess = preProcess,
  method="knn",
  metric='Accuracy',
  tuneLength=20
  ) 

model_knn
## k-Nearest Neighbors 
## 
## 495 samples
##   6 predictor
##   3 classes: 'High', 'Mid', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 445, 446, 445, 447, 446, 445, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.7423937  0.5805538
##    7  0.7556978  0.6004267
##    9  0.7657579  0.6168692
##   11  0.7631707  0.6123394
##   13  0.7573573  0.6036589
##   15  0.7593779  0.6070139
##   17  0.7569697  0.6031741
##   19  0.7591531  0.6063606
##   21  0.7537652  0.5978543
##   23  0.7525370  0.5955120
##   25  0.7539567  0.5973907
##   27  0.7491322  0.5889126
##   29  0.7497358  0.5893327
##   31  0.7462994  0.5830612
##   33  0.7456737  0.5818007
##   35  0.7444814  0.5797942
##   37  0.7424644  0.5761892
##   39  0.7385986  0.5695721
##   41  0.7384029  0.5686863
##   43  0.7410321  0.5726873
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Accuracy of K-Nearest Neighbours model by running it on train data

plot(model_knn)

5.4.2 Predict happiness level by K-Nearest Neighbours model

pred_knn <- predict(model_knn, data_test)

cm_knn<-confusionMatrix(pred_knn, data_test$Happy.Level)

cm_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Mid Low
##       High   45   9   0
##       Mid    12  84  17
##       Low     0  11  32
## 
## Overall Statistics
##                                                
##                Accuracy : 0.7667               
##                  95% CI : (0.7035, 0.8221)     
##     No Information Rate : 0.4952               
##     P-Value [Acc > NIR] : 0.0000000000000006721
##                                                
##                   Kappa : 0.6212               
##                                                
##  Mcnemar's Test P-Value : NA                   
## 
## Statistics by Class:
## 
##                      Class: High Class: Mid Class: Low
## Sensitivity               0.7895     0.8077     0.6531
## Specificity               0.9412     0.7264     0.9317
## Pos Pred Value            0.8333     0.7434     0.7442
## Neg Pred Value            0.9231     0.7938     0.8982
## Prevalence                0.2714     0.4952     0.2333
## Detection Rate            0.2143     0.4000     0.1524
## Detection Prevalence      0.2571     0.5381     0.2048
## Balanced Accuracy         0.8653     0.7671     0.7924

5.4.3 Feature Importance

# Create object of importance of our variables 
knn_importance <- varImp(model_knn) 

# Create box plot of importance of variables
ggplot(data = knn_importance, mapping = aes(x = knn_importance[,1])) + # Data & mapping
  geom_boxplot() + # Create box plot
  labs(title = "Variable importance: K-Nearest Neighbours ") + # Title
  theme_light() # Theme

5.5 Random Forest Classification model for predicting Happy Level

model_rf <- train(Happy.Level~.,
                  data_train,
                  method="rf",
                  preProcess = preProcess,
                  trControl=tc)

model_rf
## Random Forest 
## 
## 495 samples
##   6 predictor
##   3 classes: 'High', 'Mid', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 446, 446, 446, 445, 447, 445, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.7578917  0.6042773
##   4     0.7601203  0.6088770
##   6     0.7613414  0.6103306
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
plot(model_rf)

5.5.1 Predict happiness Level by Random Forest Classification

pred_rf <- predict(model_rf, data_test)

cm_rf<-confusionMatrix(pred_rf, data_test$Happy.Level)

cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Mid Low
##       High   45   4   0
##       Mid    12  88  11
##       Low     0  12  38
## 
## Overall Statistics
##                                                
##                Accuracy : 0.8143               
##                  95% CI : (0.755, 0.8645)      
##     No Information Rate : 0.4952               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7001               
##                                                
##  Mcnemar's Test P-Value : NA                   
## 
## Statistics by Class:
## 
##                      Class: High Class: Mid Class: Low
## Sensitivity               0.7895     0.8462     0.7755
## Specificity               0.9739     0.7830     0.9255
## Pos Pred Value            0.9184     0.7928     0.7600
## Neg Pred Value            0.9255     0.8384     0.9312
## Prevalence                0.2714     0.4952     0.2333
## Detection Rate            0.2143     0.4190     0.1810
## Detection Prevalence      0.2333     0.5286     0.2381
## Balanced Accuracy         0.8817     0.8146     0.8505

Feature Importance

# Create object of importance of our variables 
rf_importance <- varImp(model_rf) 

# Create box plot of importance of variables
ggplot(data = rf_importance, mapping = aes(x = rf_importance[,1])) + # Data & mapping
  geom_boxplot() + # Create box plot
  labs(title = "Variable importance: Random forest model") + # Title
  theme_light() # Theme

5.5.2 Model Evaluation

data.frame(
  accuracy_knn = cm_knn$overall[1],
  accuracy_rf =  cm_rf$overall[1]
)
##          accuracy_knn accuracy_rf
## Accuracy    0.7666667   0.8142857

6 conclusion

6.1 Happiness score Prediction (Regression Model)

Random Forest Regression comes out with the best result compared to others, Support Vector Regression model and Multiple Linear Regression are good in prediction. And finally, Decision Tree was the worst algorithm to predict happiness scores.

6.2 Happiness Level Prediction (Classification Model)

Random Forest Classification is better than K-Neighbors Classifier model

7 Reference