Our main goal is to do an exploratory analysis of the factors that make people happy.
Abalone Physical Measurements Source: https://www.kaggle.com/rodolfomendes/abalone-dataset
#install.packages('tidyverse')
#install.packages('skimr')
#install.packages('ggpubr')
#install.packages("caret")
#install.packages("Metrics")
#install.packages("e1071")
library(Metrics)
library(caret)
library(readr)
library(readxl)
library(dplyr)
library(ggplot2)
library(skimr)
library(tidyr)
library(reshape2)
library(ggpubr)
library(stringr)
library(e1071)
library(pROC)
# df = read_excel("World Hapiness Data (2015-2019).xlsx", sheet = "Sheet1")
happy15_df = read.csv ("data/2015.csv")
happy16_df = read.csv ("data/2016.csv")
happy17_df = read.csv ("data/2017.csv")
happy18_df = read.csv ("data/2018.csv")
happy19_df = read.csv ("data/2019.csv")
head(happy19_df)
## Overall.rank Country.or.region Score GDP.per.capita Social.support
## 1 1 Finland 7.769 1.340 1.587
## 2 2 Denmark 7.600 1.383 1.573
## 3 3 Norway 7.554 1.488 1.582
## 4 4 Iceland 7.494 1.380 1.624
## 5 5 Netherlands 7.488 1.396 1.522
## 6 6 Switzerland 7.480 1.452 1.526
## Healthy.life.expectancy Freedom.to.make.life.choices Generosity
## 1 0.986 0.596 0.153
## 2 0.996 0.592 0.252
## 3 1.028 0.603 0.271
## 4 1.026 0.591 0.354
## 5 0.999 0.557 0.322
## 6 1.052 0.572 0.263
## Perceptions.of.corruption
## 1 0.393
## 2 0.410
## 3 0.341
## 4 0.118
## 5 0.298
## 6 0.343
We have a look at the last one (head is by default with the first 5 rows)
happy18_df=plyr::rename(happy18_df, replace = c( "Country.or.region"="Country",
"Overall.rank"="Happiness.Rank" ,
"GDP.per.capita"="Economy..GDP.per.Capita.",
"Healthy.life.expectancy"="Health..Life.Expectancy.",
"Freedom.to.make.life.choices"="Freedom",
"Perceptions.of.corruption"="Trust..Government.Corruption.",
"Social.support"="Family",
"Score"="Happiness.Score"))
colnames(happy18_df)
## [1] "Happiness.Rank" "Country"
## [3] "Happiness.Score" "Economy..GDP.per.Capita."
## [5] "Family" "Health..Life.Expectancy."
## [7] "Freedom" "Generosity"
## [9] "Trust..Government.Corruption."
happy19_df=plyr::rename(happy19_df, replace = c( "Country.or.region"="Country",
"Overall.rank"="Happiness.Rank" ,
"GDP.per.capita"="Economy..GDP.per.Capita.",
"Healthy.life.expectancy"="Health..Life.Expectancy.",
"Freedom.to.make.life.choices"="Freedom",
"Perceptions.of.corruption"="Trust..Government.Corruption.",
"Social.support"="Family",
"Score"="Happiness.Score"))
colnames(happy19_df)
## [1] "Happiness.Rank" "Country"
## [3] "Happiness.Score" "Economy..GDP.per.Capita."
## [5] "Family" "Health..Life.Expectancy."
## [7] "Freedom" "Generosity"
## [9] "Trust..Government.Corruption."
happy15_df=plyr::rename(happy15_df, replace = c( "Happiness Rank" = "Happiness.Rank",
"Happiness Score" = "Happiness.Score",
"Economy (GDP per Capita)" = "Economy..GDP.per.Capita.",
"Health (Life Expectancy)" = "Health..Life.Expectancy.",
"Trust (Government Corruption)" = "Trust..Government.Corruption.",
"Dystopia Residual"="Dystopia.Residual"
))
colnames(happy15_df)
## [1] "Country" "Region"
## [3] "Happiness.Rank" "Happiness.Score"
## [5] "Standard.Error" "Economy..GDP.per.Capita."
## [7] "Family" "Health..Life.Expectancy."
## [9] "Freedom" "Trust..Government.Corruption."
## [11] "Generosity" "Dystopia.Residual"
happy16_df=plyr::rename(happy16_df, replace = c( "Happiness Rank" = "Happiness.Rank",
"Happiness Score" = "Happiness.Score",
"Economy (GDP per Capita)" = "Economy..GDP.per.Capita.",
"Health (Life Expectancy)" = "Health..Life.Expectancy.",
"Trust (Government Corruption)" = "Trust..Government.Corruption.",
"Dystopia Residual"="Dystopia.Residual"
))
colnames(happy16_df)
## [1] "Country" "Region"
## [3] "Happiness.Rank" "Happiness.Score"
## [5] "Lower.Confidence.Interval" "Upper.Confidence.Interval"
## [7] "Economy..GDP.per.Capita." "Family"
## [9] "Health..Life.Expectancy." "Freedom"
## [11] "Trust..Government.Corruption." "Generosity"
## [13] "Dystopia.Residual"
#2015
happy15_df<-cbind(Year=2015,happy15_df)
happy16_df<-cbind(Year=2016,happy16_df)
happy17_df<-cbind(Year=2017,happy17_df)
happy18_df<-cbind(Year=2018,happy18_df)
happy19_df<-cbind(Year=2019,happy19_df)
happy18_df$Trust..Government.Corruption. = as.numeric(happy18_df$Trust..Government.Corruption.)
str(happy18_df)
## 'data.frame': 156 obs. of 10 variables:
## $ Year : num 2018 2018 2018 2018 2018 ...
## $ Happiness.Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ Happiness.Score : num 7.63 7.59 7.55 7.5 7.49 ...
## $ Economy..GDP.per.Capita. : num 1.3 1.46 1.35 1.34 1.42 ...
## $ Family : num 1.59 1.58 1.59 1.64 1.55 ...
## $ Health..Life.Expectancy. : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ Freedom : num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ Generosity : num 0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ Trust..Government.Corruption.: num 0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
happy15_16<-dplyr::bind_rows(happy15_df,happy16_df)
happy15_16_17<-dplyr::bind_rows(happy15_16,happy17_df)
happy18_19<-dplyr::bind_rows(happy18_df,happy19_df)
df<-dplyr::bind_rows(happy18_19,happy15_16_17)
head(df)
## Year Happiness.Rank Country Happiness.Score Economy..GDP.per.Capita.
## 1 2018 1 Finland 7.632 1.305
## 2 2018 2 Norway 7.594 1.456
## 3 2018 3 Denmark 7.555 1.351
## 4 2018 4 Iceland 7.495 1.343
## 5 2018 5 Switzerland 7.487 1.420
## 6 2018 6 Netherlands 7.441 1.361
## Family Health..Life.Expectancy. Freedom Generosity
## 1 1.592 0.874 0.681 0.202
## 2 1.582 0.861 0.686 0.286
## 3 1.590 0.868 0.683 0.284
## 4 1.644 0.914 0.677 0.353
## 5 1.549 0.927 0.660 0.256
## 6 1.488 0.878 0.638 0.333
## Trust..Government.Corruption. Region Standard.Error Dystopia.Residual
## 1 0.393 <NA> NA NA
## 2 0.340 <NA> NA NA
## 3 0.408 <NA> NA NA
## 4 0.138 <NA> NA NA
## 5 0.357 <NA> NA NA
## 6 0.295 <NA> NA NA
## Lower.Confidence.Interval Upper.Confidence.Interval Whisker.high Whisker.low
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
df$Happiness.Rank = as.numeric(df$Happiness.Rank )
str(df)
## 'data.frame': 782 obs. of 17 variables:
## $ Year : num 2018 2018 2018 2018 2018 ...
## $ Happiness.Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ Happiness.Score : num 7.63 7.59 7.55 7.5 7.49 ...
## $ Economy..GDP.per.Capita. : num 1.3 1.46 1.35 1.34 1.42 ...
## $ Family : num 1.59 1.58 1.59 1.64 1.55 ...
## $ Health..Life.Expectancy. : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ Freedom : num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ Generosity : num 0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ Trust..Government.Corruption.: num 0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
## $ Region : chr NA NA NA NA ...
## $ Standard.Error : num NA NA NA NA NA NA NA NA NA NA ...
## $ Dystopia.Residual : num NA NA NA NA NA NA NA NA NA NA ...
## $ Lower.Confidence.Interval : num NA NA NA NA NA NA NA NA NA NA ...
## $ Upper.Confidence.Interval : num NA NA NA NA NA NA NA NA NA NA ...
## $ Whisker.high : num NA NA NA NA NA NA NA NA NA NA ...
## $ Whisker.low : num NA NA NA NA NA NA NA NA NA NA ...
colSums(is.na(df))
## Year Happiness.Rank
## 0 0
## Country Happiness.Score
## 0 0
## Economy..GDP.per.Capita. Family
## 0 0
## Health..Life.Expectancy. Freedom
## 0 0
## Generosity Trust..Government.Corruption.
## 0 1
## Region Standard.Error
## 467 624
## Dystopia.Residual Lower.Confidence.Interval
## 312 625
## Upper.Confidence.Interval Whisker.high
## 625 627
## Whisker.low
## 627
df = subset(df, select = -c(Lower.Confidence.Interval,Upper.Confidence.Interval,Dystopia.Residual,Standard.Error,Whisker.high,Whisker.low))
head(df)
## Year Happiness.Rank Country Happiness.Score Economy..GDP.per.Capita.
## 1 2018 1 Finland 7.632 1.305
## 2 2018 2 Norway 7.594 1.456
## 3 2018 3 Denmark 7.555 1.351
## 4 2018 4 Iceland 7.495 1.343
## 5 2018 5 Switzerland 7.487 1.420
## 6 2018 6 Netherlands 7.441 1.361
## Family Health..Life.Expectancy. Freedom Generosity
## 1 1.592 0.874 0.681 0.202
## 2 1.582 0.861 0.686 0.286
## 3 1.590 0.868 0.683 0.284
## 4 1.644 0.914 0.677 0.353
## 5 1.549 0.927 0.660 0.256
## 6 1.488 0.878 0.638 0.333
## Trust..Government.Corruption. Region
## 1 0.393 <NA>
## 2 0.340 <NA>
## 3 0.408 <NA>
## 4 0.138 <NA>
## 5 0.357 <NA>
## 6 0.295 <NA>
df$Trust..Government.Corruption.[is.na(df$Trust..Government.Corruption.)] <- median(df$Trust..Government.Corruption., na.rm = T)
colSums(is.na(df))
## Year Happiness.Rank
## 0 0
## Country Happiness.Score
## 0 0
## Economy..GDP.per.Capita. Family
## 0 0
## Health..Life.Expectancy. Freedom
## 0 0
## Generosity Trust..Government.Corruption.
## 0 0
## Region
## 467
Due to the data is describing the happiness score and relative factors for countries accross different years. So, it is important to view the uniformity of the data in Year column of the data.
Country and Region counts groupby Year
aggregate(df$Country, by=list(df$Year), FUN=length)
## Group.1 x
## 1 2015 158
## 2 2016 157
## 3 2017 155
## 4 2018 156
## 5 2019 156
From the table shown as above, the number of countries involved in this dataset for different year is differnt. Therefore, it is neccessary to make an intersection of them to get the most common country list.
Country_2015 = subset(df, Year == 2015)$Country
Country_2016 = subset(df, Year == 2016)$Country
Country_2017 = subset(df, Year == 2017)$Country
Country_2018 = subset(df, Year == 2018)$Country
Country_2019 = subset(df, Year == 2019)$Country
common_country =intersect(intersect(intersect(intersect(Country_2015, Country_2016),Country_2017),Country_2018),Country_2019)
length(common_country)
## [1] 141
Therefore, there are 142 countries’ data existing accross from 2015-2019 in this dataset.Then we need to filter the orginial dataset by this common_country list.
df1 = subset(df,Country %in% common_country)
print(paste("The amount of rows in the dataset is: ",dim(df1)[1]))
print(paste("The amount of columns in the dataset is: ",dim(df1)[2]))
## [1] "The amount of rows in the dataset is: 705"
## [1] "The amount of columns in the dataset is: 11"
str(df1)
## 'data.frame': 705 obs. of 11 variables:
## $ Year : num 2018 2018 2018 2018 2018 ...
## $ Happiness.Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Country : chr "Finland" "Norway" "Denmark" "Iceland" ...
## $ Happiness.Score : num 7.63 7.59 7.55 7.5 7.49 ...
## $ Economy..GDP.per.Capita. : num 1.3 1.46 1.35 1.34 1.42 ...
## $ Family : num 1.59 1.58 1.59 1.64 1.55 ...
## $ Health..Life.Expectancy. : num 0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
## $ Freedom : num 0.681 0.686 0.683 0.677 0.66 0.638 0.653 0.669 0.659 0.647 ...
## $ Generosity : num 0.202 0.286 0.284 0.353 0.256 0.333 0.321 0.365 0.285 0.361 ...
## $ Trust..Government.Corruption.: num 0.393 0.34 0.408 0.138 0.357 0.295 0.291 0.389 0.383 0.302 ...
## $ Region : chr NA NA NA NA ...
Create a new dataset for storing common region and country
common_region <- unique(subset(df1, Region!="NA", c(Country, Region)))
head(common_country)
## [1] "Switzerland" "Iceland" "Denmark" "Norway" "Canada"
## [6] "Finland"
Fill relate region to missing value of region column
assign_region <- function(x){
Region <- common_region$Region[common_region$Country == x]
}
for(country in common_country)
df1$Region[df1$Country == country] <- assign_region(country)
# write_csv(df1, path = "World Hapiness Data (2015-2019)_cleaned.csv")
skimr::skim_without_charts(df1)
| Name | df1 |
| Number of rows | 705 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1 | 4 | 23 | 0 | 141 | 0 |
| Region | 0 | 1 | 12 | 31 | 0 | 10 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Year | 0 | 1 | 2017.00 | 1.42 | 2015.00 | 2016.00 | 2017.00 | 2018.00 | 2019.00 |
| Happiness.Rank | 0 | 1 | 76.85 | 45.28 | 1.00 | 37.00 | 77.00 | 116.00 | 158.00 |
| Happiness.Score | 0 | 1 | 5.43 | 1.13 | 2.84 | 4.52 | 5.39 | 6.29 | 7.77 |
| Economy..GDP.per.Capita. | 0 | 1 | 0.93 | 0.40 | 0.00 | 0.64 | 1.00 | 1.24 | 2.10 |
| Family | 0 | 1 | 1.09 | 0.32 | 0.00 | 0.88 | 1.14 | 1.35 | 1.64 |
| Health..Life.Expectancy. | 0 | 1 | 0.63 | 0.23 | 0.00 | 0.49 | 0.66 | 0.81 | 1.14 |
| Freedom | 0 | 1 | 0.41 | 0.15 | 0.00 | 0.31 | 0.43 | 0.53 | 0.72 |
| Generosity | 0 | 1 | 0.22 | 0.13 | 0.00 | 0.13 | 0.20 | 0.28 | 0.84 |
| Trust..Government.Corruption. | 0 | 1 | 0.12 | 0.11 | 0.00 | 0.05 | 0.09 | 0.15 | 0.55 |
print(paste("The amount of rows in the dataset is: ",dim(df)[1]))
print(paste("The amount of columns in the dataset is: ",dim(df)[2]))
print(paste("the column names in this dataset are:", paste(shQuote(colnames(df)), collapse=", ")))
## [1] "The amount of rows in the dataset is: 782"
## [1] "The amount of columns in the dataset is: 11"
## [1] "the column names in this dataset are: \"Year\", \"Happiness.Rank\", \"Country\", \"Happiness.Score\", \"Economy..GDP.per.Capita.\", \"Family\", \"Health..Life.Expectancy.\", \"Freedom\", \"Generosity\", \"Trust..Government.Corruption.\", \"Region\""
The 10 happiest countries in 2015
df1 %>%
filter(Year == 2015) %>%
arrange(-Happiness.Score) %>%
slice_head(n=10) %>%
ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
geom_point(colour = "red", size = 3) +
theme(text=element_text(size=10)) +
coord_flip() +
labs(title = "The 10 happiest countries in 2015", x = "")
The 10 happiest countries in 2016
df1 %>%
filter(Year == 2016) %>%
arrange(-Happiness.Score) %>%
slice_head(n=10) %>%
ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
geom_point(colour = "red", size = 3) +
theme(text=element_text(size=10)) +
coord_flip() +
labs(title = "The 10 happiest countries in 2016", x = "")
The 10 happiest countries in 2017
df1 %>%
filter(Year == 2017) %>%
arrange(-Happiness.Score) %>%
slice_head(n=10) %>%
ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
geom_point(colour = "red", size = 3) +
theme(text=element_text(size=10)) +
coord_flip() +
labs(title = "The 10 happiest countries in 2017", x = "")
The 10 happiest countries in 2018
df1 %>%
filter(Year == 2018) %>%
arrange(-Happiness.Score) %>%
slice_head(n=10) %>%
ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
geom_point(colour = "red", size = 3) +
theme(text=element_text(size=10)) +
coord_flip() +
labs(title = "The 10 happiest countries in 2018", x = "")
The 10 happiest countries in 2019
df1 %>%
filter(Year == 2019) %>%
arrange(-Happiness.Score) %>%
slice_head(n=10) %>%
ggplot(aes(reorder(Country, Happiness.Score), Happiness.Score)) +
geom_point(colour = "red", size = 3) +
theme(text=element_text(size=10)) +
coord_flip() +
labs(title = "The 10 happiest countries in 2019", x = "")
Region_Yearavg_HappyScore = data.frame(aggregate(df$Happiness.Score, by=list(df$Year,df$Region), FUN=mean))
colnames(Region_Yearavg_HappyScore)<-c("Year","Region","Mean_HappyScore")
Region_Yearavg_HappyScore
## Year Region Mean_HappyScore
## 1 2015 Australia and New Zealand 7.285000
## 2 2016 Australia and New Zealand 7.323500
## 3 2015 Central and Eastern Europe 5.332931
## 4 2016 Central and Eastern Europe 5.370690
## 5 2015 Eastern Asia 5.626167
## 6 2016 Eastern Asia 5.624167
## 7 2015 Latin America and Caribbean 6.144682
## 8 2016 Latin America and Caribbean 6.101750
## 9 2015 Middle East and Northern Africa 5.406900
## 10 2016 Middle East and Northern Africa 5.386053
## 11 2015 North America 7.273000
## 12 2016 North America 7.254000
## 13 2015 Southeastern Asia 5.317444
## 14 2016 Southeastern Asia 5.338889
## 15 2015 Southern Asia 4.580857
## 16 2016 Southern Asia 4.563286
## 17 2015 Sub-Saharan Africa 4.202800
## 18 2016 Sub-Saharan Africa 4.136421
## 19 2015 Western Europe 6.689619
## 20 2016 Western Europe 6.685667
ggplot(Region_Yearavg_HappyScore,aes(x = Year,y = Mean_HappyScore,fill = Region))+
geom_bar(stat = "identity",position = "dodge")
The top 3 happiness region are: Australia and New Zealand, North America and Western Europe
df1 %>%
group_by(Region) %>%
summarise(mscore = mean(Happiness.Score)) %>%
ggplot(aes(reorder(Region, mscore), mscore)) +
geom_point() +
theme_bw() +
coord_flip() +
labs(title = "Happiness Score by Regions",
x = "", y = "Average happiness score")
df1 %>%
group_by(Country) %>%
summarise(mscore = mean(Happiness.Score)) %>%
arrange(-mscore) %>%
slice_head(n=10) %>%
ggplot(aes(reorder(Country, mscore), mscore)) +
geom_point() +
theme_bw() +
coord_flip() +
labs(title = "Happiness Score by Country",
x = "", y = "Average happiness score")
Top 10 Mean Happiness score by counntries trends by years
Top10_happy_country_DF = df1 %>%
group_by(Country) %>%
summarise(mscore = mean(Happiness.Score)) %>%
arrange(-mscore) %>%
slice_head(n=10)
Top10_happy_country_DF_list = c(Top10_happy_country_DF$Country)
df1_Top10_happy_country = subset(df1,Country %in% Top10_happy_country_DF_list)
ggplot(df1_Top10_happy_country, aes(x = Year,y = Happiness.Score,color = Country))+ geom_line()
## The happiness socre of Finland is increasing dramatically from
2015-2019
df1 %>%
mutate(y = as.character(Year)) %>%
select(y, Country, Region, Happiness.Score) %>%
pivot_wider(names_from = y, values_from = Happiness.Score,
names_prefix = "y_") %>%
mutate(p = (y_2019 - y_2015)/y_2015 * 100) %>%
arrange(-p) %>%
slice_head(n = 10) %>%
ggplot(aes(reorder(Country, p), p)) +
geom_point() +
theme_bw() +
coord_flip() +
labs(title = "The 10 most progressive countries from 2015 - 2019",
y = "Percentage Increase of Happiness Score", x = "")
Top10_Progress_country_df = df1 %>%
mutate(y = as.character(Year)) %>%
select(y, Country, Region, Happiness.Score) %>%
pivot_wider(names_from = y, values_from = Happiness.Score,
names_prefix = "y_") %>%
mutate(p = (y_2019 - y_2015)/y_2015 * 100) %>%
arrange(-p) %>%
slice_head(n = 10)
Top10_Progress_country_df_list = c(Top10_Progress_country_df$Country)
df1_Top10_Progress_country = subset(df1,Country %in% Top10_Progress_country_df_list)
ggplot(df1_Top10_Progress_country, aes(x = Year,y = Happiness.Score,color = Country))+ geom_line()
head(df1)
## Year Happiness.Rank Country Happiness.Score Economy..GDP.per.Capita.
## 1 2018 1 Finland 7.632 1.305
## 2 2018 2 Norway 7.594 1.456
## 3 2018 3 Denmark 7.555 1.351
## 4 2018 4 Iceland 7.495 1.343
## 5 2018 5 Switzerland 7.487 1.420
## 6 2018 6 Netherlands 7.441 1.361
## Family Health..Life.Expectancy. Freedom Generosity
## 1 1.592 0.874 0.681 0.202
## 2 1.582 0.861 0.686 0.286
## 3 1.590 0.868 0.683 0.284
## 4 1.644 0.914 0.677 0.353
## 5 1.549 0.927 0.660 0.256
## 6 1.488 0.878 0.638 0.333
## Trust..Government.Corruption. Region
## 1 0.393 Western Europe
## 2 0.340 Western Europe
## 3 0.408 Western Europe
## 4 0.138 Western Europe
## 5 0.357 Western Europe
## 6 0.295 Western Europe
df1 %>%
summarise(gdp = mean(Economy..GDP.per.Capita.),
family = mean(Family),
life.expectancy = mean(Health..Life.Expectancy.),
freedom = mean(Freedom),
generosity = mean(Generosity),
corruption = mean(Trust..Government.Corruption.)) %>%
pivot_longer(c(gdp, family, life.expectancy,freedom,generosity, corruption),
names_to = "f", values_to = "value") %>%
ggplot(aes(reorder(f, value), value)) +
geom_bar(stat = "identity", fill = "darkgreen", width = 0.55, alpha = 0.7) +
geom_text(aes(label = paste0(round(value, 2)), vjust = -0.5)) +
theme_bw() +
labs(title = "The mean value of the factors" , y = "", x = "")
Happiness.Continent <- df1 %>%
select(-c(Year,Happiness.Rank))%>%
group_by(Region) %>%
summarise_at(vars(-Country), funs(mean(., na.rm=TRUE)))
Happiness.Continent.melt <- melt(Happiness.Continent)
# Faceting
ggplot(Happiness.Continent.melt, aes(y=value, x=Region, color=Region, fill=Region)) +
geom_bar( stat="identity") +
facet_wrap(~variable) + theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Average value of happiness variables for different continents",
y = "Average value")
####### Happiness score for each continent
gg1 <- ggplot(df1,
aes(x=Region,
y=Happiness.Score,
color=Region))+
geom_point() + theme_bw() +
theme(axis.text.x = element_text(angle = 90))
gg1
gg2 <- ggplot(df1 , aes(x = Region, y = Happiness.Score)) +
geom_boxplot(aes(fill=Region)) + theme_bw() +
theme(axis.text.x = element_text (angle = 90))
gg2
colnames(df1)
## [1] "Year" "Happiness.Rank"
## [3] "Country" "Happiness.Score"
## [5] "Economy..GDP.per.Capita." "Family"
## [7] "Health..Life.Expectancy." "Freedom"
## [9] "Generosity" "Trust..Government.Corruption."
## [11] "Region"
ggline1 = ggplot(df1, aes(x = Economy..GDP.per.Capita., y = Happiness.Score)) +
geom_point(size = .5, alpha = 0.8) +
geom_smooth(method = "lm", fullrange = TRUE) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline1a = ggplot(df1, aes(x = Economy..GDP.per.Capita., y = Happiness.Score)) +
geom_point(aes(color=Region), size = .5, alpha = 0.8) +
geom_smooth(aes(color = Region, fill = Region),
method = "lm", fullrange = TRUE) +
facet_wrap(~Region) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline1
ggline1a
ggline2 = ggplot(df1, aes(x = Family, y = Happiness.Score)) +
geom_point(size = .5, alpha = 0.8) +
geom_smooth(method = "lm", fullrange = TRUE) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline2a = ggplot(df1, aes(x = Family, y = Happiness.Score)) +
geom_point(aes(color=Region), size = .5, alpha = 0.8) +
geom_smooth(aes(color = Region, fill = Region),
method = "lm", fullrange = TRUE) +
facet_wrap(~Region) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline2
ggline2a
ggline3 = ggplot(df1, aes(x = Health..Life.Expectancy., y = Happiness.Score)) +
geom_point(size = .5, alpha = 0.8) +
geom_smooth(method = "lm", fullrange = TRUE) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline3a = ggplot(df1, aes(x = Health..Life.Expectancy., y = Happiness.Score)) +
geom_point(aes(color=Region), size = .5, alpha = 0.8) +
geom_smooth(aes(color = Region, fill = Region),
method = "lm", fullrange = TRUE) +
facet_wrap(~Region) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline3
ggline3a
ggline4 = ggplot(df1, aes(x =Freedom, y = Happiness.Score)) +
geom_point(size = .5, alpha = 0.8) +
geom_smooth(method = "lm", fullrange = TRUE) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline4a = ggplot(df1, aes(x =Freedom, y = Happiness.Score)) +
geom_point(aes(color=Region), size = .5, alpha = 0.8) +
geom_smooth(aes(color = Region, fill = Region),
method = "lm", fullrange = TRUE) +
facet_wrap(~Region) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline4
ggline4a
ggline5 = ggplot(df1, aes(x = Trust..Government.Corruption., y = Happiness.Score)) +
geom_point(size = .5, alpha = 0.8) +
geom_smooth(method = "lm", fullrange = TRUE) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline5a = ggplot(df1, aes(x = Trust..Government.Corruption., y = Happiness.Score)) +
geom_point(aes(color=Region), size = .5, alpha = 0.8) +
geom_smooth(aes(color = Region, fill = Region),
method = "lm", fullrange = TRUE) +
facet_wrap(~Region) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline5
ggline5a
ggline6 = ggplot(df1, aes(x = Generosity, y = Happiness.Score)) +
geom_point(size = .5, alpha = 0.8) +
geom_smooth(method = "lm", fullrange = TRUE) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline6a = ggplot(df1, aes(x = Generosity, y = Happiness.Score)) +
geom_point(aes(color=Region), size = .5, alpha = 0.8) +
geom_smooth(aes(color = Region, fill = Region),
method = "lm", fullrange = TRUE) +
facet_wrap(~Region) +
theme_bw() + labs(title = "Scatter plot with regression line")
ggline6
ggline6a
library(corrplot)
Num.cols <- sapply(df1, is.numeric)
Cor.data <- cor(df1[, Num.cols])
corrplot(Cor.data, method = 'color')
library(GGally)
ggcorr(df1, label = TRUE, label_round = 2, label_size = 3.5, size = 2, hjust = .85) +
ggtitle("Correlation Heatmap") +
theme(plot.title = element_text(hjust = 0.5))
Based on the heatmap, we should drop Year,Country,Happiness.Rank,Region column
dataset= select(df1,-c("Year","Country","Happiness.Rank","Region"))
head(dataset)
## Happiness.Score Economy..GDP.per.Capita. Family Health..Life.Expectancy.
## 1 7.632 1.305 1.592 0.874
## 2 7.594 1.456 1.582 0.861
## 3 7.555 1.351 1.590 0.868
## 4 7.495 1.343 1.644 0.914
## 5 7.487 1.420 1.549 0.927
## 6 7.441 1.361 1.488 0.878
## Freedom Generosity Trust..Government.Corruption.
## 1 0.681 0.202 0.393
## 2 0.686 0.286 0.340
## 3 0.683 0.284 0.408
## 4 0.677 0.353 0.138
## 5 0.660 0.256 0.357
## 6 0.638 0.333 0.295
rge_dif=round((max(dataset$Happiness.Score)-min(dataset$Happiness.Score))/3,3)
low=min(dataset$Happiness.Score)+rge_dif
mid=low+rge_dif
print(paste("range difference in happiness score: ",rge_dif))
print(paste('upper bound of Low grp',low))
print(paste('upper bound of Mid grp',mid))
print(paste('upper bound of High grp','max:',max(dataset$Happiness.Score)))
## [1] "range difference in happiness score: 1.643"
## [1] "upper bound of Low grp 4.482"
## [1] "upper bound of Mid grp 6.125"
## [1] "upper bound of High grp max: 7.769"
“Happy.Level” columns ⟶ transformed from “”Happiness.Score column.
dataset_level <- dataset %>%
mutate(Happy.Level=case_when(
Happiness.Score <=low ~ "Low",
Happiness.Score>low & Happiness.Score <=mid ~ "Mid",
Happiness.Score >mid ~ "High"
)) %>%
mutate(Happy.Level=factor(Happy.Level, levels=c("High", "Mid", "Low"))) %>%
select(-Happiness.Score)
# Splitting the dataset into the Training set and Test set
set.seed(123)
split=0.80
trainIndex <- createDataPartition(dataset$Happiness.Score, p=split, list=FALSE)
data_train <- dataset[ trainIndex,]
data_test <- dataset[-trainIndex,]
# Fitting Multiple Linear Regression to the Training set
lm_model = lm(formula = Happiness.Score ~ .,
data = data_train)
summary(lm_model)
##
## Call:
## lm(formula = Happiness.Score ~ ., data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.87619 -0.33193 0.00798 0.34507 1.43298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.09629 0.09560 21.927 < 0.0000000000000002
## Economy..GDP.per.Capita. 1.14157 0.09825 11.619 < 0.0000000000000002
## Family 0.64275 0.09459 6.795 0.0000000000277987
## Health..Life.Expectancy. 1.26063 0.16461 7.658 0.0000000000000837
## Freedom 1.21029 0.20527 5.896 0.0000000064505937
## Generosity 0.71311 0.19481 3.661 0.000276
## Trust..Government.Corruption. 0.96843 0.26401 3.668 0.000268
##
## (Intercept) ***
## Economy..GDP.per.Capita. ***
## Family ***
## Health..Life.Expectancy. ***
## Freedom ***
## Generosity ***
## Trust..Government.Corruption. ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5422 on 558 degrees of freedom
## Multiple R-squared: 0.7722, Adjusted R-squared: 0.7697
## F-statistic: 315.2 on 6 and 558 DF, p-value: < 0.00000000000000022
An (adjusted) R2 that is close to 1 indicates that a large proportion of the variability in the outcome has been explained by the regression model.
A number near 0 indicates that the regression model did not explain much of the variability in the outcome.
Our adjusted R2 is 0.7697, which is good.
y_pred_lm = predict(lm_model, newdata = data_test)
Actual_lm = data_test$Happiness.Score
Pred_Actual_lm <- as.data.frame(cbind(Prediction = y_pred_lm, Actual = Actual_lm))
gg.lm <- ggplot(Pred_Actual_lm, aes(Actual, Prediction )) +
geom_point() + theme_bw() + geom_abline() +
labs(title = "Multiple Linear Regression", x = "Actual happiness score",
y = "Predicted happiness score") +
theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
axis.title = element_text(family = "Helvetica", size = (10)))
gg.lm
data.frame(
R2 = R2(y_pred_lm, data_test$Happiness.Score),
RMSE = RMSE(y_pred_lm, data_test$Happiness.Score),
MAE = MAE(y_pred_lm, data_test$Happiness.Score)
)
## R2 RMSE MAE
## 1 0.7643535 0.5478055 0.4256454
library(e1071)
regressor_svr = svm(formula = Happiness.Score ~ .,
data = data_train,
type = 'eps-regression',
kernel = 'radial')
# Predicting a new result
y_pred_svr = predict(regressor_svr, newdata = data_test)
Pred_Actual_svr <- as.data.frame(cbind(Prediction = y_pred_svr, Actual = data_test$Happiness.Score))
Pred_Actual_lm.versus.svr <- cbind(Prediction.lm = y_pred_lm, Prediction.svr = y_pred_svr, Actual = data_test$Happiness.Score)
gg.svr <- ggplot(Pred_Actual_svr, aes(Actual, Prediction )) +
geom_point() + theme_bw() + geom_abline() +
labs(title = "SVR", x = "Actual happiness score",
y = "Predicted happiness score") +
theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
axis.title = element_text(family = "Helvetica", size = (10)))
gg.svr
data.frame(
R2 = R2(y_pred_svr, data_test$Happiness.Score),
RMSE = RMSE(y_pred_svr, data_test$Happiness.Score),
MAE = MAE(y_pred_svr, data_test$Happiness.Score)
)
## R2 RMSE MAE
## 1 0.8246303 0.4740831 0.3504708
library(rpart)
regressor_dt = rpart(formula = Happiness.Score ~ .,
data = data_train,
control = rpart.control(minsplit = 10))
# Predicting a new result with Decision Tree Regression
y_pred_dt = predict(regressor_dt, newdata = data_test)
Pred_Actual_dt <- as.data.frame(cbind(Prediction = y_pred_dt, Actual = data_test$Happiness.Score))
gg.dt <- ggplot(Pred_Actual_dt, aes(Actual, Prediction )) +
geom_point() + theme_bw() + geom_abline() +
labs(title = "Decision Tree Regression", x = "Actual happiness score",
y = "Predicted happiness score") +
theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
axis.title = element_text(family = "Helvetica", size = (10)))
gg.dt
#install.packages("rpart.plot")
library(rpart.plot)
prp(regressor_dt)
data.frame(
R2 = R2(y_pred_dt, data_test$Happiness.Score),
RMSE = RMSE(y_pred_dt, data_test$Happiness.Score),
MAE = MAE(y_pred_dt, data_test$Happiness.Score)
)
## R2 RMSE MAE
## 1 0.682486 0.6362329 0.5223723
library(randomForest)
x_train_rf<-select(dataset,-c("Happiness.Score"))
set.seed(1234)
regressor_rf = randomForest(x = x_train_rf,
y = dataset$Happiness.Score,
ntree = 500)
# Predicting a new result with Random Forest Regression
y_pred_rf = predict(regressor_rf, newdata = data_test)
Pred_Actual_rf <- as.data.frame(cbind(Prediction = y_pred_rf, Actual = data_test$Happiness.Score))
gg.rf <- ggplot(Pred_Actual_rf, aes(Actual, Prediction )) +
geom_point() + theme_bw() + geom_abline() +
labs(title = "Random Forest Regression", x = "Actual happiness score",
y = "Predicted happiness score") +
theme(plot.title = element_text(family = "Helvetica", face = "bold", size = (15)),
axis.title = element_text(family = "Helvetica", size = (10)))
gg.rf
data.frame(
R2 = R2(y_pred_rf, data_test$Happiness.Score),
RMSE = RMSE(y_pred_rf, data_test$Happiness.Score),
MAE = MAE(y_pred_rf, data_test$Happiness.Score)
)
## R2 RMSE MAE
## 1 0.9692887 0.2104387 0.1561681
ggarrange(gg.lm, gg.svr, gg.dt, gg.rf, ncol = 2, nrow = 3)
#defined a preprocess variable, which includes the two operations: center and scale
preProcess <- c("center","scale")
# Splitting the dataset into the Training set and Test set
set.seed(123)
split=0.70
trainIndex <- createDataPartition(dataset_level$Happy.Level, p=split, list=FALSE)
data_train <- dataset_level[ trainIndex,]
data_test <- dataset_level[-trainIndex,]
tc <- trainControl(method = "repeatedcv", number=10,#10-fold cross validation
repeats = 10,classProbs = TRUE)
set.seed(123)
model_knn <- train(
Happy.Level~.,
data=data_train,
trControl=tc,
preProcess = preProcess,
method="knn",
metric='Accuracy',
tuneLength=20
)
model_knn
## k-Nearest Neighbors
##
## 495 samples
## 6 predictor
## 3 classes: 'High', 'Mid', 'Low'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 445, 446, 445, 447, 446, 445, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7423937 0.5805538
## 7 0.7556978 0.6004267
## 9 0.7657579 0.6168692
## 11 0.7631707 0.6123394
## 13 0.7573573 0.6036589
## 15 0.7593779 0.6070139
## 17 0.7569697 0.6031741
## 19 0.7591531 0.6063606
## 21 0.7537652 0.5978543
## 23 0.7525370 0.5955120
## 25 0.7539567 0.5973907
## 27 0.7491322 0.5889126
## 29 0.7497358 0.5893327
## 31 0.7462994 0.5830612
## 33 0.7456737 0.5818007
## 35 0.7444814 0.5797942
## 37 0.7424644 0.5761892
## 39 0.7385986 0.5695721
## 41 0.7384029 0.5686863
## 43 0.7410321 0.5726873
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
Accuracy of K-Nearest Neighbours model by running it on train data
plot(model_knn)
pred_knn <- predict(model_knn, data_test)
cm_knn<-confusionMatrix(pred_knn, data_test$Happy.Level)
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Mid Low
## High 45 9 0
## Mid 12 84 17
## Low 0 11 32
##
## Overall Statistics
##
## Accuracy : 0.7667
## 95% CI : (0.7035, 0.8221)
## No Information Rate : 0.4952
## P-Value [Acc > NIR] : 0.0000000000000006721
##
## Kappa : 0.6212
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: High Class: Mid Class: Low
## Sensitivity 0.7895 0.8077 0.6531
## Specificity 0.9412 0.7264 0.9317
## Pos Pred Value 0.8333 0.7434 0.7442
## Neg Pred Value 0.9231 0.7938 0.8982
## Prevalence 0.2714 0.4952 0.2333
## Detection Rate 0.2143 0.4000 0.1524
## Detection Prevalence 0.2571 0.5381 0.2048
## Balanced Accuracy 0.8653 0.7671 0.7924
# Create object of importance of our variables
knn_importance <- varImp(model_knn)
# Create box plot of importance of variables
ggplot(data = knn_importance, mapping = aes(x = knn_importance[,1])) + # Data & mapping
geom_boxplot() + # Create box plot
labs(title = "Variable importance: K-Nearest Neighbours ") + # Title
theme_light() # Theme
model_rf <- train(Happy.Level~.,
data_train,
method="rf",
preProcess = preProcess,
trControl=tc)
model_rf
## Random Forest
##
## 495 samples
## 6 predictor
## 3 classes: 'High', 'Mid', 'Low'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 446, 446, 446, 445, 447, 445, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7578917 0.6042773
## 4 0.7601203 0.6088770
## 6 0.7613414 0.6103306
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
plot(model_rf)
pred_rf <- predict(model_rf, data_test)
cm_rf<-confusionMatrix(pred_rf, data_test$Happy.Level)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Mid Low
## High 45 4 0
## Mid 12 88 11
## Low 0 12 38
##
## Overall Statistics
##
## Accuracy : 0.8143
## 95% CI : (0.755, 0.8645)
## No Information Rate : 0.4952
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7001
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: High Class: Mid Class: Low
## Sensitivity 0.7895 0.8462 0.7755
## Specificity 0.9739 0.7830 0.9255
## Pos Pred Value 0.9184 0.7928 0.7600
## Neg Pred Value 0.9255 0.8384 0.9312
## Prevalence 0.2714 0.4952 0.2333
## Detection Rate 0.2143 0.4190 0.1810
## Detection Prevalence 0.2333 0.5286 0.2381
## Balanced Accuracy 0.8817 0.8146 0.8505
Feature Importance
# Create object of importance of our variables
rf_importance <- varImp(model_rf)
# Create box plot of importance of variables
ggplot(data = rf_importance, mapping = aes(x = rf_importance[,1])) + # Data & mapping
geom_boxplot() + # Create box plot
labs(title = "Variable importance: Random forest model") + # Title
theme_light() # Theme
data.frame(
accuracy_knn = cm_knn$overall[1],
accuracy_rf = cm_rf$overall[1]
)
## accuracy_knn accuracy_rf
## Accuracy 0.7666667 0.8142857
Random Forest Regression comes out with the best result compared to others, Support Vector Regression model and Multiple Linear Regression are good in prediction. And finally, Decision Tree was the worst algorithm to predict happiness scores.
Random Forest Classification is better than K-Neighbors Classifier model