##install required packages

# Packages this analysis depends on.
required_packages <- c(
  "readr", "tidyr", "dplyr", "ggplot2", "rworldmap",
  "DT", "ggcorrplot", "viridis", "plotly", "RCurl"
)

# Install only what is missing, so re-running the script does not download and
# reinstall every package on each execution.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

Load the required libraries

library(readr)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(rworldmap)

## Loading required package: sp

## ### Welcome to rworldmap ###

## For a short introduction type :   vignette('rworldmap')

library(DT) 
library(ggcorrplot)
library(viridis)

## Loading required package: viridisLite

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(RCurl)

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

setting working directory

# Print the current working directory (knitted output shown below).
getwd()

## [1] "/cloud/project/data"

# Switch the working directory to the project's data folder.
# NOTE(review): this is a hard-coded absolute path, so the script only runs
# where this directory exists.
setwd("/cloud/project/data")

loading data from .csv files

# Folder holding one CSV per survey year (2015-2019).
data_dir <- "/cloud/project/data"

# Read text columns as character rather than factor. The default differs
# between R versions (factor before R 4.0), and factor Country/Region keys with
# different level sets trigger implicit-coercion warnings in the joins below.
h_2015 <- read.csv(file.path(data_dir, "2015.csv"), stringsAsFactors = FALSE)
h_2016 <- read.csv(file.path(data_dir, "2016.csv"), stringsAsFactors = FALSE)
h_2017 <- read.csv(file.path(data_dir, "2017.csv"), stringsAsFactors = FALSE)
h_2018 <- read.csv(file.path(data_dir, "2018.csv"), stringsAsFactors = FALSE)
h_2019 <- read.csv(file.path(data_dir, "2019.csv"), stringsAsFactors = FALSE)

Preprocessing and cleaning of database

#Selecting variables from data set for year 2015

names(h_2015)

##  [1] "Country"                       "Region"                       
##  [3] "Happiness.Rank"                "Happiness.Score"              
##  [5] "Standard.Error"                "Economy..GDP.per.Capita."     
##  [7] "Family"                        "Health..Life.Expectancy."     
##  [9] "Freedom"                       "Trust..Government.Corruption."
## [11] "Generosity"                    "Dystopia.Residual"

# Keep the identifying columns plus the score, GDP and life-expectancy
# measures for 2015.
pre15 <- select(
  h_2015,
  Country, Region, Happiness.Score,
  Economy..GDP.per.Capita., Health..Life.Expectancy.
)
names(pre15)

## [1] "Country"                  "Region"                  
## [3] "Happiness.Score"          "Economy..GDP.per.Capita."
## [5] "Health..Life.Expectancy."

#Selecting variables from data set for year 2016

names(h_2016)

##  [1] "Country"                       "Region"                       
##  [3] "Happiness.Rank"                "Happiness.Score"              
##  [5] "Lower.Confidence.Interval"     "Upper.Confidence.Interval"    
##  [7] "Economy..GDP.per.Capita."      "Family"                       
##  [9] "Health..Life.Expectancy."      "Freedom"                      
## [11] "Trust..Government.Corruption." "Generosity"                   
## [13] "Dystopia.Residual"

# Keep the identifying columns plus the score, GDP and life-expectancy
# measures for 2016.
pre16 <- select(
  h_2016,
  Country, Region, Happiness.Score,
  Economy..GDP.per.Capita., Health..Life.Expectancy.
)
names(pre16)

## [1] "Country"                  "Region"                  
## [3] "Happiness.Score"          "Economy..GDP.per.Capita."
## [5] "Health..Life.Expectancy."

#Selecting variables from data set for year 2017

names(h_2017)

##  [1] "Country"                       "Happiness.Rank"               
##  [3] "Happiness.Score"               "Whisker.high"                 
##  [5] "Whisker.low"                   "Economy..GDP.per.Capita."     
##  [7] "Family"                        "Health..Life.Expectancy."     
##  [9] "Freedom"                       "Generosity"                   
## [11] "Trust..Government.Corruption." "Dystopia.Residual"

# Keep the country name plus the score, GDP and life-expectancy measures for
# 2017 (this table has no Region column to select).
pre17 <- select(
  h_2017,
  Country, Happiness.Score,
  Economy..GDP.per.Capita., Health..Life.Expectancy.
)
names(pre17)

## [1] "Country"                  "Happiness.Score"         
## [3] "Economy..GDP.per.Capita." "Health..Life.Expectancy."

adding region in combined data

New dataset containing Country and Region

# Country -> Region lookup taken from the 2015 data, because the 2017 table
# has no Region column of its own.
ref <- select(h_2015, Country, Region)
#ref

# Join on character keys: joining factor columns whose level sets differ makes
# dplyr coerce them implicitly and emit a warning. Countries absent from the
# 2015 lookup end up with Region = NA.
# Columns are ordered by name rather than by position, so the result does not
# depend on how many columns the join happens to produce.
pre17 <- pre17 %>%
  mutate(Country = as.character(Country)) %>%
  left_join(mutate(ref, Country = as.character(Country)), by = "Country") %>%
  select(Country, Region, everything())
#dim(pre17)
str(pre17)

## 'data.frame':    155 obs. of  5 variables:
##  $ Country                 : chr  "Norway" "Denmark" "Iceland" "Switzerland" ...
##  $ Region                  : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 10 6 1 10 1 ...
##  $ Happiness.Score         : num  7.54 7.52 7.5 7.49 7.47 ...
##  $ Economy..GDP.per.Capita.: num  1.62 1.48 1.48 1.56 1.44 ...
##  $ Health..Life.Expectancy.: num  0.797 0.793 0.834 0.858 0.809 ...

colSums(is.na(pre17))

##                  Country                   Region          Happiness.Score 
##                        0                        6                        0 
## Economy..GDP.per.Capita. Health..Life.Expectancy. 
##                        0                        0

# Drop every row that contains a missing value (here: countries whose Region
# could not be looked up). The former bare `c()` call was a no-op, and the
# index vector assigned to `c` was never read while shadowing base::c, so both
# are removed.
pre17 <- na.omit(pre17)
head(pre17)

##       Country         Region Happiness.Score Economy..GDP.per.Capita.
## 1      Norway Western Europe           7.537                 1.616463
## 2     Denmark Western Europe           7.522                 1.482383
## 3     Iceland Western Europe           7.504                 1.480633
## 4 Switzerland Western Europe           7.494                 1.564980
## 5     Finland Western Europe           7.469                 1.443572
## 6 Netherlands Western Europe           7.377                 1.503945
##   Health..Life.Expectancy.
## 1                0.7966665
## 2                0.7925655
## 3                0.8335521
## 4                0.8581313
## 5                0.8091577
## 6                0.8106961

#Selecting variables from data set for year 2018

names(h_2018)

## [1] "Overall.rank"                 "Country.or.region"           
## [3] "Score"                        "GDP.per.capita"              
## [5] "Social.support"               "Healthy.life.expectancy"     
## [7] "Freedom.to.make.life.choices" "Generosity"                  
## [9] "Perceptions.of.corruption"

# Keep the country name plus the score, GDP and life-expectancy columns for
# 2018.
pre_18 <- select(
  h_2018,
  Country.or.region, Score, GDP.per.capita, Healthy.life.expectancy
)
names(pre_18)

## [1] "Country.or.region"       "Score"                  
## [3] "GDP.per.capita"          "Healthy.life.expectancy"

# Name the key column "Country", as in the 2015-2017 tables.
pre18 <- rename(pre_18, Country = Country.or.region)
head(pre18)

##       Country Score GDP.per.capita Healthy.life.expectancy
## 1     Finland 7.632          1.305                   0.874
## 2      Norway 7.594          1.456                   0.861
## 3     Denmark 7.555          1.351                   0.868
## 4     Iceland 7.495          1.343                   0.914
## 5 Switzerland 7.487          1.420                   0.927
## 6 Netherlands 7.441          1.361                   0.878

Data wrangling to get combined dataframe for EDA

now, adding region in combined data

New dataset containing Country and Region

# Country -> Region lookup taken from the 2015 data, because the 2018 table
# has no Region column of its own.
ref <- select(h_2015, Country, Region)
#ref

# Join on character keys: joining factor columns whose level sets differ makes
# dplyr coerce them implicitly and emit a warning. Countries absent from the
# 2015 lookup end up with Region = NA.
# Columns are ordered by name rather than by position, so the result does not
# depend on how many columns the join happens to produce.
pre18 <- pre18 %>%
  mutate(Country = as.character(Country)) %>%
  left_join(mutate(ref, Country = as.character(Country)), by = "Country") %>%
  select(Country, Region, everything())
#dim(pre18)
str(pre18)

## 'data.frame':    156 obs. of  5 variables:
##  $ Country                : chr  "Finland" "Norway" "Denmark" "Iceland" ...
##  $ Region                 : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 10 6 1 10 1 ...
##  $ Score                  : num  7.63 7.59 7.55 7.5 7.49 ...
##  $ GDP.per.capita         : num  1.3 1.46 1.35 1.34 1.42 ...
##  $ Healthy.life.expectancy: num  0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...

colSums(is.na(pre18))

##                 Country                  Region                   Score 
##                       0                       6                       0 
##          GDP.per.capita Healthy.life.expectancy 
##                       0                       0

# Drop every row that contains a missing value (here: countries whose Region
# could not be looked up). The former bare `c()` call was a no-op, and the
# index vector assigned to `c` was never read while shadowing base::c, so both
# are removed.
pre18 <- na.omit(pre18)

#Selecting variables from data set for year 2019

names(h_2019)

## [1] "Overall.rank"                 "Country.or.region"           
## [3] "Score"                        "GDP.per.capita"              
## [5] "Social.support"               "Healthy.life.expectancy"     
## [7] "Freedom.to.make.life.choices" "Generosity"                  
## [9] "Perceptions.of.corruption"

# Keep the country name plus the score, GDP and life-expectancy columns for
# 2019.
pre_19 <- select(
  h_2019,
  Country.or.region, Score, GDP.per.capita, Healthy.life.expectancy
)
names(pre_19)

## [1] "Country.or.region"       "Score"                  
## [3] "GDP.per.capita"          "Healthy.life.expectancy"

#adding region column

# Name the key column "Country", as in the 2015-2017 tables.
pre19 <- rename(pre_19, Country = Country.or.region)

# Country -> Region lookup taken from the 2015 data, because the 2019 table
# has no Region column of its own.
ref <- select(h_2015, Country, Region)
#ref

# Join on character keys: joining factor columns whose level sets differ makes
# dplyr coerce them implicitly and emit a warning. Countries absent from the
# 2015 lookup end up with Region = NA.
# Columns are ordered by name rather than by position. The former
# `rename(Region = Region)` step renamed a column to itself and is dropped.
pre19 <- pre19 %>%
  mutate(Country = as.character(Country)) %>%
  left_join(mutate(ref, Country = as.character(Country)), by = "Country") %>%
  select(Country, Region, everything())
#dim(pre19)
str(pre19)

## 'data.frame':    156 obs. of  5 variables:
##  $ Country                : chr  "Finland" "Denmark" "Norway" "Iceland" ...
##  $ Region                 : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 10 10 1 6 10 ...
##  $ Score                  : num  7.77 7.6 7.55 7.49 7.49 ...
##  $ GDP.per.capita         : num  1.34 1.38 1.49 1.38 1.4 ...
##  $ Healthy.life.expectancy: num  0.986 0.996 1.028 1.026 0.999 ...

colSums(is.na(pre19))

##                 Country                  Region                   Score 
##                       0                       7                       0 
##          GDP.per.capita Healthy.life.expectancy 
##                       0                       0

# Row indices of pre19 that contain exactly one missing value.
# NOTE(review): this variable is named `c` and shadows base::c for non-call
# lookups; it is not read anywhere in the visible script. Kept in case later
# code depends on it. The former bare `c()` call was a no-op and is removed.
c <- which(rowSums(is.na(pre19)) == 1)

# Drop every row that contains a missing value.
pre19 <- na.omit(pre19)

To standardize the datasets, rename the variables consistently across all 5 datasets

# Give every year's table the same measure names: Score, GDP and Life.
# The 2015-2017 tables share one set of source column names.
fin15 <- rename(
  pre15,
  Score = Happiness.Score,
  GDP = Economy..GDP.per.Capita.,
  Life = Health..Life.Expectancy.
)
fin16 <- rename(
  pre16,
  Score = Happiness.Score,
  GDP = Economy..GDP.per.Capita.,
  Life = Health..Life.Expectancy.
)
fin17 <- rename(
  pre17,
  Score = Happiness.Score,
  GDP = Economy..GDP.per.Capita.,
  Life = Health..Life.Expectancy.
)

# The 2018-2019 tables already call the score column "Score", so only GDP and
# Life need renaming.
fin18 <- rename(pre18, GDP = GDP.per.capita, Life = Healthy.life.expectancy)
fin19 <- rename(pre19, GDP = GDP.per.capita, Life = Healthy.life.expectancy)

head(pre19)

##       Country         Region Score GDP.per.capita Healthy.life.expectancy
## 1     Finland Western Europe 7.769          1.340                   0.986
## 2     Denmark Western Europe 7.600          1.383                   0.996
## 3      Norway Western Europe 7.554          1.488                   1.028
## 4     Iceland Western Europe 7.494          1.380                   1.026
## 5 Netherlands Western Europe 7.488          1.396                   0.999
## 6 Switzerland Western Europe 7.480          1.452                   1.052

Combining datasets for comparison of 5-year scores

# Score table for 2015-2017. Each later year is attached by Country with a
# left join, and na.omit() then discards every row that contains an NA, which
# removes countries with no match in the year just joined.
# After the joins the colliding Score columns are Score.x (2015), Score.y
# (2016) and Score (2017); select() keeps and renames them in one step.
pre_happiness <- fin15 %>%
  left_join(fin16, by = "Country") %>%
  na.omit() %>%
  left_join(fin17, by = "Country") %>%
  na.omit() %>%
  select(
    Country, Region,
    Score_2015 = Score.x, Score_2016 = Score.y, Score_2017 = Score
  )

## Warning: Column `Country` joining factors with different levels, coercing to
## character vector

# Round the 2017 scores to two decimals.
pre_happiness[["Score_2017"]] <- round(pre_happiness[["Score_2017"]], 2)

# Count missing values left in the combined table.

sum(is.na(pre_happiness))

## [1] 0

head(pre_happiness)

##       Country         Region Score_2015 Score_2016 Score_2017
## 1 Switzerland Western Europe      7.587      7.509       7.49
## 2     Iceland Western Europe      7.561      7.501       7.50
## 3     Denmark Western Europe      7.527      7.526       7.52
## 4      Norway Western Europe      7.522      7.498       7.54
## 5      Canada  North America      7.427      7.404       7.32
## 6     Finland Western Europe      7.406      7.413       7.47

Common data set for comparison of Happiness Scores over 5 years

# Extend the 2015-2017 score table with 2018 and 2019. Each year is attached
# by Country with a left join, and na.omit() then discards rows containing an
# NA. After both joins the colliding Score columns are Score.x (2018) and
# Score.y (2019); select() keeps and renames them in one step.
happiness <- pre_happiness %>%
  left_join(fin18, by = "Country") %>%
  na.omit() %>%
  left_join(fin19, by = "Country") %>%
  na.omit() %>%
  select(
    Country, Region, Score_2015, Score_2016, Score_2017,
    Score_2018 = Score.x, Score_2019 = Score.y
  )

# Round the 2019 scores to two decimals.
happiness[["Score_2019"]] <- round(happiness[["Score_2019"]], 2)

# Count missing values left in the combined table.
sum(is.na(happiness))

## [1] 0

head(happiness)

##       Country         Region Score_2015 Score_2016 Score_2017 Score_2018
## 1 Switzerland Western Europe      7.587      7.509       7.49      7.487
## 2     Iceland Western Europe      7.561      7.501       7.50      7.495
## 3     Denmark Western Europe      7.527      7.526       7.52      7.555
## 4      Norway Western Europe      7.522      7.498       7.54      7.594
## 5      Canada  North America      7.427      7.404       7.32      7.328
## 6     Finland Western Europe      7.406      7.413       7.47      7.632
##   Score_2019
## 1       7.48
## 2       7.49
## 3       7.60
## 4       7.55
## 5       7.28
## 6       7.77

combining 5 datasets and introduce a new variable for year

# Combine 2017, 2016 and 2015 on Country + Region, keeping only countries
# present in all three. The join order fixes the column suffixes: the first
# join yields Score.x/GDP.x/Life.x (2017) and Score.y/GDP.y/Life.y (2016); the
# 2015 columns then join without a name clash and stay unsuffixed.
master1 <- inner_join(fin17,fin16, by = c("Country","Region")) %>%
          inner_join(fin15,by = c("Country","Region"))

## Warning: Column `Country` joining character vector and factor, coercing into
## character vector

## Warning: Column `Country` joining character vector and factor, coercing into
## character vector

# Combine 2019 and 2018 the same way, then attach master1. The second level of
# suffixes gives: *.x.x = 2019, *.y.x = 2018, *.x.y = 2017, *.y.y = 2016, and
# unsuffixed Score/GDP/Life = 2015. Changing the join order changes these names.
master<- inner_join(fin19,fin18, by = c("Country","Region")) %>%
          inner_join(master1,by = c("Country","Region"))
names(master)

##  [1] "Country"   "Region"    "Score.x.x" "GDP.x.x"   "Life.x.x"  "Score.y.x"
##  [7] "GDP.y.x"   "Life.y.x"  "Score.x.y" "GDP.x.y"   "Life.x.y"  "Score.y.y"
## [13] "GDP.y.y"   "Life.y.y"  "Score"     "GDP"       "Life"

# Replace the join-generated suffixes with explicit year labels.
finmaster <- rename(
  master,
  Score_2019 = Score.x.x, GDP_2019 = GDP.x.x, `Life Expectancy_2019` = Life.x.x,
  Score_2018 = Score.y.x, GDP_2018 = GDP.y.x, `Life Expectancy_2018` = Life.y.x,
  Score_2017 = Score.x.y, GDP_2017 = GDP.x.y, `Life Expectancy_2017` = Life.x.y,
  Score_2016 = Score.y.y, GDP_2016 = GDP.y.y, `Life Expectancy_2016` = Life.y.y,
  Score_2015 = Score, GDP_2015 = GDP, `Life Expectancy_2015` = Life
)

# Order the columns chronologically (2015 first); within each year the order
# is Life Expectancy, GDP, Score.
finmaster <- finmaster[, c(
  "Country", "Region",
  unlist(lapply(2015:2019, function(yr) {
    paste0(c("Life Expectancy_", "GDP_", "Score_"), yr)
  }))
)]
head(finmaster)

##       Country         Region Life Expectancy_2015 GDP_2015 Score_2015
## 1     Finland Western Europe              0.88911  1.29025      7.406
## 2     Denmark Western Europe              0.87464  1.32548      7.527
## 3      Norway Western Europe              0.88521  1.45900      7.522
## 4     Iceland Western Europe              0.94784  1.30232      7.561
## 5 Netherlands Western Europe              0.89284  1.32944      7.378
## 6 Switzerland Western Europe              0.94143  1.39651      7.587
##   Life Expectancy_2016 GDP_2016 Score_2016 Life Expectancy_2017 GDP_2017
## 1              0.81091  1.40598      7.413            0.8091577 1.443572
## 2              0.79504  1.44178      7.526            0.7925655 1.482383
## 3              0.79579  1.57744      7.498            0.7966665 1.616463
## 4              0.86733  1.42666      7.501            0.8335521 1.480633
## 5              0.81231  1.46468      7.339            0.8106961 1.503945
## 6              0.86303  1.52733      7.509            0.8581313 1.564980
##   Score_2017 Life Expectancy_2018 GDP_2018 Score_2018 Life Expectancy_2019
## 1      7.469                0.874    1.305      7.632                0.986
## 2      7.522                0.868    1.351      7.555                0.996
## 3      7.537                0.861    1.456      7.594                1.028
## 4      7.504                0.914    1.343      7.495                1.026
## 5      7.377                0.878    1.361      7.441                0.999
## 6      7.494                0.927    1.420      7.487                1.052
##   GDP_2019 Score_2019
## 1    1.340      7.769
## 2    1.383      7.600
## 3    1.488      7.554
## 4    1.380      7.494
## 5    1.396      7.488
## 6    1.452      7.480

Joining datasets based on 3 variables

# For each measure, keep one column per year (named by the year), then stack
# those five columns into long form with a Year key.

# Happiness score, taken from the five-year score table.
score.df <- select(
  happiness, Country, Region,
  `2015` = Score_2015, `2016` = Score_2016, `2017` = Score_2017,
  `2018` = Score_2018, `2019` = Score_2019
)
score.comb <- gather(score.df, key = "Year", value = "HappinessScore", 3:7)

# GDP, taken from the wide master table.
GDP.df <- select(
  finmaster, Country, Region,
  `2015` = GDP_2015, `2016` = GDP_2016, `2017` = GDP_2017,
  `2018` = GDP_2018, `2019` = GDP_2019
)
GDP.comb <- gather(GDP.df, key = "Year", value = "GDP", 3:7)

# Life expectancy, taken from the wide master table.
life.df <- select(
  finmaster, Country, Region,
  `2015` = `Life Expectancy_2015`, `2016` = `Life Expectancy_2016`,
  `2017` = `Life Expectancy_2017`, `2018` = `Life Expectancy_2018`,
  `2019` = `Life Expectancy_2019`
)
life.comb <- gather(life.df, key = "Year", value = "LifeExpectancy", 3:7)

# One row per Country/Region/Year carrying all three measures.
combined <- score.comb %>%
  inner_join(GDP.comb, by = c("Country", "Region", "Year")) %>%
  inner_join(life.comb, by = c("Country", "Region", "Year"))
head(combined)

##       Country         Region Year HappinessScore     GDP LifeExpectancy
## 1 Switzerland Western Europe 2015          7.587 1.39651        0.94143
## 2     Iceland Western Europe 2015          7.561 1.30232        0.94784
## 3     Denmark Western Europe 2015          7.527 1.32548        0.87464
## 4      Norway Western Europe 2015          7.522 1.45900        0.88521
## 5      Canada  North America 2015          7.427 1.32629        0.90563
## 6     Finland Western Europe 2015          7.406 1.29025        0.88911

#Aggregating data by regional affiliation

# Regional means of Score, GDP and Life for 2015.
# Only the numeric measure columns are aggregated. Passing the whole data frame
# also ran mean() on the non-numeric Country and Region columns, which produced
# all-NA columns and one warning per region per column.
line15 <- aggregate(
  x = fin15[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin15$Region)),
  FUN = "mean"
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line15new <- rename(line15, RegionName = Group.1)

# Regional means of Score, GDP and Life for 2016.
# Only the numeric measure columns are aggregated. Passing the whole data frame
# also ran mean() on the non-numeric Country and Region columns, which produced
# all-NA columns and one warning per region per column.
line16 <- aggregate(
  x = fin16[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin16$Region)),
  FUN = "mean"
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line16new <- rename(line16, RegionName = Group.1)

# Regional means of Score, GDP and Life for 2017.
# Only the numeric measure columns are aggregated. Passing the whole data frame
# also ran mean() on the non-numeric Country and Region columns, which produced
# all-NA columns and one warning per region per column.
line17 <- aggregate(
  x = fin17[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin17$Region)),
  FUN = "mean"
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line17new <- rename(line17, RegionName = Group.1)

# Regional means of Score, GDP and Life for 2018.
# Only the numeric measure columns are aggregated. Passing the whole data frame
# also ran mean() on the non-numeric Country and Region columns, which produced
# all-NA columns and one warning per region per column.
line18 <- aggregate(
  x = fin18[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin18$Region)),
  FUN = "mean"
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line18new <- rename(line18, RegionName = Group.1)

# Regional means of Score, GDP and Life for 2019.
# Only the numeric measure columns are aggregated. Passing the whole data frame
# also ran mean() on the non-numeric Country and Region columns, which produced
# all-NA columns and one warning per region per column.
line19 <- aggregate(
  x = fin19[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin19$Region)),
  FUN = "mean"
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line19new <- rename(line19, RegionName = Group.1)

Exploratory data analysis

#plotting happiness score region wise for year 2015

# Lollipop chart of the 2015 regional mean score: a vertical segment from 0 up
# to the score, topped by a coloured point, one per region.
theme_set(theme_bw())
ggplot(line15new, aes(RegionName, Score)) +
  geom_point(aes(color = RegionName), size = 3) +
  geom_segment(aes(x = RegionName, xend = RegionName, y = 0, yend = Score)) +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Happiness Score by Region for 2015")

#Checking change in happiness score

# Regional mean scores for 2015-2017 in long form. After the two joins the
# colliding Score columns are Score.x (2015), Score.y (2016) and Score (2017).
overallhap <- line15new %>%
  inner_join(line16new, by = "RegionName") %>%
  inner_join(line17new, by = "RegionName") %>%
  select(RegionName, `2015` = Score.x, `2016` = Score.y, `2017` = Score) %>%
  gather(key = "Year", value = "AverageHappinessScore", 2:4)

# Line chart of the regional mean score per year, one coloured line per
# region, with grid lines removed. The text layer is drawn at size 0.
theme_set(theme_bw())
ggplot(overallhap, aes(Year, AverageHappinessScore, color = RegionName)) +
  geom_line(aes(group = RegionName)) +
  geom_text(aes(label = RegionName), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

head(overallhap)

##                        RegionName Year AverageHappinessScore
## 1       Australia and New Zealand 2015              7.285000
## 2      Central and Eastern Europe 2015              5.332931
## 3                    Eastern Asia 2015              5.626167
## 4     Latin America and Caribbean 2015              6.144682
## 5 Middle East and Northern Africa 2015              5.406900
## 6                   North America 2015              7.273000

# Regional mean scores for 2018 and 2019 in long form.
# Joining line18new with line19new makes Score.x the 2018 mean and Score.y the
# 2019 mean; the unsuffixed Score added by the line17new join is the 2017 mean.
# The previous labels ('2017' = Score.x, '2018' = Score.y, '2019' = Score) were
# shifted by one column, so the "2018" series showed 2019 data and the "2019"
# series showed 2017 data. The columns are now labelled with their true year.
# The join with line17new is kept only to restrict the result to regions that
# also appear in 2017; its Score column is not selected, so no post-hoc
# filtering on Year is needed.
overallhap1 <- inner_join(line18new, line19new, by = "RegionName") %>%
  inner_join(line17new, by = "RegionName") %>%
  select(RegionName, `2018` = Score.x, `2019` = Score.y) %>%
  gather(Year, AverageHappinessScore, 2:3)

head(overallhap1)

##                         RegionName Year AverageHappinessScore
## 11       Australia and New Zealand 2018              7.267500
## 12      Central and Eastern Europe 2018              5.571786
## 13                    Eastern Asia 2018              5.688833
## 14     Latin America and Caribbean 2018              5.942550
## 15 Middle East and Northern Africa 2018              5.237000
## 16                   North America 2018              7.085000

# Stack the 2015-2017 and 2018-2019 regional means into one long table.
totalhapreg <- rbind(overallhap, overallhap1)

## Plot the average happiness score per region across all available years.

theme_set(theme_bw())
ggplot(totalhapreg, aes(Year, AverageHappinessScore, color = RegionName)) +
  geom_line(aes(group = RegionName)) +
  geom_text(aes(label = RegionName), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

##Now plotting interactive plot with help of plotly to further explore processed data

#plotly
# One colour per region for the bubble chart below.
colors <- c("red", "green", "blue", "purple", "navyblue", "yellow", "darkgrey", "orange", "violet", "maroon")
# Bubble chart: GDP on x, life expectancy on y, one trace colour per Region,
# marker diameter taken from HappinessScore, country name as hover text.
fig <- plot_ly(combined, x = ~GDP, y = ~LifeExpectancy, text = ~Country,
               type = 'scatter', mode = 'markers',
               color = ~Region, colors = colors,
               #Choosing the range of the bubbles' sizes:
               sizes = c(1, 50),
               marker = list(size = ~HappinessScore, opacity = 0.75, sizemode = 'diameter'))
# Replaces the placeholder title 'xxx' with one that describes the chart.
fig <- fig %>% layout(title = 'GDP vs Life Expectancy (marker size = Happiness Score)',
                      xaxis = list(showgrid = TRUE),
                      yaxis = list(showgrid = TRUE))

fig

#plotly
# One colour per region for the bubble chart below.
colors <- c("red", "green", "blue", "purple", "navyblue", "yellow", "darkgrey", "orange", "violet", "maroon")
# Target on-screen diameter (in px) of the largest bubble.
desired_maximum_marker_size <- 40
# Extract the scores as a plain numeric vector (`[[`), not a one-column data frame.
your_list_of_size_values <- combined[["HappinessScore"]]
# With sizemode = 'diameter' plotly divides marker.size by sizeref, so this
# makes the largest score render at desired_maximum_marker_size px. The
# previous formula reduced to sizeref = max(size), capping every bubble at 1px.
sizeref <- max(your_list_of_size_values, na.rm = TRUE) / desired_maximum_marker_size
# Bubble chart: GDP on x, life expectancy on y, one trace colour per Region,
# bubble diameter from HappinessScore. The size is passed as a formula so that
# plotly evaluates it against `combined` together with the other mappings.
fig <- plot_ly(combined, x = ~GDP, y = ~LifeExpectancy, text = ~Country,
               type = 'scatter', mode = 'markers',
               color = ~Region, colors = colors,
               marker = list(size = ~HappinessScore, opacity = 1,
                             sizemode = 'diameter', sizeref = sizeref))
fig <- fig %>% layout(title = '5 year average GDP vs Life Expectancy',
                      xaxis = list(showgrid = TRUE),
                      yaxis = list(showgrid = TRUE))

# Display the chart; previously it was built and then overwritten unseen.
fig

#Tuning charts

#quick chart plotting
# Scatter of happiness score against GDP, one colour per Region.
# The trace type and mode are stated explicitly instead of being left for
# plotly to infer (it infers exactly these: 'scatter' with 'markers').
fig <- plot_ly(data = combined, x = ~GDP, y = ~HappinessScore, color = ~Region,
               type = 'scatter', mode = 'markers')

fig

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

##Plotting chart to see which country is getting happier and which country is getting unhappier.

library(ggplot2)
library(ggalt)

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

# Pair every country in fin15 with its row in fin19 (when there is one) and
# keep only the two score columns: Score.x comes from fin15, Score.y from fin19.
happiness.index <- select(
  left_join(fin15, fin19, by = "Country"),
  Country, Score.x, Score.y
)

## Warning: Column `Country` joining factor and character vector, coercing into
## character vector

# Turn Country into a factor whose levels follow the current row order.
happiness.index$Country <- factor(
  happiness.index$Country,
  levels = as.character(happiness.index$Country)
)
# Give the score columns year-based names.
names(happiness.index) <- c("Country", "Score_2015", "Score_2019")

# Dumbbell chart of 2015 vs 2019 scores for the countries whose score moved in
# one direction.
#   scores    - data frame with Country, Score_2015 and Score_2019 columns
#   direction - 1 keeps countries whose score rose, -1 those whose score fell
#   title     - plot title
# Returns the ggplot object (auto-printed when called at top level).
plot_score_change <- function(scores, direction, title) {
  # `a` is the size of the change in the requested direction; it is positive
  # only for countries that moved that way.
  changed <- scores %>% mutate(a = direction * (Score_2019 - Score_2015))

  ggplot() +
    geom_dumbbell(data = changed %>% filter(a > 0),
                  aes(x = Score_2019, xend = Score_2015, y = Country, group = Country),
                  color = "coral1", colour_xend = "coral4", size = 1.5) +
    # Second layer is drawn with color = NA and only contributes dotted guide
    # lines for the 10 largest movers; the weight column is named explicitly.
    geom_dumbbell(data = changed %>% arrange(desc(a)) %>% top_n(10, a),
                  aes(x = Score_2019, xend = Score_2015, y = Country, group = Country),
                  color = NA, dot_guide = TRUE, dot_guide_colour = 'chocolate1') +
    labs(x = NULL, y = NULL, title = title) +
    theme(plot.title = element_text(face = "bold"),
          plot.background = element_rect(fill = "cornsilk1"),
          panel.background = element_rect(fill = "cornsilk1"),
          axis.text.x = element_text(size = 9),
          axis.text.y = element_text(size = 6),
          panel.grid.major.y = element_blank(),
          panel.border = element_blank())
}

# Countries whose score rose from 2015 to 2019 (previously mistitled as the
# "less happy" group).
plot_score_change(happiness.index, 1, 'Happier Countries: 2015 vs 2019')

# Countries whose score fell from 2015 to 2019 (previously mistitled as the
# "happier" group).
plot_score_change(happiness.index, -1, 'Less Happy Countries: 2015 vs 2019')

##Plotting data on map

#Worldmap2015 happiness scores
# Attach the fin15 rows to the world map by matching on country name.
world15 <- joinCountryData2Map(
  dF = fin15,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)

## 155 codes from your data successfully matched countries in the map
## 3 codes from your data failed to match with a country code in the map
## 88 codes from the map weren't represented in your data

# Draw the map using the Score column.
mapCountryData(
  mapToPlot = world15,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2015"
)

#Worldmap2016 happiness scores
# Attach the fin16 rows to the world map by matching on country name.
world16 <- joinCountryData2Map(
  dF = fin16,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)

## 154 codes from your data successfully matched countries in the map
## 3 codes from your data failed to match with a country code in the map
## 89 codes from the map weren't represented in your data

# Draw the map using the Score column.
mapCountryData(
  mapToPlot = world16,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2016"
)

#Worldmap2017 happiness scores
# Attach the fin17 rows to the world map by matching on country name.
world17 <- joinCountryData2Map(
  dF = fin17,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)

## 147 codes from your data successfully matched countries in the map
## 2 codes from your data failed to match with a country code in the map
## 96 codes from the map weren't represented in your data

# Draw the map using the Score column.
mapCountryData(
  mapToPlot = world17,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2017"
)

#Worldmap2018 happiness scores
# Attach the fin18 rows to the world map by matching on country name.
world18 <- joinCountryData2Map(
  dF = fin18,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)

## 149 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 94 codes from the map weren't represented in your data

# Draw the map using the Score column.
mapCountryData(
  mapToPlot = world18,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2018"
)

#Worldmap2019 happiness scores
# Attach the fin19 rows to the world map by matching on country name.
world19 <- joinCountryData2Map(
  dF = fin19,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)

## 148 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 95 codes from the map weren't represented in your data

# Draw the map using the Score column.
mapCountryData(
  mapToPlot = world19,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2019"
)

##Plotting chart to check changes across 3 variables

#Line chart of variation over five years in happiness score
# Countries whose fin19 Score is above 7.25.
c1 <- select(filter(fin19, Score > 7.25), Country)
c2 <- as.list(c1)
#View(c2)

# Every row of the combined data belonging to one of those countries.
viz1 <- filter(combined, Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = HappinessScore, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years in GDP per capita
# Countries whose fin19 GDP is above 1.4.
c1 <- select(filter(fin19, GDP > 1.4), Country)
c2 <- as.list(c1)
#View(c2)

# Every row of the combined data belonging to one of those countries.
viz1 <- filter(combined, Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = GDP, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years in life expectancy
# Countries whose fin19 Life value is above 1.028.
c1 <- select(filter(fin19, Life > 1.028), Country)
c2 <- as.list(c1)
#View(c2)

# Every row of the combined data belonging to one of those countries.
viz1 <- filter(combined, Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = LifeExpectancy, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Plotting chart to see correlation between variables

#Plotting chart for Happiness score Vs GDP per capita

# Treat Year as categorical so each year gets its own colour and its own
# smoothed curve in the three plots below.
combined$Year <- factor(combined$Year)

# Year is mapped to `color`, so the viridis palette has to be applied through
# the discrete colour scale; the previous fill scale targeted an aesthetic
# that is not mapped and therefore had no effect.
ggplot(data = combined, aes(x = GDP, y = HappinessScore, color = Year)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Plotting chart for Happiness score Vs life expectancy

ggplot(data = combined, aes(x = LifeExpectancy, y = HappinessScore, color = Year)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Plotting chart for GDP per capita Vs life expectancy

ggplot(data = combined, aes(x = GDP, y = LifeExpectancy, color = Year)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##Line diagrams for top and bottom performers based on 3 variables

#Top and bottom countries based on happiness score

#Line chart of variation over five years
# Countries whose fin19 Score is above 7.25.
c1 <- select(filter(fin19, Score > 7.25), Country)
c2 <- as.list(c1)
#View(c2)

# Every row of the combined data belonging to one of those countries.
viz1 <- filter(combined, Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = HappinessScore, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years
# Countries whose fin19 Score is below 3.9.
c1 <- select(filter(fin19, Score < 3.9), Country)
c2 <- as.list(c1)
#View(c2)

# Every row of the combined data belonging to one of those countries.
viz1 <- filter(combined, Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = HappinessScore, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Top and bottom countries based on life expectancy and GDP

#Line chart of variation over five years
# Countries whose fin19 Life value is above 1.03.
c1 <- select(filter(fin19, Life > 1.03), Country)
c2 <- as.list(c1)
#View(c2)

# Every row of the combined data belonging to one of those countries.
viz1 <- filter(combined, Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = LifeExpectancy, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years
# Countries whose fin19 GDP is below 0.325.
c1 <- select(filter(fin19, GDP < 0.325), Country)
c2 <- as.list(c1)
#View(c2)

# Every row of the combined data belonging to one of those countries.
viz1 <- filter(combined, Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = GDP, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Plotting box plot for happiness score across region

# Interactive box plot of fin15 Score per Region; every observation is also
# drawn as a point, offset from its box by pointpos.
fin15 %>%
  plot_ly(
    x = ~Region,
    y = ~Score,
    color = ~Region,
    type = "box",
    boxpoints = "all",
    pointpos = -1.8
  ) %>%
  layout(
    # Tick labels on the x axis are switched off.
    xaxis = list(showticklabels = FALSE),
    margin = list(b = 100)
  )

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

#Box plot tuning

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

## The following object is masked from 'package:readr':
## 
##     col_factor

# Horizontal box plot of fin19 Score per Region, with the individual countries
# overlaid as small jittered points (coloured by Country); legend suppressed.
box <- ggplot(data = fin19, mapping = aes(x = Region, y = Score, color = Region)) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = Country), size = 0.5) +
  labs(title = "Happiness Score for Regions and Countries") +
  coord_flip() +
  theme(legend.position = "none")

# Convert the static ggplot into an interactive plotly widget.
ggplotly(box)

##Plotting data for insights in data

# Bubble chart of happiness score against healthy life expectancy, coloured by
# Region and sized by happiness score, with a custom hover label.
# The trace type and mode are stated explicitly instead of being left for
# plotly to infer (it infers exactly these: 'scatter' with 'markers').
plot1 <- plot_ly(combined, x = ~HappinessScore,
        y = ~LifeExpectancy,
        type = "scatter",
        mode = "markers",
        color = ~Region,
        colors = c("red","orange","yellow","green","cyan","purple","darkgreen","grey","gold","darkblue"),
        size = ~HappinessScore,
        # Show only the text built below when hovering.
        hoverinfo = 'text',
        text = ~paste("Happiness Score:",HappinessScore,
                    "</br>Health Life Expectancy:",LifeExpectancy,
                    "</br>Country:",Country,
                    "</br>Region:",Region)) %>%
  layout(xaxis = list(title = "Happiness Score"),
         yaxis = list(title = "Health Life Expectancy"))

# Same chart with GDP per capita on the y axis.
plot2 <- plot_ly(combined, x = ~HappinessScore,
        y = ~GDP,
        type = "scatter",
        mode = "markers",
        color = ~Region,
        colors = c("red","orange","yellow","green","cyan","purple","darkgreen","grey","gold","darkblue"),
        size = ~HappinessScore,
        # Show only the text built below when hovering.
        hoverinfo = 'text',
        text = ~paste("Happiness Score:",HappinessScore,
                    "</br>GDP per capita:",GDP,
                    "</br>Country:",Country,
                    "</br>Region:",Region)) %>%
  layout(xaxis = list(title = "Happiness Score"),
         yaxis = list(title = "GDP per capita"))

#Plotting
# Display the Happiness Score vs Health Life Expectancy chart stored in plot1.

plot1

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

# Display the Happiness Score vs GDP per capita chart stored in plot2.
plot2

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Factors influencing the World Happiness Index

install required library

setting working directory

loading data from .csv files

Preprocessing and cleaning of database

adding region in combined data

new dataset containing Country and region

Data wrangling to get combined dataframe for EDA

now, adding region in combined data

new dataset containing Country and region

To standardize datasets, renaming variable across 5 datasets

combining datasets for comparison of 5 year scores

Common data set for comparison of Happiness Scores over 5 years

combining 5 datasets and introduce a new variable for year

Joining datasets based on 3 variables

Exploratory data analysis

Plotting chart to see correlation between variables