##install required packages

# Install only the packages that are not already available. Calling
# install.packages() unconditionally re-downloads every package on each run,
# which is slow and fails when there is no network access.
required_packages <- c(
  "readr", "tidyr", "dplyr", "ggplot2", "rworldmap",
  "DT", "ggcorrplot", "viridis", "plotly", "RCurl"
)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

Load the required libraries

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library(DT) 
library(ggcorrplot)
library(viridis) 
## Loading required package: viridisLite
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete

setting working directory

# Show the current working directory (auto-printed at top level).
getwd()
## [1] "/cloud/project/data"
# Switch to the project's data folder.
# NOTE(review): hard-coded absolute path; the script only runs unchanged on a
# machine that has this directory layout.
setwd("/cloud/project/data")

loading data from .csv files

# Read the yearly happiness data files (2015-2019), one data frame per year.
h_2015 <- read.csv(file = "/cloud/project/data/2015.csv")
h_2016 <- read.csv(file = "/cloud/project/data/2016.csv")
h_2017 <- read.csv(file = "/cloud/project/data/2017.csv")
h_2018 <- read.csv(file = "/cloud/project/data/2018.csv")
h_2019 <- read.csv(file = "/cloud/project/data/2019.csv")

Preprocessing and cleaning of database

#Selecting variables from data set for year 2015

# List the columns available in the 2015 file.
names(h_2015)
##  [1] "Country"                       "Region"                       
##  [3] "Happiness.Rank"                "Happiness.Score"              
##  [5] "Standard.Error"                "Economy..GDP.per.Capita."     
##  [7] "Family"                        "Health..Life.Expectancy."     
##  [9] "Freedom"                       "Trust..Government.Corruption."
## [11] "Generosity"                    "Dystopia.Residual"
# Keep only the identifying columns plus score, GDP and life expectancy.
pre15 <- select(
  h_2015,
  Country, Region, Happiness.Score,
  Economy..GDP.per.Capita., Health..Life.Expectancy.
)
names(pre15)
## [1] "Country"                  "Region"                  
## [3] "Happiness.Score"          "Economy..GDP.per.Capita."
## [5] "Health..Life.Expectancy."

#Selecting variables from data set for year 2016

# List the columns available in the 2016 file.
names(h_2016)
##  [1] "Country"                       "Region"                       
##  [3] "Happiness.Rank"                "Happiness.Score"              
##  [5] "Lower.Confidence.Interval"     "Upper.Confidence.Interval"    
##  [7] "Economy..GDP.per.Capita."      "Family"                       
##  [9] "Health..Life.Expectancy."      "Freedom"                      
## [11] "Trust..Government.Corruption." "Generosity"                   
## [13] "Dystopia.Residual"
# Keep only the identifying columns plus score, GDP and life expectancy.
pre16 <- select(
  h_2016,
  Country, Region, Happiness.Score,
  Economy..GDP.per.Capita., Health..Life.Expectancy.
)
names(pre16)
## [1] "Country"                  "Region"                  
## [3] "Happiness.Score"          "Economy..GDP.per.Capita."
## [5] "Health..Life.Expectancy."

#Selecting variables from data set for year 2017

# List the columns available in the 2017 file (it has no Region column).
names(h_2017)
##  [1] "Country"                       "Happiness.Rank"               
##  [3] "Happiness.Score"               "Whisker.high"                 
##  [5] "Whisker.low"                   "Economy..GDP.per.Capita."     
##  [7] "Family"                        "Health..Life.Expectancy."     
##  [9] "Freedom"                       "Generosity"                   
## [11] "Trust..Government.Corruption." "Dystopia.Residual"
# Keep country, score, GDP and life expectancy only.
pre17 <- select(
  h_2017,
  Country, Happiness.Score,
  Economy..GDP.per.Capita., Health..Life.Expectancy.
)
names(pre17)
## [1] "Country"                  "Happiness.Score"         
## [3] "Economy..GDP.per.Capita." "Health..Life.Expectancy."

Adding the region to the 2017 data

Lookup table containing Country and Region (taken from the 2015 data)

# The 2017 file has no Region column, so use the 2015 file as a
# Country -> Region lookup and join it onto the 2017 data.
ref <- select(h_2015, Country, Region)
#ref
pre17 <- left_join(pre17, ref, by = "Country")
## Warning: Column `Country` joining factors with different levels, coercing to
## character vector
# Put Region right after Country. Columns are picked by name rather than by
# position so the result does not depend on the column order of the join.
pre17 <- select(
  pre17,
  Country, Region, Happiness.Score,
  Economy..GDP.per.Capita., Health..Life.Expectancy.
)
#dim(pre17)
str(pre17)
## 'data.frame':    155 obs. of  5 variables:
##  $ Country                 : chr  "Norway" "Denmark" "Iceland" "Switzerland" ...
##  $ Region                  : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 10 6 1 10 1 ...
##  $ Happiness.Score         : num  7.54 7.52 7.5 7.49 7.47 ...
##  $ Economy..GDP.per.Capita.: num  1.62 1.48 1.48 1.56 1.44 ...
##  $ Health..Life.Expectancy.: num  0.797 0.793 0.834 0.858 0.809 ...
colSums(is.na(pre17))
##                  Country                   Region          Happiness.Score 
##                        0                        6                        0 
## Economy..GDP.per.Capita. Health..Life.Expectancy. 
##                        0                        0
# Countries whose name has no match in the 2015 file end up with an NA Region
# (6 rows above) and are dropped here.
pre17 <- na.omit(pre17)
head(pre17)
##       Country         Region Happiness.Score Economy..GDP.per.Capita.
## 1      Norway Western Europe           7.537                 1.616463
## 2     Denmark Western Europe           7.522                 1.482383
## 3     Iceland Western Europe           7.504                 1.480633
## 4 Switzerland Western Europe           7.494                 1.564980
## 5     Finland Western Europe           7.469                 1.443572
## 6 Netherlands Western Europe           7.377                 1.503945
##   Health..Life.Expectancy.
## 1                0.7966665
## 2                0.7925655
## 3                0.8335521
## 4                0.8581313
## 5                0.8091577
## 6                0.8106961

#Selecting variables from data set for year 2018

# List the columns available in the 2018 file.
names(h_2018)
## [1] "Overall.rank"                 "Country.or.region"           
## [3] "Score"                        "GDP.per.capita"              
## [5] "Social.support"               "Healthy.life.expectancy"     
## [7] "Freedom.to.make.life.choices" "Generosity"                  
## [9] "Perceptions.of.corruption"
# Keep country, score, GDP and life expectancy only.
pre_18 <- select(
  h_2018,
  Country.or.region, Score, GDP.per.capita, Healthy.life.expectancy
)
names(pre_18)
## [1] "Country.or.region"       "Score"                  
## [3] "GDP.per.capita"          "Healthy.life.expectancy"
# Use the same key column name ("Country") as the 2015-2017 data.
pre18 <- rename(pre_18, Country = Country.or.region)
head(pre18)
##       Country Score GDP.per.capita Healthy.life.expectancy
## 1     Finland 7.632          1.305                   0.874
## 2      Norway 7.594          1.456                   0.861
## 3     Denmark 7.555          1.351                   0.868
## 4     Iceland 7.495          1.343                   0.914
## 5 Switzerland 7.487          1.420                   0.927
## 6 Netherlands 7.441          1.361                   0.878

Data wrangling to get combined dataframe for EDA

Now, adding the region to the 2018 data

Lookup table containing Country and Region (taken from the 2015 data)

# The 2018 file has no Region column, so use the 2015 file as a
# Country -> Region lookup and join it onto the 2018 data.
ref <- select(h_2015, Country, Region)
#ref
pre18 <- left_join(pre18, ref, by = "Country")
## Warning: Column `Country` joining factors with different levels, coercing to
## character vector
# Put Region right after Country. Columns are picked by name rather than by
# position so the result does not depend on the column order of the join.
pre18 <- select(
  pre18,
  Country, Region, Score, GDP.per.capita, Healthy.life.expectancy
)
#dim(pre18)
str(pre18)
## 'data.frame':    156 obs. of  5 variables:
##  $ Country                : chr  "Finland" "Norway" "Denmark" "Iceland" ...
##  $ Region                 : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 10 6 1 10 1 ...
##  $ Score                  : num  7.63 7.59 7.55 7.5 7.49 ...
##  $ GDP.per.capita         : num  1.3 1.46 1.35 1.34 1.42 ...
##  $ Healthy.life.expectancy: num  0.874 0.861 0.868 0.914 0.927 0.878 0.896 0.876 0.913 0.91 ...
colSums(is.na(pre18))
##                 Country                  Region                   Score 
##                       0                       6                       0 
##          GDP.per.capita Healthy.life.expectancy 
##                       0                       0
# Countries whose name has no match in the 2015 file end up with an NA Region
# (6 rows above) and are dropped here.
pre18 <- na.omit(pre18)

#Selecting variables from data set for year 2019

# List the columns available in the 2019 file.
names(h_2019)
## [1] "Overall.rank"                 "Country.or.region"           
## [3] "Score"                        "GDP.per.capita"              
## [5] "Social.support"               "Healthy.life.expectancy"     
## [7] "Freedom.to.make.life.choices" "Generosity"                  
## [9] "Perceptions.of.corruption"
# Keep country, score, GDP and life expectancy only.
pre_19 <- select(
  h_2019,
  Country.or.region, Score, GDP.per.capita, Healthy.life.expectancy
)
names(pre_19)
## [1] "Country.or.region"       "Score"                  
## [3] "GDP.per.capita"          "Healthy.life.expectancy"

#adding region column

# Use the same key column name ("Country") as the 2015-2017 data.
pre19 <- pre_19 %>% rename(Country = Country.or.region)
# The 2019 file has no Region column, so use the 2015 file as a
# Country -> Region lookup and join it onto the 2019 data.
ref <- select(h_2015, Country, Region)
#ref
pre19 <- left_join(pre19, ref, by = "Country")
## Warning: Column `Country` joining factors with different levels, coercing to
## character vector
# Put Region right after Country. Columns are picked by name rather than by
# position so the result does not depend on the column order of the join.
pre19 <- select(
  pre19,
  Country, Region, Score, GDP.per.capita, Healthy.life.expectancy
)
#dim(pre19)
str(pre19)
## 'data.frame':    156 obs. of  5 variables:
##  $ Country                : chr  "Finland" "Denmark" "Norway" "Iceland" ...
##  $ Region                 : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 10 10 10 1 6 10 ...
##  $ Score                  : num  7.77 7.6 7.55 7.49 7.49 ...
##  $ GDP.per.capita         : num  1.34 1.38 1.49 1.38 1.4 ...
##  $ Healthy.life.expectancy: num  0.986 0.996 1.028 1.026 0.999 ...
colSums(is.na(pre19))
##                 Country                  Region                   Score 
##                       0                       7                       0 
##          GDP.per.capita Healthy.life.expectancy 
##                       0                       0
# Countries whose name has no match in the 2015 file end up with an NA Region
# (7 rows above) and are dropped here.
pre19 <- na.omit(pre19)

To standardize the datasets, rename the variables consistently across all 5 datasets

# Give score, GDP and life expectancy the same column names (Score, GDP, Life)
# in every yearly table so the tables can be joined and compared.
fin15 <- pre15 %>% rename(
  Score = Happiness.Score,
  GDP = Economy..GDP.per.Capita.,
  Life = Health..Life.Expectancy.
)
fin16 <- pre16 %>% rename(
  Score = Happiness.Score,
  GDP = Economy..GDP.per.Capita.,
  Life = Health..Life.Expectancy.
)
fin17 <- pre17 %>% rename(
  Score = Happiness.Score,
  GDP = Economy..GDP.per.Capita.,
  Life = Health..Life.Expectancy.
)
# The 2018/2019 tables already name the score column "Score", so only GDP and
# life expectancy need renaming (the former "Score" = "Score" was a no-op).
fin18 <- pre18 %>% rename(
  GDP = GDP.per.capita,
  Life = Healthy.life.expectancy
)
fin19 <- pre19 %>% rename(
  GDP = GDP.per.capita,
  Life = Healthy.life.expectancy
)
# NOTE(review): this prints pre19 (the table before renaming); head(fin19) may
# have been intended - confirm.
head(pre19)
##       Country         Region Score GDP.per.capita Healthy.life.expectancy
## 1     Finland Western Europe 7.769          1.340                   0.986
## 2     Denmark Western Europe 7.600          1.383                   0.996
## 3      Norway Western Europe 7.554          1.488                   1.028
## 4     Iceland Western Europe 7.494          1.380                   1.026
## 5 Netherlands Western Europe 7.488          1.396                   0.999
## 6 Switzerland Western Europe 7.480          1.452                   1.052

Combining datasets for comparison of 5-year scores

# Country-level table of the 2015, 2016 and 2017 scores. After each left join,
# rows containing any NA (e.g. countries absent from the newly joined year)
# are removed before the next join.
pre_happiness <- fin15 %>%
  left_join(fin16, by = "Country") %>%
  na.omit() %>%
  left_join(fin17, by = "Country") %>%
  na.omit() %>%
  select(Country, Region, Score.x, Score.y, Score) %>%
  rename(
    Score_2015 = Score.x,
    Score_2016 = Score.y,
    Score_2017 = Score
  )
## Warning: Column `Country` joining factors with different levels, coercing to
## character vector
# Round the 2017 score to two decimals.
pre_happiness[["Score_2017"]] <- round(pre_happiness[["Score_2017"]], digits = 2)

# Check for missing values in the common data set.

sum(is.na(pre_happiness))
## [1] 0
head(pre_happiness)
##       Country         Region Score_2015 Score_2016 Score_2017
## 1 Switzerland Western Europe      7.587      7.509       7.49
## 2     Iceland Western Europe      7.561      7.501       7.50
## 3     Denmark Western Europe      7.527      7.526       7.52
## 4      Norway Western Europe      7.522      7.498       7.54
## 5      Canada  North America      7.427      7.404       7.32
## 6     Finland Western Europe      7.406      7.413       7.47

Common data set for comparison of Happiness Scores over 5 years

# Extend the 2015-2017 score table with the 2018 and 2019 scores. Rows with any
# NA after a join are removed before the next step.
happiness <- pre_happiness %>%
  left_join(fin18, by = "Country") %>%
  na.omit() %>%
  left_join(fin19, by = "Country") %>%
  na.omit() %>%
  select(
    Country, Region,
    Score_2015, Score_2016, Score_2017,
    Score.x, Score.y
  ) %>%
  rename(
    Score_2018 = Score.x,
    Score_2019 = Score.y
  )

# Round the 2019 score to two decimals.
happiness[["Score_2019"]] <- round(happiness[["Score_2019"]], digits = 2)

# Check for missing values in the common data set.
sum(is.na(happiness))
## [1] 0
head(happiness)
##       Country         Region Score_2015 Score_2016 Score_2017 Score_2018
## 1 Switzerland Western Europe      7.587      7.509       7.49      7.487
## 2     Iceland Western Europe      7.561      7.501       7.50      7.495
## 3     Denmark Western Europe      7.527      7.526       7.52      7.555
## 4      Norway Western Europe      7.522      7.498       7.54      7.594
## 5      Canada  North America      7.427      7.404       7.32      7.328
## 6     Finland Western Europe      7.406      7.413       7.47      7.632
##   Score_2019
## 1       7.48
## 2       7.49
## 3       7.60
## 4       7.55
## 5       7.28
## 6       7.77

Combining the 5 datasets and introducing a new variable for year

# Wide table with one row per Country/Region and the score, GDP and life
# expectancy of every year. First combine 2017, 2016 and 2015 ...
master1 <- fin17 %>%
  inner_join(fin16, by = c("Country", "Region")) %>%
  inner_join(fin15, by = c("Country", "Region"))
## Warning: Column `Country` joining character vector and factor, coercing into
## character vector

## Warning: Column `Country` joining character vector and factor, coercing into
## character vector
# ... then put 2019 and 2018 in front of them.
master <- fin19 %>%
  inner_join(fin18, by = c("Country", "Region")) %>%
  inner_join(master1, by = c("Country", "Region"))
names(master)
##  [1] "Country"   "Region"    "Score.x.x" "GDP.x.x"   "Life.x.x"  "Score.y.x"
##  [7] "GDP.y.x"   "Life.y.x"  "Score.x.y" "GDP.x.y"   "Life.x.y"  "Score.y.y"
## [13] "GDP.y.y"   "Life.y.y"  "Score"     "GDP"       "Life"
# Replace the join suffixes with explicit year suffixes.
finmaster <- rename(
  master,
  "Score_2019" = "Score.x.x", "GDP_2019" = "GDP.x.x", "Life Expectancy_2019" = "Life.x.x",
  "Score_2018" = "Score.y.x", "GDP_2018" = "GDP.y.x", "Life Expectancy_2018" = "Life.y.x",
  "Score_2017" = "Score.x.y", "GDP_2017" = "GDP.x.y", "Life Expectancy_2017" = "Life.x.y",
  "Score_2016" = "Score.y.y", "GDP_2016" = "GDP.y.y", "Life Expectancy_2016" = "Life.y.y",
  "Score_2015" = "Score", "GDP_2015" = "GDP", "Life Expectancy_2015" = "Life"
)
# Keep Country and Region first and reverse the remaining columns so the years
# run from 2015 to 2019.
finmaster <- finmaster[, c(1, 2, 17:3)]
head(finmaster)
##       Country         Region Life Expectancy_2015 GDP_2015 Score_2015
## 1     Finland Western Europe              0.88911  1.29025      7.406
## 2     Denmark Western Europe              0.87464  1.32548      7.527
## 3      Norway Western Europe              0.88521  1.45900      7.522
## 4     Iceland Western Europe              0.94784  1.30232      7.561
## 5 Netherlands Western Europe              0.89284  1.32944      7.378
## 6 Switzerland Western Europe              0.94143  1.39651      7.587
##   Life Expectancy_2016 GDP_2016 Score_2016 Life Expectancy_2017 GDP_2017
## 1              0.81091  1.40598      7.413            0.8091577 1.443572
## 2              0.79504  1.44178      7.526            0.7925655 1.482383
## 3              0.79579  1.57744      7.498            0.7966665 1.616463
## 4              0.86733  1.42666      7.501            0.8335521 1.480633
## 5              0.81231  1.46468      7.339            0.8106961 1.503945
## 6              0.86303  1.52733      7.509            0.8581313 1.564980
##   Score_2017 Life Expectancy_2018 GDP_2018 Score_2018 Life Expectancy_2019
## 1      7.469                0.874    1.305      7.632                0.986
## 2      7.522                0.868    1.351      7.555                0.996
## 3      7.537                0.861    1.456      7.594                1.028
## 4      7.504                0.914    1.343      7.495                1.026
## 5      7.377                0.878    1.361      7.441                0.999
## 6      7.494                0.927    1.420      7.487                1.052
##   GDP_2019 Score_2019
## 1    1.340      7.769
## 2    1.383      7.600
## 3    1.488      7.554
## 4    1.380      7.494
## 5    1.396      7.488
## 6    1.452      7.480

Joining datasets based on 3 variables

# Reshape score, GDP and life expectancy from one column per year to one row
# per Country/Region/Year, then join the three long tables together.
score.df <- happiness %>%
  select(
    Country, Region,
    "2015" = "Score_2015", "2016" = "Score_2016", "2017" = "Score_2017",
    "2018" = "Score_2018", "2019" = "Score_2019"
  )
score.comb <- score.df %>% gather(Year, HappinessScore, 3:7)

GDP.df <- finmaster %>%
  select(
    Country, Region,
    "2015" = "GDP_2015", "2016" = "GDP_2016", "2017" = "GDP_2017",
    "2018" = "GDP_2018", "2019" = "GDP_2019"
  )
GDP.comb <- GDP.df %>% gather(Year, GDP, 3:7)

life.df <- finmaster %>%
  select(
    Country, Region,
    "2015" = "Life Expectancy_2015", "2016" = "Life Expectancy_2016",
    "2017" = "Life Expectancy_2017", "2018" = "Life Expectancy_2018",
    "2019" = "Life Expectancy_2019"
  )
life.comb <- life.df %>% gather(Year, LifeExpectancy, 3:7)

combined <- score.comb %>%
  inner_join(GDP.comb, by = c("Country", "Region", "Year")) %>%
  inner_join(life.comb, by = c("Country", "Region", "Year"))
head(combined)
##       Country         Region Year HappinessScore     GDP LifeExpectancy
## 1 Switzerland Western Europe 2015          7.587 1.39651        0.94143
## 2     Iceland Western Europe 2015          7.561 1.30232        0.94784
## 3     Denmark Western Europe 2015          7.527 1.32548        0.87464
## 4      Norway Western Europe 2015          7.522 1.45900        0.88521
## 5      Canada  North America 2015          7.427 1.32629        0.90563
## 6     Finland Western Europe 2015          7.406 1.29025        0.88911

#Aggregating data by regional affiliation

# Average score, GDP and life expectancy per region for 2015.
# Only the numeric columns are aggregated: passing the whole data frame made
# mean() run on Country and Region too, which only produced all-NA columns and
# an "argument is not numeric or logical" warning for every region.
line15 <- aggregate(
  x = fin15[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin15$Region)),
  FUN = mean
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line15new <- rename(line15, RegionName = Group.1)

# Average score, GDP and life expectancy per region for 2016.
# Only the numeric columns are aggregated: passing the whole data frame made
# mean() run on Country and Region too, which only produced all-NA columns and
# an "argument is not numeric or logical" warning for every region.
line16 <- aggregate(
  x = fin16[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin16$Region)),
  FUN = mean
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line16new <- rename(line16, RegionName = Group.1)

# Average score, GDP and life expectancy per region for 2017.
# Only the numeric columns are aggregated: passing the whole data frame made
# mean() run on Country and Region too, which only produced all-NA columns and
# an "argument is not numeric or logical" warning for every region.
line17 <- aggregate(
  x = fin17[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin17$Region)),
  FUN = mean
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line17new <- rename(line17, RegionName = Group.1)

# Average score, GDP and life expectancy per region for 2018.
# Only the numeric columns are aggregated: passing the whole data frame made
# mean() run on Country and Region too, which only produced all-NA columns and
# an "argument is not numeric or logical" warning for every region.
line18 <- aggregate(
  x = fin18[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin18$Region)),
  FUN = mean
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line18new <- rename(line18, RegionName = Group.1)

# Average score, GDP and life expectancy per region for 2019.
# Only the numeric columns are aggregated: passing the whole data frame made
# mean() run on Country and Region too, which only produced all-NA columns and
# an "argument is not numeric or logical" warning for every region.
line19 <- aggregate(
  x = fin19[, c("Score", "GDP", "Life")],
  by = list(as.factor(fin19$Region)),
  FUN = mean
)
# aggregate() names the grouping column Group.1; give it a meaningful name.
line19new <- rename(line19, RegionName = Group.1)

Exploratory data analysis

#plotting happiness score region wise for year 2015

# Lollipop chart of the 2015 average happiness score per region: one point per
# region with a segment drawn from 0 up to the score.
theme_set(theme_bw())
ggplot(line15new, aes(RegionName, Score)) +
  geom_point(aes(color = RegionName), size = 3) +
  geom_segment(
    aes(x = RegionName, xend = RegionName, y = 0, yend = Score)
  ) +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Happiness Score by Region for 2015")

#Checking change in happiness score

# Regional average happiness score for 2015-2017 in long form
# (RegionName, Year, AverageHappinessScore). After the joins, Score.x comes
# from the 2015 table, Score.y from 2016 and Score from 2017.
overallhap <- line15new %>%
  inner_join(line16new, by = "RegionName") %>%
  inner_join(line17new, by = "RegionName") %>%
  select(RegionName, "2015" = Score.x, "2016" = Score.y, "2017" = Score) %>%
  gather(Year, AverageHappinessScore, 2:4)

# Line chart of the regional average score per year, one line per region.
theme_set(theme_bw())
ggplot(overallhap, aes(Year, AverageHappinessScore, color = RegionName)) +
  geom_line(aes(group = RegionName)) +
  # Labels are drawn with size 0.
  geom_text(aes(label = RegionName), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

head(overallhap)
##                        RegionName Year AverageHappinessScore
## 1       Australia and New Zealand 2015              7.285000
## 2      Central and Eastern Europe 2015              5.332931
## 3                    Eastern Asia 2015              5.626167
## 4     Latin America and Caribbean 2015              6.144682
## 5 Middle East and Northern Africa 2015              5.406900
## 6                   North America 2015              7.273000
# Regional average happiness score for 2018 and 2019 in long form.
# Joining line18new (x) with line19new (y) yields Score.x = 2018 and
# Score.y = 2019. The previous version also joined the 2017 table and then
# labelled Score.x/Score.y/Score as 2017/2018/2019, so the rows kept as
# "2018" actually held 2019 values and the rows kept as "2019" held 2017
# values. 2017 is already part of overallhap, so it is not joined here.
overallhap1 <- inner_join(line18new, line19new, by = "RegionName") %>%
  select(RegionName, "2018" = Score.x, "2019" = Score.y) %>%
  gather(Year, AverageHappinessScore, 2:3)

head(overallhap1)
# Stack 2015-2017 and 2018-2019 into a single five-year table.
totalhapreg <- rbind(overallhap, overallhap1)

##Plotting average happiness score across regions for last 5 years

# Line chart of the regional average score over all years in totalhapreg,
# one line per region.
theme_set(theme_bw())
ggplot(totalhapreg, aes(Year, AverageHappinessScore, color = RegionName)) +
  geom_line(aes(group = RegionName)) +
  # Labels are drawn with size 0.
  geom_text(aes(label = RegionName), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

##Now plotting interactive plot with help of plotly to further explore processed data

#plotly
# Interactive bubble chart: GDP on x, LifeExpectancy on y, one colour per
# Region, marker diameter taken from HappinessScore, Country as hover text.
colors <- c("red", "green", "blue", "purple", "navyblue", "yellow",
            "darkgrey", "orange", "violet", "maroon")
fig <- plot_ly(
  combined,
  x = ~GDP, y = ~LifeExpectancy, text = ~Country,
  type = "scatter", mode = "markers",
  color = ~Region, colors = colors,
  # Range of the bubbles' sizes.
  # NOTE(review): no top-level `size` mapping is supplied (size is set inside
  # `marker`), so `sizes` may have no effect here - confirm.
  sizes = c(1, 50),
  marker = list(size = ~HappinessScore, opacity = 0.75, sizemode = "diameter")
)
# The title previously held the placeholder text 'xxx'; it now names the two
# plotted axes.
fig <- fig %>% layout(title = "GDP vs Life Expectancy",
                      xaxis = list(showgrid = TRUE),
                      yaxis = list(showgrid = TRUE))

fig
#plotly
# Variant of the GDP vs LifeExpectancy bubble chart that scales marker sizes
# through an explicit `sizeref`.
# NOTE(review): `fig` is assigned here but not printed within this block.
colors <- c("red", "green", "blue", "purple", "navyblue", "yellow",
            "darkgrey", "orange", "violet", "maroon")
desired_maximum_marker_size <- 1000
# `[[` extracts the column as a plain vector. The previous `[` form returned
# a one-column data frame, which is not a per-point vector of marker sizes.
your_list_of_size_values <- combined[["HappinessScore"]]
# na.rm = TRUE so a single missing score does not turn sizeref into NA.
# NOTE(review): with these constants sizeref equals max(size); in 'diameter'
# mode that would presumably render the largest bubble at about 1px - confirm
# the intended scaling against the plotly `marker.sizeref` reference.
sizeref <- 1000.0 * max(your_list_of_size_values, na.rm = TRUE) /
  (desired_maximum_marker_size^1)
fig <- plot_ly(
  combined,
  x = ~GDP, y = ~LifeExpectancy, text = ~Country,
  type = "scatter", mode = "markers",
  color = ~Region, colors = colors,
  #Choosing the range of the bubbles' sizes:
  sizes = c(50, 1000),
  marker = list(size = your_list_of_size_values, opacity = 1,
                sizemode = "diameter", sizeref = sizeref)
)
fig <- fig %>% layout(title = "5 year average GDP vs Life Expectancy",
                      xaxis = list(showgrid = TRUE),
                      yaxis = list(showgrid = TRUE))

#Tuning charts

#quick chart plotting
# Scatter of GDP against HappinessScore, coloured by Region. The trace type
# and mode are stated explicitly; they are the values plotly previously had
# to infer ('scatter' / 'markers'), so the chart is the same but the
# "No trace type specified" / "No scatter mode specifed" messages are gone.
fig <- plot_ly(
  data = combined,
  x = ~GDP, y = ~HappinessScore, color = ~Region,
  type = "scatter", mode = "markers"
)

fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

##Plotting chart to see which country is getting happier and which country is getting unhappier.

library(ggplot2)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# One row per fin15 country with its fin15 score (Score.x after the join)
# and its fin19 score (Score.y), renamed to Score_2015 / Score_2019.
happiness.index <- fin15 %>%
  left_join(fin19, by = "Country") %>%
  select(Country, Score_2015 = Score.x, Score_2019 = Score.y)
## Warning: Column `Country` joining factor and character vector, coercing into
## character vector
# Turn Country into a factor whose levels follow the current row order.
happiness.index$Country <- factor(
  happiness.index$Country,
  levels = as.character(happiness.index$Country)
)

# Dumbbell charts of each country's score change between Score_2015 and
# Score_2019. The two chart titles were previously swapped: the chart
# filtered on Score_2019 - Score_2015 > 0 (score went up) was titled
# "Less Happier" and the one filtered on < 0 was titled "Happier".

# Look shared by both charts.
dumbbell_theme <- theme(
  plot.title = element_text(face = "bold"),
  plot.background = element_rect(fill = "cornsilk1"),
  panel.background = element_rect(fill = "cornsilk1"),
  axis.text.x = element_text(size = 9),
  axis.text.y = element_text(size = 6),
  panel.grid.major.y = element_blank(),
  panel.border = element_blank()
)

# Countries whose score rose from 2015 to 2019.
ggplot() +
  geom_dumbbell(
    data = happiness.index %>% filter(Score_2019 - Score_2015 > 0),
    aes(x = Score_2019, xend = Score_2015, y = Country, group = Country),
    color = "coral1", colour_xend = "coral4", size = 1.5
  ) +
  # Dotted guide lines for the ten largest increases. The weighting column
  # is passed to top_n() explicitly instead of relying on "Selecting by a".
  geom_dumbbell(
    data = happiness.index %>%
      mutate(a = Score_2019 - Score_2015) %>%
      arrange(desc(a)) %>%
      top_n(10, a),
    aes(x = Score_2019, xend = Score_2015, y = Country, group = Country),
    color = NA, dot_guide = TRUE, dot_guide_colour = "chocolate1"
  ) +
  labs(x = NULL, y = NULL, title = "Happier Countries: 2015 vs 2019") +
  dumbbell_theme

# Countries whose score fell from 2015 to 2019.
ggplot() +
  geom_dumbbell(
    data = happiness.index %>% filter(Score_2019 - Score_2015 < 0),
    aes(x = Score_2019, xend = Score_2015, y = Country, group = Country),
    color = "coral1", colour_xend = "coral4", size = 1.5
  ) +
  # Dotted guide lines for the ten largest decreases.
  geom_dumbbell(
    data = happiness.index %>%
      mutate(a = Score_2015 - Score_2019) %>%
      arrange(desc(a)) %>%
      top_n(10, a),
    aes(x = Score_2019, xend = Score_2015, y = Country, group = Country),
    color = NA, dot_guide = TRUE, dot_guide_colour = "chocolate1"
  ) +
  labs(x = NULL, y = NULL, title = "Less Happy Countries: 2015 vs 2019") +
  dumbbell_theme

##Plotting data on map

# Each section below matches one finNN table to the rworldmap country
# polygons by country name ("NAME" join on the Country column) and then
# shades the map by the Score column.

#Worldmap2015 happiness scores
world15 <- joinCountryData2Map(
  fin15,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)
## 155 codes from your data successfully matched countries in the map
## 3 codes from your data failed to match with a country code in the map
## 88 codes from the map weren't represented in your data
mapCountryData(
  world15,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2015"
)

#Worldmap2016 happiness scores
world16 <- joinCountryData2Map(
  fin16,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)
## 154 codes from your data successfully matched countries in the map
## 3 codes from your data failed to match with a country code in the map
## 89 codes from the map weren't represented in your data
mapCountryData(
  world16,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2016"
)

#Worldmap2017 happiness scores
world17 <- joinCountryData2Map(
  fin17,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)
## 147 codes from your data successfully matched countries in the map
## 2 codes from your data failed to match with a country code in the map
## 96 codes from the map weren't represented in your data
mapCountryData(
  world17,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2017"
)

#Worldmap2018 happiness scores
world18 <- joinCountryData2Map(
  fin18,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)
## 149 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 94 codes from the map weren't represented in your data
mapCountryData(
  world18,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2018"
)

#Worldmap2019 happiness scores
world19 <- joinCountryData2Map(
  fin19,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)
## 148 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 95 codes from the map weren't represented in your data
mapCountryData(
  world19,
  nameColumnToPlot = "Score",
  mapTitle = "Happiness Scores across the Globe - 2019"
)

##Plotting chart to check changes across 3 variables

#Line chart of variation over five years in happiness score
# Countries whose fin19 Score exceeds 7.25.
c1 <- select(filter(fin19, Score > 7.25), Country)
c2 <- as.list(c1)
#View(c2)

# Rows of `combined` belonging to those countries.
viz1 <- combined %>% filter(Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = HappinessScore, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years in GDP per capita
# Countries whose fin19 GDP value exceeds 1.4.
c1 <- select(filter(fin19, GDP > 1.4), Country)
c2 <- as.list(c1)
#View(c2)

# Rows of `combined` belonging to those countries.
viz1 <- combined %>% filter(Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = GDP, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years in life expectancy
# Countries whose fin19 Life value exceeds 1.028.
c1 <- select(filter(fin19, Life > 1.028), Country)
c2 <- as.list(c1)
#View(c2)

# Rows of `combined` belonging to those countries.
viz1 <- combined %>% filter(Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = LifeExpectancy, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

##Plotting chart to see correlation between variables

#Plotting chart for Happiness score Vs GDP per capita

# Year becomes a factor so that it maps to one discrete colour per year.
combined$Year <- factor(combined$Year)

# The three charts colour by Year, so the viridis palette has to be attached
# to the colour scale in discrete mode. The previous
# scale_fill_viridis(discrete = F) targeted the fill aesthetic, which is not
# mapped in these plots, so the viridis palette was never applied.
ggplot(data = combined, aes(x = GDP, y = HappinessScore, color = Year)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Plotting chart for Happiness score Vs life expectancy

ggplot(data = combined, aes(x = LifeExpectancy, y = HappinessScore, color = Year)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Plotting chart for GDP per capita Vs life expectancy

ggplot(data = combined, aes(x = GDP, y = LifeExpectancy, color = Year)) +
  geom_point(alpha = 0.5) +
  scale_color_viridis(discrete = TRUE) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##Line diagrams for top and bottom performers based on 3 variables

#Top and bottom countries based on happiness score

#Line chart of variation over five years
# Top group: countries whose fin19 Score exceeds 7.25.
c1 <- select(filter(fin19, Score > 7.25), Country)
c2 <- as.list(c1)
#View(c2)

# Rows of `combined` belonging to those countries.
viz1 <- combined %>% filter(Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = HappinessScore, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years
# Bottom group: countries whose fin19 Score is below 3.9.
c1 <- select(filter(fin19, Score < 3.9), Country)
c2 <- as.list(c1)
#View(c2)

# Rows of `combined` belonging to those countries.
viz1 <- combined %>% filter(Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = HappinessScore, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Top and bottom countries based on life expectancy and GDP

#Line chart of variation over five years
# Top group: countries whose fin19 Life value exceeds 1.03.
c1 <- select(filter(fin19, Life > 1.03), Country)
c2 <- as.list(c1)
#View(c2)

# Rows of `combined` belonging to those countries.
viz1 <- combined %>% filter(Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = LifeExpectancy, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Line chart of variation over five years
# Bottom group: countries whose fin19 GDP value is below 0.325.
c1 <- select(filter(fin19, GDP < 0.325), Country)
c2 <- as.list(c1)
#View(c2)

# Rows of `combined` belonging to those countries.
viz1 <- combined %>% filter(Country %in% c2$Country)

ggplot(viz1, aes(x = Year, y = GDP, color = Country)) +
  geom_line(aes(group = Country)) +
  geom_text(aes(label = Country), size = 0) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

#Plotting box plot for happiness score across region

# Interactive box plot of the fin15 Score per Region. Every observation is
# drawn next to its box (boxpoints = "all", shifted by pointpos), the x tick
# labels are hidden and the bottom margin is set to 100.
fin15 %>%
  plot_ly(
    x = ~Region,
    y = ~Score,
    type = "box",
    boxpoints = "all",
    pointpos = -1.8,
    color = ~Region
  ) %>%
  layout(
    xaxis = list(showticklabels = FALSE),
    margin = list(b = 100)
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

#Box plot tuning

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## The following object is masked from 'package:readr':
## 
##     col_factor
# Horizontal box plot of the fin19 Score per Region with the individual
# countries jittered on top (legend hidden), then handed to ggplotly() to
# make it interactive.
box <- ggplot(
  data = fin19,
  mapping = aes(x = Region, y = Score, color = Region)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = Country), size = 0.5) +
  labs(title = "Happiness Score for Regions and Countries") +
  coord_flip() +
  theme(legend.position = "none")
ggplotly(box)

##Plotting data for insights in data

# Two interactive bubble charts over `combined`, coloured by Region and
# sized by HappinessScore, with a custom hover label per point.
# The trace type and mode are stated explicitly; they are the values plotly
# previously inferred ('scatter' / 'markers'), so the charts are unchanged
# but the "No trace type specified" / "No scatter mode specifed" messages
# are no longer printed.

# HappinessScore against LifeExpectancy.
plot1 <- plot_ly(
  combined,
  x = ~HappinessScore,
  y = ~LifeExpectancy,
  type = "scatter",
  mode = "markers",
  color = ~Region,
  colors = c("red", "orange", "yellow", "green", "cyan", "purple",
             "darkgreen", "grey", "gold", "darkblue"),
  size = ~HappinessScore,
  # show only the custom text below when hovering
  hoverinfo = "text",
  text = ~paste("Happiness Score:", HappinessScore,
                "</br>Health Life Expectancy:", LifeExpectancy,
                "</br>Country:", Country,
                "</br>Region:", Region)
) %>%
  layout(xaxis = list(title = "Happiness Score"),
         yaxis = list(title = "Health Life Expectancy"))

# HappinessScore against GDP.
plot2 <- plot_ly(
  combined,
  x = ~HappinessScore,
  y = ~GDP,
  type = "scatter",
  mode = "markers",
  color = ~Region,
  colors = c("red", "orange", "yellow", "green", "cyan", "purple",
             "darkgreen", "grey", "gold", "darkblue"),
  size = ~HappinessScore,
  # show only the custom text below when hovering
  hoverinfo = "text",
  text = ~paste("Happiness Score:", HappinessScore,
                "</br>GDP per capita:", GDP,
                "</br>Country:", Country,
                "</br>Region:", Region)
) %>%
  layout(xaxis = list(title = "Happiness Score"),
         yaxis = list(title = "GDP per capita"))

#Plotting

plot1
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
plot2
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.