Required packages

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units

Data

# Show the current working directory, then set it to the downloads folder.
getwd()
## [1] "/Users/qmoa_liu/Downloads"
setwd("/Users/qmoa_liu/Downloads")
# Load the country-profile table from its absolute path.
country_profile <- read_csv(
  file = "/Users/qmoa_liu/Downloads/undata-country-profiles/country_profile_variables.csv"
)
## Warning: Duplicated column names deduplicated: 'Mobile-cellular
## subscriptions (per 100 inhabitants)' => 'Mobile-cellular subscriptions (per
## 100 inhabitants)_1' [41]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Population in thousands (2017)` = col_double(),
##   `Population density (per km2, 2017)` = col_double(),
##   `Sex ratio (m per 100 f, 2017)` = col_double(),
##   `GDP: Gross domestic product (million current US$)` = col_double(),
##   `GDP per capita (current US$)` = col_double(),
##   `Economy: Industry (% of GVA)` = col_double(),
##   `Economy: Services and other activity (% of GVA)` = col_double(),
##   `Agricultural production index (2004-2006=100)` = col_double(),
##   `Food production index (2004-2006=100)` = col_double(),
##   `Urban population (% of total population)` = col_double(),
##   `Health: Total expenditure (% of GDP)` = col_double(),
##   `Seats held by women in national parliaments %` = col_double(),
##   `Individuals using the Internet (per 100 inhabitants)` = col_double(),
##   `CO2 emission estimates (million tons/tons per capita)` = col_double(),
##   `Energy production, primary (Petajoules)` = col_double(),
##   `Net Official Development Assist. received (% of GNI)` = col_double()
## )
## See spec(...) for full column specifications.
# Keep only columns 1, 2, 4, 7 and 28 of the profile table and preview the
# first rows.
country_profile <- country_profile[, c(1, 2, 4, 7, 28)]
head(country_profile)
# Load the 2017 happiness table from its absolute path.
country_happiness <- read_csv(
  file = "/Users/qmoa_liu/Downloads/world-happiness-report/2017.csv"
)
## Parsed with column specification:
## cols(
##   Country = col_character(),
##   Happiness.Rank = col_double(),
##   Happiness.Score = col_double(),
##   Whisker.high = col_double(),
##   Whisker.low = col_double(),
##   Economy..GDP.per.Capita. = col_double(),
##   Family = col_double(),
##   Health..Life.Expectancy. = col_double(),
##   Freedom = col_double(),
##   Generosity = col_double(),
##   Trust..Government.Corruption. = col_double(),
##   Dystopia.Residual = col_double()
## )
# Keep the country key plus the rank, score, GDP-per-capita and family columns.
# Columns are selected by name (as listed in the parsed column specification)
# instead of by position, so the selection survives a change in column order.
country_happiness <- country_happiness %>%
  dplyr::select(
    Country,
    Happiness.Rank,
    Happiness.Score,
    Economy..GDP.per.Capita.,
    Family
  )
head(country_happiness)

# Combine the two datasets: keep every happiness row and attach the matching
# profile columns by country name. Happiness rows whose country name has no
# match in the profile table get NA in the profile columns.
country_all <- country_happiness %>%
  left_join(country_profile, by = "Country")

head(country_all)

Understand

# Turn Region into a factor with an explicit level order so that happiness can
# be compared across regions. Labels default to the level names, so they are
# not spelled out a second time.
country_all$Region <- factor(
  country_all$Region,
  levels = c(
    "SouthernAsia", "SouthernEurope", "NorthernAfrica", "Polynesia",
    "MiddleAfrica", "Caribbean", "SouthAmerica", "WesternAsia", "Oceania",
    "WesternEurope", "EasternEurope", "CentralAmerica", "WesternAfrica",
    "NorthernAmerica", "SouthernAfrica", "South-easternAsia", "EasternAfrica",
    "NorthernEurope", "EasternAsia", "Melanesia", "Micronesia", "CentralAsia"
  )
)

# Inspect the structure of the combined table.
str(country_all)
## Classes 'tbl_df', 'tbl' and 'data.frame':    155 obs. of  9 variables:
##  $ Country                                          : chr  "Norway" "Denmark" "Iceland" "Switzerland" ...
##  $ Happiness.Rank                                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Happiness.Score                                  : num  7.54 7.52 7.5 7.49 7.47 ...
##  $ Economy..GDP.per.Capita.                         : num  1.62 1.48 1.48 1.56 1.44 ...
##  $ Family                                           : num  1.53 1.55 1.61 1.52 1.54 ...
##  $ Region                                           : Factor w/ 22 levels "SouthernAsia",..: 18 18 18 10 18 10 14 9 18 9 ...
##  $ Population in thousands (2017)                   : num  5305 5734 335 8476 5523 ...
##  $ GDP: Gross domestic product (million current US$): num  386578 301308 16780 670790 231960 ...
##  $ Life expectancy at birth (females/males, years)  : chr  "83.6/79.5" "82.2/78.1" "83.8/80.6" "84.8/80.5" ...

Scan I (Missing value)

# Count the missing values in each column.
colSums(is.na(country_all))
##                                           Country 
##                                                 0 
##                                    Happiness.Rank 
##                                                 0 
##                                   Happiness.Score 
##                                                 0 
##                          Economy..GDP.per.Capita. 
##                                                 0 
##                                            Family 
##                                                 0 
##                                            Region 
##                                                20 
##                    Population in thousands (2017) 
##                                                20 
## GDP: Gross domestic product (million current US$) 
##                                                20 
##   Life expectancy at birth (females/males, years) 
##                                                20
# List the positions (column-major linear indices) of the missing cells.
which(is.na(country_all))
##  [1]  789  798  808  824  830  831  833  836  846  853  857  867  869  878
## [15]  883  899  901  903  927  928  944  953  963  979  985  986  988  991
## [29] 1001 1008 1012 1022 1024 1033 1038 1054 1056 1058 1082 1083 1099 1108
## [43] 1118 1134 1140 1141 1143 1146 1156 1163 1167 1177 1179 1188 1193 1209
## [57] 1211 1213 1237 1238 1254 1263 1273 1289 1295 1296 1298 1301 1311 1318
## [71] 1322 1332 1334 1343 1348 1364 1366 1368 1392 1393
# Total number of missing cells in the table.
sum(is.na(country_all))
## [1] 80
# Impute missing GDP values with the mean of the observed values so that every
# row is kept for the later analysis. The NA test uses the column of the piped
# data rather than a lookup on the global `country_all`, so the expression
# stays correct if the pipeline input changes; `TRUE` replaces the
# reassignable shorthand `T`.
country_imputed <- country_all %>%
  mutate(
    GDP_imputed = ifelse(
      is.na(`GDP: Gross domestic product (million current US$)`),
      mean(`GDP: Gross domestic product (million current US$)`, na.rm = TRUE),
      `GDP: Gross domestic product (million current US$)`
    )
  )

Scan II (Outlier)

# Explore the GDP outliers flagged by the boxplot and exclude them.
boxplot_country <- boxplot(country_imputed$GDP_imputed, na.rm = TRUE)

boxplot_country$out
##  [1]  1552808  1230859  3363600  2858003  1772591  1140724  2418946
##  [8]  1192955  1821580  4383076 11158457  2116239
# Drop every row whose GDP is one of the flagged values. `%in%` tests set
# membership. The earlier `!=` against a hand-typed vector compared values
# element-wise with recycling (which raised a length warning) and the typed
# list omitted the last flagged value, 2116239. Taking the values directly
# from `boxplot_country$out` keeps the filter in sync with the boxplot.
# The filter runs on GDP_imputed, the same column the boxplot was drawn from,
# so rows with an imputed GDP are kept.
# NOTE(review): `country_filter` is reassigned from `country_imputed` in the
# "Transform & analyze" section, so this outlier-free table is not used there.
country_filter <- country_imputed %>%
  filter(!(GDP_imputed %in% boxplot_country$out))

Tidy & Manipulate Data

# Add GDP per 1000 people (imputed GDP divided by the population in thousands)
# for easier interpretation.
country_imputed <- mutate(
  country_imputed,
  GDP_per_1k = GDP_imputed / `Population in thousands (2017)`
)

Transform & analyze

# To see which region is happier, drop the rows that have no Region and
# average the happiness score and the GDP per 1000 people within each region.

country_filter <- filter(country_imputed, !is.na(Region))

summarise(group_by(country_filter, Region), mean(Happiness.Score))
summarise(group_by(country_filter, Region), mean(GDP_per_1k))
# Author's observation: NorthernAmerica has the highest mean happiness score.

# Standardise GDP per 1000 people (centre on the mean, divide by the standard
# deviation) so that its distribution is easier to read.
scale(country_imputed$GDP_per_1k, center = TRUE, scale = TRUE)
##                [,1]
##   [1,]  3.215097892
##   [2,]  2.116862427
##   [3,]  1.984030571
##   [4,]  3.553893470
##   [5,]  1.546817813
##   [6,]  1.657283494
##   [7,]  1.568419070
##   [8,]  1.268584262
##   [9,]  1.979976382
##  [10,]  1.997555011
##  [11,]  1.221478143
##  [12,] -0.139446041
##  [13,]  1.609074957
##  [14,]           NA
##  [15,]  2.496843152
##  [16,]  1.490814178
##  [17,]  1.429090715
##  [18,]  4.533290235
##  [19,]  1.610858698
##  [20,] -0.002063619
##  [21,]  1.406006313
##  [22,] -0.265082732
##  [23,]           NA
##  [24,]  0.049092964
##  [25,] -0.245518506
##  [26,]  2.048138949
##  [27,]  0.499315794
##  [28,]  0.112622143
##  [29,] -0.518957228
##  [30,] -0.035491286
##  [31,]  1.288893151
##  [32,] -0.413458836
##  [33,]           NA
##  [34,]  0.667967261
##  [35,]  2.648611468
##  [36,] -0.401090415
##  [37,]  0.348920298
##  [38,]  0.300655954
##  [39,]  0.767408712
##  [40,]  0.142845632
##  [41,]  0.403834116
##  [42,] -0.216483280
##  [43,] -0.612463485
##  [44,] -0.397151707
##  [45,] -0.503754545
##  [46,] -0.047385188
##  [47,] -0.605921616
##  [48,]  0.935529193
##  [49,]           NA
##  [50,] -0.474771158
##  [51,]  1.135171400
##  [52,]  0.051389250
##  [53,] -0.507263131
##  [54,]  0.025572149
##  [55,]           NA
##  [56,]           NA
##  [57,] -0.234101245
##  [58,]           NA
##  [59,] -0.369924530
##  [60,] -0.183231314
##  [61,]           NA
##  [62,]  0.388590065
##  [63,] -0.402853941
##  [64,] -0.231039075
##  [65,]  0.173040830
##  [66,]  0.203730825
##  [67,] -0.411090626
##  [68,] -0.430692011
##  [69,] -0.242322508
##  [70,] -0.502889302
##  [71,]           NA
##  [72,] -0.572146151
##  [73,] -0.494348688
##  [74,] -0.513809105
##  [75,] -0.046227857
##  [76,] -0.456094365
##  [77,] -0.094840904
##  [78,]           NA
##  [79,] -0.294972672
##  [80,] -0.649689174
##  [81,] -0.546336562
##  [82,]           NA
##  [83,] -0.377405043
##  [84,] -0.571031600
##  [85,] -0.431085335
##  [86,] -0.385987200
##  [87,]  0.220782144
##  [88,] -0.277195494
##  [89,]  0.318891406
##  [90,] -0.472364150
##  [91,] -0.603994138
##  [92,]           NA
##  [93,] -0.717060912
##  [94,]           NA
##  [95,] -0.582760513
##  [96,] -0.675205812
##  [97,] -0.584065809
##  [98,] -0.664025235
##  [99,] -0.684681696
## [100,] -0.516210500
## [101,] -0.423055880
## [102,] -0.529715801
## [103,]           NA
## [104,] -0.547773946
## [105,] -0.349397045
## [106,] -0.690718218
## [107,] -0.658936756
## [108,]           NA
## [109,] -0.509919721
## [110,] -0.658958279
## [111,] -0.477722010
## [112,] -0.653841289
## [113,] -0.695807936
## [114,] -0.659390774
## [115,] -0.676297978
## [116,] -0.655582169
## [117,] -0.490898577
## [118,] -0.356242207
## [119,] -0.691926157
## [120,] -0.509703789
## [121,] -0.528584503
## [122,] -0.637380039
## [123,] -0.661364022
## [124,]           NA
## [125,] -0.529866861
## [126,]           NA
## [127,] -0.684596678
## [128,]           NA
## [129,] -0.661831612
## [130,] -0.616723510
## [131,] -0.653139652
## [132,] -0.612046558
## [133,] -0.690901525
## [134,] -0.691621354
## [135,] -0.704802841
## [136,] -0.704145443
## [137,] -0.686475042
## [138,] -0.677357152
## [139,] -0.674181431
## [140,] -0.508761118
## [141,] -0.691946047
## [142,] -0.383473723
## [143,] -0.681791673
## [144,] -0.702194212
## [145,] -0.680940852
## [146,] -0.665985611
## [147,] -0.666196817
## [148,] -0.699330479
## [149,] -0.685062467
## [150,] -0.694460135
## [151,] -0.686938336
## [152,]           NA
## [153,]           NA
## [154,] -0.709171352
## [155,] -0.703834669
## attr(,"scaled:center")
## [1] 13.37499
## attr(,"scaled:scale")
## [1] 18.50504

Executive Summary

First, I combine the two datasets by matching on country. This makes it easier to explore what the happiness ranking indicator means. At this point, I subset some useful variables from the country_happiness dataset and convert the region variable to a factor to see which region has the highest happiness score. After that, in order to understand the GDP indicator relative to the happiness ranking, I transform it into Z-scores. In addition, some handling of missing values and outliers is also included in this assignment. To sum up, we could say NorthernAmerica is the happiest region, with around 42.4 million USD of GDP per 1000 people.