library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Report the current working directory, then point it at the Downloads folder.
getwd()
## [1] "/Users/qmoa_liu/Downloads"
setwd(dir = "/Users/qmoa_liu/Downloads")
# Load the UN country-profile table; column types are guessed by readr.
country_profile <- read_csv(
  file = "/Users/qmoa_liu/Downloads/undata-country-profiles/country_profile_variables.csv"
)
## Warning: Duplicated column names deduplicated: 'Mobile-cellular
## subscriptions (per 100 inhabitants)' => 'Mobile-cellular subscriptions (per
## 100 inhabitants)_1' [41]
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Population in thousands (2017)` = col_double(),
## `Population density (per km2, 2017)` = col_double(),
## `Sex ratio (m per 100 f, 2017)` = col_double(),
## `GDP: Gross domestic product (million current US$)` = col_double(),
## `GDP per capita (current US$)` = col_double(),
## `Economy: Industry (% of GVA)` = col_double(),
## `Economy: Services and other activity (% of GVA)` = col_double(),
## `Agricultural production index (2004-2006=100)` = col_double(),
## `Food production index (2004-2006=100)` = col_double(),
## `Urban population (% of total population)` = col_double(),
## `Health: Total expenditure (% of GDP)` = col_double(),
## `Seats held by women in national parliaments %` = col_double(),
## `Individuals using the Internet (per 100 inhabitants)` = col_double(),
## `CO2 emission estimates (million tons/tons per capita)` = col_double(),
## `Energy production, primary (Petajoules)` = col_double(),
## `Net Official Development Assist. received (% of GNI)` = col_double()
## )
## See spec(...) for full column specifications.
# Keep only the five profile columns used below, picked by position
# (1, 2, 4, 7 and 28), then preview the first rows.
country_profile <- country_profile %>%
  select(1, 2, 4, 7, 28)
country_profile %>% head()
# Load the 2017 World Happiness Report table; column types are guessed by readr.
country_happiness <- read_csv(
  file = "/Users/qmoa_liu/Downloads/world-happiness-report/2017.csv"
)
## Parsed with column specification:
## cols(
## Country = col_character(),
## Happiness.Rank = col_double(),
## Happiness.Score = col_double(),
## Whisker.high = col_double(),
## Whisker.low = col_double(),
## Economy..GDP.per.Capita. = col_double(),
## Family = col_double(),
## Health..Life.Expectancy. = col_double(),
## Freedom = col_double(),
## Generosity = col_double(),
## Trust..Government.Corruption. = col_double(),
## Dystopia.Residual = col_double()
## )
# Keep only the five happiness columns used below, picked by position
# (1, 2, 3, 6 and 7), then preview the first rows.
country_happiness <- country_happiness %>%
  select(1, 2, 3, 6, 7)
country_happiness %>% head()
# Combine the two datasets: every happiness row is kept and the profile
# columns are attached wherever the Country value matches exactly.
country_all <- left_join(
  country_happiness,
  country_profile,
  by = "Country"
)
country_all %>% head()
# Convert Region to a factor with an explicit, fixed level order so regions
# can be compared on happiness. The labels are the level names themselves.
# Any Region value that is not listed in `levels` becomes NA.
country_all$Region <- factor(
  country_all$Region,
  levels = c(
    "SouthernAsia", "SouthernEurope", "NorthernAfrica", "Polynesia",
    "MiddleAfrica", "Caribbean", "SouthAmerica", "WesternAsia",
    "Oceania", "WesternEurope", "EasternEurope", "CentralAmerica",
    "WesternAfrica", "NorthernAmerica", "SouthernAfrica",
    "South-easternAsia", "EasternAfrica", "NorthernEurope",
    "EasternAsia", "Melanesia", "Micronesia", "CentralAsia"
  )
)
# Inspect the structure of the combined table.
str(country_all)
## Classes 'tbl_df', 'tbl' and 'data.frame': 155 obs. of 9 variables:
## $ Country : chr "Norway" "Denmark" "Iceland" "Switzerland" ...
## $ Happiness.Rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Happiness.Score : num 7.54 7.52 7.5 7.49 7.47 ...
## $ Economy..GDP.per.Capita. : num 1.62 1.48 1.48 1.56 1.44 ...
## $ Family : num 1.53 1.55 1.61 1.52 1.54 ...
## $ Region : Factor w/ 22 levels "SouthernAsia",..: 18 18 18 10 18 10 14 9 18 9 ...
## $ Population in thousands (2017) : num 5305 5734 335 8476 5523 ...
## $ GDP: Gross domestic product (million current US$): num 386578 301308 16780 670790 231960 ...
## $ Life expectancy at birth (females/males, years) : chr "83.6/79.5" "82.2/78.1" "83.8/80.6" "84.8/80.5" ...
# Count the missing values in each column of the combined table.
colSums(is.na(country_all))
## Country
## 0
## Happiness.Rank
## 0
## Happiness.Score
## 0
## Economy..GDP.per.Capita.
## 0
## Family
## 0
## Region
## 20
## Population in thousands (2017)
## 20
## GDP: Gross domestic product (million current US$)
## 20
## Life expectancy at birth (females/males, years)
## 20
# Linear (column-major) positions of the missing cells in the combined table.
which(is.na(country_all))
## [1] 789 798 808 824 830 831 833 836 846 853 857 867 869 878
## [15] 883 899 901 903 927 928 944 953 963 979 985 986 988 991
## [29] 1001 1008 1012 1022 1024 1033 1038 1054 1056 1058 1082 1083 1099 1108
## [43] 1118 1134 1140 1141 1143 1146 1156 1163 1167 1177 1179 1188 1193 1209
## [57] 1211 1213 1237 1238 1254 1263 1273 1289 1295 1296 1298 1301 1311 1318
## [71] 1322 1332 1334 1343 1348 1364 1366 1368 1392 1393
# Total number of missing cells in the combined table.
sum(is.na(country_all))
## [1] 80
# Impute missing GDP with the mean of the observed GDP values. GDP is used in
# the later analysis, and imputing keeps every country in the table.
# The result goes into a new column, GDP_imputed; the raw column is untouched.
country_imputed <- country_all %>%
  mutate(
    GDP_imputed = coalesce(
      `GDP: Gross domestic product (million current US$)`,
      mean(`GDP: Gross domestic product (million current US$)`, na.rm = TRUE)
    )
  )
# Draw a boxplot of the imputed GDP and keep the returned statistics so the
# values flagged as outliers can be listed.
boxplot_country <- boxplot(country_imputed[["GDP_imputed"]], na.rm = TRUE)
# Values the boxplot marked as outliers.
boxplot_country[["out"]]
## [1] 1552808 1230859 3363600 2858003 1772591 1140724 2418946
## [8] 1192955 1821580 4383076 11158457 2116239
# Drop the countries whose imputed GDP was flagged as an outlier by the
# boxplot above.
#
# `%in%` tests every row against the whole outlier set. The previous `!=`
# comparison against a literal vector was evaluated element-by-element with
# recycling (hence the "longer object length" warning), so it did not test
# set membership, and the literal list was missing one of the values in
# `boxplot_country$out`. Filtering on GDP_imputed (the column the outliers
# were computed from) also keeps the rows whose GDP was imputed instead of
# silently dropping them through NA comparisons.
country_filter <- country_imputed %>%
  filter(!(GDP_imputed %in% boxplot_country$out))
# Add GDP per 1000 inhabitants (imputed GDP divided by the population in
# thousands) so the figure is easier to interpret.
country_imputed <- mutate(
  country_imputed,
  GDP_per_1k = GDP_imputed / `Population in thousands (2017)`
)
# To see which region is happier, keep only the rows with a known Region and
# summarise the happiness score and GDP per 1000 inhabitants by region.
# NOTE(review): country_filter is reassigned here from country_imputed, so
# whatever was stored in country_filter earlier is replaced - confirm that
# this is intended.
country_filter <- country_imputed %>% drop_na(Region)
summarise(group_by(country_filter, Region), mean(Happiness.Score))
summarise(group_by(country_filter, Region), mean(GDP_per_1k))
# NorthernAmerica has the highest mean happiness score.
# Standardise GDP per 1000 inhabitants (subtract the mean, divide by the
# standard deviation) to make its distribution easier to read. The result is
# printed only; it is not stored.
scale(country_imputed$GDP_per_1k, center = TRUE, scale = TRUE)
## [,1]
## [1,] 3.215097892
## [2,] 2.116862427
## [3,] 1.984030571
## [4,] 3.553893470
## [5,] 1.546817813
## [6,] 1.657283494
## [7,] 1.568419070
## [8,] 1.268584262
## [9,] 1.979976382
## [10,] 1.997555011
## [11,] 1.221478143
## [12,] -0.139446041
## [13,] 1.609074957
## [14,] NA
## [15,] 2.496843152
## [16,] 1.490814178
## [17,] 1.429090715
## [18,] 4.533290235
## [19,] 1.610858698
## [20,] -0.002063619
## [21,] 1.406006313
## [22,] -0.265082732
## [23,] NA
## [24,] 0.049092964
## [25,] -0.245518506
## [26,] 2.048138949
## [27,] 0.499315794
## [28,] 0.112622143
## [29,] -0.518957228
## [30,] -0.035491286
## [31,] 1.288893151
## [32,] -0.413458836
## [33,] NA
## [34,] 0.667967261
## [35,] 2.648611468
## [36,] -0.401090415
## [37,] 0.348920298
## [38,] 0.300655954
## [39,] 0.767408712
## [40,] 0.142845632
## [41,] 0.403834116
## [42,] -0.216483280
## [43,] -0.612463485
## [44,] -0.397151707
## [45,] -0.503754545
## [46,] -0.047385188
## [47,] -0.605921616
## [48,] 0.935529193
## [49,] NA
## [50,] -0.474771158
## [51,] 1.135171400
## [52,] 0.051389250
## [53,] -0.507263131
## [54,] 0.025572149
## [55,] NA
## [56,] NA
## [57,] -0.234101245
## [58,] NA
## [59,] -0.369924530
## [60,] -0.183231314
## [61,] NA
## [62,] 0.388590065
## [63,] -0.402853941
## [64,] -0.231039075
## [65,] 0.173040830
## [66,] 0.203730825
## [67,] -0.411090626
## [68,] -0.430692011
## [69,] -0.242322508
## [70,] -0.502889302
## [71,] NA
## [72,] -0.572146151
## [73,] -0.494348688
## [74,] -0.513809105
## [75,] -0.046227857
## [76,] -0.456094365
## [77,] -0.094840904
## [78,] NA
## [79,] -0.294972672
## [80,] -0.649689174
## [81,] -0.546336562
## [82,] NA
## [83,] -0.377405043
## [84,] -0.571031600
## [85,] -0.431085335
## [86,] -0.385987200
## [87,] 0.220782144
## [88,] -0.277195494
## [89,] 0.318891406
## [90,] -0.472364150
## [91,] -0.603994138
## [92,] NA
## [93,] -0.717060912
## [94,] NA
## [95,] -0.582760513
## [96,] -0.675205812
## [97,] -0.584065809
## [98,] -0.664025235
## [99,] -0.684681696
## [100,] -0.516210500
## [101,] -0.423055880
## [102,] -0.529715801
## [103,] NA
## [104,] -0.547773946
## [105,] -0.349397045
## [106,] -0.690718218
## [107,] -0.658936756
## [108,] NA
## [109,] -0.509919721
## [110,] -0.658958279
## [111,] -0.477722010
## [112,] -0.653841289
## [113,] -0.695807936
## [114,] -0.659390774
## [115,] -0.676297978
## [116,] -0.655582169
## [117,] -0.490898577
## [118,] -0.356242207
## [119,] -0.691926157
## [120,] -0.509703789
## [121,] -0.528584503
## [122,] -0.637380039
## [123,] -0.661364022
## [124,] NA
## [125,] -0.529866861
## [126,] NA
## [127,] -0.684596678
## [128,] NA
## [129,] -0.661831612
## [130,] -0.616723510
## [131,] -0.653139652
## [132,] -0.612046558
## [133,] -0.690901525
## [134,] -0.691621354
## [135,] -0.704802841
## [136,] -0.704145443
## [137,] -0.686475042
## [138,] -0.677357152
## [139,] -0.674181431
## [140,] -0.508761118
## [141,] -0.691946047
## [142,] -0.383473723
## [143,] -0.681791673
## [144,] -0.702194212
## [145,] -0.680940852
## [146,] -0.665985611
## [147,] -0.666196817
## [148,] -0.699330479
## [149,] -0.685062467
## [150,] -0.694460135
## [151,] -0.686938336
## [152,] NA
## [153,] NA
## [154,] -0.709171352
## [155,] -0.703834669
## attr(,"scaled:center")
## [1] 13.37499
## attr(,"scaled:scale")
## [1] 18.50504
Firstly, I combine the two datasets by country. This makes it easier to explore what the happiness ranking indicator means. At this point, I subset some useful variables from the country_happiness dataset and convert the region variable to a factor to see which region has the highest happiness score. After that, in order to understand the GDP indicator relative to the happiness ranking, I transform it into Z-scores. In addition, some handling of missing values and outliers is also included in this assignment. To sum up, we can say that NorthernAmerica is the happiest region, with around 42.4 million USD of GDP per 1000 people.