# load data
library(tidyverse)
# Read the 2017 happiness table and clean it in a single pipeline:
#   1. shorten the three long column names,
#   2. drop the whisker, dystopia-residual and rank columns,
#   3. add a Continent factor derived from the country name,
#   4. move Country and Continent to the front.
happiness_rank <-
  read.csv(
    "https://raw.githubusercontent.com/saayedalam/Data/master/happiness_rank_2017.csv"
  ) %>%
  dplyr::rename(
    Life.Expectancy = Health..Life.Expectancy.,
    Economy = Economy..GDP.per.Capita.,
    Trust = Trust..Government.Corruption.
  ) %>%
  select(-c(Whisker.high, Whisker.low, Dystopia.Residual, Happiness.Rank)) %>%
  mutate(
    # Branches are tested top to bottom. A country that appears in none of
    # the explicit lists falls through to the last branch and is labelled
    # "Africa", so a misspelled name would silently end up there.
    Continent = factor(case_when(
      Country %in% c(
        "Israel", "United Arab Emirates", "Singapore", "Thailand",
        "Taiwan Province of China", "Qatar", "Saudi Arabia", "Kuwait",
        "Bahrain", "Malaysia", "Uzbekistan", "Japan", "South Korea",
        "Turkmenistan", "Kazakhstan", "Turkey", "Hong Kong S.A.R., China",
        "Philippines", "Jordan", "China", "Pakistan", "Indonesia",
        "Azerbaijan", "Lebanon", "Vietnam", "Tajikistan", "Bhutan",
        "Kyrgyzstan", "Nepal", "Mongolia", "Palestinian Territories", "Iran",
        "Bangladesh", "Myanmar", "Iraq", "Sri Lanka", "Armenia", "India",
        "Georgia", "Cambodia", "Afghanistan", "Yemen", "Syria"
      ) ~ "Asia",
      Country %in% c(
        "Norway", "Denmark", "Iceland", "Switzerland", "Finland",
        "Netherlands", "Sweden", "Austria", "Ireland", "Germany", "Belgium",
        "Luxembourg", "United Kingdom", "Czech Republic", "Malta", "France",
        "Spain", "Slovakia", "Poland", "Italy", "Russia", "Lithuania",
        "Latvia", "Moldova", "Romania", "Slovenia", "North Cyprus", "Cyprus",
        "Estonia", "Belarus", "Serbia", "Hungary", "Croatia", "Kosovo",
        "Montenegro", "Greece", "Portugal", "Bosnia and Herzegovina",
        "Macedonia", "Bulgaria", "Albania", "Ukraine"
      ) ~ "Europe",
      Country %in% c(
        "Canada", "Costa Rica", "United States", "Mexico", "Panama",
        "Trinidad and Tobago", "El Salvador", "Belize", "Guatemala",
        "Jamaica", "Nicaragua", "Dominican Republic", "Honduras", "Haiti"
      ) ~ "North America",
      Country %in% c(
        "Chile", "Brazil", "Argentina", "Uruguay", "Colombia", "Ecuador",
        "Bolivia", "Peru", "Paraguay", "Venezuela"
      ) ~ "South America",
      Country %in% c("New Zealand", "Australia") ~ "Australia",
      TRUE ~ "Africa"
    ))
  ) %>%
  select(Country, Continent, everything())

# Print a compact column-by-column overview of the cleaned data.
glimpse(happiness_rank)
## Observations: 155
## Variables: 9
## $ Country <fct> Norway, Denmark, Iceland, Switzerland, Finland...
## $ Continent <fct> Europe, Europe, Europe, Europe, Europe, Europe...
## $ Happiness.Score <dbl> 7.537, 7.522, 7.504, 7.494, 7.469, 7.377, 7.31...
## $ Economy <dbl> 1.616463, 1.482383, 1.480633, 1.564980, 1.4435...
## $ Family <dbl> 1.533524, 1.551122, 1.610574, 1.516912, 1.5402...
## $ Life.Expectancy <dbl> 0.7966665, 0.7925655, 0.8335521, 0.8581313, 0....
## $ Freedom <dbl> 0.6354226, 0.6260067, 0.6271626, 0.6200706, 0....
## $ Generosity <dbl> 0.36201224, 0.35528049, 0.47554022, 0.29054928...
## $ Trust <dbl> 0.31596383, 0.40077007, 0.15352656, 0.36700729...
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Which variable (family, life expectancy, economy, generosity, trust in government, or freedom) best predicts a country's happiness score?
What are the cases, and how many are there?
Each case represents a country, along with its happiness score and happiness rank. There are 155 cases in the dataset.
Describe the method of data collection.
The data come from the Sustainable Development Solutions Network and are hosted on Kaggle.
What type of study is this (observational/experiment)?
This is an observational study: the data were recorded as they occurred, with no treatment assigned and no variable manipulated by the researchers.
If you collected the data, state self-collected. If not, provide a citation/link.
The following link has the dataset and readme file that describes all the variables. https://www.kaggle.com/unsdsn/world-happiness
What is the response variable? Is it quantitative or qualitative?
The response variable is the happiness score of a country based on seven explanatory variables. It is quantitative, i.e., the variable is numerical.
What is the explanatory variable? Is it quantitative or qualitative?
I have six variables to work with. I will find the variable that is most effective at predicting happiness score. One of the following will be my explanatory variable: family, life expectancy, economy, generosity, trust in government or freedom. All these variables are numerical.
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
library(Hmisc)
# Summary statistics for the numeric columns only: the two identifier
# columns are dropped before the data is handed to Hmisc::describe().
# The data is piped in (rather than passed as a nested call) so the
# expression describe() receives stays the same as before.
happiness_rank %>%
  select(-c(Country, Continent)) %>%
  Hmisc::describe()
## .
##
## 7 Variables 155 Observations
## ---------------------------------------------------------------------------
## Happiness.Score
## n missing distinct Info Mean Gmd .05 .10
## 155 0 151 1 5.354 1.301 3.574 3.800
## .25 .50 .75 .90 .95
## 4.506 5.279 6.102 6.927 7.293
##
## lowest : 2.693 2.905 3.349 3.462 3.471, highest: 7.469 7.494 7.504 7.522 7.537
## ---------------------------------------------------------------------------
## Economy
## n missing distinct Info Mean Gmd .05 .10
## 155 0 155 1 0.9847 0.4802 0.2415 0.3687
## .25 .50 .75 .90 .95
## 0.6634 1.0646 1.3180 1.4860 1.5479
##
## lowest : 0.00000000 0.02264318 0.09162257 0.09210235 0.11904179
## highest: 1.62634337 1.63295245 1.69227767 1.74194360 1.87076569
## ---------------------------------------------------------------------------
## Family
## n missing distinct Info Mean Gmd .05 .10
## 155 0 155 1 1.189 0.3106 0.6213 0.7814
## .25 .50 .75 .90 .95
## 1.0426 1.2539 1.4143 1.4856 1.5215
##
## lowest : 0.0000000 0.3961026 0.4318825 0.4352998 0.5125688
## highest: 1.5481951 1.5489691 1.5511216 1.5582311 1.6105740
## ---------------------------------------------------------------------------
## Life.Expectancy
## n missing distinct Info Mean Gmd .05 .10
## 155 0 155 1 0.5513 0.2677 0.1118 0.1925
## .25 .50 .75 .90 .95
## 0.3699 0.6060 0.7230 0.8273 0.8448
##
## lowest : 0.000000000 0.005564754 0.018772686 0.041134715 0.048642170
## highest: 0.888960600 0.900214076 0.913475871 0.943062425 0.949492395
## ---------------------------------------------------------------------------
## Freedom
## n missing distinct Info Mean Gmd .05 .10
## 155 0 155 1 0.4088 0.1691 0.1179 0.2007
## .25 .50 .75 .90 .95
## 0.3037 0.4375 0.5166 0.5874 0.6133
##
## lowest : 0.00000000 0.01499586 0.03036986 0.05990075 0.08153944
## highest: 0.62600672 0.62716264 0.63337582 0.63542259 0.65824866
## ---------------------------------------------------------------------------
## Generosity
## n missing distinct Info Mean Gmd .05 .10
## 155 0 155 1 0.2469 0.1482 0.05149 0.08534
## .25 .50 .75 .90 .95
## 0.15411 0.23154 0.32376 0.42829 0.48970
##
## lowest : 0.00000000 0.01016466 0.02880684 0.03220996 0.04378538
## highest: 0.50000513 0.57212311 0.57473058 0.61170459 0.83807516
## ---------------------------------------------------------------------------
## Trust
## n missing distinct Info Mean Gmd .05 .10
## 155 0 155 1 0.1231 0.1047 0.02072 0.03213
## .25 .50 .75 .90 .95
## 0.05727 0.08985 0.15330 0.28256 0.33724
##
## lowest : 0.000000000 0.004387901 0.008964816 0.010091286 0.011051531
## highest: 0.384398729 0.400770068 0.439299256 0.455220014 0.464307785
## ---------------------------------------------------------------------------
# One scatter plot per candidate predictor, each with Happiness.Score on the
# y-axis and a straight least-squares line (no confidence band) on top.

# Economy
ggplot(happiness_rank, aes(x = Economy, y = Happiness.Score)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

# Family
ggplot(happiness_rank, aes(x = Family, y = Happiness.Score)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

# Life expectancy
ggplot(happiness_rank, aes(x = Life.Expectancy, y = Happiness.Score)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

# Freedom
ggplot(happiness_rank, aes(x = Freedom, y = Happiness.Score)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

# Generosity
ggplot(happiness_rank, aes(x = Generosity, y = Happiness.Score)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")

# Trust in government
ggplot(happiness_rank, aes(x = Trust, y = Happiness.Score)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm")