DATA 606 Data Project Proposal

Data Preparation

# load data
library(tidyverse)
# Build the analysis table in a single pipeline:
#   1. read the 2017 World Happiness CSV,
#   2. shorten the three awkward column names,
#   3. drop the columns this proposal does not use,
#   4. add a Continent factor derived from the country name,
#   5. move Country and Continent to the front.
happiness_rank <-
  read.csv("https://raw.githubusercontent.com/saayedalam/Data/master/happiness_rank_2017.csv") %>%
  dplyr::rename(
    Life.Expectancy = Health..Life.Expectancy.,
    Economy = Economy..GDP.per.Capita.,
    Trust = Trust..Government.Corruption.
  ) %>%
  select(-c(Whisker.high, Whisker.low, Dystopia.Residual, Happiness.Rank)) %>%
  mutate(
    # Only the non-African continents are enumerated; every country that is
    # not matched by one of the lists below falls through to "Africa".
    Continent = as.factor(case_when(
      Country %in% c(
        "Israel", "United Arab Emirates", "Singapore", "Thailand",
        "Taiwan Province of China", "Qatar", "Saudi Arabia", "Kuwait",
        "Bahrain", "Malaysia", "Uzbekistan", "Japan", "South Korea",
        "Turkmenistan", "Kazakhstan", "Turkey", "Hong Kong S.A.R., China",
        "Philippines", "Jordan", "China", "Pakistan", "Indonesia",
        "Azerbaijan", "Lebanon", "Vietnam", "Tajikistan", "Bhutan",
        "Kyrgyzstan", "Nepal", "Mongolia", "Palestinian Territories", "Iran",
        "Bangladesh", "Myanmar", "Iraq", "Sri Lanka", "Armenia", "India",
        "Georgia", "Cambodia", "Afghanistan", "Yemen", "Syria"
      ) ~ "Asia",
      Country %in% c(
        "Norway", "Denmark", "Iceland", "Switzerland", "Finland",
        "Netherlands", "Sweden", "Austria", "Ireland", "Germany", "Belgium",
        "Luxembourg", "United Kingdom", "Czech Republic", "Malta", "France",
        "Spain", "Slovakia", "Poland", "Italy", "Russia", "Lithuania",
        "Latvia", "Moldova", "Romania", "Slovenia", "North Cyprus", "Cyprus",
        "Estonia", "Belarus", "Serbia", "Hungary", "Croatia", "Kosovo",
        "Montenegro", "Greece", "Portugal", "Bosnia and Herzegovina",
        "Macedonia", "Bulgaria", "Albania", "Ukraine"
      ) ~ "Europe",
      Country %in% c(
        "Canada", "Costa Rica", "United States", "Mexico", "Panama",
        "Trinidad and Tobago", "El Salvador", "Belize", "Guatemala",
        "Jamaica", "Nicaragua", "Dominican Republic", "Honduras", "Haiti"
      ) ~ "North America",
      Country %in% c(
        "Chile", "Brazil", "Argentina", "Uruguay", "Colombia", "Ecuador",
        "Bolivia", "Peru", "Paraguay", "Venezuela"
      ) ~ "South America",
      Country %in% c("New Zealand", "Australia") ~ "Australia",
      TRUE ~ "Africa"
    ))
  ) %>%
  select(Country, Continent, everything())

# Print a compact column-by-column overview of the prepared table.
happiness_rank %>%
  glimpse()

## Observations: 155
## Variables: 9
## $ Country         <fct> Norway, Denmark, Iceland, Switzerland, Finland...
## $ Continent       <fct> Europe, Europe, Europe, Europe, Europe, Europe...
## $ Happiness.Score <dbl> 7.537, 7.522, 7.504, 7.494, 7.469, 7.377, 7.31...
## $ Economy         <dbl> 1.616463, 1.482383, 1.480633, 1.564980, 1.4435...
## $ Family          <dbl> 1.533524, 1.551122, 1.610574, 1.516912, 1.5402...
## $ Life.Expectancy <dbl> 0.7966665, 0.7925655, 0.8335521, 0.8581313, 0....
## $ Freedom         <dbl> 0.6354226, 0.6260067, 0.6271626, 0.6200706, 0....
## $ Generosity      <dbl> 0.36201224, 0.35528049, 0.47554022, 0.29054928...
## $ Trust           <dbl> 0.31596383, 0.40077007, 0.15352656, 0.36700729...

Research question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Which variable (family, life expectancy, economy, generosity, trust in government, or freedom) best predicts a country's happiness score?

Cases

What are the cases, and how many are there?
Each case represents a country and its happiness score and happiness rank. There are 155 cases in the dataset.

Data collection

Describe the method of data collection.
The data were collected by the Sustainable Development Solutions Network and are hosted on Kaggle.

Type of study

What type of study is this (observational/experiment)?
This is an observational study because the data were collected by observing countries as they are; no treatment was assigned or controlled by the researcher.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.
The following link has the dataset and readme file that describes all the variables. https://www.kaggle.com/unsdsn/world-happiness

Dependent Variable

What is the response variable? Is it quantitative or qualitative?
The response variable is the happiness score of a country, which will be modeled using six explanatory variables. It is quantitative, i.e., the variable is numerical.

Independent Variable

What is the explanatory variable? Is it quantitative or qualitative?
I have six variables to work with. I will find the variable that is most effective at predicting happiness score. One of the following will be my explanatory variable: family, life expectancy, economy, generosity, trust in government or freedom. All these variables are numerical.

Relevant summary statistics

Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

library(Hmisc)
# Summary statistics for every numeric measure. The two identifier columns are
# removed first so that only the score and its six candidate predictors are
# described. The pipe is kept so the data is passed to describe() as `.`.
happiness_rank %>%
  dplyr::select(-c(Country, Continent)) %>%
  Hmisc::describe()

## . 
## 
##  7  Variables      155  Observations
## ---------------------------------------------------------------------------
## Happiness.Score 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      155        0      151        1    5.354    1.301    3.574    3.800 
##      .25      .50      .75      .90      .95 
##    4.506    5.279    6.102    6.927    7.293 
## 
## lowest : 2.693 2.905 3.349 3.462 3.471, highest: 7.469 7.494 7.504 7.522 7.537
## ---------------------------------------------------------------------------
## Economy 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      155        0      155        1   0.9847   0.4802   0.2415   0.3687 
##      .25      .50      .75      .90      .95 
##   0.6634   1.0646   1.3180   1.4860   1.5479 
## 
## lowest : 0.00000000 0.02264318 0.09162257 0.09210235 0.11904179
## highest: 1.62634337 1.63295245 1.69227767 1.74194360 1.87076569
## ---------------------------------------------------------------------------
## Family 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      155        0      155        1    1.189   0.3106   0.6213   0.7814 
##      .25      .50      .75      .90      .95 
##   1.0426   1.2539   1.4143   1.4856   1.5215 
## 
## lowest : 0.0000000 0.3961026 0.4318825 0.4352998 0.5125688
## highest: 1.5481951 1.5489691 1.5511216 1.5582311 1.6105740
## ---------------------------------------------------------------------------
## Life.Expectancy 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      155        0      155        1   0.5513   0.2677   0.1118   0.1925 
##      .25      .50      .75      .90      .95 
##   0.3699   0.6060   0.7230   0.8273   0.8448 
## 
## lowest : 0.000000000 0.005564754 0.018772686 0.041134715 0.048642170
## highest: 0.888960600 0.900214076 0.913475871 0.943062425 0.949492395
## ---------------------------------------------------------------------------
## Freedom 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      155        0      155        1   0.4088   0.1691   0.1179   0.2007 
##      .25      .50      .75      .90      .95 
##   0.3037   0.4375   0.5166   0.5874   0.6133 
## 
## lowest : 0.00000000 0.01499586 0.03036986 0.05990075 0.08153944
## highest: 0.62600672 0.62716264 0.63337582 0.63542259 0.65824866
## ---------------------------------------------------------------------------
## Generosity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      155        0      155        1   0.2469   0.1482  0.05149  0.08534 
##      .25      .50      .75      .90      .95 
##  0.15411  0.23154  0.32376  0.42829  0.48970 
## 
## lowest : 0.00000000 0.01016466 0.02880684 0.03220996 0.04378538
## highest: 0.50000513 0.57212311 0.57473058 0.61170459 0.83807516
## ---------------------------------------------------------------------------
## Trust 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      155        0      155        1   0.1231   0.1047  0.02072  0.03213 
##      .25      .50      .75      .90      .95 
##  0.05727  0.08985  0.15330  0.28256  0.33724 
## 
## lowest : 0.000000000 0.004387901 0.008964816 0.010091286 0.011051531
## highest: 0.384398729 0.400770068 0.439299256 0.455220014 0.464307785
## ---------------------------------------------------------------------------

# Draw one scatter plot per candidate explanatory variable: the predictor on
# the x axis, Happiness.Score on the y axis, with a least-squares line
# (method = "lm") and no confidence band (se = FALSE).
#
# The six plots differ only in the x variable, so they are produced by a single
# loop instead of six copy-pasted pipelines. The order matches the original
# sequence of plots.
for (predictor in c("Economy", "Family", "Life.Expectancy",
                    "Freedom", "Generosity", "Trust")) {
  # ggplot objects are not auto-printed inside a loop, so print() is explicit.
  print(
    ggplot(happiness_rank, aes(x = .data[[predictor]], y = Happiness.Score)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE) +
      # Looking the column up by name would otherwise change the axis title;
      # set it back to the bare column name.
      labs(x = predictor)
  )
}