EDA on happiness data set

# Silence warnings and messages in every knitted chunk.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# Read the raw table from its location on the author's local disk.
hap <- read.csv("D:\\wallpapers and photos\\2017.csv")

DATA CLEANING

First we will rename some columns to simplify things. Whisker.high and Whisker.low seem unnecessary for this analysis, so we remove them.

library(tidyverse)
# Remove the Whisker.high / Whisker.low columns and shorten three verbose
# column names in a single rename() call.
hap <- hap %>%
  select(-Whisker.high, -Whisker.low) %>%
  rename(
    Economy = Economy..GDP.per.Capita.,
    life.exp = Health..Life.Expectancy.,
    Trust = Trust..Government.Corruption.
  )

MISSING DATA

It's important to check whether any of our data is missing. We can use the vis_miss function and the skim function to spot missing data or outliers.

library(naniar)
library(skimr)
library(knitr)
# Visual check for missing values in hap.
vis_miss(hap)

# Per-column summary of hap (the printed result follows in the document).
skim(hap)

Data summary
Name	hap
Number of rows	155
Number of columns	10
_______________________
Column type frequency:
character	1
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Country	0	1	4	24	0	155	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Happiness.Rank	1	78.00	44.89	1.00	39.50	78.00	116.50	155.00	▇▇▇▇▇
Happiness.Score	1	5.35	1.13	2.69	4.51	5.28	6.10	7.54	▂▆▇▇▅
Economy	1	0.98	0.42	0.00	0.66	1.06	1.32	1.87	▂▅▇▇▂
Family	1	1.19	0.29	0.00	1.04	1.25	1.41	1.61	▁▁▂▇▇
life.exp	1	0.55	0.24	0.00	0.37	0.61	0.72	0.95	▂▃▃▇▅
Freedom	1	0.41	0.15	0.00	0.30	0.44	0.52	0.66	▁▃▅▇▅
Generosity	1	0.25	0.13	0.00	0.15	0.23	0.32	0.84	▅▇▃▁▁
Trust	1	0.12	0.10	0.00	0.06	0.09	0.15	0.46	▇▅▁▁▁
Dystopia.Residual	1	1.85	0.50	0.38	1.59	1.83	2.14	3.12	▁▂▇▅▂

Hey, we have no missing data. lucky…..:)

SIMPLIFYING DATA

Working with so many countries is a hassle, so we will look for a common characteristic in the data; in this instance the countries can be simplified by grouping them by continent.

# Assign every country a continent label.
#
# The labels keep their original spelling ("asia", "africa", "Europe", ...)
# because the filters further down the script match on these exact strings.
#
# Changes from the previous hand-written assignments:
#   * The Asia list now also contains the spellings used by the World
#     Happiness Report files ("Taiwan Province of China",
#     "Hong Kong S.A.R., China", "Palestinian Territories", plus the short
#     forms "Taiwan" and "Hong Kong"). Only "State of Palestine" was listed
#     before, so those rows fell through to the "africa" fallback below.
#   * "Cyprus" used to be listed under both Asia and Europe; the later Europe
#     assignment always won, so it is now listed under Europe only.
continent_members <- list(
  "asia" = c(
    "Afghanistan", "Armenia", "Azerbaijan", "Bahrain", "Bangladesh",
    "Bhutan", "Brunei", "Cambodia", "China", "Georgia", "India",
    "Indonesia", "Iran", "Iraq", "Israel", "Japan", "Jordan", "Kazakhstan",
    "Kuwait", "Kyrgyzstan", "Laos", "Lebanon", "Malaysia", "Maldives",
    "Mongolia", "Myanmar", "Nepal", "North Korea", "Oman", "Pakistan",
    "Philippines", "Qatar", "Saudi Arabia", "Singapore", "South Korea",
    "Sri Lanka", "State of Palestine", "Palestinian Territories", "Syria",
    "Tajikistan", "Thailand", "Timor-Leste", "Turkey", "Turkmenistan",
    "United Arab Emirates", "Uzbekistan", "Vietnam", "Yemen",
    "Taiwan", "Taiwan Province of China",
    "Hong Kong", "Hong Kong S.A.R., China"
  ),
  "Europe" = c(
    "Norway", "Denmark", "Iceland", "Switzerland", "Finland",
    "Netherlands", "Sweden", "Austria", "Ireland", "Germany",
    "Belgium", "Luxembourg", "United Kingdom", "Czech Republic",
    "Malta", "France", "Spain", "Slovakia", "Poland", "Italy",
    "Russia", "Lithuania", "Latvia", "Moldova", "Romania",
    "Slovenia", "North Cyprus", "Cyprus", "Estonia", "Belarus",
    "Serbia", "Hungary", "Croatia", "Kosovo", "Montenegro",
    "Greece", "Portugal", "Bosnia and Herzegovina", "Macedonia",
    "Bulgaria", "Albania", "Ukraine"
  ),
  "North america" = c(
    "Canada", "Costa Rica", "United States", "Mexico",
    "Panama", "Trinidad and Tobago", "El Salvador", "Belize", "Guatemala",
    "Jamaica", "Nicaragua", "Dominican Republic", "Honduras",
    "Haiti"
  ),
  "South america" = c(
    "Chile", "Brazil", "Argentina", "Uruguay",
    "Colombia", "Ecuador", "Bolivia", "Peru",
    "Paraguay", "Venezuela"
  ),
  "Australia" = c("New Zealand", "Australia")
)

# Start with every row unassigned, then fill in one continent at a time.
hap$continent <- NA_character_
for (continent_label in names(continent_members)) {
  in_continent <- hap$Country %in% continent_members[[continent_label]]
  hap$continent[in_continent] <- continent_label
}

# Fallback: any country not listed above is treated as African. A new or
# differently spelled non-African name will therefore land here, so extend
# the lists above whenever the input file changes.
hap$continent[is.na(hap$continent)] <- "africa"

# Re-run the summary to confirm the new column has no missing values.
skim(hap)

Data summary
Name	hap
Number of rows	155
Number of columns	11
_______________________
Column type frequency:
character	2
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Country	0	1	4	24	0	155	0
continent	0	1	4	13	0	6	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Happiness.Rank	1	78.00	44.89	1.00	39.50	78.00	116.50	155.00	▇▇▇▇▇
Happiness.Score	1	5.35	1.13	2.69	4.51	5.28	6.10	7.54	▂▆▇▇▅
Economy	1	0.98	0.42	0.00	0.66	1.06	1.32	1.87	▂▅▇▇▂
Family	1	1.19	0.29	0.00	1.04	1.25	1.41	1.61	▁▁▂▇▇
life.exp	1	0.55	0.24	0.00	0.37	0.61	0.72	0.95	▂▃▃▇▅
Freedom	1	0.41	0.15	0.00	0.30	0.44	0.52	0.66	▁▃▅▇▅
Generosity	1	0.25	0.13	0.00	0.15	0.23	0.32	0.84	▅▇▃▁▁
Trust	1	0.12	0.10	0.00	0.06	0.09	0.15	0.46	▇▅▁▁▁
Dystopia.Residual	1	1.85	0.50	0.38	1.59	1.83	2.14	3.12	▁▂▇▅▂

# Put continent first and store it as a factor. Every other column, Country
# included, is kept in its original order.
happiness <- hap %>%
  select(continent, everything()) %>%
  mutate(continent = as.factor(continent))

CORRELATION:

Now we will look at the correlations in our data. For this, plot_correlation and corrgram are very helpful.

Quick reminder: use order = TRUE and upper.panel = panel.cor for a correct interpretation.

library(corrgram)
library(DataExplorer)
# Correlation heatmap for the whole data set.
plot_correlation(happiness)

# Correlogram for the whole data set, using panel.cor for the upper panel.
corrgram(
  happiness,
  order = TRUE,
  upper.panel = panel.cor,
  main = "overall correlation plot"
)

From the corrgram correlation plot we can see that, across all continents, the happiness score depends on the factors in this order:

economy > life exp > family > freedom > dystopia > trust > generosity

meaning that the better the economy and life expectancy are, the higher the happiness score.

# Correlogram restricted to the rows labelled "asia".
corrgram(
  filter(happiness, continent == "asia"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "corrrelation plot for asia"
)

In Asia the happiness score depends on

economy > family > life exp …

meaning that for Asian people family holds great meaning, as it impacts the happiness score.

# Correlogram restricted to the rows labelled "Europe".
corrgram(
  filter(happiness, continent == "Europe"),
  order = TRUE,
  upper.panel = panel.cor,
  main = " correlation plot for europe"
)

Here the relation between the happiness score and the other factors is

freedom > trust > economy > family > life exp

meaning that people in Europe are free-spirited, and trust is very important for their happiness.

Interesting….

# Correlogram restricted to the rows labelled "South america".
corrgram(
  filter(happiness, continent %in% "South america"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "correlation plot for south america"
)

For the people of South America, the dystopia residual is very important for their happiness.

# Correlogram restricted to the rows labelled "North america".
corrgram(
  filter(happiness, continent == "North america"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "correlation plot for north america"
)

As in Asia, family, economy and life expectancy are very important for happiness in North America. People there also seem to love their freedom.

# Correlogram restricted to the rows labelled "africa".
corrgram(
  filter(happiness, continent == "africa"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "correlation plot for africa"
)

For Africans, economy and life expectancy are the most important; generosity and trust have the least impact on happiness.

  # Correlogram for the "Australia" group, with Country dropped beforehand.
  happiness %>%
    select(-Country) %>%
    filter(continent == "Australia") %>%
    corrgram(
      order = TRUE,
      upper.panel = panel.cor,
      main = "correlation plot for australia"
    )

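This is awkward. The Australia group contains only two countries (Australia and New Zealand), and with two observations every correlation is necessarily +1 or −1; the countries of this continent may also simply have very similar data.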

BARPLOT FOR THEIR AVG DATA:

For the bar plot we will summarise the data by its mean. We group the data by continent and apply summarise(across(everything(), mean)).

Then we melt the data with the reshape2 package so that we get a new data set with one row per variable and value. In ggplot we use stat = "identity" so that both the x and y values are used in the bar plot.

Pretty neat… right?

library(reshape2)
# Average every remaining measure per continent. The mean is wrapped in an
# explicit function because passing extra arguments through across()'s `...`
# (the old `across(everything(), mean, na.rm = TRUE)` form) is deprecated
# since dplyr 1.1.0.
barplot_data <- happiness %>%
  select(-Country, -Happiness.Rank) %>%
  group_by(continent) %>%
  summarise(across(everything(), function(x) mean(x, na.rm = TRUE)))

# Reshape to long format: one row per continent/measure pair. Naming the id
# column explicitly stops melt() from guessing it and printing a message.
barplot_data_melt <- melt(barplot_data, id.vars = "continent")

# One facet per measure, one bar per continent; stat = "identity" draws the
# precomputed means as bar heights.
ggplot(
  barplot_data_melt,
  aes(x = continent, y = value, color = continent, fill = continent)
) +
  geom_bar(stat = "identity") +
  facet_wrap(~variable) +
  theme(
    plot.subtitle = element_text(family = "Bookman", face = "bold"),
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    panel.background = element_rect(fill = "white")
  ) +
  labs(
    title = "BARPLOT for AVG data",
    subtitle = "Barplot for comparing different factors"
  )

Europe, North America and South America have a good happiness score, Asia is decent, and Africa is the lowest.

In terms of economy, Europe is the highest and Africa is the lowest; Asia, North America and South America are pretty decent. The same pattern holds for family.

library(gridExtra)
# From here on, work without the rank column and without the Australia group.
happiness <- happiness %>%
  select(-Happiness.Rank) %>%
  filter(continent != "Australia")

# Box plot of the happiness score per continent.
g1 <- ggplot(
  happiness,
  aes(x = continent, y = Happiness.Score, color = continent, fill = continent)
) +
  geom_boxplot(alpha = 0.6) +
  theme(panel.background = element_rect(fill = NA))

# Violin plot of the same data, with larger x-axis labels and no grid lines.
g2 <- ggplot(
  happiness,
  aes(x = continent, y = Happiness.Score, color = continent, fill = continent)
) +
  geom_violin(alpha = 0.6) +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    axis.text.x = element_text(size = 15),
    panel.background = element_rect(fill = NA)
  )

# Stack the two plots in a single column.
grid.arrange(g1, g2, nrow = 2, ncol = 1)

From the above we can see that two figures can be combined with the grid.arrange function from the gridExtra package.

SCATTER PLOT USING REGRESSION LINE FOR ESTIMATING CORRELATION:

# Economy against happiness score, one facet per continent, each with a
# linear fit. geom_smooth() inherits the colour mapping from ggplot(), so
# only the fill needs to be mapped on the layer.
ggplot(happiness, aes(x = Happiness.Score, y = Economy, color = continent)) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Economy vs Happiness.Score") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank"),
    plot.title = element_text(face = "bold", colour = "dodgerblue4"),
    panel.background = element_rect(fill = "white")
  )

# Family against happiness score, one facet per continent, each with a
# linear fit. The colour mapping is inherited from ggplot().
ggplot(happiness, aes(x = Happiness.Score, y = Family, color = continent)) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Family vs. happiness.score") +
  theme(
    axis.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    panel.background = element_rect(fill = NA)
  )

# Trust against happiness score, one facet per continent, each with a
# linear fit. The colour mapping is inherited from ggplot().
ggplot(happiness, aes(x = Happiness.Score, y = Trust, color = continent)) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Trust vs happiness.Score") +
  theme(
    plot.title = element_text(face = "bold"),
    panel.background = element_rect(fill = NA)
  )

# Freedom against happiness score, one facet per continent, each with a
# linear fit. The colour mapping is inherited from ggplot().
ggplot(happiness, aes(x = Happiness.Score, y = Freedom, color = continent)) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Freedom vs. happiness.score") +
  theme(
    axis.text = element_text(size = 12, face = "bold"),
    panel.background = element_rect(fill = NA)
  )

# life.exp against happiness score, one facet per continent, each with a
# linear fit. The two chained theme() calls are merged into one.
ggplot(happiness, aes(x = Happiness.Score, y = life.exp, color = continent)) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "life.exp vs happiness.score") +
  theme(
    panel.background = element_rect(fill = NA),
    plot.title = element_text(face = "bold")
  )

# Dystopia.Residual against happiness score, one facet per continent, each
# with a linear fit. The two separate theme() calls are merged into one.
ggplot(
  happiness,
  aes(x = Happiness.Score, y = Dystopia.Residual, color = continent)
) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Dystopia va happiness.score") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    axis.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    panel.background = element_rect(fill = NA)
  )

EDA on happiness data set

omon das

2023-03-04