# Silence warnings and messages in every knitted chunk.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)

# Load the happiness data set from the 2017 CSV file.
# NOTE(review): absolute, machine-specific path -- confirm before running
# this on another computer.
hap <- read.csv("D:\\wallpapers and photos\\2017.csv")
DATA CLEANING
First we will rename some columns to simpler names. The Whisker.high and Whisker.low columns do not seem useful for this analysis, so we remove them.
library(tidyverse)
# Drop the two whisker columns and give the long column names short aliases.
hap <- hap %>%
  select(-Whisker.high, -Whisker.low) %>%
  rename(
    Economy = Economy..GDP.per.Capita.,
    life.exp = Health..Life.Expectancy.,
    Trust = Trust..Government.Corruption.
  )
MISSING DATA
It is important to check whether any of our data are missing. We can use the vis_miss function (naniar) and the skim function (skimr) to spot missing values and possible outliers.
library(naniar)
library(skimr)
library(knitr)
# Visualise which cells of `hap` are missing (vis_miss comes from naniar).
vis_miss(hap)
# Print a per-column summary of `hap` (skim comes from skimr); the summary
# includes n_missing and complete_rate for every column.
skim(hap)
| Name | hap |
| Number of rows | 155 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1 | 4 | 24 | 0 | 155 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Happiness.Rank | 0 | 1 | 78.00 | 44.89 | 1.00 | 39.50 | 78.00 | 116.50 | 155.00 | ▇▇▇▇▇ |
| Happiness.Score | 0 | 1 | 5.35 | 1.13 | 2.69 | 4.51 | 5.28 | 6.10 | 7.54 | ▂▆▇▇▅ |
| Economy | 0 | 1 | 0.98 | 0.42 | 0.00 | 0.66 | 1.06 | 1.32 | 1.87 | ▂▅▇▇▂ |
| Family | 0 | 1 | 1.19 | 0.29 | 0.00 | 1.04 | 1.25 | 1.41 | 1.61 | ▁▁▂▇▇ |
| life.exp | 0 | 1 | 0.55 | 0.24 | 0.00 | 0.37 | 0.61 | 0.72 | 0.95 | ▂▃▃▇▅ |
| Freedom | 0 | 1 | 0.41 | 0.15 | 0.00 | 0.30 | 0.44 | 0.52 | 0.66 | ▁▃▅▇▅ |
| Generosity | 0 | 1 | 0.25 | 0.13 | 0.00 | 0.15 | 0.23 | 0.32 | 0.84 | ▅▇▃▁▁ |
| Trust | 0 | 1 | 0.12 | 0.10 | 0.00 | 0.06 | 0.09 | 0.15 | 0.46 | ▇▅▁▁▁ |
| Dystopia.Residual | 0 | 1 | 1.85 | 0.50 | 0.38 | 1.59 | 1.83 | 2.14 | 3.12 | ▁▂▇▅▂ |
Hey, we have no missing data. Lucky… :)
SIMPLIFYING DATA
Working with so many individual countries is a hassle, so we will group them by a common characteristic: in this instance, the countries can be simplified to their continent.
# Map every country to a continent label stored in hap$continent.
#
# The labels keep their original spelling and case ("asia", "Europe",
# "North america", "South america", "Australia", "africa") because later
# chunks filter on these exact strings.
#
# Any country that is not listed below falls through to "africa", so every
# non-African country must appear in one of the vectors.
# NOTE(review): "Hong Kong S.A.R., China", "Taiwan Province of China" and
# "Palestinian Territories" were added because the World Happiness Report
# uses those spellings; previously such rows would have been labelled
# "africa". Confirm the exact names against the CSV.
asia_countries <- c(
  "Afghanistan", "Armenia", "Azerbaijan", "Bahrain", "Bangladesh", "Bhutan",
  "Brunei", "Cambodia", "China", "Georgia", "Hong Kong S.A.R., China",
  "India", "Indonesia", "Iran", "Iraq", "Israel", "Japan", "Jordan",
  "Kazakhstan", "Kuwait", "Kyrgyzstan", "Laos", "Lebanon", "Malaysia",
  "Maldives", "Mongolia", "Myanmar", "Nepal", "North Korea", "Oman",
  "Pakistan", "Palestinian Territories", "Philippines", "Qatar",
  "Saudi Arabia", "Singapore", "South Korea", "Sri Lanka",
  "State of Palestine", "Syria", "Taiwan Province of China", "Tajikistan",
  "Thailand", "Timor-Leste", "Turkey", "Turkmenistan",
  "United Arab Emirates", "Uzbekistan", "Vietnam", "Yemen"
)

# "Cyprus" used to be listed under Asia as well, but the Europe assignment
# ran afterwards and overwrote it; it is now listed once, under Europe.
europe_countries <- c(
  "Norway", "Denmark", "Iceland", "Switzerland", "Finland", "Netherlands",
  "Sweden", "Austria", "Ireland", "Germany", "Belgium", "Luxembourg",
  "United Kingdom", "Czech Republic", "Malta", "France", "Spain",
  "Slovakia", "Poland", "Italy", "Russia", "Lithuania", "Latvia",
  "Moldova", "Romania", "Slovenia", "North Cyprus", "Cyprus", "Estonia",
  "Belarus", "Serbia", "Hungary", "Croatia", "Kosovo", "Montenegro",
  "Greece", "Portugal", "Bosnia and Herzegovina", "Macedonia", "Bulgaria",
  "Albania", "Ukraine"
)

north_america_countries <- c(
  "Canada", "Costa Rica", "United States", "Mexico", "Panama",
  "Trinidad and Tobago", "El Salvador", "Belize", "Guatemala", "Jamaica",
  "Nicaragua", "Dominican Republic", "Honduras", "Haiti"
)

south_america_countries <- c(
  "Chile", "Brazil", "Argentina", "Uruguay", "Colombia", "Ecuador",
  "Bolivia", "Peru", "Paraguay", "Venezuela"
)

australia_countries <- c("New Zealand", "Australia")

# Start from a character NA so the column has the right type from the start.
hap$continent <- NA_character_

# `%in%` never returns NA, so the logical mask can index directly.
hap$continent[hap$Country %in% asia_countries] <- "asia"
hap$continent[hap$Country %in% europe_countries] <- "Europe"
hap$continent[hap$Country %in% north_america_countries] <- "North america"
hap$continent[hap$Country %in% south_america_countries] <- "South america"
hap$continent[hap$Country %in% australia_countries] <- "Australia"

# Everything still unassigned is treated as African.
hap$continent[is.na(hap$continent)] <- "africa"

# Summarise again to confirm the new column is complete.
skim(hap)
| Name | hap |
| Number of rows | 155 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1 | 4 | 24 | 0 | 155 | 0 |
| continent | 0 | 1 | 4 | 13 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Happiness.Rank | 0 | 1 | 78.00 | 44.89 | 1.00 | 39.50 | 78.00 | 116.50 | 155.00 | ▇▇▇▇▇ |
| Happiness.Score | 0 | 1 | 5.35 | 1.13 | 2.69 | 4.51 | 5.28 | 6.10 | 7.54 | ▂▆▇▇▅ |
| Economy | 0 | 1 | 0.98 | 0.42 | 0.00 | 0.66 | 1.06 | 1.32 | 1.87 | ▂▅▇▇▂ |
| Family | 0 | 1 | 1.19 | 0.29 | 0.00 | 1.04 | 1.25 | 1.41 | 1.61 | ▁▁▂▇▇ |
| life.exp | 0 | 1 | 0.55 | 0.24 | 0.00 | 0.37 | 0.61 | 0.72 | 0.95 | ▂▃▃▇▅ |
| Freedom | 0 | 1 | 0.41 | 0.15 | 0.00 | 0.30 | 0.44 | 0.52 | 0.66 | ▁▃▅▇▅ |
| Generosity | 0 | 1 | 0.25 | 0.13 | 0.00 | 0.15 | 0.23 | 0.32 | 0.84 | ▅▇▃▁▁ |
| Trust | 0 | 1 | 0.12 | 0.10 | 0.00 | 0.06 | 0.09 | 0.15 | 0.46 | ▇▅▁▁▁ |
| Dystopia.Residual | 0 | 1 | 1.85 | 0.50 | 0.38 | 1.59 | 1.83 | 2.14 | 3.12 | ▁▂▇▅▂ |
# Build the analysis table: continent becomes a factor and moves to the
# first column. Country is kept -- in the original selection the `-Country`
# term was immediately undone by `everything()`, and later chunks drop
# Country themselves.
happiness <- hap %>%
  mutate(continent = as.factor(continent)) %>%
  select(continent, everything())
CORRELATION:
Now we will look at the correlations in our data; plot_correlation and corrgram are very helpful for this.
Quick reminder: use order = TRUE and upper.panel = panel.cor for an easier interpretation.
library(corrgram)
library(DataExplorer)
# Correlation overview of the whole table (plot_correlation is from
# DataExplorer).
plot_correlation(happiness)

# Correlogram over all continents, with panel.cor in the upper triangle.
corrgram(
  happiness,
  order = TRUE,
  upper.panel = panel.cor,
  main = "overall correlation plot"
)
From the corrgram correlation plot we can see that, across all continents, the happiness score relates to the other variables in this order:
economy > life exp > family > freedom > dystopia > trust > generosity
meaning that the better the economy is, the better the life expectancy is, and the higher the happiness score.
# Correlogram restricted to the rows labelled "asia".
# The plot title previously read "corrrelation"; the typo is fixed here.
corrgram(
  filter(happiness, continent == "asia"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "correlation plot for asia"
)
In Asia, the happiness score depends on
economy > family > life exp …
meaning that for Asian people family holds great importance, as it strongly impacts the happiness score.
# Correlogram restricted to the rows labelled "Europe".
corrgram(
  filter(happiness, continent == "Europe"),
  order = TRUE,
  upper.panel = panel.cor,
  main = " correlation plot for europe"
)
Here the happiness score relates to the other variables in this order:
freedom > trust > economy > family > life exp
meaning that the people of Europe are free-spirited, and trust is very important for their happiness.
Interesting…
# Correlogram restricted to the rows labelled "South america".
corrgram(
  filter(happiness, continent == "South america"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "correlation plot for south america"
)
For the people of South America, the dystopia residual is very important for their happiness.
# Correlogram restricted to the rows labelled "North america".
corrgram(
  filter(happiness, continent == "North america"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "correlation plot for north america"
)
As in Asia, for the people of North America family, economy, and life expectancy are very important for their happiness. They also seem to love their freedom.
# Correlogram restricted to the rows labelled "africa".
corrgram(
  filter(happiness, continent == "africa"),
  order = TRUE,
  upper.panel = panel.cor,
  main = "correlation plot for africa"
)
For Africans, economy and life expectancy are the most important factors; generosity and trust have the least impact on happiness.
# Correlogram for the rows labelled "Australia", with Country dropped first.
happiness %>%
  select(-Country) %>%
  filter(continent == "Australia") %>%
  corrgram(
    order = TRUE,
    upper.panel = panel.cor,
    main = "correlation plot for australia"
  )
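This plot looks awkward, most likely because the Australia group contains only two countries (Australia and New Zealand); with only two data points, every pairwise correlation is exactly ±1.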
BARPLOT FOR THEIR AVG DATA:
For the bar plot we summarise each variable by its mean: we group the data by continent and call summarise(across(everything(), mean)).
Then we melt the result with the reshape2 package to get a long data set holding every variable and its value. In ggplot we use stat = "identity" so that the bar heights are taken from the y values rather than from row counts.
Pretty neat… right?
library(reshape2)
# Mean of every numeric measure per continent (Country and Happiness.Rank
# are not meaningful to average, so they are dropped first).
# The mean is wrapped in an explicit function because passing extra
# arguments such as `na.rm` through across()'s `...` is deprecated in
# dplyr >= 1.1.
barplot_data <- happiness %>%
  select(-Country, -Happiness.Rank) %>%
  group_by(continent) %>%
  summarise(across(everything(), function(x) mean(x, na.rm = TRUE)))

# Long format: one row per (continent, variable, value). Naming the id
# column explicitly avoids melt() guessing it.
barplot_data_melt <- melt(barplot_data, id.vars = "continent")

# One facet per variable, one bar per continent. geom_col() draws bars whose
# heights are the y values (same as geom_bar(stat = "identity")).
ggplot(
  barplot_data_melt,
  aes(x = continent, y = value, color = continent, fill = continent)
) +
  geom_col() +
  facet_wrap(~variable) +
  theme(
    plot.subtitle = element_text(family = "Bookman", face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white")
  ) +
  labs(
    title = "BARPLOT for AVG data",
    subtitle = "Barplot for comparing different factors"
  )
Europe, North America, and South America have good happiness scores, Asia is decent, and Africa is the lowest.
In terms of economy, Europe is the highest and Africa is the lowest; Asia and North and South America are fairly decent. The same pattern holds for family.
library(gridExtra)
# From here on, work without the "Australia" rows and without the rank
# column.
happiness <- happiness %>%
  select(-Happiness.Rank) %>%
  filter(continent != "Australia")

# Box plot of the happiness score per continent.
g1 <- ggplot(
  happiness,
  aes(continent, Happiness.Score, colour = continent, fill = continent)
) +
  geom_boxplot(alpha = 0.6) +
  theme(panel.background = element_rect(fill = NA))

# Violin plot of the same data, without grid lines and with larger x labels.
g2 <- ggplot(
  happiness,
  aes(continent, Happiness.Score, colour = continent, fill = continent)
) +
  geom_violin(alpha = 0.6) +
  theme(
    panel.background = element_rect(fill = NA),
    axis.text.x = element_text(size = 15),
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank")
  )

# Stack the two plots vertically.
grid.arrange(g1, g2, ncol = 1, nrow = 2)
As shown above, two figures can be combined into a single layout with the grid.arrange function from the gridExtra package.
SCATTER PLOT USING REGRESSION LINE FOR ESTIMATING CORRELATION:
# Economy against happiness score, one facet per continent, each with a
# linear-model fit. The smooth layer inherits the colour mapping from the
# plot, so only `fill` has to be mapped there.
ggplot(happiness, aes(Happiness.Score, Economy, colour = continent)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Economy vs Happiness.Score") +
  theme(
    plot.title = element_text(face = "bold", colour = "dodgerblue4"),
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_line(linetype = "blank"),
    panel.grid.minor = element_line(linetype = "blank")
  )
# Family against happiness score, one facet per continent, each with a
# linear-model fit. The smooth layer inherits the colour mapping from the
# plot, so only `fill` has to be mapped there.
ggplot(happiness, aes(Happiness.Score, Family, colour = continent)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Family vs. happiness.score") +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text = element_text(face = "bold"),
    panel.background = element_rect(fill = NA)
  )
# Trust against happiness score, one facet per continent, each with a
# linear-model fit. The smooth layer inherits the colour mapping from the
# plot, so only `fill` has to be mapped there.
ggplot(happiness, aes(Happiness.Score, Trust, colour = continent)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Trust vs happiness.Score") +
  theme(
    panel.background = element_rect(fill = NA),
    plot.title = element_text(face = "bold")
  )
# Freedom against happiness score, one facet per continent, each with a
# linear-model fit. The smooth layer inherits the colour mapping from the
# plot, so only `fill` has to be mapped there.
ggplot(happiness, aes(Happiness.Score, Freedom, colour = continent)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "Freedom vs. happiness.score") +
  theme(
    panel.background = element_rect(fill = NA),
    axis.text = element_text(size = 12, face = "bold")
  )
# Life expectancy against happiness score, one facet per continent, each
# with a linear-model fit. The smooth layer inherits the colour mapping from
# the plot, so only `fill` has to be mapped there.
ggplot(happiness, aes(Happiness.Score, life.exp, colour = continent)) +
  geom_point(alpha = 0.7, size = 2.5) +
  geom_smooth(aes(fill = continent), method = "lm", fullrange = TRUE) +
  facet_wrap(~continent) +
  labs(title = "life.exp vs happiness.score") +
  theme(
    panel.background = element_rect(fill = NA),
    plot.title = element_text(face = "bold")
  )
# Dystopia residual against happiness score, one facet per continent, each
# with a linear-model fit.
# The title previously read "Dystopia va happiness.score"; the typo is fixed.
# The two theme() calls are merged into one, and the smooth layer relies on
# the colour mapping inherited from the plot.
ggplot(
  happiness,
  aes(x = Happiness.Score, y = Dystopia.Residual, color = continent)
) +
  geom_point(size = 2.5, alpha = 0.7) +
  geom_smooth(aes(fill = continent), fullrange = TRUE, method = "lm") +
  facet_wrap(~continent) +
  labs(title = "Dystopia vs happiness.score") +
  theme(
    panel.grid.major = element_line(linetype = "blank"),
    axis.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    panel.background = element_rect(fill = NA)
  )