# Importing Data
library("readxl")
happy = read_excel("C:/Users/hp/Documents/datasets/datasets/Global/happiness.xlsx", sheet = "world")
happy$continent = factor(happy$continent) # Changes continent into a factor
happy$happiness2022_measure = factor(happy$happiness2022_measure)
happy$income_measure = factor(happy$income_measure)
str(happy) # checks the data type per variable
## tibble [146 x 10] (S3: tbl_df/tbl/data.frame)
## $ country : chr [1:146] "Finland" "Denmark" "Iceland" "Switzerland" ...
## $ continent : Factor w/ 8 levels "Africa","Asia",..: 4 4 7 7 4 4 4 7 2 6 ...
## $ happiness2022_measure: Factor w/ 3 levels "Fairly Happy",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ income_measure : Factor w/ 5 levels "Fairly Rich",..: 4 5 5 5 5 5 5 5 4 4 ...
## $ happiness2022 : num [1:146] 7.82 7.64 7.56 7.51 7.41 ...
## $ happiness2021 : num [1:146] 7.84 7.62 7.55 7.57 7.46 ...
## $ happiness2020 : num [1:146] 7.81 7.65 7.5 7.56 7.45 ...
## $ pop_millions : num [1:146] 5.531 5.831 0.366 8.637 17.441 ...
## $ income_thousands : num [1:146] 48.8 61.1 59.3 87.1 52.4 ...
## $ lifeexp : num [1:146] 82.1 81.6 83.1 83.1 81.4 ...
summary(happy) # summary statistics per variable
## country continent happiness2022_measure
## Length:146 Africa :41 Fairly Happy :55
## Class :character Asia :38 Somewhat Happy:78
## Mode :character European Union :26 Unhappy :13
## Rest of Europe :17
## South America :10
## Central America and Caribbean: 9
## (Other) : 5
## income_measure happiness2022 happiness2021 happiness2020
## Fairly Rich :38 Min. :2.404 Min. :2.523 Min. :2.567
## Middle Income:50 1st Qu.:4.876 1st Qu.:4.853 1st Qu.:4.755
## Poor :21 Median :5.553 Median :5.505 Median :5.527
## Rich :26 Mean :5.527 Mean :5.534 Mean :5.508
## Very Rich :11 3rd Qu.:6.285 3rd Qu.:6.247 3rd Qu.:6.250
## Max. :7.821 Max. :7.842 Max. :7.809
##
## pop_millions income_thousands lifeexp
## Min. : 0.3664 Min. : 0.000 Min. :54.51
## 1st Qu.: 5.3993 1st Qu.: 1.854 1st Qu.:67.62
## Median : 11.7458 Median : 5.290 Median :74.62
## Mean : 51.1032 Mean : 14.795 Mean :73.22
## 3rd Qu.: 37.6907 3rd Qu.: 19.899 3rd Qu.:78.32
## Max. :1410.9294 Max. :116.356 Max. :85.39
##
par(mfrow = c(1,1)) # Number of plots in rows and columns;
# Histograms and qqplots of the quantitative variables
hist(happy$happiness2022)

qqnorm(happy$happiness2022)

hist(happy$pop_millions)

qqnorm(happy$pop_millions)

hist(happy$income_thousands)

qqnorm(happy$income_thousands)

hist(happy$lifeexp)

qqnorm(happy$lifeexp)

# Shapiro tests of Normality
shapiro.test(happy$happiness2022)
##
## Shapiro-Wilk normality test
##
## data: happy$happiness2022
## W = 0.99001, p-value = 0.387
shapiro.test(happy$happiness2021)
##
## Shapiro-Wilk normality test
##
## data: happy$happiness2021
## W = 0.99078, p-value = 0.4576
shapiro.test(happy$happiness2020)
##
## Shapiro-Wilk normality test
##
## data: happy$happiness2020
## W = 0.98872, p-value = 0.2863
shapiro.test(happy$pop_millions)
##
## Shapiro-Wilk normality test
##
## data: happy$pop_millions
## W = 0.25867, p-value < 2.2e-16
shapiro.test(happy$income_thousands)
##
## Shapiro-Wilk normality test
##
## data: happy$income_thousands
## W = 0.70948, p-value = 1.211e-15
shapiro.test(happy$lifeexp)
##
## Shapiro-Wilk normality test
##
## data: happy$lifeexp
## W = 0.95239, p-value = 6.589e-05
# calculate skewness
library(moments)
skewness(happy$happiness2022)
## [1] -0.2344808
skewness(happy$happiness2021)
## [1] -0.1045487
skewness(happy$happiness2020)
## [1] -0.06276648
skewness(happy$income_thousands)
## [1] 2.112674
skewness(happy$pop_millions)
## [1] 7.25343
skewness(happy$lifeexp)
## [1] -0.5723455
# State count in each continent
counts = sort(table(happy$continent), decreasing = TRUE) # Number of countries in each continent
percentages = 100 * counts / length(happy$continent)
barplot(percentages, ylab = "Percentage", col = "lightblue")
text(x=seq(0.7, 5, 1.2), 2, paste("n=", counts)) # Add count to each bar
#Chi-Square Test
library("MASS")
# Create a data frame from the main data set.
happy.data = data.frame(happy$happiness2022_measure, happy$income_measure)
# Create a table with the needed variables.
happy.data = table(happy$happiness2022_measure, happy$income_measure)
happy.data
##
## Fairly Rich Middle Income Poor Rich Very Rich
## Fairly Happy 15 5 0 24 11
## Somewhat Happy 22 40 14 2 0
## Unhappy 1 5 7 0 0
# Perform a Chi-Square test.
print(chisq.test(happy.data))
## Warning in chisq.test(happy.data): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: happy.data
## X-squared = 92.666, df = 8, p-value < 2.2e-16
# Lollipop plot of the Happiness Index in each country
library(ggplot2)

ggplot(happy, aes(x = country, y = happiness2022)) +
geom_point(size = 3, color = "red") +
geom_segment(aes(x = country, xend = country, y = 0, yend = happiness2022)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) # Rotate axis label

# Ridge Diagrams
library(ggridges)
ggplot(happy, aes(x = happiness2022, y = continent, fill = continent)) +
geom_density_ridges() +
theme_ridges() + # No color on backgroud
theme(legend.position = "none", # No show legend
axis.title.x = element_text(hjust = 0.5), # x axis title in the center
axis.title.y = element_text(hjust = 0.5)) # y axis title in the center
## Picking joint bandwidth of 0.257

# Correlation diagram of the numeric variables
hap = happy[, 5:10] # Take numeric variables as goal matrix
hap
## # A tibble: 146 x 6
## happiness2022 happiness2021 happiness2020 pop_millions income_thousands
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7.82 7.84 7.81 5.53 48.8
## 2 7.64 7.62 7.65 5.83 61.1
## 3 7.56 7.55 7.50 0.366 59.3
## 4 7.51 7.57 7.56 8.64 87.1
## 5 7.41 7.46 7.45 17.4 52.4
## 6 7.40 7.32 7.24 0.632 116.
## 7 7.38 7.36 7.35 10.4 52.3
## 8 7.37 7.39 7.49 5.38 67.3
## 9 7.36 7.16 7.13 9.22 44.2
## 10 7.20 7.28 7.3 5.08 41.4
## # ... with 136 more rows, and 1 more variable: lifeexp <dbl>
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
library(corrplot)
## corrplot 0.92 loaded
corMatrix = cor(as.matrix(hap)) # Calculate correlation matrix
col = colorRampPalette(c("red", "yellow", "blue")) # 3 colors to represent coefficients -1 to 1.
corrplot.mixed(corMatrix, order = "AOE", lower = "number", lower.col = "black",
number.cex = .8, upper = "ellipse", upper.col = col(10),
diag = "u", tl.pos = "lt", tl.col = "black") # Mix plots of "number" and "ellipse"

# Cluster dendrogram for numeric variables
plot(hclust(as.dist(1 - cor(as.matrix(hap))))) # Hierarchical clustering

# density plot showing the distribution of Happiness by income
ggplot(happy, aes(x = happiness2022, fill = income_measure)) + geom_density(alpha = 0.9)

# Box plot of gdp by continent
happy$gdp = (happy$pop_millions*1000000*happy$income_thousands*1000)/1000000000
boxplot(happy$gdp ~ happy$continent, xlab = "Continent", ylab = "GDP in Dollars (Billions)")

# ANOVA test of gdp by continent
gdp_differences <- aov(happy$gdp ~ happy$continent, happy)
summary(gdp_differences) #the ANOVA test shows a significant difference in the gdp of the different continents
## Df Sum Sq Mean Sq F value Pr(>F)
## happy$continent 7 176258115 25179731 6.754 6.88e-07 ***
## Residuals 138 514495892 3728231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatterplot for Income and Happiness Index by Life Expectancy
ggplot(happy, aes(x = income_thousands, y = happiness2022)) +
geom_point(aes(size = lifeexp, color = continent)) +
geom_smooth(method = 'lm',formula = y ~ x) # Add regression line

# Happiness distribution by income
ggplot(happy, aes(x = income_measure, y = happiness2022, fill = income_measure)) +
geom_violin(trim = FALSE) +
geom_boxplot(width = 0.1)

# Relationship between Happiness, Income, Population and Life Expectancy
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# group Pop into Pop size first
happy.pop_millions = happy %>% mutate(PopSize = factor(ifelse(pop_millions < 1, "Below 1M",
ifelse(pop_millions < 20 & pop_millions >= 1, "1M to 19M",
ifelse(pop_millions < 50 & pop_millions >= 20, "20M to 49M",
ifelse(pop_millions < 100 & pop_millions >= 50, "50M to 99M",
"100M and Above"))))))
ggplot(happy.pop_millions, aes(x = income_thousands, y = happiness2022)) +
geom_point(aes(shape = PopSize, color = continent, size = lifeexp)) +
geom_smooth(method = 'lm', formula = y ~ x)

# Segment diagram for all countries
row.names(hap) <- happy$country
## Warning: Setting row names on a tibble is deprecated.
stars(hap, key.loc = c(13, 1.5), draw.segments = T)

# Heat map for whole happy data set
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
hap.matrix = as.matrix(hap) # Transfer the data frame to matrix
s = apply(hap.matrix, 2, function(y)(y - mean(y)) / sd(y)) # Standardize data
a = heatmap.2(s,
col = greenred(75), # Color green red
density.info = "none",
trace = "none",
scale = "none",
RowSideColors = rainbow(4)[happy$continent],
srtCol = 45, # Column labels at 45 degree
margins = c(5, 8), # Bottom and right margins
lhei = c(5, 15) # Relative heights of the rows
)
legend("topright", levels(happy$continent), fill = rainbow(4), cex = 0.8) # Add legend

# Principal components analysis
pca = prcomp(hap, scale = T) # scale = T to normalize the data
pca
## Standard deviations (1, .., p=6):
## [1] 2.05559281 1.00010945 0.62330029 0.58541215 0.19209578 0.07878984
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## happiness2022 -0.47028515 -0.001693254 0.31414323 -0.12011340 0.766714302
## happiness2021 -0.47486687 -0.009517416 0.31250767 -0.11283403 -0.145187923
## happiness2020 -0.47230463 -0.023195359 0.30466760 -0.11667355 -0.625148601
## pop_millions 0.05368957 0.991082299 0.08655679 -0.08439891 -0.015322094
## income_thousands -0.40196512 0.050609475 -0.78549639 -0.46779776 0.003371525
## lifeexp -0.40718932 0.120677607 -0.29382647 0.85631222 0.003569068
## PC6
## happiness2022 0.279042995
## happiness2021 -0.801834638
## happiness2020 0.528341247
## pop_millions 0.004825646
## income_thousands -0.003468789
## lifeexp 0.004052974
plot(pca) # Plot the amount of variance each principal components captures

summary(pca) # Shows the importance of the components
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0556 1.0001 0.62330 0.58541 0.19210 0.07879
## Proportion of Variance 0.7042 0.1667 0.06475 0.05712 0.00615 0.00103
## Cumulative Proportion 0.7042 0.8710 0.93570 0.99282 0.99897 1.00000
percentVar = round(100 * summary(pca)$importance[2, 1:6], 0) # Compute % variances
percentVar
## PC1 PC2 PC3 PC4 PC5 PC6
## 70 17 6 6 1 0
# Biplot for Principal Component Analysis
library(ggfortify)
row.names(happy) = happy$country
## Warning: Setting row names on a tibble is deprecated.
autoplot(prcomp(hap, scale = T), data = happy,
colour = 'continent', shape = FALSE, label = TRUE, label.size = 3.5,
loadings = TRUE, loadings.colour = 'blue', loadings.label = TRUE,
loadings.label.size = 4, loadings.label.colour = 'blue')

# Multiple Linear Regression Analysis
mlr = happy[, c(2, 5, 8:10)]
mlr = within(mlr, continent <- relevel(continent, ref = "Africa")) # Set Africa as reference
model <- lm(happiness2022 ~ ., data = mlr)
summary(model)
##
## Call:
## lm(formula = happiness2022 ~ ., data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.45227 -0.28926 0.07479 0.37929 1.04385
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.8411724 0.8442722 2.181 0.03093
## continentAsia 0.2507455 0.1846157 1.358 0.17666
## continentCentral America and Caribbean 0.8571915 0.2612629 3.281 0.00132
## continentEuropean Union 0.7678985 0.2343058 3.277 0.00133
## continentNorth America 0.9135337 0.4111882 2.222 0.02797
## continentOceania 0.9640445 0.5002571 1.927 0.05607
## continentRest of Europe 0.5540937 0.2261239 2.450 0.01555
## continentSouth America 0.7122935 0.2625450 2.713 0.00754
## pop_millions -0.0003540 0.0003241 -1.092 0.27666
## income_thousands 0.0218452 0.0037680 5.798 4.54e-08
## lifeexp 0.0407132 0.0130255 3.126 0.00217
##
## (Intercept) *
## continentAsia
## continentCentral America and Caribbean **
## continentEuropean Union **
## continentNorth America *
## continentOceania .
## continentRest of Europe *
## continentSouth America **
## pop_millions
## income_thousands ***
## lifeexp **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6274 on 135 degrees of freedom
## Multiple R-squared: 0.6985, Adjusted R-squared: 0.6762
## F-statistic: 31.28 on 10 and 135 DF, p-value: < 2.2e-16
summary.aov(model) #ANOVA table testing the significance of predictors
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 7 89.96 12.851 32.650 < 2e-16 ***
## pop_millions 1 1.09 1.090 2.768 0.09847 .
## income_thousands 1 28.22 28.217 71.687 3.75e-14 ***
## lifeexp 1 3.85 3.845 9.770 0.00217 **
## Residuals 135 53.14 0.394
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2,2))
plot(model) # Post regression model diagnostics

# Prediction using the model
Predict_Happiness = data.frame(continent = "Africa", pop_millions = 54, income_thousands = 2, lifeexp = 67)
predict(model, Predict_Happiness)
## 1
## 4.59353
# The Minimal Adequate Model
step(model)
## Start: AIC=-125.57
## happiness2022 ~ continent + pop_millions + income_thousands +
## lifeexp
##
## Df Sum of Sq RSS AIC
## - pop_millions 1 0.4696 53.607 -126.281
## <none> 53.137 -125.566
## - continent 7 8.3566 61.494 -118.241
## - lifeexp 1 3.8455 56.983 -117.365
## - income_thousands 1 13.2301 66.368 -95.106
##
## Step: AIC=-126.28
## happiness2022 ~ continent + income_thousands + lifeexp
##
## Df Sum of Sq RSS AIC
## <none> 53.607 -126.281
## - continent 7 8.7219 62.329 -118.272
## - lifeexp 1 3.9195 57.526 -117.979
## - income_thousands 1 13.4317 67.039 -95.637
##
## Call:
## lm(formula = happiness2022 ~ continent + income_thousands + lifeexp,
## data = mlr)
##
## Coefficients:
## (Intercept) continentAsia
## 1.80680 0.21371
## continentCentral America and Caribbean continentEuropean Union
## 0.85980 0.76103
## continentNorth America continentOceania
## 0.85424 0.95490
## continentRest of Europe continentSouth America
## 0.54815 0.70184
## income_thousands lifeexp
## 0.02200 0.04109
model1 = update(model, .~.-pop_millions)
summary(model1)
##
## Call:
## lm(formula = happiness2022 ~ continent + income_thousands + lifeexp,
## data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.41366 -0.29099 0.07505 0.38375 1.08043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.806803 0.844284 2.140 0.03414 *
## continentAsia 0.213707 0.181604 1.177 0.24134
## continentCentral America and Caribbean 0.859802 0.261437 3.289 0.00128 **
## continentEuropean Union 0.761034 0.234388 3.247 0.00147 **
## continentNorth America 0.854238 0.407878 2.094 0.03808 *
## continentOceania 0.954904 0.500542 1.908 0.05853 .
## continentRest of Europe 0.548155 0.226219 2.423 0.01670 *
## continentSouth America 0.701838 0.262557 2.673 0.00844 **
## income_thousands 0.021996 0.003768 5.837 3.7e-08 ***
## lifeexp 0.041089 0.013030 3.153 0.00199 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6278 on 136 degrees of freedom
## Multiple R-squared: 0.6958, Adjusted R-squared: 0.6757
## F-statistic: 34.57 on 9 and 136 DF, p-value: < 2.2e-16
anova(model, model1) # Compare the two models, model and model1
## Analysis of Variance Table
##
## Model 1: happiness2022 ~ continent + pop_millions + income_thousands +
## lifeexp
## Model 2: happiness2022 ~ continent + income_thousands + lifeexp
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 135 53.137
## 2 136 53.607 -1 -0.4696 1.1931 0.2767
model2 = update(model1, .~. -continent)
summary(model2)
##
## Call:
## lm(formula = happiness2022 ~ income_thousands + lifeexp, data = mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7466 -0.3288 0.1204 0.4536 1.0436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.084039 0.688582 0.122 0.903
## income_thousands 0.022584 0.003520 6.416 1.92e-09 ***
## lifeexp 0.069780 0.009821 7.105 5.24e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6602 on 143 degrees of freedom
## Multiple R-squared: 0.6464, Adjusted R-squared: 0.6414
## F-statistic: 130.7 on 2 and 143 DF, p-value: < 2.2e-16
anova(model1, model2) # Compare the two models, model1 and model2
## Analysis of Variance Table
##
## Model 1: happiness2022 ~ continent + income_thousands + lifeexp
## Model 2: happiness2022 ~ income_thousands + lifeexp
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 136 53.607
## 2 143 62.329 -7 -8.7219 3.161 0.003965 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1 is better than Model 2 because it has a bigger Adjusted R-Squared