# Importing Data
library("readxl")
happy = read_excel("C:/Users/hp/Documents/datasets/datasets/Global/happiness.xlsx", sheet = "world")

happy$continent = factor(happy$continent) # Changes continent into a factor
happy$happiness2022_measure = factor(happy$happiness2022_measure) 
happy$income_measure = factor(happy$income_measure)


str(happy) # checks the data type per variable
## tibble [146 x 10] (S3: tbl_df/tbl/data.frame)
##  $ country              : chr [1:146] "Finland" "Denmark" "Iceland" "Switzerland" ...
##  $ continent            : Factor w/ 8 levels "Africa","Asia",..: 4 4 7 7 4 4 4 7 2 6 ...
##  $ happiness2022_measure: Factor w/ 3 levels "Fairly Happy",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ income_measure       : Factor w/ 5 levels "Fairly Rich",..: 4 5 5 5 5 5 5 5 4 4 ...
##  $ happiness2022        : num [1:146] 7.82 7.64 7.56 7.51 7.41 ...
##  $ happiness2021        : num [1:146] 7.84 7.62 7.55 7.57 7.46 ...
##  $ happiness2020        : num [1:146] 7.81 7.65 7.5 7.56 7.45 ...
##  $ pop_millions         : num [1:146] 5.531 5.831 0.366 8.637 17.441 ...
##  $ income_thousands     : num [1:146] 48.8 61.1 59.3 87.1 52.4 ...
##  $ lifeexp              : num [1:146] 82.1 81.6 83.1 83.1 81.4 ...
summary(happy) # summary statistics per variable
##    country                                  continent     happiness2022_measure
##  Length:146         Africa                       :41   Fairly Happy  :55       
##  Class :character   Asia                         :38   Somewhat Happy:78       
##  Mode  :character   European Union               :26   Unhappy       :13       
##                     Rest of Europe               :17                           
##                     South America                :10                           
##                     Central America and Caribbean: 9                           
##                     (Other)                      : 5                           
##        income_measure happiness2022   happiness2021   happiness2020  
##  Fairly Rich  :38     Min.   :2.404   Min.   :2.523   Min.   :2.567  
##  Middle Income:50     1st Qu.:4.876   1st Qu.:4.853   1st Qu.:4.755  
##  Poor         :21     Median :5.553   Median :5.505   Median :5.527  
##  Rich         :26     Mean   :5.527   Mean   :5.534   Mean   :5.508  
##  Very Rich    :11     3rd Qu.:6.285   3rd Qu.:6.247   3rd Qu.:6.250  
##                       Max.   :7.821   Max.   :7.842   Max.   :7.809  
##                                                                      
##   pop_millions       income_thousands     lifeexp     
##  Min.   :   0.3664   Min.   :  0.000   Min.   :54.51  
##  1st Qu.:   5.3993   1st Qu.:  1.854   1st Qu.:67.62  
##  Median :  11.7458   Median :  5.290   Median :74.62  
##  Mean   :  51.1032   Mean   : 14.795   Mean   :73.22  
##  3rd Qu.:  37.6907   3rd Qu.: 19.899   3rd Qu.:78.32  
##  Max.   :1410.9294   Max.   :116.356   Max.   :85.39  
## 
par(mfrow = c(1,1))    # Number of plots in rows and columns;


# Histograms and qqplots of the quantitative variables
hist(happy$happiness2022)

qqnorm(happy$happiness2022)

hist(happy$pop_millions)

qqnorm(happy$pop_millions)

hist(happy$income_thousands)

qqnorm(happy$income_thousands)

hist(happy$lifeexp)

qqnorm(happy$lifeexp)

# Shapiro tests of Normality
shapiro.test(happy$happiness2022)
## 
##  Shapiro-Wilk normality test
## 
## data:  happy$happiness2022
## W = 0.99001, p-value = 0.387
shapiro.test(happy$happiness2021)
## 
##  Shapiro-Wilk normality test
## 
## data:  happy$happiness2021
## W = 0.99078, p-value = 0.4576
shapiro.test(happy$happiness2020)
## 
##  Shapiro-Wilk normality test
## 
## data:  happy$happiness2020
## W = 0.98872, p-value = 0.2863
shapiro.test(happy$pop_millions)
## 
##  Shapiro-Wilk normality test
## 
## data:  happy$pop_millions
## W = 0.25867, p-value < 2.2e-16
shapiro.test(happy$income_thousands)
## 
##  Shapiro-Wilk normality test
## 
## data:  happy$income_thousands
## W = 0.70948, p-value = 1.211e-15
shapiro.test(happy$lifeexp)
## 
##  Shapiro-Wilk normality test
## 
## data:  happy$lifeexp
## W = 0.95239, p-value = 6.589e-05
# calculate skewness
library(moments)

skewness(happy$happiness2022)
## [1] -0.2344808
skewness(happy$happiness2021)
## [1] -0.1045487
skewness(happy$happiness2020)
## [1] -0.06276648
skewness(happy$income_thousands)
## [1] 2.112674
skewness(happy$pop_millions)
## [1] 7.25343
skewness(happy$lifeexp)
## [1] -0.5723455
# State count in each continent
counts = sort(table(happy$continent), decreasing = TRUE)  # Number of countries in each continent
percentages = 100 * counts / length(happy$continent)      
barplot(percentages, ylab = "Percentage", col = "lightblue") 
text(x=seq(0.7, 5, 1.2), 2, paste("n=", counts))      # Add count to each bar

#Chi-Square Test
library("MASS")

# Create a data frame from the main data set.
happy.data = data.frame(happy$happiness2022_measure, happy$income_measure)

# Create a table with the needed variables.
happy.data = table(happy$happiness2022_measure, happy$income_measure)
happy.data
##                 
##                  Fairly Rich Middle Income Poor Rich Very Rich
##   Fairly Happy            15             5    0   24        11
##   Somewhat Happy          22            40   14    2         0
##   Unhappy                  1             5    7    0         0
# Perform a Chi-Square test.
print(chisq.test(happy.data))
## Warning in chisq.test(happy.data): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  happy.data
## X-squared = 92.666, df = 8, p-value < 2.2e-16
# Lollipop plot of the Happiness Index in each country
library(ggplot2)

ggplot(happy, aes(x = country, y = happiness2022)) +
  geom_point(size = 3, color = "red") + 
  geom_segment(aes(x = country, xend = country, y = 0, yend = happiness2022)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) # Rotate axis label

# Ridge Diagrams
library(ggridges)
ggplot(happy, aes(x = happiness2022, y = continent, fill = continent)) +
  geom_density_ridges() +
  theme_ridges() +                                 # No color on backgroud
  theme(legend.position = "none",                  # No show legend
        axis.title.x = element_text(hjust = 0.5),  # x axis title in the center
        axis.title.y = element_text(hjust = 0.5))  # y axis title in the center
## Picking joint bandwidth of 0.257

# Correlation diagram of the numeric variables
hap = happy[, 5:10] # Take numeric variables as goal matrix
hap
## # A tibble: 146 x 6
##    happiness2022 happiness2021 happiness2020 pop_millions income_thousands
##            <dbl>         <dbl>         <dbl>        <dbl>            <dbl>
##  1          7.82          7.84          7.81        5.53              48.8
##  2          7.64          7.62          7.65        5.83              61.1
##  3          7.56          7.55          7.50        0.366             59.3
##  4          7.51          7.57          7.56        8.64              87.1
##  5          7.41          7.46          7.45       17.4               52.4
##  6          7.40          7.32          7.24        0.632            116. 
##  7          7.38          7.36          7.35       10.4               52.3
##  8          7.37          7.39          7.49        5.38              67.3
##  9          7.36          7.16          7.13        9.22              44.2
## 10          7.20          7.28          7.3         5.08              41.4
## # ... with 136 more rows, and 1 more variable: lifeexp <dbl>
library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
library(corrplot)
## corrplot 0.92 loaded
corMatrix = cor(as.matrix(hap)) # Calculate correlation matrix
col = colorRampPalette(c("red", "yellow", "blue"))  # 3 colors to represent coefficients -1 to 1.
corrplot.mixed(corMatrix, order = "AOE", lower = "number", lower.col = "black", 
               number.cex = .8, upper = "ellipse",  upper.col = col(10), 
               diag = "u", tl.pos = "lt", tl.col = "black") # Mix plots of "number" and "ellipse"

# Cluster dendrogram for numeric variables
plot(hclust(as.dist(1 - cor(as.matrix(hap)))))  # Hierarchical clustering

# density plot showing the distribution of Happiness by income
ggplot(happy, aes(x = happiness2022, fill = income_measure)) + geom_density(alpha = 0.9)

# Box plot of gdp by continent
happy$gdp = (happy$pop_millions*1000000*happy$income_thousands*1000)/1000000000
boxplot(happy$gdp ~ happy$continent, xlab = "Continent", ylab = "GDP in Dollars (Billions)")

# ANOVA test of gdp by continent
gdp_differences <- aov(happy$gdp ~ happy$continent, happy)
summary(gdp_differences) #the ANOVA test shows a significant difference in the gdp of the different continents
##                  Df    Sum Sq  Mean Sq F value   Pr(>F)    
## happy$continent   7 176258115 25179731   6.754 6.88e-07 ***
## Residuals       138 514495892  3728231                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatterplot for Income and Happiness Index by Life Expectancy
ggplot(happy, aes(x = income_thousands, y = happiness2022)) + 
  geom_point(aes(size = lifeexp, color = continent)) + 
  geom_smooth(method = 'lm',formula = y ~ x)  # Add regression line

# Happiness distribution by income
ggplot(happy, aes(x = income_measure, y = happiness2022, fill = income_measure)) + 
  geom_violin(trim = FALSE) + 
  geom_boxplot(width = 0.1)

# Relationship between Happiness, Income, Population and Life Expectancy
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# group Pop into Pop size first
happy.pop_millions = happy %>% mutate(PopSize = factor(ifelse(pop_millions < 1, "Below 1M",  
                                                              ifelse(pop_millions < 20 & pop_millions >= 1, "1M to 19M",
                                                                     ifelse(pop_millions < 50 & pop_millions >= 20, "20M to 49M",
                                                                            ifelse(pop_millions < 100 & pop_millions >= 50, "50M to 99M",
                                                                     "100M and Above"))))))
ggplot(happy.pop_millions, aes(x = income_thousands, y = happiness2022)) + 
  geom_point(aes(shape = PopSize, color = continent, size = lifeexp)) + 
  geom_smooth(method = 'lm', formula = y ~ x)

# Segment diagram for all countries
row.names(hap) <- happy$country
## Warning: Setting row names on a tibble is deprecated.
stars(hap, key.loc = c(13, 1.5), draw.segments = T)

# Heat map for whole happy data set
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
hap.matrix = as.matrix(hap)    # Transfer the data frame to matrix
s = apply(hap.matrix, 2, function(y)(y - mean(y)) / sd(y))  # Standardize data
a = heatmap.2(s, 
               col = greenred(75), # Color green red
               density.info = "none", 
               trace = "none", 
               scale = "none", 
               RowSideColors = rainbow(4)[happy$continent],  
               srtCol = 45,        # Column labels at 45 degree
               margins = c(5, 8),  # Bottom and right margins
               lhei = c(5, 15)     # Relative heights of the rows
) 
legend("topright", levels(happy$continent), fill = rainbow(4), cex = 0.8)  # Add legend

# Principal components analysis
pca = prcomp(hap, scale = T)  # scale = T to normalize the data
pca
## Standard deviations (1, .., p=6):
## [1] 2.05559281 1.00010945 0.62330029 0.58541215 0.19209578 0.07878984
## 
## Rotation (n x k) = (6 x 6):
##                          PC1          PC2         PC3         PC4          PC5
## happiness2022    -0.47028515 -0.001693254  0.31414323 -0.12011340  0.766714302
## happiness2021    -0.47486687 -0.009517416  0.31250767 -0.11283403 -0.145187923
## happiness2020    -0.47230463 -0.023195359  0.30466760 -0.11667355 -0.625148601
## pop_millions      0.05368957  0.991082299  0.08655679 -0.08439891 -0.015322094
## income_thousands -0.40196512  0.050609475 -0.78549639 -0.46779776  0.003371525
## lifeexp          -0.40718932  0.120677607 -0.29382647  0.85631222  0.003569068
##                           PC6
## happiness2022     0.279042995
## happiness2021    -0.801834638
## happiness2020     0.528341247
## pop_millions      0.004825646
## income_thousands -0.003468789
## lifeexp           0.004052974
plot(pca)     # Plot the amount of variance each principal components captures

summary(pca)  # Shows the importance of the components
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.0556 1.0001 0.62330 0.58541 0.19210 0.07879
## Proportion of Variance 0.7042 0.1667 0.06475 0.05712 0.00615 0.00103
## Cumulative Proportion  0.7042 0.8710 0.93570 0.99282 0.99897 1.00000
percentVar = round(100 * summary(pca)$importance[2, 1:6], 0) # Compute % variances
percentVar
## PC1 PC2 PC3 PC4 PC5 PC6 
##  70  17   6   6   1   0
# Biplot for Principal Component Analysis
library(ggfortify)
row.names(happy) = happy$country
## Warning: Setting row names on a tibble is deprecated.
autoplot(prcomp(hap,  scale = T), data = happy, 
         colour = 'continent', shape = FALSE, label = TRUE, label.size = 3.5,
         loadings = TRUE, loadings.colour = 'blue', loadings.label = TRUE, 
         loadings.label.size = 4, loadings.label.colour = 'blue')

# Multiple Linear Regression Analysis
mlr = happy[, c(2, 5, 8:10)]
mlr = within(mlr, continent <- relevel(continent, ref = "Africa"))  # Set Africa as reference
model <- lm(happiness2022 ~ .,  data = mlr)
summary(model)
## 
## Call:
## lm(formula = happiness2022 ~ ., data = mlr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.45227 -0.28926  0.07479  0.37929  1.04385 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             1.8411724  0.8442722   2.181  0.03093
## continentAsia                           0.2507455  0.1846157   1.358  0.17666
## continentCentral America and Caribbean  0.8571915  0.2612629   3.281  0.00132
## continentEuropean Union                 0.7678985  0.2343058   3.277  0.00133
## continentNorth America                  0.9135337  0.4111882   2.222  0.02797
## continentOceania                        0.9640445  0.5002571   1.927  0.05607
## continentRest of Europe                 0.5540937  0.2261239   2.450  0.01555
## continentSouth America                  0.7122935  0.2625450   2.713  0.00754
## pop_millions                           -0.0003540  0.0003241  -1.092  0.27666
## income_thousands                        0.0218452  0.0037680   5.798 4.54e-08
## lifeexp                                 0.0407132  0.0130255   3.126  0.00217
##                                           
## (Intercept)                            *  
## continentAsia                             
## continentCentral America and Caribbean ** 
## continentEuropean Union                ** 
## continentNorth America                 *  
## continentOceania                       .  
## continentRest of Europe                *  
## continentSouth America                 ** 
## pop_millions                              
## income_thousands                       ***
## lifeexp                                ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6274 on 135 degrees of freedom
## Multiple R-squared:  0.6985, Adjusted R-squared:  0.6762 
## F-statistic: 31.28 on 10 and 135 DF,  p-value: < 2.2e-16
summary.aov(model) #ANOVA table testing the significance of predictors
##                   Df Sum Sq Mean Sq F value   Pr(>F)    
## continent          7  89.96  12.851  32.650  < 2e-16 ***
## pop_millions       1   1.09   1.090   2.768  0.09847 .  
## income_thousands   1  28.22  28.217  71.687 3.75e-14 ***
## lifeexp            1   3.85   3.845   9.770  0.00217 ** 
## Residuals        135  53.14   0.394                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2,2))
plot(model) # Post regression model diagnostics

# Prediction using the model
Predict_Happiness = data.frame(continent = "Africa", pop_millions = 54, income_thousands = 2, lifeexp = 67)
predict(model, Predict_Happiness)
##       1 
## 4.59353
# The Minimal Adequate Model
step(model)
## Start:  AIC=-125.57
## happiness2022 ~ continent + pop_millions + income_thousands + 
##     lifeexp
## 
##                    Df Sum of Sq    RSS      AIC
## - pop_millions      1    0.4696 53.607 -126.281
## <none>                          53.137 -125.566
## - continent         7    8.3566 61.494 -118.241
## - lifeexp           1    3.8455 56.983 -117.365
## - income_thousands  1   13.2301 66.368  -95.106
## 
## Step:  AIC=-126.28
## happiness2022 ~ continent + income_thousands + lifeexp
## 
##                    Df Sum of Sq    RSS      AIC
## <none>                          53.607 -126.281
## - continent         7    8.7219 62.329 -118.272
## - lifeexp           1    3.9195 57.526 -117.979
## - income_thousands  1   13.4317 67.039  -95.637
## 
## Call:
## lm(formula = happiness2022 ~ continent + income_thousands + lifeexp, 
##     data = mlr)
## 
## Coefficients:
##                            (Intercept)                           continentAsia  
##                                1.80680                                 0.21371  
## continentCentral America and Caribbean                 continentEuropean Union  
##                                0.85980                                 0.76103  
##                 continentNorth America                        continentOceania  
##                                0.85424                                 0.95490  
##                continentRest of Europe                  continentSouth America  
##                                0.54815                                 0.70184  
##                       income_thousands                                 lifeexp  
##                                0.02200                                 0.04109
model1 = update(model, .~.-pop_millions)
summary(model1)
## 
## Call:
## lm(formula = happiness2022 ~ continent + income_thousands + lifeexp, 
##     data = mlr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.41366 -0.29099  0.07505  0.38375  1.08043 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            1.806803   0.844284   2.140  0.03414 *  
## continentAsia                          0.213707   0.181604   1.177  0.24134    
## continentCentral America and Caribbean 0.859802   0.261437   3.289  0.00128 ** 
## continentEuropean Union                0.761034   0.234388   3.247  0.00147 ** 
## continentNorth America                 0.854238   0.407878   2.094  0.03808 *  
## continentOceania                       0.954904   0.500542   1.908  0.05853 .  
## continentRest of Europe                0.548155   0.226219   2.423  0.01670 *  
## continentSouth America                 0.701838   0.262557   2.673  0.00844 ** 
## income_thousands                       0.021996   0.003768   5.837  3.7e-08 ***
## lifeexp                                0.041089   0.013030   3.153  0.00199 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6278 on 136 degrees of freedom
## Multiple R-squared:  0.6958, Adjusted R-squared:  0.6757 
## F-statistic: 34.57 on 9 and 136 DF,  p-value: < 2.2e-16
anova(model, model1) # Compare the two models, model and model1
## Analysis of Variance Table
## 
## Model 1: happiness2022 ~ continent + pop_millions + income_thousands + 
##     lifeexp
## Model 2: happiness2022 ~ continent + income_thousands + lifeexp
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    135 53.137                           
## 2    136 53.607 -1   -0.4696 1.1931 0.2767
model2 = update(model1, .~. -continent)
summary(model2)
## 
## Call:
## lm(formula = happiness2022 ~ income_thousands + lifeexp, data = mlr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7466 -0.3288  0.1204  0.4536  1.0436 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.084039   0.688582   0.122    0.903    
## income_thousands 0.022584   0.003520   6.416 1.92e-09 ***
## lifeexp          0.069780   0.009821   7.105 5.24e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6602 on 143 degrees of freedom
## Multiple R-squared:  0.6464, Adjusted R-squared:  0.6414 
## F-statistic: 130.7 on 2 and 143 DF,  p-value: < 2.2e-16
anova(model1, model2) # Compare the two models, model1 and model2
## Analysis of Variance Table
## 
## Model 1: happiness2022 ~ continent + income_thousands + lifeexp
## Model 2: happiness2022 ~ income_thousands + lifeexp
##   Res.Df    RSS Df Sum of Sq     F   Pr(>F)   
## 1    136 53.607                               
## 2    143 62.329 -7   -8.7219 3.161 0.003965 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1 is better than Model 2 because it has a bigger Adjusted R-Squared