Dataset was used from Kaggle: https://www.kaggle.com/datasets/vipulgohel/clustering-pca-assignment

I will do PCA analysis

Research question

How the annual GDP growth rate can be explained by life expectancy, imports, exports and income?

#install.packages("readxl")
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
mydata <- read_xlsx("~/IMB/Mutivariat analysis/r podatki/country_data - HW 4.xlsx")


colnames(mydata) <- c("Country", "Children_death", "Exports", "Imports", "Income", "Inflation", "Life_expectancy", "Total_fertility", "GDPP")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Create a new ID column with unique identifiers
mydata <- mutate(mydata, ID = 1:nrow(mydata))
library(dplyr)

# Move the 'ID' column to the first position
mydata <- select(mydata, ID, everything())


head(mydata)
## # A tibble: 6 × 10
##      ID Country  Children_death Exports Imports Income Inflation Life_expectancy
##   <int> <chr>             <dbl>   <dbl>   <dbl>  <dbl>     <dbl>           <dbl>
## 1     1 Afghani…           90.2    10      44.9   1610      9.44            56.2
## 2     2 Albania            16.6    28      48.6   9930      4.49            76.3
## 3     3 Algeria            27.3    38.4    31.4  12900     16.1             76.5
## 4     4 Angola            119      62.3    42.9   5900     22.4             60.1
## 5     5 Antigua…           10.3    45.5    58.9  19100      1.44            76.8
## 6     6 Argenti…           14.5    18.9    16    18700     20.9             75.8
## # ℹ 2 more variables: Total_fertility <dbl>, GDPP <dbl>

Explanation of dataset

summary(mydata[,c(-1,-2)])
##  Children_death      Exports          Imports             Income      
##  Min.   :  2.60   Min.   :  2.20   Min.   :  0.0659   Min.   :   609  
##  1st Qu.:  8.25   1st Qu.: 23.80   1st Qu.: 30.2000   1st Qu.:  3355  
##  Median : 19.30   Median : 35.40   Median : 43.3000   Median :  9960  
##  Mean   : 38.27   Mean   : 41.76   Mean   : 46.8902   Mean   : 17145  
##  3rd Qu.: 62.10   3rd Qu.: 51.40   3rd Qu.: 58.7500   3rd Qu.: 22800  
##  Max.   :208.00   Max.   :200.00   Max.   :174.0000   Max.   :125000  
##    Inflation      Life_expectancy Total_fertility      GDPP       
##  Min.   :-4.210   Min.   :32.10   Min.   :1.150   Min.   :   231  
##  1st Qu.: 1.755   1st Qu.:65.30   1st Qu.:1.795   1st Qu.:  1330  
##  Median : 5.140   Median :73.10   Median :2.410   Median :  4660  
##  Mean   : 7.157   Mean   :70.56   Mean   :2.948   Mean   : 12964  
##  3rd Qu.:10.350   3rd Qu.:76.80   3rd Qu.:3.880   3rd Qu.: 14050  
##  Max.   :45.900   Max.   :82.80   Max.   :7.490   Max.   :105000
library(psych)
describe(mydata[,c(-1,-2)])
##                 vars   n     mean       sd  median  trimmed      mad    min
## Children_death     1 167    38.27    40.33   19.30    31.59    21.94   2.60
## Exports            2 167    41.76    27.72   35.40    38.22    20.90   2.20
## Imports            3 167    46.89    24.21   43.30    44.34    21.05   0.07
## Income             4 167 17144.69 19278.07 9960.00 13807.63 11638.41 609.00
## Inflation          5 167     7.16     7.48    5.14     6.15     5.66  -4.21
## Life_expectancy    6 167    70.56     8.89   73.10    71.33     8.90  32.10
## Total_fertility    7 167     2.95     1.51    2.41     2.77     1.22   1.15
## GDPP               8 167 12964.16 18328.70 4660.00  9146.43  5814.76 231.00
##                      max     range  skew kurtosis      se
## Children_death  2.08e+02    205.40  1.42     1.62    3.12
## Exports         2.00e+02    197.80  2.36     9.05    2.15
## Imports         1.74e+02    173.93  1.87     6.41    1.87
## Income          1.25e+05 124391.00  2.19     6.67 1491.78
## Inflation       4.59e+01     50.11  1.78     4.91    0.58
## Life_expectancy 8.28e+01     50.70 -0.95     1.03    0.69
## Total_fertility 7.49e+00      6.34  0.95    -0.25    0.12
## GDPP            1.05e+05 104769.00  2.18     5.23 1418.32

Explanations of parameters

PCA analysis

For the PCA analysis some requirements need to be meet:

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
fit <- lm(GDPP ~ Children_death + Exports + Imports + Income + Inflation + Life_expectancy + Total_fertility, 
          data = mydata)
library(car)
vif(fit) 
##  Children_death         Exports         Imports          Income       Inflation 
##        7.410763        2.934215        2.433882        2.097863        1.238293 
## Life_expectancy Total_fertility 
##        5.754270        3.816680
mean(vif(fit))
## [1] 3.669424
mydata_PCA <- mydata[, c("Life_expectancy", "Imports", "Exports", "Income")] 

library(pastecs) 
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
round(stat.desc(mydata_PCA, basic = FALSE), 2)
##              Life_expectancy Imports Exports       Income
## median                 73.10   43.30   35.40      9960.00
## mean                   70.56   46.89   41.76     17144.69
## SE.mean                 0.69    1.87    2.15      1491.78
## CI.mean.0.95            1.36    3.70    4.24      2945.31
## var                    79.09  586.10  768.63 371643894.16
## std.dev                 8.89   24.21   27.72     19278.07
## coef.var                0.13    0.52    0.66         1.12
R <- cor(mydata_PCA) 
round(R, 3)
##                 Life_expectancy Imports Exports Income
## Life_expectancy           1.000   0.054   0.303  0.612
## Imports                   0.054   1.000   0.683  0.122
## Exports                   0.303   0.683   1.000  0.494
## Income                    0.612   0.122   0.494  1.000
library(psych)
cortest.bartlett(R, n = nrow(mydata_PCA)) 
## $chisq
## [1] 246.1815
## 
## $p.value
## [1] 2.684382e-50
## 
## $df
## [1] 6

BSD we can reject H0 (p<0.001) and conclude that at least one correlation coeffcient differ from 0.

det(R) 
## [1] 0.2225433
library(psych)
KMO(R) 
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA =  0.52
## MSA for each item = 
## Life_expectancy         Imports         Exports          Income 
##            0.63            0.45            0.52            0.53
library(FactoMineR) 
## Warning: package 'FactoMineR' was built under R version 4.3.2
components <- PCA(mydata_PCA, 
                  scale.unit = TRUE,
                  graph = FALSE) 

library(factoextra) 
## Warning: package 'factoextra' was built under R version 4.3.2
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
get_eigenvalue(components) 
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.1664236        54.160591                    54.16059
## Dim.2  1.2208981        30.522453                    84.68304
## Dim.3  0.4048570        10.121424                    94.80447
## Dim.4  0.2078213         5.195532                   100.00000
library(factoextra)
fviz_eig(components,
         choice = "eigenvalue",
         main = "Scree plot",
         ylab = "Eigenvalue",
         xlab = "Principal component",
         addlabels = TRUE)

- We should take 2 components, break is at third component, and we should take one less, according to the formla n-1

library(psych)
fa.parallel(mydata_PCA, 
            sim = FALSE, 
            fa = "pc") 
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  2
library(FactoMineR)
components <- PCA(mydata_PCA, 
                  scale.unit = TRUE, 
                  graph = FALSE,
                  ncp = 2) 

components
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 167 individuals, described by 4 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
print(components$var$cor) 
##                     Dim.1      Dim.2
## Life_expectancy 0.6605012 -0.6024900
## Imports         0.6178233  0.7173255
## Exports         0.8628375  0.3623410
## Income          0.7771535 -0.4604965
print(components$var$contrib) 
##                    Dim.1    Dim.2
## Life_expectancy 20.13742 29.73174
## Imports         17.61916 42.14568
## Exports         34.36486 10.75364
## Income          27.87856 17.36894
library(factoextra)
fviz_pca_var(components, 
             repel = TRUE)

- PC1 measures General socio-economic factors - PC2 measures contrast between economic factors (imports and exports) and social factors (income and life expectancy)

Although income is more of an economic factor than a social one, I would categorize it as a social factor for the purpose of easier explanation.

library(factoextra)
fviz_pca_biplot(components) 

head(components$ind$coord)
##        Dim.1       Dim.2
## 1 -1.8615211  0.78934874
## 2 -0.1695789 -0.31409276
## 3 -0.1564174 -0.73002579
## 4 -0.4718798  1.02316985
## 5  0.6579004 -0.05902964
## 6 -0.7138841 -1.45828880
components$ind$coord[124, ]
##     Dim.1     Dim.2 
##  3.449634 -3.266095
mydata_PCA_std <- scale(mydata_PCA) 
mydata_PCA_std[124, ] 
## Life_expectancy         Imports         Exports          Income 
##       1.0057504      -0.9537632       0.7408327       5.5947159
mydata$PC1 <- components$ind$coord[ , 1] 
mydata$PC2 <- components$ind$coord[ , 2] 

head(mydata, 3)
## # A tibble: 3 × 12
##      ID Country  Children_death Exports Imports Income Inflation Life_expectancy
##   <int> <chr>             <dbl>   <dbl>   <dbl>  <dbl>     <dbl>           <dbl>
## 1     1 Afghani…           90.2    10      44.9   1610      9.44            56.2
## 2     2 Albania            16.6    28      48.6   9930      4.49            76.3
## 3     3 Algeria            27.3    38.4    31.4  12900     16.1             76.5
## # ℹ 4 more variables: Total_fertility <dbl>, GDPP <dbl>, PC1 <dbl>, PC2 <dbl>
cor(x=mydata$PC1, y=mydata$PC2)
## [1] -1.569485e-16
fit <- lm(GDPP ~ PC1 + PC2 + Income + Imports + Inflation, 
          data = mydata) 

library(car)
vif(fit) 
##       PC1       PC2    Income   Imports Inflation 
##  9.804409  6.968546  5.659903 11.122897  1.185929
mean(vif(fit))
## [1] 6.948337
mydata$StdResid <- rstandard(fit)
mydata$StdFitted <- scale(fit$fitted.values)

hist(mydata$StdResid, 
     xlab = "Standardized residuals", 
     ylab = "Frequency", 
     main = "Histrogram of standardized residuals")

shapiro.test(mydata$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$StdResid
## W = 0.83296, p-value = 1.532e-12
library(ggplot2)
ggplot(mydata, aes(y=StdResid, x=StdFitted)) +
  theme_dark() +
  geom_point(colour = "snow") +
  ylab("Standardized residuals") +
  xlab("Standardized fitted values")

library(olsrr)
ols_test_breusch_pagan(fit)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data               
##  --------------------------------
##  Response : GDPP 
##  Variables: fitted values of GDPP 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    378.8171 
##  Prob > Chi2   =    2.251682e-84
head(mydata[order(mydata$StdResid), c("ID", "StdResid") ], 3)
## # A tibble: 3 × 2
##      ID StdResid
##   <int>    <dbl>
## 1   124    -4.91
## 2    24    -3.65
## 3    83    -2.80
library(dplyr)
mydata <- mydata %>%
  filter(!ID %in% c(124, 24))