file.exists('/Users/julianvidovic/Downloads/archive/Country-data.csv')
## [1] TRUE

Research question

“What factors, including life expectancy, imports, exports, and income, influence the annual GDP growth rate?”

mydata <- read.csv('/Users/julianvidovic/Downloads/archive/Country-data.csv')

colnames(mydata) <- c("Country", "Children_death", "Exports", "Health", "Imports", "Income", "Inflation", "Life_expectancy", "Total_fertility", "GDPP")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mydata <- mydata %>% 
  select(-which(is.na(names(mydata)))) %>% 
  mutate(ID = 1:nrow(mydata))
library(dplyr)

mydata <- mutate(mydata, ID = 1:nrow(mydata))


mydata <- select(mydata, ID, everything())


head(mydata)
##   ID             Country Children_death Exports Health Imports Income Inflation
## 1  1         Afghanistan           90.2    10.0   7.58    44.9   1610      9.44
## 2  2             Albania           16.6    28.0   6.55    48.6   9930      4.49
## 3  3             Algeria           27.3    38.4   4.17    31.4  12900     16.10
## 4  4              Angola          119.0    62.3   2.85    42.9   5900     22.40
## 5  5 Antigua and Barbuda           10.3    45.5   6.03    58.9  19100      1.44
## 6  6           Argentina           14.5    18.9   8.10    16.0  18700     20.90
##   Life_expectancy Total_fertility  GDPP
## 1            56.2            5.82   553
## 2            76.3            1.65  4090
## 3            76.5            2.89  4460
## 4            60.1            6.16  3530
## 5            76.8            2.13 12200
## 6            75.8            2.37 10300

Description of Dataset:

Unit of observation: Country Sample size: N = 167

Variable definitions:

Country: Name of the country. Children_death: Number of deaths of children under five per 1,000 live births. Exports: Percentage of goods and services exported, expressed as a percentage of the total GDP. Imports: Percentage of goods and services imported, expressed as a percentage of the total GDP. Income: Average net income per person. Inflation: Inflation rate as a percentage. Life_expectancy: The average lifespan of a newborn if current mortality rates remain unchanged. Total_fertility: The average number of children born per woman if age-specific fertility rates stay constant. GDPP: The annual growth rate of the total GDP.

library(psych)
describe(mydata[ , -1])
##                 vars   n     mean       sd  median  trimmed      mad    min
## Country*           1 167    84.00    48.35   84.00    84.00    62.27   1.00
## Children_death     2 167    38.27    40.33   19.30    31.59    21.94   2.60
## Exports            3 167    41.11    27.41   35.00    37.80    21.20   0.11
## Health             4 167     6.82     2.75    6.32     6.66     2.64   1.81
## Imports            5 167    46.89    24.21   43.30    44.34    21.05   0.07
## Income             6 167 17144.69 19278.07 9960.00 13807.63 11638.41 609.00
## Inflation          7 167     7.78    10.57    5.39     6.27     5.72  -4.21
## Life_expectancy    8 167    70.56     8.89   73.10    71.33     8.90  32.10
## Total_fertility    9 167     2.95     1.51    2.41     2.77     1.22   1.15
## GDPP              10 167 12964.16 18328.70 4660.00  9146.43  5814.76 231.00
##                      max     range  skew kurtosis      se
## Country*        1.67e+02    166.00  0.00    -1.22    3.74
## Children_death  2.08e+02    205.40  1.42     1.62    3.12
## Exports         2.00e+02    199.89  2.40     9.65    2.12
## Health          1.79e+01     16.09  0.69     0.59    0.21
## Imports         1.74e+02    173.93  1.87     6.41    1.87
## Income          1.25e+05 124391.00  2.19     6.67 1491.78
## Inflation       1.04e+02    108.21  5.06    39.95    0.82
## Life_expectancy 8.28e+01     50.70 -0.95     1.03    0.69
## Total_fertility 7.49e+00      6.34  0.95    -0.25    0.12
## GDPP            1.05e+05 104769.00  2.18     5.23 1418.32

Life_expectancy: Mean: The average life expectancy across the 167 countries is 70.56 years. Min and Max: The lowest value is 32.10 years, while the highest is 82.80 years, showing a wide disparity in health and living conditions. Standard Deviation (SD): A standard deviation of 8.89 years indicates notable variation among countries, influenced by factors such as healthcare access, economic stability, and social infrastructure.

Imports: Mean: The average percentage of imports across the 167 countries is 46.89% of GDP, reflecting the reliance on external goods and services. Min and Max: The lowest percentage of imports is 0.07%, while the highest is 174.00%, showing significant differences in trade dependence among countries. Standard Deviation (SD): A standard deviation of 24.21% highlights the variation in import reliance across the dataset.

Exports: Mean: The average percentage of exports across the 167 countries is 41.11% of GDP, representing the share of economic output sold internationally. Min and Max: Export percentages range from 0.11% to 200.00%, indicating large disparities in the role of exports in national economies. Standard Deviation: A standard deviation of 27.41% shows that some countries rely much more heavily on exports than others.

Income: Mean: The average income across the 167 countries is 17,144.69 USD, providing a measure of economic well-being. Min and Max: The lowest income is 609.00 USD, and the highest is 125,000.00 USD, demonstrating vast disparities in wealth. Standard Deviation: A high standard deviation of 19,278.07 USD reflects significant inequality in income levels between countries.

GDPP (GDP per capita): Mean: The average GDP per capita is 12,964.16 across the countries, indicating the economic performance of nations. Min and Max: The GDP per capita ranges from 231.00 to 91,464.30, showcasing significant economic disparities. Standard Deviation: A high standard deviation of 18,328.70 highlights the uneven distribution of wealth, which could directly affect life expectancy through access to healthcare and improved living conditions.

###PCA Analysis To conduct a PCA (Principal Component Analysis), certain conditions must be met:

Sufficient Correlations: The variables should have correlations of at least 0.3. Stronger correlations indicate a better grouping of variables, which enhances the success of the PCA. KMO Criterion: The Kaiser-Meyer-Olkin (KMO) value should ideally be above 0.5. While a lower KMO value does not prevent the use of PCA, it suggests that the analysis will retain less information. For PCA, KMO serves as a guideline rather than a strict requirement. Bartlett’s Test of Sphericity: This test evaluates whether the dataset is suitable for PCA by checking if the correlation matrix is significantly different from an identity matrix.

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
fit <- lm(GDPP ~ Children_death + Exports + Imports + Income + Inflation + Life_expectancy + Total_fertility, 
          data = mydata)
library(car)
vif(fit) 
##  Children_death         Exports         Imports          Income       Inflation 
##        7.203036        3.961320        3.157743        2.329877        1.250201 
## Life_expectancy Total_fertility 
##        5.600812        3.718993
mean(vif(fit))
## [1] 3.888855

The VIF values show moderate to high multicollinearity. Children_death (VIF = 7.20) and Life_expectancy (VIF = 5.6) are above the threshold of 5, indicating strong correlations with other variables. The average VIF of 3.89 is also above the acceptable limit, suggesting multicollinearity is an issue. Instead of excluding variables, combining them into components would be a better approach to reduce redundancy while retaining information.

mydata_PCA <- mydata[, c("Life_expectancy", "Imports", "Exports", "Income")] 

library(pastecs) 
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
round(stat.desc(mydata_PCA, basic = FALSE), 2)
##              Life_expectancy Imports Exports       Income
## median                 73.10   43.30   35.00      9960.00
## mean                   70.56   46.89   41.11     17144.69
## SE.mean                 0.69    1.87    2.12      1491.78
## CI.mean.0.95            1.36    3.70    4.19      2945.31
## var                    79.09  586.10  751.42 371643894.16
## std.dev                 8.89   24.21   27.41     19278.07
## coef.var                0.13    0.52    0.67         1.12
R <- cor(mydata_PCA) 
round(R, 3)
##                 Life_expectancy Imports Exports Income
## Life_expectancy           1.000   0.054   0.316  0.612
## Imports                   0.054   1.000   0.737  0.122
## Exports                   0.316   0.737   1.000  0.517
## Income                    0.612   0.122   0.517  1.000

The correlations between all variables are above 0.3, except for the correlation between Imports and Life Expectancy and Imports and Income. Since strong correlations are generally observed between most variables, this requirement for PCA is met.

library(psych)
cortest.bartlett(R, n = nrow(mydata_PCA)) 
## $chisq
## [1] 293.2047
## 
## $p.value
## [1] 2.33663e-60
## 
## $df
## [1] 6

H0: P = I H1: P ≠ I

Based on the results, we can reject H0 (p < 0.001) and conclude that at least one correlation coefficient is significantly different from 0.

library(psych)
KMO(R) 
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA =  0.5
## MSA for each item = 
## Life_expectancy         Imports         Exports          Income 
##            0.66            0.42            0.50            0.50

The overall MSA exceeds the threshold of 0.5, meaning this criterion is satisfied. However, the variable imports shows a slightly lower MSA value, as it has weaker correlations with other variables. When combining variables into principal components, imports would contribute the most to information loss.

library(FactoMineR) 
components <- PCA(mydata_PCA, 
                  scale.unit = TRUE,
                  graph = FALSE) 

library(factoextra) 
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
get_eigenvalue(components) 
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.2145756        55.364390                    55.36439
## Dim.2  1.2323857        30.809642                    86.17403
## Dim.3  0.4000768        10.001919                    96.17595
## Dim.4  0.1529620         3.824049                   100.00000

If we sum all the eigenvalues, we get 4, which corresponds to the four standardized initial variables, each with a variance of 1. Together, they represent the total variance in the dataset.

The variance of the first principal component (PC1) is 2.21, capturing 55.4% of the total variance in the measured variables. The second component (PC2) explains an additional 30.8%, so the first two components together account for 86.2% of the total information. The third component (PC3) contributes another 10.0%, bringing the cumulative variance explained to 96.2%. The fourth component (PC4) adds the remaining 3.8%, resulting in a total cumulative variance of 100%.

library(factoextra)
fviz_eig(components,
         choice = "eigenvalue",
         main = "Scree plot",
         ylab = "Eigenvalue",
         xlab = "Principal component",
         addlabels = TRUE)

library(psych)
fa.parallel(mydata_PCA, 
            sim = FALSE, 
            fa = "pc") 
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  2

Both scree plot and parallel analysis indicate a usage of two principle components for our data.

library(FactoMineR)
components <- PCA(mydata_PCA, 
                  scale.unit = TRUE, 
                  graph = FALSE,
                  ncp = 2) 

components
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 167 individuals, described by 4 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
print(components$var$cor) 
##                     Dim.1      Dim.2
## Life_expectancy 0.6474494  0.6170661
## Imports         0.6453242 -0.7089320
## Exports         0.8875505 -0.3481674
## Income          0.7688924  0.4772944

The table shows the loadings of each variable on the first two principal components (PC1 and PC2).

PC1 (Dim.1) is strongly influenced by Exports (0.89), Income (0.77), and Life Expectancy (0.65), suggesting it represents overall economic and development factors. PC2 (Dim.2) has a strong negative loading for Imports (-0.71) and a high positive loading for Life Expectancy (0.62), indicating it may capture differences in trade dependency and health-related aspects.

print(components$var$contrib) 
##                    Dim.1     Dim.2
## Life_expectancy 18.92872 30.897033
## Imports         18.80466 40.781433
## Exports         35.57096  9.836253
## Income          26.69566 18.485281

Also here PC1 (Dim.1) is mainly driven by Exports (35.57%) and Income (26.70%), indicating a strong economic influence. PC2 (Dim.2) is most influenced by Imports (40.78%) and Life Expectancy (30.90%), suggesting a relationship between trade dependency and health-related factors.

library(factoextra)
fviz_pca_var(components, 
             repel = TRUE)

Dim1 (55.4%) explains the majority of the variance, with Exports and Income strongly contributing. Dim2 (30.8%) captures additional variance, with Life_expectancy and Imports having significant loadings. Exports and Imports are negatively correlated, as seen from their opposite directions. Life_expectancy and Income are closely aligned, suggesting a strong positive relationship. This plot indicates that economic variables (Income, Exports) and trade dependence (Imports) play a major role in explaining variance in the dataset.

library(factoextra)
fviz_pca_biplot(components) 

Unit 124 is both in pc 1 general factors and pc 2 economic factors above average, as unit 132 is above average in general factors, but below average in economic factors. Unit 67 is in both factors below average.

head(components$ind$coord)
##        Dim.1       Dim.2
## 1 -1.8366657 -0.83782264
## 2 -0.1674340  0.30392694
## 3 -0.1598371  0.71862336
## 4 -0.4245703 -1.04462554
## 5  0.6705568  0.06706357
## 6 -0.7404905  1.43570230
components$ind$coord[108, ]
##     Dim.1     Dim.2 
## -2.281148  1.173604
mydata_PCA_std <- scale(mydata_PCA) 
mydata_PCA_std[108, ] 
## Life_expectancy         Imports         Exports          Income 
##      -0.4223115      -1.9341227      -1.4956939      -0.6963711

Unit 108 (Myanmar) shows following characteristics: Dim.1 (-2.28): Strongly negative, indicating that Myanmar has lower values for Exports, Income, and Imports compared to other countries. Dim.2 (1.17): Slightly positive, suggesting moderate differentiation along this component. Standardized values confirm that Myanmar has:

Very low Imports (-1.93) and Exports (-1.50), indicating limited trade activity. Lower-than-average Income (-0.70), reflecting weaker economic conditions. Slightly below-average Life Expectancy (-0.42), suggesting lower health and living standards.

mydata$PC1 <- components$ind$coord[ , 1] 
mydata$PC2 <- components$ind$coord[ , 2] 

head(mydata, 6)
##   ID             Country Children_death Exports Health Imports Income Inflation
## 1  1         Afghanistan           90.2    10.0   7.58    44.9   1610      9.44
## 2  2             Albania           16.6    28.0   6.55    48.6   9930      4.49
## 3  3             Algeria           27.3    38.4   4.17    31.4  12900     16.10
## 4  4              Angola          119.0    62.3   2.85    42.9   5900     22.40
## 5  5 Antigua and Barbuda           10.3    45.5   6.03    58.9  19100      1.44
## 6  6           Argentina           14.5    18.9   8.10    16.0  18700     20.90
##   Life_expectancy Total_fertility  GDPP        PC1         PC2
## 1            56.2            5.82   553 -1.8366657 -0.83782264
## 2            76.3            1.65  4090 -0.1674340  0.30392694
## 3            76.5            2.89  4460 -0.1598371  0.71862336
## 4            60.1            6.16  3530 -0.4245703 -1.04462554
## 5            76.8            2.13 12200  0.6705568  0.06706357
## 6            75.8            2.37 10300 -0.7404905  1.43570230
cor(x=mydata$PC1, y=mydata$PC2)
## [1] 6.285737e-16
fit <- lm(GDPP ~ PC1 + PC2 + Income + Imports + Inflation, 
          data = mydata) 

library(car)
vif(fit) 
##       PC1       PC2    Income   Imports Inflation 
## 13.233310  8.262497  6.043822 15.327407  1.211431
mean(vif(fit))
## [1] 8.815693
mydata$StdResid <- rstandard(fit)
mydata$StdFitted <- scale(fit$fitted.values)

hist(mydata$StdResid, 
     xlab = "Standardized residuals", 
     ylab = "Frequency", 
     main = "Histrogram of standardized residuals")

Histogramm shows potential outliers.

shapiro.test(mydata$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$StdResid
## W = 0.82433, p-value = 6.731e-13

The Shapiro-Wilk normality test results a p-value < 0.001, indicating that the residuals deviate significantly from a normal distribution.

library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_test_breusch_pagan(fit)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##               Data               
##  --------------------------------
##  Response : GDPP 
##  Variables: fitted values of GDPP 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    399.4468 
##  Prob > Chi2   =    7.266927e-89

The Breusch-Pagan test for heteroskedasticity results a p-value < 0.001, indicating strong evidence against the null hypothesis of homoscedasticity. This suggests that variance is not constant, meaning heteroskedasticity is present in the model. To address this, robust standard errors should be used.

head(mydata[order(mydata$StdResid), c("ID", "StdResid") ], 3)
##      ID  StdResid
## 124 124 -5.006183
## 24   24 -3.784531
## 83   83 -2.837365

Units 24 and 124 should be removed as their standardized residual value is below 3.

library(dplyr)
mydata <- mydata %>%
  filter(!ID %in% c(124, 24))