Dataset was used from Kaggle: https://www.kaggle.com/datasets/vipulgohel/clustering-pca-assignment
How the annual GDP growth rate can be explained by life expectancy, imports, exports and income?
#install.packages("readxl")
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
mydata <- read_xlsx("~/IMB/Mutivariat analysis/r podatki/country_data - HW 4.xlsx")
colnames(mydata) <- c("Country", "Children_death", "Exports", "Imports", "Income", "Inflation", "Life_expectancy", "Total_fertility", "GDPP")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Create a new ID column with unique identifiers
mydata <- mutate(mydata, ID = 1:nrow(mydata))
library(dplyr)
# Move the 'ID' column to the first position
mydata <- select(mydata, ID, everything())
head(mydata)
## # A tibble: 6 × 10
## ID Country Children_death Exports Imports Income Inflation Life_expectancy
## <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Afghani… 90.2 10 44.9 1610 9.44 56.2
## 2 2 Albania 16.6 28 48.6 9930 4.49 76.3
## 3 3 Algeria 27.3 38.4 31.4 12900 16.1 76.5
## 4 4 Angola 119 62.3 42.9 5900 22.4 60.1
## 5 5 Antigua… 10.3 45.5 58.9 19100 1.44 76.8
## 6 6 Argenti… 14.5 18.9 16 18700 20.9 75.8
## # ℹ 2 more variables: Total_fertility <dbl>, GDPP <dbl>
Explanation of dataset
summary(mydata[,c(-1,-2)])
## Children_death Exports Imports Income
## Min. : 2.60 Min. : 2.20 Min. : 0.0659 Min. : 609
## 1st Qu.: 8.25 1st Qu.: 23.80 1st Qu.: 30.2000 1st Qu.: 3355
## Median : 19.30 Median : 35.40 Median : 43.3000 Median : 9960
## Mean : 38.27 Mean : 41.76 Mean : 46.8902 Mean : 17145
## 3rd Qu.: 62.10 3rd Qu.: 51.40 3rd Qu.: 58.7500 3rd Qu.: 22800
## Max. :208.00 Max. :200.00 Max. :174.0000 Max. :125000
## Inflation Life_expectancy Total_fertility GDPP
## Min. :-4.210 Min. :32.10 Min. :1.150 Min. : 231
## 1st Qu.: 1.755 1st Qu.:65.30 1st Qu.:1.795 1st Qu.: 1330
## Median : 5.140 Median :73.10 Median :2.410 Median : 4660
## Mean : 7.157 Mean :70.56 Mean :2.948 Mean : 12964
## 3rd Qu.:10.350 3rd Qu.:76.80 3rd Qu.:3.880 3rd Qu.: 14050
## Max. :45.900 Max. :82.80 Max. :7.490 Max. :105000
library(psych)
describe(mydata[,c(-1,-2)])
## vars n mean sd median trimmed mad min
## Children_death 1 167 38.27 40.33 19.30 31.59 21.94 2.60
## Exports 2 167 41.76 27.72 35.40 38.22 20.90 2.20
## Imports 3 167 46.89 24.21 43.30 44.34 21.05 0.07
## Income 4 167 17144.69 19278.07 9960.00 13807.63 11638.41 609.00
## Inflation 5 167 7.16 7.48 5.14 6.15 5.66 -4.21
## Life_expectancy 6 167 70.56 8.89 73.10 71.33 8.90 32.10
## Total_fertility 7 167 2.95 1.51 2.41 2.77 1.22 1.15
## GDPP 8 167 12964.16 18328.70 4660.00 9146.43 5814.76 231.00
## max range skew kurtosis se
## Children_death 2.08e+02 205.40 1.42 1.62 3.12
## Exports 2.00e+02 197.80 2.36 9.05 2.15
## Imports 1.74e+02 173.93 1.87 6.41 1.87
## Income 1.25e+05 124391.00 2.19 6.67 1491.78
## Inflation 4.59e+01 50.11 1.78 4.91 0.58
## Life_expectancy 8.28e+01 50.70 -0.95 1.03 0.69
## Total_fertility 7.49e+00 6.34 0.95 -0.25 0.12
## GDPP 1.05e+05 104769.00 2.18 5.23 1418.32
Explanations of parameters
PCA analysis
For the PCA analysis some requirements need to be meet:
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
fit <- lm(GDPP ~ Children_death + Exports + Imports + Income + Inflation + Life_expectancy + Total_fertility,
data = mydata)
library(car)
vif(fit)
## Children_death Exports Imports Income Inflation
## 7.410763 2.934215 2.433882 2.097863 1.238293
## Life_expectancy Total_fertility
## 5.754270 3.816680
mean(vif(fit))
## [1] 3.669424
mydata_PCA <- mydata[, c("Life_expectancy", "Imports", "Exports", "Income")]
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
round(stat.desc(mydata_PCA, basic = FALSE), 2)
## Life_expectancy Imports Exports Income
## median 73.10 43.30 35.40 9960.00
## mean 70.56 46.89 41.76 17144.69
## SE.mean 0.69 1.87 2.15 1491.78
## CI.mean.0.95 1.36 3.70 4.24 2945.31
## var 79.09 586.10 768.63 371643894.16
## std.dev 8.89 24.21 27.72 19278.07
## coef.var 0.13 0.52 0.66 1.12
R <- cor(mydata_PCA)
round(R, 3)
## Life_expectancy Imports Exports Income
## Life_expectancy 1.000 0.054 0.303 0.612
## Imports 0.054 1.000 0.683 0.122
## Exports 0.303 0.683 1.000 0.494
## Income 0.612 0.122 0.494 1.000
library(psych)
cortest.bartlett(R, n = nrow(mydata_PCA))
## $chisq
## [1] 246.1815
##
## $p.value
## [1] 2.684382e-50
##
## $df
## [1] 6
BSD we can reject H0 (p<0.001) and conclude that at least one correlation coeffcient differ from 0.
det(R)
## [1] 0.2225433
library(psych)
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA = 0.52
## MSA for each item =
## Life_expectancy Imports Exports Income
## 0.63 0.45 0.52 0.53
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.2
components <- PCA(mydata_PCA,
scale.unit = TRUE,
graph = FALSE)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
get_eigenvalue(components)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.1664236 54.160591 54.16059
## Dim.2 1.2208981 30.522453 84.68304
## Dim.3 0.4048570 10.121424 94.80447
## Dim.4 0.2078213 5.195532 100.00000
library(factoextra)
fviz_eig(components,
choice = "eigenvalue",
main = "Scree plot",
ylab = "Eigenvalue",
xlab = "Principal component",
addlabels = TRUE)
- We should take 2 components, break is at third component, and we
should take one less, according to the formla n-1
library(psych)
fa.parallel(mydata_PCA,
sim = FALSE,
fa = "pc")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = NA and the number of components = 2
library(FactoMineR)
components <- PCA(mydata_PCA,
scale.unit = TRUE,
graph = FALSE,
ncp = 2)
components
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 167 individuals, described by 4 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
print(components$var$cor)
## Dim.1 Dim.2
## Life_expectancy 0.6605012 -0.6024900
## Imports 0.6178233 0.7173255
## Exports 0.8628375 0.3623410
## Income 0.7771535 -0.4604965
print(components$var$contrib)
## Dim.1 Dim.2
## Life_expectancy 20.13742 29.73174
## Imports 17.61916 42.14568
## Exports 34.36486 10.75364
## Income 27.87856 17.36894
library(factoextra)
fviz_pca_var(components,
repel = TRUE)
- PC1 measures General socio-economic factors - PC2 measures contrast
between economic factors (imports and exports) and social factors
(income and life expectancy)
Although income is more of an economic factor than a social one, I would categorize it as a social factor for the purpose of easier explanation.
library(factoextra)
fviz_pca_biplot(components)
head(components$ind$coord)
## Dim.1 Dim.2
## 1 -1.8615211 0.78934874
## 2 -0.1695789 -0.31409276
## 3 -0.1564174 -0.73002579
## 4 -0.4718798 1.02316985
## 5 0.6579004 -0.05902964
## 6 -0.7138841 -1.45828880
components$ind$coord[124, ]
## Dim.1 Dim.2
## 3.449634 -3.266095
mydata_PCA_std <- scale(mydata_PCA)
mydata_PCA_std[124, ]
## Life_expectancy Imports Exports Income
## 1.0057504 -0.9537632 0.7408327 5.5947159
mydata$PC1 <- components$ind$coord[ , 1]
mydata$PC2 <- components$ind$coord[ , 2]
head(mydata, 3)
## # A tibble: 3 × 12
## ID Country Children_death Exports Imports Income Inflation Life_expectancy
## <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Afghani… 90.2 10 44.9 1610 9.44 56.2
## 2 2 Albania 16.6 28 48.6 9930 4.49 76.3
## 3 3 Algeria 27.3 38.4 31.4 12900 16.1 76.5
## # ℹ 4 more variables: Total_fertility <dbl>, GDPP <dbl>, PC1 <dbl>, PC2 <dbl>
cor(x=mydata$PC1, y=mydata$PC2)
## [1] -1.569485e-16
fit <- lm(GDPP ~ PC1 + PC2 + Income + Imports + Inflation,
data = mydata)
library(car)
vif(fit)
## PC1 PC2 Income Imports Inflation
## 9.804409 6.968546 5.659903 11.122897 1.185929
mean(vif(fit))
## [1] 6.948337
mydata$StdResid <- rstandard(fit)
mydata$StdFitted <- scale(fit$fitted.values)
hist(mydata$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histrogram of standardized residuals")
shapiro.test(mydata$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydata$StdResid
## W = 0.83296, p-value = 1.532e-12
library(ggplot2)
ggplot(mydata, aes(y=StdResid, x=StdFitted)) +
theme_dark() +
geom_point(colour = "snow") +
ylab("Standardized residuals") +
xlab("Standardized fitted values")
library(olsrr)
ols_test_breusch_pagan(fit)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## --------------------------------
## Response : GDPP
## Variables: fitted values of GDPP
##
## Test Summary
## -------------------------------
## DF = 1
## Chi2 = 378.8171
## Prob > Chi2 = 2.251682e-84
head(mydata[order(mydata$StdResid), c("ID", "StdResid") ], 3)
## # A tibble: 3 × 2
## ID StdResid
## <int> <dbl>
## 1 124 -4.91
## 2 24 -3.65
## 3 83 -2.80
library(dplyr)
mydata <- mydata %>%
filter(!ID %in% c(124, 24))