df <- read.csv("./WDI2025_for_2022.csv", header=TRUE, sep=",", dec=".")
column 485 = “GDP per capita, PPP (current international $)”
# number of columns
n_cols <- ncol(df)
# reset the vector
cor_results <- numeric(n_cols)
# repeat for each column
for (i in 1:n_cols) {
cor_results[i] <- if (is.numeric(df[[i]])) {
cor(df[[485]], df[[i]], use = "complete.obs")
} else {
NA
}
}
## Warning in cor(df[[485]], df[[i]], use = "complete.obs"): 표준편차가 0입니다
# dataframe with correlation results
cor_df <- data.frame(
Column = 1:n_cols,
Variable = colnames(df),
Correlation = cor_results
)
# absolute value of correlation, descending order
cor_df <- cor_df[order(abs(cor_df$Correlation), decreasing = TRUE), ]
# top 10
head(cor_df, 10)
## Column
## 485 485
## 912 912
## 484 484
## 705 705
## 510 510
## 509 509
## 704 704
## 738 738
## 486 486
## 425 425
## Variable
## 485 GDP.per.capita..PPP..current.international.....NY.GDP.PCAP.PP.CD.
## 912 Net.official.flows.from.UN.agencies..UNCTAD..current.US....DT.NFL.UNCTAD.CD.
## 484 GDP.per.capita..PPP..constant.2021.international.....NY.GDP.PCAP.PP.KD.
## 705 Learning.poverty..Share.of.Male.Children.at.the.End.of.Primary.age.below.minimum.reading.proficiency.adjusted.by.Out.of.School.Children......SE.LPV.PRIM.MA.
## 510 GNI.per.capita..PPP..current.international.....NY.GNP.PCAP.PP.CD.
## 509 GNI.per.capita..PPP..constant.2021.international.....NY.GNP.PCAP.PP.KD.
## 704 Learning.poverty..Share.of.Female.Children.at.the.End.of.Primary.age.below.minimum.reading.proficiency.adjusted.by.Out.of.School.Children......SE.LPV.PRIM.FE.
## 738 Male.pupils.below.minimum.reading.proficiency.at.end.of.primary......Low.GAML.threshold..SE.LPV.PRIM.LD.MA.
## 486 GDP.per.person.employed..constant.2021.PPP.....SL.GDP.PCAP.EM.KD.
## 425 Female.pupils.below.minimum.reading.proficiency.at.end.of.primary......Low.GAML.threshold..SE.LPV.PRIM.LD.FE.
## Correlation
## 485 1.0000000
## 912 -1.0000000
## 484 0.9976572
## 705 -0.9826350
## 510 0.9805546
## 509 0.9804742
## 704 -0.9770293
## 738 -0.9751282
## 486 0.9671797
## 425 -0.9657136
write.csv(cor_df, "./cor.csv")
col 1299 = Services, value added per worker (constant 2015 US$) [NV.SRV.EMPL.KD] Value added per worker is a measure of labor productivity—value added per unit of input. Value added denotes the net output of a sector after adding up all outputs and subtracting intermediate inputs. Data are in constant 2015 U.S. dollars. Services corresponds to the International Standard Industrial Classification (ISIC) tabulation categories G-P (revision 3) or tabulation categories G-U (revision 4), and includes wholesale and retail trade and restaurants and hotels; transport, storage, and communications; financing, insurance, real estate, and business services; and community, social and personal services.
col 322 = Domestic general government health expenditure per capita, PPP (current international $) [SH.XPD.GHED.PP.CD] Public expenditure on health from domestic sources per capita expressed in international dollars at purchasing power parity.
data_selected <- df[complete.cases(df[ , c(4, 485, 1299)]), c(4, 485, 1299)]
data_selected$log_GDP <- log(data_selected$GDP.per.capita..PPP..current.international.....NY.GDP.PCAP.PP.CD.)
colnames(data_selected) <- c("Country_Name", "GDP_per_capita", "Services", "log_GDP")
write.csv(data_selected, "./data_selected.csv")
plot(x = data_selected$log_GDP,
y = data_selected$Services,
xlab = "log(GDP per capita, PPP)",
ylab = "Services, value added per worker (constant 2015 US$)",
main = "Scatterplot")
head(data_selected[order(-data_selected$Services), ])
## Country_Name GDP_per_capita Services log_GDP
## 111 Luxembourg 144750.80 204015.6 11.88277
## 32 Switzerland 90138.95 146974.4 11.40911
## 85 Ireland 137230.87 128238.8 11.82942
## 195 United States 78035.18 116191.1 11.26491
## 138 Norway 122516.24 108148.4 11.71600
## 11 Australia 65834.78 102962.8 11.09490
data_selected2 <- data_selected[-c(94, 27), ]
plot(x = data_selected2$log_GDP,
y = data_selected2$Services,
xlab = "log(GDP per capita, PPP)",
ylab = "Services, value added per worker (constant 2015 US$)",
main = "Scatterplot without outliers")
plot(x = data_selected2$log_GDP,
y = data_selected2$Services,
xlab = "log(GDP per capita, PPP)",
ylab = "Services, value added per worker (constant 2015 US$)",
main = "Scatterplot without outliers & with highlight",
col = "black")
austria <- which(data_selected2$Country_Name == "Austria")
x <- data_selected2$log_GDP[austria]
y <- data_selected2$Services[austria]
points(x, y, col = "red", pch = 19)
text(x, y, "Austria", pos = 4, col = "red")
korea <- which(data_selected2$Country_Name == "Korea, Rep.")
x1 <- data_selected2$log_GDP[korea]
y1 <- data_selected2$Services[korea]
points(x1, y1, col = "blue", pch = 19)
text(x1, y1, "Korea", pos = 4, col = "blue")
lm(Services ~ log_GDP, data = data_selected2)
##
## Call:
## lm(formula = Services ~ log_GDP, data = data_selected2)
##
## Coefficients:
## (Intercept) log_GDP
## -155570 18909
model <- lm(Services ~ log_GDP, data = data_selected2)
# install.packages("stargazer")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(model,
type = "text",
star.cutoffs = c(0.05, 0.01, 0.001)
)
##
## =================================================
## Dependent variable:
## -----------------------------
## Services
## -------------------------------------------------
## log_GDP 18,908.760***
## (1,113.455)
##
## Constant -155,569.800***
## (10,760.280)
##
## -------------------------------------------------
## Observations 172
## R2 0.629
## Adjusted R2 0.627
## Residual Std. Error 17,109.600 (df = 170)
## F Statistic 288.390*** (df = 1; 170)
## =================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
\(\beta_{0}\) = -155570 predicted value op log_GDP when Services = 0
\(\beta_{1}\) = 18909 one unit increase in log_GDP is associated with 18909 increase in Services, value added per worker (constant 2015 US$)
summary(model)
##
## Call:
## lm(formula = Services ~ log_GDP, data = data_selected2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30079 -12099 -4472 8668 60129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -155570 10760 -14.46 <2e-16 ***
## log_GDP 18909 1114 16.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17110 on 170 degrees of freedom
## Multiple R-squared: 0.6291, Adjusted R-squared: 0.627
## F-statistic: 288.4 on 1 and 170 DF, p-value: < 2.2e-16
Multiple R-squared: 0.6291 explaining 0.63 of its variation
p-value < 2.2e-16 the model is statistically significant
y_hat <- model$fitted.values
resid <- residuals(model)
plot(x = data_selected2$log_GDP,
y = resid,
xlab = "log(GDP per capita, PPP)",
ylab = "Residuals",
main = "Scatter"
)
mean(resid)
## [1] 1.164443e-13
cov(resid, data_selected2$log_GDP)
## [1] -8.12878e-12
mean(resid) = 1.164443e-13 approximately 0, residuals sum to zero(OLS)
cov(resid, data_selected2$log_GDP) = -8.12878e-12 approximately 0, residuals are uncorrelated with the independent variable
plot(x = data_selected2$log_GDP,
y = data_selected2$Services,
xlab = "log(GDP per capita, PPP)",
ylab = "Services, value added per worker (constant 2015 US$)",
main = "Scatterplot without outliers2",
col = "black")
abline(model, col = "red", lwd = 2)
the fitted line does not fully capture the curvature observed in the data nonlinear model might provide a better fit