1. preparing the data

df <- read.csv("./WDI2025_for_2022.csv", header=TRUE, sep=",", dec=".")

column 485 = “GDP per capita, PPP (current international $)”

find out correlation results

# number of columns
n_cols <- ncol(df)

# reset the vector
cor_results <- numeric(n_cols)

# repeat for each column
for (i in 1:n_cols) {
  cor_results[i] <- if (is.numeric(df[[i]])) {
    cor(df[[485]], df[[i]], use = "complete.obs")
  } else {
    NA
  }
}
## Warning in cor(df[[485]], df[[i]], use = "complete.obs"): 표준편차가 0입니다
# dataframe with correlation results
cor_df <- data.frame(
  Column = 1:n_cols,
  Variable = colnames(df),
  Correlation = cor_results
)

# absolute value of correlation, descending order
cor_df <- cor_df[order(abs(cor_df$Correlation), decreasing = TRUE), ]

# top 10
head(cor_df, 10)
##     Column
## 485    485
## 912    912
## 484    484
## 705    705
## 510    510
## 509    509
## 704    704
## 738    738
## 486    486
## 425    425
##                                                                                                                                                           Variable
## 485                                                                                              GDP.per.capita..PPP..current.international.....NY.GDP.PCAP.PP.CD.
## 912                                                                                   Net.official.flows.from.UN.agencies..UNCTAD..current.US....DT.NFL.UNCTAD.CD.
## 484                                                                                        GDP.per.capita..PPP..constant.2021.international.....NY.GDP.PCAP.PP.KD.
## 705   Learning.poverty..Share.of.Male.Children.at.the.End.of.Primary.age.below.minimum.reading.proficiency.adjusted.by.Out.of.School.Children......SE.LPV.PRIM.MA.
## 510                                                                                              GNI.per.capita..PPP..current.international.....NY.GNP.PCAP.PP.CD.
## 509                                                                                        GNI.per.capita..PPP..constant.2021.international.....NY.GNP.PCAP.PP.KD.
## 704 Learning.poverty..Share.of.Female.Children.at.the.End.of.Primary.age.below.minimum.reading.proficiency.adjusted.by.Out.of.School.Children......SE.LPV.PRIM.FE.
## 738                                                    Male.pupils.below.minimum.reading.proficiency.at.end.of.primary......Low.GAML.threshold..SE.LPV.PRIM.LD.MA.
## 486                                                                                              GDP.per.person.employed..constant.2021.PPP.....SL.GDP.PCAP.EM.KD.
## 425                                                  Female.pupils.below.minimum.reading.proficiency.at.end.of.primary......Low.GAML.threshold..SE.LPV.PRIM.LD.FE.
##     Correlation
## 485   1.0000000
## 912  -1.0000000
## 484   0.9976572
## 705  -0.9826350
## 510   0.9805546
## 509   0.9804742
## 704  -0.9770293
## 738  -0.9751282
## 486   0.9671797
## 425  -0.9657136
write.csv(cor_df, "./cor.csv")

col 1299 = Services, value added per worker (constant 2015 US$) [NV.SRV.EMPL.KD] Value added per worker is a measure of labor productivity—value added per unit of input. Value added denotes the net output of a sector after adding up all outputs and subtracting intermediate inputs. Data are in constant 2015 U.S. dollars. Services corresponds to the International Standard Industrial Classification (ISIC) tabulation categories G-P (revision 3) or tabulation categories G-U (revision 4), and includes wholesale and retail trade and restaurants and hotels; transport, storage, and communications; financing, insurance, real estate, and business services; and community, social and personal services.

col 322 = Domestic general government health expenditure per capita, PPP (current international $) [SH.XPD.GHED.PP.CD] Public expenditure on health from domestic sources per capita expressed in international dollars at purchasing power parity.

1. a)

data_selected <- df[complete.cases(df[ , c(4, 485, 1299)]), c(4, 485, 1299)]

data_selected$log_GDP <- log(data_selected$GDP.per.capita..PPP..current.international.....NY.GDP.PCAP.PP.CD.)

colnames(data_selected) <- c("Country_Name", "GDP_per_capita", "Services", "log_GDP")

write.csv(data_selected, "./data_selected.csv")

1. b)

plot(x = data_selected$log_GDP,
     y = data_selected$Services,
     xlab = "log(GDP per capita, PPP)",
     ylab = "Services, value added per worker (constant 2015 US$)",
     main = "Scatterplot")

head(data_selected[order(-data_selected$Services), ])
##      Country_Name GDP_per_capita Services  log_GDP
## 111    Luxembourg      144750.80 204015.6 11.88277
## 32    Switzerland       90138.95 146974.4 11.40911
## 85        Ireland      137230.87 128238.8 11.82942
## 195 United States       78035.18 116191.1 11.26491
## 138        Norway      122516.24 108148.4 11.71600
## 11      Australia       65834.78 102962.8 11.09490
data_selected2 <- data_selected[-c(94, 27), ]

plot(x = data_selected2$log_GDP,
     y = data_selected2$Services,
     xlab = "log(GDP per capita, PPP)",
     ylab = "Services, value added per worker (constant 2015 US$)",
     main = "Scatterplot without outliers")

1. c)

plot(x = data_selected2$log_GDP,
            y = data_selected2$Services,
            xlab = "log(GDP per capita, PPP)",
            ylab = "Services, value added per worker (constant 2015 US$)",
            main = "Scatterplot without outliers & with highlight",
            col = "black")

austria <- which(data_selected2$Country_Name == "Austria")
x <- data_selected2$log_GDP[austria]
y <- data_selected2$Services[austria]

points(x, y, col = "red", pch = 19)
text(x, y, "Austria", pos = 4, col = "red")

korea <- which(data_selected2$Country_Name == "Korea, Rep.")
x1 <- data_selected2$log_GDP[korea]
y1 <- data_selected2$Services[korea]

points(x1, y1, col = "blue", pch = 19)
text(x1, y1, "Korea", pos = 4, col = "blue")

2. a)

lm(Services ~ log_GDP, data = data_selected2)
## 
## Call:
## lm(formula = Services ~ log_GDP, data = data_selected2)
## 
## Coefficients:
## (Intercept)      log_GDP  
##     -155570        18909

2. b)

model <- lm(Services ~ log_GDP, data = data_selected2)

# install.packages("stargazer")

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(model,
          type = "text",
          star.cutoffs = c(0.05, 0.01, 0.001)
          )
## 
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                               Services           
## -------------------------------------------------
## log_GDP                     18,908.760***        
##                              (1,113.455)         
##                                                  
## Constant                   -155,569.800***       
##                             (10,760.280)         
##                                                  
## -------------------------------------------------
## Observations                     172             
## R2                              0.629            
## Adjusted R2                     0.627            
## Residual Std. Error     17,109.600 (df = 170)    
## F Statistic           288.390*** (df = 1; 170)   
## =================================================
## Note:               *p<0.05; **p<0.01; ***p<0.001

2. c)

\(\beta_{0}\) = -155570 predicted value op log_GDP when Services = 0

\(\beta_{1}\) = 18909 one unit increase in log_GDP is associated with 18909 increase in Services, value added per worker (constant 2015 US$)

2. d)

summary(model)
## 
## Call:
## lm(formula = Services ~ log_GDP, data = data_selected2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30079 -12099  -4472   8668  60129 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -155570      10760  -14.46   <2e-16 ***
## log_GDP        18909       1114   16.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17110 on 170 degrees of freedom
## Multiple R-squared:  0.6291, Adjusted R-squared:  0.627 
## F-statistic: 288.4 on 1 and 170 DF,  p-value: < 2.2e-16

Multiple R-squared: 0.6291 explaining 0.63 of its variation

p-value < 2.2e-16 the model is statistically significant

2. e)

y_hat <- model$fitted.values

resid <- residuals(model)

2. f)

plot(x = data_selected2$log_GDP,
     y = resid,
     xlab = "log(GDP per capita, PPP)",
     ylab = "Residuals",
     main = "Scatter"
     )

mean(resid)
## [1] 1.164443e-13
cov(resid, data_selected2$log_GDP)
## [1] -8.12878e-12

mean(resid) = 1.164443e-13 approximately 0, residuals sum to zero(OLS)

cov(resid, data_selected2$log_GDP) = -8.12878e-12 approximately 0, residuals are uncorrelated with the independent variable

3. a)

plot(x = data_selected2$log_GDP,
     y = data_selected2$Services,
     xlab = "log(GDP per capita, PPP)",
     ylab = "Services, value added per worker (constant 2015 US$)",
     main = "Scatterplot without outliers2",
     col = "black")

abline(model, col = "red", lwd = 2)

3. b)

the fitted line does not fully capture the curvature observed in the data nonlinear model might provide a better fit