Final Project Data Science

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

# Install libraries (run once; uncomment any package that is not yet installed)
##install.packages("GGally")
##install.packages("corrplot")
##install.packages("Hmisc")

# Load libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(httr)
library(jsonlite)

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

library(ggplot2)
library(GGally)

## Warning: package 'GGally' was built under R version 4.5.1

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.5.1

## corrplot 0.95 loaded

library(Hmisc)

## Warning: package 'Hmisc' was built under R version 4.5.1

## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(dplyr)
library(knitr)
library(broom)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

## Pengumpulan Data dari World Bank API
# World Bank indicator codes, keyed by the short name used for each series
# in the rest of the analysis.
indicators <- as.list(c(
  gdp_pc  = "NY.GDP.PCAP.CD",
  rnd_exp = "GB.XPD.RSDV.GD.ZS",
  edu_exp = "SE.XPD.TOTL.GD.ZS",
  acc_own = "FX.OWN.TOTL.ZS"
))

# Download one World Bank indicator for all countries and all years.
#
# @param indicator World Bank indicator code as a single string,
#   e.g. "NY.GDP.PCAP.CD".
# @return A tibble with one row per country-year and the columns
#   `country` (country name), `iso3c` (ISO 3166-1 alpha-3 code),
#   `year` (integer) and `value` (the indicator value as returned by the API).
# Stops with an error when the HTTP request fails or the API reply contains
# no observations; warns when the reply was split over several pages.
get_data <- function(indicator) {
  url <- paste0("https://api.worldbank.org/v2/countries/all/indicators/",
                indicator, "?format=json&per_page=20000")

  response <- GET(url)
  # Fail loudly on HTTP errors instead of trying to parse an error body.
  stop_for_status(response)

  # Pass the encoding explicitly so httr does not have to guess it.
  payload <- fromJSON(
    content(response, as = "text", encoding = "UTF-8"),
    flatten = TRUE
  )

  # A successful reply has two elements (paging metadata, observations).
  # An API-level error (e.g. unknown indicator) has no second element.
  if (length(payload) < 2 || is.null(payload[[2]])) {
    stop("World Bank API returned no data for indicator '", indicator, "'",
         call. = FALSE)
  }

  # Only the first page is read; make truncation visible if it ever happens.
  page_count <- payload[[1]]$pages
  if (!is.null(page_count) && page_count > 1) {
    warning("Indicator '", indicator, "' spans ", page_count,
            " pages; only the first page was read", call. = FALSE)
  }

  payload[[2]] %>%
    as_tibble() %>%
    transmute(
      country = country.value,
      # `country.id` carries the 2-letter code; `countryiso3code` is the
      # 3-letter code that the column name promises.
      iso3c = countryiso3code,
      year = as.integer(date),
      value = value
    )
}

# Fetch every indicator series and key the resulting list by the short
# indicator names.
data_list <- setNames(lapply(indicators, get_data), names(indicators))

## Pembersihan dan Penggabungan Data
# Merge the per-indicator tables into one wide table with one row per
# country-year. Each table's `value` column is renamed to its indicator name
# (taken from the names of `data_list`) BEFORE joining, so the result does not
# depend on join suffixes or on a hard-coded, positional list of column names.
data_cleaned <- data_list %>%
  imap(function(indicator_tbl, indicator_name) {
    # all_of() with a named vector renames old = "value" to new = indicator_name
    dplyr::rename(indicator_tbl, all_of(setNames("value", indicator_name)))
  }) %>%
  reduce(full_join, by = c("country", "iso3c", "year"))

## Hapus NA dan data duplikat
# Keep only country-year rows in which all four indicators are observed,
# then drop exact duplicate rows.
final_data <- data_cleaned %>%
  drop_na(gdp_pc, rnd_exp, edu_exp, acc_own) %>%
  distinct()

## Analisis Korelasi
### Transformasi log GDP per kapita

# Add the natural log of GDP per capita as a new column.
final_data <- mutate(final_data, log_gdp_pc = log(gdp_pc))

## Analisis Korelasi
### Korelasi dan Visualisasi

# Columns that enter the correlation analysis.
vars <- select(final_data, log_gdp_pc, rnd_exp, edu_exp, acc_own)

# Pearson correlations; the result holds the coefficients in $r and the
# matching p-values in $P.
cor_matrix <- vars %>%
  as.matrix() %>%
  rcorr(type = "pearson")

# Heatmap of the upper triangle with the coefficient printed in each cell.
corrplot(
  cor_matrix$r,
  method = "color",
  type = "upper",
  tl.cex = 0.8,
  addCoef.col = "black"
)

## Analisis Korelasi
### Scatter Plot dengan Garis Tren
# One scatter plot per explanatory variable against log GDP per capita,
# each with a fitted linear trend line.
plot_list <- local({
  # Layers shared by all three plots: raw points plus a linear-model smoother.
  trend_layers <- list(geom_point(), geom_smooth(method = "lm"))

  list(
    ggplot(final_data, aes(x = rnd_exp, y = log_gdp_pc)) +
      trend_layers +
      ggtitle("R&D vs Log(GDP per capita)"),
    ggplot(final_data, aes(x = edu_exp, y = log_gdp_pc)) +
      trend_layers +
      ggtitle("Education vs Log(GDP per capita)"),
    ggplot(final_data, aes(x = acc_own, y = log_gdp_pc)) +
      trend_layers +
      ggtitle("Account Ownership vs Log(GDP per capita)")
  )
})

# Render the three scatter plots in order: R&D, education, account ownership.
print(plot_list[[1]])

## `geom_smooth()` using formula = 'y ~ x'

print(plot_list[[2]])

## `geom_smooth()` using formula = 'y ~ x'

print(plot_list[[3]])

## `geom_smooth()` using formula = 'y ~ x'

## Analisis Korelasi
### Uji Signifikansi Korelasi
# Table of pairwise correlation p-values. The raw values are far below
# kable's default rounding precision and would all print as a literal 0, so
# they are formatted first: anything below 0.001 is shown as "<0.001" and the
# NA diagonal is kept as "NA".
p_matrix <- cor_matrix$P
kable(
  matrix(
    format.pval(as.vector(p_matrix), digits = 3, eps = 0.001),
    nrow = nrow(p_matrix),
    dimnames = dimnames(p_matrix)
  ),
  caption = "P-value Matrix untuk Uji Signifikansi Korelasi"
)

P-value Matrix untuk Uji Signifikansi Korelasi
	log_gdp_pc	rnd_exp	edu_exp	acc_own
log_gdp_pc	NA	0	0	0
rnd_exp	0	NA	0	0
edu_exp	0	0	NA	0
acc_own	0	0	0	NA

## Analisis Temuan
### Variabel dengan Dampak Terbesar
# Ordinary least squares fit of log GDP per capita on the three indicators
# (R&D expenditure, education expenditure, account ownership), using every
# row of final_data.
model <- lm(log_gdp_pc ~ rnd_exp + edu_exp + acc_own, data = final_data)
# Print coefficients, standard errors, R-squared and the overall F-test.
summary(model)

## 
## Call:
## lm(formula = log_gdp_pc ~ rnd_exp + edu_exp + acc_own, data = final_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.96257 -0.40599  0.04058  0.39261  1.86672 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.743688   0.147812  45.624  < 2e-16 ***
## rnd_exp     0.263443   0.043609   6.041  3.9e-09 ***
## edu_exp     0.053997   0.029444   1.834   0.0675 .  
## acc_own     0.029876   0.001736  17.208  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6448 on 352 degrees of freedom
## Multiple R-squared:  0.7174, Adjusted R-squared:  0.7149 
## F-statistic: 297.8 on 3 and 352 DF,  p-value: < 2.2e-16

# Tampilkan hasil regresi dalam bentuk tabel
# Coefficient table with confidence intervals. `reg_table` itself keeps the
# numeric p-values; they are formatted only for display, because kable's
# default rounding would otherwise print very small p-values as 0.0000000.
reg_table <- tidy(model, conf.int = TRUE)
kable(
  mutate(reg_table, p.value = format.pval(p.value, digits = 3, eps = 0.001)),
  caption = "Hasil Regresi Linear: log(GDP per capita) terhadap Variabel Sosial Ekonomi (dengan Confidence Interval)"
)

Hasil Regresi Linear: log(GDP per capita) terhadap Variabel Sosial Ekonomi (dengan Confidence Interval)
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	6.7436881	0.1478116	45.623538	0.0000000	6.4529832	7.0343931
rnd_exp	0.2634431	0.0436094	6.040966	0.0000000	0.1776753	0.3492109
edu_exp	0.0539973	0.0294436	1.833922	0.0675096	-0.0039102	0.1119048
acc_own	0.0298764	0.0017362	17.207869	0.0000000	0.0264617	0.0332910

# Nilai R-squared dan uji F
# Pull R-squared and the overall F-test out of the model summary.
# `fstatistic` holds, in order, the F value and its two degrees of freedom.
model_summary <- summary(model)
r_squared <- model_summary[["r.squared"]]
f_statistic <- model_summary[["fstatistic"]][1]
df1 <- model_summary[["fstatistic"]][2]
df2 <- model_summary[["fstatistic"]][3]
# Upper-tail probability of the F distribution = p-value of the overall test.
p_value <- pf(q = f_statistic, df1 = df1, df2 = df2, lower.tail = FALSE)

# VIF (Multikolinearitas)
# Variance inflation factor per predictor (multicollinearity check).
vif_values <- car::vif(model)
# as.data.frame() is called on the variable directly so the column header
# stays "vif_values".
vif_table <- as.data.frame(vif_values)
kable(
  vif_table,
  caption = "Nilai Variance Inflation Factor (VIF) untuk Setiap Variabel Independen"
)

Nilai Variance Inflation Factor (VIF) untuk Setiap Variabel Independen
	vif_values
rnd_exp	1.815551
edu_exp	1.169495
acc_own	1.802186

# tabel hasil statistik model
# One-row summary of the model fit statistics. `table_stat` keeps the numeric
# p-value; it is formatted only for display, because kable's default rounding
# would otherwise print the very small p-value as 0.
table_stat <- tibble(
  `R-squared` = r_squared,
  `F-statistic` = f_statistic,
  `df1` = df1,
  `df2` = df2,
  `p-value` = p_value
)
kable(
  mutate(
    table_stat,
    `p-value` = format.pval(`p-value`, digits = 3, eps = 0.001)
  ),
  caption = "Ringkasan Statistik Model Regresi"
)

Ringkasan Statistik Model Regresi
R-squared	F-statistic	df1	df2	p-value
0.7173527	297.7895	3	352	0

## Analisis Temuan
### Perbandingan untuk 10 Negara dengan GDP per kapita Tertinggi
# Top 10 countries by GDP per capita, using each country's most recent
# complete observation.
#
# The indicator data were requested for "countries/all", which also returns
# regional and income-group aggregates (e.g. "North America"). Those are not
# countries, so they are removed using the World Bank country metadata, where
# aggregates are marked with region "Aggregates".
country_meta_resp <- GET(
  "https://api.worldbank.org/v2/country?format=json&per_page=400"
)
stop_for_status(country_meta_resp)
country_meta <- fromJSON(
  content(country_meta_resp, as = "text", encoding = "UTF-8"),
  flatten = TRUE
)[[2]]
aggregate_names <- country_meta$name[country_meta$region.value == "Aggregates"]

top10_countries <- final_data %>%
  filter(!country %in% aggregate_names) %>%
  group_by(country) %>%
  filter(year == max(year)) %>%
  # Drop the grouping so later steps operate on a plain tibble.
  ungroup() %>%
  arrange(desc(gdp_pc)) %>%
  head(10)

kable(top10_countries[, c("country", "year", "gdp_pc", "rnd_exp", "edu_exp", "acc_own")])

country	year	gdp_pc	rnd_exp	edu_exp	acc_own
Luxembourg	2017	110193.21	1.23898	3.486460	98.77
Ireland	2021	103961.98	1.11065	3.005980	99.66
Switzerland	2021	93664.77	3.30645	5.037090	99.49
Norway	2021	93072.89	1.88764	6.960220	99.48
United States	2021	71307.40	3.48313	5.428300	94.95
Denmark	2021	69727.99	2.76142	6.999930	100.00
North America	2021	69412.91	3.35576	5.088450	95.45
Iceland	2021	69178.40	2.77875	8.217810	100.00
Singapore	2017	61236.25	1.89913	2.756848	97.93
Sweden	2021	61174.97	3.40216	7.572200	99.69

## Analisis Temuan
### Korelasi antar Variabel Sosial Ekonomi
# Correlations among the three indicators, restricted to the rows of
# top10_countries; each column is coerced to numeric first.
vars2 <- top10_countries %>%
  ungroup() %>%
  transmute(across(c(rnd_exp, edu_exp, acc_own), as.numeric))

# Show the correlation matrix as numbers, using pairwise-complete rows.
vars2 %>%
  cor(use = "pairwise.complete.obs") %>%
  corrplot(method = "number")

Final Project Data Science - MET FEB UNPAD

Robby Fathir Nashary - 120720240502

2025-07-11

R Markdown