# Install tidyverse only when it is not already available, so re-running the
# script does not re-download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Show the current working directory before loading the data.
getwd()
## [1] "/cloud/project/Case Studies"
# Read the CSV from its absolute path into a data frame.
worldbank2014 <- read.csv(
  file = "/cloud/project/Case Studies/wb_GDP_merged - from worldbank_cleannoregions.csv"
)
# Print the structure of the data frame to verify each column's type.
str(object = worldbank2014)
## 'data.frame': 104 obs. of 13 variables:
## $ country_name : chr "Macao SAR, China" "Norway" "Switzerland" "Qatar" ...
## $ country_code : chr "MAC" "NOR" "CHE" "QAT" ...
## $ GDP_to_InternetUsers: num 1343 1012 995 954 748 ...
## $ SE_COM_DURS : int 10 10 12 9 12 10 12 10 9 9 ...
## $ SE_PRM_ENRL : int 22862 425917 483886 117454 24538372 2168531 359064 467484 327247 757164 ...
## $ SE_SEC_ENRL : int 32054 439250 616127 88466 24229776 2371284 650998 553791 697388 826694 ...
## $ UIS_E_3 : int 16655 249882 350927 40991 11736315 1104162 233794 311359 363188 502035 ...
## $ NY_GDP_PCAP_CD : num 94004 97200 86606 86853 54599 ...
## $ SE_XPD_TOTL_GD_ZS : num 2.03 NA 5.1 3.61 5.38 5.23 NA NA 5.5 7.68 ...
## $ IT_NET_USER_P2 : int 70 96 87 91 73 84 12 96 81 93 ...
## $ UIS_SAP_CE : int NA 615879 956774 NA 49488316 NA 1164601 669247 743482 910369 ...
## $ SP_POP_TOTL : int 588781 5137232 8188649 2374419 318563456 23460694 5466241 5643475 8541575 9696110 ...
## $ SL_UEM_TOTL_ZS : num 1.7 3.5 4.5 0.2 6.2 6.1 8.9 6.6 5.6 8 ...
# Pull each analysis column into a short-named vector. The models fitted
# further down use y1 and y2 as responses and x1..x8 as predictors.
y1 <- worldbank2014$UIS_E_3
y2 <- worldbank2014$SL_UEM_TOTL_ZS
x1 <- worldbank2014$SE_COM_DURS
x2 <- worldbank2014$SE_PRM_ENRL
x3 <- worldbank2014$SE_SEC_ENRL
x4 <- worldbank2014$NY_GDP_PCAP_CD
x5 <- worldbank2014$SE_XPD_TOTL_GD_ZS
x6 <- worldbank2014$IT_NET_USER_P2
x7 <- worldbank2014$UIS_SAP_CE
x8 <- worldbank2014$SP_POP_TOTL
# Assemble the numeric columns into a data frame (string columns left out).
# data.frame() names the columns after its arguments and keeps each column's
# own type, so the intermediate matrix built by cbind() is not needed.
wb_datatable <- data.frame(y1, y2, x1, x2, x3, x4, x5, x6, x7, x8)
# cor() defaults to use = "everything", which yields NA for every pair that
# involves a column containing a missing value. Keep a pairwise-complete
# matrix as well so those pairs can be inspected (print wb_cor_pairwise).
wb_cor_pairwise <- cor(wb_datatable, use = "pairwise.complete.obs")
# Default call, kept as-is so the matrix printed below is unchanged.
cor(wb_datatable)
## y1 y2 x1 x2 x3 x4 x5 x6 x7
## y1 1.000000000 -0.08500817 NA NA NA -0.008012006 NA 0.007571796 NA
## y2 -0.085008174 1.00000000 NA NA NA -0.188907156 NA 0.098906733 NA
## x1 NA NA 1 NA NA NA NA NA NA
## x2 NA NA NA 1 NA NA NA NA NA
## x3 NA NA NA NA 1 NA NA NA NA
## x4 -0.008012006 -0.18890716 NA NA NA 1.000000000 NA 0.749030201 NA
## x5 NA NA NA NA NA NA 1 NA NA
## x6 0.007571796 0.09890673 NA NA NA 0.749030201 NA 1.000000000 NA
## x7 NA NA NA NA NA NA NA NA 1
## x8 0.994671905 -0.09424925 NA NA NA -0.031578237 NA -0.020014588 NA
## x8
## y1 0.99467191
## y2 -0.09424925
## x1 NA
## x2 NA
## x3 NA
## x4 -0.03157824
## x5 NA
## x6 -0.02001459
## x7 NA
## x8 1.00000000
# Many pairwise correlations are NA because the columns involved contain
# missing values and cor() was called with its default use = "everything".
# This project focuses on the pairs that do have correlation coefficients.
# Fit the candidate linear models on wb_datatable.
# model1 / model2: the x4-x6 pair, regressed in each direction.
model1 <- lm(formula = x4 ~ x6, data = wb_datatable)
model2 <- lm(formula = x6 ~ x4, data = wb_datatable)
# model3 / model4: each response (y1, then y2) on all eight predictors.
model3 <- lm(
  formula = y1 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8,
  data = wb_datatable
)
model4 <- lm(
  formula = y2 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8,
  data = wb_datatable
)
# Coefficient table and fit statistics for model1 (x4 regressed on x6).
summary(model1)
##
## Call:
## lm(formula = x4 ~ x6, data = wb_datatable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24708 -10056 -1301 6755 64338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11389.86 2893.54 -3.936 0.000151 ***
## x6 586.52 51.37 11.418 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14920 on 102 degrees of freedom
## Multiple R-squared: 0.561, Adjusted R-squared: 0.5567
## F-statistic: 130.4 on 1 and 102 DF, p-value: < 2.2e-16
# Same pair with the roles swapped (x6 regressed on x4); the recorded
# R-squared below is the same value reported for model1.
summary(model2)
##
## Call:
## lm(formula = x6 ~ x4, data = wb_datatable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.15 -16.60 4.28 13.55 35.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.223e+01 2.355e+00 13.68 <2e-16 ***
## x4 9.566e-04 8.378e-05 11.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.06 on 102 degrees of freedom
## Multiple R-squared: 0.561, Adjusted R-squared: 0.5567
## F-statistic: 130.4 on 1 and 102 DF, p-value: < 2.2e-16
# Full model for y1 on x1..x8. The recorded output below reports 55
# observations deleted due to missingness, leaving 40 residual degrees of
# freedom.
summary(model3)
##
## Call:
## lm(formula = y1 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, data = wb_datatable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -675638 -98950 -31225 65367 1189773
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.902e+05 3.106e+05 -0.934 0.355741
## x1 -5.295e+02 2.774e+04 -0.019 0.984867
## x2 9.069e-02 4.689e-02 1.934 0.060228 .
## x3 2.436e-01 7.080e-02 3.440 0.001372 **
## x4 2.534e-01 3.986e+00 0.064 0.949627
## x5 9.663e+03 4.145e+04 0.233 0.816842
## x6 4.307e+03 3.450e+03 1.248 0.219193
## x7 -7.319e-02 2.145e-02 -3.412 0.001487 **
## x8 2.156e-02 5.944e-03 3.628 0.000801 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 328900 on 40 degrees of freedom
## (55 observations deleted due to missingness)
## Multiple R-squared: 0.9836, Adjusted R-squared: 0.9803
## F-statistic: 299.1 on 8 and 40 DF, p-value: < 2.2e-16
# Full model for y2 on x1..x8. The recorded output below reports 55
# observations deleted due to missingness and an overall F-test p-value of
# 0.2365.
summary(model4)
##
## Call:
## lm(formula = y2 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, data = wb_datatable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0828 -2.1113 -0.2326 1.3544 14.6158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.153e+00 4.715e+00 0.881 0.38363
## x1 -1.252e-01 4.211e-01 -0.297 0.76783
## x2 9.177e-07 7.119e-07 1.289 0.20474
## x3 -1.407e-06 1.075e-06 -1.309 0.19810
## x4 -1.472e-04 6.051e-05 -2.433 0.01953 *
## x5 -3.830e-02 6.292e-01 -0.061 0.95177
## x6 1.516e-01 5.237e-02 2.894 0.00613 **
## x7 -3.569e-07 3.256e-07 -1.096 0.27958
## x8 9.378e-08 9.022e-08 1.039 0.30488
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.993 on 40 degrees of freedom
## (55 observations deleted due to missingness)
## Multiple R-squared: 0.2158, Adjusted R-squared: 0.05897
## F-statistic: 1.376 on 8 and 40 DF, p-value: 0.2365
# Explore the GDP_to_InternetUsers column: its variance, then its mean.
var(worldbank2014[["GDP_to_InternetUsers"]])
## [1] 59471.66
mean(worldbank2014[["GDP_to_InternetUsers"]])
## [1] 264.9169