# Install tidyverse only when it is not already available, so re-running the
# script does not re-download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
getwd()
## [1] "/cloud/project/Case Studies"
  # Read the merged World Bank CSV into a data frame.
worldbank2014 <- read.csv(
  file = "/cloud/project/Case Studies/wb_GDP_merged - from worldbank_cleannoregions.csv"
)
  # Print the structure to check the column types that read.csv() inferred.
str(object = worldbank2014)
## 'data.frame':    104 obs. of  13 variables:
##  $ country_name        : chr  "Macao SAR, China" "Norway" "Switzerland" "Qatar" ...
##  $ country_code        : chr  "MAC" "NOR" "CHE" "QAT" ...
##  $ GDP_to_InternetUsers: num  1343 1012 995 954 748 ...
##  $ SE_COM_DURS         : int  10 10 12 9 12 10 12 10 9 9 ...
##  $ SE_PRM_ENRL         : int  22862 425917 483886 117454 24538372 2168531 359064 467484 327247 757164 ...
##  $ SE_SEC_ENRL         : int  32054 439250 616127 88466 24229776 2371284 650998 553791 697388 826694 ...
##  $ UIS_E_3             : int  16655 249882 350927 40991 11736315 1104162 233794 311359 363188 502035 ...
##  $ NY_GDP_PCAP_CD      : num  94004 97200 86606 86853 54599 ...
##  $ SE_XPD_TOTL_GD_ZS   : num  2.03 NA 5.1 3.61 5.38 5.23 NA NA 5.5 7.68 ...
##  $ IT_NET_USER_P2      : int  70 96 87 91 73 84 12 96 81 93 ...
##  $ UIS_SAP_CE          : int  NA 615879 956774 NA 49488316 NA 1164601 669247 743482 910369 ...
##  $ SP_POP_TOTL         : int  588781 5137232 8188649 2374419 318563456 23460694 5466241 5643475 8541575 9696110 ...
##  $ SL_UEM_TOTL_ZS      : num  1.7 3.5 4.5 0.2 6.2 6.1 8.9 6.6 5.6 8 ...
  # Pull each indicator out of worldbank2014 under a short alias so it can be
  # referenced compactly in the correlation matrix and model formulas.
y1 <- worldbank2014[["UIS_E_3"]]
y2 <- worldbank2014[["SL_UEM_TOTL_ZS"]]
x1 <- worldbank2014[["SE_COM_DURS"]]
x2 <- worldbank2014[["SE_PRM_ENRL"]]
x3 <- worldbank2014[["SE_SEC_ENRL"]]
x4 <- worldbank2014[["NY_GDP_PCAP_CD"]]
x5 <- worldbank2014[["SE_XPD_TOTL_GD_ZS"]]
x6 <- worldbank2014[["IT_NET_USER_P2"]]
x7 <- worldbank2014[["UIS_SAP_CE"]]
x8 <- worldbank2014[["SP_POP_TOTL"]]
  # Collect the numeric variables into one data frame (string columns are left
  # out). data.frame() is used rather than as.data.frame(cbind(...)) because
  # cbind() builds a matrix first, which forces every column to a single type.
wb_datatable <- data.frame(y1, y2, x1, x2, x3, x4, x5, x6, x7, x8)

  # cor() defaults to use = "everything", so every pair that involves a column
  # containing a missing value is reported as NA in the matrix below.
cor(wb_datatable)
##              y1          y2 x1 x2 x3           x4 x5           x6 x7
## y1  1.000000000 -0.08500817 NA NA NA -0.008012006 NA  0.007571796 NA
## y2 -0.085008174  1.00000000 NA NA NA -0.188907156 NA  0.098906733 NA
## x1           NA          NA  1 NA NA           NA NA           NA NA
## x2           NA          NA NA  1 NA           NA NA           NA NA
## x3           NA          NA NA NA  1           NA NA           NA NA
## x4 -0.008012006 -0.18890716 NA NA NA  1.000000000 NA  0.749030201 NA
## x5           NA          NA NA NA NA           NA  1           NA NA
## x6  0.007571796  0.09890673 NA NA NA  0.749030201 NA  1.000000000 NA
## x7           NA          NA NA NA NA           NA NA           NA  1
## x8  0.994671905 -0.09424925 NA NA NA -0.031578237 NA -0.020014588 NA
##             x8
## y1  0.99467191
## y2 -0.09424925
## x1          NA
## x2          NA
## x3          NA
## x4 -0.03157824
## x5          NA
## x6 -0.02001459
## x7          NA
## x8  1.00000000
  # Many coefficients above are NA because the corresponding columns contain
  # missing values (cor() was called with its default NA handling).
  # This project focuses on the variable pairs that do have a coefficient.
  # Fit the candidate linear models.
  # model1 / model2: the x4-x6 pair, regressed in each direction.
model1 <- lm(formula = x4 ~ x6, data = wb_datatable)
model2 <- lm(formula = x6 ~ x4, data = wb_datatable)
  # model3 / model4: y1 and y2 each regressed on all eight x variables.
model3 <- lm(
  formula = y1 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8,
  data = wb_datatable
)
model4 <- lm(
  formula = y2 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8,
  data = wb_datatable
)
# model1: x4 (NY_GDP_PCAP_CD) regressed on x6 (IT_NET_USER_P2).
summary(object = model1)
## 
## Call:
## lm(formula = x4 ~ x6, data = wb_datatable)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24708 -10056  -1301   6755  64338 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11389.86    2893.54  -3.936 0.000151 ***
## x6             586.52      51.37  11.418  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14920 on 102 degrees of freedom
## Multiple R-squared:  0.561,  Adjusted R-squared:  0.5567 
## F-statistic: 130.4 on 1 and 102 DF,  p-value: < 2.2e-16
# model2: x6 (IT_NET_USER_P2) regressed on x4 (NY_GDP_PCAP_CD), i.e. the
# same pair as model1 with response and predictor swapped.
summary(object = model2)
## 
## Call:
## lm(formula = x6 ~ x4, data = wb_datatable)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52.15 -16.60   4.28  13.55  35.23 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.223e+01  2.355e+00   13.68   <2e-16 ***
## x4          9.566e-04  8.378e-05   11.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.06 on 102 degrees of freedom
## Multiple R-squared:  0.561,  Adjusted R-squared:  0.5567 
## F-statistic: 130.4 on 1 and 102 DF,  p-value: < 2.2e-16
# model3: y1 (UIS_E_3) on all eight predictors. Rows with a missing value in
# any model variable are dropped by lm(), as reported in the output below.
summary(object = model3)
## 
## Call:
## lm(formula = y1 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, data = wb_datatable)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -675638  -98950  -31225   65367 1189773 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.902e+05  3.106e+05  -0.934 0.355741    
## x1          -5.295e+02  2.774e+04  -0.019 0.984867    
## x2           9.069e-02  4.689e-02   1.934 0.060228 .  
## x3           2.436e-01  7.080e-02   3.440 0.001372 ** 
## x4           2.534e-01  3.986e+00   0.064 0.949627    
## x5           9.663e+03  4.145e+04   0.233 0.816842    
## x6           4.307e+03  3.450e+03   1.248 0.219193    
## x7          -7.319e-02  2.145e-02  -3.412 0.001487 ** 
## x8           2.156e-02  5.944e-03   3.628 0.000801 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 328900 on 40 degrees of freedom
##   (55 observations deleted due to missingness)
## Multiple R-squared:  0.9836, Adjusted R-squared:  0.9803 
## F-statistic: 299.1 on 8 and 40 DF,  p-value: < 2.2e-16
# model4: y2 (SL_UEM_TOTL_ZS) on all eight predictors. Rows with a missing
# value in any model variable are dropped by lm(), as reported below.
summary(object = model4)
## 
## Call:
## lm(formula = y2 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8, data = wb_datatable)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0828  -2.1113  -0.2326   1.3544  14.6158 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  4.153e+00  4.715e+00   0.881  0.38363   
## x1          -1.252e-01  4.211e-01  -0.297  0.76783   
## x2           9.177e-07  7.119e-07   1.289  0.20474   
## x3          -1.407e-06  1.075e-06  -1.309  0.19810   
## x4          -1.472e-04  6.051e-05  -2.433  0.01953 * 
## x5          -3.830e-02  6.292e-01  -0.061  0.95177   
## x6           1.516e-01  5.237e-02   2.894  0.00613 **
## x7          -3.569e-07  3.256e-07  -1.096  0.27958   
## x8           9.378e-08  9.022e-08   1.039  0.30488   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.993 on 40 degrees of freedom
##   (55 observations deleted due to missingness)
## Multiple R-squared:  0.2158, Adjusted R-squared:  0.05897 
## F-statistic: 1.376 on 8 and 40 DF,  p-value: 0.2365
  # Variance and mean of the GDP_to_InternetUsers column.
var(worldbank2014[["GDP_to_InternetUsers"]])
## [1] 59471.66
mean(worldbank2014[["GDP_to_InternetUsers"]])
## [1] 264.9169