# Set the default chunk option so that chunk source code is not echoed in the
# rendered report (only chunk output is shown).
knitr::opts_chunk$set(echo = FALSE)
# NOTE: the former `rm(list = ls())` call was removed on purpose. It is
# redundant when the document is knitted in a fresh R session, and when the
# file is run interactively it silently destroys the user's workspace.
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(car)

## Warning: package 'car' was built under R version 3.4.2

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(gplots)

## Warning: package 'gplots' was built under R version 3.4.3

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(lme4)

## Warning: package 'lme4' was built under R version 3.4.2

## Loading required package: Matrix

library(apsrtable)
library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 3.4.3

library(GGally)

## Warning: package 'GGally' was built under R version 3.4.3

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

library(mctest)

## Warning: package 'mctest' was built under R version 3.4.2

library(ppcor)

## Warning: package 'ppcor' was built under R version 3.4.3

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(factoextra)

## Warning: package 'factoextra' was built under R version 3.4.3

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

library(NbClust)
library(cluster)

## Warning: package 'cluster' was built under R version 3.4.3

library(maps)

## Warning: package 'maps' was built under R version 3.4.3

## 
## Attaching package: 'maps'

## The following object is masked from 'package:cluster':
## 
##     votes.repub

library(plotly)

## Warning: package 'plotly' was built under R version 3.4.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:MASS':
## 
##     select

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(stringr)

## Warning: package 'stringr' was built under R version 3.4.3

library(reshape2)

## Warning: package 'reshape2' was built under R version 3.4.3

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 3.4.3

library(leaps)

## Warning: package 'leaps' was built under R version 3.4.3

# Load the master data set used throughout this report.
# Data available here: https://drive.google.com/file/d/1wqoU8Fi-ZyG_iiD5PdaDD1_J7VwebJLD/view
# NOTE(review): the path below is machine-specific; download the file and
# adjust the path before running on another machine.
# stringsAsFactors = TRUE keeps `Category` a factor (as the structure and
# summary output in this report show) on R >= 4.0.0, where the default
# changed to FALSE.
data <- read.csv(
  "C:/Users/jason/Downloads/Updated Master Data.csv",
  stringsAsFactors = TRUE
)

# Show the column types and first values of the loaded data frame.
# (Replaces a bare `view`, which is not a defined object and would error.)
str(data)

## 'data.frame':    180 obs. of  14 variables:
##  $ geo_idx  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ YEAR     : int  1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
##  $ Cell     : num  0.1159 0.0537 0.9963 2.1926 0 ...
##  $ ENERGY   : num  338.5 62.6 1933.6 1554.4 48.7 ...
##  $ INTERNET : num  0.025498 0.00017 0.574087 0.46232 0.000218 ...
##  $ PATENT   : int  2184 422 30413 0 2027 45587 62400 1406 0 5828 ...
##  $ HEALTH   : num  6.7 3.5 10.4 10.1 4.1 ...
##  $ MIL      : num  1.93 2.07 3.28 1.85 2.83 ...
##  $ INFANT   : num  44.6 40.2 6.3 5.9 81.7 7.3 4.3 32 22.1 46.6 ...
##  $ FEMEDU   : num  6.4 6.1 10 10.8 3.3 10.4 12.2 7.3 11.7 7.6 ...
##  $ GDPCAP   : num  3380 536 18823 20344 333 ...
##  $ GDPGROWTH: num  3.06 12.7 -1.08 -1.65 2.72 ...
##  $ POP      : num  1578 12030 577 807 9245 ...
##  $ Category : Factor w/ 2 levels "Developed","Emerging": 2 2 1 1 2 1 1 2 2 2 ...

This allows us to see the current structure of the data and identify objects that may need to be modified prior to building our model.

##   geo_idx YEAR       Cell     ENERGY    INTERNET PATENT HEALTH      MIL
## 1       1 1993 0.11593404  338.47540 0.025498253   2184    6.7 1.929473
## 2       2 1993 0.05367264   62.55920 0.000170147    422    3.5 2.071665
## 3       3 1993 0.99630325 1933.63500 0.574086886  30413   10.4 3.283614
## 4       4 1993 2.19259004 1554.44600 0.462319739      0   10.1 1.853910
## 5       5 1993 0.00000000   48.65522 0.000218176   2027    4.1 2.827158
## 6       6 1993 2.12017489  989.39810 0.122689814  45587    7.2 2.004464
##   INFANT FEMEDU     GDPCAP  GDPGROWTH        POP  Category
## 1   44.6    6.4  3380.1284  3.0625488  1578.1222  Emerging
## 2   40.2    6.1   536.3573 12.6969383 12029.8296  Emerging
## 3    6.3   10.0 18822.6976 -1.0794842   577.4988 Developed
## 4    5.9   10.8 20344.2176 -1.6507664   806.7600 Developed
## 5   81.7    3.3   332.6057  2.7177655  9244.7563  Emerging
## 6    7.3   10.4 16817.5977 -0.9134017   570.9741 Developed

Using the head function gives us a snapshot of what the current values of the data look like in their dataframe.

##     geo_idx           YEAR           Cell             ENERGY       
##  Min.   : 1.00   Min.   :1993   Min.   :-0.9520   Min.   :-1.1010  
##  1st Qu.: 3.75   1st Qu.:1996   1st Qu.:-0.8685   1st Qu.:-0.7905  
##  Median : 6.50   Median :2000   Median :-0.3926   Median :-0.2685  
##  Mean   : 6.50   Mean   :2000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 9.25   3rd Qu.:2004   3rd Qu.: 0.7725   3rd Qu.: 0.5303  
##  Max.   :12.00   Max.   :2007   Max.   : 3.1748   Max.   : 2.9348  
##     INTERNET           PATENT            HEALTH              MIL         
##  Min.   :-0.7896   Min.   :-0.7956   Min.   :-1.50947   Min.   :-1.6882  
##  1st Qu.:-0.7487   1st Qu.:-0.7321   1st Qu.:-0.80329   1st Qu.:-0.6289  
##  Median :-0.4800   Median :-0.3405   Median :-0.08599   Median :-0.1740  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.4852   3rd Qu.: 0.3209   3rd Qu.: 0.63143   3rd Qu.: 0.5724  
##  Max.   : 2.5887   Max.   : 4.4546   Max.   : 3.23443   Max.   : 3.5475  
##      INFANT            FEMEDU            GDPCAP          GDPGROWTH      
##  Min.   :-0.8879   Min.   :-2.4770   Min.   :-1.0909   Min.   :-4.3167  
##  1st Qu.:-0.7760   1st Qu.:-0.7523   1st Qu.:-0.8946   1st Qu.:-0.4673  
##  Median :-0.5018   Median : 0.3119   Median :-0.2313   Median :-0.1561  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4721   3rd Qu.: 0.8624   3rd Qu.: 0.6642   3rd Qu.: 0.3281  
##  Max.   : 3.0027   Max.   : 1.2293   Max.   : 2.0060   Max.   : 3.1193  
##       POP               Category 
##  Min.   :-0.6214   Developed:90  
##  1st Qu.:-0.5725   Emerging :90  
##  Median :-0.4242                 
##  Mean   : 0.0000                 
##  3rd Qu.:-0.1956                 
##  Max.   : 2.5824

## [1] 1 2 3 4 5 6

First, we scale the data so that the means of all the numeric indicator variables are 0 (the identifier columns geo_idx and YEAR and the factor Category are left unscaled). We will confirm this has been completed by running the summary function.

Second, we will change the numbers that represent the countries to their names as character strings. This allows the map function we used in ggplot to match up and display the countries with their respective colors/groupings. We will check the result by applying the head function to the geo_idx column only, confirming that the data has been transformed.

Last week we discussed PCA and used k-means clustering to create three clusters from our dataset. This week we will use linear regression to build a model. This model will attempt to predict GDP Capacity per capita (GDPCAP) for each of our countries for each year. But first, let's look at our individual nations and how GDP capacity varies per country.

We will start by fitting a model on all variables with the lm() function, where y = GDPCAP.

## 
## Call:
## lm(formula = data$GDPCAP ~ ., data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44595 -0.08237 -0.01831  0.08898  0.52953 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.022310  11.106896   0.902 0.368178    
## geo_idx          -0.019180   0.006280  -3.054 0.002630 ** 
## YEAR             -0.004705   0.005560  -0.846 0.398632    
## Cell             -0.024552   0.030691  -0.800 0.424882    
## ENERGY            0.640197   0.059640  10.734  < 2e-16 ***
## INTERNET          0.100513   0.027955   3.596 0.000427 ***
## PATENT            0.152445   0.026996   5.647 6.94e-08 ***
## HEALTH           -0.322767   0.037342  -8.644 4.41e-15 ***
## MIL              -0.248648   0.016358 -15.200  < 2e-16 ***
## INFANT            0.149581   0.035648   4.196 4.41e-05 ***
## FEMEDU            0.198968   0.044302   4.491 1.32e-05 ***
## GDPGROWTH        -0.068797   0.015664  -4.392 1.99e-05 ***
## POP               0.025330   0.023449   1.080 0.281603    
## CategoryEmerging -0.974335   0.054487 -17.882  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1472 on 166 degrees of freedom
## Multiple R-squared:  0.9799, Adjusted R-squared:  0.9783 
## F-statistic: 622.9 on 13 and 166 DF,  p-value: < 2.2e-16

This appears to be a really good fit if we look only at the R² and the p-values, which show significance. In the following, we'll take a deeper dive into what our model shows with some plots.

Plots seem to show a pretty good fit, though I still have reservations.

##   geo_idx      YEAR      Cell    ENERGY  INTERNET    PATENT    HEALTH 
##  3.905549  4.795419  7.784075 29.393132  6.457750  6.022510 11.522995 
##       MIL    INFANT    FEMEDU GDPGROWTH       POP  Category 
##  2.211203 10.501567 16.218906  2.027566  4.543819  6.167758

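Removing the highly correlated variables gives us a worse R² (0.8821 versus 0.9799 for the full model). We removed ENERGY, HEALTH and INFANT, each of which has a VIF of 10 or higher. Note that FEMEDU (VIF 16.2) is still in the reduced model below, while geo_idx and Category were also left out of it.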

## 
## Call:
## lm(formula = GDPCAP ~ YEAR + Cell + INTERNET + PATENT + MIL + 
##     +FEMEDU + GDPGROWTH + POP, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.38105 -0.22571 -0.01888  0.22818  0.99351 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 165.31937   21.67712   7.626 1.60e-12 ***
## YEAR         -0.08266    0.01084  -7.626 1.60e-12 ***
## Cell          0.16037    0.06276   2.555 0.011477 *  
## INTERNET      0.21669    0.06386   3.393 0.000858 ***
## PATENT        0.50449    0.04752  10.616  < 2e-16 ***
## MIL          -0.19797    0.03050  -6.490 8.92e-10 ***
## FEMEDU        0.38471    0.05565   6.913 9.03e-11 ***
## GDPGROWTH    -0.04854    0.03686  -1.317 0.189660    
## POP           0.10772    0.04990   2.158 0.032286 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3512 on 171 degrees of freedom
## Multiple R-squared:  0.8821, Adjusted R-squared:  0.8766 
## F-statistic:   160 on 8 and 171 DF,  p-value: < 2.2e-16

Next, since removing the high-VIF variables did not improve our fit, we will use forward and backward selection to give us the best fit. First we must create a regression model that looks like this: lm(thing ~ 1). This gives us a model with only the intercept.

## 
## Call:
## lm(formula = GDPCAP ~ 1, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0909 -0.8946 -0.2313  0.6642  2.0060 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.965e-17  7.454e-02       0        1
## 
## Residual standard error: 1 on 179 degrees of freedom

## Start:  AIC=1
## GDPCAP ~ 1
## 
##             Df Sum of Sq     RSS     AIC
## + Category   1   150.140  28.860 -325.49
## + PATENT     1   131.638  47.362 -236.32
## + ENERGY     1   129.194  49.806 -227.27
## + FEMEDU     1   103.492  75.508 -152.37
## + INFANT     1    96.749  82.251 -136.97
## + HEALTH     1    89.834  89.166 -122.44
## + INTERNET   1    77.096 101.904  -98.41
## + Cell       1    37.068 141.932  -38.77
## + POP        1    33.322 145.678  -34.08
## + geo_idx    1    19.125 159.875  -17.34
## + GDPGROWTH  1    14.748 164.252  -12.48
## <none>                   179.000    1.00
## + YEAR       1     1.226 177.774    1.76
## + MIL        1     0.072 178.928    2.92
## 
## Step:  AIC=-325.49
## GDPCAP ~ Category
## 
##             Df Sum of Sq    RSS     AIC
## + PATENT     1   18.1282 10.732 -501.55
## + ENERGY     1    9.9337 18.927 -399.43
## + INTERNET   1    5.8799 22.980 -364.50
## + geo_idx    1    4.1835 24.677 -351.68
## + FEMEDU     1    4.0965 24.764 -351.04
## + HEALTH     1    2.6349 26.225 -340.72
## + YEAR       1    1.2264 27.634 -331.30
## + MIL        1    0.7591 28.101 -328.29
## + Cell       1    0.4540 28.406 -326.34
## + INFANT     1    0.4099 28.450 -326.06
## <none>                   28.860 -325.49
## + GDPGROWTH  1    0.1918 28.669 -324.69
## + POP        1    0.1352 28.725 -324.33
## 
## Step:  AIC=-501.55
## GDPCAP ~ Category + PATENT
## 
##             Df Sum of Sq    RSS     AIC
## + MIL        1   1.85718  8.875 -533.75
## + POP        1   0.58317 10.149 -509.60
## + geo_idx    1   0.56193 10.170 -509.23
## + FEMEDU     1   0.37896 10.353 -506.02
## + GDPGROWTH  1   0.37467 10.357 -505.94
## + ENERGY     1   0.23914 10.493 -503.60
## + INTERNET   1   0.17303 10.559 -502.47
## + INFANT     1   0.15263 10.579 -502.13
## <none>                   10.732 -501.55
## + Cell       1   0.00482 10.727 -499.63
## + HEALTH     1   0.00080 10.731 -499.56
## + YEAR       1   0.00047 10.732 -499.56
## 
## Step:  AIC=-533.75
## GDPCAP ~ Category + PATENT + MIL
## 
##             Df Sum of Sq    RSS     AIC
## + ENERGY     1   2.07493 6.8000 -579.69
## + geo_idx    1   1.36819 7.5068 -561.89
## + FEMEDU     1   0.96142 7.9135 -552.39
## + GDPGROWTH  1   0.36599 8.5090 -539.33
## + POP        1   0.33796 8.5370 -538.74
## <none>                   8.8750 -533.75
## + HEALTH     1   0.09626 8.7787 -533.71
## + INFANT     1   0.08984 8.7851 -533.58
## + YEAR       1   0.05833 8.8166 -532.94
## + INTERNET   1   0.05656 8.8184 -532.90
## + Cell       1   0.02383 8.8511 -532.23
## 
## Step:  AIC=-579.69
## GDPCAP ~ Category + PATENT + MIL + ENERGY
## 
##             Df Sum of Sq    RSS     AIC
## + HEALTH     1   1.73871 5.0613 -630.84
## + geo_idx    1   0.48301 6.3170 -590.95
## + FEMEDU     1   0.40101 6.3990 -588.63
## + GDPGROWTH  1   0.17801 6.6220 -582.46
## <none>                   6.8000 -579.69
## + YEAR       1   0.06871 6.7313 -579.51
## + INFANT     1   0.02716 6.7729 -578.41
## + POP        1   0.02318 6.7769 -578.30
## + INTERNET   1   0.01131 6.7887 -577.98
## + Cell       1   0.00850 6.7915 -577.91
## 
## Step:  AIC=-630.84
## GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH
## 
##             Df Sum of Sq    RSS     AIC
## + GDPGROWTH  1   0.59202 4.4693 -651.23
## + INTERNET   1   0.24950 4.8118 -637.94
## + POP        1   0.19757 4.8638 -636.01
## + FEMEDU     1   0.19100 4.8703 -635.76
## + Cell       1   0.07300 4.9883 -631.45
## + geo_idx    1   0.06118 5.0002 -631.03
## <none>                   5.0613 -630.84
## + YEAR       1   0.00256 5.0588 -628.93
## + INFANT     1   0.00032 5.0610 -628.85
## 
## Step:  AIC=-651.23
## GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + GDPGROWTH
## 
##            Df Sum of Sq    RSS     AIC
## + INTERNET  1   0.37405 4.0953 -664.96
## + Cell      1   0.17680 4.2925 -656.50
## + YEAR      1   0.10700 4.3623 -653.59
## + FEMEDU    1   0.09309 4.3762 -653.02
## <none>                  4.4693 -651.23
## + geo_idx   1   0.00154 4.4678 -649.29
## + INFANT    1   0.00149 4.4678 -649.29
## + POP       1   0.00082 4.4685 -649.26
## 
## Step:  AIC=-664.96
## GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + GDPGROWTH + 
##     INTERNET
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 4.0953 -664.96
## + YEAR     1  0.033015 4.0622 -664.42
## + FEMEDU   1  0.023056 4.0722 -663.98
## + INFANT   1  0.022562 4.0727 -663.96
## + Cell     1  0.010860 4.0844 -663.44
## + geo_idx  1  0.002267 4.0930 -663.06
## + POP      1  0.000070 4.0952 -662.97

## 
## Call:
## lm(formula = GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + 
##     GDPGROWTH + INTERNET, data = data)
## 
## Coefficients:
##      (Intercept)  CategoryEmerging            PATENT               MIL  
##          0.47540          -0.95080           0.19405          -0.21636  
##           ENERGY            HEALTH         GDPGROWTH          INTERNET  
##          0.61783          -0.30891          -0.06927           0.06678

Stepwise forward selection gives us the following: the lowest AIC is -664.96. Variables selected were GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + GDPGROWTH + INTERNET. Pretty good. Let's see what backward selection gives us.

## Start:  AIC=-676.38
## data$GDPCAP ~ geo_idx + YEAR + Cell + ENERGY + INTERNET + PATENT + 
##     HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + POP + Category
## 
##             Df Sum of Sq     RSS     AIC
## - Cell       1    0.0139  3.6096 -677.69
## - YEAR       1    0.0155  3.6112 -677.60
## - POP        1    0.0253  3.6210 -677.12
## <none>                    3.5957 -676.38
## - geo_idx    1    0.2020  3.7978 -668.54
## - INTERNET   1    0.2800  3.8758 -664.88
## - INFANT     1    0.3814  3.9771 -660.23
## - GDPGROWTH  1    0.4179  4.0136 -658.59
## - FEMEDU     1    0.4369  4.0326 -657.74
## - PATENT     1    0.6907  4.2864 -646.75
## - HEALTH     1    1.6183  5.2141 -611.49
## - ENERGY     1    2.4959  6.0917 -583.49
## - MIL        1    5.0049  8.6006 -521.40
## - Category   1    6.9263 10.5220 -485.11
## 
## Step:  AIC=-677.69
## data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH + 
##     MIL + INFANT + FEMEDU + GDPGROWTH + POP + Category
## 
##             Df Sum of Sq     RSS     AIC
## - POP        1    0.0319  3.6414 -678.10
## <none>                    3.6096 -677.69
## - YEAR       1    0.0568  3.6664 -676.88
## - geo_idx    1    0.2072  3.8168 -669.64
## - INTERNET   1    0.3007  3.9103 -665.28
## - INFANT     1    0.3684  3.9780 -662.19
## - FEMEDU     1    0.4233  4.0328 -659.73
## - GDPGROWTH  1    0.4418  4.0514 -658.90
## - PATENT     1    0.7245  4.3340 -646.76
## - HEALTH     1    1.6627  5.2723 -611.49
## - ENERGY     1    2.7751  6.3847 -577.03
## - MIL        1    5.0715  8.6811 -521.72
## - Category   1    8.7270 12.3366 -458.47
## 
## Step:  AIC=-678.1
## data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH + 
##     MIL + INFANT + FEMEDU + GDPGROWTH + Category
## 
##             Df Sum of Sq     RSS     AIC
## <none>                    3.6414 -678.10
## - YEAR       1    0.0479  3.6893 -677.75
## - geo_idx    1    0.2452  3.8867 -668.37
## - INTERNET   1    0.3109  3.9524 -665.36
## - INFANT     1    0.3637  4.0051 -662.97
## - FEMEDU     1    0.4042  4.0456 -661.16
## - GDPGROWTH  1    0.4333  4.0747 -659.87
## - PATENT     1    0.7890  4.4304 -644.80
## - HEALTH     1    2.1821  5.8236 -595.59
## - ENERGY     1    3.1721  6.8136 -567.33
## - MIL        1    5.3777  9.0191 -516.85
## - Category   1    8.7097 12.3512 -460.26

## 
## Call:
## lm(formula = data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + 
##     PATENT + HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + Category, 
##     data = data)
## 
## Coefficients:
##      (Intercept)           geo_idx              YEAR            ENERGY  
##         13.93140          -0.02077          -0.00666           0.67140  
##         INTERNET            PATENT            HEALTH               MIL  
##          0.09023           0.15992          -0.34329          -0.24321  
##           INFANT            FEMEDU         GDPGROWTH  CategoryEmerging  
##          0.14454           0.16923          -0.06137          -0.95182

Backward selection gives us the lowest AIC. The AIC is -678.10 with the following model: lm(formula = data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + Category, data = data).

## 
## Call:
## lm(formula = data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + 
##     PATENT + HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + Category, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41811 -0.08861 -0.01894  0.08453  0.53730 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      13.931396   8.955996   1.556 0.121700    
## geo_idx          -0.020768   0.006174  -3.364 0.000953 ***
## YEAR             -0.006660   0.004482  -1.486 0.139124    
## ENERGY            0.671402   0.055499  12.097  < 2e-16 ***
## INTERNET          0.090228   0.023824   3.787 0.000212 ***
## PATENT            0.159917   0.026506   6.033 9.95e-09 ***
## HEALTH           -0.343287   0.034214 -10.034  < 2e-16 ***
## MIL              -0.243214   0.015441 -15.751  < 2e-16 ***
## INFANT            0.144543   0.035289   4.096 6.52e-05 ***
## FEMEDU            0.169225   0.039190   4.318 2.68e-05 ***
## GDPGROWTH        -0.061370   0.013726  -4.471 1.43e-05 ***
## CategoryEmerging -0.951821   0.047483 -20.046  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1472 on 168 degrees of freedom
## Multiple R-squared:  0.9797, Adjusted R-squared:  0.9783 
## F-statistic: 735.5 on 11 and 168 DF,  p-value: < 2.2e-16

## Analysis of Variance Table
## 
## Model 1: data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH + 
##     MIL + INFANT + FEMEDU + GDPGROWTH + Category
## Model 2: data$GDPCAP ~ geo_idx + YEAR + Cell + ENERGY + INTERNET + PATENT + 
##     HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + POP + Category
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    168 3.6414                           
## 2    166 3.5957  2  0.045715 1.0552 0.3504

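Above we declared our bestfit from the backward selection method. We then used ANOVA to compare the datafit and bestfit models. The R² of bestfit is similar to that of datafit (0.9797 versus 0.9799), its residual sum of squares is only slightly higher (3.6414 versus 3.5957), and it has more residual degrees of freedom (168 versus 166). The F-test (p = 0.35) indicates that the two extra variables in datafit do not significantly improve the fit.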

##          1          2          3          4          5          6 
## -0.9299421 -1.0033906  0.3186223  0.3786556 -1.0170498  0.4509672 
##          7          8          9         10         11         12 
##  1.2387695 -0.5931275 -1.0716209 -1.0285418  0.4600126  0.9720158 
##         13         14         15         16         17         18 
## -0.9822191 -0.9546097  0.2723538  0.2999530 -1.0027749  0.3273582 
##         19         20         21         22         23         24 
##  1.2655205 -0.6017745 -1.1536335 -1.1285187  0.4838275  1.0353649 
##         25         26         27         28         29         30 
## -0.9160424 -0.9211300  0.3089280  0.3567295 -1.0515545  0.2906975 
##         31         32         33         34         35         36 
##  1.3224865 -0.4244683 -1.1361338 -1.0374409  0.6126838  1.2864221 
##         37         38         39         40         41         42 
## -0.8171832 -0.9659702  0.4934597  0.4296327 -1.0547683  0.2972946 
##         43         44         45         46         47         48 
##  1.2788516 -0.6464483 -0.9876786 -0.9648818  0.7024472  1.3934874 
##         49         50         51         52         53         54 
## -0.8725043 -0.9380773  0.4598259  0.5690214 -1.0204872  0.2868329 
##         55         56         57         58         59         60 
##  1.2149269 -0.6930386 -1.0683053 -0.8879040  0.6496592  1.4138957 
##         61         62         63         64         65         66 
## -0.8466510 -0.9486410  0.5514954  0.5901278 -1.1143584  0.2982505 
##         67         68         69         70         71         72 
##  1.5186234 -0.6519834 -0.5827428 -0.7454075  0.7783591  1.5759286 
##         73         74         75         76         77         78 
## -0.8590851 -0.9609911  0.6187487  0.6473179 -1.2226168  0.3366205 
##         79         80         81         82         83         84 
##  1.4986126 -0.6677172 -0.8230224 -0.7735036  0.8334953  1.5478364 
##         85         86         87         88         89         90 
## -0.9455591 -1.0000630  0.6734713  0.6659078 -1.1634202  0.3245598 
##         91         92         93         94         95         96 
##  1.4732507 -0.7041273 -0.9489750 -0.8341165  0.8479688  1.5749461 
##         97         98         99        100        101        102 
## -0.9579354 -1.0486323  0.7809875  0.7023973 -1.1813115  0.3623259 
##        103        104        105        106        107        108 
##  1.6381591 -0.6071022 -0.8809178 -0.7667840  0.8979165  1.5405583 
##        109        110        111        112        113        114 
## -0.9801997 -1.0791666  0.7537096  0.8315097 -1.1175351  0.3908585 
##        115        116        117        118        119        120 
##  2.1172852 -0.6248700 -0.9167528 -0.8888009  0.9313344  1.6247623 
##        121        122        123        124        125        126 
## -0.9847729 -1.0720984  0.8226900  0.8723982 -1.1780756  0.3541677 
##        127        128        129        130        131        132 
##  1.8067999 -0.6020735 -0.9034680 -0.8198419  0.9114827  1.5300290 
##        133        134        135        136        137        138 
## -1.0648713 -1.0467838  0.8449187  0.9099869 -1.1869288  0.3953432 
##        139        140        141        142        143        144 
##  1.8657922 -0.6095543 -0.8175898 -0.7609553  0.9029892  1.6110790 
##        145        146        147        148        149        150 
## -1.0160268 -1.0155456  0.8568985  0.9405295 -1.1975587  0.4554741 
##        151        152        153        154        155        156 
##  1.9728308 -0.5667654 -0.9447568 -0.7662340  0.9504230  1.7208066 
##        157        158        159        160        161        162 
## -0.9855563 -1.0358174  0.8504059  0.8619451 -1.1536747  0.3886818 
##        163        164        165        166        167        168 
##  1.8289716 -0.5909490 -0.8617514 -0.7399544  0.8604110  1.6573785 
##        169        170        171        172        173        174 
## -1.0549447 -1.0737488  0.8691502  0.7885249 -1.1493041  0.3825637 
##        175        176        177        178        179        180 
##  1.7721382 -0.6218190 -0.8910639 -0.7699301  0.7696947  1.6440912

Now we will use the predict() function to predict values based on our model. We will then print out the data and look at a summary of both. The summary will give us an idea of how close the mins, maxes and medians are to each other. Medians are less than $1500 apart, while the mins and maxes show greater separation.

##   geo_idx YEAR     GDPCAP Prediction
## 1  Brazil 1993 -0.8579063 -0.9299421
## 2   China 1993 -1.0753332 -1.0033906
## 3  France 1993  0.3227898  0.3186223
## 4 Germany 1993  0.4391211  0.3786556
## 5   India 1993 -1.0909115 -1.0170498
## 6   Italy 1993  0.1694854  0.4509672

##          geo_idx YEAR     GDPCAP Prediction
## 1         Brazil 1993 -0.8579063 -0.9299421
## 2          China 1993 -1.0753332 -1.0033906
## 3         France 1993  0.3227898  0.3186223
## 4        Germany 1993  0.4391211  0.3786556
## 5          India 1993 -1.0909115 -1.0170498
## 6          Italy 1993  0.1694854  0.4509672
## 7          Japan 1993  1.5932277  1.2387695
## 8         Mexico 1993 -0.7258467 -0.5931275
## 9         Russia 1993 -0.9690977 -1.0716209
## 10  South Africa 1993 -0.8943710 -1.0285418
## 11            UK 1993  0.4213458  0.4600126
## 12           USA 1993  1.0882691  0.9720158
## 13        Brazil 1994 -0.8482409 -0.9822191
## 14         China 1994 -1.0704824 -0.9546097
## 15        France 1994  0.3490528  0.2723538
## 16       Germany 1994  0.4720492  0.2999530
## 17         India 1994 -1.0897382 -1.0027749
## 18         Italy 1994  0.1968763  0.3273582
## 19         Japan 1994  1.6073319  1.2655205
## 20        Mexico 1994 -0.7155511 -0.6017745
## 21        Russia 1994 -0.9874462 -1.1536335
## 22  South Africa 1994 -0.8920372 -1.1285187
## 23            UK 1994  0.4830844  0.4838275
## 24           USA 1994  1.1510031  1.0353649
## 25        Brazil 1995 -0.8406364 -0.9160424
## 26         China 1995 -1.0660333 -0.9211300
## 27        France 1995  0.3729059  0.3089280
## 28       Germany 1995  0.4939469  0.3567295
## 29         India 1995 -1.0882606 -1.0515545
## 30         Italy 1995  0.2347653  0.2906975
## 31         Japan 1995  1.6496545  1.3224865
## 32        Mexico 1995 -0.7469141 -0.4244683
## 33        Russia 1995 -0.9926244 -1.1361338
## 34  South Africa 1995 -0.8899959 -1.0374409
## 35            UK 1995  0.5273965  0.6126838
## 36           USA 1995  1.1812738  1.2864221
## 37        Brazil 1996 -0.8389833 -0.8171832
## 38         China 1996 -1.0615795 -0.9659702
## 39        France 1996  0.3826830  0.4934597
## 40       Germany 1996  0.5019894  0.4296327
## 41         India 1996 -1.0866960 -1.0547683
## 42         Italy 1996  0.2497121  0.2972946
## 43         Japan 1996  1.7145816  1.2788516
## 44        Mexico 1996 -0.7344879 -0.6464483
## 45        Russia 1996 -0.9967537 -0.9876786
## 46  South Africa 1996 -0.8854434 -0.9648818
## 47            UK 1996  0.5704235  0.7024472
## 48           USA 1996  1.2406812  1.3934874
## 49        Brazil 1997 -0.8339640 -0.8725043
## 50         China 1997 -1.0570961 -0.9380773
## 51        France 1997  0.4092024  0.4598259
## 52       Germany 1997  0.5276971  0.5690214
## 53         India 1997 -1.0860505 -1.0204872
## 54         Italy 1997  0.2744668  0.2868329
## 55         Japan 1997  1.7522254  1.2149269
## 56        Mexico 1997 -0.7153777 -0.6930386
## 57        Russia 1997 -0.9947214 -1.0683053
## 58  South Africa 1997 -0.8846936 -0.8879040
## 59            UK 1997  0.6217255  0.6496592
## 60           USA 1997  1.3174040  1.4138957
## 61        Brazil 1998 -0.8380998 -0.8466510
## 62         China 1998 -1.0530848 -0.9486410
## 63        France 1998  0.4540189  0.5514954
## 64       Germany 1998  0.5580530  0.5901278
## 65         India 1998 -1.0847428 -1.1143584
## 66         Italy 1998  0.2942021  0.2982505
## 67         Japan 1998  1.6876699  1.5186234
## 68        Mexico 1998 -0.7024716 -0.6519834
## 69        Russia 1998 -1.0008497 -0.5827428
## 70  South Africa 1998 -0.8889057 -0.7454075
## 71            UK 1998  0.6775775  0.7783591
## 72           USA 1998  1.3950762  1.5759286
## 73        Brazil 1999 -0.8414878 -0.8590851
## 74         China 1999 -1.0488641 -0.9609911
## 75        France 1999  0.5004533  0.6187487
## 76       Germany 1999  0.5882801  0.6473179
## 77         India 1999 -1.0826569 -1.2226168
## 78         Italy 1999  0.3144297  0.3366205
## 79         Japan 1999  1.6767775  1.4986126
## 80        Mexico 1999 -0.6930385 -0.6677172
## 81        Russia 1999 -0.9929627 -0.8230224
## 82  South Africa 1999 -0.8890952 -0.7735036
## 83            UK 1999  0.7282259  0.8334953
## 84           USA 1999  1.4872839  1.5478364
## 85        Brazil 2000 -0.8337445 -0.9455591
## 86         China 2000 -1.0437701 -1.0000630
## 87        France 2000  0.5485142  0.6734713
## 88       Germany 2000  0.6380239  0.6659078
## 89         India 2000 -1.0819041 -1.1634202
## 90         Italy 2000  0.3660326  0.3245598
## 91         Japan 2000  1.7348792  1.4732507
## 92        Mexico 2000 -0.6716193 -0.7041273
## 93        Russia 2000 -0.9806192 -0.9489750
## 94  South Africa 2000 -0.8854449 -0.8341165
## 95            UK 2000  0.7994943  0.8479688
## 96           USA 2000  1.5659253  1.5749461
## 97        Brazil 2001 -0.8339973 -0.9579354
## 98         China 2001 -1.0383155 -1.0486323
## 99        France 2001  0.5667899  0.7809875
## 100      Germany 2001  0.6615980  0.7023973
## 101        India 2001 -1.0807910 -1.1813115
## 102        Italy 2001  0.3927133  0.3623259
## 103        Japan 2001  1.7387356  1.6381591
## 104       Mexico 2001 -0.6783206 -0.6071022
## 105       Russia 2001 -0.9733629 -0.8809178
## 106 South Africa 2001 -0.8839341 -0.7667840
## 107           UK 2001  0.8475887  0.8979165
## 108          USA 2001  1.5685472  1.5405583
## 109       Brazil 2002 -0.8303977 -0.9801997
## 110        China 2002 -1.0317835 -1.0791666
## 111       France 2002  0.5701242  0.7537096
## 112      Germany 2002  0.6587915  0.8315097
## 113        India 2002 -1.0799895 -1.1175351
## 114        Italy 2002  0.3947469  0.3908585
## 115        Japan 2002  1.7403521  2.1172852
## 116       Mexico 2002 -0.6803134 -0.6248700
## 117       Russia 2002 -0.9659106 -0.9167528
## 118 South Africa 2002 -0.8787092 -0.8888009
## 119           UK 2002  0.8879824  0.9313344
## 120          USA 2002  1.5923785  1.6247623
## 121       Brazil 2003 -0.8308509 -0.9847729
## 122        China 2003 -1.0239053 -1.0720984
## 123       France 2003  0.5732862  0.8226900
## 124      Germany 2003  0.6511480  0.8723982
## 125        India 2003 -1.0777117 -1.1780756
## 126        Italy 2003  0.3823162  0.3541677
## 127        Japan 2003  1.7822815  1.8067999
## 128       Mexico 2003 -0.6798152 -0.6020735
## 129       Russia 2003 -0.9541537 -0.9034680
## 130 South Africa 2003 -0.8747950 -0.8198419
## 131           UK 2003  0.9560458  0.9114827
## 132          USA 2003  1.6377486  1.5300290
## 133       Brazil 2004 -0.8182153 -1.0648713
## 134        China 2004 -1.0151719 -1.0467838
## 135       France 2004  0.6035827  0.8449187
## 136      Germany 2004  0.6720591  0.9099869
## 137        India 2004 -1.0753143 -1.1869288
## 138        Italy 2004  0.3932979  0.3953432
## 139        Japan 2004  1.8497115  1.8657922
## 140       Mexico 2004 -0.6676202 -0.6095543
## 141       Russia 2004 -0.9416091 -0.8175898
## 142 South Africa 2004 -0.8667609 -0.7609553
## 143           UK 2004  1.0055693  0.9029892
## 144          USA 2004  1.7073307  1.6110790
## 145       Brazil 2005 -0.8123003 -1.0160268
## 146        China 2005 -1.0044000 -1.0155456
## 147       France 2005  0.6218551  0.8568985
## 148      Germany 2005  0.6853262  0.9405295
## 149        India 2005 -1.0721756 -1.1975587
## 150        Italy 2005  0.3961323  0.4554741
## 151        Japan 2005  1.8880689  1.9728308
## 152       Mexico 2005 -0.6589244 -0.5667654
## 153       Russia 2005 -0.9295594 -0.9447568
## 154 South Africa 2005 -0.8565613 -0.7662340
## 155           UK 2005  1.0515299  0.9504230
## 156          USA 2005  1.7674729  1.7208066
## 157       Brazil 2006 -0.8035890 -0.9855563
## 158        China 2006 -0.9908859 -1.0358174
## 159       France 2006  0.6523705  0.8504059
## 160      Germany 2006  0.7540965  0.8619451
## 161        India 2006 -1.0687929 -1.1536747
## 162        Italy 2006  0.4206194  0.3886818
## 163        Japan 2006  1.9393373  1.8289716
## 164       Mexico 2006 -0.6413948 -0.5909490
## 165       Russia 2006 -0.9134088 -0.8617514
## 166 South Africa 2006 -0.8450644 -0.7399544
## 167           UK 2006  1.0942739  0.8604110
## 168          USA 2006  1.8157475  1.6573785
## 169       Brazil 2007 -0.7877419 -1.0549447
## 170        China 2007 -0.9738175 -1.0737488
## 171       France 2007  0.6816438  0.8691502
## 172      Germany 2007  0.8178266  0.7885249
## 173        India 2007 -1.0648855 -1.1493041
## 174        Italy 2007  0.4350695  0.3825637
## 175        Japan 2007  2.0059630  1.7721382
## 176       Mexico 2007 -0.6321316 -0.6218190
## 177       Russia 2007 -0.8954683 -0.8910639
## 178 South Africa 2007 -0.8331376 -0.7699301
## 179           UK 2007  1.1598900  0.7696947
## 180          USA 2007  1.8433858  1.6440912

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0909 -0.8946 -0.2313  0.0000  0.6642  2.0060

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.22262 -0.94872 -0.07606  0.00000  0.83635  2.11729

We see that our predictions are a bit off. Let’s look at some graphs.

Not quite matching up

Let’s treat this as a training dataset. I’d like to add more countries to this. Currently there are only 12, but what happens when we add all the countries in the world with publicly available data? Would that change our clusters? How about our regression?

Can we predict GDPCAP using linear regression?

Jason Touleyrou

January 29, 2018

Some of these may not be needed - this data set was used for a separate project last semester that combined plm, glm, lm and clustering using k-means. Data available here: https://drive.google.com/file/d/1wqoU8Fi-ZyG_iiD5PdaDD1_J7VwebJLD/view

This allows us to see the current structure of the data and identify objects that may need to be modified prior to building our model.

Using the head function gives us a snapshot of what the current values of the data look like in their dataframe.

First, we scale the data so that all the means of the nonfactor data are 0. We will confirm this has been completed by running the summary function.

We will start by fitting a model on all variables with the lm() function, where y = GDPCAP.

This appears to be a really good fit if we look only at R² and the p-values, which show significance. In the following, we’ll take a deeper dive into what our model shows with some plots.

Plots seem to show a pretty good fit, though I still have reservations.

Removing the highly correlated variables gives us a worse R². We removed anything with a VIF of 10 or higher.

Next, since VIF did not improve our fit, we will use forward and backward selection to give us the best fit. First we must create a regression model of the form lm(thing ~ 1). This gives us a model with only the intercept.

Stepwise forward selection gives us the following: the lowest AIC is 2747.4. The variables selected were GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + GDPGROWTH + INTERNET. Pretty good. Let’s see what backward selection gives us.

Backward selection gives us the lowest AIC. The AIC is 2734.26 with the following model: lm(formula = data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + Category,

data = data).

Above we declared our bestfit from the backward selection method. We then used ANOVA to compare the datafit and bestfit models. The R² is similar to datafit and the error is one higher, but we have more degrees of freedom.

We see that our predictions are a bit off. Let’s look at some graphs.

Not quite matching up

Let’s treat this as a training dataset. I’d like to add more countries to this. Currently there are only 12, but what happens when we add all the countries in the world with publicly available data? Would that change our clusters? How about our regression?