# Knitr setup: hide the source of every chunk in the rendered report so that
# only chunk output and narrative text are shown.
knitr::opts_chunk$set(echo = FALSE)
# NOTE: the former `rm(list = ls())` call was removed. It does not produce a
# clean session (loaded packages and options survive), and when the document
# is rendered from an existing session it deletes the caller's workspace.
# Restart R (or knit in a fresh session) for a reproducible run.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Warning: package 'car' was built under R version 3.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(gplots)
## Warning: package 'gplots' was built under R version 3.4.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(lme4)
## Warning: package 'lme4' was built under R version 3.4.2
## Loading required package: Matrix
library(apsrtable)
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.4.3
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(mctest)
## Warning: package 'mctest' was built under R version 3.4.2
library(ppcor)
## Warning: package 'ppcor' was built under R version 3.4.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.4.3
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(NbClust)
library(cluster)
## Warning: package 'cluster' was built under R version 3.4.3
library(maps)
## Warning: package 'maps' was built under R version 3.4.3
##
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
##
## votes.repub
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.4.3
library(leaps)
## Warning: package 'leaps' was built under R version 3.4.3
# Load the master data set into `data`.
# Data available here:
# https://drive.google.com/file/d/1wqoU8Fi-ZyG_iiD5PdaDD1_J7VwebJLD/view
#
# The file location can be overridden with the MASTER_DATA_CSV environment
# variable; when it is unset or empty, the original hard-coded location is
# used, so existing behavior is unchanged.
data_path <- Sys.getenv("MASTER_DATA_CSV", unset = NA)
if (is.na(data_path) || !nzchar(data_path)) {
  data_path <- "C:/Users/jason/Downloads/Updated Master Data.csv"
}
if (!file.exists(data_path)) {
  stop(
    "Data file not found: ", data_path,
    " (download it from the link above or set MASTER_DATA_CSV).",
    call. = FALSE
  )
}
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0, where the
# read.csv() default changed to FALSE; it matches the pre-4.0 default.
data <- read.csv(data_path, stringsAsFactors = TRUE)
Some of these packages may not be needed - this data set was used for a separate project last semester that combined plm, glm, lm, and clustering using k-means. Data available here: https://drive.google.com/file/d/1wqoU8Fi-ZyG_iiD5PdaDD1_J7VwebJLD/view
## 'data.frame': 180 obs. of 14 variables:
## $ geo_idx : int 1 2 3 4 5 6 7 8 9 10 ...
## $ YEAR : int 1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
## $ Cell : num 0.1159 0.0537 0.9963 2.1926 0 ...
## $ ENERGY : num 338.5 62.6 1933.6 1554.4 48.7 ...
## $ INTERNET : num 0.025498 0.00017 0.574087 0.46232 0.000218 ...
## $ PATENT : int 2184 422 30413 0 2027 45587 62400 1406 0 5828 ...
## $ HEALTH : num 6.7 3.5 10.4 10.1 4.1 ...
## $ MIL : num 1.93 2.07 3.28 1.85 2.83 ...
## $ INFANT : num 44.6 40.2 6.3 5.9 81.7 7.3 4.3 32 22.1 46.6 ...
## $ FEMEDU : num 6.4 6.1 10 10.8 3.3 10.4 12.2 7.3 11.7 7.6 ...
## $ GDPCAP : num 3380 536 18823 20344 333 ...
## $ GDPGROWTH: num 3.06 12.7 -1.08 -1.65 2.72 ...
## $ POP : num 1578 12030 577 807 9245 ...
## $ Category : Factor w/ 2 levels "Developed","Emerging": 2 2 1 1 2 1 1 2 2 2 ...
This allows us to see the current structure of the data and identify objects that may need to be modified prior to building our model.
## geo_idx YEAR Cell ENERGY INTERNET PATENT HEALTH MIL
## 1 1 1993 0.11593404 338.47540 0.025498253 2184 6.7 1.929473
## 2 2 1993 0.05367264 62.55920 0.000170147 422 3.5 2.071665
## 3 3 1993 0.99630325 1933.63500 0.574086886 30413 10.4 3.283614
## 4 4 1993 2.19259004 1554.44600 0.462319739 0 10.1 1.853910
## 5 5 1993 0.00000000 48.65522 0.000218176 2027 4.1 2.827158
## 6 6 1993 2.12017489 989.39810 0.122689814 45587 7.2 2.004464
## INFANT FEMEDU GDPCAP GDPGROWTH POP Category
## 1 44.6 6.4 3380.1284 3.0625488 1578.1222 Emerging
## 2 40.2 6.1 536.3573 12.6969383 12029.8296 Emerging
## 3 6.3 10.0 18822.6976 -1.0794842 577.4988 Developed
## 4 5.9 10.8 20344.2176 -1.6507664 806.7600 Developed
## 5 81.7 3.3 332.6057 2.7177655 9244.7563 Emerging
## 6 7.3 10.4 16817.5977 -0.9134017 570.9741 Developed
Using the head function gives us a snapshot of what the current values of the data look like in their dataframe.
## geo_idx YEAR Cell ENERGY
## Min. : 1.00 Min. :1993 Min. :-0.9520 Min. :-1.1010
## 1st Qu.: 3.75 1st Qu.:1996 1st Qu.:-0.8685 1st Qu.:-0.7905
## Median : 6.50 Median :2000 Median :-0.3926 Median :-0.2685
## Mean : 6.50 Mean :2000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 9.25 3rd Qu.:2004 3rd Qu.: 0.7725 3rd Qu.: 0.5303
## Max. :12.00 Max. :2007 Max. : 3.1748 Max. : 2.9348
## INTERNET PATENT HEALTH MIL
## Min. :-0.7896 Min. :-0.7956 Min. :-1.50947 Min. :-1.6882
## 1st Qu.:-0.7487 1st Qu.:-0.7321 1st Qu.:-0.80329 1st Qu.:-0.6289
## Median :-0.4800 Median :-0.3405 Median :-0.08599 Median :-0.1740
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.4852 3rd Qu.: 0.3209 3rd Qu.: 0.63143 3rd Qu.: 0.5724
## Max. : 2.5887 Max. : 4.4546 Max. : 3.23443 Max. : 3.5475
## INFANT FEMEDU GDPCAP GDPGROWTH
## Min. :-0.8879 Min. :-2.4770 Min. :-1.0909 Min. :-4.3167
## 1st Qu.:-0.7760 1st Qu.:-0.7523 1st Qu.:-0.8946 1st Qu.:-0.4673
## Median :-0.5018 Median : 0.3119 Median :-0.2313 Median :-0.1561
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4721 3rd Qu.: 0.8624 3rd Qu.: 0.6642 3rd Qu.: 0.3281
## Max. : 3.0027 Max. : 1.2293 Max. : 2.0060 Max. : 3.1193
## POP Category
## Min. :-0.6214 Developed:90
## 1st Qu.:-0.5725 Emerging :90
## Median :-0.4242
## Mean : 0.0000
## 3rd Qu.:-0.1956
## Max. : 2.5824
## [1] 1 2 3 4 5 6
First, we scale the data so that the means of the numeric variables are 0 (the geo_idx and YEAR identifier columns are left unscaled, as the summary output shows). We confirm this has been completed by running the summary function.
Second, we will change the numbers that represent the countries to their names as character strings. This allows the map function we used in ggplot to match up and display the countries with their respective colors/groupings. We will confirm the result by using the head function on the geo_idx column only, to check that the data has been transformed.
Last week we discussed PCA and used k-means clustering to create three clusters from our dataset. This week we will use linear regression to build a model. This model will attempt to predict GDP capacity per capita (GDPCAP) for each of our countries for each year. But first, let's look at our individual nations and how GDP capacity varies per country.

We will start by fitting a model on all variables with the lm() function, where y = GDPCAP.
##
## Call:
## lm(formula = data$GDPCAP ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44595 -0.08237 -0.01831 0.08898 0.52953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.022310 11.106896 0.902 0.368178
## geo_idx -0.019180 0.006280 -3.054 0.002630 **
## YEAR -0.004705 0.005560 -0.846 0.398632
## Cell -0.024552 0.030691 -0.800 0.424882
## ENERGY 0.640197 0.059640 10.734 < 2e-16 ***
## INTERNET 0.100513 0.027955 3.596 0.000427 ***
## PATENT 0.152445 0.026996 5.647 6.94e-08 ***
## HEALTH -0.322767 0.037342 -8.644 4.41e-15 ***
## MIL -0.248648 0.016358 -15.200 < 2e-16 ***
## INFANT 0.149581 0.035648 4.196 4.41e-05 ***
## FEMEDU 0.198968 0.044302 4.491 1.32e-05 ***
## GDPGROWTH -0.068797 0.015664 -4.392 1.99e-05 ***
## POP 0.025330 0.023449 1.080 0.281603
## CategoryEmerging -0.974335 0.054487 -17.882 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1472 on 166 degrees of freedom
## Multiple R-squared: 0.9799, Adjusted R-squared: 0.9783
## F-statistic: 622.9 on 13 and 166 DF, p-value: < 2.2e-16
This appears to be a really good fit if we only look at R² and the p-values, which show significance. In the following, we'll take a deeper dive into what our model shows with some plots.

Plots seem to show a pretty good fit, though I still have reservations.
## geo_idx YEAR Cell ENERGY INTERNET PATENT HEALTH
## 3.905549 4.795419 7.784075 29.393132 6.457750 6.022510 11.522995
## MIL INFANT FEMEDU GDPGROWTH POP Category
## 2.211203 10.501567 16.218906 2.027566 4.543819 6.167758
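Removing the highly correlated variables gives us a worse R² (0.88 versus 0.98). We intended to remove anything with a VIF of 10 or higher (ENERGY, HEALTH, INFANT); note that FEMEDU (VIF 16.2) is still in the formula below, while geo_idx and Category, whose VIFs are below 10, were also dropped.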
##
## Call:
## lm(formula = GDPCAP ~ YEAR + Cell + INTERNET + PATENT + MIL +
## +FEMEDU + GDPGROWTH + POP, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.38105 -0.22571 -0.01888 0.22818 0.99351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 165.31937 21.67712 7.626 1.60e-12 ***
## YEAR -0.08266 0.01084 -7.626 1.60e-12 ***
## Cell 0.16037 0.06276 2.555 0.011477 *
## INTERNET 0.21669 0.06386 3.393 0.000858 ***
## PATENT 0.50449 0.04752 10.616 < 2e-16 ***
## MIL -0.19797 0.03050 -6.490 8.92e-10 ***
## FEMEDU 0.38471 0.05565 6.913 9.03e-11 ***
## GDPGROWTH -0.04854 0.03686 -1.317 0.189660
## POP 0.10772 0.04990 2.158 0.032286 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3512 on 171 degrees of freedom
## Multiple R-squared: 0.8821, Adjusted R-squared: 0.8766
## F-statistic: 160 on 8 and 171 DF, p-value: < 2.2e-16
Next, since removing the high-VIF variables did not improve our fit, we will use forward and backward stepwise selection to find the best fit. First we must create a regression model of the form lm(GDPCAP ~ 1, data = data). This gives us a model with only the intercept.
##
## Call:
## lm(formula = GDPCAP ~ 1, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0909 -0.8946 -0.2313 0.6642 2.0060
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.965e-17 7.454e-02 0 1
##
## Residual standard error: 1 on 179 degrees of freedom
## Start: AIC=1
## GDPCAP ~ 1
##
## Df Sum of Sq RSS AIC
## + Category 1 150.140 28.860 -325.49
## + PATENT 1 131.638 47.362 -236.32
## + ENERGY 1 129.194 49.806 -227.27
## + FEMEDU 1 103.492 75.508 -152.37
## + INFANT 1 96.749 82.251 -136.97
## + HEALTH 1 89.834 89.166 -122.44
## + INTERNET 1 77.096 101.904 -98.41
## + Cell 1 37.068 141.932 -38.77
## + POP 1 33.322 145.678 -34.08
## + geo_idx 1 19.125 159.875 -17.34
## + GDPGROWTH 1 14.748 164.252 -12.48
## <none> 179.000 1.00
## + YEAR 1 1.226 177.774 1.76
## + MIL 1 0.072 178.928 2.92
##
## Step: AIC=-325.49
## GDPCAP ~ Category
##
## Df Sum of Sq RSS AIC
## + PATENT 1 18.1282 10.732 -501.55
## + ENERGY 1 9.9337 18.927 -399.43
## + INTERNET 1 5.8799 22.980 -364.50
## + geo_idx 1 4.1835 24.677 -351.68
## + FEMEDU 1 4.0965 24.764 -351.04
## + HEALTH 1 2.6349 26.225 -340.72
## + YEAR 1 1.2264 27.634 -331.30
## + MIL 1 0.7591 28.101 -328.29
## + Cell 1 0.4540 28.406 -326.34
## + INFANT 1 0.4099 28.450 -326.06
## <none> 28.860 -325.49
## + GDPGROWTH 1 0.1918 28.669 -324.69
## + POP 1 0.1352 28.725 -324.33
##
## Step: AIC=-501.55
## GDPCAP ~ Category + PATENT
##
## Df Sum of Sq RSS AIC
## + MIL 1 1.85718 8.875 -533.75
## + POP 1 0.58317 10.149 -509.60
## + geo_idx 1 0.56193 10.170 -509.23
## + FEMEDU 1 0.37896 10.353 -506.02
## + GDPGROWTH 1 0.37467 10.357 -505.94
## + ENERGY 1 0.23914 10.493 -503.60
## + INTERNET 1 0.17303 10.559 -502.47
## + INFANT 1 0.15263 10.579 -502.13
## <none> 10.732 -501.55
## + Cell 1 0.00482 10.727 -499.63
## + HEALTH 1 0.00080 10.731 -499.56
## + YEAR 1 0.00047 10.732 -499.56
##
## Step: AIC=-533.75
## GDPCAP ~ Category + PATENT + MIL
##
## Df Sum of Sq RSS AIC
## + ENERGY 1 2.07493 6.8000 -579.69
## + geo_idx 1 1.36819 7.5068 -561.89
## + FEMEDU 1 0.96142 7.9135 -552.39
## + GDPGROWTH 1 0.36599 8.5090 -539.33
## + POP 1 0.33796 8.5370 -538.74
## <none> 8.8750 -533.75
## + HEALTH 1 0.09626 8.7787 -533.71
## + INFANT 1 0.08984 8.7851 -533.58
## + YEAR 1 0.05833 8.8166 -532.94
## + INTERNET 1 0.05656 8.8184 -532.90
## + Cell 1 0.02383 8.8511 -532.23
##
## Step: AIC=-579.69
## GDPCAP ~ Category + PATENT + MIL + ENERGY
##
## Df Sum of Sq RSS AIC
## + HEALTH 1 1.73871 5.0613 -630.84
## + geo_idx 1 0.48301 6.3170 -590.95
## + FEMEDU 1 0.40101 6.3990 -588.63
## + GDPGROWTH 1 0.17801 6.6220 -582.46
## <none> 6.8000 -579.69
## + YEAR 1 0.06871 6.7313 -579.51
## + INFANT 1 0.02716 6.7729 -578.41
## + POP 1 0.02318 6.7769 -578.30
## + INTERNET 1 0.01131 6.7887 -577.98
## + Cell 1 0.00850 6.7915 -577.91
##
## Step: AIC=-630.84
## GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH
##
## Df Sum of Sq RSS AIC
## + GDPGROWTH 1 0.59202 4.4693 -651.23
## + INTERNET 1 0.24950 4.8118 -637.94
## + POP 1 0.19757 4.8638 -636.01
## + FEMEDU 1 0.19100 4.8703 -635.76
## + Cell 1 0.07300 4.9883 -631.45
## + geo_idx 1 0.06118 5.0002 -631.03
## <none> 5.0613 -630.84
## + YEAR 1 0.00256 5.0588 -628.93
## + INFANT 1 0.00032 5.0610 -628.85
##
## Step: AIC=-651.23
## GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + GDPGROWTH
##
## Df Sum of Sq RSS AIC
## + INTERNET 1 0.37405 4.0953 -664.96
## + Cell 1 0.17680 4.2925 -656.50
## + YEAR 1 0.10700 4.3623 -653.59
## + FEMEDU 1 0.09309 4.3762 -653.02
## <none> 4.4693 -651.23
## + geo_idx 1 0.00154 4.4678 -649.29
## + INFANT 1 0.00149 4.4678 -649.29
## + POP 1 0.00082 4.4685 -649.26
##
## Step: AIC=-664.96
## GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + GDPGROWTH +
## INTERNET
##
## Df Sum of Sq RSS AIC
## <none> 4.0953 -664.96
## + YEAR 1 0.033015 4.0622 -664.42
## + FEMEDU 1 0.023056 4.0722 -663.98
## + INFANT 1 0.022562 4.0727 -663.96
## + Cell 1 0.010860 4.0844 -663.44
## + geo_idx 1 0.002267 4.0930 -663.06
## + POP 1 0.000070 4.0952 -662.97
##
## Call:
## lm(formula = GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH +
## GDPGROWTH + INTERNET, data = data)
##
## Coefficients:
## (Intercept) CategoryEmerging PATENT MIL
## 0.47540 -0.95080 0.19405 -0.21636
## ENERGY HEALTH GDPGROWTH INTERNET
## 0.61783 -0.30891 -0.06927 0.06678
Stepwise forward selection gives us the following: the lowest AIC is -664.96. The variables selected were GDPCAP ~ Category + PATENT + MIL + ENERGY + HEALTH + GDPGROWTH + INTERNET. Pretty good. Let's see what backward selection gives us.
## Start: AIC=-676.38
## data$GDPCAP ~ geo_idx + YEAR + Cell + ENERGY + INTERNET + PATENT +
## HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + POP + Category
##
## Df Sum of Sq RSS AIC
## - Cell 1 0.0139 3.6096 -677.69
## - YEAR 1 0.0155 3.6112 -677.60
## - POP 1 0.0253 3.6210 -677.12
## <none> 3.5957 -676.38
## - geo_idx 1 0.2020 3.7978 -668.54
## - INTERNET 1 0.2800 3.8758 -664.88
## - INFANT 1 0.3814 3.9771 -660.23
## - GDPGROWTH 1 0.4179 4.0136 -658.59
## - FEMEDU 1 0.4369 4.0326 -657.74
## - PATENT 1 0.6907 4.2864 -646.75
## - HEALTH 1 1.6183 5.2141 -611.49
## - ENERGY 1 2.4959 6.0917 -583.49
## - MIL 1 5.0049 8.6006 -521.40
## - Category 1 6.9263 10.5220 -485.11
##
## Step: AIC=-677.69
## data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH +
## MIL + INFANT + FEMEDU + GDPGROWTH + POP + Category
##
## Df Sum of Sq RSS AIC
## - POP 1 0.0319 3.6414 -678.10
## <none> 3.6096 -677.69
## - YEAR 1 0.0568 3.6664 -676.88
## - geo_idx 1 0.2072 3.8168 -669.64
## - INTERNET 1 0.3007 3.9103 -665.28
## - INFANT 1 0.3684 3.9780 -662.19
## - FEMEDU 1 0.4233 4.0328 -659.73
## - GDPGROWTH 1 0.4418 4.0514 -658.90
## - PATENT 1 0.7245 4.3340 -646.76
## - HEALTH 1 1.6627 5.2723 -611.49
## - ENERGY 1 2.7751 6.3847 -577.03
## - MIL 1 5.0715 8.6811 -521.72
## - Category 1 8.7270 12.3366 -458.47
##
## Step: AIC=-678.1
## data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH +
## MIL + INFANT + FEMEDU + GDPGROWTH + Category
##
## Df Sum of Sq RSS AIC
## <none> 3.6414 -678.10
## - YEAR 1 0.0479 3.6893 -677.75
## - geo_idx 1 0.2452 3.8867 -668.37
## - INTERNET 1 0.3109 3.9524 -665.36
## - INFANT 1 0.3637 4.0051 -662.97
## - FEMEDU 1 0.4042 4.0456 -661.16
## - GDPGROWTH 1 0.4333 4.0747 -659.87
## - PATENT 1 0.7890 4.4304 -644.80
## - HEALTH 1 2.1821 5.8236 -595.59
## - ENERGY 1 3.1721 6.8136 -567.33
## - MIL 1 5.3777 9.0191 -516.85
## - Category 1 8.7097 12.3512 -460.26
##
## Call:
## lm(formula = data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET +
## PATENT + HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + Category,
## data = data)
##
## Coefficients:
## (Intercept) geo_idx YEAR ENERGY
## 13.93140 -0.02077 -0.00666 0.67140
## INTERNET PATENT HEALTH MIL
## 0.09023 0.15992 -0.34329 -0.24321
## INFANT FEMEDU GDPGROWTH CategoryEmerging
## 0.14454 0.16923 -0.06137 -0.95182
Backward selection gives us the lowest AIC: -678.10 (versus -664.96 for forward selection), with the following model: lm(formula = data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + Category,
data = data).
##
## Call:
## lm(formula = data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET +
## PATENT + HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + Category,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41811 -0.08861 -0.01894 0.08453 0.53730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.931396 8.955996 1.556 0.121700
## geo_idx -0.020768 0.006174 -3.364 0.000953 ***
## YEAR -0.006660 0.004482 -1.486 0.139124
## ENERGY 0.671402 0.055499 12.097 < 2e-16 ***
## INTERNET 0.090228 0.023824 3.787 0.000212 ***
## PATENT 0.159917 0.026506 6.033 9.95e-09 ***
## HEALTH -0.343287 0.034214 -10.034 < 2e-16 ***
## MIL -0.243214 0.015441 -15.751 < 2e-16 ***
## INFANT 0.144543 0.035289 4.096 6.52e-05 ***
## FEMEDU 0.169225 0.039190 4.318 2.68e-05 ***
## GDPGROWTH -0.061370 0.013726 -4.471 1.43e-05 ***
## CategoryEmerging -0.951821 0.047483 -20.046 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1472 on 168 degrees of freedom
## Multiple R-squared: 0.9797, Adjusted R-squared: 0.9783
## F-statistic: 735.5 on 11 and 168 DF, p-value: < 2.2e-16

## Analysis of Variance Table
##
## Model 1: data$GDPCAP ~ geo_idx + YEAR + ENERGY + INTERNET + PATENT + HEALTH +
## MIL + INFANT + FEMEDU + GDPGROWTH + Category
## Model 2: data$GDPCAP ~ geo_idx + YEAR + Cell + ENERGY + INTERNET + PATENT +
## HEALTH + MIL + INFANT + FEMEDU + GDPGROWTH + POP + Category
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 168 3.6414
## 2 166 3.5957 2 0.045715 1.0552 0.3504
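Above we declared our bestfit model from the backward selection method. We then used ANOVA to compare the datafit and bestfit models. The R² is nearly identical to datafit (0.9797 versus 0.9799) and the residual sum of squares is only slightly higher (3.6414 versus 3.5957), but we gain two residual degrees of freedom (168 versus 166). The F-test (p = 0.35) indicates that the two dropped variables, Cell and POP, do not significantly improve the fit.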
## 1 2 3 4 5 6
## -0.9299421 -1.0033906 0.3186223 0.3786556 -1.0170498 0.4509672
## 7 8 9 10 11 12
## 1.2387695 -0.5931275 -1.0716209 -1.0285418 0.4600126 0.9720158
## 13 14 15 16 17 18
## -0.9822191 -0.9546097 0.2723538 0.2999530 -1.0027749 0.3273582
## 19 20 21 22 23 24
## 1.2655205 -0.6017745 -1.1536335 -1.1285187 0.4838275 1.0353649
## 25 26 27 28 29 30
## -0.9160424 -0.9211300 0.3089280 0.3567295 -1.0515545 0.2906975
## 31 32 33 34 35 36
## 1.3224865 -0.4244683 -1.1361338 -1.0374409 0.6126838 1.2864221
## 37 38 39 40 41 42
## -0.8171832 -0.9659702 0.4934597 0.4296327 -1.0547683 0.2972946
## 43 44 45 46 47 48
## 1.2788516 -0.6464483 -0.9876786 -0.9648818 0.7024472 1.3934874
## 49 50 51 52 53 54
## -0.8725043 -0.9380773 0.4598259 0.5690214 -1.0204872 0.2868329
## 55 56 57 58 59 60
## 1.2149269 -0.6930386 -1.0683053 -0.8879040 0.6496592 1.4138957
## 61 62 63 64 65 66
## -0.8466510 -0.9486410 0.5514954 0.5901278 -1.1143584 0.2982505
## 67 68 69 70 71 72
## 1.5186234 -0.6519834 -0.5827428 -0.7454075 0.7783591 1.5759286
## 73 74 75 76 77 78
## -0.8590851 -0.9609911 0.6187487 0.6473179 -1.2226168 0.3366205
## 79 80 81 82 83 84
## 1.4986126 -0.6677172 -0.8230224 -0.7735036 0.8334953 1.5478364
## 85 86 87 88 89 90
## -0.9455591 -1.0000630 0.6734713 0.6659078 -1.1634202 0.3245598
## 91 92 93 94 95 96
## 1.4732507 -0.7041273 -0.9489750 -0.8341165 0.8479688 1.5749461
## 97 98 99 100 101 102
## -0.9579354 -1.0486323 0.7809875 0.7023973 -1.1813115 0.3623259
## 103 104 105 106 107 108
## 1.6381591 -0.6071022 -0.8809178 -0.7667840 0.8979165 1.5405583
## 109 110 111 112 113 114
## -0.9801997 -1.0791666 0.7537096 0.8315097 -1.1175351 0.3908585
## 115 116 117 118 119 120
## 2.1172852 -0.6248700 -0.9167528 -0.8888009 0.9313344 1.6247623
## 121 122 123 124 125 126
## -0.9847729 -1.0720984 0.8226900 0.8723982 -1.1780756 0.3541677
## 127 128 129 130 131 132
## 1.8067999 -0.6020735 -0.9034680 -0.8198419 0.9114827 1.5300290
## 133 134 135 136 137 138
## -1.0648713 -1.0467838 0.8449187 0.9099869 -1.1869288 0.3953432
## 139 140 141 142 143 144
## 1.8657922 -0.6095543 -0.8175898 -0.7609553 0.9029892 1.6110790
## 145 146 147 148 149 150
## -1.0160268 -1.0155456 0.8568985 0.9405295 -1.1975587 0.4554741
## 151 152 153 154 155 156
## 1.9728308 -0.5667654 -0.9447568 -0.7662340 0.9504230 1.7208066
## 157 158 159 160 161 162
## -0.9855563 -1.0358174 0.8504059 0.8619451 -1.1536747 0.3886818
## 163 164 165 166 167 168
## 1.8289716 -0.5909490 -0.8617514 -0.7399544 0.8604110 1.6573785
## 169 170 171 172 173 174
## -1.0549447 -1.0737488 0.8691502 0.7885249 -1.1493041 0.3825637
## 175 176 177 178 179 180
## 1.7721382 -0.6218190 -0.8910639 -0.7699301 0.7696947 1.6440912
We see that our predictions are a bit off. Let's see some graphs.

Not quite matching up


Let's treat this as a training dataset. I'd like to add more countries to this. Currently there are only 12, but what happens when we add all the countries in the world with publicly available data? Would that change our clusters? How about our regression?