Part 0

rm(list=ls(all=TRUE))
pacman::p_load(tidyverse, foreign, car, stargazer,
               naniar, olsrr, haven, magrittr, GGally, DT, corrplot,
               install = FALSE, update = FALSE) # lmtest, sandwich, reshape

setwd("G:/My Drive/homework")
cdata <- read_dta("Tommy K/countries.dta")
cdata %>% dim()
## [1] 193  58
# Count NAs for continuous variables
cdata %>% 
  select(!where(is.labelled)) %>%
  select(!where(is_character)) %>%
  select(!c(civil_lib, polity2)) %>%
  lapply(., is.na) %>%
  lapply(., sum) %>%
  as.matrix()
##                [,1]
## avg_school     50  
## bornyear       1   
## co2            7   
## conf_church    138 
## conf_gov       139 
## conf_unions    138 
## debt           124 
## dist_mag       76  
## durable        29  
## eff_part       81  
## ethnic_frac    6   
## free_bus       13  
## free_corrupt   13  
## free_finance   15  
## free_fiscal    16  
## free_govt      15  
## free_index     16  
## free_invest    15  
## free_labor     14  
## free_monet     15  
## free_propright 16  
## free_trade     15  
## gdp            51  
## gdp_pc         51  
## gend_equal     47  
## gini           108 
## globalize      12  
## gov_eff        2   
## health_exp     6   
## hiv            46  
## inf_mort       2   
## inflation      6   
## mil_exp        42  
## oil_prod       23  
## pop            51  
## post_mater     139 
## proportion_dem 0   
## prot_area      5   
## trade          19  
## unemp          72  
## urban_pop      2   
## women_parlmnt  4   
## years_office   19
# DV: globalize
# IV: bornyear, women_parlmnt, trade (can be discontinuous)
# ex: gdp, inflation, pop, unemp
cdata %<>% select(globalize, bornyear, women_parlmnt, trade)
cdata %>% glimpse()
## Rows: 193
## Columns: 4
## $ globalize     <dbl> 81.25179, 78.26860, 40.06293, 40.03246, 60.77024, 41....
## $ bornyear      <dbl> 1920, 1920, 1978, 1979, 1962, 1983, 1981, 1909, 1820,...
## $ women_parlmnt <dbl> 27.3, 33.6, 14.3, 21.7, 13.3, 6.7, 10.5, 43.2, 4.1, 1...
## $ trade         <dbl> 44.86351, 54.37358, 87.52569, 86.31783, 86.67049, 75....
cdata %>% summary()
##    globalize        bornyear    women_parlmnt       trade       
##  Min.   :22.81   Min.   :1624   Min.   : 0.00   Min.   : 22.12  
##  1st Qu.:45.54   1st Qu.:1919   1st Qu.: 9.10   1st Qu.: 57.59  
##  Median :54.71   Median :1960   Median :15.60   Median : 78.61  
##  Mean   :57.09   Mean   :1937   Mean   :17.37   Mean   : 86.13  
##  3rd Qu.:68.02   3rd Qu.:1976   3rd Qu.:23.50   3rd Qu.:104.64  
##  Max.   :92.61   Max.   :2006   Max.   :56.30   Max.   :421.57  
##  NA's   :12      NA's   :1      NA's   :4       NA's   :19
cdata %>% filter(!complete.cases(.)) %>% nrow()
## [1] 22
cdata %>% slice_sample(n = 10)
## # A tibble: 10 x 4
##    globalize bornyear women_parlmnt trade
##        <dbl>    <dbl>         <dbl> <dbl>
##  1      NA       1990           3    NA  
##  2      86.6     1816          27.4  63.5
##  3      59.3     1956          27.6  93.0
##  4      NA       1948          15.6  NA  
##  5      52.5     1966          30    NA  
##  6      48.2     1975          39.2  68.1
##  7      43.1     1960          12.5  81.6
##  8      NA       1968          NA    NA  
##  9      64.2     1920          44.5  55.7
## 10      55.0     1990          26.9 110.

Part 1

https://stackoverflow.com/a/61501002

cdata %>% ggpairs()

to_string <- as_labeller(c(`globalize` = "Global Index",  # ' also works instead of `
                           `bornyear` = "Independence Year",
                           `trade` = "Trade % of GDP EE-6",
                           `women_parlmnt` = "Women % Parliment"
                           )
                         )

cdata %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram(aes(y = stat(density))) +
  geom_density(col = "red") +
  facet_wrap(vars(key),
             scales = "free",
             labeller = to_string
             )

cdata %>%
  gather() %>%
  ggplot(aes(y = value)) +
  geom_boxplot() +
  facet_wrap(vars(key), 
             scales = "free",
             labeller = to_string
             )

Part 2

to_string <- as_labeller(c(`bornyear` = "Independence Year",
                           `trade` = "Trade % of GDP EE-6",
                           `women_parlmnt` = "Women % Parliment"
                           )
                         )

cdata %>%
  gather(IV, value, -globalize) %>%
  ggplot(aes(value, globalize)) +
  geom_point() +
  # geom_smooth(method = loess) +
  facet_wrap(vars(IV),
             scales = "free",
             labeller = to_string
             ) +
  ggtitle("Global Index by Our Indicators")

Part 3

https://cran.r-project.org/web/packages/naniar/vignettes/naniar-visualisation.html https://www.wikiwand.com/en/Missing_data#/Types

cdata %>% vis_miss()

cdata %>% gg_miss_upset()

to_string <- as_labeller(c(`bornyear` = "Independence Year",
                           `trade` = "Trade % of GDP EE-6",
                           `women_parlmnt` = "Women % Parliment"
                           )
                         )

cdata %>%
  gather(IV, value, -globalize) %>%
  ggplot(aes(value, globalize)) +
  geom_miss_point() +
  facet_wrap(vars(IV),
             scales = "free",
             labeller = to_string
             ) +
  ggtitle("Global Index by Our IV Indicators")

Part 4

model.fit <-
  cdata %>%
  lm(globalize ~ ., data = .)

summary(model.fit)
## 
## Call:
## lm(formula = globalize ~ ., data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.769 -10.170  -0.050   9.672  35.035 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   251.33282   35.38133   7.104 3.35e-11 ***
## bornyear       -0.10753    0.01827  -5.886 2.11e-08 ***
## women_parlmnt   0.26786    0.10076   2.658  0.00862 ** 
## trade           0.11528    0.02404   4.796 3.57e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.17 on 167 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  0.2719, Adjusted R-squared:  0.2588 
## F-statistic: 20.79 on 3 and 167 DF,  p-value: 1.706e-11
stargazer(model.fit, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                              globalize         
## -----------------------------------------------
## bornyear                     -0.108***         
##                               (0.018)          
##                                                
## women_parlmnt                0.268***          
##                               (0.101)          
##                                                
## trade                        0.115***          
##                               (0.024)          
##                                                
## Constant                    251.333***         
##                              (35.381)          
##                                                
## -----------------------------------------------
## Observations                    171            
## R2                             0.272           
## Adjusted R2                    0.259           
## Residual Std. Error      14.167 (df = 167)     
## F Statistic           20.788*** (df = 3; 167)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Part 5

model.fit %>%
  influencePlot(.,
                id.method="identify",
                main="Influence Plot",
                sub="Circle size is proportial to Cook's Distance"
                )

##        StudRes        Hat      CookD
## 34  -1.5709919 0.09435629 0.06372361
## 115 -0.5617155 0.32569323 0.03825672
## 135 -2.4326624 0.01164814 0.01693730
## 174  2.5255068 0.01028752 0.01605731
## 181 -2.0912677 0.20338750 0.27362204
model.fit %>% leveragePlots()

ols_plot_resid_stud(model.fit)

plot(model.fit, which = 4)

ols_plot_dfbetas(model.fit)

Model Rerun

model.fit <-
  cdata %>%
  slice(-c(34, 115, 135, 174, 181)) %>%
  lm(globalize ~ ., data = .)

summary(model.fit)
## 
## Call:
## lm(formula = globalize ~ ., data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.879  -9.310  -0.138   9.450  31.480 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   287.00446   38.15911   7.521 3.52e-12 ***
## bornyear       -0.12671    0.01970  -6.432 1.35e-09 ***
## women_parlmnt   0.28924    0.10375   2.788  0.00594 ** 
## trade           0.13182    0.02825   4.667 6.37e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.54 on 162 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  0.3111, Adjusted R-squared:  0.2983 
## F-statistic: 24.38 on 3 and 162 DF,  p-value: 4.496e-13
model.fit %>%
  influencePlot(.,
                id.method="identify",
                main="Influence Plot",
                sub="Circle size is proportial to Cook's Distance"
                )

##         StudRes        Hat        CookD
## 9   -2.28261259 0.05234699 7.013000e-02
## 33  -1.98217955 0.08913622 9.441579e-02
## 72  -2.45595552 0.01729790 2.574356e-02
## 127 -2.56685483 0.01771259 2.871153e-02
## 182  0.03872247 0.19669014 9.235292e-05
model.fit %>% leveragePlots()

ols_plot_resid_stud(model.fit)

plot(model.fit, which = 4)

ols_plot_dfbetas(model.fit)