http://www.bethanylacina.com/docs/Lacina_civilwar_severity.pdf http://hdl.handle.net/1902.1/22535

library(tidyverse)
library(haven)
library(DT)
library(stargazer)
library(GGally)
library(car)
data2006 <- read_dta("G:/My Drive/homework/Jenna S/Lacina_JCR_2006_postedJan07.dta")

# https://stackoverflow.com/a/54282131
data.frame(
    "name" = names(data2006[ , -c(1:3)]),
    "label" = sapply(data2006[ , -c(1:3)], function(x) attr(x, "label"))  %>% as.character()
    ) %>%
  datatable(options = list(pageLength = 5))
data2006 %>%
  as.data.frame() %>%
  stargazer(type = "text")
## 
## ===============================================================================================
## Statistic       N       Mean         St. Dev.       Min    Pctl(25)   Pctl(75)        Max      
## -----------------------------------------------------------------------------------------------
## ID             114   2,086.675        615.437      1,010    1,622       2,555        3,211     
## ccode          114    554.316         227.977       40      380.8        750          850      
## region         114     3.246           1.743         1        2          4.8           6       
## secession      114     0.377           0.487         0        0           1            1       
## battledeadbest 114   62,677.140     230,855.800     900    3,196.8     40,475      2,097,705   
## lnbdb          114     9.362           1.711       6.802    8.070      10.608        14.556    
## battledeadlow  114   59,497.240     230,424.900     418    2,594.5     34,386      2,097,705   
## lnbdl          114     9.196           1.781       6.035    7.860      10.445        14.556    
## battledeadhigh 114   67,705.100     232,758.200     900    3,561.5     49,075      2,097,705   
## lnbdh          114     9.450           1.748       6.802    8.178      10.801        14.556    
## pop            114 90,287,581.000 193,274,165.000 611,000 5,561,250  52,170,750   929,358,016  
## lnpop          114     16.760          1.730      13.323    15.531     17.769        20.650    
## dper100k       114    459.743        1,104.403     0.161    14.080     314.112     8,870.539   
## begin          114   1,975.500        16.570       1,946    1,963       1,991        2,001     
## end            114   1,983.096        18.173       1,946    1,971       1,998        2,002     
## duration       114     8.596           9.605         1        2         11.8           55      
## lnduration     114     1.557           1.137         0       0.7         2.5           4       
## rate           114   9,023.805      30,645.620      65      1,000       5,375       300,000    
## lnrate         114     7.805           1.446       4.168    6.908       8.589        12.612    
## cw             114     0.667           0.473         0        0           1            1       
## intervention   114     0.553           0.499         0        0           1            1       
## democ          114     0.175           0.382         0        0           0            1       
## lngdp          107     7.029           0.973       3.912    6.341       7.755        9.052     
## gdpsq          107     12.960          2.419       3.892    12.088     14.108        17.808    
## milper         114    372.070         805.239        1       19.2        250         4,158     
## milperpop      114    0.00001         0.00001     0.00000  0.00000     0.00001       0.0001    
## lnmilperpop    114    -12.525          1.045      -15.614  -13.313     -11.794       -9.433    
## milex          112 1,903,079.000   6,345,521.000  63.000  41,257.250 850,000.000 57,107,000.000
## milqual        112   6,399.334      10,761.100     2.520  1,141.071   6,256.646    67,600.000  
## lnmilqual      112     7.950           1.375       0.924    7.040       8.741        11.121    
## milq2          112     15.236          3.656      -15.478   14.239     16.146        22.150    
## mountain       114     23.271         22.296       0.000    7.300      37.325        81.000    
## lnmountain     114     2.688           1.115       0.000    2.116       3.646        4.407     
## ethfrac        114     0.495           0.291       0.004    0.227       0.755        0.902     
## lnethfrac      114     -1.049          1.070      -5.493    -1.485     -0.281        -0.104    
## relfrac        114     0.398           0.197       0.000    0.222       0.551        0.753     
## lnrelfrac      113     -1.092          0.705      -3.922    -1.507     -0.596        -0.283    
## relpolar       114     0.781           0.416         0        1           1            1       
## ethnicpolar    114     0.781           0.416         0        1           1            1       
## -----------------------------------------------------------------------------------------------
# add summarize here

model1 <- 
  data2006 %>%
  lm(lnbdb ~ lnduration + lnmilperpop + lnmilqual + lngdp + cw + 
       lnmountain + democ + ethnicpolar + relpolar, data = .)
model1 %>% summary()
## 
## Call:
## lm(formula = lnbdb ~ lnduration + lnmilperpop + lnmilqual + lngdp + 
##     cw + lnmountain + democ + ethnicpolar + relpolar, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7396 -0.9133  0.0152  0.6963  3.1723 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.40574    2.61835   5.502 3.17e-07 ***
## lnduration   0.85772    0.11729   7.313 8.20e-11 ***
## lnmilperpop  0.38708    0.15777   2.453   0.0160 *  
## lnmilqual    0.23131    0.12819   1.804   0.0743 .  
## lngdp       -0.46214    0.20365  -2.269   0.0255 *  
## cw           0.72658    0.30468   2.385   0.0191 *  
## lnmountain   0.05914    0.11617   0.509   0.6119    
## democ       -0.76046    0.34052  -2.233   0.0279 *  
## ethnicpolar -0.87988    0.32993  -2.667   0.0090 ** 
## relpolar     0.15785    0.31183   0.506   0.6139    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.286 on 95 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.4798, Adjusted R-squared:  0.4306 
## F-statistic: 9.737 on 9 and 95 DF,  p-value: 2.032e-10
data2006 %>%
  select(lnbdb, lnduration, lnmilperpop, lnmilqual, lngdp, cw, lnmountain, democ,
         ethnicpolar, relpolar) %>%
  ggpairs(axisLabels = "none")

model2 <- 
  data2006 %>%
  lm(lnbdb ~ lnduration + cw + democ + ethnicpolar, data = .)
model2 %>% summary()
## 
## Call:
## lm(formula = lnbdb ~ lnduration + cw + democ + ethnicpolar, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6584 -0.8932 -0.0507  0.6921  3.6222 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.5965     0.3472  24.763  < 2e-16 ***
## lnduration    0.8572     0.1106   7.751 5.12e-12 ***
## cw            0.5908     0.2686   2.199 0.029976 *  
## democ        -0.9122     0.3264  -2.795 0.006136 ** 
## ethnicpolar  -1.0283     0.3034  -3.389 0.000978 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.296 on 109 degrees of freedom
## Multiple R-squared:  0.447,  Adjusted R-squared:  0.4267 
## F-statistic: 22.03 on 4 and 109 DF,  p-value: 2.417e-13

Part (1)

data2006 %>%
  ggplot(aes(lnduration, lnbdb)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, col = "red", show.legend = TRUE) +
  labs(x = "Log of Duration",
       y = "Log of Battle Data Best Estimate")
## `geom_smooth()` using formula 'y ~ x'

data2006 %>% lm(lnbdb ~ lnduration, data = .) %>% summary()
## 
## Call:
## lm(formula = lnbdb ~ lnduration, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3934 -0.8756 -0.1360  0.7390  4.7763 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.0790     0.2291  35.263  < 2e-16 ***
## lnduration    0.8242     0.1190   6.925 2.89e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.438 on 112 degrees of freedom
## Multiple R-squared:  0.2998, Adjusted R-squared:  0.2936 
## F-statistic: 47.96 on 1 and 112 DF,  p-value: 2.892e-10

Part (2)

crPlots(model2, terms = ~ lnduration)

Part (3)

model2$residuals %>%
  as.data.frame() %>%
  ggplot(aes(.)) +
  geom_histogram() +
  ggtitle("Model 2 Redisuals")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

model2$residuals %>%
  as.data.frame() %>%
  ggplot(aes(.)) +
  geom_density() +
  ggtitle("Model 2 Redisuals")

model2$residuals %>%
  as.data.frame() %>%
  ggplot(aes(1:114, .)) +
  geom_hline(yintercept = mean(model2$residuals), col = "red") +
  geom_path() +
  ggtitle("Model 2 Redisuals") +
  labs(x = "Order", y = "Residual")

Part (4)

model2$residuals %>% 
  as.data.frame() %>%
  ggplot(aes(sample = .)) +
  geom_qq() + 
  geom_qq_line() +
  ggtitle("QQ Plot of Model 2 Residuals")

data2006 %>%
  ggplot(aes(sample = lnbdb)) +
  geom_qq() + 
  geom_qq_line() +
  ggtitle("Response = Log of Battle Data Best Estimate") +
  labs(subtitle = "QQ Plot")

Part (5)

data2006 %>%
  ggplot(aes(lnduration, model2$residuals)) +
  geom_point() +
  geom_hline(yintercept = mean(model2$residuals), col = "red") +
  labs(x = "Log Duration", y = "Model 2 Residuals")

Part (6)

model2 %>% 
  ggplot(aes(model2$fitted.values, model2$residuals)) +
  geom_point() +
  geom_hline(yintercept = mean(model2$residuals), col = "red") +
  labs(title = "Model 2",
       x = "Fitted Values",
       y = "Residuals")

Part (7)

https://www.wikiwand.com/en/Variance_inflation_factor#/Interpretation

model1 %>% vif()
##  lnduration lnmilperpop   lnmilqual       lngdp          cw  lnmountain 
##    1.113910    1.693288    1.980714    2.244088    1.248881    1.041766 
##       democ ethnicpolar    relpolar 
##    1.135256    1.144773    1.022562