Chile <- read.csv("C:/Users/aizax94/Downloads/Chile.CSV")
   View(Chile)

CODE FOR PRELIMINARY ANALYSIS OF VARIABLES:

9. Region:

tapply(Chile$region, Chile$vote, summary)
## $`0`
##   C   M   N   S  SA 
## 254  20 132 256 414 
## 
## $`1`
##   C   M   N   S  SA 
## 174  38 135 275 246
table(Chile$region, Chile$vote)
##     
##        0   1
##   C  254 174
##   M   20  38
##   N  132 135
##   S  256 275
##   SA 414 246
prop.table(table(Chile$region, Chile$vote)) * 100
##     
##              0         1
##   C  13.065844  8.950617
##   M   1.028807  1.954733
##   N   6.790123  6.944444
##   S  13.168724 14.146091
##   SA 21.296296 12.654321
k <- c("C", "M", "N", "S", "SA")
Chile$region <- factor(Chile$region, levels = k)
t <- table(Chile$region, Chile$vote)
p <- round(100*prop.table(t),2)
barplot(p, beside=T, col = c("green","purple", "red", "seagreen", "deepskyblue"), main = "Bar Chart of Vote by Region",ylab = "Percentage")
legend("topright", legend = rownames(p), fill = c("green","purple", "red", "seagreen", "deepskyblue"), cex = .75)

9A. Region table code:

t <- table(Chile$vote, Chile$region)
t
##    
##       C   M   N   S  SA
##   0 254  20 132 256 414
##   1 174  38 135 275 246
p <- round(100*prop.table(t, 2), 1)
p
##    
##        C    M    N    S   SA
##   0 59.3 34.5 49.4 48.2 62.7
##   1 40.7 65.5 50.6 51.8 37.3
addmargins(p, 1)
##      
##           C     M     N     S    SA
##   0    59.3  34.5  49.4  48.2  62.7
##   1    40.7  65.5  50.6  51.8  37.3
##   Sum 100.0 100.0 100.0 100.0 100.0
chisq.test(Chile$region, Chile$vote, correct = F)
## 
##  Pearson's Chi-squared test
## 
## data:  Chile$region and Chile$vote
## X-squared = 42.245, df = 4, p-value = 1.484e-08

10. Population:

tapply(Chile$population, Chile$vote, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3750   62500  250000  168100  250000  250000 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3750   25000  125000  131900  250000  250000
    table(Chile$population, Chile$vote)
##         
##            0   1
##   3750     7  12
##   8750    12  30
##   15000   89 104
##   25000  114 142
##   45000   44  43
##   62500   36  32
##   87500   32  33
##   125000  91 102
##   175000  61  40
##   250000 590 330
boxplot(Chile$population ~ Chile$vote, col="pink", main = "Boxplot of Population
and Vote")

11. Sex:

tapply(Chile$sex, Chile$vote, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.434   1.000   1.000 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.553   1.000   1.000
table(Chile$sex, Chile$vote)
##    
##       0   1
##   0 609 388
##   1 467 480
prop.table(table(Chile$sex, Chile$vote)) * 100
##    
##            0        1
##   0 31.32716 19.95885
##   1 24.02263 24.69136
t <- table(Chile$sex, Chile$vote)
p <- round(100*prop.table(t),2)
barplot(p, beside=T, col = c("green", "deepskyblue"), main = "Bar Chart of Vote by Sex",ylab = "Percentage")
legend("topright", legend = rownames(p), fill = c("green", "deepskyblue"), cex = .75)

11A Sex table code:

t2 <- table(Chile$vote, Chile$sex)
t2
##    
##       0   1
##   0 609 467
##   1 388 480
p2 <- round(100*prop.table(t2, 2), 1)
p2
##    
##        0    1
##   0 61.1 49.3
##   1 38.9 50.7
addmargins(p2, 1)
##      
##           0     1
##   0    61.1  49.3
##   1    38.9  50.7
##   Sum 100.0 100.0
chisq.test(Chile$sex, Chile$vote, correct = F)
## 
##  Pearson's Chi-squared test
## 
## data:  Chile$sex and Chile$vote
## X-squared = 27.223, df = 1, p-value = 1.813e-07

12. Age:

tapply(Chile$age, Chile$vote, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   23.00   33.00   35.55   44.25   70.00 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   27.00   38.00   40.37   53.00   70.00
boxplot(Chile$age ~ Chile$vote, col="orange", main = "Boxplot of Age and Vote")

13 Education:

table(Chile$education, Chile$vote)
##     
##        0   1
##   P  318 422
##   PS 256 130
##   S  500 311
r <- round(100*prop.table(table(Chile$education, Chile$vote)),2)
addmargins(r)
##      
##            0      1    Sum
##   P    16.42  21.79  38.21
##   PS   13.22   6.71  19.93
##   S    25.81  16.06  41.87
##   Sum  55.45  44.56 100.01
k <- c("P", "S", "PS")
Chile$education <- factor(Chile$education, levels = k)
t1 <- table(Chile$education, Chile$vote)
p1 <- round(100*prop.table(t1),2)
barplot(p1, beside=T, col = c("pink", "seagreen", "deepskyblue"), main = "Bar Chart of
Education",ylab = "Percentage")
legend("topright", legend = rownames(p1), fill = c("pink", "seagreen", "deepskyblue"),
cex = .75)

13A Education table code:

t3 <- table(Chile$vote, Chile$education)
t3
##    
##       P   S  PS
##   0 318 500 256
##   1 422 311 130
p3 <- round(100*prop.table(t3, 2), 1)
p3
##    
##        P    S   PS
##   0 43.0 61.7 66.3
##   1 57.0 38.3 33.7
addmargins(p3, 1)
##      
##           P     S    PS
##   0    43.0  61.7  66.3
##   1    57.0  38.3  33.7
##   Sum 100.0 100.0 100.0
chisq.test(Chile$vote, Chile$education, correct = F)
## 
##  Pearson's Chi-squared test
## 
## data:  Chile$vote and Chile$education
## X-squared = 77.729, df = 2, p-value < 2.2e-16

14 Income:

tapply(Chile$income, Chile$vote, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2500   15000   15000   36370   35000  200000      28 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2500    7500   15000   36500   35000  200000      27
table(Chile$income, Chile$vote)
##         
##            0   1
##   2500    55  55
##   7500   177 160
##   15000  296 240
##   35000  324 228
##   75000  119  92
##   125000  46  31
##   200000  31  35
prop.table(table(Chile$income, Chile$vote)) * 100
##         
##                  0         1
##   2500    2.911593  2.911593
##   7500    9.370037  8.470090
##   15000  15.669666 12.705135
##   35000  17.151932 12.069878
##   75000   6.299629  4.870302
##   125000  2.435151  1.641080
##   200000  1.641080  1.852832
boxplot(Chile$income ~ Chile$vote, col="green", main="Plot of Vote by Individuals Income")

15 Status Quo:

tapply(Chile$statusquo, Chile$vote, summary)
## $`0`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -1.7260 -1.2200 -1.0300 -0.7867 -0.5629  1.5130       3 
## 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -1.3040  0.5965  1.1770  0.9422  1.4320  1.7140       2
boxplot(Chile$statusquo ~ Chile$vote, col="red", main = "Boxplot of Status Quo and
Vote")

MODEL TESTING:

16. All variables:

model <- glm(vote ~ age + sex + education + region + population + statusquo, binomial, Chile)
summary(model)
## 
## Call:
## glm(formula = vote ~ age + sex + education + region + population + 
##     statusquo, family = binomial, data = Chile)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8159  -0.3081  -0.1797   0.3246   3.0121  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.599e-01  3.622e-01  -1.270 0.204166    
## age          9.345e-03  6.059e-03   1.542 0.123011    
## sex          3.140e-01  1.643e-01   1.911 0.056009 .  
## educationS  -7.474e-01  1.943e-01  -3.847 0.000119 ***
## educationPS -8.439e-01  2.500e-01  -3.375 0.000737 ***
## regionM      9.084e-01  5.044e-01   1.801 0.071746 .  
## regionN     -8.756e-02  2.862e-01  -0.306 0.759650    
## regionS      1.746e-02  2.362e-01   0.074 0.941080    
## regionSA     1.092e-01  2.794e-01   0.391 0.695915    
## population  -1.412e-08  1.161e-06  -0.012 0.990302    
## statusquo    2.672e+00  1.103e-01  24.214  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2655.4  on 1931  degrees of freedom
## Residual deviance: 1032.4  on 1921  degrees of freedom
##   (180 observations deleted due to missingness)
## AIC: 1054.4
## 
## Number of Fisher Scoring iterations: 6

17. Statusquo and education only:

model2 <- glm(vote ~ education + statusquo, binomial, Chile)
summary(model2)
## 
## Call:
## glm(formula = vote ~ education + statusquo, family = binomial, 
##     data = Chile)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9081  -0.3132  -0.1767   0.3256   2.8939  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.1650     0.1306   1.263    0.207    
## educationS   -0.8484     0.1806  -4.698 2.63e-06 ***
## educationPS  -0.9822     0.2349  -4.182 2.89e-05 ***
## statusquo     2.6763     0.1085  24.668  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2655.4  on 1931  degrees of freedom
## Residual deviance: 1042.6  on 1928  degrees of freedom
##   (180 observations deleted due to missingness)
## AIC: 1050.6
## 
## Number of Fisher Scoring iterations: 6
confint(model2)
## Waiting for profiling to be done...
##                   2.5 %     97.5 %
## (Intercept) -0.09071804  0.4218749
## educationS  -1.20545869 -0.4966999
## educationPS -1.44685228 -0.5253752
## statusquo    2.47059605  2.8963472

18. Odds Interpretaion

exp(-0.9822)
## [1] 0.3744863
exp(-0.8484)
## [1] 0.4280993
exp(2.6763)
## [1] 14.53123

19. Confidence Tests

summary(model)
## 
## Call:
## glm(formula = vote ~ age + sex + education + region + population + 
##     statusquo, family = binomial, data = Chile)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8159  -0.3081  -0.1797   0.3246   3.0121  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.599e-01  3.622e-01  -1.270 0.204166    
## age          9.345e-03  6.059e-03   1.542 0.123011    
## sex          3.140e-01  1.643e-01   1.911 0.056009 .  
## educationS  -7.474e-01  1.943e-01  -3.847 0.000119 ***
## educationPS -8.439e-01  2.500e-01  -3.375 0.000737 ***
## regionM      9.084e-01  5.044e-01   1.801 0.071746 .  
## regionN     -8.756e-02  2.862e-01  -0.306 0.759650    
## regionS      1.746e-02  2.362e-01   0.074 0.941080    
## regionSA     1.092e-01  2.794e-01   0.391 0.695915    
## population  -1.412e-08  1.161e-06  -0.012 0.990302    
## statusquo    2.672e+00  1.103e-01  24.214  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2655.4  on 1931  degrees of freedom
## Residual deviance: 1032.4  on 1921  degrees of freedom
##   (180 observations deleted due to missingness)
## AIC: 1054.4
## 
## Number of Fisher Scoring iterations: 6
summary(model2)
## 
## Call:
## glm(formula = vote ~ education + statusquo, family = binomial, 
##     data = Chile)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9081  -0.3132  -0.1767   0.3256   2.8939  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.1650     0.1306   1.263    0.207    
## educationS   -0.8484     0.1806  -4.698 2.63e-06 ***
## educationPS  -0.9822     0.2349  -4.182 2.89e-05 ***
## statusquo     2.6763     0.1085  24.668  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2655.4  on 1931  degrees of freedom
## Residual deviance: 1042.6  on 1928  degrees of freedom
##   (180 observations deleted due to missingness)
## AIC: 1050.6
## 
## Number of Fisher Scoring iterations: 6

20. Without statusquo:

model3 <- glm(vote ~ age + sex + education + region + population, binomial, Chile)
summary(model3)
## 
## Call:
## glm(formula = vote ~ age + sex + education + region + population, 
##     family = binomial, data = Chile)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8885  -1.0178  -0.7258   1.1354   1.9320  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.973e-01  2.075e-01  -3.360 0.000781 ***
## age          1.893e-02  3.469e-03   5.458 4.83e-08 ***
## sex          5.038e-01  9.685e-02   5.202 1.97e-07 ***
## educationS  -4.421e-01  1.127e-01  -3.924 8.71e-05 ***
## educationPS -5.540e-01  1.410e-01  -3.929 8.55e-05 ***
## regionM      8.389e-01  3.048e-01   2.752 0.005922 ** 
## regionN      4.655e-01  1.651e-01   2.820 0.004798 ** 
## regionS      4.280e-01  1.378e-01   3.106 0.001896 ** 
## regionSA     4.100e-01  1.666e-01   2.461 0.013842 *  
## population  -3.598e-06  6.908e-07  -5.209 1.90e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2662.2  on 1936  degrees of freedom
## Residual deviance: 2475.8  on 1927  degrees of freedom
##   (175 observations deleted due to missingness)
## AIC: 2495.8
## 
## Number of Fisher Scoring iterations: 4
confint(model3)
## Waiting for profiling to be done...
##                     2.5 %        97.5 %
## (Intercept) -1.105482e+00 -2.914325e-01
## age          1.215341e-02  2.575894e-02
## sex          3.143843e-01  6.941225e-01
## educationS  -6.631163e-01 -2.213367e-01
## educationPS -8.316322e-01 -2.785240e-01
## regionM      2.514420e-01  1.451716e+00
## regionN      1.425639e-01  7.899793e-01
## regionS      1.584505e-01  6.988081e-01
## regionSA     8.536812e-02  7.386190e-01
## population  -4.961550e-06 -2.252035e-06

21. Without statusquo and region:

model4 <- glm(vote ~ age + sex + education + region + population, binomial, Chile)
summary(model4)
## 
## Call:
## glm(formula = vote ~ age + sex + education + region + population, 
##     family = binomial, data = Chile)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8885  -1.0178  -0.7258   1.1354   1.9320  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.973e-01  2.075e-01  -3.360 0.000781 ***
## age          1.893e-02  3.469e-03   5.458 4.83e-08 ***
## sex          5.038e-01  9.685e-02   5.202 1.97e-07 ***
## educationS  -4.421e-01  1.127e-01  -3.924 8.71e-05 ***
## educationPS -5.540e-01  1.410e-01  -3.929 8.55e-05 ***
## regionM      8.389e-01  3.048e-01   2.752 0.005922 ** 
## regionN      4.655e-01  1.651e-01   2.820 0.004798 ** 
## regionS      4.280e-01  1.378e-01   3.106 0.001896 ** 
## regionSA     4.100e-01  1.666e-01   2.461 0.013842 *  
## population  -3.598e-06  6.908e-07  -5.209 1.90e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2662.2  on 1936  degrees of freedom
## Residual deviance: 2475.8  on 1927  degrees of freedom
##   (175 observations deleted due to missingness)
## AIC: 2495.8
## 
## Number of Fisher Scoring iterations: 4
confint(model4)
## Waiting for profiling to be done...
##                     2.5 %        97.5 %
## (Intercept) -1.105482e+00 -2.914325e-01
## age          1.215341e-02  2.575894e-02
## sex          3.143843e-01  6.941225e-01
## educationS  -6.631163e-01 -2.213367e-01
## educationPS -8.316322e-01 -2.785240e-01
## regionM      2.514420e-01  1.451716e+00
## regionN      1.425639e-01  7.899793e-01
## regionS      1.584505e-01  6.988081e-01
## regionSA     8.536812e-02  7.386190e-01
## population  -4.961550e-06 -2.252035e-06

22. Odds Interpretations

exp(1.638e-02)
## [1] 1.016515
exp(4.965e-01)
## [1] 1.642961
exp(-9.506e-01)
## [1] 0.386509
exp(-5.783e-01)
## [1] 0.560851
exp(-3.805e-06)
## [1] 0.9999962
exp(6.389e-06)
## [1] 1.000006

23. Confidence Test

summary(model3)
## 
## Call:
## glm(formula = vote ~ age + sex + education + region + population, 
##     family = binomial, data = Chile)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8885  -1.0178  -0.7258   1.1354   1.9320  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.973e-01  2.075e-01  -3.360 0.000781 ***
## age          1.893e-02  3.469e-03   5.458 4.83e-08 ***
## sex          5.038e-01  9.685e-02   5.202 1.97e-07 ***
## educationS  -4.421e-01  1.127e-01  -3.924 8.71e-05 ***
## educationPS -5.540e-01  1.410e-01  -3.929 8.55e-05 ***
## regionM      8.389e-01  3.048e-01   2.752 0.005922 ** 
## regionN      4.655e-01  1.651e-01   2.820 0.004798 ** 
## regionS      4.280e-01  1.378e-01   3.106 0.001896 ** 
## regionSA     4.100e-01  1.666e-01   2.461 0.013842 *  
## population  -3.598e-06  6.908e-07  -5.209 1.90e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2662.2  on 1936  degrees of freedom
## Residual deviance: 2475.8  on 1927  degrees of freedom
##   (175 observations deleted due to missingness)
## AIC: 2495.8
## 
## Number of Fisher Scoring iterations: 4

24. Status Quo Model:

model2 <- glm(vote ~ education + statusquo, binomial, Chile)
fit <- fitted(model2) # this gives the predicted probability
predvote <- numeric(2112)
predvote[fit >= 0.25] <- 1
n <- data.frame(Chile$vote, predvote)
t <- table(Chile$vote, predvote)
t
##    predvote
##       0   1
##   0 522 554
##   1 406 462
sensitivity <- 100*t[2,2]/(t[2,1] + t[2,2])
sensitivity
## [1] 53.22581
specificity <- 100*t[1,1]/(t[1,1] + t[1,2])
specificity
## [1] 48.51301

25. Final MOdel:

model <- glm(vote ~ age + sex + education + income +population, binomial, Chile)
fit <- fitted(model) # this gives the predicted probability
predvote <- numeric(2112)
predvote[fit >= 0.25] <- 1
n <- data.frame(Chile$vote, predvote)
t <- table(Chile$vote, predvote)
t
##    predvote
##       0   1
##   0 136 940
##   1  89 779
sensitivity <- 100*t[2,2]/(t[2,1] + t[2,2])
sensitivity
## [1] 89.74654
specificity <- 100*t[1,1]/(t[1,1] + t[1,2])
specificity
## [1] 12.63941

26. Correlation Matrix:

d <- Chile[, -c(1,5,8)]
round(cor(na.omit(d)),3)
##            population    sex    age income statusquo
## population      1.000  0.028  0.029  0.239    -0.205
## sex             0.028  1.000 -0.019 -0.040     0.097
## age             0.029 -0.019  1.000 -0.010     0.134
## income          0.239 -0.040 -0.010  1.000     0.043
## statusquo      -0.205  0.097  0.134  0.043     1.000