Chile <- read.csv("C:/Users/aizax94/Downloads/Chile.CSV")
View(Chile)
CODE FOR PRELIMINARY ANALYSIS OF VARIABLES:
9. Region:
tapply(Chile$region, Chile$vote, summary)
## $`0`
## C M N S SA
## 254 20 132 256 414
##
## $`1`
## C M N S SA
## 174 38 135 275 246
table(Chile$region, Chile$vote)
##
## 0 1
## C 254 174
## M 20 38
## N 132 135
## S 256 275
## SA 414 246
prop.table(table(Chile$region, Chile$vote)) * 100
##
## 0 1
## C 13.065844 8.950617
## M 1.028807 1.954733
## N 6.790123 6.944444
## S 13.168724 14.146091
## SA 21.296296 12.654321
k <- c("C", "M", "N", "S", "SA")
Chile$region <- factor(Chile$region, levels = k)
t <- table(Chile$region, Chile$vote)
p <- round(100*prop.table(t),2)
barplot(p, beside=T, col = c("green","purple", "red", "seagreen", "deepskyblue"), main = "Bar Chart of Vote by Region",ylab = "Percentage")
legend("topright", legend = rownames(p), fill = c("green","purple", "red", "seagreen", "deepskyblue"), cex = .75)
9A. Region table code:
t <- table(Chile$vote, Chile$region)
t
##
## C M N S SA
## 0 254 20 132 256 414
## 1 174 38 135 275 246
p <- round(100*prop.table(t, 2), 1)
p
##
## C M N S SA
## 0 59.3 34.5 49.4 48.2 62.7
## 1 40.7 65.5 50.6 51.8 37.3
addmargins(p, 1)
##
## C M N S SA
## 0 59.3 34.5 49.4 48.2 62.7
## 1 40.7 65.5 50.6 51.8 37.3
## Sum 100.0 100.0 100.0 100.0 100.0
chisq.test(Chile$region, Chile$vote, correct = F)
##
## Pearson's Chi-squared test
##
## data: Chile$region and Chile$vote
## X-squared = 42.245, df = 4, p-value = 1.484e-08
10. Population:
tapply(Chile$population, Chile$vote, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3750 62500 250000 168100 250000 250000
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3750 25000 125000 131900 250000 250000
table(Chile$population, Chile$vote)
##
## 0 1
## 3750 7 12
## 8750 12 30
## 15000 89 104
## 25000 114 142
## 45000 44 43
## 62500 36 32
## 87500 32 33
## 125000 91 102
## 175000 61 40
## 250000 590 330
boxplot(Chile$population ~ Chile$vote, col="pink", main = "Boxplot of Population
and Vote")
11. Sex:
tapply(Chile$sex, Chile$vote, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.434 1.000 1.000
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.553 1.000 1.000
table(Chile$sex, Chile$vote)
##
## 0 1
## 0 609 388
## 1 467 480
prop.table(table(Chile$sex, Chile$vote)) * 100
##
## 0 1
## 0 31.32716 19.95885
## 1 24.02263 24.69136
t <- table(Chile$sex, Chile$vote)
p <- round(100*prop.table(t),2)
barplot(p, beside=T, col = c("green", "deepskyblue"), main = "Bar Chart of Vote by Sex",ylab = "Percentage")
legend("topright", legend = rownames(p), fill = c("green", "deepskyblue"), cex = .75)
11A Sex table code:
t2 <- table(Chile$vote, Chile$sex)
t2
##
## 0 1
## 0 609 467
## 1 388 480
p2 <- round(100*prop.table(t2, 2), 1)
p2
##
## 0 1
## 0 61.1 49.3
## 1 38.9 50.7
addmargins(p2, 1)
##
## 0 1
## 0 61.1 49.3
## 1 38.9 50.7
## Sum 100.0 100.0
chisq.test(Chile$sex, Chile$vote, correct = F)
##
## Pearson's Chi-squared test
##
## data: Chile$sex and Chile$vote
## X-squared = 27.223, df = 1, p-value = 1.813e-07
12. Age:
tapply(Chile$age, Chile$vote, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 23.00 33.00 35.55 44.25 70.00
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 27.00 38.00 40.37 53.00 70.00
boxplot(Chile$age ~ Chile$vote, col="orange", main = "Boxplot of Age and Vote")
13 Education:
table(Chile$education, Chile$vote)
##
## 0 1
## P 318 422
## PS 256 130
## S 500 311
r <- round(100*prop.table(table(Chile$education, Chile$vote)),2)
addmargins(r)
##
## 0 1 Sum
## P 16.42 21.79 38.21
## PS 13.22 6.71 19.93
## S 25.81 16.06 41.87
## Sum 55.45 44.56 100.01
k <- c("P", "S", "PS")
Chile$education <- factor(Chile$education, levels = k)
t1 <- table(Chile$education, Chile$vote)
p1 <- round(100*prop.table(t1),2)
barplot(p1, beside=T, col = c("pink", "seagreen", "deepskyblue"), main = "Bar Chart of
Education",ylab = "Percentage")
legend("topright", legend = rownames(p1), fill = c("pink", "seagreen", "deepskyblue"),
cex = .75)
13A Education table code:
t3 <- table(Chile$vote, Chile$education)
t3
##
## P S PS
## 0 318 500 256
## 1 422 311 130
p3 <- round(100*prop.table(t3, 2), 1)
p3
##
## P S PS
## 0 43.0 61.7 66.3
## 1 57.0 38.3 33.7
addmargins(p3, 1)
##
## P S PS
## 0 43.0 61.7 66.3
## 1 57.0 38.3 33.7
## Sum 100.0 100.0 100.0
chisq.test(Chile$vote, Chile$education, correct = F)
##
## Pearson's Chi-squared test
##
## data: Chile$vote and Chile$education
## X-squared = 77.729, df = 2, p-value < 2.2e-16
14 Income:
tapply(Chile$income, Chile$vote, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2500 15000 15000 36370 35000 200000 28
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2500 7500 15000 36500 35000 200000 27
table(Chile$income, Chile$vote)
##
## 0 1
## 2500 55 55
## 7500 177 160
## 15000 296 240
## 35000 324 228
## 75000 119 92
## 125000 46 31
## 200000 31 35
prop.table(table(Chile$income, Chile$vote)) * 100
##
## 0 1
## 2500 2.911593 2.911593
## 7500 9.370037 8.470090
## 15000 15.669666 12.705135
## 35000 17.151932 12.069878
## 75000 6.299629 4.870302
## 125000 2.435151 1.641080
## 200000 1.641080 1.852832
boxplot(Chile$income ~ Chile$vote, col="green", main="Plot of Vote by Individuals Income")
15 Status Quo:
tapply(Chile$statusquo, Chile$vote, summary)
## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.7260 -1.2200 -1.0300 -0.7867 -0.5629 1.5130 3
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.3040 0.5965 1.1770 0.9422 1.4320 1.7140 2
boxplot(Chile$statusquo ~ Chile$vote, col="red", main = "Boxplot of Status Quo and
Vote")
MODEL TESTING:
16. All variables:
model <- glm(vote ~ age + sex + education + region + population + statusquo, binomial, Chile)
summary(model)
##
## Call:
## glm(formula = vote ~ age + sex + education + region + population +
## statusquo, family = binomial, data = Chile)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8159 -0.3081 -0.1797 0.3246 3.0121
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.599e-01 3.622e-01 -1.270 0.204166
## age 9.345e-03 6.059e-03 1.542 0.123011
## sex 3.140e-01 1.643e-01 1.911 0.056009 .
## educationS -7.474e-01 1.943e-01 -3.847 0.000119 ***
## educationPS -8.439e-01 2.500e-01 -3.375 0.000737 ***
## regionM 9.084e-01 5.044e-01 1.801 0.071746 .
## regionN -8.756e-02 2.862e-01 -0.306 0.759650
## regionS 1.746e-02 2.362e-01 0.074 0.941080
## regionSA 1.092e-01 2.794e-01 0.391 0.695915
## population -1.412e-08 1.161e-06 -0.012 0.990302
## statusquo 2.672e+00 1.103e-01 24.214 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2655.4 on 1931 degrees of freedom
## Residual deviance: 1032.4 on 1921 degrees of freedom
## (180 observations deleted due to missingness)
## AIC: 1054.4
##
## Number of Fisher Scoring iterations: 6
17. Statusquo and education only:
model2 <- glm(vote ~ education + statusquo, binomial, Chile)
summary(model2)
##
## Call:
## glm(formula = vote ~ education + statusquo, family = binomial,
## data = Chile)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9081 -0.3132 -0.1767 0.3256 2.8939
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1650 0.1306 1.263 0.207
## educationS -0.8484 0.1806 -4.698 2.63e-06 ***
## educationPS -0.9822 0.2349 -4.182 2.89e-05 ***
## statusquo 2.6763 0.1085 24.668 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2655.4 on 1931 degrees of freedom
## Residual deviance: 1042.6 on 1928 degrees of freedom
## (180 observations deleted due to missingness)
## AIC: 1050.6
##
## Number of Fisher Scoring iterations: 6
confint(model2)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.09071804 0.4218749
## educationS -1.20545869 -0.4966999
## educationPS -1.44685228 -0.5253752
## statusquo 2.47059605 2.8963472
18. Odds Interpretaion
exp(-0.9822)
## [1] 0.3744863
exp(-0.8484)
## [1] 0.4280993
exp(2.6763)
## [1] 14.53123
19. Confidence Tests
summary(model)
##
## Call:
## glm(formula = vote ~ age + sex + education + region + population +
## statusquo, family = binomial, data = Chile)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8159 -0.3081 -0.1797 0.3246 3.0121
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.599e-01 3.622e-01 -1.270 0.204166
## age 9.345e-03 6.059e-03 1.542 0.123011
## sex 3.140e-01 1.643e-01 1.911 0.056009 .
## educationS -7.474e-01 1.943e-01 -3.847 0.000119 ***
## educationPS -8.439e-01 2.500e-01 -3.375 0.000737 ***
## regionM 9.084e-01 5.044e-01 1.801 0.071746 .
## regionN -8.756e-02 2.862e-01 -0.306 0.759650
## regionS 1.746e-02 2.362e-01 0.074 0.941080
## regionSA 1.092e-01 2.794e-01 0.391 0.695915
## population -1.412e-08 1.161e-06 -0.012 0.990302
## statusquo 2.672e+00 1.103e-01 24.214 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2655.4 on 1931 degrees of freedom
## Residual deviance: 1032.4 on 1921 degrees of freedom
## (180 observations deleted due to missingness)
## AIC: 1054.4
##
## Number of Fisher Scoring iterations: 6
summary(model2)
##
## Call:
## glm(formula = vote ~ education + statusquo, family = binomial,
## data = Chile)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9081 -0.3132 -0.1767 0.3256 2.8939
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1650 0.1306 1.263 0.207
## educationS -0.8484 0.1806 -4.698 2.63e-06 ***
## educationPS -0.9822 0.2349 -4.182 2.89e-05 ***
## statusquo 2.6763 0.1085 24.668 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2655.4 on 1931 degrees of freedom
## Residual deviance: 1042.6 on 1928 degrees of freedom
## (180 observations deleted due to missingness)
## AIC: 1050.6
##
## Number of Fisher Scoring iterations: 6
20. Without statusquo:
model3 <- glm(vote ~ age + sex + education + region + population, binomial, Chile)
summary(model3)
##
## Call:
## glm(formula = vote ~ age + sex + education + region + population,
## family = binomial, data = Chile)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8885 -1.0178 -0.7258 1.1354 1.9320
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.973e-01 2.075e-01 -3.360 0.000781 ***
## age 1.893e-02 3.469e-03 5.458 4.83e-08 ***
## sex 5.038e-01 9.685e-02 5.202 1.97e-07 ***
## educationS -4.421e-01 1.127e-01 -3.924 8.71e-05 ***
## educationPS -5.540e-01 1.410e-01 -3.929 8.55e-05 ***
## regionM 8.389e-01 3.048e-01 2.752 0.005922 **
## regionN 4.655e-01 1.651e-01 2.820 0.004798 **
## regionS 4.280e-01 1.378e-01 3.106 0.001896 **
## regionSA 4.100e-01 1.666e-01 2.461 0.013842 *
## population -3.598e-06 6.908e-07 -5.209 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2662.2 on 1936 degrees of freedom
## Residual deviance: 2475.8 on 1927 degrees of freedom
## (175 observations deleted due to missingness)
## AIC: 2495.8
##
## Number of Fisher Scoring iterations: 4
confint(model3)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.105482e+00 -2.914325e-01
## age 1.215341e-02 2.575894e-02
## sex 3.143843e-01 6.941225e-01
## educationS -6.631163e-01 -2.213367e-01
## educationPS -8.316322e-01 -2.785240e-01
## regionM 2.514420e-01 1.451716e+00
## regionN 1.425639e-01 7.899793e-01
## regionS 1.584505e-01 6.988081e-01
## regionSA 8.536812e-02 7.386190e-01
## population -4.961550e-06 -2.252035e-06
21. Without statusquo and region:
model4 <- glm(vote ~ age + sex + education + region + population, binomial, Chile)
summary(model4)
##
## Call:
## glm(formula = vote ~ age + sex + education + region + population,
## family = binomial, data = Chile)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8885 -1.0178 -0.7258 1.1354 1.9320
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.973e-01 2.075e-01 -3.360 0.000781 ***
## age 1.893e-02 3.469e-03 5.458 4.83e-08 ***
## sex 5.038e-01 9.685e-02 5.202 1.97e-07 ***
## educationS -4.421e-01 1.127e-01 -3.924 8.71e-05 ***
## educationPS -5.540e-01 1.410e-01 -3.929 8.55e-05 ***
## regionM 8.389e-01 3.048e-01 2.752 0.005922 **
## regionN 4.655e-01 1.651e-01 2.820 0.004798 **
## regionS 4.280e-01 1.378e-01 3.106 0.001896 **
## regionSA 4.100e-01 1.666e-01 2.461 0.013842 *
## population -3.598e-06 6.908e-07 -5.209 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2662.2 on 1936 degrees of freedom
## Residual deviance: 2475.8 on 1927 degrees of freedom
## (175 observations deleted due to missingness)
## AIC: 2495.8
##
## Number of Fisher Scoring iterations: 4
confint(model4)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.105482e+00 -2.914325e-01
## age 1.215341e-02 2.575894e-02
## sex 3.143843e-01 6.941225e-01
## educationS -6.631163e-01 -2.213367e-01
## educationPS -8.316322e-01 -2.785240e-01
## regionM 2.514420e-01 1.451716e+00
## regionN 1.425639e-01 7.899793e-01
## regionS 1.584505e-01 6.988081e-01
## regionSA 8.536812e-02 7.386190e-01
## population -4.961550e-06 -2.252035e-06
22. Odds Interpretations
exp(1.638e-02)
## [1] 1.016515
exp(4.965e-01)
## [1] 1.642961
exp(-9.506e-01)
## [1] 0.386509
exp(-5.783e-01)
## [1] 0.560851
exp(-3.805e-06)
## [1] 0.9999962
exp(6.389e-06)
## [1] 1.000006
23. Confidence Test
summary(model3)
##
## Call:
## glm(formula = vote ~ age + sex + education + region + population,
## family = binomial, data = Chile)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8885 -1.0178 -0.7258 1.1354 1.9320
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.973e-01 2.075e-01 -3.360 0.000781 ***
## age 1.893e-02 3.469e-03 5.458 4.83e-08 ***
## sex 5.038e-01 9.685e-02 5.202 1.97e-07 ***
## educationS -4.421e-01 1.127e-01 -3.924 8.71e-05 ***
## educationPS -5.540e-01 1.410e-01 -3.929 8.55e-05 ***
## regionM 8.389e-01 3.048e-01 2.752 0.005922 **
## regionN 4.655e-01 1.651e-01 2.820 0.004798 **
## regionS 4.280e-01 1.378e-01 3.106 0.001896 **
## regionSA 4.100e-01 1.666e-01 2.461 0.013842 *
## population -3.598e-06 6.908e-07 -5.209 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2662.2 on 1936 degrees of freedom
## Residual deviance: 2475.8 on 1927 degrees of freedom
## (175 observations deleted due to missingness)
## AIC: 2495.8
##
## Number of Fisher Scoring iterations: 4
24. Status Quo Model:
model2 <- glm(vote ~ education + statusquo, binomial, Chile)
fit <- fitted(model2) # this gives the predicted probability
predvote <- numeric(2112)
predvote[fit >= 0.25] <- 1
n <- data.frame(Chile$vote, predvote)
t <- table(Chile$vote, predvote)
t
## predvote
## 0 1
## 0 522 554
## 1 406 462
sensitivity <- 100*t[2,2]/(t[2,1] + t[2,2])
sensitivity
## [1] 53.22581
specificity <- 100*t[1,1]/(t[1,1] + t[1,2])
specificity
## [1] 48.51301
25. Final MOdel:
model <- glm(vote ~ age + sex + education + income +population, binomial, Chile)
fit <- fitted(model) # this gives the predicted probability
predvote <- numeric(2112)
predvote[fit >= 0.25] <- 1
n <- data.frame(Chile$vote, predvote)
t <- table(Chile$vote, predvote)
t
## predvote
## 0 1
## 0 136 940
## 1 89 779
sensitivity <- 100*t[2,2]/(t[2,1] + t[2,2])
sensitivity
## [1] 89.74654
specificity <- 100*t[1,1]/(t[1,1] + t[1,2])
specificity
## [1] 12.63941
26. Correlation Matrix:
d <- Chile[, -c(1,5,8)]
round(cor(na.omit(d)),3)
## population sex age income statusquo
## population 1.000 0.028 0.029 0.239 -0.205
## sex 0.028 1.000 -0.019 -0.040 0.097
## age 0.029 -0.019 1.000 -0.010 0.134
## income 0.239 -0.040 -0.010 1.000 0.043
## statusquo -0.205 0.097 0.134 0.043 1.000