library(haven)
library(car)
library(foreign)
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)
This project explores a basic application of “k nearest neighbor” classification. The data used are for practice and were drawn from ANES 2020 (.dta and codebook) and text: “Machine Learning with R”.
# call data
a20<-read_dta("C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\Data\\ANES2020T.dta")
# self-placement: liberal (1) or conservative (0)
a20$lib.or.cons<-car::recode(a20$V201201, recodes = "-9:-1=NA;2=0;3=NA")
table(a20$lib.or.cons)
##
## 0 1
## 1534 1320
#age
a20$age<-car::recode(a20$V201507x, recodes = "-9:-1=NA")
table(a20$age)
##
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## 35 52 46 51 57 75 92 104 108 132 120 131 142 109 117 123 142 152 144 149
## 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
## 152 151 139 151 113 116 111 116 119 106 105 123 154 128 111 117 123 140 127 136
## 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
## 145 154 168 139 154 156 155 180 170 142 140 158 126 147 145 147 94 93 89 81
## 78 79 80
## 64 63 403
# feeling thermometer: Christian fundamentals
a20$t.c_fundamentals<-car::recode(a20$V202159, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.c_fundamentals)
##
## 0 1 2 3 4 5 6 7 8 9 10 15 16 17 18 19
## 955 7 8 2 2 42 2 1 1 1 92 511 2 1 1 1
## 20 22 23 25 30 33 35 39 40 43 45 49 50 51 55 56
## 84 2 1 50 482 3 23 1 605 2 25 6 1911 3 25 1
## 57 58 59 60 64 65 66 67 68 69 70 75 77 78 80 84
## 1 1 1 533 1 27 1 1 2 1 577 47 1 1 66 1
## 85 86 89 90 95 98 99 100
## 514 4 1 64 20 1 5 470
# feeling thermometer: feminists
a20$t.feminists<-car::recode(a20$V202160, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.feminists)
##
## 0 1 2 3 4 5 6 7 8 10 14 15 16 18 20 22
## 406 9 3 1 1 15 2 3 2 35 1 250 2 1 33 2
## 23 25 30 33 35 38 40 45 48 49 50 51 53 55 56 57
## 1 31 360 2 19 1 582 30 2 6 1726 2 1 26 1 2
## 59 60 63 65 66 67 69 70 72 75 77 78 79 80 85 86
## 1 719 1 40 1 1 1 803 1 111 2 1 4 140 909 7
## 87 88 89 90 94 95 98 99 100
## 2 5 1 148 1 56 5 6 802
# feeling thermometer: liberals
a20$t.liberals<-car::recode(a20$V202161, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.liberals)
##
## 0 1 2 3 4 5 6 7 10 12 13 15 16 18 20 22
## 869 8 7 6 1 28 2 2 57 1 1 471 5 1 49 1
## 25 29 30 33 35 38 40 45 48 49 50 51 52 55 57 60
## 36 1 501 4 23 1 570 28 1 5 1352 4 3 32 2 723
## 63 65 67 68 70 75 77 78 79 80 83 85 86 87 88 90
## 1 60 1 1 818 106 5 3 2 149 1 801 9 2 4 99
## 92 95 98 99 100
## 3 38 4 4 418
# feeling thermometer: conservatives
a20$t.conservatives<-car::recode(a20$V202164, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.conservatives)
##
## 0 1 2 3 4 5 6 7 8 9 10 12 15 16 17 18
## 457 6 6 2 3 27 1 3 1 4 63 1 413 1 2 1
## 20 22 25 26 29 30 33 34 35 36 40 42 44 45 48 49
## 75 1 52 1 1 554 1 1 43 1 698 1 2 49 2 3
## 50 51 54 55 58 59 60 65 66 67 68 70 71 72 75 76
## 1415 4 1 36 1 2 786 47 2 1 1 709 2 1 94 1
## 77 79 80 85 86 88 90 92 95 97 98 99 100
## 1 3 108 854 4 1 95 1 32 1 2 5 634
# feeling thermometer: gays and lesbians
a20$t.gayandlesbian<-car::recode(a20$V202166, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.gayandlesbian)
##
## 0 1 2 3 4 5 6 7 8 9 10 15 16 20 25 30
## 355 2 2 1 1 5 1 2 1 1 19 132 1 12 9 149
## 35 38 40 45 48 49 50 51 52 54 55 57 59 60 65 69
## 5 1 245 15 1 1 2223 3 1 1 13 1 2 443 30 3
## 70 75 76 77 78 79 80 82 83 85 86 87 88 89 90 95
## 764 82 1 1 2 1 99 1 1 984 8 1 2 1 154 57
## 98 99 100
## 6 8 1467
# feeling thermometer: transgenders
a20$t.transgender<-car::recode(a20$V202172, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.transgender)
##
## 0 1 2 3 4 5 8 10 12 13 15 20 25 27 30 35
## 493 5 2 1 5 15 1 26 1 1 200 25 13 1 213 10
## 40 42 44 45 48 49 50 51 52 55 60 61 63 65 66 67
## 381 1 1 16 1 2 2401 1 1 15 495 1 1 27 2 1
## 68 70 72 75 76 77 78 79 80 84 85 86 89 90 95 96
## 1 674 1 93 1 1 1 1 84 1 808 5 1 110 34 1
## 98 99 100
## 3 4 1123
# feeling thermometer: black lives matter
a20$t.BLM<-car::recode(a20$V202174, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.BLM)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 15 16 18
## 1380 16 8 1 2 21 1 3 5 3 50 1 3 417 1 2
## 20 25 26 30 35 40 42 43 44 45 47 48 49 50 51 55
## 41 27 1 320 16 397 1 1 2 14 1 2 4 737 2 26
## 59 60 63 65 68 69 70 73 75 79 80 81 82 85 86 87
## 1 568 1 50 2 1 732 1 91 2 109 1 1 886 7 1
## 88 89 90 94 95 98 99 100
## 6 1 158 1 66 5 4 1143
# feeling thermometer: NRA
a20$t.NRA<-car::recode(a20$V202178, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.NRA)
##
## 0 1 2 3 4 5 6 7 8 9 10 12 15 16 17 18
## 1329 5 8 6 1 45 2 1 1 1 106 2 467 2 1 1
## 20 23 25 27 30 32 35 38 40 42 44 45 49 50 51 53
## 74 1 32 1 394 1 25 1 440 1 2 33 2 1026 3 2
## 55 58 60 62 65 66 67 68 69 70 73 74 75 77 80 85
## 23 2 450 1 26 2 1 3 1 559 1 1 66 2 78 712
## 86 87 88 89 90 95 96 97 98 99 100
## 3 1 6 4 98 28 1 1 2 6 898
# feeling thermometer: socialists
a20$t.socialists<-car::recode(a20$V202179, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.socialists)
##
## 0 1 2 3 4 5 6 7 8 10 11 12 15 16 18 20
## 1634 14 6 4 3 29 2 1 1 73 1 1 502 2 2 57
## 23 25 30 31 32 33 35 40 45 49 50 51 52 55 56 58
## 1 28 424 1 1 1 15 472 37 2 1670 5 1 39 2 1
## 60 63 65 66 67 68 69 70 74 75 76 79 80 81 84 85
## 554 1 52 2 2 3 1 449 1 64 1 3 70 1 1 278
## 86 88 89 90 95 96 100
## 4 5 2 43 10 1 171
# feeling thermometer: capitalists
a20$t.capitalists<-car::recode(a20$V202180, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.capitalists)
##
## 0 1 2 5 6 7 9 10 13 14 15 17 18 19 20 21
## 384 4 3 18 3 2 1 32 1 1 201 1 1 1 50 1
## 25 30 33 34 35 38 40 43 45 46 48 49 50 51 53 55
## 32 326 1 1 24 1 601 2 41 1 1 2 1842 3 2 39
## 58 59 60 64 65 66 68 69 70 75 76 77 78 79 80 85
## 2 1 772 1 60 1 1 2 769 97 2 1 2 2 116 643
## 86 87 88 89 90 95 97 98 99 100
## 3 2 1 2 77 30 1 2 6 396
# feeling thermometer: planned parenthood
a20$t.plannedparenthood<-car::recode(a20$V202185, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.plannedparenthood)
##
## 0 1 2 3 5 6 7 9 10 12 15 16 18 20 22 25
## 943 9 1 2 25 1 1 2 41 1 285 2 1 36 1 21
## 30 33 35 38 39 40 44 45 48 49 50 55 57 58 60 65
## 254 1 12 1 1 278 1 15 1 2 1043 18 1 1 492 40
## 66 68 69 70 72 73 74 75 78 80 84 85 86 87 88 89
## 1 1 1 726 3 1 1 86 3 137 3 925 10 1 5 2
## 90 92 93 95 96 97 98 99 100
## 164 1 2 68 2 1 4 6 1465
# subset, w/o V200001 (Case ID)
a20sub<-dplyr::select(a20,lib.or.cons,age,t.c_fundamentals,t.feminists,t.liberals,t.conservatives,t.gayandlesbian,t.transgender,t.BLM,t.NRA,t.socialists,t.capitalists,t.plannedparenthood
)%>%
filter(complete.cases(.))
# check columns
colnames(a20sub)
## [1] "lib.or.cons" "age" "t.c_fundamentals"
## [4] "t.feminists" "t.liberals" "t.conservatives"
## [7] "t.gayandlesbian" "t.transgender" "t.BLM"
## [10] "t.NRA" "t.socialists" "t.capitalists"
## [13] "t.plannedparenthood"
# target vector (self-placement liberal/conservative)
round(prop.table(table(a20sub$lib.or.cons)) * 100, digits = 1)
##
## 0 1
## 51.1 48.9
table(a20sub$lib.or.cons)
##
## 0 1
## 862 826
# normalize function
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
# normalize non-target vectors
a20sub_n <- as.data.frame(lapply(a20sub[2:13], normalize))
# check normality
summary(a20sub_n)
## age t.c_fundamentals t.feminists t.liberals
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3185 1st Qu.:0.3000 1st Qu.:0.5000 1st Qu.:0.4000
## Median :0.5484 Median :0.5000 Median :0.6000 Median :0.5000
## Mean :0.5358 Mean :0.4678 Mean :0.6005 Mean :0.5229
## 3rd Qu.:0.7581 3rd Qu.:0.6000 3rd Qu.:0.8000 3rd Qu.:0.7000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## t.conservatives t.gayandlesbian t.transgender t.BLM
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.4000 1st Qu.:0.5000 1st Qu.:0.5000 1st Qu.:0.4000
## Median :0.5000 Median :0.6000 Median :0.5000 Median :0.6000
## Mean :0.5156 Mean :0.6514 Mean :0.6013 Mean :0.5776
## 3rd Qu.:0.6000 3rd Qu.:0.8500 3rd Qu.:0.8500 3rd Qu.:0.8500
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## t.NRA t.socialists t.capitalists t.plannedparenthood
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3000 1st Qu.:0.3000 1st Qu.:0.4000 1st Qu.:0.5000
## Median :0.5000 Median :0.5000 Median :0.5000 Median :0.7000
## Mean :0.4932 Mean :0.4128 Mean :0.5171 Mean :0.6515
## 3rd Qu.:0.7000 3rd Qu.:0.5000 3rd Qu.:0.6000 3rd Qu.:0.8500
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
# create training and test set
a20sub_train <- a20sub_n[1:1588, ]
a20sub_test <- a20sub_n[1589:1688, ]
sqrt(844) # k~29
## [1] 29.05168
# class labels for target vector
a20sub_train_labels <- a20sub[1:1588, 1]
a20sub_test_labels <- a20sub[1589:1688, 1]
# check length
dim(a20sub_train)
## [1] 1588 12
dim(a20sub_test)
## [1] 100 12
dim(a20sub_train_labels)
## [1] 1588 1
dim(a20sub_test_labels)
## [1] 100 1
# train model
libcons_pred <- knn(train = a20sub_train,test = a20sub_test,cl=a20sub_train_labels$lib.or.cons, k=29)
# evaluation of model classification (self-placement liberal (1) & conservative (0) with normalized features (thermometer)
eval1<-CrossTable(x = a20sub_test_labels$lib.or.cons, y = libcons_pred,
prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | libcons_pred
## a20sub_test_labels$lib.or.cons | 0 | 1 | Row Total |
## -------------------------------|-----------|-----------|-----------|
## 0 | 33 | 21 | 54 |
## | 0.611 | 0.389 | 0.540 |
## | 0.846 | 0.344 | |
## | 0.330 | 0.210 | |
## -------------------------------|-----------|-----------|-----------|
## 1 | 6 | 40 | 46 |
## | 0.130 | 0.870 | 0.460 |
## | 0.154 | 0.656 | |
## | 0.060 | 0.400 | |
## -------------------------------|-----------|-----------|-----------|
## Column Total | 39 | 61 | 100 |
## | 0.390 | 0.610 | |
## -------------------------------|-----------|-----------|-----------|
##
##
The model classified 33% as true negatives (self-placement conservative), 21% as false positives, 6% as false negatives, and 40% as true positives (self-placement liberal).
# z-score standardized a20sub
a20sub_z <- as.data.frame(scale(a20sub[-1]))
# check standardization
summary(a20sub_z)
## age t.c_fundamentals t.feminists t.liberals
## Min. :-2.00941 Min. :-1.8427 Min. :-2.518500 Min. :-2.2621
## 1st Qu.:-0.81479 1st Qu.:-0.6610 1st Qu.:-0.421365 1st Qu.:-0.5318
## Median : 0.04716 Median : 0.1268 Median :-0.001938 Median :-0.0992
## Mean : 0.00000 Mean : 0.0000 Mean : 0.000000 Mean : 0.0000
## 3rd Qu.: 0.83349 3rd Qu.: 0.5206 3rd Qu.: 0.836916 3rd Qu.: 0.7660
## Max. : 1.74080 Max. : 2.0962 Max. : 1.675770 Max. : 2.0637
## t.conservatives t.gayandlesbian t.transgender t.BLM
## Min. :-2.3872 Min. :-2.6467 Min. :-2.4005 Min. :-1.84369
## 1st Qu.:-0.5354 1st Qu.:-0.6153 1st Qu.:-0.4045 1st Qu.:-0.56685
## Median :-0.0724 Median :-0.2090 Median :-0.4045 Median : 0.07158
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.3905 3rd Qu.: 0.8068 3rd Qu.: 0.9926 3rd Qu.: 0.86961
## Max. : 2.2423 Max. : 1.4162 Max. : 1.5914 Max. : 1.34842
## t.NRA t.socialists t.capitalists t.plannedparenthood
## Min. :-1.59517 Min. :-1.6988 Min. :-2.3116 Min. :-2.3747
## 1st Qu.:-0.62484 1st Qu.:-0.4643 1st Qu.:-0.5234 1st Qu.:-0.5523
## Median : 0.02205 Median : 0.3586 Median :-0.0763 Median : 0.1767
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.66895 3rd Qu.: 0.3586 3rd Qu.: 0.3708 3rd Qu.: 0.7234
## Max. : 1.63928 Max. : 2.4161 Max. : 2.1590 Max. : 1.2702
# create new training and test sets
a20sub_train2 <- a20sub_z[1:1588, ]
a20sub_test2 <- a20sub_z[1589:1688, ]
sqrt(844) # k~29
## [1] 29.05168
# class labels for target vector
a20sub_train_labels2 <- a20sub[1:1588, 1]
a20sub_test_labels2 <- a20sub[1589:1688, 1]
# check length
dim(a20sub_train2)
## [1] 1588 12
dim(a20sub_test2)
## [1] 100 12
dim(a20sub_train_labels2)
## [1] 1588 1
dim(a20sub_test_labels2)
## [1] 100 1
# train model
libcons_pred2 <- knn(train = a20sub_train2,test = a20sub_test2,cl=a20sub_train_labels2$lib.or.cons, k=29)
# evaluation of model classification (self-placement liberal (1) & conservative (0) with normalized features (thermometer)
eval2<-CrossTable(x = a20sub_test_labels2$lib.or.cons, y = libcons_pred2,
prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | libcons_pred2
## a20sub_test_labels2$lib.or.cons | 0 | 1 | Row Total |
## --------------------------------|-----------|-----------|-----------|
## 0 | 38 | 16 | 54 |
## | 0.704 | 0.296 | 0.540 |
## | 0.844 | 0.291 | |
## | 0.380 | 0.160 | |
## --------------------------------|-----------|-----------|-----------|
## 1 | 7 | 39 | 46 |
## | 0.152 | 0.848 | 0.460 |
## | 0.156 | 0.709 | |
## | 0.070 | 0.390 | |
## --------------------------------|-----------|-----------|-----------|
## Column Total | 45 | 55 | 100 |
## | 0.450 | 0.550 | |
## --------------------------------|-----------|-----------|-----------|
##
##
The model classified 38% as true negatives (self-placement conservative), 16% as false positives, 7% as false negatives, and 39% as true positives (self-placement liberal).
The model with normalization showed greater accuracy.