library(haven)
library(car)
library(foreign) 
library(readr)
library(dplyr)
library(RWeka)
library(RODBC)
library(class)
library(gmodels)

This project explores a basic application of “k nearest neighbor” classification. The data used are for practice and were drawn from ANES 2020 (.dta and codebook) and text: “Machine Learning with R”.

# call data
a20<-read_dta("C:\\Users\\Jaire\\OneDrive\\Desktop\\Exploratory Research\\Data\\ANES2020T.dta")

Features & Transformations

# self-placement: liberal (1) or conservative (0)
a20$lib.or.cons<-car::recode(a20$V201201, recodes = "-9:-1=NA;2=0;3=NA")
table(a20$lib.or.cons)
## 
##    0    1 
## 1534 1320
#age
a20$age<-car::recode(a20$V201507x, recodes = "-9:-1=NA")
table(a20$age)
## 
##  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37 
##  35  52  46  51  57  75  92 104 108 132 120 131 142 109 117 123 142 152 144 149 
##  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57 
## 152 151 139 151 113 116 111 116 119 106 105 123 154 128 111 117 123 140 127 136 
##  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77 
## 145 154 168 139 154 156 155 180 170 142 140 158 126 147 145 147  94  93  89  81 
##  78  79  80 
##  64  63 403
# feeling thermometer: Christian fundamentals
a20$t.c_fundamentals<-car::recode(a20$V202159, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.c_fundamentals)
## 
##    0    1    2    3    4    5    6    7    8    9   10   15   16   17   18   19 
##  955    7    8    2    2   42    2    1    1    1   92  511    2    1    1    1 
##   20   22   23   25   30   33   35   39   40   43   45   49   50   51   55   56 
##   84    2    1   50  482    3   23    1  605    2   25    6 1911    3   25    1 
##   57   58   59   60   64   65   66   67   68   69   70   75   77   78   80   84 
##    1    1    1  533    1   27    1    1    2    1  577   47    1    1   66    1 
##   85   86   89   90   95   98   99  100 
##  514    4    1   64   20    1    5  470
# feeling thermometer: feminists
a20$t.feminists<-car::recode(a20$V202160, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.feminists)
## 
##    0    1    2    3    4    5    6    7    8   10   14   15   16   18   20   22 
##  406    9    3    1    1   15    2    3    2   35    1  250    2    1   33    2 
##   23   25   30   33   35   38   40   45   48   49   50   51   53   55   56   57 
##    1   31  360    2   19    1  582   30    2    6 1726    2    1   26    1    2 
##   59   60   63   65   66   67   69   70   72   75   77   78   79   80   85   86 
##    1  719    1   40    1    1    1  803    1  111    2    1    4  140  909    7 
##   87   88   89   90   94   95   98   99  100 
##    2    5    1  148    1   56    5    6  802
# feeling thermometer: liberals
a20$t.liberals<-car::recode(a20$V202161, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.liberals)
## 
##    0    1    2    3    4    5    6    7   10   12   13   15   16   18   20   22 
##  869    8    7    6    1   28    2    2   57    1    1  471    5    1   49    1 
##   25   29   30   33   35   38   40   45   48   49   50   51   52   55   57   60 
##   36    1  501    4   23    1  570   28    1    5 1352    4    3   32    2  723 
##   63   65   67   68   70   75   77   78   79   80   83   85   86   87   88   90 
##    1   60    1    1  818  106    5    3    2  149    1  801    9    2    4   99 
##   92   95   98   99  100 
##    3   38    4    4  418
# feeling thermometer: conservatives
a20$t.conservatives<-car::recode(a20$V202164, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.conservatives)
## 
##    0    1    2    3    4    5    6    7    8    9   10   12   15   16   17   18 
##  457    6    6    2    3   27    1    3    1    4   63    1  413    1    2    1 
##   20   22   25   26   29   30   33   34   35   36   40   42   44   45   48   49 
##   75    1   52    1    1  554    1    1   43    1  698    1    2   49    2    3 
##   50   51   54   55   58   59   60   65   66   67   68   70   71   72   75   76 
## 1415    4    1   36    1    2  786   47    2    1    1  709    2    1   94    1 
##   77   79   80   85   86   88   90   92   95   97   98   99  100 
##    1    3  108  854    4    1   95    1   32    1    2    5  634
# feeling thermometer: gays and lesbians
a20$t.gayandlesbian<-car::recode(a20$V202166, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.gayandlesbian)
## 
##    0    1    2    3    4    5    6    7    8    9   10   15   16   20   25   30 
##  355    2    2    1    1    5    1    2    1    1   19  132    1   12    9  149 
##   35   38   40   45   48   49   50   51   52   54   55   57   59   60   65   69 
##    5    1  245   15    1    1 2223    3    1    1   13    1    2  443   30    3 
##   70   75   76   77   78   79   80   82   83   85   86   87   88   89   90   95 
##  764   82    1    1    2    1   99    1    1  984    8    1    2    1  154   57 
##   98   99  100 
##    6    8 1467
# feeling thermometer: transgenders
a20$t.transgender<-car::recode(a20$V202172, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.transgender)
## 
##    0    1    2    3    4    5    8   10   12   13   15   20   25   27   30   35 
##  493    5    2    1    5   15    1   26    1    1  200   25   13    1  213   10 
##   40   42   44   45   48   49   50   51   52   55   60   61   63   65   66   67 
##  381    1    1   16    1    2 2401    1    1   15  495    1    1   27    2    1 
##   68   70   72   75   76   77   78   79   80   84   85   86   89   90   95   96 
##    1  674    1   93    1    1    1    1   84    1  808    5    1  110   34    1 
##   98   99  100 
##    3    4 1123
# feeling thermometer: black lives matter
a20$t.BLM<-car::recode(a20$V202174, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.BLM)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   15   16   18 
## 1380   16    8    1    2   21    1    3    5    3   50    1    3  417    1    2 
##   20   25   26   30   35   40   42   43   44   45   47   48   49   50   51   55 
##   41   27    1  320   16  397    1    1    2   14    1    2    4  737    2   26 
##   59   60   63   65   68   69   70   73   75   79   80   81   82   85   86   87 
##    1  568    1   50    2    1  732    1   91    2  109    1    1  886    7    1 
##   88   89   90   94   95   98   99  100 
##    6    1  158    1   66    5    4 1143
# feeling thermometer: NRA
a20$t.NRA<-car::recode(a20$V202178, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.NRA)
## 
##    0    1    2    3    4    5    6    7    8    9   10   12   15   16   17   18 
## 1329    5    8    6    1   45    2    1    1    1  106    2  467    2    1    1 
##   20   23   25   27   30   32   35   38   40   42   44   45   49   50   51   53 
##   74    1   32    1  394    1   25    1  440    1    2   33    2 1026    3    2 
##   55   58   60   62   65   66   67   68   69   70   73   74   75   77   80   85 
##   23    2  450    1   26    2    1    3    1  559    1    1   66    2   78  712 
##   86   87   88   89   90   95   96   97   98   99  100 
##    3    1    6    4   98   28    1    1    2    6  898
# feeling thermometer: socialists
a20$t.socialists<-car::recode(a20$V202179, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.socialists)
## 
##    0    1    2    3    4    5    6    7    8   10   11   12   15   16   18   20 
## 1634   14    6    4    3   29    2    1    1   73    1    1  502    2    2   57 
##   23   25   30   31   32   33   35   40   45   49   50   51   52   55   56   58 
##    1   28  424    1    1    1   15  472   37    2 1670    5    1   39    2    1 
##   60   63   65   66   67   68   69   70   74   75   76   79   80   81   84   85 
##  554    1   52    2    2    3    1  449    1   64    1    3   70    1    1  278 
##   86   88   89   90   95   96  100 
##    4    5    2   43   10    1  171
# feeling thermometer: capitalists
a20$t.capitalists<-car::recode(a20$V202180, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.capitalists)
## 
##    0    1    2    5    6    7    9   10   13   14   15   17   18   19   20   21 
##  384    4    3   18    3    2    1   32    1    1  201    1    1    1   50    1 
##   25   30   33   34   35   38   40   43   45   46   48   49   50   51   53   55 
##   32  326    1    1   24    1  601    2   41    1    1    2 1842    3    2   39 
##   58   59   60   64   65   66   68   69   70   75   76   77   78   79   80   85 
##    2    1  772    1   60    1    1    2  769   97    2    1    2    2  116  643 
##   86   87   88   89   90   95   97   98   99  100 
##    3    2    1    2   77   30    1    2    6  396
# feeling thermometer: planned parenthood
a20$t.plannedparenthood<-car::recode(a20$V202185, recodes = "-9:-1=NA;998:1000=NA")
table(a20$t.plannedparenthood)
## 
##    0    1    2    3    5    6    7    9   10   12   15   16   18   20   22   25 
##  943    9    1    2   25    1    1    2   41    1  285    2    1   36    1   21 
##   30   33   35   38   39   40   44   45   48   49   50   55   57   58   60   65 
##  254    1   12    1    1  278    1   15    1    2 1043   18    1    1  492   40 
##   66   68   69   70   72   73   74   75   78   80   84   85   86   87   88   89 
##    1    1    1  726    3    1    1   86    3  137    3  925   10    1    5    2 
##   90   92   93   95   96   97   98   99  100 
##  164    1    2   68    2    1    4    6 1465
# subset, w/o V200001 (Case ID)
a20sub<-dplyr::select(a20,lib.or.cons,age,t.c_fundamentals,t.feminists,t.liberals,t.conservatives,t.gayandlesbian,t.transgender,t.BLM,t.NRA,t.socialists,t.capitalists,t.plannedparenthood
)%>%
  filter(complete.cases(.))
# check columns
colnames(a20sub)
##  [1] "lib.or.cons"         "age"                 "t.c_fundamentals"   
##  [4] "t.feminists"         "t.liberals"          "t.conservatives"    
##  [7] "t.gayandlesbian"     "t.transgender"       "t.BLM"              
## [10] "t.NRA"               "t.socialists"        "t.capitalists"      
## [13] "t.plannedparenthood"
# target vector (self-placement liberal/conservative)
round(prop.table(table(a20sub$lib.or.cons)) * 100, digits = 1)
## 
##    0    1 
## 51.1 48.9
table(a20sub$lib.or.cons)
## 
##   0   1 
## 862 826
# normalize function
 normalize <- function(x) {
 return ((x - min(x)) / (max(x) - min(x)))
 }
# normalize non-target vectors
a20sub_n <- as.data.frame(lapply(a20sub[2:13], normalize))
# check normality 
summary(a20sub_n)
##       age         t.c_fundamentals  t.feminists       t.liberals    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3185   1st Qu.:0.3000   1st Qu.:0.5000   1st Qu.:0.4000  
##  Median :0.5484   Median :0.5000   Median :0.6000   Median :0.5000  
##  Mean   :0.5358   Mean   :0.4678   Mean   :0.6005   Mean   :0.5229  
##  3rd Qu.:0.7581   3rd Qu.:0.6000   3rd Qu.:0.8000   3rd Qu.:0.7000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  t.conservatives  t.gayandlesbian  t.transgender        t.BLM       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.4000   1st Qu.:0.5000   1st Qu.:0.5000   1st Qu.:0.4000  
##  Median :0.5000   Median :0.6000   Median :0.5000   Median :0.6000  
##  Mean   :0.5156   Mean   :0.6514   Mean   :0.6013   Mean   :0.5776  
##  3rd Qu.:0.6000   3rd Qu.:0.8500   3rd Qu.:0.8500   3rd Qu.:0.8500  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      t.NRA         t.socialists    t.capitalists    t.plannedparenthood
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000     
##  1st Qu.:0.3000   1st Qu.:0.3000   1st Qu.:0.4000   1st Qu.:0.5000     
##  Median :0.5000   Median :0.5000   Median :0.5000   Median :0.7000     
##  Mean   :0.4932   Mean   :0.4128   Mean   :0.5171   Mean   :0.6515     
##  3rd Qu.:0.7000   3rd Qu.:0.5000   3rd Qu.:0.6000   3rd Qu.:0.8500     
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
# create training and test set
a20sub_train <- a20sub_n[1:1588, ]
a20sub_test <- a20sub_n[1589:1688, ]
sqrt(844) # k~29
## [1] 29.05168
# class labels for target vector
a20sub_train_labels <- a20sub[1:1588, 1]
a20sub_test_labels <- a20sub[1589:1688, 1]
# check length
dim(a20sub_train)
## [1] 1588   12
dim(a20sub_test)
## [1] 100  12
dim(a20sub_train_labels)
## [1] 1588    1
dim(a20sub_test_labels)
## [1] 100   1

Model

# train model
libcons_pred <- knn(train = a20sub_train,test = a20sub_test,cl=a20sub_train_labels$lib.or.cons, k=29)
# evaluation of model classification (self-placement liberal (1) & conservative (0) with normalized features (thermometer)
eval1<-CrossTable(x = a20sub_test_labels$lib.or.cons, y = libcons_pred,
 prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                                | libcons_pred 
## a20sub_test_labels$lib.or.cons |         0 |         1 | Row Total | 
## -------------------------------|-----------|-----------|-----------|
##                              0 |        33 |        21 |        54 | 
##                                |     0.611 |     0.389 |     0.540 | 
##                                |     0.846 |     0.344 |           | 
##                                |     0.330 |     0.210 |           | 
## -------------------------------|-----------|-----------|-----------|
##                              1 |         6 |        40 |        46 | 
##                                |     0.130 |     0.870 |     0.460 | 
##                                |     0.154 |     0.656 |           | 
##                                |     0.060 |     0.400 |           | 
## -------------------------------|-----------|-----------|-----------|
##                   Column Total |        39 |        61 |       100 | 
##                                |     0.390 |     0.610 |           | 
## -------------------------------|-----------|-----------|-----------|
## 
## 

The model classified 33% as true negatives (self-placement conservative), 21% as false positives, 6% as false negatives, and 40% as true positives (self-placement liberal).

Alternative Specification (z-score standardization)

# z-score standardized a20sub
a20sub_z <- as.data.frame(scale(a20sub[-1]))
# check standardization  
summary(a20sub_z)
##       age           t.c_fundamentals   t.feminists          t.liberals     
##  Min.   :-2.00941   Min.   :-1.8427   Min.   :-2.518500   Min.   :-2.2621  
##  1st Qu.:-0.81479   1st Qu.:-0.6610   1st Qu.:-0.421365   1st Qu.:-0.5318  
##  Median : 0.04716   Median : 0.1268   Median :-0.001938   Median :-0.0992  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.000000   Mean   : 0.0000  
##  3rd Qu.: 0.83349   3rd Qu.: 0.5206   3rd Qu.: 0.836916   3rd Qu.: 0.7660  
##  Max.   : 1.74080   Max.   : 2.0962   Max.   : 1.675770   Max.   : 2.0637  
##  t.conservatives   t.gayandlesbian   t.transgender         t.BLM         
##  Min.   :-2.3872   Min.   :-2.6467   Min.   :-2.4005   Min.   :-1.84369  
##  1st Qu.:-0.5354   1st Qu.:-0.6153   1st Qu.:-0.4045   1st Qu.:-0.56685  
##  Median :-0.0724   Median :-0.2090   Median :-0.4045   Median : 0.07158  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.3905   3rd Qu.: 0.8068   3rd Qu.: 0.9926   3rd Qu.: 0.86961  
##  Max.   : 2.2423   Max.   : 1.4162   Max.   : 1.5914   Max.   : 1.34842  
##      t.NRA           t.socialists     t.capitalists     t.plannedparenthood
##  Min.   :-1.59517   Min.   :-1.6988   Min.   :-2.3116   Min.   :-2.3747    
##  1st Qu.:-0.62484   1st Qu.:-0.4643   1st Qu.:-0.5234   1st Qu.:-0.5523    
##  Median : 0.02205   Median : 0.3586   Median :-0.0763   Median : 0.1767    
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000    
##  3rd Qu.: 0.66895   3rd Qu.: 0.3586   3rd Qu.: 0.3708   3rd Qu.: 0.7234    
##  Max.   : 1.63928   Max.   : 2.4161   Max.   : 2.1590   Max.   : 1.2702
# create new training and test sets
a20sub_train2 <- a20sub_z[1:1588, ]
a20sub_test2 <- a20sub_z[1589:1688, ]
sqrt(844) # k~29
## [1] 29.05168
# class labels for target vector
a20sub_train_labels2 <- a20sub[1:1588, 1]
a20sub_test_labels2 <- a20sub[1589:1688, 1]
# check length
dim(a20sub_train2)
## [1] 1588   12
dim(a20sub_test2)
## [1] 100  12
dim(a20sub_train_labels2)
## [1] 1588    1
dim(a20sub_test_labels2)
## [1] 100   1

Model 2

# train model
libcons_pred2 <- knn(train = a20sub_train2,test = a20sub_test2,cl=a20sub_train_labels2$lib.or.cons, k=29)
# evaluation of model classification (self-placement liberal (1) & conservative (0) with normalized features (thermometer)
eval2<-CrossTable(x = a20sub_test_labels2$lib.or.cons, y = libcons_pred2,
 prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                                 | libcons_pred2 
## a20sub_test_labels2$lib.or.cons |         0 |         1 | Row Total | 
## --------------------------------|-----------|-----------|-----------|
##                               0 |        38 |        16 |        54 | 
##                                 |     0.704 |     0.296 |     0.540 | 
##                                 |     0.844 |     0.291 |           | 
##                                 |     0.380 |     0.160 |           | 
## --------------------------------|-----------|-----------|-----------|
##                               1 |         7 |        39 |        46 | 
##                                 |     0.152 |     0.848 |     0.460 | 
##                                 |     0.156 |     0.709 |           | 
##                                 |     0.070 |     0.390 |           | 
## --------------------------------|-----------|-----------|-----------|
##                    Column Total |        45 |        55 |       100 | 
##                                 |     0.450 |     0.550 |           | 
## --------------------------------|-----------|-----------|-----------|
## 
## 

The model classified 38% as true negatives (self-placement conservative), 16% as false positives, 7% as false negatives, and 39% as true positives (self-placement liberal).

The model with normalization showed greater accuracy.