ANLY 530 - Lab 2

1. Naïve Bayes Classifier

Question 1: Report the number of missing values in the dataset.

Answer 1: The dataset contains 1000 observations of 21 variables. No missing values are found in the dataset.

Question 2: Compute the percentage of both classes similar to Lab 1 and see if the distribution of both classes preserved for both training and testing data.

Answer 2: As Step 1 shows, the proportion of class 1 is 68.54% in the training set, and it is 74.40% in the test set.

As for the training set, false negatives (Type II error) are 85, while false positives (Type I error) are 117. True positives are 151, and true negatives are 397; hence, the accuracy is 73.07%. “Positive” means 0 being creditable (not defaulted). “Negative” means 1 being not creditable (defaulted). The data dictionary seems to wrongly describe “Creditability”.

As for the test set, false negatives (Type II error) are 22, while false positives (Type I error) are 35 True positives are 42, and true negatives are 151; hence, the accuracy is 77.20%.

Step 1: Exploring and preparing the data
This step is similar to Lab 1’s Tab 1 Step 1.

credit <- read.csv("~/Documents/HU/ANLY 530-53-B/Module 9- Support Vector Machine part II/Lab2/creditData.csv")
sum(is.na(credit))

## [1] 0

credit$Creditability <- as.factor(credit$Creditability) #creditable: yes(0), no(1); no creditable = default

set.seed(12345)
creditR <- credit[order(runif(1000)),]
creditTraining <- creditR[1:750,]
creditTest <- creditR[751:1000,]

prop.table(table(creditTraining$Creditability))

## 
##         0         1 
## 0.3146667 0.6853333

prop.table(table(creditTest$Creditability))

## 
##     0     1 
## 0.256 0.744

Step 2: Training a model on the data

creditModelNB <- naive_bayes(Creditability~., data=creditTraining)
creditModelNB

## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = creditTraining)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: Account.Balance (Gaussian) 
## --------------------------------------------------------------------------------- 
##                
## Account.Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
## --------------------------------------------------------------------------------- 
##  ::: Duration.of.Credit..month. (Gaussian) 
## --------------------------------------------------------------------------------- 
##                           
## Duration.of.Credit..month.        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
## --------------------------------------------------------------------------------- 
##  ::: Payment.Status.of.Previous.Credit (Gaussian) 
## --------------------------------------------------------------------------------- 
##                                  
## Payment.Status.of.Previous.Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
## --------------------------------------------------------------------------------- 
##  ::: Purpose (Gaussian) 
## --------------------------------------------------------------------------------- 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
## --------------------------------------------------------------------------------- 
##  ::: Credit.Amount (Gaussian) 
## --------------------------------------------------------------------------------- 
##              
## Credit.Amount        0        1
##          mean 3964.195 2984.177
##          sd   3597.093 2379.685
## 
## ---------------------------------------------------------------------------------
## 
## # ... and 15 more tables
## 
## ---------------------------------------------------------------------------------

#accuracy on the training set
creditTrPredNB <- predict(creditModelNB, creditTraining, type="class")
CrossTable(creditTraining$Creditability, creditTrPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual Creditability", "Predicted Creditability")) #a confusion matrix of binary classification, "negative" means being (1) not creditable / defaulted / declined

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  750 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |       151 |        85 |       236 | 
##                      |     0.201 |     0.113 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |       117 |       397 |       514 | 
##                      |     0.156 |     0.529 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |       268 |       482 |       750 | 
## ---------------------|-----------|-----------|-----------|
## 
##

Step 3: Evaluating model performance

#accuracy on the test set
creditPredNB <- predict(creditModelNB, creditTest, type="class")
CrossTable(creditTest$Creditability, creditPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual Creditability", "Predicted Creditability")) #a confusion matrix of binary classification, "negative" means being (1) not creditable / defaulted / declined

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  250 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |        42 |        22 |        64 | 
##                      |     0.168 |     0.088 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        35 |       151 |       186 | 
##                      |     0.140 |     0.604 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        77 |       173 |       250 | 
## ---------------------|-----------|-----------|-----------|
## 
##

2. Pre-processing

Question 3: What is the accuracy this time?

Answer 3: The target / dependent / response variable is selected as “Creditability”.
The predictor / independent / explanatory variables, before pre-processing, are selected as ①“Account.Balance”, ②“Duration.of.Credit..month.”, ③“Payment.Status.of.Previous.Credit”, ④“Purpose”, ⑤“Credit.Amount”, ⑥“Value.Savings.Stocks”, ⑦“Length.of.current.employment”, ⑧“Instalment.per.cent”, ⑨“Sex…Marital.Status”, ⑩“Guarantors”, ⑪“Duration.in.Current.address”, ⑫“Most.valuable.available.asset”, ⑬“Age..years.”, ⑭“Concurrent.Credits”, ⑮“Type.of.apartment”, ⑯“No.of.Credits.at.this.Bank”, ⑰“Occupation”, ⑱“No.of.dependents”, ⑲“Telephone”, and ⑳“Foreign.Worker”.

Variables ① ③ ⑥ ⑦ ⑧ ⑪ ⑯ ⑱ are ordinal, and variables ④ ⑨ ⑩ ⑫ ⑭ ⑮ ⑰ ⑲ ⑳ are nominal. Variable ② ⑤ ⑬ was originally metric but categorized into the ordinal type.

Similar to Lab 1’s Tab 3, it is incorrect to scale / standardize the categorical data and compute their correlations. Hence, Lab 2’s Part 2 instruction seems to make no sense. Twenty variables are all categorical; therefore, the Chi-squared Test of Independence is run instead. The null hypothesis (\(H_0\)) is that there is no association between the two variables. The alternative hypothesis (\(H_α\)) is that there is an association between the two variables. The results table for such tests is shown in the below chunk.

Five variables “Purpose”, “Length.of.current.employment”, “Most.valuable.available.asset”, “Type.of.apartment”, and “Occupation” are having many associations with other variables. As a result, these five variables are removed from the selection of independent variables. The new model shows the accuracy of 71.47% and 78.40% respectively, on the training set and the test set. The pre-processing technique slightly improves the model performance on the test set.

Step 1: Exploring and preparing the data

#change metric variables into categorical
credit$Duration.of.Credit..month. <- case_when(credit$Duration.of.Credit..month.>54 ~1,
                                               credit$Duration.of.Credit..month.>48 ~2,
                                               credit$Duration.of.Credit..month.>42 ~3,
                                               credit$Duration.of.Credit..month.>36 ~4,
                                               credit$Duration.of.Credit..month.>30 ~5,
                                               credit$Duration.of.Credit..month.>24 ~6,
                                               credit$Duration.of.Credit..month.>18 ~7,
                                               credit$Duration.of.Credit..month.>12 ~8,
                                               credit$Duration.of.Credit..month.>6 ~9,
                                               T ~10)
credit$Credit.Amount <- case_when(credit$Credit.Amount>20000 ~1,
                                  credit$Credit.Amount>15000 ~2,
                                  credit$Credit.Amount>10000 ~3,
                                  credit$Credit.Amount>7500 ~4,
                                  credit$Credit.Amount>5000 ~5,
                                  credit$Credit.Amount>2500 ~6,
                                  credit$Credit.Amount>1500 ~7,
                                  credit$Credit.Amount>1000 ~8,
                                  credit$Credit.Amount>500 ~9,
                                  T ~10)
credit$Age..years. <- case_when(credit$Age..years.>=65 ~4,
                                credit$Age..years.>=60 ~5,
                                credit$Age..years.>=40 ~3,
                                credit$Age..years.>=26 ~2,
                                T ~1)

#REMINDER: "for" loop is not recommended in R
pairs <- c()
for (i in 1:(ncol(credit)-2)) {
  for (j in (i+1):(ncol(credit)-1)) {
    if (chisq.test(table(credit[,i+1],credit[,j+1]))$p.value<0.05) {
      pairs <- c(pairs, c(i,j))
    }
  }
} #O(n²)
#pairs of variables that are rejected by Chi-squared Test of Independence
#such variables in a pair have a significant association
table(pairs)

## pairs
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
## 11 12 11 18  9  4 15  6 13  6  9 15 12  7 16 10 15 11 11  9

#REMINDER: declaration with dimensions will speed up the execution in R
chiTable <- data.frame(matrix(nrow=20,ncol=20))
for (i in 1:20) {
  for (j in 1:20) {
    if (chisq.test(table(credit[,i+1],credit[,j+1]))$p.value<0.05) {
      chiTable[i,j] <- "⬤"
    } else {
      chiTable[i,j] <- "◯"
    }
  }
}
colnames(chiTable) <- c(1:20)
chiTable$Variable <- c(1:20)
chiTable <- chiTable[,c(21,1:20)]
#full results of Chi-squared Test of Independence: ⬤ reject null hypothesis; ◯ fail to reject null hypothesis
chiTable %>% kable() %>% kable_styling(full_width=F)

Variable	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20
1	⬤	◯	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯
2	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤
3	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	◯	◯
4	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
5	◯	⬤	⬤	⬤	⬤	◯	◯	⬤	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	◯
6	⬤	◯	◯	⬤	◯	⬤	⬤	◯	◯	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯
7	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
8	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯	⬤
9	◯	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
10	⬤	◯	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	◯	◯	◯	⬤	◯	◯	⬤
11	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	◯
12	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
13	⬤	◯	⬤	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
14	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	⬤	◯	⬤	⬤	⬤	⬤	◯	◯	◯
15	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
16	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	◯	◯
17	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
18	◯	⬤	⬤	⬤	◯	◯	⬤	◯	⬤	◯	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤
19	◯	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤
20	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	◯	⬤	◯	⬤	⬤	⬤	⬤

set.seed(12345)
creditR2 <- credit[order(runif(1000)),-c(5,8,13,16,18)] #remove five variables with the count no less than 15 in the pairs table (the potential maximum count could be 19) 
creditTraining2 <- creditR2[1:750,]
creditTest2 <- creditR2[751:1000,]

Step 2: Training a model on the data

creditModelNB2 <- naive_bayes(Creditability~., data=creditTraining2)
creditModelNB2

## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = creditTraining2)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: Account.Balance (Gaussian) 
## --------------------------------------------------------------------------------- 
##                
## Account.Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
## --------------------------------------------------------------------------------- 
##  ::: Duration.of.Credit..month. (Gaussian) 
## --------------------------------------------------------------------------------- 
##                           
## Duration.of.Credit..month.        0        1
##                       mean 6.838983 7.671206
##                       sd   2.246398 1.819021
## 
## --------------------------------------------------------------------------------- 
##  ::: Payment.Status.of.Previous.Credit (Gaussian) 
## --------------------------------------------------------------------------------- 
##                                  
## Payment.Status.of.Previous.Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
## --------------------------------------------------------------------------------- 
##  ::: Credit.Amount (Gaussian) 
## --------------------------------------------------------------------------------- 
##              
## Credit.Amount        0        1
##          mean 6.436441 6.756809
##          sd   1.788910 1.435149
## 
## --------------------------------------------------------------------------------- 
##  ::: Value.Savings.Stocks (Gaussian) 
## --------------------------------------------------------------------------------- 
##                     
## Value.Savings.Stocks        0        1
##                 mean 1.711864 2.334630
##                 sd   1.340700 1.674510
## 
## ---------------------------------------------------------------------------------
## 
## # ... and 10 more tables
## 
## ---------------------------------------------------------------------------------

#accuracy on the training set
creditTrPredNB2 <- predict(creditModelNB2, creditTraining2, type="class")
CrossTable(creditTraining2$Creditability, creditTrPredNB2, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual Creditability", "Predicted Creditability"))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  750 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |       153 |        83 |       236 | 
##                      |     0.204 |     0.111 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |       131 |       383 |       514 | 
##                      |     0.175 |     0.511 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |       284 |       466 |       750 | 
## ---------------------|-----------|-----------|-----------|
## 
##

Step 3: Evaluating model performance

#accuracy on the test set
creditPredNB2 <- predict(creditModelNB2, creditTest2, type="class")
CrossTable(creditTest2$Creditability, creditPredNB2, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual Creditability", "Predicted Creditability"))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  250 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |        45 |        19 |        64 | 
##                      |     0.180 |     0.076 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        35 |       151 |       186 | 
##                      |     0.140 |     0.604 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        80 |       170 |       250 | 
## ---------------------|-----------|-----------|-----------|
## 
##

3. Support Vector Machine

Question 4: We may be able to do better than this by changing the Kernels. Try Polynomial and RBF kernels to improve the result.

Answer 4: They are shown as below: the accuracy rates from the confusion matrix on the training and the test set by Support Vector Machine model with different kernels.

	Linear kernel	Polynomial kernel	Radial Basis kernel
Training set	86.65%	86.65%	94.93%
Test set	83.95%	84.00%	93.35%

Step 1: Collecting the data

letters <- read.csv("~/Documents/HU/ANLY 530-53-B/Module 9- Support Vector Machine part II/Lab2/letterdata.csv")
str(letters)

## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...

Step 2: Preparing the data

#no randomization in sampling
lettersTraining <- letters[1:18000,] #90%
lettersTest <- letters[18001:20000,]

Step 3: Training a model on the data

lettersModelSVM <- ksvm(letter~., data=lettersTraining) #about 1min runtime
summary(lettersModelSVM)

## Length  Class   Mode 
##      1   ksvm     S4

lettersTrPredSVM <- predict(lettersModelSVM, lettersTraining) #about 1min runtime
t1 <- table(lettersTraining$letter, lettersTrPredSVM) #column: predicted; row: actual
t1 %>% kable() %>% kable_styling()

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	701	0	2	1	0	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0	0	0	0	2	0
B	1	669	0	6	4	0	0	3	0	0	0	0	0	0	0	0	0	8	2	1	0	0	0	1	0	0
C	0	0	619	0	12	0	13	1	0	0	3	0	1	0	9	0	0	0	0	0	1	2	0	0	0	0
D	0	6	0	701	0	0	0	10	0	0	0	0	0	5	3	0	0	5	0	0	1	0	0	0	0	0
E	0	6	0	0	648	1	24	0	0	0	1	0	0	0	0	0	2	3	2	0	0	0	0	0	0	3
F	0	3	0	2	1	671	1	1	1	0	0	0	0	0	0	2	0	0	2	9	0	0	0	1	1	0
G	0	0	2	11	2	1	654	1	0	0	1	1	0	0	8	0	4	4	0	0	0	1	2	0	0	0
H	0	13	0	22	0	0	8	571	0	0	7	0	1	1	4	0	4	31	0	0	3	0	0	1	1	0
I	1	1	3	4	0	7	0	0	630	23	0	0	0	0	0	1	0	0	4	0	0	0	0	2	0	2
J	2	0	0	4	1	1	0	2	7	641	0	0	0	3	2	0	0	0	7	0	0	0	0	2	0	2
K	0	1	3	4	0	0	0	9	0	0	620	0	0	0	0	0	0	23	0	0	2	0	0	8	0	0
L	0	0	2	0	17	0	13	2	0	0	1	636	0	0	0	0	1	5	2	0	0	0	0	4	0	0
M	1	13	0	0	0	0	5	2	0	0	0	0	692	2	0	0	0	1	0	0	0	0	3	0	0	0
N	0	2	0	4	0	0	0	6	0	0	0	0	2	661	14	0	0	7	0	0	0	1	1	0	0	0
O	0	0	0	10	0	0	2	1	0	0	0	0	0	0	644	0	6	5	0	0	3	0	9	0	0	0
P	0	2	0	2	4	28	11	4	0	0	0	0	0	0	2	657	2	1	0	0	0	0	0	0	6	0
Q	0	3	0	0	3	0	2	0	0	0	0	0	0	0	8	0	692	1	1	0	0	0	1	0	0	0
R	1	21	0	7	0	0	2	4	0	0	6	0	0	4	0	0	1	632	0	0	0	0	0	1	0	0
S	0	8	0	0	4	5	1	1	0	0	0	0	0	0	0	0	0	1	654	0	0	0	0	1	0	2
T	0	2	0	1	0	4	3	6	0	0	3	0	0	0	0	0	0	1	0	674	2	0	0	5	0	1
U	0	0	0	0	0	0	0	1	0	0	2	0	5	1	3	0	0	0	0	0	702	4	4	0	0	0
V	1	19	0	0	1	1	1	2	0	0	0	0	1	3	0	0	0	1	0	1	0	654	6	0	2	0
W	0	2	0	0	0	0	2	2	0	0	0	0	8	0	0	0	0	1	0	0	3	0	665	0	0	0
X	0	2	0	5	4	1	0	0	1	1	8	0	0	0	0	0	3	1	0	0	0	0	0	674	1	0
Y	1	0	0	1	0	1	0	1	0	0	0	0	1	0	0	0	0	0	0	1	2	1	0	0	709	0
Z	0	0	0	0	5	0	0	0	0	1	0	0	0	0	0	0	6	1	8	0	0	0	0	1	0	630

# #OR
# round((1-lettersModelSVM@error)*100,2)
round((sum(diag(t1))/sum(t1))*100,2)

## [1] 95.01

# #alternative
# table(lettersTraining$letter==lettersTrPredSVM)

Step 4: Evaluating model performance

lettersPredSVM <- predict(lettersModelSVM, lettersTest)
t2 <- table(lettersTest$letter, lettersPredSVM) #column: predicted; row: actual
t2 %>% kable() %>% kable_styling()

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	75	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	3	0
B	0	67	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0
C	0	0	72	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
D	0	2	0	71	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
E	0	0	3	0	70	0	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2
F	0	1	0	0	2	76	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0
G	0	0	0	1	0	0	76	1	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	0	0	0
H	0	0	0	2	0	0	1	58	0	0	3	0	0	0	0	0	1	1	0	0	1	0	0	0	0	0
I	0	0	0	2	0	3	0	0	69	2	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0
J	0	0	0	2	1	0	0	1	1	66	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0
K	0	0	0	1	0	0	0	0	0	0	62	0	0	0	0	0	0	4	0	0	0	0	0	2	0	0
L	0	1	1	0	2	0	0	1	0	0	0	69	0	0	0	0	0	0	0	0	0	0	0	4	0	0
M	0	0	0	0	0	0	0	1	0	0	0	0	71	0	0	0	0	0	0	0	0	0	1	0	0	0
N	0	1	0	0	0	0	0	0	0	0	1	0	1	78	2	0	0	2	0	0	0	0	0	0	0	0
O	0	0	0	0	0	0	0	0	0	0	0	0	0	0	67	0	3	1	0	0	0	0	2	0	0	0
P	0	2	0	2	0	6	0	0	0	0	0	0	0	0	1	72	1	0	0	0	0	0	0	0	0	0
Q	2	1	0	1	0	0	0	1	0	0	0	0	0	0	2	0	65	0	0	0	0	0	0	0	0	0
R	0	1	0	1	0	0	0	1	0	0	2	0	0	0	0	0	0	74	0	0	0	0	0	0	0	0
S	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0	0	0	1
T	1	0	0	0	0	0	0	3	0	0	0	0	0	0	0	0	0	1	0	88	0	0	0	0	1	0
U	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	89	0	1	0	0	0
V	0	1	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0
W	0	0	0	0	0	0	0	0	0	0	0	0	2	1	0	0	0	0	0	0	0	0	66	0	0	0
X	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	84	0	0
Y	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	1	65	0
Z	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	81

round((sum(diag(t2))/sum(t2))*100,2)

## [1] 93.35

# #alternative
# table(lettersTest$letter==lettersPredSVM)

tryKernel <- function(train, test, kern) {
  model <- ksvm(letter~., data=train, kernel=kern)
  trainAccuracy <- round((1-model@error)*100,2)
  pred <- predict(model, test)
  table <- table(test$letter==pred)
  testAccuracy <- round((table[2]/(table[1]+table[2]))*100,2)
  names(testAccuracy) <- NULL
  return(c(trainAccuracy, testAccuracy))
}

result <- data.frame("Accuracy"=c("Training (%)", "Test (%)"),
                     "Linear"=tryKernel(lettersTraining,lettersTest,kern="vanilladot"),
                     "Polynomial"=tryKernel(lettersTraining,lettersTest,kern="polydot"),
                     "RBF"=tryKernel(lettersTraining,lettersTest,kern="rbfdot"))

##  Setting default kernel parameters  
##  Setting default kernel parameters

result %>% kable() %>% kable_styling(full_width=F)

Accuracy	Linear	Polynomial	RBF
Training (%)	86.65	86.65	94.93
Test (%)	83.95	84.00	93.35

4. Evaluating News Popularity

Question 5: Do you see any improvement compared to last three techniques? Please completely explain your results and analysis.

Answer 5: They are shown as below: the accuracy rates from the confusion matrix on the training and the test set by the seven different models. The best model is random forest according to the accuracy on the test set. In general, decision tree, random forest, regression trees, and support vector machine models perform the similar accuracy around 60%.

	Decision Tree	Random Forest	Regression Trees	Naïve Bayes Classifier	Linear SVM	Polynomial SVM	Radial Basis SVM
Training set	61.85%	99.60%	56.32%	47.82%	56.73%	56.72%	61.80%
Test set	59.11%	60.25%	56.37%	47.22%	57.51%	57.51%	59.34%

Step 1: Collecting the data
This step is the same as Lab 1’s Tab 4 Step 1.

Step 2: Pre-processing
This step is the same as Lab 1’s Tab 4 Step 2.

Step 3: Modeling and evaluation

#Naïve Bayes Classifier model
newsModelNB <- naive_bayes(popular~., data=newsTraining)
newsModelNB

## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = popular ~ ., data = newsTraining)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##         0         1 
## 0.4677631 0.5322369 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: n_tokens_title (Gaussian) 
## --------------------------------------------------------------------------------- 
##               
## n_tokens_title         0         1
##           mean 10.494967 10.301232
##           sd    2.109034  2.131500
## 
## --------------------------------------------------------------------------------- 
##  ::: n_tokens_content (Gaussian) 
## --------------------------------------------------------------------------------- 
##                 
## n_tokens_content        0        1
##             mean 531.2408 562.4753
##             sd   432.2289 511.4219
## 
## --------------------------------------------------------------------------------- 
##  ::: n_unique_tokens (Gaussian) 
## --------------------------------------------------------------------------------- 
##                
## n_unique_tokens         0         1
##            mean 0.5367565 0.5687350
##            sd   0.1319017 5.5700879
## 
## --------------------------------------------------------------------------------- 
##  ::: n_non_stop_words (Gaussian) 
## --------------------------------------------------------------------------------- 
##                 
## n_non_stop_words         0         1
##             mean 0.9738280 1.0325434
##             sd   0.1596523 8.2774127
## 
## --------------------------------------------------------------------------------- 
##  ::: num_hrefs (Gaussian) 
## --------------------------------------------------------------------------------- 
##          
## num_hrefs         0         1
##      mean  9.774590 11.779147
##      sd    9.671197 12.413653
## 
## ---------------------------------------------------------------------------------
## 
## # ... and 11 more tables
## 
## ---------------------------------------------------------------------------------

#accuracy on the training set
newsTrPredNB <- predict(newsModelNB, newsTraining, type="class")
CrossTable(newsTraining$popular, newsTrPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  29733 
## 
##  
##              | Predicted 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |     13425 |       483 |     13908 | 
##              |     0.452 |     0.016 |           | 
## -------------|-----------|-----------|-----------|
##            1 |     15032 |       793 |     15825 | 
##              |     0.506 |     0.027 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |     28457 |      1276 |     29733 | 
## -------------|-----------|-----------|-----------|
## 
##

#accuracy on the test set
newsPredNB <- predict(newsModelNB, newsTest, type="class")
CrossTable(newsTest$popular, newsPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  9911 
## 
##  
##              | Predicted 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |      4423 |       159 |      4582 | 
##              |     0.446 |     0.016 |           | 
## -------------|-----------|-----------|-----------|
##            1 |      5072 |       257 |      5329 | 
##              |     0.512 |     0.026 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      9495 |       416 |      9911 | 
## -------------|-----------|-----------|-----------|
## 
##

#Support Vector Machine model
tryKernel2 <- function(train, test, kern) {
  model <- ksvm(popular~., data=train, kernel=kern)
  trainAccuracy <- round((1-model@error)*100,2)
  pred <- predict(model, test)
  table <- table(test$popular==pred)
  testAccuracy <- round((table[2]/(table[1]+table[2]))*100,2)
  names(testAccuracy) <- NULL
  return(c(trainAccuracy, testAccuracy))
}
result2 <- data.frame("Accuracy"=c("Training (%)", "Test (%)"),
                     "Linear"=tryKernel2(newsTraining,newsTest,kern="vanilladot"),
                     "Polynomial"=tryKernel2(newsTraining,newsTest,kern="polydot"),
                     "RBF"=tryKernel2(newsTraining,newsTest,kern="rbfdot")) #about 5min runtime

##  Setting default kernel parameters  
##  Setting default kernel parameters

result2 %>% kable() %>% kable_styling(full_width=F)

Accuracy	Linear	Polynomial	RBF
Training (%)	56.73	56.72	61.80
Test (%)	57.51	57.51	59.37

ANLY 530 - Lab 2

Classification Models (continued)

Zhengxiao Wei

2019-12-22

1. Naïve Bayes Classifier

2. Pre-processing

3. Support Vector Machine

4. Evaluating News Popularity

Variable	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20
1	⬤	◯	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯
2	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤
3	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	◯	◯
4	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
5	◯	⬤	⬤	⬤	⬤	◯	◯	⬤	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	◯
6	⬤	◯	◯	⬤	◯	⬤	⬤	◯	◯	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯
7	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
8	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯	⬤
9	◯	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
10	⬤	◯	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	◯	◯	◯	⬤	◯	◯	⬤
11	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	◯
12	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
13	⬤	◯	⬤	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
14	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	⬤	◯	⬤	⬤	⬤	⬤	◯	◯	◯
15	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
16	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	◯	◯
17	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
18	◯	⬤	⬤	⬤	◯	◯	⬤	◯	⬤	◯	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤
19	◯	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤
20	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	◯	⬤	◯	⬤	⬤	⬤	⬤

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	701	0	2	1	0	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0	0	0	0	2	0
B	1	669	0	6	4	0	0	3	0	0	0	0	0	0	0	0	0	8	2	1	0	0	0	1	0	0
C	0	0	619	0	12	0	13	1	0	0	3	0	1	0	9	0	0	0	0	0	1	2	0	0	0	0
D	0	6	0	701	0	0	0	10	0	0	0	0	0	5	3	0	0	5	0	0	1	0	0	0	0	0
E	0	6	0	0	648	1	24	0	0	0	1	0	0	0	0	0	2	3	2	0	0	0	0	0	0	3
F	0	3	0	2	1	671	1	1	1	0	0	0	0	0	0	2	0	0	2	9	0	0	0	1	1	0
G	0	0	2	11	2	1	654	1	0	0	1	1	0	0	8	0	4	4	0	0	0	1	2	0	0	0
H	0	13	0	22	0	0	8	571	0	0	7	0	1	1	4	0	4	31	0	0	3	0	0	1	1	0
I	1	1	3	4	0	7	0	0	630	23	0	0	0	0	0	1	0	0	4	0	0	0	0	2	0	2
J	2	0	0	4	1	1	0	2	7	641	0	0	0	3	2	0	0	0	7	0	0	0	0	2	0	2
K	0	1	3	4	0	0	0	9	0	0	620	0	0	0	0	0	0	23	0	0	2	0	0	8	0	0
L	0	0	2	0	17	0	13	2	0	0	1	636	0	0	0	0	1	5	2	0	0	0	0	4	0	0
M	1	13	0	0	0	0	5	2	0	0	0	0	692	2	0	0	0	1	0	0	0	0	3	0	0	0
N	0	2	0	4	0	0	0	6	0	0	0	0	2	661	14	0	0	7	0	0	0	1	1	0	0	0
O	0	0	0	10	0	0	2	1	0	0	0	0	0	0	644	0	6	5	0	0	3	0	9	0	0	0
P	0	2	0	2	4	28	11	4	0	0	0	0	0	0	2	657	2	1	0	0	0	0	0	0	6	0
Q	0	3	0	0	3	0	2	0	0	0	0	0	0	0	8	0	692	1	1	0	0	0	1	0	0	0
R	1	21	0	7	0	0	2	4	0	0	6	0	0	4	0	0	1	632	0	0	0	0	0	1	0	0
S	0	8	0	0	4	5	1	1	0	0	0	0	0	0	0	0	0	1	654	0	0	0	0	1	0	2
T	0	2	0	1	0	4	3	6	0	0	3	0	0	0	0	0	0	1	0	674	2	0	0	5	0	1
U	0	0	0	0	0	0	0	1	0	0	2	0	5	1	3	0	0	0	0	0	702	4	4	0	0	0
V	1	19	0	0	1	1	1	2	0	0	0	0	1	3	0	0	0	1	0	1	0	654	6	0	2	0
W	0	2	0	0	0	0	2	2	0	0	0	0	8	0	0	0	0	1	0	0	3	0	665	0	0	0
X	0	2	0	5	4	1	0	0	1	1	8	0	0	0	0	0	3	1	0	0	0	0	0	674	1	0
Y	1	0	0	1	0	1	0	1	0	0	0	0	1	0	0	0	0	0	0	1	2	1	0	0	709	0
Z	0	0	0	0	5	0	0	0	0	1	0	0	0	0	0	0	6	1	8	0	0	0	0	1	0	630

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	75	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	3	0
B	0	67	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0
C	0	0	72	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
D	0	2	0	71	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
E	0	0	3	0	70	0	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2
F	0	1	0	0	2	76	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0
G	0	0	0	1	0	0	76	1	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	0	0	0
H	0	0	0	2	0	0	1	58	0	0	3	0	0	0	0	0	1	1	0	0	1	0	0	0	0	0
I	0	0	0	2	0	3	0	0	69	2	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0
J	0	0	0	2	1	0	0	1	1	66	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0
K	0	0	0	1	0	0	0	0	0	0	62	0	0	0	0	0	0	4	0	0	0	0	0	2	0	0
L	0	1	1	0	2	0	0	1	0	0	0	69	0	0	0	0	0	0	0	0	0	0	0	4	0	0
M	0	0	0	0	0	0	0	1	0	0	0	0	71	0	0	0	0	0	0	0	0	0	1	0	0	0
N	0	1	0	0	0	0	0	0	0	0	1	0	1	78	2	0	0	2	0	0	0	0	0	0	0	0
O	0	0	0	0	0	0	0	0	0	0	0	0	0	0	67	0	3	1	0	0	0	0	2	0	0	0
P	0	2	0	2	0	6	0	0	0	0	0	0	0	0	1	72	1	0	0	0	0	0	0	0	0	0
Q	2	1	0	1	0	0	0	1	0	0	0	0	0	0	2	0	65	0	0	0	0	0	0	0	0	0
R	0	1	0	1	0	0	0	1	0	0	2	0	0	0	0	0	0	74	0	0	0	0	0	0	0	0
S	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0	0	0	1
T	1	0	0	0	0	0	0	3	0	0	0	0	0	0	0	0	0	1	0	88	0	0	0	0	1	0
U	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	89	0	1	0	0	0
V	0	1	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0
W	0	0	0	0	0	0	0	0	0	0	0	0	2	1	0	0	0	0	0	0	0	0	66	0	0	0
X	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	84	0	0
Y	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	1	65	0
Z	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	81

Variable	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20
1	⬤	◯	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯
2	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤
3	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	◯	◯
4	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
5	◯	⬤	⬤	⬤	⬤	◯	◯	⬤	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	◯
6	⬤	◯	◯	⬤	◯	⬤	⬤	◯	◯	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯
7	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
8	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯	⬤
9	◯	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
10	⬤	◯	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	◯	◯	◯	⬤	◯	◯	⬤
11	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	◯
12	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
13	⬤	◯	⬤	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
14	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	⬤	◯	⬤	⬤	⬤	⬤	◯	◯	◯
15	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
16	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	◯	◯
17	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
18	◯	⬤	⬤	⬤	◯	◯	⬤	◯	⬤	◯	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤
19	◯	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤
20	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	◯	⬤	◯	⬤	⬤	⬤	⬤

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	701	0	2	1	0	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0	0	0	0	2	0
B	1	669	0	6	4	0	0	3	0	0	0	0	0	0	0	0	0	8	2	1	0	0	0	1	0	0
C	0	0	619	0	12	0	13	1	0	0	3	0	1	0	9	0	0	0	0	0	1	2	0	0	0	0
D	0	6	0	701	0	0	0	10	0	0	0	0	0	5	3	0	0	5	0	0	1	0	0	0	0	0
E	0	6	0	0	648	1	24	0	0	0	1	0	0	0	0	0	2	3	2	0	0	0	0	0	0	3
F	0	3	0	2	1	671	1	1	1	0	0	0	0	0	0	2	0	0	2	9	0	0	0	1	1	0
G	0	0	2	11	2	1	654	1	0	0	1	1	0	0	8	0	4	4	0	0	0	1	2	0	0	0
H	0	13	0	22	0	0	8	571	0	0	7	0	1	1	4	0	4	31	0	0	3	0	0	1	1	0
I	1	1	3	4	0	7	0	0	630	23	0	0	0	0	0	1	0	0	4	0	0	0	0	2	0	2
J	2	0	0	4	1	1	0	2	7	641	0	0	0	3	2	0	0	0	7	0	0	0	0	2	0	2
K	0	1	3	4	0	0	0	9	0	0	620	0	0	0	0	0	0	23	0	0	2	0	0	8	0	0
L	0	0	2	0	17	0	13	2	0	0	1	636	0	0	0	0	1	5	2	0	0	0	0	4	0	0
M	1	13	0	0	0	0	5	2	0	0	0	0	692	2	0	0	0	1	0	0	0	0	3	0	0	0
N	0	2	0	4	0	0	0	6	0	0	0	0	2	661	14	0	0	7	0	0	0	1	1	0	0	0
O	0	0	0	10	0	0	2	1	0	0	0	0	0	0	644	0	6	5	0	0	3	0	9	0	0	0
P	0	2	0	2	4	28	11	4	0	0	0	0	0	0	2	657	2	1	0	0	0	0	0	0	6	0
Q	0	3	0	0	3	0	2	0	0	0	0	0	0	0	8	0	692	1	1	0	0	0	1	0	0	0
R	1	21	0	7	0	0	2	4	0	0	6	0	0	4	0	0	1	632	0	0	0	0	0	1	0	0
S	0	8	0	0	4	5	1	1	0	0	0	0	0	0	0	0	0	1	654	0	0	0	0	1	0	2
T	0	2	0	1	0	4	3	6	0	0	3	0	0	0	0	0	0	1	0	674	2	0	0	5	0	1
U	0	0	0	0	0	0	0	1	0	0	2	0	5	1	3	0	0	0	0	0	702	4	4	0	0	0
V	1	19	0	0	1	1	1	2	0	0	0	0	1	3	0	0	0	1	0	1	0	654	6	0	2	0
W	0	2	0	0	0	0	2	2	0	0	0	0	8	0	0	0	0	1	0	0	3	0	665	0	0	0
X	0	2	0	5	4	1	0	0	1	1	8	0	0	0	0	0	3	1	0	0	0	0	0	674	1	0
Y	1	0	0	1	0	1	0	1	0	0	0	0	1	0	0	0	0	0	0	1	2	1	0	0	709	0
Z	0	0	0	0	5	0	0	0	0	1	0	0	0	0	0	0	6	1	8	0	0	0	0	1	0	630

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	75	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	3	0
B	0	67	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0
C	0	0	72	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
D	0	2	0	71	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
E	0	0	3	0	70	0	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2
F	0	1	0	0	2	76	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0
G	0	0	0	1	0	0	76	1	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	0	0	0
H	0	0	0	2	0	0	1	58	0	0	3	0	0	0	0	0	1	1	0	0	1	0	0	0	0	0
I	0	0	0	2	0	3	0	0	69	2	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0
J	0	0	0	2	1	0	0	1	1	66	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0
K	0	0	0	1	0	0	0	0	0	0	62	0	0	0	0	0	0	4	0	0	0	0	0	2	0	0
L	0	1	1	0	2	0	0	1	0	0	0	69	0	0	0	0	0	0	0	0	0	0	0	4	0	0
M	0	0	0	0	0	0	0	1	0	0	0	0	71	0	0	0	0	0	0	0	0	0	1	0	0	0
N	0	1	0	0	0	0	0	0	0	0	1	0	1	78	2	0	0	2	0	0	0	0	0	0	0	0
O	0	0	0	0	0	0	0	0	0	0	0	0	0	0	67	0	3	1	0	0	0	0	2	0	0	0
P	0	2	0	2	0	6	0	0	0	0	0	0	0	0	1	72	1	0	0	0	0	0	0	0	0	0
Q	2	1	0	1	0	0	0	1	0	0	0	0	0	0	2	0	65	0	0	0	0	0	0	0	0	0
R	0	1	0	1	0	0	0	1	0	0	2	0	0	0	0	0	0	74	0	0	0	0	0	0	0	0
S	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0	0	0	1
T	1	0	0	0	0	0	0	3	0	0	0	0	0	0	0	0	0	1	0	88	0	0	0	0	1	0
U	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	89	0	1	0	0	0
V	0	1	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0
W	0	0	0	0	0	0	0	0	0	0	0	0	2	1	0	0	0	0	0	0	0	0	66	0	0	0
X	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	84	0	0
Y	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	1	65	0
Z	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	81

Variable	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20
1	⬤	◯	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯
2	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤
3	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	◯	◯	⬤	⬤	⬤	⬤	◯	⬤	◯	◯
4	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
5	◯	⬤	⬤	⬤	⬤	◯	◯	⬤	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	◯
6	⬤	◯	◯	⬤	◯	⬤	⬤	◯	◯	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯
7	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
8	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	◯	◯	◯	⬤
9	◯	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
10	⬤	◯	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	◯	◯	◯	⬤	◯	◯	⬤
11	⬤	◯	◯	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	◯	◯	⬤	◯
12	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤
13	⬤	◯	⬤	⬤	◯	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	◯
14	◯	⬤	⬤	⬤	◯	◯	◯	◯	◯	◯	◯	⬤	◯	⬤	⬤	⬤	⬤	◯	◯	◯
15	⬤	⬤	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
16	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	◯	◯
17	⬤	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤	⬤
18	◯	⬤	⬤	⬤	◯	◯	⬤	◯	⬤	◯	◯	⬤	⬤	◯	⬤	⬤	⬤	⬤	◯	⬤
19	◯	⬤	◯	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤	⬤	◯	⬤	◯	⬤	◯	⬤	⬤
20	◯	⬤	◯	⬤	◯	◯	◯	⬤	◯	⬤	◯	⬤	◯	◯	⬤	◯	⬤	⬤	⬤	⬤

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	701	0	2	1	0	0	0	0	0	0	0	0	4	0	0	0	0	0	0	0	0	0	0	0	2	0
B	1	669	0	6	4	0	0	3	0	0	0	0	0	0	0	0	0	8	2	1	0	0	0	1	0	0
C	0	0	619	0	12	0	13	1	0	0	3	0	1	0	9	0	0	0	0	0	1	2	0	0	0	0
D	0	6	0	701	0	0	0	10	0	0	0	0	0	5	3	0	0	5	0	0	1	0	0	0	0	0
E	0	6	0	0	648	1	24	0	0	0	1	0	0	0	0	0	2	3	2	0	0	0	0	0	0	3
F	0	3	0	2	1	671	1	1	1	0	0	0	0	0	0	2	0	0	2	9	0	0	0	1	1	0
G	0	0	2	11	2	1	654	1	0	0	1	1	0	0	8	0	4	4	0	0	0	1	2	0	0	0
H	0	13	0	22	0	0	8	571	0	0	7	0	1	1	4	0	4	31	0	0	3	0	0	1	1	0
I	1	1	3	4	0	7	0	0	630	23	0	0	0	0	0	1	0	0	4	0	0	0	0	2	0	2
J	2	0	0	4	1	1	0	2	7	641	0	0	0	3	2	0	0	0	7	0	0	0	0	2	0	2
K	0	1	3	4	0	0	0	9	0	0	620	0	0	0	0	0	0	23	0	0	2	0	0	8	0	0
L	0	0	2	0	17	0	13	2	0	0	1	636	0	0	0	0	1	5	2	0	0	0	0	4	0	0
M	1	13	0	0	0	0	5	2	0	0	0	0	692	2	0	0	0	1	0	0	0	0	3	0	0	0
N	0	2	0	4	0	0	0	6	0	0	0	0	2	661	14	0	0	7	0	0	0	1	1	0	0	0
O	0	0	0	10	0	0	2	1	0	0	0	0	0	0	644	0	6	5	0	0	3	0	9	0	0	0
P	0	2	0	2	4	28	11	4	0	0	0	0	0	0	2	657	2	1	0	0	0	0	0	0	6	0
Q	0	3	0	0	3	0	2	0	0	0	0	0	0	0	8	0	692	1	1	0	0	0	1	0	0	0
R	1	21	0	7	0	0	2	4	0	0	6	0	0	4	0	0	1	632	0	0	0	0	0	1	0	0
S	0	8	0	0	4	5	1	1	0	0	0	0	0	0	0	0	0	1	654	0	0	0	0	1	0	2
T	0	2	0	1	0	4	3	6	0	0	3	0	0	0	0	0	0	1	0	674	2	0	0	5	0	1
U	0	0	0	0	0	0	0	1	0	0	2	0	5	1	3	0	0	0	0	0	702	4	4	0	0	0
V	1	19	0	0	1	1	1	2	0	0	0	0	1	3	0	0	0	1	0	1	0	654	6	0	2	0
W	0	2	0	0	0	0	2	2	0	0	0	0	8	0	0	0	0	1	0	0	3	0	665	0	0	0
X	0	2	0	5	4	1	0	0	1	1	8	0	0	0	0	0	3	1	0	0	0	0	0	674	1	0
Y	1	0	0	1	0	1	0	1	0	0	0	0	1	0	0	0	0	0	0	1	2	1	0	0	709	0
Z	0	0	0	0	5	0	0	0	0	1	0	0	0	0	0	0	6	1	8	0	0	0	0	1	0	630

	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z
A	75	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	3	0
B	0	67	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0
C	0	0	72	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	1	0	0	0
D	0	2	0	71	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
E	0	0	3	0	70	0	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2
F	0	1	0	0	2	76	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0
G	0	0	0	1	0	0	76	1	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	0	0	0
H	0	0	0	2	0	0	1	58	0	0	3	0	0	0	0	0	1	1	0	0	1	0	0	0	0	0
I	0	0	0	2	0	3	0	0	69	2	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0
J	0	0	0	2	1	0	0	1	1	66	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0
K	0	0	0	1	0	0	0	0	0	0	62	0	0	0	0	0	0	4	0	0	0	0	0	2	0	0
L	0	1	1	0	2	0	0	1	0	0	0	69	0	0	0	0	0	0	0	0	0	0	0	4	0	0
M	0	0	0	0	0	0	0	1	0	0	0	0	71	0	0	0	0	0	0	0	0	0	1	0	0	0
N	0	1	0	0	0	0	0	0	0	0	1	0	1	78	2	0	0	2	0	0	0	0	0	0	0	0
O	0	0	0	0	0	0	0	0	0	0	0	0	0	0	67	0	3	1	0	0	0	0	2	0	0	0
P	0	2	0	2	0	6	0	0	0	0	0	0	0	0	1	72	1	0	0	0	0	0	0	0	0	0
Q	2	1	0	1	0	0	0	1	0	0	0	0	0	0	2	0	65	0	0	0	0	0	0	0	0	0
R	0	1	0	1	0	0	0	1	0	0	2	0	0	0	0	0	0	74	0	0	0	0	0	0	0	0
S	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0	0	0	1
T	1	0	0	0	0	0	0	3	0	0	0	0	0	0	0	0	0	1	0	88	0	0	0	0	1	0
U	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	89	0	1	0	0	0
V	0	1	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	68	0	0	0	0
W	0	0	0	0	0	0	0	0	0	0	0	0	2	1	0	0	0	0	0	0	0	0	66	0	0	0
X	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	84	0	0
Y	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	1	65	0
Z	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	81