Load the libraries + functions

library(Rling)
library(party)
library(dplyr)
library(skimr)
library(DataExplorer)
# Load the `nerd` data set into the workspace and preview its first six rows.
data("nerd")
head(nerd, n = 6L)
##   Noun Num Century Register    Eval
## 1 nerd  pl      XX     ACAD Neutral
## 2 nerd  pl     XXI     ACAD Neutral
## 3 nerd  pl      XX     ACAD Neutral
## 4 nerd  pl      XX     ACAD Neutral
## 5 nerd  pl      XX     ACAD Neutral
## 6 nerd  pl     XXI     ACAD Neutral

Description of the data

Dependent variable: Noun (geek vs. nerd)

Independent variables: Num, Century, Register, Eval

# Count the observations in each level of every column.
summary(nerd)
##    Noun     Num      Century   Register         Eval    
##  geek:670   pl:436   XX :515   ACAD: 54   Neg     :132  
##  nerd:646   sg:880   XXI:801   MAG :622   Neutral :896  
##                                NEWS:343   Positive:288  
##                                SPOK:297
# Number of rows and columns in the data set.
dim(nerd)
## [1] 1316    5
# Compact overview: each column's type and its first few values.
glimpse(nerd)
## Rows: 1,316
## Columns: 5
## $ Noun     <fct> nerd, nerd, nerd, nerd, nerd, nerd, nerd, nerd, nerd, nerd...
## $ Num      <fct> pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl...
## $ Century  <fct> XX, XXI, XX, XX, XX, XXI, XXI, XX, XXI, XX, XXI, XX, XXI, ...
## $ Register <fct> ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD...
## $ Eval     <fct> Neutral, Neutral, Neutral, Neutral, Neutral, Neutral, Neut...
# Per-column summary: missing values, number of unique levels, top counts.
skim(nerd)
Data summary
Name nerd
Number of rows 1316
Number of columns 5
_______________________
Column type frequency:
factor 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Noun 0 1 FALSE 2 gee: 670, ner: 646
Num 0 1 FALSE 2 sg: 880, pl: 436
Century 0 1 FALSE 2 XXI: 801, XX: 515
Register 0 1 FALSE 4 MAG: 622, NEW: 343, SPO: 297, ACA: 54
Eval 0 1 FALSE 3 Neu: 896, Pos: 288, Neg: 132

Conditional inference model

# Fix the RNG seed, then fit a single conditional inference tree that
# predicts the noun (geek vs. nerd) from the four other columns.
set.seed(123456)
cimodel <- ctree(
  Noun ~ Num + Century + Register + Eval,
  data = nerd
)

Make a plot

# Draw the fitted conditional inference tree.
plot(cimodel)

Interpret the categories

Write out an interpretation of the results from the model. You can interpret the branches of the tree to determine which features define each category.

  • The first split is on the levels of Eval.
  • The next split is on Century.
  • In node 3 there are 194 observations; about 10% of them are nerd and the rest are geek. The other nodes can be interpreted in the same way.
  • For the twentieth century, there are more negative or neutral observations of the word nerd.

Conditional inference model predictiveness

# Cross-tabulate the tree's predicted nouns (rows) against the observed
# nouns (columns), then print the table.
predictiveness <- table(predict(cimodel), nerd[["Noun"]])
predictiveness
##       
##        geek nerd
##   geek  227   61
##   nerd  443  585
# Accuracy in percent: matching cells (the diagonal) over all observations.
sum(diag(predictiveness)) / sum(predictiveness) * 100
## [1] 61.70213

Random forests

# Grow a conditional random forest on the same formula as the single tree,
# using ntree = 1000 trees and mtry = 2.
forest <- cforest(
  Noun ~ Num + Century + Register + Eval,
  data = nerd,
  controls = cforest_unbiased(ntree = 1000, mtry = 2)
)

Variable importance

Which variables were the most important?

  • Eval and Century are the most important variables.
# Compute the forest's variable importance with `conditional = TRUE`, then
# print it rounded to three decimals.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved constant.
variable_importance <- varimp(forest, conditional = TRUE)
round(variable_importance, 3)
##      Num  Century Register     Eval 
##   -0.002    0.024   -0.002    0.055
# Dot chart of the importance scores, sorted in ascending order.
dotchart(
  sort(variable_importance),
  main = "Importance of variables"
)

Forest model predictiveness

Did it do better than the conditional inference tree?

  • The random forest model is slightly better than the tree model (63.4% vs. 61.7% accuracy).
# Cross-tabulate the forest's predicted nouns (rows) against the observed
# nouns (columns), then print the table.
predictions <- table(predict(forest), nerd[["Noun"]])
predictions
##       
##        geek nerd
##   geek  325  136
##   nerd  345  510
# Accuracy in percent: matching cells (the diagonal) over all observations.
sum(diag(predictions)) / sum(predictions) * 100
## [1] 63.44985
library(reticulate)
# Report which Python installation reticulate is using.
py_config()
## python:         C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python.exe
## libpython:      C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python36.dll
## pythonhome:     C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate
## version:        3.6.11 (default, Aug  5 2020, 19:41:03) [MSC v.1916 64 bit (AMD64)]
## Architecture:   64bit
## numpy:          C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/Lib/site-packages/numpy
## numpy_version:  1.19.1
# Check that scikit-learn can be imported from that Python environment.
py_module_available("sklearn")
## [1] TRUE

Python model

import sklearn
from sklearn import tree
import pandas as pd

# One-hot encode the four categorical predictors taken from the R data frame.
predictors = r.nerd[["Num", "Century", "Register", "Eval"]]
data = pd.get_dummies(predictors)
data.head()
##    Num_pl  Num_sg  Century_XX  ...  Eval_Neg  Eval_Neutral  Eval_Positive
## 0       1       0           1  ...         0             1              0
## 1       1       0           0  ...         0             1              0
## 2       1       0           1  ...         0             1              0
## 3       1       0           1  ...         0             1              0
## 4       1       0           1  ...         0             1              0
## 
## [5 rows x 11 columns]
# One-hot encode the outcome: one indicator column per noun.
noun_labels = r.nerd["Noun"]
nouns = pd.get_dummies(noun_labels)
nouns.head()
##    geek  nerd
## 0     0     1
## 1     0     1
## 2     0     1
## 3     0     1
## 4     0     1

Create the Tree

# Build a decision tree with default settings and fit it on the dummy-coded
# predictors, using the one-hot noun columns as the target.
decision_tree = tree.DecisionTreeClassifier().fit(data, nouns)

Printing out the Tree

# Render the fitted tree's split rules as indented text, labelling each
# split with the corresponding dummy column name.
tree_rules = tree.export_text(decision_tree, feature_names=list(data.columns))
print(tree_rules)
## |--- Eval_Positive <= 0.50
## |   |--- Century_XXI <= 0.50
## |   |   |--- Num_pl <= 0.50
## |   |   |   |--- Register_ACAD <= 0.50
## |   |   |   |   |--- Register_MAG <= 0.50
## |   |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |   |--- Eval_Neg <= 0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |   |--- Eval_Neg >  0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |   |--- Eval_Neg <= 0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |   |--- Eval_Neg >  0.50
## |   |   |   |   |   |   |   |--- class: 1
## |   |   |   |   |--- Register_MAG >  0.50
## |   |   |   |   |   |--- Eval_Neg <= 0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Eval_Neg >  0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |--- Register_ACAD >  0.50
## |   |   |   |   |--- Eval_Neutral <= 0.50
## |   |   |   |   |   |--- class: 0
## |   |   |   |   |--- Eval_Neutral >  0.50
## |   |   |   |   |   |--- class: 0
## |   |   |--- Num_pl >  0.50
## |   |   |   |--- Register_ACAD <= 0.50
## |   |   |   |   |--- Register_MAG <= 0.50
## |   |   |   |   |   |--- Eval_Neutral <= 0.50
## |   |   |   |   |   |   |--- Register_NEWS <= 0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |   |--- Register_NEWS >  0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Eval_Neutral >  0.50
## |   |   |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |--- Register_MAG >  0.50
## |   |   |   |   |   |--- Eval_Neutral <= 0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Eval_Neutral >  0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |--- Register_ACAD >  0.50
## |   |   |   |   |--- class: 0
## |   |--- Century_XXI >  0.50
## |   |   |--- Num_pl <= 0.50
## |   |   |   |--- Register_ACAD <= 0.50
## |   |   |   |   |--- Register_MAG <= 0.50
## |   |   |   |   |   |--- Eval_Neg <= 0.50
## |   |   |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |   |   |--- class: 1
## |   |   |   |   |   |--- Eval_Neg >  0.50
## |   |   |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |   |   |--- class: 1
## |   |   |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |--- Register_MAG >  0.50
## |   |   |   |   |   |--- Eval_Neutral <= 0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Eval_Neutral >  0.50
## |   |   |   |   |   |   |--- class: 1
## |   |   |   |--- Register_ACAD >  0.50
## |   |   |   |   |--- Eval_Neutral <= 0.50
## |   |   |   |   |   |--- class: 1
## |   |   |   |   |--- Eval_Neutral >  0.50
## |   |   |   |   |   |--- class: 0
## |   |   |--- Num_pl >  0.50
## |   |   |   |--- Register_MAG <= 0.50
## |   |   |   |   |--- Eval_Neutral <= 0.50
## |   |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |   |--- Register_ACAD <= 0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |   |--- Register_ACAD >  0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |--- Eval_Neutral >  0.50
## |   |   |   |   |   |--- Register_ACAD <= 0.50
## |   |   |   |   |   |   |--- Register_NEWS <= 0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |   |--- Register_NEWS >  0.50
## |   |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Register_ACAD >  0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |--- Register_MAG >  0.50
## |   |   |   |   |--- Eval_Neutral <= 0.50
## |   |   |   |   |   |--- class: 1
## |   |   |   |   |--- Eval_Neutral >  0.50
## |   |   |   |   |   |--- class: 0
## |--- Eval_Positive >  0.50
## |   |--- Century_XXI <= 0.50
## |   |   |--- Register_MAG <= 0.50
## |   |   |   |--- Register_ACAD <= 0.50
## |   |   |   |   |--- Num_sg <= 0.50
## |   |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |--- Num_sg >  0.50
## |   |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |   |--- class: 0
## |   |   |   |--- Register_ACAD >  0.50
## |   |   |   |   |--- class: 0
## |   |   |--- Register_MAG >  0.50
## |   |   |   |--- Num_sg <= 0.50
## |   |   |   |   |--- class: 1
## |   |   |   |--- Num_sg >  0.50
## |   |   |   |   |--- class: 1
## |   |--- Century_XXI >  0.50
## |   |   |--- Register_MAG <= 0.50
## |   |   |   |--- Register_ACAD <= 0.50
## |   |   |   |   |--- Register_SPOK <= 0.50
## |   |   |   |   |   |--- Num_sg <= 0.50
## |   |   |   |   |   |   |--- class: 1
## |   |   |   |   |   |--- Num_sg >  0.50
## |   |   |   |   |   |   |--- class: 1
## |   |   |   |   |--- Register_SPOK >  0.50
## |   |   |   |   |   |--- Num_pl <= 0.50
## |   |   |   |   |   |   |--- class: 1
## |   |   |   |   |   |--- Num_pl >  0.50
## |   |   |   |   |   |   |--- class: 1
## |   |   |   |--- Register_ACAD >  0.50
## |   |   |   |   |--- class: 1
## |   |   |--- Register_MAG >  0.50
## |   |   |   |--- Num_sg <= 0.50
## |   |   |   |   |--- class: 1
## |   |   |   |--- Num_sg >  0.50
## |   |   |   |   |--- class: 1

Confusion Matrix

# `import sklearn` alone does not guarantee that the `metrics` submodule is
# loaded, so import it explicitly before using `sklearn.metrics`.
import sklearn.metrics

# Predict the one-hot noun columns for the training data and give the
# prediction frame the same column names as the target.
prediction = pd.DataFrame(decision_tree.predict(data))
prediction.columns = list(nouns.columns)

# Collapse each one-hot row to a single label: the name of the column that
# holds the row's largest value.
prediction_category = prediction.idxmax(axis = 1)
variable_category = nouns.idxmax(axis = 1)

# NOTE: confusion_matrix's signature is (y_true, y_pred). The predictions are
# passed first here, so rows are predicted labels and columns are observed
# labels (in the order nerd, geek) -- the same orientation as the R tables
# built with table(predict(...), nerd$Noun).
sklearn.metrics.confusion_matrix(prediction_category,variable_category,labels=["nerd","geek"])
## array([[439, 269],
##        [207, 401]], dtype=int64)

Thought questions

Are the models easier to create using R or Python (your own thoughts, they can be different than what I said in the lecture)?

  • It is easier to create the models in R, because the visualizations are built in, whereas Python requires creating dummy variables first.

Which model gave you a better classification of the categories?

  • Random forest model

What other variables might be useful in understanding the category membership of geek versus nerd? Basically, what could we add to the model to improve it (there’s no one right answer here - it’s helpful to think about what other facets we have not measured)?

  • Gender, age, and location would be helpful for gaining more insight. An explanation of the Eval column would also help in understanding the context of the dataset.