library(Rling)
library(party)
library(dplyr)
library(skimr)
library(DataExplorer)
# Load the `nerd` data set (presumably shipped with the Rling package loaded
# above -- confirm) and preview its first rows.
data(nerd)
head(nerd)
## Noun Num Century Register Eval
## 1 nerd pl XX ACAD Neutral
## 2 nerd pl XXI ACAD Neutral
## 3 nerd pl XX ACAD Neutral
## 4 nerd pl XX ACAD Neutral
## 5 nerd pl XX ACAD Neutral
## 6 nerd pl XXI ACAD Neutral
Dependent variable: Noun (geek vs. nerd)
Independent variables: Num, Century, Register, Eval
# Per-column summary; every column is a factor, so this prints level counts.
summary(nerd)
## Noun Num Century Register Eval
## geek:670 pl:436 XX :515 ACAD: 54 Neg :132
## nerd:646 sg:880 XXI:801 MAG :622 Neutral :896
## NEWS:343 Positive:288
## SPOK:297
# Number of rows and columns in the data set.
dim(nerd)
## [1] 1316 5
# Compact column-wise overview showing each column's type and first values
# (glimpse() is presumably the one made available by dplyr, loaded above).
glimpse(nerd)
## Rows: 1,316
## Columns: 5
## $ Noun <fct> nerd, nerd, nerd, nerd, nerd, nerd, nerd, nerd, nerd, nerd...
## $ Num <fct> pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl, pl...
## $ Century <fct> XX, XXI, XX, XX, XX, XXI, XXI, XX, XXI, XX, XXI, XX, XXI, ...
## $ Register <fct> ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD, ACAD...
## $ Eval <fct> Neutral, Neutral, Neutral, Neutral, Neutral, Neutral, Neut...
# Summary table with missing-value counts, completeness and the most frequent
# levels per column (skim() is presumably from skimr, loaded above).
skim(nerd)
| Name | nerd |
| Number of rows | 1316 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| factor | 5 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Noun | 0 | 1 | FALSE | 2 | gee: 670, ner: 646 |
| Num | 0 | 1 | FALSE | 2 | sg: 880, pl: 436 |
| Century | 0 | 1 | FALSE | 2 | XXI: 801, XX: 515 |
| Register | 0 | 1 | FALSE | 4 | MAG: 622, NEW: 343, SPO: 297, ACA: 54 |
| Eval | 0 | 1 | FALSE | 3 | Neu: 896, Pos: 288, Neg: 132 |
# Fix the random number generator state so the results below are repeatable.
set.seed(123456)

# Fit a single conditional inference tree that predicts the noun
# (geek vs. nerd) from number, century, register and evaluation.
cimodel <- ctree(
  Noun ~ Num + Century + Register + Eval,
  data = nerd
)

# Draw the fitted tree.
plot(cimodel)
# Cross-tabulate the tree's predictions (table rows) against the observed
# nouns (table columns), then show the table.
predictiveness <- table(predict(cimodel), nerd[["Noun"]])
predictiveness
##
## geek nerd
## geek 227 61
## nerd 443 585
# Overall accuracy of the tree in percent: the cases on the diagonal of the
# cross-table (prediction equals observation) as a share of all cases.
sum(diag(predictiveness)) / sum(predictiveness) * 100
## [1] 61.70213
# Grow a conditional random forest on the same formula as the single tree:
# 1000 trees, with 2 candidate predictors tried at each split.
forest <- cforest(
  Noun ~ Num + Century + Register + Eval,
  data = nerd,
  controls = cforest_unbiased(ntree = 1000, mtry = 2)
)

# Conditional permutation importance of each predictor.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently switch off the conditional computation.
variable_importance <- varimp(forest, conditional = TRUE)

# Show the importances rounded to three decimals.
round(variable_importance, 3)
## Num Century Register Eval
## -0.002 0.024 -0.002 0.055
# Dot chart of the variable importances, sorted in ascending order.
dotchart(
  sort(variable_importance),
  main = "Importance of variables"
)
# Cross-tabulate the forest's predictions (table rows) against the observed
# nouns (table columns), then show the table.
# NOTE(review): predict() is called without new data, so this presumably
# scores the same rows the forest was trained on; party's predict() for
# forests offers `OOB = TRUE` for an out-of-bag estimate -- confirm which of
# the two should be reported here.
predictions <- table(predict(forest), nerd[["Noun"]])
predictions
##
## geek nerd
## geek 325 136
## nerd 345 510
# Overall accuracy of the forest in percent: the cases on the diagonal of the
# cross-table (prediction equals observation) as a share of all cases.
sum(diag(predictions)) / sum(predictions) * 100
## [1] 63.44985
# Attach reticulate (the R-to-Python bridge used by the Python code further
# down) and print which Python installation it is bound to.
library(reticulate)
py_config()
## python: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python.exe
## libpython: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python36.dll
## pythonhome: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate
## version: 3.6.11 (default, Aug 5 2020, 19:41:03) [MSC v.1916 64 bit (AMD64)]
## Architecture: 64bit
## numpy: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/Lib/site-packages/numpy
## numpy_version: 1.19.1
# Check that the Python module `sklearn` can be found in that environment
# before the Python code below tries to import it.
py_module_available("sklearn")
## [1] TRUE
import sklearn
from sklearn import tree
import pandas as pd
# One-hot encode the four categorical predictors of the R data frame `nerd`,
# which is reached from Python through the `r` object, and preview the result.
data = pd.get_dummies(
    r.nerd.loc[:, ["Num", "Century", "Register", "Eval"]]
)
data.head()
## Num_pl Num_sg Century_XX ... Eval_Neg Eval_Neutral Eval_Positive
## 0 1 0 1 ... 0 1 0
## 1 1 0 0 ... 0 1 0
## 2 1 0 1 ... 0 1 0
## 3 1 0 1 ... 0 1 0
## 4 1 0 1 ... 0 1 0
##
## [5 rows x 11 columns]
# One-hot encode the response: one indicator column per noun (geek, nerd).
nouns = pd.get_dummies(r.nerd.loc[:, "Noun"])
nouns.head()
## geek nerd
## 0 0 1
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
# Fit a scikit-learn decision tree on the one-hot predictors.
# NOTE(review): `nouns` has two indicator columns, so scikit-learn presumably
# treats this as a multi-output problem, which makes the `class:` labels in
# the text dump below harder to read -- consider fitting on r.nerd["Noun"].
# NOTE(review): no random_state is passed, so the fitted tree may not be
# identical between runs -- confirm whether reproducibility matters here.
decision_tree = tree.DecisionTreeClassifier().fit(data, nouns)

# Print the fitted tree as indented text, labelled with the dummy column names.
print(tree.export_text(decision_tree, feature_names=data.columns.tolist()))
## |--- Eval_Positive <= 0.50
## | |--- Century_XXI <= 0.50
## | | |--- Num_pl <= 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_MAG <= 0.50
## | | | | | |--- Register_SPOK <= 0.50
## | | | | | | |--- Eval_Neg <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Eval_Neg > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Register_SPOK > 0.50
## | | | | | | |--- Eval_Neg <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Eval_Neg > 0.50
## | | | | | | | |--- class: 1
## | | | | |--- Register_MAG > 0.50
## | | | | | |--- Eval_Neg <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Eval_Neg > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_ACAD > 0.50
## | | | | |--- Eval_Neutral <= 0.50
## | | | | | |--- class: 0
## | | | | |--- Eval_Neutral > 0.50
## | | | | | |--- class: 0
## | | |--- Num_pl > 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_MAG <= 0.50
## | | | | | |--- Eval_Neutral <= 0.50
## | | | | | | |--- Register_NEWS <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_NEWS > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Eval_Neutral > 0.50
## | | | | | | |--- Register_SPOK <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_SPOK > 0.50
## | | | | | | | |--- class: 0
## | | | | |--- Register_MAG > 0.50
## | | | | | |--- Eval_Neutral <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Eval_Neutral > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_ACAD > 0.50
## | | | | |--- class: 0
## | |--- Century_XXI > 0.50
## | | |--- Num_pl <= 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_MAG <= 0.50
## | | | | | |--- Eval_Neg <= 0.50
## | | | | | | |--- Register_SPOK <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_SPOK > 0.50
## | | | | | | | |--- class: 1
## | | | | | |--- Eval_Neg > 0.50
## | | | | | | |--- Register_SPOK <= 0.50
## | | | | | | | |--- class: 1
## | | | | | | |--- Register_SPOK > 0.50
## | | | | | | | |--- class: 0
## | | | | |--- Register_MAG > 0.50
## | | | | | |--- Eval_Neutral <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Eval_Neutral > 0.50
## | | | | | | |--- class: 1
## | | | |--- Register_ACAD > 0.50
## | | | | |--- Eval_Neutral <= 0.50
## | | | | | |--- class: 1
## | | | | |--- Eval_Neutral > 0.50
## | | | | | |--- class: 0
## | | |--- Num_pl > 0.50
## | | | |--- Register_MAG <= 0.50
## | | | | |--- Eval_Neutral <= 0.50
## | | | | | |--- Register_SPOK <= 0.50
## | | | | | | |--- Register_ACAD <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_ACAD > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Register_SPOK > 0.50
## | | | | | | |--- class: 0
## | | | | |--- Eval_Neutral > 0.50
## | | | | | |--- Register_ACAD <= 0.50
## | | | | | | |--- Register_NEWS <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_NEWS > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Register_ACAD > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_MAG > 0.50
## | | | | |--- Eval_Neutral <= 0.50
## | | | | | |--- class: 1
## | | | | |--- Eval_Neutral > 0.50
## | | | | | |--- class: 0
## |--- Eval_Positive > 0.50
## | |--- Century_XXI <= 0.50
## | | |--- Register_MAG <= 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Num_sg <= 0.50
## | | | | | |--- Register_SPOK <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Register_SPOK > 0.50
## | | | | | | |--- class: 0
## | | | | |--- Num_sg > 0.50
## | | | | | |--- Register_SPOK <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Register_SPOK > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_ACAD > 0.50
## | | | | |--- class: 0
## | | |--- Register_MAG > 0.50
## | | | |--- Num_sg <= 0.50
## | | | | |--- class: 1
## | | | |--- Num_sg > 0.50
## | | | | |--- class: 1
## | |--- Century_XXI > 0.50
## | | |--- Register_MAG <= 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_SPOK <= 0.50
## | | | | | |--- Num_sg <= 0.50
## | | | | | | |--- class: 1
## | | | | | |--- Num_sg > 0.50
## | | | | | | |--- class: 1
## | | | | |--- Register_SPOK > 0.50
## | | | | | |--- Num_pl <= 0.50
## | | | | | | |--- class: 1
## | | | | | |--- Num_pl > 0.50
## | | | | | | |--- class: 1
## | | | |--- Register_ACAD > 0.50
## | | | | |--- class: 1
## | | |--- Register_MAG > 0.50
## | | | |--- Num_sg <= 0.50
## | | | | |--- class: 1
## | | | |--- Num_sg > 0.50
## | | | | |--- class: 1
# `import sklearn` by itself does not guarantee that the `sklearn.metrics`
# submodule is loaded; import it explicitly instead of relying on another
# scikit-learn import having pulled it in as a side effect.
import sklearn.metrics

# Predict for the rows the tree was fitted on and label the resulting
# indicator columns with the same names as `nouns`.
prediction = pd.DataFrame(decision_tree.predict(data))
prediction.columns = list(nouns.columns)

# Collapse the indicator columns back into one noun label per row, for both
# the predictions and the observed values.
prediction_category = prediction.idxmax(axis=1)
variable_category = nouns.idxmax(axis=1)

# Confusion matrix with the label order nerd, geek. The predictions are
# passed first, so the rows of the result correspond to predicted labels and
# the columns to observed labels.
sklearn.metrics.confusion_matrix(
    prediction_category, variable_category, labels=["nerd", "geek"]
)
## array([[439, 269],
## [207, 401]], dtype=int64)