Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.
The data for this project has already been loaded. You will be distinguishing between the categories of nerd and geek to determine the influence of respective variables on their category definition.
If you are having trouble with the Rling library, the nerd data is available on Canvas, and you can load it directly.
##r chunk
library(Rling)
# Load the `nerd` data set and preview its first six rows.
data("nerd")
head(nerd, n = 6L)
## Noun Num Century Register Eval
## 1 nerd pl XX ACAD Neutral
## 2 nerd pl XXI ACAD Neutral
## 3 nerd pl XX ACAD Neutral
## 4 nerd pl XX ACAD Neutral
## 5 nerd pl XX ACAD Neutral
## 6 nerd pl XXI ACAD Neutral
library(party)
## Warning: package 'party' was built under R version 3.6.3
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 3.6.3
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.6.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.6.3
Dependent variable: Noun (geek vs. nerd)
Independent variables: Num, Century, Register, Eval
Use ctree() to create a conditional inference model.
##r chunk
# Seed the random number generator, then fit a conditional inference tree
# that predicts Noun from the four predictors in `nerd`.
set.seed(12345)
tree.output <- ctree(
  formula = Noun ~ Num + Century + Register + Eval,
  data = nerd
)
##r chunk
# Draw the fitted tree.
plot(tree.output)
##r chunk
# Cross-tabulate the tree's predictions (rows) against the observed nouns
# (columns).
outcomes <- table(predict(tree.output), nerd$Noun)
outcomes
##
## geek nerd
## geek 227 61
## nerd 443 585
# Overall accuracy (%): diagonal (matching) cells over all cases.
sum(diag(outcomes)) / sum(outcomes) * 100
## [1] 61.70213
# Percentage of observed "geek" cases that were predicted "geek".
outcomes["geek", "geek"] / sum(outcomes[, "geek"]) * 100
## [1] 33.8806
# Percentage of observed "nerd" cases that were predicted "nerd".
outcomes["nerd", "nerd"] / sum(outcomes[, "nerd"]) * 100
## [1] 90.55728
# Proportion of all cases that are observed "geek".
sum(outcomes[, "geek"]) / sum(outcomes)
## [1] 0.5091185
# Proportion of all cases that were predicted "geek".
sum(outcomes["geek", ]) / sum(outcomes)
## [1] 0.218845
## [1] 0.218845
##r chunk
# Grow a conditional inference forest on the same formula as the single tree,
# using 1000 trees and mtry = 2.
forest.output <- cforest(
  formula = Noun ~ Num + Century + Register + Eval,
  data = nerd,
  controls = cforest_unbiased(mtry = 2, ntree = 1000)
)
##r chunk
# Conditional variable importance for each predictor in the forest.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
forest.importance <- varimp(forest.output, conditional = TRUE)
round(forest.importance, 3)
## Num Century Register Eval
## -0.002 0.023 -0.003 0.056
# Plot the importances in ascending order.
dotchart(sort(forest.importance),
         main = "Conditional Importance of Variables")
##r chunk
# Cross-tabulate the forest's predictions (rows) against the observed nouns
# (columns).
forest.outcomes <- table(predict(forest.output), nerd$Noun)
forest.outcomes
##
## geek nerd
## geek 337 149
## nerd 333 497
# Overall accuracy (%): diagonal (matching) cells over all cases.
sum(diag(forest.outcomes)) / sum(forest.outcomes) * 100
## [1] 63.37386
# Percentage of observed "geek" cases that were predicted "geek".
forest.outcomes["geek", "geek"] / sum(forest.outcomes[, "geek"]) * 100
## [1] 50.29851
# Percentage of observed "nerd" cases that were predicted "nerd".
forest.outcomes["nerd", "nerd"] / sum(forest.outcomes[, "nerd"]) * 100
## [1] 76.93498
# Proportion of all cases that were predicted "geek".
sum(forest.outcomes["geek", ]) / sum(forest.outcomes)
## [1] 0.3693009
## [1] 0.3693009
##python chunk
import pandas as pd
# Disabled alternative: read the data from a CSV file instead of the R session.
#nerd_pydata = pd.read_csv('nerd.csv')
# Dummy-code the four predictor columns of the R data frame `nerd`
# (accessed through the `r` object) and preview the first five rows.
Xvars = pd.get_dummies(
    data=r.nerd[["Num", "Century", "Register", "Eval"]]
)
Xvars.head(n=5)
## Num_pl Num_sg Century_XX ... Eval_Neg Eval_Neutral Eval_Positive
## 0 1 0 1 ... 0 1 0
## 1 1 0 0 ... 0 1 0
## 2 1 0 1 ... 0 1 0
## 3 1 0 1 ... 0 1 0
## 4 1 0 1 ... 0 1 0
##
## [5 rows x 11 columns]
# Dummy-code the outcome: one indicator column per noun (geek, nerd),
# then preview the first five rows.
Yvar = pd.get_dummies(data=r.nerd["Noun"])
Yvar.head(n=5)
## geek nerd
## 0 0 1
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
Create a decision tree classifier on the nerd data.
##python chunk
import sklearn
from sklearn import tree
# Fit a decision tree on the dummy-coded predictors. `fit` returns the
# estimator itself, so construction and fitting are chained.
# NOTE(review): Yvar has two indicator columns, so this looks like a
# two-output fit; the prediction array shown below has one column per
# indicator and contains [0, 0] rows -- confirm that is intended.
CIT = tree.DecisionTreeClassifier().fit(Xvars, Yvar)
# Predict on the training predictors and display the result.
CIT.predict(Xvars)
## array([[0, 1],
## [0, 1],
## [0, 1],
## ...,
## [1, 0],
## [0, 0],
## [0, 0]], dtype=uint8)
##python chunk
# Print a text rendering of the fitted tree, labelling each split with the
# dummy-coded column names from Xvars.
print(tree.export_text(CIT, feature_names=Xvars.columns.tolist()))
## |--- Eval_Positive <= 0.50
## | |--- Century_XX <= 0.50
## | | |--- Num_sg <= 0.50
## | | | |--- Register_MAG <= 0.50
## | | | | |--- Eval_Neutral <= 0.50
## | | | | | |--- Register_SPOK <= 0.50
## | | | | | | |--- Register_NEWS <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_NEWS > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Register_SPOK > 0.50
## | | | | | | |--- class: 0
## | | | | |--- Eval_Neutral > 0.50
## | | | | | |--- Register_ACAD <= 0.50
## | | | | | | |--- Register_SPOK <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_SPOK > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Register_ACAD > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_MAG > 0.50
## | | | | |--- Eval_Neutral <= 0.50
## | | | | | |--- class: 1
## | | | | |--- Eval_Neutral > 0.50
## | | | | | |--- class: 0
## | | |--- Num_sg > 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_MAG <= 0.50
## | | | | | |--- Eval_Neg <= 0.50
## | | | | | | |--- Register_SPOK <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_SPOK > 0.50
## | | | | | | | |--- class: 1
## | | | | | |--- Eval_Neg > 0.50
## | | | | | | |--- Register_NEWS <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_NEWS > 0.50
## | | | | | | | |--- class: 1
## | | | | |--- Register_MAG > 0.50
## | | | | | |--- Eval_Neg <= 0.50
## | | | | | | |--- class: 1
## | | | | | |--- Eval_Neg > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_ACAD > 0.50
## | | | | |--- Eval_Neg <= 0.50
## | | | | | |--- class: 0
## | | | | |--- Eval_Neg > 0.50
## | | | | | |--- class: 1
## | |--- Century_XX > 0.50
## | | |--- Num_pl <= 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_MAG <= 0.50
## | | | | | |--- Register_SPOK <= 0.50
## | | | | | | |--- Eval_Neutral <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Eval_Neutral > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Register_SPOK > 0.50
## | | | | | | |--- Eval_Neg <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Eval_Neg > 0.50
## | | | | | | | |--- class: 1
## | | | | |--- Register_MAG > 0.50
## | | | | | |--- Eval_Neutral <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Eval_Neutral > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_ACAD > 0.50
## | | | | |--- Eval_Neg <= 0.50
## | | | | | |--- class: 0
## | | | | |--- Eval_Neg > 0.50
## | | | | | |--- class: 0
## | | |--- Num_pl > 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_MAG <= 0.50
## | | | | | |--- Eval_Neg <= 0.50
## | | | | | | |--- Register_NEWS <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_NEWS > 0.50
## | | | | | | | |--- class: 0
## | | | | | |--- Eval_Neg > 0.50
## | | | | | | |--- Register_NEWS <= 0.50
## | | | | | | | |--- class: 0
## | | | | | | |--- Register_NEWS > 0.50
## | | | | | | | |--- class: 0
## | | | | |--- Register_MAG > 0.50
## | | | | | |--- Eval_Neg <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Eval_Neg > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_ACAD > 0.50
## | | | | |--- class: 0
## |--- Eval_Positive > 0.50
## | |--- Century_XXI <= 0.50
## | | |--- Register_MAG <= 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Num_sg <= 0.50
## | | | | | |--- Register_NEWS <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Register_NEWS > 0.50
## | | | | | | |--- class: 0
## | | | | |--- Num_sg > 0.50
## | | | | | |--- Register_NEWS <= 0.50
## | | | | | | |--- class: 0
## | | | | | |--- Register_NEWS > 0.50
## | | | | | | |--- class: 0
## | | | |--- Register_ACAD > 0.50
## | | | | |--- class: 0
## | | |--- Register_MAG > 0.50
## | | | |--- Num_pl <= 0.50
## | | | | |--- class: 1
## | | | |--- Num_pl > 0.50
## | | | | |--- class: 1
## | |--- Century_XXI > 0.50
## | | |--- Register_MAG <= 0.50
## | | | |--- Register_ACAD <= 0.50
## | | | | |--- Register_NEWS <= 0.50
## | | | | | |--- Num_sg <= 0.50
## | | | | | | |--- class: 1
## | | | | | |--- Num_sg > 0.50
## | | | | | | |--- class: 1
## | | | | |--- Register_NEWS > 0.50
## | | | | | |--- Num_sg <= 0.50
## | | | | | | |--- class: 1
## | | | | | |--- Num_sg > 0.50
## | | | | | | |--- class: 1
## | | | |--- Register_ACAD > 0.50
## | | | | |--- class: 1
## | | |--- Register_MAG > 0.50
## | | | |--- Num_sg <= 0.50
## | | | | |--- class: 1
## | | | |--- Num_sg > 0.50
## | | | | |--- class: 1
##python chunk
# Import the metric explicitly: a bare `import sklearn` is not guaranteed to
# expose the `sklearn.metrics` submodule as an attribute.
from sklearn.metrics import confusion_matrix

# Wrap the per-indicator predictions in a data frame that carries the same
# column names as the dummy-coded outcome.
Y_predict = pd.DataFrame(CIT.predict(Xvars))
Y_predict.columns = list(Yvar.columns)
# Collapse each row of indicators to the name of the column holding the row
# maximum.
# NOTE(review): for a predicted row where both indicators are 0, idxmax
# should return the first column label -- confirm this is acceptable.
Y_predict_category = Y_predict.idxmax(axis=1)
Yvar_category = Yvar.idxmax(axis=1)
# Predictions are passed first, so rows are predicted labels and columns are
# observed labels, in the order nerd, geek.
confusion_matrix(Y_predict_category, Yvar_category, labels=["nerd", "geek"])
## array([[439, 269],
## [207, 401]], dtype=int64)