library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(class)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(regclass)
## Loading required package: bestglm
## Loading required package: leaps
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following objects are masked from 'package:psych':
## 
##     fisherz, logistic, logit
## The following object is masked from 'package:caret':
## 
##     predictors
## Loading required package: rpart
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
## 
## Attaching package: 'regclass'
## The following object is masked from 'package:lattice':
## 
##     qq
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(nnet)
# 1) Exploring the af_PT dataset
pt = read.csv("affair_pt.csv")
pt$occupation <- as.factor(pt$occupation)
pt$gender <- ifelse(pt$gender=="male", 1, 0)
pt$children <- ifelse(pt$children=="yes", 1, 0)
pt$cheated <- ifelse(pt$affairs>0, 1, 0)
pt$affairs <- as.factor(pt$affairs)
# 1b)
# Examine whether collinearity exists in the dataset via a correlation matrix.
# Let 0.6 be the cutoff for determining collinearity. What are two ways to deal with collinearity?
# Correlation matrix 1 must not include occupation and affairs: they are
# categorical, and cor() requires numeric input.
# setdiff() returns the elements of the first set that are not in the second
ptcort1 <- pt[, setdiff(names(pt), c("occupation", "affairs"))]
# Logical matrix flagging variable pairs whose correlation exceeds 0.6
cor(ptcort1) > 0.6
##               gender   age yearsmarried children religiousness education rating
## gender          TRUE FALSE        FALSE    FALSE         FALSE     FALSE  FALSE
## age            FALSE  TRUE         TRUE    FALSE         FALSE     FALSE  FALSE
## yearsmarried   FALSE  TRUE         TRUE    FALSE         FALSE     FALSE  FALSE
## children       FALSE FALSE        FALSE     TRUE         FALSE     FALSE  FALSE
## religiousness  FALSE FALSE        FALSE    FALSE          TRUE     FALSE  FALSE
## education      FALSE FALSE        FALSE    FALSE         FALSE      TRUE  FALSE
## rating         FALSE FALSE        FALSE    FALSE         FALSE     FALSE   TRUE
## cheated        FALSE FALSE        FALSE    FALSE         FALSE     FALSE  FALSE
##               cheated
## gender          FALSE
## age             FALSE
## yearsmarried    FALSE
## children        FALSE
## religiousness   FALSE
## education       FALSE
## rating          FALSE
## cheated          TRUE
# collinearity exists between age and years married
# Correlation plot to confirm results
corPlot(ptcort1)
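# A caveat worth noting (a minimal sketch): cor() > 0.6 misses strong
# *negative* correlations; taking the absolute value catches both signs.
abs(cor(ptcort1)) > 0.6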

# One way to deal with collinearity is to make a combined variable
# that eliminates the inherent collinearity between the two related
# variables; here, the average of age and yearsmarried
ptmarriage_age <- data.frame(0.5*pt$age + 0.5*pt$yearsmarried)
pt <- cbind(pt, ptmarriage_age)
names(pt)[length(pt)] <- "Marriage Age"
ptcort2 <- pt[, setdiff(names(pt), c("occupation", "affairs", "age", "yearsmarried"))]
# Correlation matrix 2 calculation
cor(ptcort2) > 0.6
##               gender children religiousness education rating cheated
## gender          TRUE    FALSE         FALSE     FALSE  FALSE   FALSE
## children       FALSE     TRUE         FALSE     FALSE  FALSE   FALSE
## religiousness  FALSE    FALSE          TRUE     FALSE  FALSE   FALSE
## education      FALSE    FALSE         FALSE      TRUE  FALSE   FALSE
## rating         FALSE    FALSE         FALSE     FALSE   TRUE   FALSE
## cheated        FALSE    FALSE         FALSE     FALSE  FALSE    TRUE
## Marriage Age   FALSE    FALSE         FALSE     FALSE  FALSE   FALSE
##               Marriage Age
## gender               FALSE
## children             FALSE
## religiousness        FALSE
## education            FALSE
## rating               FALSE
## cheated              FALSE
## Marriage Age          TRUE
corPlot(ptcort2)

# Two ways to deal with collinearity:
# - Remove one or more of the correlated predictors
# - Regularization techniques that reduce the impact of highly correlated
#   predictors, such as lasso regression (setting some coefficients to 0);
#   see the sketch below
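# A minimal sketch of the lasso route (illustrative only, not part of the
# models fitted below; glmnet was loaded above):
x <- model.matrix(cheated ~ . - affairs, data = pt)[, -1]  # numeric design matrix
lasso_fit <- cv.glmnet(x, pt$cheated, family = "binomial", alpha = 1)  # alpha = 1 -> lasso
coef(lasso_fit, s = "lambda.1se")  # coefficients shrunk exactly to 0 are dropped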
# 1c)
# Set seed = 2021. Split the dataset randomly into 80% training and 20% testing.
set.seed(2021)
training_percent <- 0.8 
ptindex_break <- sort(sample(nrow(pt), nrow(pt)*training_percent))
pt_training <- pt[ptindex_break,]
pt_testing <- pt[-ptindex_break,]
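# An alternative worth knowing (a sketch, not used below): caret's
# createDataPartition() draws a stratified split that preserves the
# proportion of cheaters in both sets.
strat_index <- createDataPartition(as.factor(pt$cheated), p = 0.8, list = FALSE)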

# 1d)
# After accounting for the collinearity, build a simple model with
# "cheated" as the response variable and all other appropriate variables.
# The appropriate variables exclude affairs (it defines cheated, so it is
# not a predictor) and age/yearsmarried (they introduce collinearity).
simple_model <- glm(cheated ~ ., data = subset(pt_training, select = c(-affairs, -age, -yearsmarried)), family = binomial)
summary(simple_model)
## 
## Call:
## glm(formula = cheated ~ ., family = binomial, data = subset(pt_training, 
##     select = c(-affairs, -age, -yearsmarried)))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4993  -0.7695  -0.5839   0.9606   2.3220  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.772429   1.038907  -0.744   0.4572    
## gender         -0.166875   0.271277  -0.615   0.5385    
## children        0.710989   0.309989   2.294   0.0218 *  
## religiousness  -0.246119   0.098431  -2.500   0.0124 *  
## education       0.070401   0.059810   1.177   0.2392    
## occupation2     0.962637   0.804365   1.197   0.2314    
## occupation3     0.650915   0.490897   1.326   0.1848    
## occupation4     1.059432   0.445551   2.378   0.0174 *  
## occupation5     0.272522   0.371679   0.733   0.4634    
## occupation6     0.202020   0.451861   0.447   0.6548    
## occupation7     0.362631   0.753645   0.481   0.6304    
## rating         -0.434288   0.099472  -4.366 1.27e-05 ***
## `Marriage Age`  0.006357   0.018791   0.338   0.7351    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 544.19  on 479  degrees of freedom
## Residual deviance: 498.65  on 467  degrees of freedom
## AIC: 524.65
## 
## Number of Fisher Scoring iterations: 4
# 1e)
# Build a model with a trimmed selection of feature vectors and
# use it to predict labels on the testing set. Explain your choice
# of predictor variables. Look at the VIF and determine if these choices are acceptable.
# Based on the simple model summary, only children, religiousness, occupation4,
# and rating are significant predictors (p-values below 0.05). Occupation is a
# single factor, so the whole variable is kept even though only level 4 is
# individually significant. An automated alternative is sketched below.
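# A sketch of that automated alternative (illustrative only; the hand-picked
# model below is the one actually used): backward stepwise selection by AIC
# starting from the simple model.
step_model <- step(simple_model, direction = "backward", trace = 0)
# formula(step_model) shows which predictors survive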
pttrim_model <- glm(cheated ~ children + religiousness + occupation + rating,
                    family = binomial, data = pt_training)
summary(pttrim_model)
## 
## Call:
## glm(formula = cheated ~ children + religiousness + occupation + 
##     rating, family = binomial, data = pt_training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5085  -0.7723  -0.5823   0.9021   2.2499  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.29373    0.57938   0.507  0.61217    
## children       0.75941    0.27822   2.730  0.00634 ** 
## religiousness -0.24413    0.09619  -2.538  0.01115 *  
## occupation2    0.69493    0.76127   0.913  0.36132    
## occupation3    0.54858    0.47050   1.166  0.24364    
## occupation4    1.03142    0.41803   2.467  0.01361 *  
## occupation5    0.34692    0.33823   1.026  0.30504    
## occupation6    0.33416    0.35846   0.932  0.35121    
## occupation7    0.43684    0.70973   0.615  0.53823    
## rating        -0.42244    0.09583  -4.408 1.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 544.19  on 479  degrees of freedom
## Residual deviance: 500.27  on 470  degrees of freedom
## AIC: 520.27
## 
## Number of Fisher Scoring iterations: 4
VIF(pttrim_model)
##                   GVIF Df GVIF^(1/(2*Df))
## children      1.094506  1        1.046186
## religiousness 1.025624  1        1.012731
## occupation    1.097432  6        1.007778
## rating        1.049423  1        1.024414
# Generally speaking, a VIF (variance inflation factor) close to 1
# indicates no multicollinearity. A VIF above roughly 4 suggests
# multicollinearity might exist, and a VIF above 10 indicates that it
# does (equivalently, tolerance = 1/VIF below 0.25 or 0.1; note that VIF
# is never below 1). Given that the VIFs of all the chosen predictors are
# close to 1, the choices of predictors in the trimmed model are acceptable.
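# A minimal sketch of where VIF comes from (an illustrative check): VIF_j =
# 1 / (1 - R^2_j), where R^2_j comes from regressing predictor j on the others.
aux <- lm(rating ~ children + religiousness + occupation, data = pt_training)
1/(1 - summary(aux)$r.squared)  # should roughly match the GVIF for rating above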
# 1f)
# Run the trimmed model on the testing data. Plot the confusion matrix.
# Comment on whether or not the model is a good model and on things
# that can be improved.
pt_probabilities <- predict(pttrim_model, pt_testing, type="response")
pt_predictions <- ifelse(pt_probabilities >= 0.5, 1, 0)
pt_cm <- confusionMatrix(reference=as.factor(pt_testing$cheated), data = as.factor(pt_predictions))
pt_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 92 24
##          1  1  4
##                                           
##                Accuracy : 0.7934          
##                  95% CI : (0.7103, 0.8616)
##     No Information Rate : 0.7686          
##     P-Value [Acc > NIR] : 0.2999          
##                                           
##                   Kappa : 0.1853          
##                                           
##  Mcnemar's Test P-Value : 1.083e-05       
##                                           
##             Sensitivity : 0.9892          
##             Specificity : 0.1429          
##          Pos Pred Value : 0.7931          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.7686          
##          Detection Rate : 0.7603          
##    Detection Prevalence : 0.9587          
##       Balanced Accuracy : 0.5661          
##                                           
##        'Positive' Class : 0               
## 
# Based on the confusion matrix, this model does an okay job, with
# approximately 79% total accuracy. Because caret treats 0 (did not cheat) as
# the positive class here, the ~99% sensitivity means the model is very good
# at identifying non-cheaters. However, it is very poor at identifying
# cheaters: of the 28 cheaters in the test set, it flagged only 4, hence the
# extremely low specificity (~14%) and a very high false-loyalty rate
# (cheaters labeled as loyal). One possible improvement is sketched below.
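# A sketch of that improvement, under an assumed decision rule (not part of
# the assignment): lower the 0.5 cutoff toward the training prevalence of
# cheating so more borderline cases get flagged, trading sensitivity for
# specificity.
prevalence <- mean(pt_training$cheated)
alt_predictions <- ifelse(pt_probabilities >= prevalence, 1, 0)
table(Predicted = alt_predictions, Actual = pt_testing$cheated)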

# 2) Exploring the af_RB dataset

rb <- read.csv("affair_rb.csv")
# 2a)
# Repeat the preprocessing steps for the RB dataset
rb$children <- ifelse(rb$children>0, 1, 0)
rb$occupation <- as.factor(rb$occupation)
rb$occupation_husb <- as.factor(rb$occupation_husb)
# Need to ignore categorical vars when calculating correlation
rbcort1 <- rb[,setdiff(names(rb), c("occupation", "occupation_husb"))]
cor(rbcort1) > 0.6
##               rate_marriage   age yrs_married children religious  educ affair
## rate_marriage          TRUE FALSE       FALSE    FALSE     FALSE FALSE  FALSE
## age                   FALSE  TRUE        TRUE    FALSE     FALSE FALSE  FALSE
## yrs_married           FALSE  TRUE        TRUE     TRUE     FALSE FALSE  FALSE
## children              FALSE FALSE        TRUE     TRUE     FALSE FALSE  FALSE
## religious             FALSE FALSE       FALSE    FALSE      TRUE FALSE  FALSE
## educ                  FALSE FALSE       FALSE    FALSE     FALSE  TRUE  FALSE
## affair                FALSE FALSE       FALSE    FALSE     FALSE FALSE   TRUE
# Age and yrs_married present collinearity issues, so I combined them into
# Marriage Age again
corPlot(rbcort1)

rbmarriage_age <- data.frame(0.5*rb$age + 0.5*rb$yrs_married)
rb <- cbind(rb, rbmarriage_age)
names(rb)[length(rb)] <- "Marriage Age"
rbcort2 <- rb[, setdiff(names(rb), c("occupation", "occupation_husb", "age", "yrs_married"))]
cor(rbcort2) > 0.6
##               rate_marriage children religious  educ affair Marriage Age
## rate_marriage          TRUE    FALSE     FALSE FALSE  FALSE        FALSE
## children              FALSE     TRUE     FALSE FALSE  FALSE        FALSE
## religious             FALSE    FALSE      TRUE FALSE  FALSE        FALSE
## educ                  FALSE    FALSE     FALSE  TRUE  FALSE        FALSE
## affair                FALSE    FALSE     FALSE FALSE   TRUE        FALSE
## Marriage Age          FALSE    FALSE     FALSE FALSE  FALSE         TRUE
corPlot(rbcort2)

# 2b)
# Set seed = 2021. Split the dataset randomly into 80% training and 20% testing
set.seed(2021)
training_percent <- 0.8 
rbindex_break <- sort(sample(nrow(rb), nrow(rb)*training_percent)) 
rb_training <- rb[rbindex_break,]
rb_testing <- rb[-rbindex_break,]
# 2c)
# Naive model with all parameters. You can choose either children or
# have_children for your predictor variable. You can experiment with both
# and comment on your insight into the two different approaches.
naive_model <- glm(affair ~ ., data = subset(rb_training, select = c(-age, -yrs_married)), family = binomial)
summary(naive_model)
## 
## Call:
## glm(formula = affair ~ ., family = binomial, data = subset(rb_training, 
##     select = c(-age, -yrs_married)))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3711  -0.8219  -0.5688   1.0145   2.3343  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       1.889240   0.632964   2.985  0.00284 ** 
## rate_marriage    -0.692802   0.035709 -19.402  < 2e-16 ***
## children          0.638495   0.087256   7.317 2.53e-13 ***
## religious        -0.384152   0.038911  -9.873  < 2e-16 ***
## educ             -0.026931   0.018914  -1.424  0.15449    
## occupation2       0.225955   0.520571   0.434  0.66425    
## occupation3       0.516095   0.514297   1.003  0.31562    
## occupation4       0.280443   0.515927   0.544  0.58674    
## occupation5       0.871116   0.519881   1.676  0.09382 .  
## occupation6       0.790941   0.569869   1.388  0.16516    
## occupation_husb2 -0.011029   0.209294  -0.053  0.95797    
## occupation_husb3  0.162443   0.226559   0.717  0.47337    
## occupation_husb4 -0.023977   0.203883  -0.118  0.90638    
## occupation_husb5  0.041642   0.205104   0.203  0.83911    
## occupation_husb6  0.056407   0.227078   0.248  0.80382    
## `Marriage Age`    0.027725   0.005768   4.806 1.54e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6380.4  on 5091  degrees of freedom
## Residual deviance: 5543.1  on 5076  degrees of freedom
## AIC: 5575.1
## 
## Number of Fisher Scoring iterations: 4
# 2d)
# Similar to the first model, create a modified one with skimmed-down predictor variables
rbtrim_model <- glm(affair ~ rate_marriage + children + religious + `Marriage Age`, 
                    family = binomial, data = rb_training)
summary(rbtrim_model)
## 
## Call:
## glm(formula = affair ~ rate_marriage + children + religious + 
##     `Marriage Age`, family = binomial, data = rb_training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2030  -0.8389  -0.5988   1.0386   2.3414  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.945361   0.187286  10.387  < 2e-16 ***
## rate_marriage  -0.694068   0.035415 -19.598  < 2e-16 ***
## children        0.598800   0.085161   7.031 2.04e-12 ***
## religious      -0.384001   0.038642  -9.937  < 2e-16 ***
## `Marriage Age`  0.031563   0.005656   5.580 2.41e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6380.4  on 5091  degrees of freedom
## Residual deviance: 5590.8  on 5087  degrees of freedom
## AIC: 5600.8
## 
## Number of Fisher Scoring iterations: 4
VIF(rbtrim_model)
##  rate_marriage       children      religious `Marriage Age` 
##       1.004625       1.432398       1.044589       1.441662
# As before, a VIF close to 1 indicates no multicollinearity, a VIF above
# roughly 4 suggests multicollinearity might exist, and a VIF above 10
# indicates that it does. Given that the VIFs of all the chosen predictors
# are close to 1, the choices of predictors in the skimmed-down model are
# acceptable.
# 2e)
# Use the same parameters but use the entire af_RB as a training set and
# use af_PT as a testing set
# (note this means the predictor variables you chose have to match, so if
# they don't, just build the model without the ones that don't exist).
# Evaluate this model's performance.
names(rb)[1] <- "rating"
names(rb)[5] <- "religiousness"
names(rb)[length(rb) - 1] <- "cheated"
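# A quick illustrative check (not required by the prompt): after the renames,
# these are the columns the two datasets share, confirming that the predictors
# used below exist in both.
intersect(names(rb), names(pt))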
entire_rb_training <- glm(cheated ~ religiousness + rating + children, 
                          family = binomial, data = rb)
# af_RB appears to contain only women (it records occupation_husb), so test
# on the female subset of af_PT
femalept <- pt[pt$gender == 0, ]
femaleProbabilities <- predict(entire_rb_training, femalept, type="response")
femalePredictions <- ifelse(femaleProbabilities >= 0.5, 1, 0)
female_cm <- confusionMatrix(reference=as.factor(femalept$cheated), 
                             data=as.factor(femalePredictions))
female_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 204  39
##          1  39  33
##                                           
##                Accuracy : 0.7524          
##                  95% CI : (0.7009, 0.7991)
##     No Information Rate : 0.7714          
##     P-Value [Acc > NIR] : 0.8093          
##                                           
##                   Kappa : 0.2978          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.8395          
##             Specificity : 0.4583          
##          Pos Pred Value : 0.8395          
##          Neg Pred Value : 0.4583          
##              Prevalence : 0.7714          
##          Detection Rate : 0.6476          
##    Detection Prevalence : 0.7714          
##       Balanced Accuracy : 0.6489          
##                                           
##        'Positive' Class : 0               
## 
# Once again, this model does an okay job: the overall accuracy is about 75%.
# It has a relatively high sensitivity at about 84% and a higher specificity
# than the previous model, although still relatively low at about 46%. These
# results are intuitive: cheating is rather erratic, hard-to-predict behavior,
# so it may be difficult to model. One way to improve this model would be to
# look for different predictors that do a better job of flagging infidelity
# among people who have actually had affairs.
# 2f)
# You work at a firm whose main service is processing social
# data and selling that data to other companies. One project is on
# the likelihood of a person having affairs based on his/her data on
# social media. You'd like to create a primitive model based on the data
# in af_RB and af_PT for prediction. If you classify an innocent person
# as having affairs, your company will get sued for defamation (very costly),
# so your boss wants you to plot Type I error, Type II error, and the overall
# accuracy curve versus a threshold parameter between 0 and 1 with intervals of 0.05.
thresholds <- seq(0, 1, 0.05)
overall_error <- rep(0, length(thresholds))
t1e <- rep(0, length(thresholds))
t2e <- rep(0, length(thresholds))
index <- 1
for (i in thresholds) {
  femalePredictions <- factor(ifelse(femaleProbabilities >= i, 1, 0), levels = c(0, 1))
  cmf <- confusionMatrix(reference = as.factor(femalept$cheated), data = femalePredictions)
  # Overall error
  overall_error[index] <- 1 - cmf$overall["Accuracy"]
  # With 0 (loyal) as caret's positive class:
  # Type I error = false positive rate = false-loyalty rate (the rate at
  # which cheaters are predicted to have been loyal)
  t1e[index] <- 1 - cmf$byClass["Specificity"]
  # Type II error = false negative rate = false-cheating rate (the rate at
  # which non-cheaters are predicted to have cheated) -- the error with
  # defamation implications
  t2e[index] <- 1 - cmf$byClass["Sensitivity"]
  index <- index + 1
}
plots <- data.frame(thresholds, overall_error, t1e, t2e)
# Purple = overall error, red = Type I error, blue = Type II error
ggplot(plots, aes(thresholds)) +
  geom_line(aes(y = overall_error), color = "purple") +
  geom_line(aes(y = t1e), color = "red") +
  geom_line(aes(y = t2e), color = "blue") +
  ylab("Error Level") +
  xlab("Threshold Level") +
  ggtitle("Type I, Type II, and Overall Error vs. Threshold")
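# A sketch of how the boss's asymmetric costs could pick a threshold
# (the 10:1 cost ratio is an assumption made up for illustration):
cost <- 10*t2e + t1e            # false accusations (t2e) weighted heavily
thresholds[which.min(cost)]     # threshold minimizing the assumed weighted cost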

# 2h)
# Train a similar knn model with k = 5 using the caret package and compare the
# AUC of the knn model and the previous logistic regression model.
predictors <- c("religiousness", "rating", "children")
# Note: this uses class::knn directly rather than caret's train(method = "knn")
knnpredictions <- knn(as.matrix(rb[predictors]), femalept[predictors],
                      cl = as.matrix(rb["cheated"]), k = 5, prob = TRUE)
knnprobabilities <- attributes(knnpredictions)$prob
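# A caveat (a sketch; the ROC below uses knnprobabilities as-is): class::knn's
# prob attribute is the vote share of the *winning* class, not P(class = 1),
# which is why pROC flips the direction for the knn curve below. Converting
# would make the two curves directly comparable:
knn_p1 <- ifelse(knnpredictions == "1", knnprobabilities, 1 - knnprobabilities)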
par(pty = 's')
roch <- roc(femalept$cheated, femaleProbabilities, plot=TRUE, legacy.axes=TRUE,
            percent=TRUE, xlab= "False Positive %", ylab = "True Positive %",
            print.auc=TRUE, col = "purple")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(femalept$cheated, knnprobabilities, legacy.axes=TRUE, percent=TRUE,
        print.auc=TRUE, print.auc.y=40, add=TRUE, col="green")
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases

# The AUC of the logistic model is slightly higher than the AUC of the knn
# model, meaning the logistic model was a better classifier and predictor of
# cheating than the knn model with these particular parameters
# 3) Extra Credit
# Intuitively, multinomial logistic regression is better suited when the
# dependent variable has more than two categories.
# Because it is a more general model than logistic regression, it is not
# restricted to just two categories, but this generality may sacrifice
# accuracy when predicting a factor that has only two categories. In the
# case of affairs (cheated: yes or no), plain logistic regression may
# therefore be the better prediction methodology.
predictors <- c("religiousness", "rating", "children")
mnlr <- multinom(affairs ~ religiousness + rating + children, data = pt_training)
## # weights:  30 (20 variable)
## initial  value 860.044545 
## iter  10 value 440.881786
## iter  20 value 432.682851
## final  value 432.675387 
## converged
summary(mnlr)
## Call:
## multinom(formula = affairs ~ religiousness + rating + children, 
##     data = pt_training)
## 
## Coefficients:
##    (Intercept) religiousness      rating  children
## 1   -3.1921662    0.05512222  0.01136019 0.5270719
## 2   -1.8320097   -0.54024377 -0.19677934 1.2481351
## 3   -0.3695357   -0.23342107 -0.59164321 0.3544932
## 7   -0.7870804   -0.30803115 -0.32290123 0.7177399
## 12   1.0872611   -0.31391030 -0.90148369 0.6080867
## 
## Std. Errors:
##    (Intercept) religiousness    rating  children
## 1    1.0564055     0.1759730 0.1946736 0.4874810
## 2    1.3405369     0.2387591 0.2377923 0.7808294
## 3    1.0569691     0.2172405 0.2013462 0.5955001
## 7    0.8756278     0.1639841 0.1615377 0.4783497
## 12   0.8608773     0.1809605 0.1689663 0.5307465
## 
## Residual Deviance: 865.3508 
## AIC: 905.3508
mnlrpredictions <- predict(mnlr, pt_testing[predictors])
mnlrpredictions
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1 2 3 7 12
# The multinomial logistic regression predicted 0 affairs for every test
# observation, which is obviously false. With most respondents reporting 0
# affairs, the majority class dominates, highlighting the issue with using
# such a flexible model here; see the check below.
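# A short illustrative follow-up (not asked for in the prompt): collapsing the
# multinomial output to the binary cheated/not question shows it degenerates
# to always predicting the majority class.
mnlr_binary <- ifelse(mnlrpredictions == "0", 0, 1)
mean(mnlr_binary == pt_testing$cheated)  # equals the share of non-cheaters in the test set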