library (caret)
## Loading required package: ggplot2
## Loading required package: lattice
library (ggplot2)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library (class)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(regclass)
## Loading required package: bestglm
## Loading required package: leaps
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following objects are masked from 'package:psych':
##
## fisherz, logistic, logit
## The following object is masked from 'package:caret':
##
## predictors
## Loading required package: rpart
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
##
## Attaching package: 'regclass'
## The following object is masked from 'package:lattice':
##
## qq
library (pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library (nnet)
#1
pt = read.csv("affair_pt.csv")
pt$occupation <- as.factor(pt$occupation)
pt$gender <- ifelse(pt$gender=="male", 1, 0)
pt$children <- ifelse(pt$children=="yes", 1, 0)
pt$cheated <- ifelse(pt$affairs>0, 1, 0)
pt$affairs <- as.factor(pt$affairs)
#1b)
# Examine whether collinearity exists in the dataset by a correlation matrix.
#Let 0.6 be your cutoff for determining collinearity. what are two ways to deal with collinearity?
# pt Correlation Matrix 1 must not include occupation and affairs because they are categorical, input must be numeric
#setdiff: find differences between two sets
ptcort1 <- pt[, setdiff(names(pt), c("occupation", "affairs"))]
# Prints a boolean matrix if collinearity exists
cor(ptcort1) > 0.6
## gender age yearsmarried children religiousness education rating
## gender TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## age FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## yearsmarried FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## children FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## religiousness FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## education FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## rating FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## cheated FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## cheated
## gender FALSE
## age FALSE
## yearsmarried FALSE
## children FALSE
## religiousness FALSE
## education FALSE
## rating FALSE
## cheated TRUE
# collinearity exists between age and years married
# Correlation plot to confirm results
corPlot(ptcort1)

# One way to deal with collinearity is to make a combined variable
# that eliminates the inherent collinearity between the 2 related variables
ptmarriage_age <- data.frame(0.5*pt$age + 0.5*pt$yearsmarried)
pt <- cbind(pt, ptmarriage_age)
names(pt)[length (pt)] <- "Marriage Age"
ptcort2 <- pt[,setdiff(names (pt), c("occupation", "affairs", "age", "yearsmarried"))]
# Correlation matrix 2 calculation
cor(ptcort2) > 0.6
## gender children religiousness education rating cheated
## gender TRUE FALSE FALSE FALSE FALSE FALSE
## children FALSE TRUE FALSE FALSE FALSE FALSE
## religiousness FALSE FALSE TRUE FALSE FALSE FALSE
## education FALSE FALSE FALSE TRUE FALSE FALSE
## rating FALSE FALSE FALSE FALSE TRUE FALSE
## cheated FALSE FALSE FALSE FALSE FALSE TRUE
## Marriage Age FALSE FALSE FALSE FALSE FALSE FALSE
## Marriage Age
## gender FALSE
## children FALSE
## religiousness FALSE
## education FALSE
## rating FALSE
## cheated FALSE
## Marriage Age TRUE
corPlot (ptcort2)

# 2 ways to deal with collinearity include:
# - Remove one or more of the correlated predictors
# - Regularization techniques that reduce the impact of highly correlated
#predictors, such as lasso regression (setting some of the coefficients to 0)
# 1c)
# Set_seed=2021. Split the dataset randomly into 80% training and 20% testing.
set.seed(2021)
training_percent <- 0.8
ptindex_break <- sort(sample(nrow(pt), nrow(pt)*training_percent))
pt_training <- pt[ptindex_break,]
pt_testing <- pt[-ptindex_break,]
# 1d)
# After accounting for the collinearity, build a simple model with
#"cheated" as the response variable and all other appropriate variables.
# Simple model appropriate variables should not include affairs
#(=cheated, not predictor, age and yrs married introduce collinearity)
simple_model <- glm(cheated ~., data = subset(pt_training, select = c(-affairs, -age, -yearsmarried)), family = binomial)
summary(simple_model)
##
## Call:
## glm(formula = cheated ~ ., family = binomial, data = subset(pt_training,
## select = c(-affairs, -age, -yearsmarried)))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4993 -0.7695 -0.5839 0.9606 2.3220
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.772429 1.038907 -0.744 0.4572
## gender -0.166875 0.271277 -0.615 0.5385
## children 0.710989 0.309989 2.294 0.0218 *
## religiousness -0.246119 0.098431 -2.500 0.0124 *
## education 0.070401 0.059810 1.177 0.2392
## occupation2 0.962637 0.804365 1.197 0.2314
## occupation3 0.650915 0.490897 1.326 0.1848
## occupation4 1.059432 0.445551 2.378 0.0174 *
## occupation5 0.272522 0.371679 0.733 0.4634
## occupation6 0.202020 0.451861 0.447 0.6548
## occupation7 0.362631 0.753645 0.481 0.6304
## rating -0.434288 0.099472 -4.366 1.27e-05 ***
## `Marriage Age` 0.006357 0.018791 0.338 0.7351
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 544.19 on 479 degrees of freedom
## Residual deviance: 498.65 on 467 degrees of freedom
## AIC: 524.65
##
## Number of Fisher Scoring iterations: 4
# le)
# Build a model with a trimmed selection of feature vectors and
# use it to predict labels on the testing set. Explain your choice
# of predictor variables. Look at the VIF and determine if these choices are acceptable.
# Based on simple model summary, only children, religiousness, occupation 4,
# and rating are significant predictors, as they have p-value less than 0.05
pttrim_model <- glm(cheated ~ children +religiousness + occupation +rating,
family = binomial, data = pt_training)
summary (pttrim_model)
##
## Call:
## glm(formula = cheated ~ children + religiousness + occupation +
## rating, family = binomial, data = pt_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5085 -0.7723 -0.5823 0.9021 2.2499
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.29373 0.57938 0.507 0.61217
## children 0.75941 0.27822 2.730 0.00634 **
## religiousness -0.24413 0.09619 -2.538 0.01115 *
## occupation2 0.69493 0.76127 0.913 0.36132
## occupation3 0.54858 0.47050 1.166 0.24364
## occupation4 1.03142 0.41803 2.467 0.01361 *
## occupation5 0.34692 0.33823 1.026 0.30504
## occupation6 0.33416 0.35846 0.932 0.35121
## occupation7 0.43684 0.70973 0.615 0.53823
## rating -0.42244 0.09583 -4.408 1.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 544.19 on 479 degrees of freedom
## Residual deviance: 500.27 on 470 degrees of freedom
## AIC: 520.27
##
## Number of Fisher Scoring iterations: 4
VIF (pttrim_model)
## GVIF Df GVIF^(1/(2*Df))
## children 1.094506 1 1.046186
## religiousness 1.025624 1 1.012731
## occupation 1.097432 6 1.007778
## rating 1.049423 1 1.024414
# Generally speaking, a VIF (variance inflation factor), close to 1
# indicates that there is no presence of multicollinearity. A VIF
# score outside the bounds of 0.25 and 4 indicate multicollinearity
# might exist and outside the bounds of 0.1 and 10 indicates
#that multicollinearity does exist. Given the VIF's of all the
# chosen predictors close to 1, it can be determined that the choices of
# predictors in the trimmed model are acceptable.
# 1f)
# Run the trimmed model on testing data. Plot confusion matrix.
# comment on whether or not the model is a good model and comment on
# things that can be improved.
pt_probabilities <- predict(pttrim_model, pt_testing, type="response")
pt_predictions <- ifelse(pt_probabilities >= 0.5, 1, 0)
pt_cm <- confusionMatrix(reference=as.factor(pt_testing$cheated), data = as.factor(pt_predictions))
pt_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 92 24
## 1 1 4
##
## Accuracy : 0.7934
## 95% CI : (0.7103, 0.8616)
## No Information Rate : 0.7686
## P-Value [Acc > NIR] : 0.2999
##
## Kappa : 0.1853
##
## Mcnemar's Test P-Value : 1.083e-05
##
## Sensitivity : 0.9892
## Specificity : 0.1429
## Pos Pred Value : 0.7931
## Neg Pred Value : 0.8000
## Prevalence : 0.7686
## Detection Rate : 0.7603
## Detection Prevalence : 0.9587
## Balanced Accuracy : 0.5661
##
## 'Positive' Class : 0
##
# Based on the confusion matrix, this model does an okay job, with an
# approximate 79% total accuracy it seems that the model does a great job
# predicting the behaviors of non-cheaters. A sensitivity of approximately 99%
#reveals it does a great job of predicting whether or not that person cheated or not.
#However, the model is very poor at predicting that someone is a cheater given
#that they are a cheater. Out of the 28 cheaters, the model only predicted that
#4 of them were cheaters, and hence the extremely low specificity rate
#and an extremely high false rate of loyalty.
2) exploring the af_RB Dataset
rb <- read.csv("/Users/jianingjin/Desktop/IEMS_304/lab7/affair_rb.csv")
# 2a)
# Repeat Steps for RB Dataset
rb$children <- ifelse(rb$children>0, 1, 0)
rb$occupation <- as.factor(rb$occupation)
rb$occupation_husb <- as.factor(rb$occupation_husb)
# Need to ignore categorical vars when calculating correlation
rbcort1 <- rb[,setdiff(names(rb), c("occupation", "occupation_husb"))]
cor(rbcort1) > 0.6
## rate_marriage age yrs_married children religious educ affair
## rate_marriage TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## age FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## yrs_married FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## children FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## religious FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## educ FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## affair FALSE FALSE FALSE FALSE FALSE FALSE TRUE
# Age and yrs_married present collinearity issues so I combined into marriage
# age again
corPlot (rbcort1)

rbmarriage_age <- data.frame (0.5*rb$age + 0.5*rb$yrs_married)
rb <- cbind(rb, rbmarriage_age)
names (rb) [length (rb)] <- "Marriage Age"
rbcort2 <- rb[, setdiff (names(rb), c("occupation", "occupation_husb", "age", "yrs_married"))]
cor (rbcort2) > 0.6
## rate_marriage children religious educ affair Marriage Age
## rate_marriage TRUE FALSE FALSE FALSE FALSE FALSE
## children FALSE TRUE FALSE FALSE FALSE FALSE
## religious FALSE FALSE TRUE FALSE FALSE FALSE
## educ FALSE FALSE FALSE TRUE FALSE FALSE
## affair FALSE FALSE FALSE FALSE TRUE FALSE
## Marriage Age FALSE FALSE FALSE FALSE FALSE TRUE
corPlot (rbcort2)

# 2b)
# Set seed=2021. Split the dataset randomly into 80% training and 20% testing
set.seed (2021)
training_percent <- 0.8
rbindex_break <- sort(sample(nrow(rb), nrow(rb)*training_percent))
rb_training <- rb[rbindex_break,]
rb_testing <- rb[-rbindex_break,]
#2c)
# Naive model with all parameters. You can choose either using
# (children) or have_children for your
# predictor variable. You can experiment with both and comment on
# your insight with the two different approaches.
naive_model <- glm(affair ~., data = subset(rb_training, select = c(-age, -yrs_married)), family = binomial)
summary (naive_model)
##
## Call:
## glm(formula = affair ~ ., family = binomial, data = subset(rb_training,
## select = c(-age, -yrs_married)))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3711 -0.8219 -0.5688 1.0145 2.3343
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.889240 0.632964 2.985 0.00284 **
## rate_marriage -0.692802 0.035709 -19.402 < 2e-16 ***
## children 0.638495 0.087256 7.317 2.53e-13 ***
## religious -0.384152 0.038911 -9.873 < 2e-16 ***
## educ -0.026931 0.018914 -1.424 0.15449
## occupation2 0.225955 0.520571 0.434 0.66425
## occupation3 0.516095 0.514297 1.003 0.31562
## occupation4 0.280443 0.515927 0.544 0.58674
## occupation5 0.871116 0.519881 1.676 0.09382 .
## occupation6 0.790941 0.569869 1.388 0.16516
## occupation_husb2 -0.011029 0.209294 -0.053 0.95797
## occupation_husb3 0.162443 0.226559 0.717 0.47337
## occupation_husb4 -0.023977 0.203883 -0.118 0.90638
## occupation_husb5 0.041642 0.205104 0.203 0.83911
## occupation_husb6 0.056407 0.227078 0.248 0.80382
## `Marriage Age` 0.027725 0.005768 4.806 1.54e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6380.4 on 5091 degrees of freedom
## Residual deviance: 5543.1 on 5076 degrees of freedom
## AIC: 5575.1
##
## Number of Fisher Scoring iterations: 4
# 2d)
# Similar to first model, create a modified one with skimmed down predictor vars
rbtrim_model <- glm(affair ~ rate_marriage + children + religious + `Marriage Age`,
family = binomial, data = rb_training)
summary (rbtrim_model)
##
## Call:
## glm(formula = affair ~ rate_marriage + children + religious +
## `Marriage Age`, family = binomial, data = rb_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2030 -0.8389 -0.5988 1.0386 2.3414
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.945361 0.187286 10.387 < 2e-16 ***
## rate_marriage -0.694068 0.035415 -19.598 < 2e-16 ***
## children 0.598800 0.085161 7.031 2.04e-12 ***
## religious -0.384001 0.038642 -9.937 < 2e-16 ***
## `Marriage Age` 0.031563 0.005656 5.580 2.41e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6380.4 on 5091 degrees of freedom
## Residual deviance: 5590.8 on 5087 degrees of freedom
## AIC: 5600.8
##
## Number of Fisher Scoring iterations: 4
VIF (rbtrim_model)
## rate_marriage children religious `Marriage Age`
## 1.004625 1.432398 1.044589 1.441662
# Generally speaking, a VIF (variance inflation factor), close to 1
# indicates that there is no presence of multicollinearity. A VIF
# score outside the bounds of 0.25 and 4 indicate multicollinearity
# might exist and outside the bounds of 0.1 and 10 indicates
# that multicollinearity does exist. Given the VIF's of all the chosen
# predictors are close to 1, it can be determined that the choices of predictors
# in the skimmed down model are acceptable.
# 2e)
# Use the same parameters but use the entire af_RB as a training set and
# use af_PT as a testing set
# (note this means you the predictor variables you chose have to match so
# if they don't, just build the model without the ones that don't exist).
# Evaluate this model's performance.
names (rb) [1] <- "rating"
names (rb) [5] <- "religiousness"
names (rb) [length(rb)-1] <-"cheated"
entire_rb_training <- glm(cheated ~ religiousness + rating + children,
family = binomial, data = rb)
femalept <- pt[pt$gender==0,]
femaleProbabilities <- predict(entire_rb_training, femalept, type="response")
femalePredictions <- ifelse(femaleProbabilities >= 0.5, 1, 0)
female_cm <- confusionMatrix(reference=as.factor(femalept$cheated),
data=as.factor(femalePredictions))
female_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 204 39
## 1 39 33
##
## Accuracy : 0.7524
## 95% CI : (0.7009, 0.7991)
## No Information Rate : 0.7714
## P-Value [Acc > NIR] : 0.8093
##
## Kappa : 0.2978
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8395
## Specificity : 0.4583
## Pos Pred Value : 0.8395
## Neg Pred Value : 0.4583
## Prevalence : 0.7714
## Detection Rate : 0.6476
## Detection Prevalence : 0.7714
## Balanced Accuracy : 0.6489
##
## 'Positive' Class : 0
##
# Once again, this model does an okay job with prediction as the
# overall accuracy rate is about 75%. This model has a relatively high
# sensitivity at about 84% and a higher specificity than previous model,
# although still relatively low, at about 46%. These results are intuitive
# as the act of cheating is rather erratic and unpredictable behavior, SO
# it may be more difficult to model. One way to improve this model would be
# to look for different predictors that do a better job of predicting
# infidelity when someone has had affairs.
#2f
# You work at a firm with its main service being processing social
# data and selling that data to other companies. One project is on
# the likelihood of a person having affairs based on his/her data on
# social media. You'd like to create a primitive model based on the data
# in af_RB and af_PT for prediction. If you classify an innocent person
# as having affairs, your company will get sued for defamation (very costly)
# so your boss wants you to plot a graph of Type I, II error, overall
# accuracy curve versus threshold parameter between 0,1 with intervals 0.05.
thresholds <- seq(0,1,0.05)
overall_error <- rep(0,21)
t1e <- rep(0,21)
t2e <- rep(0,21)
index = 1
for (i in thresholds) {
femalePredictions <- factor(ifelse(femaleProbabilities >= i, 1, 0), levels=c(0,1))
cmf <- confusionMatrix(reference=as.factor(femalept$cheated), data=as.factor(femalePredictions))
# Overall error
overall_error[index] = 1 - cmf$overall["Accuracy"]
# Type 1 Error: False Positive Rate (false loyalty rate, rate at
# which cheaters are predicted to have been loyal)
t1e[index] <- 1 - cmf$byClass["Specificity"]
# Type 2 Error: False Negative Rate (false cheating rate, rate at
# which non-cheaters are predicted to have cheated)
# Defamation implications
t2e[index] <- 1 - cmf$byClass["Sensitivity"]
index = index + 1
}
plots <- data.frame(thresholds, overall_error, t1e, t2e)
ggplot (plots, aes(thresholds)) +
geom_line (aes (y=overall_error), color="purple") +
geom_line (aes (y=t1e), color="red") +
geom_line (aes (y=t2e), color="blue") +
ylab ("Error Level") +
xlab ("Threshold Level") +
ggtitle("Overall Accuracy vs. Different Threshold Parameters")

# 2h)
# Train a similar knn model with k = 5 using the caret package and compare auc
# of knn model and the previous logistic regression model.
predictors <- c("religiousness", "rating", "children")
knnpredictions <- knn(as.matrix(rb[predictors]), femalept[predictors],
cl=as.matrix(rb["cheated"]), k = 5, prob=TRUE)
knnprobabilities <- attributes (knnpredictions)$prob
par (pty='s')
roch <- roc(femalept$cheated, femaleProbabilities, plot=TRUE, legacy.axes=TRUE,
percent=TRUE, xlab= "False Positive %", ylab = "True Positive %",
print.auc=TRUE, col = "purple")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(femalept$cheated, knnprobabilities, legacy.axes=TRUE, percent=TRUE,
print.auc=TRUE, print.auc.y=40, add=TRUE, col="green")
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases

# The AUC of the logistic model is slightly higher than the AUC fo the knn
# model, meaning the logistic model was a better classifier and
# predictor of cheating than the knn model and its specific parameters
# 3.) Extra Credit
# Intuitively, multinomial logistic regression is better suited when the
# dependent variable has more than two categories
# Given that it is a more general model than logistic regression, it is
# not restricted to just two categories, but this may
# sacrifice the accuracy when predicting a factor that has 2 categories.
# In the case of affairs,
# cheated: yes or no, which could present a problem for the accuracy of
# predictions since it has just 2 categories and logistic
# regression may be a better prediction methodology.
predictors <- c("religiousness", "rating", "children")
mnlr <- multinom(affairs~religiousness+rating+children, data = pt_training)
## # weights: 30 (20 variable)
## initial value 860.044545
## iter 10 value 440.881786
## iter 20 value 432.682851
## final value 432.675387
## converged
summary (mnlr)
## Call:
## multinom(formula = affairs ~ religiousness + rating + children,
## data = pt_training)
##
## Coefficients:
## (Intercept) religiousness rating children
## 1 -3.1921662 0.05512222 0.01136019 0.5270719
## 2 -1.8320097 -0.54024377 -0.19677934 1.2481351
## 3 -0.3695357 -0.23342107 -0.59164321 0.3544932
## 7 -0.7870804 -0.30803115 -0.32290123 0.7177399
## 12 1.0872611 -0.31391030 -0.90148369 0.6080867
##
## Std. Errors:
## (Intercept) religiousness rating children
## 1 1.0564055 0.1759730 0.1946736 0.4874810
## 2 1.3405369 0.2387591 0.2377923 0.7808294
## 3 1.0569691 0.2172405 0.2013462 0.5955001
## 7 0.8756278 0.1639841 0.1615377 0.4783497
## 12 0.8608773 0.1809605 0.1689663 0.5307465
##
## Residual Deviance: 865.3508
## AIC: 905.3508
mnlrpredictions <- predict(mnlr, pt_testing[predictors])
mnlrpredictions
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1 2 3 7 12
#The multinomial logistic regression predicted no cheating, which is
#obviously false, highlighting the issue with using such a flexible model