rm(list=ls(all=T))
options(digits=4, scipen=12)
library(magrittr)
議題:用申請假釋者的屬性,預測他會不會違反假釋規定
學習重點:
【1.1 讀進資料】How many parolees are contained in the dataset?
parole = read.csv("data/parole.csv")
str(parole)
'data.frame': 675 obs. of 9 variables:
$ male : int 1 0 1 1 1 1 1 0 0 1 ...
$ race : int 1 1 2 1 2 2 1 1 1 2 ...
$ age : num 33.2 39.7 29.5 22.4 21.6 46.7 31 24.6 32.6 29.1 ...
$ state : int 1 1 1 1 1 1 1 1 1 1 ...
$ time.served : num 5.5 5.4 5.6 5.7 5.4 6 6 4.8 4.5 4.7 ...
$ max.sentence : int 18 12 12 18 12 18 18 12 13 12 ...
$ multiple.offenses: int 0 0 0 0 0 0 0 0 0 0 ...
$ crime : int 4 3 3 1 1 4 3 1 3 2 ...
$ violator : int 0 0 0 0 0 0 0 0 0 0 ...
print("675")
[1] "675"
【1.2 底線機率】How many of the parolees in the dataset violated the terms of their parole?
table(parole$violator)
0 1
597 78
print("78")
[1] "78"
【2.1 類別變數】Which variables in this dataset are unordered factors with at least three levels?
str(parole)
'data.frame': 675 obs. of 9 variables:
$ male : int 1 0 1 1 1 1 1 0 0 1 ...
$ race : int 1 1 2 1 2 2 1 1 1 2 ...
$ age : num 33.2 39.7 29.5 22.4 21.6 46.7 31 24.6 32.6 29.1 ...
$ state : int 1 1 1 1 1 1 1 1 1 1 ...
$ time.served : num 5.5 5.4 5.6 5.7 5.4 6 6 4.8 4.5 4.7 ...
$ max.sentence : int 18 12 12 18 12 18 18 12 13 12 ...
$ multiple.offenses: int 0 0 0 0 0 0 0 0 0 0 ...
$ crime : int 4 3 3 1 1 4 3 1 3 2 ...
$ violator : int 0 0 0 0 0 0 0 0 0 0 ...
table(parole$state)
1 2 3 4
143 120 82 330
table(parole$race)
1 2
389 286
table(parole$crime)
1 2 3 4
315 106 153 101
print("crime, state")
[1] "crime, state"
【2.2 資料框摘要】How does the output of summary() change for a factor variable as compared to a numerical variable?
str(parole)
'data.frame': 675 obs. of 9 variables:
$ male : int 1 0 1 1 1 1 1 0 0 1 ...
$ race : int 1 1 2 1 2 2 1 1 1 2 ...
$ age : num 33.2 39.7 29.5 22.4 21.6 46.7 31 24.6 32.6 29.1 ...
$ state : int 1 1 1 1 1 1 1 1 1 1 ...
$ time.served : num 5.5 5.4 5.6 5.7 5.4 6 6 4.8 4.5 4.7 ...
$ max.sentence : int 18 12 12 18 12 18 18 12 13 12 ...
$ multiple.offenses: int 0 0 0 0 0 0 0 0 0 0 ...
$ crime : int 4 3 3 1 1 4 3 1 3 2 ...
$ violator : int 0 0 0 0 0 0 0 0 0 0 ...
parole$state = as.factor(parole$state)
parole$crime = as.factor(parole$crime)
str(parole)
'data.frame': 675 obs. of 9 variables:
$ male : int 1 0 1 1 1 1 1 0 0 1 ...
$ race : int 1 1 2 1 2 2 1 1 1 2 ...
$ age : num 33.2 39.7 29.5 22.4 21.6 46.7 31 24.6 32.6 29.1 ...
$ state : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
$ time.served : num 5.5 5.4 5.6 5.7 5.4 6 6 4.8 4.5 4.7 ...
$ max.sentence : int 18 12 12 18 12 18 18 12 13 12 ...
$ multiple.offenses: int 0 0 0 0 0 0 0 0 0 0 ...
$ crime : Factor w/ 4 levels "1","2","3","4": 4 3 3 1 1 4 3 1 3 2 ...
$ violator : int 0 0 0 0 0 0 0 0 0 0 ...
print("The output becomes similar to that of the table() function applied to that variable")
[1] "The output becomes similar to that of the table() function applied to that variable"
【3.1 指定隨機種子、依比例分割資料】Roughly what proportion of parolees have been allocated to the training and testing sets?
set.seed(144)
library(caTools)
split = sample.split(parole$violator, SplitRatio = 0.7)
train = subset(parole, split == TRUE)
test = subset(parole, split == FALSE)
nrow(train)
[1] 473
nrow(test)
[1] 202
473/(473+202)
[1] 0.7007
【3.2 隨機種子的功用】Now, suppose you re-ran lines [1]-[5] of Problem 3.1. What would you expect? If you instead ONLY re-ran lines [3]-[5], what would you expect? If you instead called set.seed() with a different number and then re-ran lines [3]-[5] of Problem 3.1, what would you expect?
set.seed(144)
library(caTools)
split = sample.split(parole$violator, SplitRatio = 0.7)
train = subset(parole, split == TRUE)
test = subset(parole, split == FALSE)
nrow(train)
[1] 473
nrow(test)
[1] 202
print("Question 1 : Now, suppose you re-ran lines [1]-[5] of Problem 3.1. What would you expect?
The exact same training/testing set split as the first execution of [1]-[5]")
[1] "Question 1 : Now, suppose you re-ran lines [1]-[5] of Problem 3.1. What would you expect?\n The exact same training/testing set split as the first execution of [1]-[5]"
split = sample.split(parole$violator, SplitRatio = 0.7)
train = subset(parole, split == TRUE)
test = subset(parole, split == FALSE)
print("Question 2 : If you instead ONLY re-ran lines [3]-[5], what would you expect?
A different training/testing set split from the first execution of [1]-[5] ")
[1] "Question 2 : If you instead ONLY re-ran lines [3]-[5], what would you expect?\n A different training/testing set split from the first execution of [1]-[5] "
set.seed(50)
library(caTools)
split = sample.split(parole$violator, SplitRatio = 0.7)
train = subset(parole, split == TRUE)
test = subset(parole, split == FALSE)
print("Question 3 :
If you instead called set.seed() with a different number and then re-ran lines [3]-[5] of Problem 3.1, what would you expect?
A different training/testing set split from the first execution of [1]-[5] ")
[1] "Question 3 : \nIf you instead called set.seed() with a different number and then re-ran lines [3]-[5] of Problem 3.1, what would you expect?\n\n A different training/testing set split from the first execution of [1]-[5] "
【4.1 顯著性】What variables are significant in this model?
set.seed(144)
library(caTools)
split = sample.split(parole$violator, SplitRatio = 0.7)
train = subset(parole, split == TRUE)
test = subset(parole, split == FALSE)
LRmodel = glm(violator~.,data = train, family = "binomial")
model1 = LRmodel
summary(LRmodel)
Call:
glm(formula = violator ~ ., family = "binomial", data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.704 -0.424 -0.272 -0.169 2.837
Coefficients:
Estimate Std. Error z value
(Intercept) -4.241157 1.293885 -3.28
male 0.386990 0.437961 0.88
race 0.886719 0.395066 2.24
age -0.000176 0.016085 -0.01
state2 0.443301 0.481662 0.92
state3 0.834980 0.556270 1.50
state4 -3.396788 0.611586 -5.55
time.served -0.123887 0.120423 -1.03
max.sentence 0.080295 0.055375 1.45
multiple.offenses 1.611992 0.385305 4.18
crime2 0.683714 0.500355 1.37
crime3 -0.278105 0.432836 -0.64
crime4 -0.011763 0.571304 -0.02
Pr(>|z|)
(Intercept) 0.001 **
male 0.377
race 0.025 *
age 0.991
state2 0.357
state3 0.133
state4 0.000000028 ***
time.served 0.304
max.sentence 0.147
multiple.offenses 0.000028683 ***
crime2 0.172
crime3 0.521
crime4 0.984
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 340.04 on 472 degrees of freedom
Residual deviance: 251.48 on 460 degrees of freedom
AIC: 277.5
Number of Fisher Scoring iterations: 6
print("race, state4, multiple.offenses")
[1] "race, state4, multiple.offenses"
【4.2 從回歸係數估計邊際效用】What can we say based on the coefficient of the multiple.offenses variable?
print("For parolees A and B who are identical other than A having committed multiple offenses, the predicted log odds of A is 1.61 more than the predicted log odds of B. Then we have:
ln(odds of A) = ln(odds of B) + 1.61
exp(ln(odds of A)) = exp(ln(odds of B) + 1.61)
exp(ln(odds of A)) = exp(ln(odds of B)) * exp(1.61)
odds of A = exp(1.61) * odds of B
odds of A= 5.01 * odds of B")
[1] "For parolees A and B who are identical other than A having committed multiple offenses, the predicted log odds of A is 1.61 more than the predicted log odds of B. Then we have:\n\nln(odds of A) = ln(odds of B) + 1.61\nexp(ln(odds of A)) = exp(ln(odds of B) + 1.61)\nexp(ln(odds of A)) = exp(ln(odds of B)) * exp(1.61)\nodds of A = exp(1.61) * odds of B\nodds of A= 5.01 * odds of B"
print("Our model predicts that a parolee who committed multiple offenses has 5.01 times higher odds of being a violator than a parolee who did not commit multiple offenses but is otherwise identical.")
[1] "Our model predicts that a parolee who committed multiple offenses has 5.01 times higher odds of being a violator than a parolee who did not commit multiple offenses but is otherwise identical."
【4.3 從預測值估計勝率和機率】According to the model, what are the odds this individual is a violator? What is the probability this individual is a violator?
print("log(odds) = -4.2411574 + 0.3869904*male + 0.8867192*race - 0.0001756*age + 0.4433007*state2 + 0.8349797*state3 - 3.3967878*state4 - 0.1238867*time.served + 0.0802954*max.sentence + 1.6119919*multiple.offenses + 0.6837143*crime2 - 0.2781054*crime3 - 0.0117627*crime4")
[1] "log(odds) = -4.2411574 + 0.3869904*male + 0.8867192*race - 0.0001756*age + 0.4433007*state2 + 0.8349797*state3 - 3.3967878*state4 - 0.1238867*time.served + 0.0802954*max.sentence + 1.6119919*multiple.offenses + 0.6837143*crime2 - 0.2781054*crime3 - 0.0117627*crime4"
exp(-4.2411574 + 0.3869904*1 + 0.8867192*1 - 0.0001756*50 + 0.4433007*0 + 0.8349797*0 - 3.3967878*0 - 0.1238867*3 + 0.0802954*12 + 1.6119919*0 + 0.6837143*1 - 0.2781054*0 - 0.0117627*0)
[1] 0.1826
1/(1+exp(1.700629))
[1] 0.1544
print("odds ratio = 0.1826")
[1] "odds ratio = 0.1826"
print("probability = 0.1544")
[1] "probability = 0.1544"
§ 5.1 What is the maximum predicted probability of a violation?
predtest1 = predict(model1, newdata = test, type = "response")
max(predtest1)
[1] 0.9073
§ 5.2 What is the model’s sensitivity, specificity, accuracy?
table(test$violator, predtest1 >= 0.5)
FALSE TRUE
0 167 12
1 11 12
sensitivity = 12/(11+12)
specificity = 167/(167+12)
accuracy = (167+12)/(167+12+12+11)
print("sensitivity = 0.5217391")
[1] "sensitivity = 0.5217391"
print("specificity = 0.9329609")
[1] "specificity = 0.9329609"
print("accuracy = 0.8861386")
[1] "accuracy = 0.8861386"
§ 5.3 What is the accuracy of a simple model that predicts that every parolee is a non-violator?
table(test$violator)
0 1
179 23
print("accuracy = 179/(179+23) = 0.8861386")
[1] "accuracy = 179/(179+23) = 0.8861386"
§ 5.4 Which of the following most likely describes their preferences and best course of action?
print("The board assigns more cost to a false negative than a false positive, and should therefore use a logistic regression cutoff less than 0.5. ")
[1] "The board assigns more cost to a false negative than a false positive, and should therefore use a logistic regression cutoff less than 0.5. "
§ 5.5 Which of the following is the most accurate assessment of the value of the logistic regression model with a cutoff 0.5 to a parole board, based on the model’s accuracy as compared to the simple baseline model?
print("The model is likely of value to the board, and using a different logistic regression cutoff is likely to improve the model's value")
[1] "The model is likely of value to the board, and using a different logistic regression cutoff is likely to improve the model's value"
§ 5.6 Using the ROCR package, what is the AUC value for the model?
rocrpred = prediction(predtest1, test$violator)
auc = as.numeric(performance(rocrpred, "auc")@y.values)
§ 5.7 Describe the meaning of AUC in this context.
print("The probability the model can correctly differentiate between a randomly selected parole violator and a randomly selected parole non-violator.")
[1] "The probability the model can correctly differentiate between a randomly selected parole violator and a randomly selected parole non-violator."
§ 6.1 How could we improve our dataset to best address selection bias?
print("We should use a dataset tracking a group of parolees from the start of their parole until either they violated parole or they completed their term.")
[1] "We should use a dataset tracking a group of parolees from the start of their parole until either they violated parole or they completed their term."