library(openintro)
data(email)
str(email)
## tibble [3,921 × 21] (S3: tbl_df/tbl/data.frame)
## $ spam : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ to_multiple : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 1 1 ...
## $ from : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ cc : int [1:3921] 0 0 0 0 0 0 0 1 0 0 ...
## $ sent_email : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 2 1 1 ...
## $ time : POSIXct[1:3921], format: "2011-12-31 22:16:41" "2011-12-31 23:03:59" ...
## $ image : num [1:3921] 0 0 0 0 0 0 0 1 0 0 ...
## $ attach : num [1:3921] 0 0 0 0 0 0 0 1 0 0 ...
## $ dollar : num [1:3921] 0 0 4 0 0 0 0 0 0 0 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num [1:3921] 0 0 1 0 0 0 0 0 0 0 ...
## $ viagra : num [1:3921] 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num [1:3921] 0 0 0 0 2 2 0 0 0 0 ...
## $ num_char : num [1:3921] 11.37 10.5 7.77 13.26 1.23 ...
## $ line_breaks : int [1:3921] 202 202 192 255 29 25 193 237 69 68 ...
## $ format : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 2 1 2 ...
## $ re_subj : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ exclaim_subj: num [1:3921] 0 0 0 0 0 0 0 0 0 0 ...
## $ urgent_subj : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ exclaim_mess: num [1:3921] 0 1 6 48 1 1 1 18 1 0 ...
## $ number : Factor w/ 3 levels "none","small",..: 3 2 2 2 1 1 3 2 2 2 ...
Observe that spam
is coded as a factor variable. If we use the as.numeric
wrapper then it turns the numbers to 1’s and 2’s not 0’s and 1’s.
tail(email$spam)
## [1] 0 1 1 0 0 1
## Levels: 0 1
tail(as.numeric(email$spam))
## [1] 1 2 2 1 1 2
# Recode as 0's and 1's
email$spam<-as.numeric(email$spam)-1
tail(email$spam)
## [1] 0 1 1 0 0 1
Predicting spam or not using the presence of “winner”
library(tidyverse)
ggplot(email, aes(winner, fill=factor(spam)))+
geom_bar(position="fill")
What would the problem be with the logic of if winner then spam"?
Predicting spam or not using number of characters (in K)
library(tidyverse)
ggplot(email, aes(num_char, color=factor(spam)))+
geom_density()
Predicting spam or not using log number of characters (in K)
library(tidyverse)
ggplot(email, aes(log(num_char), color=factor(spam)))+
geom_density()
If log(num_char) < 1, then “spam”?
Each simple filter can be thought of as a regression model.
\(spam∼winner;G1∼G2\)
\(spam∼log(num_char);G1∼K1\)
Each one by itself has poor predictive power, so how can we combine them into a single stronger model?
\[spam∼log(num_char)\]
library(dplyr)
email <- mutate(email, log_num_char = log(num_char))
#linear
ggplot(data = email, aes(x = log_num_char, y = spam))+
geom_point()+
geom_smooth(method = "lm",
se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
m1 <- glm(spam ~ log(num_char), data = email, family = "binomial")
# logistic
ggplot(data = email, aes(x = log_num_char, y = spam))+
geom_point()+
geom_line(aes(x = log_num_char, y = m1$fitted), color="blue")
m1 <- glm(spam ~ log(num_char), data = email, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = spam ~ log(num_char), family = "binomial", data = email)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8149 -0.4674 -0.3347 -0.2545 3.0129
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.72435 0.06062 -28.45 <2e-16 ***
## log(num_char) -0.54350 0.03646 -14.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2437.2 on 3920 degrees of freedom
## Residual deviance: 2190.3 on 3919 degrees of freedom
## AIC: 2194.3
##
## Number of Fisher Scoring iterations: 5
Each row of the summary output is still a H-test on that parameter being 0.
A positive slope estimate indicates that there is a positive association.
Each estimate is still conditional on the other variables held constant.
m2 <- glm(spam ~ log(num_char) + to_multiple + attach + dollar + inherit +
viagra, data = email, family = "binomial")
summary(m2)
##
## Call:
## glm(formula = spam ~ log(num_char) + to_multiple + attach + dollar +
## inherit + viagra, family = "binomial", data = email)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9315 -0.4549 -0.3275 -0.2358 3.0335
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.596420 0.064433 -24.777 < 2e-16 ***
## log(num_char) -0.548689 0.038314 -14.321 < 2e-16 ***
## to_multiple1 -1.928889 0.304931 -6.326 2.52e-10 ***
## attach 0.199698 0.065522 3.048 0.00231 **
## dollar -0.004557 0.018978 -0.240 0.81021
## inherit 0.400030 0.151663 2.638 0.00835 **
## viagra 1.736070 40.592963 0.043 0.96589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2437.2 on 3920 degrees of freedom
## Residual deviance: 2106.3 on 3914 degrees of freedom
## AIC: 2120.3
##
## Number of Fisher Scoring iterations: 11
pred1 <- predict(m1, newdata = email, type = "response")
conf_mat1<-data.frame(spam=email$spam, predSpam=pred1>.5)
table(conf_mat1$spam, conf_mat1$predSpam)
##
## FALSE TRUE
## 0 3541 13
## 1 362 5
pred2<-predict(m2, newdata = email, type = "response")
conf_mat2<-data.frame(spam=email$spam, predSpam=pred2>.5)
table(conf_mat2$spam, conf_mat2$predSpam)
##
## FALSE TRUE
## 0 3537 17
## 1 357 10
In the test-train paradigm, you balance descriptive power with predictive accuracy by separating your data set into:
Related to cross-validation…
set.seed(501)
train_indices <- sample(1:nrow(email), size = floor(nrow(email)/2))
train_data <- email %>%
slice(train_indices)
test_data <- email %>%
slice(-train_indices)
m1 <- glm(spam ~ log(num_char), data = train_data, family = "binomial")
m2 <- glm(spam ~ log(num_char) + to_multiple + attach + dollar + inherit +
viagra, data = train_data, family = "binomial")
test1 <- predict(m1, newdata = test_data, type = "response")
test_mat1<-data.frame(spam=test_data$spam, predSpam=test1>.5)
table(test_mat1$spam, test_mat1$predSpam)
##
## FALSE TRUE
## 0 1783 4
## 1 171 3
test2<-predict(m2, newdata = test_data, type = "response")
test_mat2<-data.frame(spam=test_data$spam, predSpam=test2>.5)
table(test_mat2$spam, test_mat2$predSpam)
##
## FALSE TRUE
## 0 1782 5
## 1 171 3