library(openintro)
data(email)
head(email)
##   spam to_multiple from cc sent_email                time image attach
## 1    0           0    1  0          0 2011-12-31 22:16:41     0      0
## 2    0           0    1  0          0 2011-12-31 23:03:59     0      0
## 3    0           0    1  0          0 2012-01-01 08:00:32     0      0
## 4    0           0    1  0          0 2012-01-01 01:09:49     0      0
## 5    0           0    1  0          0 2012-01-01 02:00:01     0      0
## 6    0           0    1  0          0 2012-01-01 02:04:46     0      0
##   dollar winner inherit viagra password num_char line_breaks format
## 1      0     no       0      0        0    11.37         202      1
## 2      0     no       0      0        0    10.50         202      1
## 3      4     no       1      0        0     7.77         192      1
## 4      0     no       0      0        0    13.26         255      1
## 5      0     no       0      0        2     1.23          29      0
## 6      0     no       0      0        2     1.09          25      0
##   re_subj exclaim_subj urgent_subj exclaim_mess number
## 1       0            0           0            0    big
## 2       0            0           0            1  small
## 3       0            0           0            6  small
## 4       0            0           0           48  small
## 5       0            0           0            1   none
## 6       0            0           0            1   none
# Where did this data come from? How was it collected?
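One way to answer is to consult the dataset's help page, which documents the source of these emails:
?email   # opens the openintro documentation for this dataset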
Predicting spam or not using the presence of “winner”
If “winner” then “spam”?
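A quick cross-tabulation shows how this one-word filter would perform on its own; a minimal sketch using base R:
# Tabulate the winner indicator against the spam label to see how
# often the rule "winner implies spam" would be right.
table(winner = email$winner, spam = email$spam)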
Predicting spam or not using number of characters (in K)
Predicting spam or not using log number of characters (in K)
If log(num_char) < 1, then “spam”?
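The character-count rule can be checked the same way; a sketch:
# How often would flagging emails with log(num_char) < 1 catch spam?
with(email, table(spam = spam, flagged = log(num_char) < 1))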
Each simple filter can be thought of as a regression model.
\(spam \sim winner; \quad G_1 \sim G_2\)
\(spam \sim \log(num\_char); \quad G_1 \sim K_1\)
Each one by itself has poor predictive power, so how can we combine them into a single stronger model?
\[spam \sim \log(num\_char)\]
library(dplyr)
library(ggplot2)
email <- mutate(email, log_num_char = log(num_char))

# linear
qplot(x = log_num_char, y = spam, data = email,
      geom = "point", alpha = I(.1), ylab = "spam") +
  stat_smooth(method = "lm", se = FALSE)

# logistic
qplot(x = log_num_char, y = spam, data = email,
      geom = "point", alpha = I(.1), ylab = "spam") +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)
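Note that qplot() is deprecated in current releases of ggplot2; an equivalent ggplot() call for the logistic fit, as a sketch:
ggplot(email, aes(x = log_num_char, y = spam)) +
  geom_point(alpha = .1) +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)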
m1 <- glm(spam ~ log(num_char), data = email, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = spam ~ log(num_char), family = "binomial", data = email)
##
## Deviance Residuals:
##    Min      1Q  Median      3Q     Max  
## -1.815  -0.467  -0.335  -0.255   3.013  
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.7244     0.0606   -28.4   <2e-16 ***
## log(num_char)  -0.5435     0.0365   -14.9   <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2437.2 on 3920 degrees of freedom
## Residual deviance: 2190.3 on 3919 degrees of freedom
## AIC: 2194
##
## Number of Fisher Scoring iterations: 5
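The slope is easier to read on the odds scale; a short sketch:
# Exponentiating the slope gives the multiplicative change in the
# odds of spam per one-unit increase in log(num_char):
exp(coef(m1))
# exp(-0.5435) is about 0.58: longer emails have lower odds of spam.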
m2 <- glm(spam ~ log(num_char) + to_multiple + attach + dollar + inherit +
viagra, data = email, family = "binomial")
summary(m2)
##
## Call:
## glm(formula = spam ~ log(num_char) + to_multiple + attach + dollar +
## inherit + viagra, family = "binomial", data = email)
##
## Deviance Residuals:
##    Min      1Q  Median      3Q     Max  
## -1.931  -0.455  -0.328  -0.236   3.034  
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.59642    0.06443  -24.78  < 2e-16 ***
## log(num_char) -0.54869    0.03831  -14.32  < 2e-16 ***
## to_multiple   -1.92889    0.30493   -6.33  2.5e-10 ***
## attach         0.19970    0.06552    3.05   0.0023 ** 
## dollar        -0.00456    0.01898   -0.24   0.8102    
## inherit        0.40003    0.15166    2.64   0.0083 ** 
## viagra         1.73607   40.59296    0.04   0.9659    
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2437.2 on 3920 degrees of freedom
## Residual deviance: 2106.3 on 3914 degrees of freedom
## AIC: 2120
##
## Number of Fisher Scoring iterations: 11
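Because m1 is nested inside m2, a drop-in-deviance (likelihood ratio) test compares them directly; a sketch:
# The residual deviance drops from 2190.3 to 2106.3 (a change of 84
# on 5 degrees of freedom); anova() computes the chi-square test.
anova(m1, m2, test = "Chisq")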
pred1 <- predict(m1, newdata = email, type = "response")
conf_mat1 <- data.frame(spam = email$spam, predSpam = pred1 > .5) %>%
  group_by(spam, predSpam) %>%
  summarise(n = n())
conf_mat1
## # A tibble: 4 x 3
## # Groups:   spam [2]
##    spam predSpam     n
##   <dbl> <lgl>    <int>
## 1     0 FALSE     3541
## 2     0 TRUE        13
## 3     1 FALSE      362
## 4     1 TRUE         5
pred2 <- predict(m2, newdata = email, type = "response")
conf_mat2 <- data.frame(spam = email$spam, predSpam = pred2 > .5) %>%
  group_by(spam, predSpam) %>%
  summarise(n = n())
conf_mat2
## # A tibble: 4 x 3
## # Groups:   spam [2]
##    spam predSpam     n
##   <dbl> <lgl>    <int>
## 1     0 FALSE     3537
## 2     0 TRUE        17
## 3     1 FALSE      357
## 4     1 TRUE        10
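A single misclassification rate summarizes each confusion matrix and makes the two models easier to compare; a minimal sketch:
# Proportion of emails misclassified at the 0.5 threshold.
mean((pred1 > .5) != (email$spam == 1))  # m1: (13 + 362) / 3921
mean((pred2 > .5) != (email$spam == 1))  # m2: (17 + 357) / 3921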
In the test-train paradigm, you balance descriptive power with predictive accuracy by separating your data set into:
A training set, used to fit the model
A testing set, used to assess how well the model predicts data it has never seen
This idea is closely related to cross-validation.
set.seed(501)
train_indices <- sample(1:nrow(email), size = floor(nrow(email)/2))
train_data <- email %>%
  slice(train_indices)
test_data <- email %>%
  slice(-train_indices)
m1 <- glm(spam ~ log(num_char), data = train_data, family = "binomial")
m2 <- glm(spam ~ log(num_char) + to_multiple + attach + dollar + inherit +
viagra, data = train_data, family = "binomial")
test1 <- predict(m1, newdata = test_data, type = "response")
test_mat1 <- data.frame(spam = test_data$spam, predSpam = test1 > .5) %>%
  group_by(spam, predSpam) %>%
  summarise(n = n())
test_mat1
## # A tibble: 4 x 3
## # Groups:   spam [2]
##    spam predSpam     n
##   <dbl> <lgl>    <int>
## 1     0 FALSE     1782
## 2     0 TRUE         6
## 3     1 FALSE      172
## 4     1 TRUE         1
test2 <- predict(m2, newdata = test_data, type = "response")
test_mat2 <- data.frame(spam = test_data$spam, predSpam = test2 > .5) %>%
  group_by(spam, predSpam) %>%
  summarise(n = n())
test_mat2
## # A tibble: 4 x 3
## # Groups:   spam [2]
##    spam predSpam     n
##   <dbl> <lgl>    <int>
## 1     0 FALSE     1780
## 2     0 TRUE         8
## 3     1 FALSE      171
## 4     1 TRUE         2
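The same misclassification summary on the held-out half shows whether m2's extra predictors help out of sample; a sketch:
mean((test1 > .5) != (test_data$spam == 1))  # m1: (6 + 172) / 1961
mean((test2 > .5) != (test_data$spam == 1))  # m2: (8 + 171) / 1961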
A GLM consists of three things:
A linear predictor (a linear combination of the explanatory variables)
A distribution for the response variable
A link function connecting the linear predictor to the mean of the response
Three familiar cases:
Linear regression: Normal distribution, identity link function
Logistic regression: Binomial distribution, logit link function
Poisson regression: Poisson distribution, log link function
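In R, the family argument of glm() bundles the distribution with its link; a sketch using columns from this dataset (the Poisson example with exclaim_mess is an illustration, not part of the analysis above):
# Logistic regression: binomial distribution, logit link
glm(spam ~ log(num_char), data = email, family = binomial(link = "logit"))

# Poisson regression: Poisson distribution, log link, for a count
# response such as the number of exclamation marks in a message
glm(exclaim_mess ~ log(num_char), data = email, family = poisson(link = "log"))

# gaussian(link = "identity") recovers ordinary linear regression.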