This report presents regression models built on data describing about 5,000 four-minute speed dates involving 310 American young adults. The original data were collected by Columbia Business School professors.
The response variable is the variable we are interested in learning something about.
A predictor variable is a variable used in the regression to help predict the response variable.
Our response variable will be "dec": we want to study how well the predictor variables can predict its behavior and how they impact it.
data <- read_csv(here::here("data/speed-dating2.csv"),
col_types = cols(
.default = col_integer(),
int_corr = col_double(),
field = col_character(),
from = col_character(),
career = col_character(),
attr = col_double(),
sinc = col_double(),
intel = col_double(),
fun = col_double(),
amb = col_double(),
shar = col_double(),
like = col_double(),
prob = col_double(),
match_es = col_double(),
attr3_s = col_double(),
sinc3_s = col_double(),
intel3_s = col_double(),
fun3_s = col_double(),
amb3_s = col_double(),
dec = col_character()
)) %>%
mutate(dec = factor(dec),
gender = factor(gender),
samerace = factor(samerace),
race = factor(race))
data %>%
glimpse()
## Observations: 4,918
## Variables: 44
## $ iid <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ gender <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ order <int> 4, 3, 10, 5, 7, 6, 1, 2, 8, 9, 10, 9, 6, 1, 3, 2, 7, 8, 4, 5…
## $ pid <int> 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 11, 12, 13, 14, 15, …
## $ int_corr <dbl> 0.14, 0.54, 0.16, 0.61, 0.21, 0.25, 0.34, 0.50, 0.28, -0.36,…
## $ samerace <fct> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, …
## $ age_o <int> 27, 22, 22, 23, 24, 25, 30, 27, 28, 24, 27, 22, 22, 23, 24, …
## $ age <int> 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, …
## $ field <chr> "Law", "Law", "Law", "Law", "Law", "Law", "Law", "Law", "Law…
## $ race <fct> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ from <chr> "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Chic…
## $ career <chr> "lawyer", "lawyer", "lawyer", "lawyer", "lawyer", "lawyer", …
## $ sports <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ tvsports <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ exercise <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ dining <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10…
## $ museums <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, …
## $ art <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ hiking <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ gaming <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, …
## $ clubbing <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, …
## $ reading <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10…
## $ tv <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ theater <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, …
## $ movies <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8,…
## $ concerts <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 7, 7, 7, 7, 7, 7, 7,…
## $ music <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, …
## $ shopping <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ yoga <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ attr <dbl> 6, 7, 5, 7, 5, 4, 7, 4, 7, 5, 5, 8, 5, 7, 6, 8, 7, 5, 7, 6, …
## $ sinc <dbl> 9, 8, 8, 6, 6, 9, 6, 9, 6, 6, 7, 5, 8, 9, 8, 7, 5, 8, 6, 7, …
## $ intel <dbl> 7, 7, 9, 8, 7, 7, 7, 7, 8, 6, 8, 6, 9, 7, 7, 8, 9, 7, 8, 8, …
## $ fun <dbl> 7, 8, 8, 7, 7, 4, 4, 6, 9, 8, 4, 6, 6, 6, 9, 3, 6, 5, 9, 7, …
## $ amb <dbl> 6, 5, 5, 6, 6, 6, 6, 5, 8, 10, 6, 9, 3, 5, 7, 6, 7, 9, 4, 9,…
## $ shar <dbl> 5, 6, 7, 8, 6, 4, 7, 6, 8, 8, 3, 6, 4, 7, 8, 2, 9, 5, 5, 8, …
## $ like <dbl> 7, 7, 7, 7, 6, 6, 6, 6, 7, 6, 6, 7, 6, 7, 8, 6, 8, 5, 5, 8, …
## $ prob <dbl> 6, 5, NA, 6, 6, 5, 5, 7, 7, 6, 4, 3, 7, 8, 6, 5, 7, 6, 6, 7,…
## $ match_es <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ attr3_s <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ sinc3_s <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ intel3_s <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ fun3_s <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ amb3_s <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ dec <fct> yes, yes, yes, yes, yes, no, yes, no, yes, yes, no, no, no, …
data %>%
filter(!is.na(race)) %>% # drop rows where race is missing
ggplot(aes(race,y=(..count..)/sum(..count..))) +
geom_bar(color = "black",
fill = "grey") +
labs(x= "Participant Race",
y = "Relative Frequency")
data %>%
filter(!is.na(intel)) %>% # drop rows where intel is missing
ggplot(aes(intel, ..ndensity..)) +
geom_histogram(color = "black",
fill = "grey",
breaks=seq(0, 10, by = 1)) +
scale_x_continuous(breaks=seq(0, 10, by = 1)) +
labs(x= "Intelligence (intel)",
y = "Relative Density")
data %>%
filter(!is.na(attr)) %>% # drop rows where attr is missing
ggplot(aes(attr, ..ndensity..)) +
geom_histogram(color = "black",
fill = "grey",
breaks=seq(0, 10, by = 1)) +
scale_x_continuous(breaks=seq(0, 10, by = 1)) +
labs(x= "Attractivnes (attr)",
y = "Relative Density")
data %>%
filter(!is.na(amb)) %>% # drop rows where amb is missing
ggplot(aes(amb, ..ndensity..)) +
geom_histogram(color = "black",
fill = "grey",
breaks=seq(0, 10, by = 1)) +
scale_x_continuous(breaks=seq(0, 10, by = 1)) +
labs(x= "Ambition (amb)",
y = "Relative Density")
data %>%
filter(!is.na(sinc)) %>% # drop rows where sinc is missing
ggplot(aes(sinc, ..ndensity..)) +
geom_histogram(color = "black",
fill = "grey",
breaks=seq(0, 10, by = 1)) +
scale_x_continuous(breaks=seq(0, 10, by = 1)) +
labs(x= "Sincerity (sinc)",
y = "Relative Density")
data %>%
filter(!is.na(like)) %>% # drop rows where like is missing
ggplot(aes(like, ..ndensity..)) +
geom_histogram(color = "black",
fill = "grey",
breaks=seq(0, 10, by = 1)) +
scale_x_continuous(breaks=seq(0, 10, by = 1)) +
labs(x= "Like (like)",
y = "Relative Density")
data %>% # Keep only the response variable and the candidate predictor variables
  select(dec, fun, prob, order, amb,
         attr, sinc, shar,
         intel, like, gender, samerace) %>%
  na.omit() %>% # remove NAs
  mutate_at(
    .vars = vars(fun, prob, order, attr, ## Put the numeric predictor variables on the same scale
                 sinc, like, shar, intel),
    .funs = funs(as.numeric(scale(.)))) -> data_scaled
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
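As the warning notes, funs() is deprecated in newer dplyr releases. On dplyr >= 1.0 the same scaling step can be written with across(); the following is only a sketch of the equivalent call, not part of the original pipeline:
# Equivalent scaling with across() (dplyr >= 1.0); amb is left unscaled,
# just as in the pipeline above.
data %>%
  select(dec, fun, prob, order, amb,
         attr, sinc, shar,
         intel, like, gender, samerace) %>%
  na.omit() %>%
  mutate(across(c(fun, prob, order, attr,
                  sinc, like, shar, intel),
                ~ as.numeric(scale(.x)))) # same result as data_scaled above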
data_scaled %>%
glimpse()
## Observations: 4,101
## Variables: 12
## $ dec <fct> yes, yes, yes, yes, no, yes, no, yes, yes, no, no, no, yes, …
## $ fun <dbl> 0.3673163, 0.8696052, 0.3673163, 0.3673163, -1.1395502, -1.1…
## $ prob <dbl> 0.44246160, -0.01644963, 0.44246160, 0.44246160, -0.01644963…
## $ order <dbl> -0.90758631, -1.08582650, -0.72934613, -0.37286577, -0.55110…
## $ amb <dbl> 6, 5, 6, 6, 6, 6, 5, 8, 10, 6, 9, 3, 5, 7, 6, 7, 9, 4, 9, 8,…
## $ attr <dbl> -0.0256121, 0.4905314, 0.4905314, -0.5417556, -1.0578991, 0.…
## $ sinc <dbl> 1.09093408, 0.53812046, -0.56750679, -0.56750679, 1.09093408…
## $ shar <dbl> -0.1395212, 0.3235922, 1.2498190, 0.3235922, -0.6026347, 0.7…
## $ intel <dbl> -0.1541995, -0.1541995, 0.4728427, -0.1541995, -0.1541995, -…
## $ like <dbl> 0.52052548, 0.52052548, 0.52052548, -0.01812579, -0.01812579…
## $ gender <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ samerace <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, …
set.seed(101) # We set the seed for reproducibility
## Add a surrogate key to the data frame
data_scaled$id <- 1:nrow(data_scaled)
data_scaled %>%
dplyr::sample_frac(.8) -> training_data
training_data %>%
glimpse()
## Observations: 3,281
## Variables: 13
## $ dec <fct> yes, no, no, yes, no, no, yes, no, yes, yes, yes, no, no, no…
## $ fun <dbl> 0.3673163, 0.3673163, -0.1349725, 0.8696052, -0.6372614, -2.…
## $ prob <dbl> 0.90137283, 2.27810651, -0.01644963, 0.90137283, 0.90137283,…
## $ order <dbl> 0.16185478, -0.55110595, -0.90758631, 0.87481550, -0.1946255…
## $ amb <dbl> 8, 10, 6, 7, 4, 7, 6, 2, 9, 6, 6, 8, 7, 6, 8, 6, 8, 9, 5, 6,…
## $ attr <dbl> 1.5228184, -1.5740426, -0.0256121, 0.4905314, -2.0901862, -0…
## $ sinc <dbl> 0.53812046, 1.64374770, -0.01469317, 0.53812046, -2.22594767…
## $ shar <dbl> 1.2498190, -0.1395212, -0.1395212, 1.2498190, -1.5288615, -1…
## $ intel <dbl> 1.0998849, -0.1541995, -0.7812416, 0.4728427, -2.0353260, -0…
## $ like <dbl> 1.05917675, -1.09542833, -0.01812579, 1.05917675, -2.7113821…
## $ gender <fct> 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, …
## $ samerace <fct> 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, …
## $ id <int> 1527, 180, 2909, 2696, 1024, 1230, 2396, 1366, 2546, 2234, 3…
dplyr::anti_join(data_scaled,
training_data,
by = 'id') -> testing_data
testing_data %>%
glimpse()
## Observations: 820
## Variables: 13
## $ dec <fct> no, yes, no, no, no, yes, no, no, no, no, yes, no, no, no, n…
## $ fun <dbl> -0.1349725, -0.1349725, -1.6418391, 0.3673163, 0.8696052, 1.…
## $ prob <dbl> 0.90137283, 1.36028406, -0.01644963, 0.90137283, -1.85209455…
## $ order <dbl> -0.55110595, -1.44230686, -1.26406668, -1.44230686, -0.01638…
## $ amb <dbl> 3, 5, 6, 9, 6, 7, 8, 5, 4, 3, 2, 8, 7, 7, 6, 6, 1, 10, 10, 1…
## $ attr <dbl> -0.5417556, 0.4905314, 1.0066749, 1.0066749, -1.0578991, 1.0…
## $ sinc <dbl> 0.53812046, 1.09093408, -0.01469317, -0.01469317, -0.0146931…
## $ shar <dbl> -0.6026347, 0.7867056, -1.5288615, 0.7867056, 0.7867056, 2.1…
## $ intel <dbl> 1.0998849, -0.1541995, 0.4728427, 1.0998849, 0.4728427, -0.1…
## $ like <dbl> -0.01812579, 0.52052548, -0.01812579, 1.05917675, -1.0954283…
## $ gender <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ samerace <fct> 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, …
## $ id <int> 12, 13, 15, 27, 30, 31, 32, 39, 42, 50, 51, 62, 96, 98, 101,…
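A quick sanity check (a sketch, not part of the original analysis) that the split is well formed: the two sets should share no rows and together should cover all of data_scaled.
# Training and testing sets should partition data_scaled
length(intersect(training_data$id, testing_data$id)) == 0    # no shared rows
nrow(training_data) + nrow(testing_data) == nrow(data_scaled) # 3,281 + 820 = 4,101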
If you already know your way around logistic regression, feel free to skip this section. If you ever feel unsure about it, feel free to come back here and consult it.
Let’s use as an example our aforementioned response variable \(dec\), and let’s suppose we use \(like\) as our sole predictor. In the end it all boils down to a conditional probability for a certain value of \(dec\) given \(like\):
\[\large P(y \ | \ x ) = z, \normalsize where: \ y = dec;\ x=like;\]
As you may have noticed, \(dec\) is a binary variable (p1 either says yes or no); for this reason we model the probability of a certain outcome of \(dec\) with a sigmoid (logistic) function:
\[\large P(y\ | \ x)=\frac{e^{b_{0}+b_{1} \cdot x}}{1 + e^{b_{0}+b_{1} \cdot x}}, \; \normalsize where: \ y = dec;\ x=like;\]
However, to talk about how the predictor \(like\) impacts the response variable \(dec\), which means interpreting \(b_{1}\), it is more convenient to talk in terms of the odds:
\[\large \frac{P(y\ | \ x)}{1 - P(y \ | \ x)} =e^{b_{0}+b_{1} \cdot x_{1}}, \; \normalsize where: \ y = dec;\ x_{1}=like;\]
The libraries usually render the coefficients in the following form:
\[\large log(\frac{P(y\ | \ x)}{1 - P(y\ | \ x)}) =b_{0}+b_{1} \cdot x_{1}, \; \normalsize where: \ y = dec;\ x_{1}=like;\]
In my experience an example with actual numbers helps a lot, so let’s assume the logistic regression gave us the following values for our variables:
\(b_{1} = 0.9441278, \\ b_{0} = -6.2119061, \\ y = 1 \ (p1 \ wants \ to \ see \ p2 \ again).\)
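For reference, values like these come out of fitting the one-predictor model; a sketch of that call (the exact estimates depend on which rows end up being used):
# Univariate logistic regression of dec on like (illustration only)
simple_model <- glm(dec ~ like, data = data, family = "binomial")
coef(simple_model) # (Intercept) plays the role of b0, the like coefficient the role of b1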
Our variables’ values would render the following formula in terms of standard library output:
\[\large log(\frac{P(y = 1\ | \ x)}{1 - P(y = 1\ | \ x)}) =-6.2119061+0.9441278 \cdot x_{1}, \; \normalsize where: \ y = dec;\ x_{1}=like;\]
Knowing that \(e^{-6.2119061} \approx 0.002005411\), the more meaningful formula with the actual exponentiation would look like:
\[\large \frac{P(y=1 \ | \ x)}{1 - P(y=1 \ | \ x)} =0.002005411 \cdot e^{b_{1} \cdot x}, \; \normalsize where: \ y = dec;\ x=like;\]
Which, depending on \(x\), will look like:
\(\large \frac{P(y=1 \ | \ x)}{1 - P(y=1 \ | \ x)} =0.002005411, \; \normalsize if: \ x=0;\)
\(\large \frac{P(y=1 \ | \ x)}{1 - P(y=1 \ | \ x)} =0.002005411 \cdot e^{b_{1}}, \; \normalsize if: \ x=1;\)
\(\large \frac{P(y=1 \ | \ x)}{1 - P(y=1 \ | \ x)} =0.002005411 \cdot e^{2 \cdot b_{1}}, \; \normalsize if: \ x=2;\)
And so forth…
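A quick numerical check of the three cases above, using the example coefficients:
b0 <- -6.2119061
b1 <- 0.9441278
exp(b0)               # x = 0: odds ~ 0.002005411
exp(b0) * exp(b1 * 1) # x = 1: the odds are multiplied by e^b1 (~ 2.57)
exp(b0) * exp(b1 * 2) # x = 2: the odds are multiplied by e^(2 * b1)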
Notice that in the end how the formula changes depends entirely on the term \(\large e^{b_{1} \cdot x}\). For an exponentiation \(A^{B}\) with \(A > 1\) (here \(A = e\)) there are three possibilities:
\(B > 0\): then \(A^{B} > 1\), and \(A^{B}\) grows as \(B\) grows.
\(B = 0\): then \(A^{B} = 1\).
\(B < 0\): then \(A^{B} = \frac{1}{A^{|B|}} < 1\), and it becomes a smaller fraction as \(|B|\) grows.
Mind now that our \(b_{1} > 0\), or in other words \(e^{b_{1}} > 1\); therefore it has a positive effect on \(\frac{P(y=1 \ | \ x)}{1 - P(y=1 \ | \ x)}\), and the bigger it is, the stronger that positive effect. Therefore \(like\) has a positive effect on the odds of \(dec = 1\) over \(dec = 0\).
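To see this positive effect on the probability scale directly, the log-odds can be passed through the logistic function (plogis() in base R); a small check with the same example coefficients:
b0 <- -6.2119061
b1 <- 0.9441278
plogis(b0 + b1 * c(0, 5, 8, 10)) # P(dec = 1 | like = 0, 5, 8, 10)
# The probabilities climb from ~0.002 to ~0.96 as like grows,
# which is exactly what a positive b1 implies.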
Which factors (predictors) have a significant effect on the chance of p1 deciding to meet with p2 again? Are their effects positive or negative?
Which factor (predictor) has the most effect (relevance) on the chance of p1 deciding to meet with p2 again?
glm(dec ~ like + fun + attr + shar + sinc + prob + amb + intel + intel * shar,
data = training_data,
family = "binomial") -> bm
glance(bm)
Residual analysis can be hard to interpret for a logistic regression model, so we should take these plots with a grain of salt and probably give more weight to what McFadden's pseudo-\(R^{2}\) says later on.
Let’s keep the residual data in a dedicated data frame:
mod.res <- resid(bm)
std.resid <- rstandard(bm)
dec <- training_data$dec
resid_data <- data.frame(mod.res,std.resid,training_data,
stringsAsFactors=FALSE)
resid_data %>%
sample_n(10)
resid_data %>%
ggplot(aes(intel,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Intelligence (intel)",
y = "Model Residual") -> intel_resid
resid_data %>%
ggplot(aes(fun,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Funny (fun)",
y = "Model Residual") -> fun_resid
gridExtra::grid.arrange(intel_resid,
fun_resid,
ncol = 2)
resid_data %>%
ggplot(aes(attr,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Attractiveness (attr)",
y = "Model Residual") -> attr_resid
resid_data %>%
ggplot(aes(amb,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Ambition (amb)",
y = "Model Residual") -> amb_resid
gridExtra::grid.arrange(attr_resid,
amb_resid,
ncol = 2)
resid_data %>%
ggplot(aes(sinc,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Sincerity (sinc)",
y = "Model Residual") -> sinc_resid
resid_data %>%
ggplot(aes(like,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Like (like)",
y = "Model Residual") -> like_resid
gridExtra::grid.arrange(sinc_resid,
like_resid,
ncol=2)
resid_data %>%
ggplot(aes(shar,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Shared (shar)",
y = "Model Residual") -> shar_resid
resid_data %>%
ggplot(aes(prob,mod.res)) +
geom_point(alpha=0.4) +
labs(x = "Predictor Probability (prob)",
y = "Model Residual") -> prob_resid
gridExtra::grid.arrange(shar_resid,
prob_resid,
ncol=2)
bm %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(yintercept=0, col="red", linetype="dashed") +
xlab("Fitted values") + ylab("Residuals") +
ggtitle("Residual vs Fitted Plot")
## `geom_smooth()` using formula 'y ~ x'
The fitted-vs-residuals plot does not suggest that the data demand a non-linear model.
y <- quantile(resid_data$std.resid[!is.na(resid_data$std.resid)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
resid_data %>%
ggplot(aes(sample=std.resid)) +
stat_qq(shape=1, size=3) + # open circles
labs(title="Normal Q-Q", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label
geom_abline(slope = slope,
color = "red",
size = 0.8,
intercept = int,
linetype="dashed") # dashed reference line
This suggests that a better combination of predictors could be found, although the current one still seems reasonably appropriate.
bm %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
## `geom_smooth()` using formula 'y ~ x'
This confirms the Q-Q plot results.
bm %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'
There are no “outliers”/extreme values acting as influential cases, i.e., observations that would have an undue influence on the regression fit.
Overall our residual analysis suggests that our regression fits the data well.
There may be a better model out there, but ours does a pretty good job.
tidy(bm, conf.int = TRUE)
broom::tidy(bm,
conf.int = TRUE,
conf.level = 0.95) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(term, estimate,
ymin = conf.low,
ymax = conf.high)) +
geom_errorbar(size = 0.8, width= 0.4) +
geom_point(color = "red", size = 2) +
geom_hline(yintercept = 0, colour = "darkred") +
labs(x = "Predictor variable",
title = "Logistic regression terms",
y = expression(paste("estimated ", 'b'," (95% confidence)")))
The coefficients here follow a generalization of the formula \[\normalsize log(\frac{P(y\ | \ x)}{1 - P(y\ | \ x)}) =b_{0}+b_{1} \cdot x_{1} \;\]
This particular view doesn’t give a clear idea of the magnitude of the coefficients’ effects. There are, however, some things that can be extracted by exponentiating the coefficients:
# EXPONENTIATING:
tidy(bm, conf.int = TRUE, exponentiate = TRUE)
broom::tidy(bm,
conf.int = TRUE,
conf.level = 0.95,
exponentiate = TRUE) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(term, estimate,
ymin = conf.low,
ymax = conf.high)) +
geom_errorbar(size = 0.8, width= 0.4) +
geom_point(color = "red", size = 2) +
geom_hline(yintercept = 1, colour = "darkred") +
labs(x = "Predictor variable",
title = "Exponentiated logistic regression terms",
y = expression(paste("estimated ", 'e'^{'b'}," (95% confidence)"))) +
ylim(0,4)
The coefficients here follow a generalization of the formula \[\normalsize \frac{P(y\ | \ x)}{1 - P(y \ | \ x)} =e^{b_{0}+b_{1} \cdot x_{1}} \;\]
As mentioned in the previous section, when \(e^{b} > 1\) the effect is positive and when \(e^{b} < 1\) the effect is negative. Therefore:
intel and shar * intel have no significant effect, as their confidence intervals include 1.
A multiplier with a negative effect is strongest the farther it is from 1 and the closer it is to 0, while a multiplier with a positive effect is strongest the farther it is from 1 and the closer it is to \(+\infty\).
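Following that logic, one way to rank the remaining predictors by relevance is to keep only the terms whose confidence interval excludes 1 and measure how far each multiplier sits from 1 on the log scale; a sketch:
broom::tidy(bm, conf.int = TRUE, exponentiate = TRUE) %>%
  filter(term != "(Intercept)",
         conf.low > 1 | conf.high < 1) %>%  # keep only significant terms
  mutate(strength = abs(log(estimate))) %>% # distance from 1, symmetric for multipliers < 1 and > 1
  arrange(desc(strength))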
# Compute the ROC curve and AUC for predicting dec on the test set
prob <- predict(bm, newdata=testing_data, type="response")
pred <- prediction(prob, testing_data$dec)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
autoplot(perf) +
labs(x="False Positive Rate",
y="True Positive Rate",
title="ROC Curve")
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.863979
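As a cross-check on the ROCR value (a sketch, not part of the original analysis), the AUC can also be computed from the rank-sum (Mann-Whitney) identity without any extra package:
# AUC = (sum of ranks of positive cases - n_pos*(n_pos + 1)/2) / (n_pos * n_neg)
is_yes <- testing_data$dec == "yes"
r <- rank(prob)      # average ranks handle ties
n_pos <- sum(is_yes)
n_neg <- sum(!is_yes)
(sum(r[is_yes]) - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg) # should agree with the value above (~0.864)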
predictions = bm %>%
augment(type.predict = "response") %>%
mutate(acc_to_model = .fitted > .5,
acc_to_data = dec == "yes")
table(predictions$acc_to_model, predictions$acc_to_data)
##
## FALSE TRUE
## FALSE 1539 410
## TRUE 358 974
xtabs(~ acc_to_model + acc_to_data, data = predictions)
## acc_to_data
## acc_to_model FALSE TRUE
## FALSE 1539 410
## TRUE 358 974
mosaic(acc_to_model ~ acc_to_data, data = predictions,
shade = T)
predictions %>%
  summarise(accuracy = sum(acc_to_model == acc_to_data) / n(),
            false_positives = sum(acc_to_model & !acc_to_data))
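As a sanity check against the confusion matrix printed above, the same quantities can be recomputed by hand from its counts (a sketch using the numbers shown):
# From the table: TN = 1539, FN = 410, FP = 358, TP = 974
(1539 + 974) / (1539 + 410 + 358 + 974) # accuracy ~ 0.766
358 / (358 + 1539)                      # false positive rate ~ 0.189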
pR2(bm)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML
## -1514.3062029 -2233.9458752 1439.2793446 0.3221384 0.3551070
## r2CU
## 0.4774310
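McFadden's pseudo-\(R^{2}\) is simply \(1 - \frac{llh}{llh_{Null}}\), so recomputing it from the log-likelihoods above is a useful sanity check; a sketch:
# McFadden's pseudo-R2 from the fitted model and a null (intercept-only) model
null_model <- glm(dec ~ 1, data = training_data, family = "binomial")
1 - as.numeric(logLik(bm)) / as.numeric(logLik(null_model)) # ~ 0.322, matching pR2(bm)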