``` # Linear Regression # Libraries tidyr; dplyr
summary(unemployment)
fmla <- female_unemployment ~ male_unemployment
fmla
unemployment_model <- lm(fmla, data = unemployment)
unemployment_model ```
``` ## broom and sigr are already loaded in your workspace ## Print unemployment_model unemployment_model
summary(unemployment_model)
glance(unemployment_model)
wrapFTest(unemployment_model)
summary(unemployment) ```
``` ## newrates is in your workspace newrates
unemployment$prediction <- predict(unemployment_model)
library(ggplot2)
ggplot(unemployment, aes(x = prediction, y = female_unemployment)) + geom_point() + geom_abline(color = “blue”)
pred <- predict(unemployment_model, newdata = newrates) ## Print it pred ```
``` ## bloodpressure is in the workspace summary(bloodpressure)
fmla <- blood_pressure~age+weight fmla
bloodpressure_model <- lm(fmla, data = bloodpressure)
bloodpressure_model summary(bloodpressure_model)
summary(bloodpressure)
bloodpressure_model
bloodpressure$prediction <- predict(bloodpressure_model)
ggplot(bloodpressure, aes(x = prediction, y = blood_pressure)) + geom_point() + geom_abline(color = “blue”) ```
``` ## unemployment, unemployment_model are in the workspace summary(unemployment) summary(unemployment_model)
unemployment$predictions <- predict(unemployment_model)
ggplot(unemployment, aes(x = predictions, y = female_unemployment)) + geom_point() + geom_abline()
unemployment$predictions <- predict(unemployment_model)
unemployment\(residuals <- unemployment\)female_unemployment - unemployment$predictions
ggplot(unemployment, aes(x = predictions, y = residuals)) + geom_pointrange(aes(ymin = 0, ymax = residuals)) + geom_hline(yintercept = 0, linetype = 3) + ggtitle(“residuals vs. linear model prediction”) ```
``` ## unemployment is in the workspace (with predictions) summary(unemployment)
summary(unemployment_model)
library(WVPlots)
GainCurvePlot(unemployment, “predictions”, “female_unemployment”, “Unemployment model”) ```
``` ## unemployment is in the workspace summary(unemployment)
res <- unemployment\(predictions - unemployment\)female_unemployment
print(rmse <- (sqrt(mean(res^2))))
(sd_unemployment <- sd(unemployment$female_unemployment)) ```
``` ## unemployment is in your workspace summary(unemployment)
summary(unemployment_model)
print(fe_mean <- mean(unemployment$female_unemployment))
print(tss <- sum((fe_mean - unemployment$female_unemployment)^2))
print(rss <- sum((unemployment$residuals)^2))
print(rsq <- 1- (rss/tss))
print(rsq_glance <- glance(unemployment_model)$r.square) ```
``` ## unemployment is in your workspace summary(unemployment)
summary(unemployment_model)
print(rho <- cor(unemployment\(predictions, unemployment\)female_unemployment))
print(rho2 <- rho^2)
print(rsq_glance <- glance(unemployment_model)$r.square) ```
``` ## mpg is in the workspace summary(mpg) dim(mpg)
print(N <- nrow(mpg))
print(target <- round(0.75 * N))
gp <- runif(N)
mpg_train <- mpg[gp < 0.75,] mpg_test <- mpg[gp >= 0.75,]
nrow(mpg_train) nrow(mpg_test) ```
``` ## mpg_train is in the workspace summary(mpg_train)
print(fmla <- cty~hwy)
mpg_model <- lm(fmla, data = mpg_train)
summary(mpg_model) ```
``` ## Examine the objects in the workspace ls.str()
mpg_train$pred <- predict(mpg_model)
mpg_test$pred <- predict(mpg_model, newdata = mpg_test)
(rmse_train <- rmse(mpg_train\(pred, mpg_train\)cty)) print(rmse_test <- rmse(mpg_test\(pred, mpg_test\)cty))
print(rsq_train <- r_squared(mpg_train\(pred, mpg_train\)cty)) print(rsq_test <- r_squared(mpg_test\(pred, mpg_test\)cty))
ggplot(mpg_test, aes(x = pred, y = cty)) + geom_point() + geom_abline() ```
``` ## Load the package vtreat library(vtreat)
summary(mpg)
nRows <- nrow(mpg)
splitPlan <- kWayCrossValidation(nRows, 3, NULL, NULL)
str(splitPlan) ```
``` ## mpg is in the workspace summary(mpg)
str(splitPlan)
k <- 3 # Number of folds mpg\(pred.cv <- 0 for(i in 1:k) { split <- splitPlan[[i]] model <- lm(cty ~ hwy, data = mpg[split\)train,]) mpg\(pred.cv[split\)app] <- predict(model, newdata = mpg[split$app,]) }
mpg$pred <- predict(lm(cty ~ hwy, data = mpg))
rmse(mpg\(pred, mpg\)cty)
rmse(mpg\(pred.cv, mpg\)cty) ```
``` ## Call str on flowers to see the types of each column str(flowers)
unique(flowers$Time)
print(fmla <- as.formula(“Flowers ~ Intensity + Time”))
mmat <- model.matrix(object = fmla, data = flowers)
head(flowers, 20)
head(mmat, 20) ```
``` ## flowers in is the workspace str(flowers)
fmla
flower_model <- lm(fmla, data = flowers)
summary(mmat)
summary(flower_model)
flowers$predictions <- predict(flower_model)
ggplot(flowers, aes(x = predictions, y = Flowers)) + geom_point() + geom_abline(color = “blue”) ```
``` ## alcohol is in the workspace summary(alcohol)
print(fmla_add <- Metabol ~ Gastric + Sex )
print(fmla_interaction <- Metabol ~ Gastric + Gastric : Sex )
model_add <-lm(fmla_add, data = alcohol)
model_interaction <- lm(fmla_interaction, data = alcohol)
summary(model_add) summary(model_interaction)
summary(alcohol)
fmla_add fmla_interaction
set.seed(34245) # set the seed for reproducibility splitPlan <- kWayCrossValidation(nrow(alcohol), 3, NULL, NULL)
alcohol\(pred_add <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_add <- lm(fmla_add, data = alcohol[split\)train, ]) alcohol\(pred_add[split\)app] <- predict(model_add, newdata = alcohol[split$app, ]) }
alcohol\(pred_interaction <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_interaction <- lm(fmla_interaction, data = alcohol[split\)train, ]) alcohol\(pred_interaction[split\)app] <- predict(model_interaction, newdata = alcohol[split$app, ]) }
alcohol %>% gather(key = modeltype, value = pred, pred_add, pred_interaction) %>% mutate(residuals = Metabol - pred) %>%
group_by(modeltype) %>% summarize(rmse = sqrt(mean(residuals^2))) ```
``` ## fdata is in the workspace summary(fdata)
fdata %>% group_by(label) %>% # group by small/large purchases summarize(min = min(y), # min of y mean = mean(y), # mean of y max = max(y)) # max of y
fdata2 <- fdata %>% group_by(label) %>% # group by label mutate(residual = pred - y, # Residual relerr = residual / y) # Relative error
fdata2 %>% group_by(label) %>% summarize(rmse = sqrt(mean(residual^2)), # RMSE rmse.rel = sqrt(mean(relerr^2))) # Root mean squared relative error
ggplot(fdata2, aes(x = pred, y = y, color = label)) + geom_point() + geom_abline() + facet_wrap(~ label, ncol = 1, scales = “free”) + ggtitle(“Outcome vs prediction”) ```
``` ## Examine Income2005 in the training set summary(income_train$Income2005)
(fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)
model.log <- lm(fmla.log, data = income_train)
income_test\(logpred <- predict(model.log, newdata = income_test) summary(income_test\)logpred)
income_test\(pred.income <- exp(income_test\)logpred) summary(income_test$pred.income)
ggplot(income_test, aes(x = pred.income, y = Income2005)) + geom_point() + geom_abline(color = “blue”) ```
``` ## fmla.abs is in the workspace fmla.abs
summary(model.abs)
income_test <- income_test %>% mutate(pred.absmodel = predict(model.abs, income_test), # predictions from model.abs pred.logmodel = exp(predict(model.log, income_test))) # predictions from model.log
income_long <- income_test %>% gather(key = modeltype, value = pred, pred.absmodel, pred.logmodel) %>% mutate(residual = pred - Income2005, # residuals relerr = residual / Income2005) # relative error
income_long %>% group_by(modeltype) %>% # group by modeltype summarize(rmse = sqrt(mean(residual^2)), # RMSE rmse.rel = sqrt(mean(relerr^2))) # Root mean squared relative error ```
``` ## houseprice is in the workspace summary(houseprice)
(fmla_sqr <- price ~ I(size^2))
model_sqr <- lm(fmla_sqr, data = houseprice)
model_lin <- lm(price ~ size, data = houseprice)
houseprice %>% mutate(pred_lin = predict(model_lin), # predictions from linear model pred_sqr = predict(model_sqr)) %>% # predictions from quadratic model gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% # gather the predictions ggplot(aes(x = size)) + geom_point(aes(y = price)) + # actual prices geom_line(aes(y = pred, color = modeltype)) + # the predictions scale_color_brewer(palette = “Dark2”)
summary(houseprice)
fmla_sqr
set.seed(34245) # set the seed for reproducibility splitPlan <- kWayCrossValidation(nRows = nrow(houseprice),nSplits = 3, NULL, NULL)
houseprice\(pred_lin <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_lin <- lm(price ~ size, data = houseprice[split\)train,]) houseprice\(pred_lin[split\)app] <- predict(model_lin, newdata = houseprice[split$app,]) }
houseprice\(pred_sqr <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_sqr <- lm(fmla_sqr, data = houseprice[split\)train, ]) houseprice\(pred_sqr[split\)app] <- predict(model_sqr, newdata = houseprice[split$app, ]) }
houseprice_long <- houseprice %>% gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% mutate(residuals = pred - price)
houseprice_long %>% group_by(modeltype) %>% # group by modeltype summarize(rmse = sqrt(mean(residuals^2)))
str(donors)
table(donors$donated)
donation_model <- glm(donated ~ bad_address + interest_religion + interest_veterans, data = donors, family = “binomial”)
summary(donation_model) ```
``` ## Estimate the donation probability donors$donation_prob <- predict(donation_model, type = “response”)
mean(donors$donated)
donors\(donation_pred <- ifelse(donors\)donation_prob > 0.0504, 1, 0)
mean(donors\(donated == donors\)donation_pred)
library(pROC)
ROC <- roc(donors\(donated, donors\)donation_prob)
plot(ROC, col = “blue”)
auc(ROC) ```
``` ## Convert the wealth rating to a factor donors\(wealth_rating <- factor(donors\)wealth_rating, levels = c(0,1,2,3), labels = c(“Unknown”,“Low”,“Medium”,“High”))
donors\(wealth_rating <- relevel(donors\)wealth_rating, ref = “Medium”)
summary(glm(formula=donated ~ wealth_rating, data = donors, family = “binomial”))
summary(donors$age)
donors\(imputed_age <- ifelse(is.na(donors\)age),61.65,donors$age)
donors\(missing_age <- ifelse(is.na(donors\)age),1,0) ```
``` ## Build a recency, frequency, and money (RFM) model rfm_model <- glm(donated ~ recency * frequency + money, data = donors, family = “binomial”)
summary(rfm_model)
rfm_prob <- predict(rfm_model, data = donors, type = “response”)
library(pROC) ROC <- roc(donors$donated, rfm_prob) plot(ROC, col = “red”) auc(ROC) ```
``` ## Specify a null model with no predictors null_model <- glm(donated ~ 1, data = donors, family = “binomial”)
full_model <- glm(formula =donated ~ ., data = donors, family = “binomial”)
step_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = “forward”)
step_prob <- predict(step_model, type = “response”)
library(pROC) ROC <- roc(donors$donated, step_prob) plot(ROC, col = “red”) auc(ROC) ```