``` # Linear Regression # Libraries tidyr; dplyr

0.1 unemployment is loaded in the workspace

summary(unemployment)

0.2 Define a formula to express female_unemployment as a function of male_unemployment

fmla <- female_unemployment ~ male_unemployment

0.4 Use the formula to fit a model: unemployment_model

unemployment_model <- lm(fmla, data = unemployment)

1 Coeficients and F

``` ## broom and sigr are already loaded in your workspace ## Print unemployment_model unemployment_model

1.1 Call summary() on unemployment_model to get more details

summary(unemployment_model)

1.2 Call glance() on unemployment_model to see the details in a tidier form

glance(unemployment_model)

1.3 Call wrapFTest() on unemployment_model to see the most relevant details

wrapFTest(unemployment_model)

1.4 unemployment is in your workspace

summary(unemployment) ```

2 Prediction new data

``` ## newrates is in your workspace newrates

2.1 Predict female unemployment in the unemployment data set

unemployment$prediction <- predict(unemployment_model)

2.2 load the ggplot2 package

library(ggplot2)

2.3 Make a plot to compare predictions to actual (prediction on x axis).

ggplot(unemployment, aes(x = prediction, y = female_unemployment)) + geom_point() + geom_abline(color = “blue”)

2.4 Predict female unemployment rate when male unemployment is 5%

pred <- predict(unemployment_model, newdata = newrates) ## Print it pred ```

3 Multivariable linear regresion

``` ## bloodpressure is in the workspace summary(bloodpressure)

3.1 Create the formula and print it

fmla <- blood_pressure~age+weight fmla

3.2 Fit the model: bloodpressure_model

bloodpressure_model <- lm(fmla, data = bloodpressure)

3.4 bloodpressure is in your workspace

summary(bloodpressure)

3.5 bloodpressure_model is in your workspace

bloodpressure_model

3.6 predict blood pressure using bloodpressure_model :prediction

bloodpressure$prediction <- predict(bloodpressure_model)

3.7 plot the results

ggplot(bloodpressure, aes(x = prediction, y = blood_pressure)) + geom_point() + geom_abline(color = “blue”) ```

4 Graphically evaluate the unemployment model

``` ## unemployment, unemployment_model are in the workspace summary(unemployment) summary(unemployment_model)

4.1 Make predictions from the model

unemployment$predictions <- predict(unemployment_model)

4.2 Fill in the blanks to plot predictions (on x-axis) versus the female_unemployment rates

ggplot(unemployment, aes(x = predictions, y = female_unemployment)) + geom_point() + geom_abline()

4.3 From previous step

unemployment$predictions <- predict(unemployment_model)

4.4 Calculate residuals

unemployment\(residuals <- unemployment\)female_unemployment - unemployment$predictions

4.5 Fill in the blanks to plot predictions (on x-axis) versus the residuals

ggplot(unemployment, aes(x = predictions, y = residuals)) + geom_pointrange(aes(ymin = 0, ymax = residuals)) + geom_hline(yintercept = 0, linetype = 3) + ggtitle(“residuals vs. linear model prediction”) ```

5 The gain curve to evaluate the unemployment model

``` ## unemployment is in the workspace (with predictions) summary(unemployment)

5.1 unemployment_model is in the workspace

summary(unemployment_model)

5.2 Load the package WVPlots

library(WVPlots)

5.3 Plot the Gain Curve

GainCurvePlot(unemployment, “predictions”, “female_unemployment”, “Unemployment model”) ```

6 Calculate RMSE

``` ## unemployment is in the workspace summary(unemployment)

6.1 For convenience put the residuals in the variable res

res <- unemployment\(predictions - unemployment\)female_unemployment

6.2 Calculate RMSE, assign it to the variable rmse and print it

print(rmse <- (sqrt(mean(res^2))))

6.3 Calculate the standard deviation of female_unemployment and print it

(sd_unemployment <- sd(unemployment$female_unemployment)) ```

7 Calculate R-Squared

``` ## unemployment is in your workspace summary(unemployment)

7.1 unemployment_model is in the workspace

summary(unemployment_model)

7.2 Calculate mean female_unemployment: fe_mean. Print it

print(fe_mean <- mean(unemployment$female_unemployment))

7.3 Calculate total sum of squares: tss. Print it

print(tss <- sum((fe_mean - unemployment$female_unemployment)^2))

7.4 Calculate residual sum of squares: rss. Print it

print(rss <- sum((unemployment$residuals)^2))

7.5 Calculate R-squared: rsq. Print it. Is it a good fit?

print(rsq <- 1- (rss/tss))

7.6 Get R-squared from glance. Print it

print(rsq_glance <- glance(unemployment_model)$r.square) ```

8 Correlation and R-squared

``` ## unemployment is in your workspace summary(unemployment)

8.1 unemployment_model is in the workspace

summary(unemployment_model)

8.2 Get the correlation between the prediction and true outcome: rho and print it

print(rho <- cor(unemployment\(predictions, unemployment\)female_unemployment))

8.3 Square rho: rho2 and print it

print(rho2 <- rho^2)

8.4 Get R-squared from glance and print it

print(rsq_glance <- glance(unemployment_model)$r.square) ```

9 Cross Validation

10 Generating a random test/train split

``` ## mpg is in the workspace summary(mpg) dim(mpg)

10.1 Use nrow to get the number of rows in mpg (N) and print it

print(N <- nrow(mpg))

10.2 Calculate how many rows 75% of N should be and print it

10.3 Hint: use round() to get an integer

print(target <- round(0.75 * N))

10.4 Create the vector of N uniform random variables: gp

gp <- runif(N)

10.5 Use gp to create the training set: mpg_train (75% of data) and mpg_test (25% of data)

mpg_train <- mpg[gp < 0.75,] mpg_test <- mpg[gp >= 0.75,]

10.6 Use nrow() to examine mpg_train and mpg_test

nrow(mpg_train) nrow(mpg_test) ```

11 Train a model using test/train split

``` ## mpg_train is in the workspace summary(mpg_train)

11.1 Create a formula to express cty as a function of hwy: fmla and print it.

print(fmla <- cty~hwy)

11.2 Now use lm() to build a model mpg_model from mpg_train that predicts cty from hwy

mpg_model <- lm(fmla, data = mpg_train)

11.3 Use summary() to examine the model

summary(mpg_model) ```

12 Evaluate a model using test/train split

``` ## Examine the objects in the workspace ls.str()

12.1 predict cty from hwy for the training set

mpg_train$pred <- predict(mpg_model)

12.2 predict cty from hwy for the test set

mpg_test$pred <- predict(mpg_model, newdata = mpg_test)

12.3 Evaluate the rmse on both training and test data and print them

(rmse_train <- rmse(mpg_train\(pred, mpg_train\)cty)) print(rmse_test <- rmse(mpg_test\(pred, mpg_test\)cty))

12.4 Evaluate the r-squared on both training and test data.and print them

print(rsq_train <- r_squared(mpg_train\(pred, mpg_train\)cty)) print(rsq_test <- r_squared(mpg_test\(pred, mpg_test\)cty))

12.5 Plot the predictions (on the x-axis) against the outcome (cty) on the test data

ggplot(mpg_test, aes(x = pred, y = cty)) + geom_point() + geom_abline() ```

13 Create a cross validation plan

``` ## Load the package vtreat library(vtreat)

13.1 mpg is in the workspace

summary(mpg)

13.2 Get the number of rows in mpg

nRows <- nrow(mpg)

13.3 Implement the 3-fold cross-fold plan with vtreat

splitPlan <- kWayCrossValidation(nRows, 3, NULL, NULL)

13.4 Examine the split plan

str(splitPlan) ```

14 Evaluate a modeling procedure using n-fold cross-validation

``` ## mpg is in the workspace summary(mpg)

14.1 splitPlan is in the workspace

str(splitPlan)

14.2 Run the 3-fold cross validation plan from splitPlan

k <- 3 # Number of folds mpg\(pred.cv <- 0 for(i in 1:k) { split <- splitPlan[[i]] model <- lm(cty ~ hwy, data = mpg[split\)train,]) mpg\(pred.cv[split\)app] <- predict(model, newdata = mpg[split$app,]) }

14.3 Predict from a full model

mpg$pred <- predict(lm(cty ~ hwy, data = mpg))

14.4 Get the rmse of the full model’s predictions

rmse(mpg\(pred, mpg\)cty)

14.5 Get the rmse of the cross-validation predictions

rmse(mpg\(pred.cv, mpg\)cty) ```

15 Examining structure of categorical inputs

``` ## Call str on flowers to see the types of each column str(flowers)

15.1 Use unique() to see how many possible values Time takes

unique(flowers$Time)

15.2 Build a formula to express Flowers as a function of Intensity and Time: fmla. Print it

print(fmla <- as.formula(“Flowers ~ Intensity + Time”))

15.3 Use fmla and model.matrix to see how the data is represented for modeling

mmat <- model.matrix(object = fmla, data = flowers)

15.4 Examine the first 20 lines of flowers

head(flowers, 20)

15.5 Examine the first 20 lines of mmat

head(mmat, 20) ```

16 Modeling with categorical inputs

``` ## flowers in is the workspace str(flowers)

16.1 fmla is in the workspace

fmla

16.2 Fit a model to predict Flowers from Intensity and Time : flower_model

flower_model <- lm(fmla, data = flowers)

16.3 Use summary on mmat to remind yourself of its structure

summary(mmat)

16.4 Use summary to examine flower_model

summary(flower_model)

16.5 Predict the number of flowers on each plant

flowers$predictions <- predict(flower_model)

16.6 Plot predictions vs actual flowers (predictions on x-axis)

ggplot(flowers, aes(x = predictions, y = Flowers)) + geom_point() + geom_abline(color = “blue”) ```

17 Modeling an interaction

``` ## alcohol is in the workspace summary(alcohol)

17.1 Create the formula with main effects only

print(fmla_add <- Metabol ~ Gastric + Sex )

17.2 Create the formula with interactions

print(fmla_interaction <- Metabol ~ Gastric + Gastric : Sex )

17.3 Fit the main effects only model

model_add <-lm(fmla_add, data = alcohol)

17.4 Fit the interaction model

model_interaction <- lm(fmla_interaction, data = alcohol)

17.5 Call summary on both models and compare

summary(model_add) summary(model_interaction)

17.6 alcohol is in the workspace

summary(alcohol)

17.7 Both the formulae are in the workspace

fmla_add fmla_interaction

17.8 Create the splitting plan for 3-fold cross validation

set.seed(34245) # set the seed for reproducibility splitPlan <- kWayCrossValidation(nrow(alcohol), 3, NULL, NULL)

17.9 Sample code: Get cross-val predictions for main-effects only model

alcohol\(pred_add <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_add <- lm(fmla_add, data = alcohol[split\)train, ]) alcohol\(pred_add[split\)app] <- predict(model_add, newdata = alcohol[split$app, ]) }

17.10 Get the cross-val predictions for the model with interactions

alcohol\(pred_interaction <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_interaction <- lm(fmla_interaction, data = alcohol[split\)train, ]) alcohol\(pred_interaction[split\)app] <- predict(model_interaction, newdata = alcohol[split$app, ]) }

17.11 Get RMSE

alcohol %>% gather(key = modeltype, value = pred, pred_add, pred_interaction) %>% mutate(residuals = Metabol - pred) %>%
group_by(modeltype) %>% summarize(rmse = sqrt(mean(residuals^2))) ```

18 Relative Error

``` ## fdata is in the workspace summary(fdata)

18.1 Examine the data: generate the summaries for the groups large and small:

fdata %>% group_by(label) %>% # group by small/large purchases summarize(min = min(y), # min of y mean = mean(y), # mean of y max = max(y)) # max of y

18.2 Fill in the blanks to add error columns

fdata2 <- fdata %>% group_by(label) %>% # group by label mutate(residual = pred - y, # Residual relerr = residual / y) # Relative error

18.3 Compare the rmse and rmse.rel of the large and small groups:

fdata2 %>% group_by(label) %>% summarize(rmse = sqrt(mean(residual^2)), # RMSE rmse.rel = sqrt(mean(relerr^2))) # Root mean squared relative error

18.4 Plot the predictions for both groups of purchases

ggplot(fdata2, aes(x = pred, y = y, color = label)) + geom_point() + geom_abline() + facet_wrap(~ label, ncol = 1, scales = “free”) + ggtitle(“Outcome vs prediction”) ```

19 Modeling log-transformed monetary output

``` ## Examine Income2005 in the training set summary(income_train$Income2005)

19.1 Write the formula for log income as a function of the tests and print it

(fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)

19.2 Fit the linear model

model.log <- lm(fmla.log, data = income_train)

19.3 Make predictions on income_test

income_test\(logpred <- predict(model.log, newdata = income_test) summary(income_test\)logpred)

19.4 Convert the predictions to monetary units

income_test\(pred.income <- exp(income_test\)logpred) summary(income_test$pred.income)

19.5 Plot predicted income (x axis) vs income

ggplot(income_test, aes(x = pred.income, y = Income2005)) + geom_point() + geom_abline(color = “blue”) ```

20 Comparing RMSE and root-mean-squared Relative Error

``` ## fmla.abs is in the workspace fmla.abs

20.1 model.abs is in the workspace

summary(model.abs)

20.2 Add predictions to the test set

income_test <- income_test %>% mutate(pred.absmodel = predict(model.abs, income_test), # predictions from model.abs pred.logmodel = exp(predict(model.log, income_test))) # predictions from model.log

20.3 Gather the predictions and calculate residuals and relative error

income_long <- income_test %>% gather(key = modeltype, value = pred, pred.absmodel, pred.logmodel) %>% mutate(residual = pred - Income2005, # residuals relerr = residual / Income2005) # relative error

20.4 Calculate RMSE and relative RMSE and compare

income_long %>% group_by(modeltype) %>% # group by modeltype summarize(rmse = sqrt(mean(residual^2)), # RMSE rmse.rel = sqrt(mean(relerr^2))) # Root mean squared relative error ```

21 Input transform: “the hockey stick”

``` ## houseprice is in the workspace summary(houseprice)

21.1 Create the formula for price as a function of squared size

(fmla_sqr <- price ~ I(size^2))

21.2 Fit a model of price as a function of squared size (use fmla_sqr)

model_sqr <- lm(fmla_sqr, data = houseprice)

21.3 Fit a model of price as a linear function of size

model_lin <- lm(price ~ size, data = houseprice)

21.4 Make predictions and compare

houseprice %>% mutate(pred_lin = predict(model_lin), # predictions from linear model pred_sqr = predict(model_sqr)) %>% # predictions from quadratic model gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% # gather the predictions ggplot(aes(x = size)) + geom_point(aes(y = price)) + # actual prices geom_line(aes(y = pred, color = modeltype)) + # the predictions scale_color_brewer(palette = “Dark2”)

21.5 houseprice is in the workspace

summary(houseprice)

21.6 fmla_sqr is in the workspace

fmla_sqr

21.7 Create a splitting plan for 3-fold cross validation

set.seed(34245) # set the seed for reproducibility splitPlan <- kWayCrossValidation(nRows = nrow(houseprice),nSplits = 3, NULL, NULL)

21.8 Sample code: get cross-val predictions for price ~ size

houseprice\(pred_lin <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_lin <- lm(price ~ size, data = houseprice[split\)train,]) houseprice\(pred_lin[split\)app] <- predict(model_lin, newdata = houseprice[split$app,]) }

21.9 Get cross-val predictions for price as a function of size^2 (use fmla_sqr)

houseprice\(pred_sqr <- 0 # initialize the prediction vector for(i in 1:3) { split <- splitPlan[[i]] model_sqr <- lm(fmla_sqr, data = houseprice[split\)train, ]) houseprice\(pred_sqr[split\)app] <- predict(model_sqr, newdata = houseprice[split$app, ]) }

21.10 Gather the predictions and calculate the residuals

houseprice_long <- houseprice %>% gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>% mutate(residuals = pred - price)

21.11 Compare the cross-validated RMSE for the two models

houseprice_long %>% group_by(modeltype) %>% # group by modeltype summarize(rmse = sqrt(mean(residuals^2)))

22 Logistic Regression

22.1 Examine the dataset to identify potential independent variables

str(donors)

22.2 Explore the dependent variable

table(donors$donated)

22.3 Build the donation model

donation_model <- glm(donated ~ bad_address + interest_religion + interest_veterans, data = donors, family = “binomial”)

22.4 Summarize the model results

summary(donation_model) ```

23 Making a binary prediction

``` ## Estimate the donation probability donors$donation_prob <- predict(donation_model, type = “response”)

23.1 Find the donation probability of the average prospect

mean(donors$donated)

23.2 Predict a donation if probability of donation is greater than average (0.0504)

donors\(donation_pred <- ifelse(donors\)donation_prob > 0.0504, 1, 0)

23.3 Calculate the model’s accuracy

mean(donors\(donated == donors\)donation_pred)

24 Calculating ROC Curves and AUC

24.1 Load the pROC package

library(pROC)

24.2 Create a ROC curve

ROC <- roc(donors\(donated, donors\)donation_prob)

24.3 Plot the ROC curve

plot(ROC, col = “blue”)

24.4 Calculate the area under the curve (AUC)

auc(ROC) ```

25 Coding categorical features

``` ## Convert the wealth rating to a factor donors\(wealth_rating <- factor(donors\)wealth_rating, levels = c(0,1,2,3), labels = c(“Unknown”,“Low”,“Medium”,“High”))

25.1 Use relevel() to change reference category

donors\(wealth_rating <- relevel(donors\)wealth_rating, ref = “Medium”)

25.2 See how our factor coding impacts the model

summary(glm(formula=donated ~ wealth_rating, data = donors, family = “binomial”))

26 Handling missing data

26.1 Find the average age among non-missing values

summary(donors$age)

26.2 Impute missing age values with mean(age)

donors\(imputed_age <- ifelse(is.na(donors\)age),61.65,donors$age)

26.3 Create missing value indicator for age

donors\(missing_age <- ifelse(is.na(donors\)age),1,0) ```

27 Building a more sophisticated model

``` ## Build a recency, frequency, and money (RFM) model rfm_model <- glm(donated ~ recency * frequency + money, data = donors, family = “binomial”)

27.1 Summarize the RFM model to see how the parameters were coded

summary(rfm_model)

27.2 Compute predicted probabilities for the RFM model

rfm_prob <- predict(rfm_model, data = donors, type = “response”)

27.3 Plot the ROC curve for the new model

library(pROC) ROC <- roc(donors$donated, rfm_prob) plot(ROC, col = “red”) auc(ROC) ```

28 Building a stepwise regression model (this is)

``` ## Specify a null model with no predictors null_model <- glm(donated ~ 1, data = donors, family = “binomial”)

28.1 Specify the full model using all of the potential predictors

full_model <- glm(formula =donated ~ ., data = donors, family = “binomial”)

28.2 Use a forward stepwise algorithm to build a parsimonious model

step_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = “forward”)

28.3 Estimate the stepwise donation probability

step_prob <- predict(step_model, type = “response”)

28.4 Plot the ROC of the stepwise model

library(pROC) ROC <- roc(donors$donated, step_prob) plot(ROC, col = “red”) auc(ROC) ```