We will predict whether a house is expensive (i.e., whether its sale price is above the median) using the sahp dataset in the r02pro package.

You can run the following code to prepare the analysis.

#install.packages("MASS")
library(r02pro)     #INSTALL IF NECESSARY
library(tidyverse)  #INSTALL IF NECESSARY
library(MASS)
my_sahp <- sahp %>% 
  na.omit() %>%
  mutate(expensive = sale_price > median(sale_price)) %>%
  dplyr::select(gar_car, liv_area, oa_qual, expensive)
my_sahp_train <- my_sahp[1:100, ]
my_sahp_test <- my_sahp[-(1:100), ]
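
As a quick sanity check (not part of the original code, but sometimes helpful), you can confirm the sizes of the split and the class balance of expensive in the training set:

# Optional check: split sizes and training-set class balance
nrow(my_sahp_train)
nrow(my_sahp_test)
table(my_sahp_train$expensive)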

Please answer the following questions.

  1. (a) Use the training data my_sahp_train to fit a logistic regression model of expensive on each variable (gar_car, liv_area, oa_qual) separately. For each logistic regression, compute the training and test errors. Which variable leads to the smallest training error? Which variable leads to the smallest test error?
     (b) Use the training data my_sahp_train to fit a logistic regression model of expensive on all three variables (gar_car, liv_area, oa_qual) together. Compute the training and test errors. How do the results compare with part (a)? (A sketch of this fit appears after the answer to part (a) below.)
# Logistic regression of expensive on gar_car only
fit_gar_car <- glm(expensive ~ gar_car, data = my_sahp_train, family = "binomial")
pred_train_prob_car <- predict(fit_gar_car, type = "response")
pred_train_label_car <- ifelse(pred_train_prob_car > 0.5, "TRUE", "FALSE")
table(pred_train_label_car, my_sahp_train$expensive)
##                     
## pred_train_label_car FALSE TRUE
##                FALSE    27    1
##                TRUE     28   44
mean(pred_train_label_car != my_sahp_train$expensive)
## [1] 0.29
pred_test_prob_car <- predict(fit_gar_car, newdata = my_sahp_test, type = "response")
pred_test_label_car <- ifelse(pred_test_prob_car > 0.5, "TRUE", "FALSE")
table(pred_test_label_car, my_sahp_test$expensive)
##                    
## pred_test_label_car FALSE TRUE
##               FALSE    16    1
##               TRUE     10   35
mean(pred_test_label_car != my_sahp_test$expensive)
## [1] 0.1774194
# Logistic regression of expensive on liv_area only
fit_liv_area <- glm(expensive ~ liv_area, data = my_sahp_train, family = "binomial")
pred_train_prob_liv <- predict(fit_liv_area, type = "response")
pred_train_label_liv <- ifelse(pred_train_prob_liv > 0.5, "TRUE", "FALSE")
table(pred_train_label_liv, my_sahp_train$expensive)
##                     
## pred_train_label_liv FALSE TRUE
##                FALSE    41   21
##                TRUE     14   24
mean(pred_train_label_liv != my_sahp_train$expensive)
## [1] 0.35
pred_test_prob_liv <- predict(fit_liv_area, newdata = my_sahp_test, type = "response")
pred_test_label_liv <- ifelse(pred_test_prob_liv > 0.5, "TRUE", "FALSE")
table(pred_test_label_liv, my_sahp_test$expensive)
##                    
## pred_test_label_liv FALSE TRUE
##               FALSE    23   13
##               TRUE      3   23
mean(pred_test_label_liv != my_sahp_test$expensive)
## [1] 0.2580645
# Logistic regression of expensive on oa_qual only
fit_oa_qual <- glm(expensive ~ oa_qual, data = my_sahp_train, family = "binomial")
predict_train_prob_oa <- predict(fit_oa_qual, type = "response")
predict_train_label_oa <- ifelse(predict_train_prob_oa > 0.5, "TRUE", "FALSE")
table(predict_train_label_oa, my_sahp_train$expensive)
##                       
## predict_train_label_oa FALSE TRUE
##                  FALSE    49   17
##                  TRUE      6   28
mean(predict_train_label_oa != my_sahp_train$expensive)
## [1] 0.23
predict_test_prob_oa <- predict(fit_oa_qual, newdata = my_sahp_test, type = "response")
predict_test_label_oa <- ifelse(predict_test_prob_oa > 0.5, "TRUE", "FALSE")
table(predict_test_label_oa, my_sahp_test$expensive)
##                      
## predict_test_label_oa FALSE TRUE
##                 FALSE    22   15
##                 TRUE      4   21
mean(predict_test_label_oa != my_sahp_test$expensive)
## [1] 0.3064516
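
The three single-variable fits above repeat the same steps, so they could also be wrapped in a small helper. The function below is a hypothetical sketch (the name logit_errors and its arguments are mine, not from the original solution); it returns the training and test misclassification errors for any formula and should reproduce the numbers above.

# Hypothetical helper: fit a logistic regression for a given formula and
# return the training and test misclassification errors.
logit_errors <- function(formula, train, test) {
  fit <- glm(formula, data = train, family = "binomial")
  train_label <- ifelse(predict(fit, type = "response") > 0.5, "TRUE", "FALSE")
  test_label <- ifelse(predict(fit, newdata = test, type = "response") > 0.5, "TRUE", "FALSE")
  c(train_error = mean(train_label != train$expensive),
    test_error = mean(test_label != test$expensive))
}
logit_errors(expensive ~ gar_car, my_sahp_train, my_sahp_test)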

Answer: From these results, oa_qual gives the smallest training error (0.23), while gar_car gives the smallest test error (about 0.18).
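
Part (b) asks for a logistic regression of expensive on all three variables at once. The original write-up does not show this fit, so the code below is only a sketch of how it could be computed, following the same steps as above; the resulting errors are not reproduced here.

# Sketch for part (b): logistic regression of expensive on all three predictors
fit_all <- glm(expensive ~ gar_car + liv_area + oa_qual,
               data = my_sahp_train, family = "binomial")
pred_train_label_all <- ifelse(predict(fit_all, type = "response") > 0.5, "TRUE", "FALSE")
mean(pred_train_label_all != my_sahp_train$expensive)  # training error
pred_test_label_all <- ifelse(predict(fit_all, newdata = my_sahp_test, type = "response") > 0.5, "TRUE", "FALSE")
mean(pred_test_label_all != my_sahp_test$expensive)    # test error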

  2. Use the training data my_sahp_train to fit LDA and QDA models of expensive on all three variables (gar_car, liv_area, oa_qual). Compute the training and test errors. How do the results compare with Q1?
# LDA of expensive on all three predictors (MASS was already loaded above)
lda.fit <- lda(expensive ~ gar_car + liv_area + oa_qual, data = my_sahp_train)
lda.fit
## Call:
## lda(expensive ~ gar_car + liv_area + oa_qual, data = my_sahp_train)
## 
## Prior probabilities of groups:
## FALSE  TRUE 
##  0.55  0.45 
## 
## Group means:
##        gar_car liv_area  oa_qual
## FALSE 1.381818 1275.255 5.418182
## TRUE  2.133333 1686.333 7.000000
## 
## Coefficients of linear discriminants:
##                   LD1
## gar_car  0.6842232584
## liv_area 0.0005864637
## oa_qual  0.5960637814
# Training error
lda.predict.train <- predict(lda.fit, my_sahp_train)
lda.class.train <- lda.predict.train$class
mean(lda.class.train != my_sahp_train$expensive)
## [1] 0.17
# Test error
lda.predict.test <- predict(lda.fit, my_sahp_test)
lda.class.test <- lda.predict.test$class
mean(lda.class.test != my_sahp_test$expensive)
## [1] 0.2419355
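
The question also asks for a QDA fit, which the original does not show; a QDA model would follow the same pattern using MASS::qda, as in the sketch below (errors not reproduced here).

# Sketch: QDA of expensive on the same three predictors, errors computed as for LDA
qda.fit <- qda(expensive ~ gar_car + liv_area + oa_qual, data = my_sahp_train)
qda.class.train <- predict(qda.fit, my_sahp_train)$class
mean(qda.class.train != my_sahp_train$expensive)  # training error
qda.class.test <- predict(qda.fit, my_sahp_test)$class
mean(qda.class.test != my_sahp_test$expensive)    # test error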

From these results, the LDA model's training error (0.17) is smaller than that of every single-variable logistic regression in Q1, and its test error (about 0.24) is smaller than those of the liv_area and oa_qual models but slightly larger than that of the gar_car model. Using all three variables therefore fits the training data better, although it does not uniformly improve the test error.

  3. Q3 in Chapter 4 of ISLRv2. The answer is in the accompanying PDF file.
  4. Q6 in Chapter 4 of ISLRv2. The answer is in the accompanying PDF file.