# Chunk defaults for the report: echo the source of every chunk.
knitr::opts_chunk$set(echo = TRUE)
library(sjPlot)

Answering question 1

The dataset default.csv indicates whether individuals defaulted on their credit debt. (a) Split the dataset into a training set and a test set. The answer is given below.

# Load the packages used in this question.
pacman::p_load(
  dplyr, tidyr, cowplot,
  tidyverse, viridis, GGally
)

# Read the credit-default data from the working directory.
default <- read_csv("default.csv")
## Rows: 10000 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): default, student
## dbl (2): balance, income
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(default) # we have these cols  "default" "student" "balance" "income" 
## [1] "default" "student" "balance" "income"
head(default) # shows head of data
dim(default) # rows= 10000     col= 4
## [1] 10000     4
# Overview of our data: pairwise plot matrix of all four columns.
# NOTE(review): per the warning recorded below, ggpairs() ignores the `colour`
# argument; an aesthetic has to be passed through `mapping = aes(...)` instead.

ggpairs(default, title="correlogram with ggpairs()" , colour = "income")  + 
               theme(panel.grid.major = element_blank())
## Warning in warn_if_args_exist(list(...)): Extra arguments: "colour" are being
## ignored. If these are meant to be aesthetics, submit them using the 'mapping'
## variable within ggpairs with ggplot2::aes or ggplot2::aes_string.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

part A

Split the data into training and test sets.

I learned what a factor is and how to use it.

source : https://www.youtube.com/watch?v=xkRBfy8_2MU

#  https://rdrr.io/cran/ISLR/man/Default.html 
# A data frame with 10000 observations on the following 4 variables.
# 
# default -> A factor with levels No and Yes indicating whether the customer defaulted on their debt
# 
# student ->
# A factor with levels No and Yes indicating whether the customer is a student
# 
# balance ->
# The average balance that the customer has remaining on their 
# credit card after making their monthly payment
# 
# income -> Income of customer
# 



set.seed(12345) # fix the RNG seed so the random split below is reproducible
tail(default$default , 10) #tail: "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
##  [1] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
##################################################################
##            ifelse condition: Yes = 1, otherwise 0            ##
##################################################################

default$default <- ifelse(default$default == "Yes", 1, 0)

dim(default)
## [1] 10000     4
sum(default$default) # how many 1 do we have 333
## [1] 333
dim(default)[1] - sum(default$default) # how many 0 do we have 9667
## [1] 9667
## Use 80.15% of the rows (8015 of 10000) as the training sample
smp_size <- floor(0.8015 * nrow(default))
smp_size
## [1] 8015
## Draw the training row indices; reproducibility relies on the set.seed() call made earlier
train_res_ <- sample(seq_len(nrow(default)), size = smp_size)
tail(train_res_)
## [1] 9895 1848 2754 6956 3024 9256
train <- default[train_res_, ]
dim(train) #  row= 8015    col = 4
## [1] 8015    4
head(train)
test <- default[-train_res_, ]
head(test)
dim(test) # 1985    4
## [1] 1985    4
##################################################################
##                    Copy code from lecture                    ##
##################################################################
# Lecture-style split: one uniform draw per row decides its partition.
set.seed(12345)

index <- runif(nrow(default))
# Draws below 0.80 go to "train", the rest to "test".
default$type <- if_else(index < 0.80, "train", "test")

###########################################################################
###########################################################################
###                                                                     ###
###              WHAT IS FACTOR AND WHEN WE SHOULD USE IT               ###
###                                                                     ###
###########################################################################
###########################################################################

# In R, factors are used to work with categorical variables, 
# variables that have a fixed and known set of possible values. 
# They are also useful when you want to display character vectors 
# in a non-alphabetical order.
# source: https://r4ds.had.co.nz/factors.html#:~:text=In%20R%2C%20factors%20are%20used,to%20work%20with%20than%20characters.

default$student <- factor(default$student)
str(default$student) # display factor 
##  Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
# Build the two partitions from the `type` tag, then drop the helper column.
train <- default %>%
  filter(type == "train") %>%
  select(-type)
test <- default %>%
  filter(type == "test") %>%
  select(-type)
dim(train)
## [1] 8015    4
#################################################################
##                       Part 1 is done!                       ##
#################################################################

part B

  1. Construct a logistic regression to predict if an individual will default based on all of the provided predictors, and visualize your final predicted model.
#################################################################
##                         add UW logo                         ##
#################################################################
# Load the packages used for modelling and plotting in this part.
pacman::p_load(dplyr,modelr , tidyr ,cowplot ,  gapminder , tidyverse , viridis)

# Path to the "logo.png" file in the "extdata" folder of the cowplot package.
# NOTE(review): the banner says "UW logo", but this path points at a file
# shipped with cowplot - confirm which image is intended.
logo_file <- system.file("extdata", "logo.png", package = "cowplot")

##################################################################
##                   Generalized linear model                   ##
##################################################################

# Source: https://stats.oarc.ucla.edu/r/dae/logit-regression/


###################################################################################
###################################################################################
###                                                                             ###
###  GENERALISED LINEAR MODELS, E.G. STATS::GLM().  LINEAR MODELS ASSUME THAT   ###
###  THE RESPONSE IS CONTINUOUS AND THE ERROR HAS A NORMAL  DISTRIBUTION.       ###
###  GENERALISED LINEAR MODELS EXTEND LINEAR MODELS TO INCLUDE NON-CONTINUOUS   ###
###  RESPONSES (E.G. BINARY DATA OR COUNTS).  THEY WORK BY DEFINING A DISTANCE  ###
###  METRIC BASED ON THE STATISTICAL IDEA OF  LIKELIHOOD. SOURCE :              ###
###  HTTPS://R4DS.HAD.CO.NZ/MODEL-BASICS.HTML#MISSING-VALUES-5                  ###
###                                                                             ###
###################################################################################
###################################################################################

# Full model: logistic regression of `default` on every other column of `train`.
model_glm <- glm(default ~ ., family="binomial", data=train)
# Confidence intervals for the coefficients (2.5 % and 97.5 % bounds, see output below).
confint(model_glm)
## Waiting for profiling to be done...
##                     2.5 %        97.5 %
## (Intercept) -1.239564e+01 -1.016772e+01
## studentYes  -1.100433e+00 -5.835154e-02
## balance      5.363168e-03  6.419470e-03
## income      -1.088558e-05  2.498355e-05
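# NOTE(review): the table below differs from the confint() output above; it looks
# like a pasted copy from an earlier run and should be confirmed or removed.
## Waiting for profiling to be done...
##                     2.5 %        97.5 %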
## (Intercept)    -1.226004e+01       -1.007588e+01
## studentYes   -1.269516e+00         -2.169029e-01
## balance         5.444101e-03       6.502409e-03
## income        -1.088558e-05        2.498355e-05

summary(model_glm)
## 
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5262  -0.1356  -0.0530  -0.0186   3.7666  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.125e+01  5.679e-01 -19.809   <2e-16 ***
## studentYes  -5.813e-01  2.656e-01  -2.189   0.0286 *  
## balance      5.874e-03  2.692e-04  21.819   <2e-16 ***
## income       7.023e-06  9.144e-06   0.768   0.4425    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2287.4  on 8014  degrees of freedom
## Residual deviance: 1220.8  on 8011  degrees of freedom
## AIC: 1228.8
## 
## Number of Fisher Scoring iterations: 8
###########################################################################
###########################################################################
###                                                                     ###
###  INTERPRETATION                                                     ###
###                                                                     ###
###########################################################################
###########################################################################
# 
# It can be seen that only 2 out of the 3 predictors are significantly 
# associated to the outcome. These include: studentYes pvalue = 0.0286 
# and balance  < 2e-16 .
# 
# The z value for studentYes is its estimate divided by its standard error:
# -5.813e-01 / 2.656e-01 = -2.189 (approximately)

# The z value for balance is calculated the same way:
# 5.874e-03 / 2.692e-04 = 21.82 (approximately)

# The intercept estimate is b0 = -1.125e+01, which is negative. It is the
# log-odds of default for a non-student with zero balance and zero income, so
# the baseline probability of default is very close to zero.


dim(train)[1]
## [1] 8015
names(train)
## [1] "default" "student" "balance" "income"
# default based on all of the provided predictors, 
#  and visualize your final predicted model.



#######################################################################################
##      tally is a convenient wrapper for summarise that will either call n or       ##
##  sum(n) depending on whether you're tallying for the first time, or re-tallying.  ##
##              count() is similar, but also does the group_by for you.              ##
#######################################################################################



################################################################################
################################################################################
###                                                                          ###
###  TALLY IS A CONVENIENT WRAPPER FOR SUMMARISE THAT WILL EITHER CALL N OR  ###
###  SUM(N) DEPENDING ON WHETHER YOU'RE TALLYING FOR THE FIRST TIME,  OR     ###
###  RE-TALLYING. COUNT() IS SIMILAR, BUT ALSO DOES THE GROUP_BY FOR YOU.    ###
###                                                                          ###
################################################################################
################################################################################
# Share of all training rows that fall into each (default, student) combination.
data <- train %>%
    group_by(default, student) %>%
    tally() %>%
    mutate(Percent = n / dim(train)[1] ) 
# 
# Example result. NOTE(review): the counts below sum to 7500, so this table
# looks like it came from an earlier 7500-row split; `train` has 8015 rows in
# the current run, so the printed values will differ.
# # Groups:   default [2]
#   default student     n Percent
#     <dbl> <chr>   <int>   <dbl>
# 1       0 No       5129  0.684    5129/7500 ---> Remember how we did
# 2       0 Yes      2126  0.283    2126/7500 ----> ###Important
# 3       1 No        161  0.0215   161/7500
# 4       1 Yes        84  0.0112   84/7500

head(data)
# add logofile
uw_logo<- draw_image(logo_file, x = 1, y = 1, 
                     hjust = 1, vjust = 1, width = 0.13, height = 0.2)

Balance ~ default Plot 1

############################################################################
############################################################################
###                                                                      ###
###                         OUR PLOT START HERE                          ###
###                                                                      ###
############################################################################
############################################################################


# Balance ~ default: jittered observations, a rug along the bottom coloured by
# outcome, and a logistic (binomial glm) curve of default on balance in red.
myplot <- ggplot(data = train, mapping = aes(x = balance, y = default)) +
  geom_jitter(color = "turquoise4") +
  geom_rug(mapping = aes(color = factor(default)), sides = "b") +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    size = 2,
    color = "red"
  ) +
  theme_minimal_grid() +
  labs(title = "Plot 1") +
  theme(
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )

# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
## `geom_smooth()` using formula 'y ~ x'

Factor(default) ~ Percent (plot 2)

############################################################################
############################################################################
###                                                                      ###
###                         OUR PLOT START HERE                          ###
###                                                                      ###
############################################################################
############################################################################
factor(data$default)  # res : [1] 0 0 1 1
## [1] 0 0 1 1
## Levels: 0 1
# factor(default) ~ Percent: dodged bars of the share of training rows per
# outcome and student status, one facet per outcome, labelled with Percent.
myplot <- ggplot(
  data = data,
  mapping = aes(x = factor(default), fill = student, y = Percent)
) +
  geom_jitter(color = "turquoise4") +
  # geom_bar reference:
  # https://www.rdocumentation.org/packages/ggplot2/versions/1.0.1/topics/geom_bar
  geom_bar(stat = "identity", position = "dodge") +
  # geom_text reference: https://ggplot2.tidyverse.org/reference/geom_text.html
  geom_text(
    mapping = aes(label = Percent),
    size = 3,
    angle = 90,
    position = position_dodge(0.9)
  ) +
  # facet_wrap reference:
  # https://www.rdocumentation.org/packages/ggplot2/versions/3.3.5/topics/facet_wrap
  facet_wrap(~default, scales = "free") +
  theme_minimal_hgrid() +
  labs(x = "default", y = "Percent", title = "Plot 2") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )

# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )

Income ~ default Plot 3

############################################################################
############################################################################
###                                                                      ###
###                         OUR PLOT START HERE                          ###
###                                                                      ###
############################################################################
############################################################################

# Income ~ default: jittered observations, a rug along the bottom coloured by
# outcome, and a logistic (binomial glm) curve of default on income in red.
myplot <- ggplot(data = train, mapping = aes(x = income, y = default)) +
  geom_jitter(color = "turquoise4") +
  geom_rug(mapping = aes(color = factor(default)), sides = "b") +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    size = 2,
    color = "red"
  ) +
  theme_minimal_hgrid() +
  labs(title = "Plot 3 ") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )

# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
## `geom_smooth()` using formula 'y ~ x'

Student ~ pred Plot 4

How to read box plot

source : https://r-graph-gallery.com/boxplot.html

#-------------------------------------------------------------
## In-class exercises:
##
## Write a function called logit2prob to convert log-odds to 
## the predicted scale
#-------------------------------------------------------------

# Convert log-odds (logits) to probabilities with the logistic function.
#
# x: numeric vector of log-odds.
# Returns a numeric vector of the same length with values in [0, 1].
#
# stats::plogis() evaluates 1 / (1 + exp(-x)), so large positive inputs give 1
# instead of the NaN (Inf / Inf) that exp(x) / (1 + exp(x)) produces.
logit2prob <- function(x) {
  stats::plogis(x)
}



##################################################################################
##  We’ll use modelr::add_predictions() which takes a data frame and a model.   ##
##  It adds the predictions from the model to a new column in the data frame:   ##
##################################################################################

# Source : https://r4ds.had.co.nz/model-basics.html#missing-values-5

# Attach the full model's predictions to the training rows as `pred`, then
# convert them from log-odds to probabilities.
train_res <- train %>%
  add_predictions(model = model_glm, var = "pred") %>%
  mutate(pred = logit2prob(pred))

head(train_res)
############################################################################
############################################################################
###                                                                      ###
###                         OUR PLOT START HERE                          ###
###                                                                      ###
############################################################################
############################################################################



# Student ~ pred: boxplots of the predicted default probability by student
# status, drawn on a log-transformed y axis.
myplot <- ggplot(
  data = train_res,
  mapping = aes(x = student, y = pred, fill = student)
) +
  geom_boxplot(width = 0.5, lwd = 1.5) +
  scale_y_continuous(trans = "log") +
  theme_minimal_hgrid() +
  labs(title = "Plot 4") +
  theme(
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )

# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )

income ~ pred Plot 5

############################################################################
############################################################################
###                                                                      ###
###                         OUR PLOT START HERE                          ###
###                                                                      ###
############################################################################
############################################################################
# Income ~ pred: predicted default probability against income, with jittered
# points, semi-transparent points coloured by student status, and a red
# binomial-glm smooth of pred on income.
myplot <- ggplot(data = train_res, mapping = aes(x = income, y = pred)) +
  geom_jitter(color = "turquoise4") +
  geom_point(mapping = aes(color = student), alpha = 0.3) +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    size = 2,
    color = "red"
  ) +
  theme_minimal_grid() +
  labs(y = "Predicted probability of default", title = "Plot 5") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )

# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

student ~ balance -> boxplot Plot 6

how to read box plot

source : https://r-graph-gallery.com/boxplot.html

############################################################################
############################################################################
###                                                                      ###
###                         OUR PLOT START HERE                          ###
###                                                                      ###
############################################################################
############################################################################

# Student ~ balance: boxplots of the remaining balance by student status.
myplot <- ggplot(
  data = train_res,
  mapping = aes(x = student, y = balance, fill = student)
) +
  geom_boxplot(width = 0.5, lwd = 1.5) +
  labs(title = "Plot 6") +
  theme(
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )

# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )

Balance ~ pred Plot 7

############################################################################
############################################################################
###                                                                      ###
###                         OUR PLOT START HERE                          ###
###                                                                      ###
############################################################################
############################################################################
# Balance ~ pred: predicted default probability against balance, with points
# and connecting lines coloured by student status on top of jittered points.
myplot <- ggplot(
  data = train_res,
  mapping = aes(x = balance, y = pred, color = student)
) +
  geom_jitter(color = "turquoise4") +
  geom_point() +
  geom_line() +
  theme_minimal_hgrid() +
  labs(y = "Predicted probability of default", title = "Plot 7") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )

# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )

############################################################################
############################################################################
###                                                                      ###
###                         Interpretation                               ###
###                                                                      ###
############################################################################
############################################################################

# Note that the change in predicted probability is not constant: the curve
# starts to rise after a balance of about 1000, rises more quickly in the
# middle, and then levels out at the end.
# The difference in probabilities between balances of 0 and 1000 is far smaller
# than the difference between 1000 and 2000, or beyond 2000.

Model 1

# Model 1 (full model): classify a row as default (1) unless its predicted
# probability is below 0.5, for both the training and the test rows.
train <- mutate(train_res, pred_default = ifelse(pred < 0.5, 0, 1))
head(train)
test <- test %>%
  add_predictions(model_glm) %>%
  mutate(
    pred = logit2prob(pred),
    pred_default = ifelse(pred < 0.5, 0, 1)
  )
head(test)
print("Our test error for model 1")
## [1] "Our test error for model 1"
mean(test$pred_default != test$default)
## [1] 0.02871537
print("Our train error for model 1")
## [1] "Our train error for model 1"
mean(train$pred_default != train$default)
## [1] 0.02632564

Model 2

second logistic regression model using only income as a predictor and compare its error rate to the full model

mod2 <- glm(default ~ income, family="binomial", data=train_res)
mod2
## 
## Call:  glm(formula = default ~ income, family = "binomial", data = train_res)
## 
## Coefficients:
## (Intercept)       income  
##  -3.158e+00   -7.312e-06  
## 
## Degrees of Freedom: 8014 Total (i.e. Null);  8013 Residual
## Null Deviance:       2287 
## Residual Deviance: 2285  AIC: 2289
# Model 2 (income only): replace `pred` and `pred_default` in both partitions
# with the predictions of `mod2`, using the same 0.5 cut-off.
train <- train %>%
  add_predictions(mod2) %>%
  mutate(
    pred = logit2prob(pred),
    pred_default = ifelse(pred < 0.5, 0, 1)
  )
head(train)
test <- test %>%
  add_predictions(mod2) %>%
  mutate(
    pred = logit2prob(pred),
    pred_default = ifelse(pred < 0.5, 0, 1)
  )

head(test)
print("Our test error for model 2")
## [1] "Our test error for model 2"
mean(test$pred_default != test$default)
## [1] 0.0372796
print("Our train error for model 2")
## [1] "Our train error for model 2"
# Training error of model 2: compare the predictions with the labels stored in
# the same data frame, as done for the other three error rates.
mean(train$pred_default != train$default)
## [1] 0.03231441

Part C

  1. Interpret all coefficients, indicating the significance of the coefficients and what it means
summary(model_glm)
## 
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5262  -0.1356  -0.0530  -0.0186   3.7666  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.125e+01  5.679e-01 -19.809   <2e-16 ***
## studentYes  -5.813e-01  2.656e-01  -2.189   0.0286 *  
## balance      5.874e-03  2.692e-04  21.819   <2e-16 ***
## income       7.023e-06  9.144e-06   0.768   0.4425    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2287.4  on 8014  degrees of freedom
## Residual deviance: 1220.8  on 8011  degrees of freedom
## AIC: 1228.8
## 
## Number of Fisher Scoring iterations: 8

y = the predicted value of the dependent variable

B0 = the y-intercept (value of y when all other parameters are set to 0)

B1X1= the regression coefficient (B1) of the first independent variable (X1) (a.k.a. the effect that increasing the value of the independent variable has on the predicted y value)

… = do the same for however many independent variables you are testing

BnXn = the regression coefficient of the last independent variable

e = model error (a.k.a. how much variation there is in our estimate of y)

Our formula:

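log-odds of default = log(p / (1 - p)) = -1.125e+01 + -5.813e-01(studentYes) + 5.874e-03(balance) + 7.023e-06(income), where p is the probability of default. (Because this is a logistic regression, the linear formula describes the log-odds of default rather than the credit debt itself, and there is no additive error term e on this scale.)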

As the result above shows, the intercept is -1.125e+01. Among all predictors, only income has a very high p-value, which is 0.4425.

Overall, based on its p-value, income is not statistically significant.

All other variables do much better: for example, p-value(studentYes) = 0.0286, with a log-odds coefficient of -5.813e-01.

A one-unit increase in the predictor variable studentYes (that is, being a student rather than not) is associated with an average change of -5.813e-01 in the log odds of the response variable default taking on a value of 1, holding balance and income fixed. This means that students are associated with a lower likelihood of default taking on a value of 1 than non-students with the same balance and income.

in addition, for balance , we have p-value(balance) <2e-16 and the log odds is 5.874e-03 .

The log-odds function of probabilities is often used in state estimation algorithms because of its numerical advantages in the case of small probabilities.

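As the formula shows, the studentYes coefficient is negative (-5.813e-01), whereas the coefficients of the other variables are positive: each additional dollar of balance or income increases the log odds of default by 5.874e-03 or 7.023e-06, respectively.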

It can be seen that only 2 out of the 3 predictors are significantly associated to the outcome. These include: studentYes p-value = 0.0286 and balance < 2e-16 .

The z value for studentYes is calculated as -5.813e-01 / 2.656e-01 = -2.189 (approximately), and the z value for balance is calculated as 5.874e-03 / 2.692e-04 = 21.82 (approximately).

The intercept estimate is b0 = -1.125e+01, which is negative. It is the log-odds of default for a non-student with zero balance and zero income, so the baseline probability of default is very close to zero.

It can be seen that changes in studentYes and balance are significantly associated with changes in the probability of default, while changes in income are not significantly associated with it.

We can also see, based on the boxplot in Plot 4, how the predicted probability of default differs between non-students and students. Plot 6 compares the remaining balance of the two groups, which helps explain that difference, because balance is the strongest predictor in the model.

Part D

  1. Calculate the error rate of the model. Do you think this is a good model? Estimate a second logistic regression model using only income as a predictor and compare its error rate to the full model.

Training error vs test error

In machine learning, there are two important concepts: the training error and the test error.

Training Error: We get this by calculating the classification error of a model on the same data the model was trained on (just like the example above). Test Error: We get this by using two completely disjoint datasets: one to train the model and the other to calculate the classification error. Both datasets need to have values for y. The first dataset is called training data and the second, test data.

You get training error when you run the trained model against the training data. It is important to remember that this data was already used to train the model and this does not imply that the model will be accurate when applied to the training data again.

Source : https://rapidminer.com/blog/validate-models-training-test-error/

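The test error of model 1 is 0.02871537, which is good. For context, always predicting "no default" would already give an error rate of about 3.3% (333 defaults out of 10000 rows), so the improvement over that baseline is modest.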

1985 * 0.02871537 = 57.0000095 ~ 57 error occurs in the test model 1.

By comparison, the test error of the second model that I ran is 0.0372796.

1985 * 0.0372796 = 74.000006 ~ 74 mistake occurs in the test model 2.

Also, the training error rate for model 1 is 0.02632564.

8015 * 0.02632564 = 211.000005 ~ 211 errors found in training model 1.

The training error rate for the second model is 0.03231441.

8015 * 0.03231441 = 258.999996 ~ 259 errors found in training model 2.

All in all, model 1 has both a lower test error and a lower training error than model 2.

so, model 1 is WINNER.

Answering question 2

Part A

# NOTE(review): this silences warnings for every statement that follows, not
# just the next call.
options(warn=-1) # suppress warnings with -1, show warnings with 0

pacman::p_load(dplyr, tidyr ,cowplot 
               , tidyverse , viridis , GGally)


data <- read_csv("evals-mod.csv")
## Rows: 463 Columns: 18
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): rank, ethnicity, gender, language, cls_level, cls_profs, cls_credits
## dbl (11): score, age, cls_perc_eval, cls_did_eval, cls_students, bty_f1lower...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(data) # the 18 column names of the evaluations data, listed below
##  [1] "score"         "rank"          "ethnicity"     "gender"       
##  [5] "language"      "age"           "cls_perc_eval" "cls_did_eval" 
##  [9] "cls_students"  "cls_level"     "cls_profs"     "cls_credits"  
## [13] "bty_f1lower"   "bty_f1upper"   "bty_f2upper"   "bty_m1lower"  
## [17] "bty_m1upper"   "bty_m2upper"
head(data) # show the first rows of the data
dim(data) # rows = 463, cols = 18
## [1] 463  18
# Overview of the data: pairwise plot matrix, coloured by gender.
options(warn=-1) # warn = -1 suppresses warnings; warn = 0 shows them again

# `colour` has to be supplied as an aesthetic through `mapping`. Passed as a
# bare argument, ggpairs() ignores it and draws the matrix without colouring.
ggpairs(data, mapping = aes(colour = gender),
        title = "correlogram with ggpairs()") +
  theme(panel.grid.major = element_blank())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Split the data into training and test sets.

I learned what a factor is and how to use it.

source : https://www.youtube.com/watch?v=xkRBfy8_2MU

set.seed(12345) # fix the RNG seed so that the random numbers drawn later are reproducible

# NOTE(review): `data` is re-read from the same CSV file that was loaded
# earlier, which resets it to the raw file contents.

data <- read_csv("evals-mod.csv")
## Rows: 463 Columns: 18
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): rank, ethnicity, gender, language, cls_level, cls_profs, cls_credits
## dbl (11): score, age, cls_perc_eval, cls_did_eval, cls_students, bty_f1lower...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(data)
##  [1] "score"         "rank"          "ethnicity"     "gender"       
##  [5] "language"      "age"           "cls_perc_eval" "cls_did_eval" 
##  [9] "cls_students"  "cls_level"     "cls_profs"     "cls_credits"  
## [13] "bty_f1lower"   "bty_f1upper"   "bty_f2upper"   "bty_m1lower"  
## [17] "bty_m1upper"   "bty_m2upper"
library(broom)
## 
## Attaching package: 'broom'
## 
## The following object is masked from 'package:modelr':
## 
##     bootstrap
# Goal: model the average teacher evaluation score from some combination of
# rank, ethnicity, gender, language, age, cls_perc_eval, cls_did_eval,
# cls_students, cls_level, cls_profs, cls_credits and the beauty average.

# Collapse the six individual beauty ratings (bty_*) into their row-wise mean.
data$bty_average <- rowMeans(select(data, starts_with("bty_")))

# Drop the individual rating columns, whose names end in "lower" or "upper".
data <- select(data, -ends_with("lower"), -ends_with("upper"))

names(data)
##  [1] "score"         "rank"          "ethnicity"     "gender"       
##  [5] "language"      "age"           "cls_perc_eval" "cls_did_eval" 
##  [9] "cls_students"  "cls_level"     "cls_profs"     "cls_credits"  
## [13] "bty_average"
#--------------------------------------------------------
## Part a: random split -- a row is labelled "train" when its uniform draw is
## below 0.80 and "test" otherwise
set.seed(12345)
index <- runif(nrow(data))
data[["type"]] <- ifelse(index < 0.80, "train", "test")

# Keep the rows of each label and remove the helper column again.
train <- select(filter(data, type == "train"), -type)
test <- select(filter(data, type == "test"), -type)
dim(train)
## [1] 360  13
names(data)
##  [1] "score"         "rank"          "ethnicity"     "gender"       
##  [5] "language"      "age"           "cls_perc_eval" "cls_did_eval" 
##  [9] "cls_students"  "cls_level"     "cls_profs"     "cls_credits"  
## [13] "bty_average"   "type"
options(warn=-1) # warn = -1 suppresses warnings; warn = 0 shows them again

# Distribution of the evaluation scores. `bins` is a parameter of the geom, not
# an aesthetic: placed inside aes() it is ignored and stat_bin() falls back to
# its default while printing a "Pick better value with `binwidth`" message.
ggplot(data) + geom_histogram(aes(x = score), bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#################################################################
##                       Part 1 is done!                       ##
#################################################################

part B

A predictor with a very low adjusted R^2 and a large p-value explains almost none of the variation in the evaluation scores, so the predictor with the lowest adjusted R^2 and the largest p-value is the worst predictor.

As you can see, cls_profs has an adjusted R^2 of -0.002291 and a p-value of 0.672, so we can definitely say that cls_profs is the worst predictor of evaluation scores.

# Simple regression of score on rank, fitted on the training set.
model_rank <- lm(score ~ rank, data = train) # baseline is the rank level that gets no coefficient of its own in the summary
summary(model_rank)
## 
## Call:
## lm(formula = score ~ rank, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81707 -0.32687  0.08627  0.38961  0.87313 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.31039    0.06163  69.936   <2e-16 ***
## ranktenure track -0.19332    0.08582  -2.252   0.0249 *  
## ranktenured      -0.18352    0.07248  -2.532   0.0118 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5408 on 357 degrees of freedom
## Multiple R-squared:  0.01979,    Adjusted R-squared:  0.01429 
## F-statistic: 3.603 on 2 and 357 DF,  p-value: 0.02823
# Simple regression of score on ethnicity, fitted on the training set.
model_ethnicity <- lm(score ~ ethnicity, data = train) # "not minority" gets the coefficient, so it is compared against the other ethnicity level
summary(model_ethnicity)
## 
## Call:
## lm(formula = score ~ ethnicity, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8802 -0.3802  0.1198  0.4198  0.9447 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            4.05532    0.07933  51.119   <2e-16 ***
## ethnicitynot minority  0.12487    0.08508   1.468    0.143    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5439 on 358 degrees of freedom
## Multiple R-squared:  0.005981,   Adjusted R-squared:  0.003205 
## F-statistic: 2.154 on 1 and 358 DF,  p-value: 0.1431
# Simple regression of score on gender, fitted on the training set.
model_gender <- lm(score ~ gender, data = train) # the summary reports `gendermale`, so male is compared against the other gender (male is NOT the reference)
summary(model_gender)
## 
## Call:
## lm(formula = score ~ gender, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.82783 -0.33895  0.07217  0.42770  0.82770 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.07230    0.04439  91.731  < 2e-16 ***
## gendermale   0.15553    0.05785   2.689  0.00751 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5401 on 358 degrees of freedom
## Multiple R-squared:  0.01979,    Adjusted R-squared:  0.01705 
## F-statistic: 7.228 on 1 and 358 DF,  p-value: 0.007512
# Simple regression of score on language, fitted on the training set.
model_language <- lm(score ~ language, data = train) # "non-english" gets the coefficient, so it is compared against the other language level
summary(model_language)
## 
## Call:
## lm(formula = score ~ language, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8794 -0.3199  0.1206  0.4206  0.8206 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          4.17941    0.02938 142.261   <2e-16 ***
## languagenon-english -0.27941    0.12464  -2.242   0.0256 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5417 on 358 degrees of freedom
## Multiple R-squared:  0.01384,    Adjusted R-squared:  0.01109 
## F-statistic: 5.025 on 1 and 358 DF,  p-value: 0.02559
# Simple regression of score on age, fitted on the training set.

model_age <- lm(score ~ age, data = train) # age is numeric: a single slope, no reference level
summary(model_age)
## 
## Call:
## lm(formula = score ~ age, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.90236 -0.34448  0.08711  0.41867  0.88710 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.418117   0.143655  30.755   <2e-16 ***
## age         -0.005262   0.002914  -1.806   0.0718 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.543 on 358 degrees of freedom
## Multiple R-squared:  0.009028,   Adjusted R-squared:  0.00626 
## F-statistic: 3.261 on 1 and 358 DF,  p-value: 0.07177
# Simple regression of score on cls_perc_eval, fitted on the training set.

model_cls_perc_eval <- lm(score ~ cls_perc_eval, data = train) # numeric predictor: a single slope, no reference level
summary(model_cls_perc_eval)
## 
## Call:
## lm(formula = score ~ cls_perc_eval, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.91969 -0.34406  0.08118  0.40326  1.05171 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.697766   0.126253  29.288  < 2e-16 ***
## cls_perc_eval 0.006263   0.001654   3.788 0.000178 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5349 on 358 degrees of freedom
## Multiple R-squared:  0.03853,    Adjusted R-squared:  0.03584 
## F-statistic: 14.35 on 1 and 358 DF,  p-value: 0.0001784
# Simple regression of score on cls_did_eval, fitted on the training set.
model_cls_did_eval <- lm(score ~ cls_did_eval, data = train) # numeric predictor: a single slope, no reference level
summary(model_cls_did_eval)
## 
## Call:
## lm(formula = score ~ cls_did_eval, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8422 -0.3498  0.1097  0.4286  0.8610 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.1341606  0.0368554 112.172   <2e-16 ***
## cls_did_eval 0.0008015  0.0006239   1.285      0.2    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5442 on 358 degrees of freedom
## Multiple R-squared:  0.004589,   Adjusted R-squared:  0.001808 
## F-statistic:  1.65 on 1 and 358 DF,  p-value: 0.1998
# Simple regression of score on cls_students, fitted on the training set.
model_cls_students <- lm(score ~ cls_students, data = train) # numeric predictor: a single slope, no reference level
summary(model_cls_students)
## 
## Call:
## lm(formula = score ~ cls_students, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8551 -0.3574  0.1247  0.4400  0.8451 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.1526869  0.0355180 116.918   <2e-16 ***
## cls_students 0.0001979  0.0003688   0.537    0.592    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5453 on 358 degrees of freedom
## Multiple R-squared:  0.000804,   Adjusted R-squared:  -0.001987 
## F-statistic: 0.2881 on 1 and 358 DF,  p-value: 0.5918
# Simple regression of score on cls_level, fitted on the training set.
model_cls_level <- lm(score ~ cls_level, data = train) # "upper" gets the coefficient, so it is compared against the other class level
summary(model_cls_level)
## 
## Call:
## lm(formula = score ~ cls_level, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.83504 -0.33504  0.08254  0.40314  0.86496 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.21746    0.04847  87.012   <2e-16 ***
## cls_levelupper -0.08242    0.06012  -1.371    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5441 on 358 degrees of freedom
## Multiple R-squared:  0.005222,   Adjusted R-squared:  0.002443 
## F-statistic: 1.879 on 1 and 358 DF,  p-value: 0.1713
# Simple regression of score on cls_profs, fitted on the training set.
model_cls_profs <- lm(score ~ cls_profs, data = train) # "single" gets the coefficient, so it is compared against the other cls_profs level
summary(model_cls_profs)
## 
## Call:
## lm(formula = score ~ cls_profs, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8467 -0.3725  0.1275  0.4275  0.8533 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.17250    0.03520 118.526   <2e-16 ***
## cls_profssingle -0.02583    0.06097  -0.424    0.672    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5454 on 358 degrees of freedom
## Multiple R-squared:  0.0005012,  Adjusted R-squared:  -0.002291 
## F-statistic: 0.1795 on 1 and 358 DF,  p-value: 0.6721
# Simple regression of score on cls_credits, fitted on the training set.
model_cls_credits <- lm(score ~ cls_credits, data = train) # "one credit" gets the coefficient, so it is compared against the other cls_credits level
summary(model_cls_credits)
## 
## Call:
## lm(formula = score ~ cls_credits, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.83735 -0.33735  0.06265  0.36265  0.86265 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            4.13735    0.02898 142.769  < 2e-16 ***
## cls_creditsone credit  0.47765    0.12295   3.885 0.000122 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5344 on 358 degrees of freedom
## Multiple R-squared:  0.04045,    Adjusted R-squared:  0.03777 
## F-statistic: 15.09 on 1 and 358 DF,  p-value: 0.0001219
# Simple regression of score on bty_average, fitted on the training set.
model_bty_average <- lm(score ~ bty_average, data = train) # numeric predictor: a single slope, no reference level
summary(model_bty_average)
## 
## Call:
## lm(formula = score ~ bty_average, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9134 -0.3582  0.1418  0.4080  0.9411 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.87139    0.08634  44.840  < 2e-16 ***
## bty_average  0.06619    0.01846   3.585 0.000384 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.536 on 358 degrees of freedom
## Multiple R-squared:  0.03466,    Adjusted R-squared:  0.03196 
## F-statistic: 12.85 on 1 and 358 DF,  p-value: 0.0003837
names(train)
##  [1] "score"         "rank"          "ethnicity"     "gender"       
##  [5] "language"      "age"           "cls_perc_eval" "cls_did_eval" 
##  [9] "cls_students"  "cls_level"     "cls_profs"     "cls_credits"  
## [13] "bty_average"
library(sjPlot)
## Registered S3 method overwritten by 'parameters':
##   method                         from      
##   format.parameters_distribution datawizard
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
## 
## Attaching package: 'sjPlot'
## The following objects are masked from 'package:cowplot':
## 
##     plot_grid, save_plot
library(sjmisc)
## 
## Attaching package: 'sjmisc'
## The following object is masked from 'package:purrr':
## 
##     is_empty
## The following object is masked from 'package:tibble':
## 
##     add_case
## The following object is masked from 'package:tidyr':
## 
##     replace_na
library(sjlabelled)
## 
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following object is masked from 'package:ggplot2':
## 
##     as_label
## The following object is masked from 'package:dplyr':
## 
##     as_label
# Regression table comparing the twelve single-predictor models; see
# https://www.rdocumentation.org/packages/sjPlot/versions/2.8.10/topics/tab_model
# The table is also written to "output.html".
tab <- tab_model(
  model_rank, model_ethnicity, model_gender, model_language, model_age,
  model_cls_perc_eval, model_cls_did_eval, model_cls_students,
  model_cls_level, model_cls_profs, model_cls_credits, model_bty_average,
  digits = 5,
  dv.labels = c(
    "model_rank", "model_ethnicity", "model_gender", "model_language",
    "model_age", "model_cls_perc_eval", "model_cls_did_eval",
    "model_cls_students", "model_cls_level", "model_cls_profs",
    "model_cls_credits", "model_bty_average"
  ),
  title = "evals-mod",
  show.p = TRUE,
  show.se = TRUE,
  # Pass the confidence level itself. With `show.ci = TRUE` every interval in
  # the rendered table came out as "-Inf – Inf", presumably because TRUE is
  # read as a level of 1 (a 100% interval); 0.95 is the documented default.
  show.ci = 0.95,
  collapse.se = TRUE,
  p.style = "stars",
  show.aic = TRUE,
  show.fstat = TRUE,
  show.r2 = TRUE,
  file = "output.html"
)

# Relabel "adjusted" in the generated HTML page before printing the table.
tab$page.complete <- gsub("adjusted", "adjusted or conditional",
                          tab$page.complete)
tab
evals-mod
  model_rank model_ethnicity model_gender model_language model_age model_cls_perc_eval model_cls_did_eval model_cls_students model_cls_level model_cls_profs model_cls_credits model_bty_average
Predictors Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI Estimates CI
(Intercept) 4.31039 ***
(0.06163)
-Inf – Inf 4.05532 ***
(0.07933)
-Inf – Inf 4.07230 ***
(0.04439)
-Inf – Inf 4.17941 ***
(0.02938)
-Inf – Inf 4.41812 ***
(0.14366)
-Inf – Inf 3.69777 ***
(0.12625)
-Inf – Inf 4.13416 ***
(0.03686)
-Inf – Inf 4.15269 ***
(0.03552)
-Inf – Inf 4.21746 ***
(0.04847)
-Inf – Inf 4.17250 ***
(0.03520)
-Inf – Inf 4.13735 ***
(0.02898)
-Inf – Inf 3.87139 ***
(0.08634)
-Inf – Inf
rank [tenure track] -0.19332 *
(0.08582)
-Inf – Inf
rank [tenured] -0.18352 *
(0.07248)
-Inf – Inf
ethnicity [not minority] 0.12487
(0.08508)
-Inf – Inf
gender [male] 0.15553 **
(0.05785)
-Inf – Inf
language [non-english] -0.27941 *
(0.12464)
-Inf – Inf
age -0.00526
(0.00291)
-Inf – Inf
cls perc eval 0.00626 ***
(0.00165)
-Inf – Inf
cls did eval 0.00080
(0.00062)
-Inf – Inf
cls students 0.00020
(0.00037)
-Inf – Inf
cls level [upper] -0.08242
(0.06012)
-Inf – Inf
cls profs [single] -0.02583
(0.06097)
-Inf – Inf
cls credits [one credit] 0.47765 ***
(0.12295)
-Inf – Inf
bty average 0.06619 ***
(0.01846)
-Inf – Inf
Observations 360 360 360 360 360 360 360 360 360 360 360 360
R2 / R2 adjusted 0.020 / 0.014 0.006 / 0.003 0.020 / 0.017 0.014 / 0.011 0.009 / 0.006 0.039 / 0.036 0.005 / 0.002 0.001 / -0.002 0.005 / 0.002 0.001 / -0.002 0.040 / 0.038 0.035 / 0.032
AIC 584.079 587.114 582.078 584.256 586.009 575.129 587.618 588.984 587.389 589.093 574.408 576.576
  • p<0.05   ** p<0.01   *** p<0.001

part C

library(GGally)
# Pairwise plot matrix of every column in the training set.
ggpairs(train)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Our result shows that we have 7 terms as variables: cls_credits, bty_average, gender, cls_perc_eval, ethnicity, language and age. For the question asked, we can ignore cls_did_eval, because it measures much the same thing as cls_perc_eval and it is also highly correlated with cls_students.

Call: lm(formula = score ~ cls_did_eval, data = train)

Residuals: Min 1Q Median 3Q Max -1.8422 -0.3498 0.1097 0.4286 0.8610

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.1341606 0.0368554 112.172 <2e-16 ***
cls_did_eval 0.0008015 0.0006239 1.285 0.2
— Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5442 on 358 degrees of freedom Multiple R-squared: 0.004589, Adjusted R-squared: 0.001808 F-statistic: 1.65 on 1 and 358 DF, p-value: 0.1998

Call: lm(formula = score ~ cls_perc_eval, data = train)

Residuals: Min 1Q Median 3Q Max -1.91969 -0.34406 0.08118 0.40326 1.05171

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.697766 0.126253 29.288 < 2e-16 ***
cls_perc_eval 0.006263 0.001654 3.788 0.000178 ***
— Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5349 on 358 degrees of freedom Multiple R-squared: 0.03853, Adjusted R-squared: 0.03584 F-statistic: 14.35 on 1 and 358 DF, p-value: 0.0001784

Part D

  1. Using one of the model selection techniques discussed in class, determine the best model. Write out the linear model for predicting score based on the final model you settle on, and interpret the slopes of one numerical and one categorical predictor based on your final model.

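I will use stepwise regression, in which predictors are entered and removed based on their p-values (olsrr::ols_step_both_p); the AIC of the model at each step is reported alongside. Reasons: 1. It is easy to apply. 2. It improves model generalizability. 3. It yields a simple model that is easy to interpret. 4. It is objective and reproducible.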

our final model is here

                    Estimate Std. Error t value Pr(>|t|)

(Intercept) 3.413737 0.228366 14.949 < 2e-16 ***
ethnicitynot minority 0.198474 0.085403 2.324 0.020696 *
gendermale 0.201301 0.057043 3.529 0.000473 ***
languagenon-english -0.220350 0.120603 -1.827 0.068537 .
age -0.004885 0.002943 -1.660 0.097816 .
cls_perc_eval 0.005436 0.001609 3.378 0.000810 ***
cls_creditsone credit 0.491295 0.120843 4.066 5.91e-05 ***
bty_average 0.062328 0.018374 3.392 0.000772 ***

Our model: predicted score = 3.413737 + 0.198474 * [ethnicity not minority] + 0.201301 * [gendermale] - 0.220350 * [languagenon-english] - 0.004885 * [age] + 0.005436 * [cls_perc_eval] + 0.491295 * [cls_creditsone credit] + 0.062328 * [bty_average]

y = the predicted value of the dependent variable

B0 = the y-intercept (value of y when all other parameters are set to 0)

B1X1= the regression coefficient (B1) of the first independent variable (X1) (a.k.a. the effect that increasing the value of the independent variable has on the predicted y value)

… = do the same for however many independent variables you are testing

BnXn = the regression coefficient of the last independent variable

e = model error (a.k.a. how much variation there is in our estimate of y)

############################################################################
###                                                                      ###
###                           INTERPRETATION.                            ###
###                                                                      ###
############################################################################
############################################################################

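The slope for age is -0.004885: holding the other predictors constant, each additional year of age is associated with a decrease of 0.004885 in the predicted evaluation score. Note that the p-value for age is 0.097816, so this effect is not significant at the 5% level. The slope for gendermale is 0.201301 with a p-value of 0.000473: holding the other predictors constant, male professors are predicted to score 0.201301 points higher than women.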

############################################################################
############################################################################
###                                                                      ###
###                      Stepwise Regression.                            ###
###                                                                      ###
############################################################################
############################################################################
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
# Stepwise selection with ols_step_both_p():
# builds a regression model from a set of candidate predictor variables by
# entering and removing predictors based on p values, one step at a time,
# until there is no variable left to enter or remove. The model passed in
# should include all the candidate predictor variables. If `details` is set to
# TRUE, each step is displayed.

# Full model: score regressed on every other column of the training set.
model <- lm(formula = score ~ ., data = train)

ols_step_both_p(model)
## 
##                                 Stepwise Selection Summary                                 
## ------------------------------------------------------------------------------------------
##                           Added/                   Adj.                                       
## Step      Variable       Removed     R-Square    R-Square     C(p)        AIC        RMSE     
## ------------------------------------------------------------------------------------------
##    1     cls_credits     addition       0.040       0.038    45.3740    574.4081    0.5344    
##    2     bty_average     addition       0.080       0.075    30.7900    561.2173    0.5239    
##    3       gender        addition       0.104       0.097    22.7230    553.6736    0.5178    
##    4    cls_perc_eval    addition       0.128       0.118    14.8580    546.0691    0.5116    
##    5      ethnicity      addition       0.148       0.136     8.5540    539.7809    0.5065    
##    6      language       addition       0.156       0.141     7.2310    538.4109    0.5048    
##    7         age         addition       0.162       0.145     6.4880    537.6038    0.5036    
## ------------------------------------------------------------------------------------------
# Store the stepwise selection result so that it can be plotted.
k <- ols_step_both_p(model)
# The plot method shows the panel of fit criteria for stepwise regression methods.
plot(k)

DETAILED OUTPUT

# Detailed output: rerun the stepwise selection with details = TRUE so that
# every step is printed.
ols_step_both_p(model, details = TRUE)
## Stepwise Selection Method   
## ---------------------------
## 
## Candidate Terms: 
## 
## 1. rank 
## 2. ethnicity 
## 3. gender 
## 4. language 
## 5. age 
## 6. cls_perc_eval 
## 7. cls_did_eval 
## 8. cls_students 
## 9. cls_level 
## 10. cls_profs 
## 11. cls_credits 
## 12. bty_average 
## 
## We are selecting variables based on p value...
## 
## 
## Stepwise Selection: Step 1 
## 
## - cls_credits added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.201       RMSE                0.534 
## R-Squared               0.040       Coef. Var          12.833 
## Adj. R-Squared          0.038       MSE                 0.286 
## Pred R-Squared          0.032       MAE                 0.435 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression      4.309          1          4.309    15.093     1e-04 
## Residual      102.221        358          0.286                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                        
## ------------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta       t        Sig     lower    upper 
## ------------------------------------------------------------------------------------------------
##           (Intercept)    4.137         0.029                 142.769    0.000    4.080    4.194 
## cls_creditsone credit    0.478         0.123        0.201      3.885    0.000    0.236    0.719 
## ------------------------------------------------------------------------------------------------
## 
## 
## 
## Stepwise Selection: Step 2 
## 
## - bty_average added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.283       RMSE                0.524 
## R-Squared               0.080       Coef. Var          12.583 
## Adj. R-Squared          0.075       MSE                 0.275 
## Pred R-Squared          0.066       MAE                 0.431 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression      8.533          2          4.267    15.543    0.0000 
## Residual       97.997        357          0.275                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.822         0.085                 44.859    0.000    3.655    3.990 
## cls_creditsone credit    0.507         0.121        0.214     4.199    0.000    0.270    0.745 
##           bty_average    0.071         0.018        0.200     3.923    0.000    0.035    0.106 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.283       RMSE                0.524 
## R-Squared               0.080       Coef. Var          12.583 
## Adj. R-Squared          0.075       MSE                 0.275 
## Pred R-Squared          0.066       MAE                 0.431 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression      8.533          2          4.267    15.543    0.0000 
## Residual       97.997        357          0.275                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.822         0.085                 44.859    0.000    3.655    3.990 
## cls_creditsone credit    0.507         0.121        0.214     4.199    0.000    0.270    0.745 
##           bty_average    0.071         0.018        0.200     3.923    0.000    0.035    0.106 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Stepwise Selection: Step 3 
## 
## - gender added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.323       RMSE                0.518 
## R-Squared               0.104       Coef. Var          12.434 
## Adj. R-Squared          0.097       MSE                 0.268 
## Pred R-Squared          0.085       MAE                 0.422 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     11.097          3          3.699    13.798    0.0000 
## Residual       95.434        356          0.268                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.692         0.094                 39.180    0.000    3.506    3.877 
## cls_creditsone credit    0.490         0.119        0.206     4.098    0.000    0.255    0.725 
##           bty_average    0.078         0.018        0.218     4.315    0.000    0.042    0.113 
##            gendermale    0.173         0.056        0.157     3.093    0.002    0.063    0.283 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.323       RMSE                0.518 
## R-Squared               0.104       Coef. Var          12.434 
## Adj. R-Squared          0.097       MSE                 0.268 
## Pred R-Squared          0.085       MAE                 0.422 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     11.097          3          3.699    13.798    0.0000 
## Residual       95.434        356          0.268                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.692         0.094                 39.180    0.000    3.506    3.877 
## cls_creditsone credit    0.490         0.119        0.206     4.098    0.000    0.255    0.725 
##           bty_average    0.078         0.018        0.218     4.315    0.000    0.042    0.113 
##            gendermale    0.173         0.056        0.157     3.093    0.002    0.063    0.283 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Stepwise Selection: Step 4 
## 
## - cls_perc_eval added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.357       RMSE                0.512 
## R-Squared               0.128       Coef. Var          12.287 
## Adj. R-Squared          0.118       MSE                 0.262 
## Pred R-Squared          0.105       MAE                 0.415 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     13.609          4          3.402    12.998    0.0000 
## Residual       92.921        355          0.262                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.352         0.144                 23.308    0.000    3.069    3.635 
## cls_creditsone credit    0.438         0.119        0.184     3.672    0.000    0.203    0.672 
##           bty_average    0.069         0.018        0.193     3.804    0.000    0.033    0.104 
##            gendermale    0.187         0.055        0.169     3.372    0.001    0.078    0.296 
##         cls_perc_eval    0.005         0.002        0.158     3.098    0.002    0.002    0.008 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.357       RMSE                0.512 
## R-Squared               0.128       Coef. Var          12.287 
## Adj. R-Squared          0.118       MSE                 0.262 
## Pred R-Squared          0.105       MAE                 0.415 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     13.609          4          3.402    12.998    0.0000 
## Residual       92.921        355          0.262                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.352         0.144                 23.308    0.000    3.069    3.635 
## cls_creditsone credit    0.438         0.119        0.184     3.672    0.000    0.203    0.672 
##           bty_average    0.069         0.018        0.193     3.804    0.000    0.033    0.104 
##            gendermale    0.187         0.055        0.169     3.372    0.001    0.078    0.296 
##         cls_perc_eval    0.005         0.002        0.158     3.098    0.002    0.002    0.008 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Stepwise Selection: Step 5 
## 
## - ethnicity added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.384       RMSE                0.506 
## R-Squared               0.148       Coef. Var          12.163 
## Adj. R-Squared          0.136       MSE                 0.257 
## Pred R-Squared          0.120       MAE                 0.406 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     15.724          5          3.145     12.26    0.0000 
## Residual       90.806        354          0.257                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.106         0.166                 18.702    0.000    2.780    3.433 
## cls_creditsone credit    0.516         0.121        0.217     4.257    0.000    0.277    0.754 
##           bty_average    0.070         0.018        0.196     3.913    0.000    0.035    0.105 
##            gendermale    0.172         0.055        0.155     3.114    0.002    0.063    0.280 
##         cls_perc_eval    0.006         0.002        0.174     3.432    0.001    0.002    0.009 
## ethnicitynot minority    0.237         0.083        0.147     2.871    0.004    0.075    0.399 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.384       RMSE                0.506 
## R-Squared               0.148       Coef. Var          12.163 
## Adj. R-Squared          0.136       MSE                 0.257 
## Pred R-Squared          0.120       MAE                 0.406 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     15.724          5          3.145     12.26    0.0000 
## Residual       90.806        354          0.257                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.106         0.166                 18.702    0.000    2.780    3.433 
## cls_creditsone credit    0.516         0.121        0.217     4.257    0.000    0.277    0.754 
##           bty_average    0.070         0.018        0.196     3.913    0.000    0.035    0.105 
##            gendermale    0.172         0.055        0.155     3.114    0.002    0.063    0.280 
##         cls_perc_eval    0.006         0.002        0.174     3.432    0.001    0.002    0.009 
## ethnicitynot minority    0.237         0.083        0.147     2.871    0.004    0.075    0.399 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Stepwise Selection: Step 6 
## 
## - language added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.394       RMSE                0.505 
## R-Squared               0.156       Coef. Var          12.124 
## Adj. R-Squared          0.141       MSE                 0.255 
## Pred R-Squared          0.125       MAE                 0.400 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     16.570          6          2.762    10.837    0.0000 
## Residual       89.960        353          0.255                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.156         0.168                 18.810    0.000     2.826    3.486 
## cls_creditsone credit     0.500         0.121        0.210     4.129    0.000     0.262    0.738 
##           bty_average     0.070         0.018        0.197     3.944    0.000     0.035    0.105 
##            gendermale     0.176         0.055        0.159     3.190    0.002     0.067    0.284 
##         cls_perc_eval     0.006         0.002        0.173     3.420    0.001     0.002    0.009 
## ethnicitynot minority     0.194         0.086        0.120     2.266    0.024     0.026    0.362 
##   languagenon-english    -0.220         0.121       -0.093    -1.822    0.069    -0.458    0.017 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.394       RMSE                0.505 
## R-Squared               0.156       Coef. Var          12.124 
## Adj. R-Squared          0.141       MSE                 0.255 
## Pred R-Squared          0.125       MAE                 0.400 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     16.570          6          2.762    10.837    0.0000 
## Residual       89.960        353          0.255                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.156         0.168                 18.810    0.000     2.826    3.486 
## cls_creditsone credit     0.500         0.121        0.210     4.129    0.000     0.262    0.738 
##           bty_average     0.070         0.018        0.197     3.944    0.000     0.035    0.105 
##            gendermale     0.176         0.055        0.159     3.190    0.002     0.067    0.284 
##         cls_perc_eval     0.006         0.002        0.173     3.420    0.001     0.002    0.009 
## ethnicitynot minority     0.194         0.086        0.120     2.266    0.024     0.026    0.362 
##   languagenon-english    -0.220         0.121       -0.093    -1.822    0.069    -0.458    0.017 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
## Stepwise Selection: Step 7 
## 
## - age added 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.403       RMSE                0.504 
## R-Squared               0.162       Coef. Var          12.094 
## Adj. R-Squared          0.145       MSE                 0.254 
## Pred R-Squared          0.128       MAE                 0.398 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.269          7          2.467    9.729    0.0000 
## Residual       89.262        352          0.254                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.414         0.228                 14.949    0.000     2.965    3.863 
## cls_creditsone credit     0.491         0.121        0.207     4.066    0.000     0.254    0.729 
##           bty_average     0.062         0.018        0.175     3.392    0.001     0.026    0.098 
##            gendermale     0.201         0.057        0.182     3.529    0.000     0.089    0.313 
##         cls_perc_eval     0.005         0.002        0.170     3.378    0.001     0.002    0.009 
## ethnicitynot minority     0.198         0.085        0.123     2.324    0.021     0.031    0.366 
##   languagenon-english    -0.220         0.121       -0.093    -1.827    0.069    -0.458    0.017 
##                   age    -0.005         0.003       -0.088    -1.660    0.098    -0.011    0.001 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.403       RMSE                0.504 
## R-Squared               0.162       Coef. Var          12.094 
## Adj. R-Squared          0.145       MSE                 0.254 
## Pred R-Squared          0.128       MAE                 0.398 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.269          7          2.467    9.729    0.0000 
## Residual       89.262        352          0.254                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.414         0.228                 14.949    0.000     2.965    3.863 
## cls_creditsone credit     0.491         0.121        0.207     4.066    0.000     0.254    0.729 
##           bty_average     0.062         0.018        0.175     3.392    0.001     0.026    0.098 
##            gendermale     0.201         0.057        0.182     3.529    0.000     0.089    0.313 
##         cls_perc_eval     0.005         0.002        0.170     3.378    0.001     0.002    0.009 
## ethnicitynot minority     0.198         0.085        0.123     2.324    0.021     0.031    0.366 
##   languagenon-english    -0.220         0.121       -0.093    -1.827    0.069    -0.458    0.017 
##                   age    -0.005         0.003       -0.088    -1.660    0.098    -0.011    0.001 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
## No more variables to be added/removed.
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.403       RMSE                0.504 
## R-Squared               0.162       Coef. Var          12.094 
## Adj. R-Squared          0.145       MSE                 0.254 
## Pred R-Squared          0.128       MAE                 0.398 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.269          7          2.467    9.729    0.0000 
## Residual       89.262        352          0.254                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.414         0.228                 14.949    0.000     2.965    3.863 
## cls_creditsone credit     0.491         0.121        0.207     4.066    0.000     0.254    0.729 
##           bty_average     0.062         0.018        0.175     3.392    0.001     0.026    0.098 
##            gendermale     0.201         0.057        0.182     3.529    0.000     0.089    0.313 
##         cls_perc_eval     0.005         0.002        0.170     3.378    0.001     0.002    0.009 
## ethnicitynot minority     0.198         0.085        0.123     2.324    0.021     0.031    0.366 
##   languagenon-english    -0.220         0.121       -0.093    -1.827    0.069    -0.458    0.017 
##                   age    -0.005         0.003       -0.088    -1.660    0.098    -0.011    0.001 
## -------------------------------------------------------------------------------------------------
## 
##                                 Stepwise Selection Summary                                 
## ------------------------------------------------------------------------------------------
##                           Added/                   Adj.                                       
## Step      Variable       Removed     R-Square    R-Square     C(p)        AIC        RMSE     
## ------------------------------------------------------------------------------------------
##    1     cls_credits     addition       0.040       0.038    45.3740    574.4081    0.5344    
##    2     bty_average     addition       0.080       0.075    30.7900    561.2173    0.5239    
##    3       gender        addition       0.104       0.097    22.7230    553.6736    0.5178    
##    4    cls_perc_eval    addition       0.128       0.118    14.8580    546.0691    0.5116    
##    5      ethnicity      addition       0.148       0.136     8.5540    539.7809    0.5065    
##    6      language       addition       0.156       0.141     7.2310    538.4109    0.5048    
##    7         age         addition       0.162       0.145     6.4880    537.6038    0.5036    
## ------------------------------------------------------------------------------------------

Part E

our final model is

score = 3.413737 + 0.198474 * [ethnicity not minority] + 0.201301 * [gendermale] - 0.220350 * [languagenon-english] - 0.004885 * [age] + 0.005436 * [cls_perc_eval] + 0.491295 * [cls_creditsone credit] + 0.062328 * [bty_average]

Based on our final model, the following characteristics of a professor and course are associated with a high evaluation score:

  1. Gender (Gender of professor (collected as a binary variable at the time of the study): female, male.) -> the professor should be male, because the gendermale coefficient is positive (0.201).

  2. cls_credits (coefficient cls_creditsone credit = 0.491) -> single credit, not multiple

  3. ethnicity (Ethnicity of professor: not minority, minority.) -> not minority

  4. language (Language of school where professor received education: English or non-English.)-> english

  5. age (Age of professor) -> younger, e.g. 29 or 31 years old (the age coefficient is negative: -0.005)

  6. bty_average(Average beauty rating of professor.) -> great

Answering question 3

https://cran.r-project.org/web/packages/olsrr/vignettes/variable_selection.html

https://www.rdocumentation.org/packages/olsrr/versions/0.5.3

Stepwise Regression

PLEASE REFER TO PART C QUESTION 2.

Stepwise Forward Regression

###########################################################################
###########################################################################
###                                                                     ###
###                    STEPWISE FORWARD REGRESSION.                     ###
###                                                                     ###
###########################################################################
###########################################################################

# Fit the full linear model: `score` regressed on every other column of
# `train` (presumably the training split created earlier in the document).
# The terms of this model are the candidate predictors for the selection below.
model <- lm(score ~ ., data = train)
##################################################################################
##################################################################################
###                                                                            ###
###   BUILD REGRESSION MODEL FROM A SET OF CANDIDATE PREDICTOR  VARIABLES BY   ###
###  ENTERING PREDICTORS BASED ON P VALUES, IN A  STEPWISE MANNER UNTIL THERE  ###
###  IS NO VARIABLE LEFT TO ENTER ANY MORE.  THE MODEL SHOULD INCLUDE ALL THE  ###
###  CANDIDATE PREDICTOR VARIABLES.  IF DETAILS IS SET TO TRUE, EACH STEP IS   ###
###                                 DISPLAYED.                                 ###
###                                                                            ###
##################################################################################
##################################################################################
# Print the forward-selection summary table (one row per variable entered).
ols_step_forward_p(model)
## 
##                               Selection Summary                                
## ------------------------------------------------------------------------------
##         Variable                       Adj.                                       
## Step       Entered       R-Square    R-Square     C(p)        AIC        RMSE     
## ------------------------------------------------------------------------------
##    1    cls_credits        0.0405      0.0378    45.3739    574.4081    0.5344    
##    2    bty_average        0.0801      0.0749    30.7897    561.2173    0.5239    
##    3    gender             0.1042      0.0966    22.7228    553.6736    0.5178    
##    4    cls_perc_eval      0.1278      0.1179    14.8576    546.0691    0.5116    
##    5    ethnicity          0.1476      0.1356     8.5536    539.7809    0.5065    
##    6    language           0.1555      0.1412     7.2315    538.4109    0.5048    
##    7    age                0.1621      0.1454     6.4879    537.6038    0.5036    
##    8    cls_did_eval       0.1667      0.1477     6.5770    537.6357    0.5029    
## ------------------------------------------------------------------------------
# Run the same forward selection again, this time keeping the result in `k`
# so that it can be passed to plot().
k <- ols_step_forward_p(model)
plot(k)

### DETAILED OUTPUT

############################################################################
############################################################################
###                                                                      ###
###                           DETAILED OUTPUT                            ###
###                                                                      ###
############################################################################
############################################################################
# details = TRUE prints every selection step (model summary, ANOVA table and
# parameter estimates), followed by the final model and the selection summary.
ols_step_forward_p(model, details = TRUE)
## Forward Selection Method    
## ---------------------------
## 
## Candidate Terms: 
## 
## 1. rank 
## 2. ethnicity 
## 3. gender 
## 4. language 
## 5. age 
## 6. cls_perc_eval 
## 7. cls_did_eval 
## 8. cls_students 
## 9. cls_level 
## 10. cls_profs 
## 11. cls_credits 
## 12. bty_average 
## 
## We are selecting variables based on p value...
## 
## 
## Forward Selection: Step 1 
## 
## - cls_credits 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.201       RMSE                0.534 
## R-Squared               0.040       Coef. Var          12.833 
## Adj. R-Squared          0.038       MSE                 0.286 
## Pred R-Squared          0.032       MAE                 0.435 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression      4.309          1          4.309    15.093     1e-04 
## Residual      102.221        358          0.286                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                        
## ------------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta       t        Sig     lower    upper 
## ------------------------------------------------------------------------------------------------
##           (Intercept)    4.137         0.029                 142.769    0.000    4.080    4.194 
## cls_creditsone credit    0.478         0.123        0.201      3.885    0.000    0.236    0.719 
## ------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 2 
## 
## - bty_average 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.283       RMSE                0.524 
## R-Squared               0.080       Coef. Var          12.583 
## Adj. R-Squared          0.075       MSE                 0.275 
## Pred R-Squared          0.066       MAE                 0.431 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression      8.533          2          4.267    15.543    0.0000 
## Residual       97.997        357          0.275                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.822         0.085                 44.859    0.000    3.655    3.990 
## cls_creditsone credit    0.507         0.121        0.214     4.199    0.000    0.270    0.745 
##           bty_average    0.071         0.018        0.200     3.923    0.000    0.035    0.106 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 3 
## 
## - gender 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.323       RMSE                0.518 
## R-Squared               0.104       Coef. Var          12.434 
## Adj. R-Squared          0.097       MSE                 0.268 
## Pred R-Squared          0.085       MAE                 0.422 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     11.097          3          3.699    13.798    0.0000 
## Residual       95.434        356          0.268                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.692         0.094                 39.180    0.000    3.506    3.877 
## cls_creditsone credit    0.490         0.119        0.206     4.098    0.000    0.255    0.725 
##           bty_average    0.078         0.018        0.218     4.315    0.000    0.042    0.113 
##            gendermale    0.173         0.056        0.157     3.093    0.002    0.063    0.283 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 4 
## 
## - cls_perc_eval 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.357       RMSE                0.512 
## R-Squared               0.128       Coef. Var          12.287 
## Adj. R-Squared          0.118       MSE                 0.262 
## Pred R-Squared          0.105       MAE                 0.415 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     13.609          4          3.402    12.998    0.0000 
## Residual       92.921        355          0.262                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.352         0.144                 23.308    0.000    3.069    3.635 
## cls_creditsone credit    0.438         0.119        0.184     3.672    0.000    0.203    0.672 
##           bty_average    0.069         0.018        0.193     3.804    0.000    0.033    0.104 
##            gendermale    0.187         0.055        0.169     3.372    0.001    0.078    0.296 
##         cls_perc_eval    0.005         0.002        0.158     3.098    0.002    0.002    0.008 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 5 
## 
## - ethnicity 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.384       RMSE                0.506 
## R-Squared               0.148       Coef. Var          12.163 
## Adj. R-Squared          0.136       MSE                 0.257 
## Pred R-Squared          0.120       MAE                 0.406 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     15.724          5          3.145     12.26    0.0000 
## Residual       90.806        354          0.257                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                       Parameter Estimates                                       
## -----------------------------------------------------------------------------------------------
##                 model     Beta    Std. Error    Std. Beta      t        Sig     lower    upper 
## -----------------------------------------------------------------------------------------------
##           (Intercept)    3.106         0.166                 18.702    0.000    2.780    3.433 
## cls_creditsone credit    0.516         0.121        0.217     4.257    0.000    0.277    0.754 
##           bty_average    0.070         0.018        0.196     3.913    0.000    0.035    0.105 
##            gendermale    0.172         0.055        0.155     3.114    0.002    0.063    0.280 
##         cls_perc_eval    0.006         0.002        0.174     3.432    0.001    0.002    0.009 
## ethnicitynot minority    0.237         0.083        0.147     2.871    0.004    0.075    0.399 
## -----------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 6 
## 
## - language 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.394       RMSE                0.505 
## R-Squared               0.156       Coef. Var          12.124 
## Adj. R-Squared          0.141       MSE                 0.255 
## Pred R-Squared          0.125       MAE                 0.400 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                 
## --------------------------------------------------------------------
##                Sum of                                               
##               Squares         DF    Mean Square      F         Sig. 
## --------------------------------------------------------------------
## Regression     16.570          6          2.762    10.837    0.0000 
## Residual       89.960        353          0.255                     
## Total         106.531        359                                    
## --------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.156         0.168                 18.810    0.000     2.826    3.486 
## cls_creditsone credit     0.500         0.121        0.210     4.129    0.000     0.262    0.738 
##           bty_average     0.070         0.018        0.197     3.944    0.000     0.035    0.105 
##            gendermale     0.176         0.055        0.159     3.190    0.002     0.067    0.284 
##         cls_perc_eval     0.006         0.002        0.173     3.420    0.001     0.002    0.009 
## ethnicitynot minority     0.194         0.086        0.120     2.266    0.024     0.026    0.362 
##   languagenon-english    -0.220         0.121       -0.093    -1.822    0.069    -0.458    0.017 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 7 
## 
## - age 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.403       RMSE                0.504 
## R-Squared               0.162       Coef. Var          12.094 
## Adj. R-Squared          0.145       MSE                 0.254 
## Pred R-Squared          0.128       MAE                 0.398 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.269          7          2.467    9.729    0.0000 
## Residual       89.262        352          0.254                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.414         0.228                 14.949    0.000     2.965    3.863 
## cls_creditsone credit     0.491         0.121        0.207     4.066    0.000     0.254    0.729 
##           bty_average     0.062         0.018        0.175     3.392    0.001     0.026    0.098 
##            gendermale     0.201         0.057        0.182     3.529    0.000     0.089    0.313 
##         cls_perc_eval     0.005         0.002        0.170     3.378    0.001     0.002    0.009 
## ethnicitynot minority     0.198         0.085        0.123     2.324    0.021     0.031    0.366 
##   languagenon-english    -0.220         0.121       -0.093    -1.827    0.069    -0.458    0.017 
##                   age    -0.005         0.003       -0.088    -1.660    0.098    -0.011    0.001 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 8 
## 
## - cls_did_eval 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.408       RMSE                0.503 
## R-Squared               0.167       Coef. Var          12.078 
## Adj. R-Squared          0.148       MSE                 0.253 
## Pred R-Squared          0.128       MAE                 0.395 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.756          8          2.219    8.775    0.0000 
## Residual       88.775        351          0.253                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.370         0.230                 14.642    0.000     2.918    3.823 
## cls_creditsone credit     0.501         0.121        0.211     4.143    0.000     0.263    0.739 
##           bty_average     0.058         0.019        0.163     3.101    0.002     0.021    0.094 
##            gendermale     0.193         0.057        0.174     3.361    0.001     0.080    0.305 
##         cls_perc_eval     0.006         0.002        0.187     3.609    0.000     0.003    0.009 
## ethnicitynot minority     0.194         0.085        0.120     2.277    0.023     0.026    0.362 
##   languagenon-english    -0.210         0.121       -0.088    -1.738    0.083    -0.447    0.028 
##                   age    -0.005         0.003       -0.088    -1.654    0.099    -0.011    0.001 
##          cls_did_eval     0.001         0.001        0.071     1.387    0.166     0.000    0.002 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
## No more variables to be added.
## 
## Variables Entered: 
## 
## + cls_credits 
## + bty_average 
## + gender 
## + cls_perc_eval 
## + ethnicity 
## + language 
## + age 
## + cls_did_eval 
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.408       RMSE                0.503 
## R-Squared               0.167       Coef. Var          12.078 
## Adj. R-Squared          0.148       MSE                 0.253 
## Pred R-Squared          0.128       MAE                 0.395 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.756          8          2.219    8.775    0.0000 
## Residual       88.775        351          0.253                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.370         0.230                 14.642    0.000     2.918    3.823 
## cls_creditsone credit     0.501         0.121        0.211     4.143    0.000     0.263    0.739 
##           bty_average     0.058         0.019        0.163     3.101    0.002     0.021    0.094 
##            gendermale     0.193         0.057        0.174     3.361    0.001     0.080    0.305 
##         cls_perc_eval     0.006         0.002        0.187     3.609    0.000     0.003    0.009 
## ethnicitynot minority     0.194         0.085        0.120     2.277    0.023     0.026    0.362 
##   languagenon-english    -0.210         0.121       -0.088    -1.738    0.083    -0.447    0.028 
##                   age    -0.005         0.003       -0.088    -1.654    0.099    -0.011    0.001 
##          cls_did_eval     0.001         0.001        0.071     1.387    0.166     0.000    0.002 
## -------------------------------------------------------------------------------------------------
## 
##                               Selection Summary                                
## ------------------------------------------------------------------------------
##         Variable                       Adj.                                       
## Step       Entered       R-Square    R-Square     C(p)        AIC        RMSE     
## ------------------------------------------------------------------------------
##    1    cls_credits        0.0405      0.0378    45.3739    574.4081    0.5344    
##    2    bty_average        0.0801      0.0749    30.7897    561.2173    0.5239    
##    3    gender             0.1042      0.0966    22.7228    553.6736    0.5178    
##    4    cls_perc_eval      0.1278      0.1179    14.8576    546.0691    0.5116    
##    5    ethnicity          0.1476      0.1356     8.5536    539.7809    0.5065    
##    6    language           0.1555      0.1412     7.2315    538.4109    0.5048    
##    7    age                0.1621      0.1454     6.4879    537.6038    0.5036    
##    8    cls_did_eval       0.1667      0.1477     6.5770    537.6357    0.5029    
## ------------------------------------------------------------------------------

Stepwise Backward Regression

############################################################################
############################################################################
###                                                                      ###
###                    STEPWISE BACKWARD REGRESSION.                     ###
###                                                                      ###
############################################################################
############################################################################
# stepwise backward regression
# Full model: `score` regressed on every other column of `train`
# (the same call that is used in the forward-selection section).
model <- lm(score ~ ., data = train)

###################################################################################
###################################################################################
###                                                                             ###
###    BUILD REGRESSION MODEL FROM A SET OF CANDIDATE PREDICTOR VARIABLES BY    ###
###  REMOVING PREDICTORS BASED ON P VALUES, IN A STEPWISE MANNER UNTIL  THERE   ###
###  IS NO VARIABLE LEFT TO REMOVE ANY MORE. THE MODEL SHOULD INCLUDE  ALL THE  ###
###    CANDIDATE PREDICTOR VARIABLES. IF DETAILS IS SET TO TRUE, EACH STEP IS   ###
###                                  DISPLAYED.                                 ###
###                                                                             ###
###################################################################################
###################################################################################

# Print the backward-elimination summary table (one row per variable removed).
ols_step_backward_p(model)
## 
## 
##                              Elimination Summary                              
## -----------------------------------------------------------------------------
##         Variable                      Adj.                                       
## Step      Removed       R-Square    R-Square     C(p)        AIC        RMSE     
## -----------------------------------------------------------------------------
##    1    cls_profs         0.1728      0.1442    10.0291    542.9947    0.5039    
##    2    cls_level         0.1727      0.1465     8.0595    541.0263    0.5032    
##    3    cls_students       0.172      0.1483     6.3349    539.3127    0.5027    
##    4    rank              0.1667      0.1477     6.5770    537.6357    0.5029    
## -----------------------------------------------------------------------------
k <- ols_step_backward_p(model)
plot(k)

DETAILED OUTPUT

############################################################################
############################################################################
###                                                                      ###
###                           DETAILED OUTPUT                            ###
###                                                                      ###
############################################################################
############################################################################
# Re-run the backward elimination with details = TRUE so that every step is
# printed: the candidate terms, the variable removed at each step, and the
# model summary, ANOVA table and parameter estimates after each removal.
ols_step_backward_p(model, details = TRUE)
## Backward Elimination Method 
## ---------------------------
## 
## Candidate Terms: 
## 
## 1 . rank 
## 2 . ethnicity 
## 3 . gender 
## 4 . language 
## 5 . age 
## 6 . cls_perc_eval 
## 7 . cls_did_eval 
## 8 . cls_students 
## 9 . cls_level 
## 10 . cls_profs 
## 11 . cls_credits 
## 12 . bty_average 
## 
## We are eliminating variables based on p value...
## 
## - cls_profs 
## 
## Backward Elimination: Step 1 
## 
##  Variable cls_profs Removed 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.416       RMSE                0.504 
## R-Squared               0.173       Coef. Var          12.103 
## Adj. R-Squared          0.144       MSE                 0.254 
## Pred R-Squared          0.118       MAE                 0.397 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     18.405         12          1.534    6.039    0.0000 
## Residual       88.126        347          0.254                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.629         0.295                 12.291    0.000     3.048    4.209 
##      ranktenure track    -0.139         0.096       -0.107    -1.443    0.150    -0.328    0.050 
##           ranktenured    -0.080         0.076       -0.073    -1.053    0.293    -0.231    0.070 
## ethnicitynot minority     0.173         0.087        0.107     1.972    0.049     0.000    0.345 
##            gendermale     0.203         0.059        0.184     3.439    0.001     0.087    0.319 
##   languagenon-english    -0.179         0.127       -0.076    -1.414    0.158    -0.429    0.070 
##                   age    -0.007         0.004       -0.124    -1.956    0.051    -0.014    0.000 
##         cls_perc_eval     0.005         0.002        0.156     2.151    0.032     0.000    0.010 
##          cls_did_eval     0.002         0.003        0.211     0.746    0.456    -0.004    0.009 
##          cls_students    -0.001         0.002       -0.144    -0.480    0.632    -0.005    0.003 
##        cls_levelupper     0.011         0.063        0.010     0.175    0.862    -0.114    0.136 
## cls_creditsone credit     0.450         0.135        0.190     3.338    0.001     0.185    0.715 
##           bty_average     0.055         0.019        0.156     2.947    0.003     0.018    0.092 
## -------------------------------------------------------------------------------------------------
## 
## 
## - cls_level 
## 
## Backward Elimination: Step 2 
## 
##  Variable cls_level Removed 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.416       RMSE                0.503 
## R-Squared               0.173       Coef. Var          12.086 
## Adj. R-Squared          0.147       MSE                 0.253 
## Pred R-Squared          0.123       MAE                 0.397 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     18.397         11          1.672    6.604    0.0000 
## Residual       88.134        348          0.253                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.632         0.294                 12.352    0.000     3.054    4.211 
##      ranktenure track    -0.137         0.096       -0.106    -1.435    0.152    -0.325    0.051 
##           ranktenured    -0.079         0.076       -0.072    -1.041    0.299    -0.227    0.070 
## ethnicitynot minority     0.175         0.087        0.108     2.017    0.044     0.004    0.345 
##            gendermale     0.202         0.059        0.183     3.440    0.001     0.087    0.318 
##   languagenon-english    -0.177         0.126       -0.074    -1.405    0.161    -0.424    0.071 
##                   age    -0.007         0.003       -0.123    -1.951    0.052    -0.014    0.000 
##         cls_perc_eval     0.005         0.002        0.156     2.148    0.032     0.000    0.010 
##          cls_did_eval     0.003         0.003        0.219     0.786    0.433    -0.004    0.009 
##          cls_students    -0.001         0.002       -0.154    -0.526    0.599    -0.005    0.003 
## cls_creditsone credit     0.445         0.131        0.187     3.400    0.001     0.187    0.702 
##           bty_average     0.055         0.019        0.156     2.954    0.003     0.019    0.092 
## -------------------------------------------------------------------------------------------------
## 
## 
## - cls_students 
## 
## Backward Elimination: Step 3 
## 
##  Variable cls_students Removed 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.415       RMSE                0.503 
## R-Squared               0.172       Coef. Var          12.073 
## Adj. R-Squared          0.148       MSE                 0.253 
## Pred R-Squared          0.125       MAE                 0.396 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     18.327         10          1.833    7.251    0.0000 
## Residual       88.204        349          0.253                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.569         0.268                 13.309    0.000     3.042    4.097 
##      ranktenure track    -0.140         0.095       -0.108    -1.469    0.143    -0.327    0.047 
##           ranktenured    -0.079         0.076       -0.072    -1.052    0.294    -0.228    0.069 
## ethnicitynot minority     0.179         0.086        0.111     2.085    0.038     0.010    0.348 
##            gendermale     0.198         0.058        0.180     3.405    0.001     0.084    0.313 
##   languagenon-english    -0.171         0.125       -0.072    -1.367    0.173    -0.417    0.075 
##                   age    -0.007         0.003       -0.123    -1.948    0.052    -0.014    0.000 
##         cls_perc_eval     0.006         0.002        0.183     3.527    0.000     0.003    0.009 
##          cls_did_eval     0.001         0.001        0.075     1.434    0.152     0.000    0.002 
## cls_creditsone credit     0.443         0.131        0.187     3.396    0.001     0.187    0.700 
##           bty_average     0.055         0.019        0.156     2.955    0.003     0.019    0.092 
## -------------------------------------------------------------------------------------------------
## 
## 
## - rank 
## 
## Backward Elimination: Step 4 
## 
##  Variable rank Removed 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.408       RMSE                0.503 
## R-Squared               0.167       Coef. Var          12.078 
## Adj. R-Squared          0.148       MSE                 0.253 
## Pred R-Squared          0.128       MAE                 0.395 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.756          8          2.219    8.775    0.0000 
## Residual       88.775        351          0.253                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.370         0.230                 14.642    0.000     2.918    3.823 
## ethnicitynot minority     0.194         0.085        0.120     2.277    0.023     0.026    0.362 
##            gendermale     0.193         0.057        0.174     3.361    0.001     0.080    0.305 
##   languagenon-english    -0.210         0.121       -0.088    -1.738    0.083    -0.447    0.028 
##                   age    -0.005         0.003       -0.088    -1.654    0.099    -0.011    0.001 
##         cls_perc_eval     0.006         0.002        0.187     3.609    0.000     0.003    0.009 
##          cls_did_eval     0.001         0.001        0.071     1.387    0.166     0.000    0.002 
## cls_creditsone credit     0.501         0.121        0.211     4.143    0.000     0.263    0.739 
##           bty_average     0.058         0.019        0.163     3.101    0.002     0.021    0.094 
## -------------------------------------------------------------------------------------------------
## 
## 
## 
## No more variables satisfy the condition of p value = 0.3
## 
## 
## Variables Removed: 
## 
## - cls_profs 
## - cls_level 
## - cls_students 
## - rank 
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.408       RMSE                0.503 
## R-Squared               0.167       Coef. Var          12.078 
## Adj. R-Squared          0.148       MSE                 0.253 
## Pred R-Squared          0.128       MAE                 0.395 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares         DF    Mean Square      F        Sig. 
## -------------------------------------------------------------------
## Regression     17.756          8          2.219    8.775    0.0000 
## Residual       88.775        351          0.253                    
## Total         106.531        359                                   
## -------------------------------------------------------------------
## 
##                                        Parameter Estimates                                        
## -------------------------------------------------------------------------------------------------
##                 model      Beta    Std. Error    Std. Beta      t        Sig      lower    upper 
## -------------------------------------------------------------------------------------------------
##           (Intercept)     3.370         0.230                 14.642    0.000     2.918    3.823 
## ethnicitynot minority     0.194         0.085        0.120     2.277    0.023     0.026    0.362 
##            gendermale     0.193         0.057        0.174     3.361    0.001     0.080    0.305 
##   languagenon-english    -0.210         0.121       -0.088    -1.738    0.083    -0.447    0.028 
##                   age    -0.005         0.003       -0.088    -1.654    0.099    -0.011    0.001 
##         cls_perc_eval     0.006         0.002        0.187     3.609    0.000     0.003    0.009 
##          cls_did_eval     0.001         0.001        0.071     1.387    0.166     0.000    0.002 
## cls_creditsone credit     0.501         0.121        0.211     4.143    0.000     0.263    0.739 
##           bty_average     0.058         0.019        0.163     3.101    0.002     0.021    0.094 
## -------------------------------------------------------------------------------------------------
## 
## 
##                              Elimination Summary                              
## -----------------------------------------------------------------------------
##         Variable                      Adj.                                       
## Step      Removed       R-Square    R-Square     C(p)        AIC        RMSE     
## -----------------------------------------------------------------------------
##    1    cls_profs         0.1728      0.1442    10.0291    542.9947    0.5039    
##    2    cls_level         0.1727      0.1465     8.0595    541.0263    0.5032    
##    3    cls_students       0.172      0.1483     6.3349    539.3127    0.5027    
##    4    rank              0.1667      0.1477     6.5770    537.6357    0.5029    
## -----------------------------------------------------------------------------

Perform

  1. BEST SUBSET REGRESSION

  2. STEPWISE BACKWARD REGRESSION

  3. STEPWISE FORWARD REGRESSION

A lower AIC indicates a better model. Should you use forward or backward stepwise selection?

  • Where forward stepwise is better

Forward selection only considers models in which the number of variables is less than the sample size (for linear regression).

source https://www.jmlr.org/papers/volume20/17-334/17-334.pdf

Unless the number of candidate variables exceeds the sample size (or the number of events), use a backward stepwise approach.

  • Where backward stepwise is better

Ref:

    1. In-class content by Dr. Kourosh Ravvaz, UWM, Spring 2022.
  1. https://r4ds.had.co.nz/factors.html#:~:text=In%20R%2C%20factors%20are%20used,to%20work%20with%20than%20characters.

  2. https://www.youtube.com/watch?v=xkRBfy8_2MU

  3. https://r-coder.com/set-seed-r/

  4. https://rmd4sci.njtierney.com/math.html

  5. https://rpruim.github.io/s341/S19/from-class/MathinRmd.html

  6. https://www.statology.org/interpret-glm-output-in-r/

  7. https://bookdown.org/yihui/rmarkdown-cookbook/update-date.html

  8. https://stats.oarc.ucla.edu/r/dae/logit-regression/

  9. https://r-graph-gallery.com/boxplot.html

  10. https://www.rdocumentation.org/packages/ggplot2/versions/1.0.1/topics/geom_bar

  11. https://www.jmlr.org/papers/volume20/17-334/17-334.pdf

“The best way to predict the future is to create it.” Abraham Lincoln.

End of Document.